cnbeta网站
抓取并解析 cnbeta 网站的文章，将其转换为 Markdown。
#!/usr/bin/env python
# encoding:utf-8
import sys
import os
import time
import hashlib
import argparse
import requests
from bs4 import BeautifulSoup
import argparse
# ---------------------------------------------------------------------------
# Command-line handling.
#
#   --url   URL of the article to convert; the script exits if it is missing
#   --dir   directory that receives the Markdown file and downloaded images
#   --path  value of the front-matter "path" field, also used as the output
#           file name; defaults to "/" + MD5 hex digest of the URL
#   --date  value of the front-matter "date" field; defaults to today
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='manual to this script')
parser.add_argument('--url', type=str, default=None)
parser.add_argument('--dir', type=str, default="./")
parser.add_argument('--path', type=str, default=None)
# Build the default date without the "%-d" directive: it is a platform
# extension that not every C library accepts. The result keeps the same
# shape as before (zero-padded month, unpadded day of month).
_today = time.localtime()
parser.add_argument(
    '--date',
    type=str,
    default=time.strftime("%Y-%m-", _today) + str(_today.tm_mday),
)
args = parser.parse_args()

if args.url is None:
    print('Please use as: html2md --url url')
    # A missing URL is a usage error, so report failure to the caller.
    sys.exit(1)

path = args.path
if path is None:
    # hashlib only accepts bytes, so a text URL must be encoded before hashing.
    if isinstance(args.url, bytes):
        url_bytes = args.url
    else:
        url_bytes = args.url.encode('utf-8')
    path = '/' + hashlib.md5(url_bytes).hexdigest()
print('path:' + path)

date = args.date
save_dir = args.dir
if not os.path.isdir(save_dir):
    # makedirs also creates missing parent directories; os.mkdir would fail
    # for a nested --dir such as "out/2020/posts".
    os.makedirs(save_dir)
# ---------------------------------------------------------------------------
# Download the article page, build the front-matter header and locate the
# element that holds the article paragraphs.
# ---------------------------------------------------------------------------
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(args.url, timeout=30)
if r.status_code != 200:
    print('get page:' + args.url + ' failed, status_code:' + str(r.status_code))
    sys.exit(1)
soup = BeautifulSoup(r.content, "lxml")

# The article title is the part of <title> before the first " -".
if soup.title is None or soup.title.string is None:
    print('get title failed')
    sys.exit(1)
title = soup.title.string.split(' -')[0]
# The title is emitted inside a double-quoted front-matter value.
title = title.replace('"', '\\"')

# The excerpt comes from the og:description meta tag; fall back to an empty
# excerpt when the tag (or its content attribute) is absent.
description_tag = soup.find(attrs={"property": "og:description"})
description = ''
if description_tag is not None:
    description = description_tag.get('content') or ''
# Escape quotes for the same reason as the title.
description = description.replace('"', '\\"')

# The front matter is intentionally left open here: the closing "---" is
# appended later, once it is known whether an "attachments" entry is needed.
mdMeta = (
    '---\npath: "' + path
    + '"\ndate: "' + date
    + '"\ntitle: "' + title
    + '"\ntags: []\nexcerpt: "' + description + '"\n'
)

# Walk down the page structure: main-wrap > cnbeta-article-body > #artibody.
main_wrap = soup.find(name='div', attrs={"class": "main-wrap"})
if main_wrap is None:
    print('get main_wrap failed')
    sys.exit(1)

article_body = main_wrap.find(name='div', attrs={"class": "cnbeta-article-body"})
if article_body is None:
    print('get article body failed')
    sys.exit(1)

article_content = article_body.find('div', id="artibody")
if article_content is None:
    print('get article_content failed')
    sys.exit(1)
# ---------------------------------------------------------------------------
# Convert the direct <p> children of the article element to Markdown text.
#
# * A paragraph containing an <img> is treated as an image paragraph: the
#   image is downloaded into save_dir and the paragraph text is dropped.
#   The first successfully downloaded image is recorded as the front-matter
#   attachment (and is not embedded in the body); later images are embedded
#   as centered <img> tags.
# * Centered paragraphs without an image are skipped.
# * Every other paragraph contributes its text followed by a blank line.
# ---------------------------------------------------------------------------
mdContent = ''
is_attached = False
content_parts = []
pchilds = article_content.find_all('p')
for p in pchilds:
    # find_all() is recursive; only direct children of the article count.
    if p.parent != article_content:
        continue

    img = p.find('img')
    if img is not None:
        src = img.get('src')
        if not src:
            # An <img> without a src attribute cannot be downloaded.
            print('download image failed, no src attribute')
            continue
        try:
            ir = requests.get(src, timeout=30)
        except requests.exceptions.RequestException as err:
            # Image download is best effort: report and keep converting.
            print('download image:' + src + ' failed, error:' + str(err))
            continue
        if ir.status_code == 200:
            fn = os.path.basename(src)
            with open(save_dir + '/' + fn, 'wb') as image_file:
                image_file.write(ir.content)
            if not is_attached:
                is_attached = True
                # NOTE(review): the list item is indented with a tab; YAML
                # does not allow tabs for indentation — confirm the consumer
                # of this front matter accepts it.
                mdMeta += 'attachments: \n\t- "' + fn + '"\n---\n\n'
                print("attachemet:" + fn)
            else:
                content_parts.append(
                    '<div style="text-align: center"><img src="' + fn + '"></div>\n\n'
                )
        else:
            # status_code is an int and must be converted before concatenation.
            print('download image:' + src + ' failed, status_code:' + str(ir.status_code))
        continue

    if 'style' in p.attrs and 'text-align: center' in p.attrs['style']:
        continue
    content_parts.append(p.text + '\n\n')

mdContent = ''.join(content_parts)

# Without an attachment the front matter has not been closed yet.
if not is_attached:
    mdMeta += '---\n\n'
# Write front matter + body as UTF-8 bytes. `path` may start with "/", so
# plain concatenation is used on purpose: os.path.join would discard
# save_dir for such a path. The with-block closes the file even if the
# write raises.
with open(save_dir + '/' + path + '.md', 'wb') as f:
    f.write((mdMeta + mdContent).encode('utf-8'))