核心解析
def AnalysisUrl(url):
    """Download one article page and hand its cleaned HTML to writeArticle.

    The page at ``url`` is fetched with the module-level ``headers``. The
    ``div.show-content-free`` element is taken as the article body, every
    ``div.image-package`` inside it is collapsed into a plain ``<img>`` tag,
    ``<b>`` is rewritten to ``<strong>`` and the wrapping ``<div>`` markup is
    stripped. The title is read from the ``og:title`` meta tag.

    Args:
        url: Address of the article page to download.

    Returns:
        None. On success the result is passed to ``writeArticle``; on any
        ordinary error a failure message is printed and nothing is raised,
        so a batch run continues with the next URL.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            resHtml = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()

        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning.
        html = BeautifulSoup(resHtml, "html.parser")

        # Article body; an IndexError here (no body found) is reported below.
        body = html.select('div[class="show-content-free"]')
        content = str(body[0])

        # Suffix appended to every image address; the width follows it.
        imgpatt = '?imageMogr2/auto-orient/strip%7CimageView2/2/w/'
        # Extracts the display width from the image container's style.
        patternImgSize = re.compile(r'<div class="image-container" '
                                    r'style="max-width: (.*?)px;', re.S)

        for package in html.select('div[class="image-package"]'):
            package_html = str(package)
            # Take the <img> from inside this very package instead of
            # pairing two separate lists by index, which drifts as soon as
            # the counts differ.
            src = package.select('img')[0].attrs['data-original-src']
            if not isinstance(src, str):
                # Python 2: attribute values are unicode while the
                # serialised markup is a byte string; encode so the replace
                # below does not trigger an implicit ASCII decode.
                src = src.encode("utf-8")
            d = src + imgpatt + patternImgSize.findall(package_html)[0]
            # Swap the whole image package for a bare <img> tag.
            content = content.replace(package_html,
                                      "<img src=\"" + d + "\">")

        # Adjust the markup to the custom target format.
        content = content.replace("<b>", "<strong>")
        content = content.replace("</b>", "</strong>")
        content = content.replace('<div class="show-content-free">', "")
        content = content.replace("</div>", "")

        title = html.select('meta[property="og:title"]')[0].attrs['content']
        writeArticle(content, title)
    except Exception as exc:
        # Best effort per article, but keep Ctrl-C / SystemExit working and
        # show why the article failed instead of hiding the reason.
        print("该文章解析失败 url:" + url + " (" + repr(exc) + ")")
- 解析时过滤掉简书的自定义标签;得到的文章内容再用其他富文本编辑器逆向解析即可
完整代码
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import re
import time
# HTTP request headers passed to urllib2.Request for every article download.
# NOTE(review): the "…" inside User-Agent looks like a shortened copy of a
# real browser string — confirm and restore the full value if needed.
# NOTE(review): the JSESSIONID cookie is hard-coded; presumably captured from
# a browser session — confirm it is still required and valid.
headers = {
    # Content negotiation
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    # Client identification
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/56.0",
    "Content-Type": "application/json;charset=UTF-8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    # Session
    "Cookie": "JSESSIONID=2D1E55287F8B056E83FD29B114FBA389",
}
def AnalysisUrl(url):
    """Download one article page and hand its cleaned HTML to writeArticle.

    The page at ``url`` is fetched with the module-level ``headers``. The
    ``div.show-content-free`` element is taken as the article body, every
    ``div.image-package`` inside it is collapsed into a plain ``<img>`` tag,
    ``<b>`` is rewritten to ``<strong>`` and the wrapping ``<div>`` markup is
    stripped. The title is read from the ``og:title`` meta tag.

    Args:
        url: Address of the article page to download.

    Returns:
        None. On success the result is passed to ``writeArticle``; on any
        ordinary error a failure message is printed and nothing is raised,
        so a batch run continues with the next URL.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            resHtml = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()

        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning.
        html = BeautifulSoup(resHtml, "html.parser")

        # Article body; an IndexError here (no body found) is reported below.
        body = html.select('div[class="show-content-free"]')
        content = str(body[0])

        # Suffix appended to every image address; the width follows it.
        imgpatt = '?imageMogr2/auto-orient/strip%7CimageView2/2/w/'
        # Extracts the display width from the image container's style.
        patternImgSize = re.compile(r'<div class="image-container" '
                                    r'style="max-width: (.*?)px;', re.S)

        for package in html.select('div[class="image-package"]'):
            package_html = str(package)
            # Take the <img> from inside this very package instead of
            # pairing two separate lists by index, which drifts as soon as
            # the counts differ.
            src = package.select('img')[0].attrs['data-original-src']
            if not isinstance(src, str):
                # Python 2: attribute values are unicode while the
                # serialised markup is a byte string; encode so the replace
                # below does not trigger an implicit ASCII decode.
                src = src.encode("utf-8")
            d = src + imgpatt + patternImgSize.findall(package_html)[0]
            # Swap the whole image package for a bare <img> tag.
            content = content.replace(package_html,
                                      "<img src=\"" + d + "\">")

        # Adjust the markup to the custom target format.
        content = content.replace("<b>", "<strong>")
        content = content.replace("</b>", "</strong>")
        content = content.replace('<div class="show-content-free">', "")
        content = content.replace("</div>", "")

        title = html.select('meta[property="og:title"]')[0].attrs['content']
        writeArticle(content, title)
    except Exception as exc:
        # Best effort per article, but keep Ctrl-C / SystemExit working and
        # show why the article failed instead of hiding the reason.
        print("该文章解析失败 url:" + url + " (" + repr(exc) + ")")
def writeArticle(content, title):
    """Save an article body as ``<title>.txt`` in the current directory.

    The stored text is the cleaned HTML, meant to be opened with a
    rich-text editor afterwards.

    Args:
        content: Article markup to write.
        title: Article title, used as the file name without extension.
            Path separators in it are replaced by ``_``.

    Returns:
        None.
    """
    # A title is free text: a "/" or "\" in it would be read as a directory
    # boundary and make open() fail, losing the article.
    safe_title = title.replace("/", "_").replace("\\", "_")
    with open(safe_title + ".txt", "w") as f:
        f.write(content)
if __name__ == "__main__":
    # Parse every article whose URL is listed, one per line, in
    # articleUrl.txt (the file produced by auto.py).
    myTime = 0
    # The with-block closes the file even if a download is interrupted.
    with open("articleUrl.txt") as url_file:
        for line in url_file:
            # Drop the line ending and surrounding whitespace (also "\r"
            # from Windows-style files).
            url = line.strip()
            if not url:
                # Blank line: nothing to fetch, and it must not count
                # toward the pause below.
                continue
            AnalysisUrl(url)
            myTime = myTime + 1
            if myTime >= 10:
                # Throttle: pause 3 seconds after every 10 articles.
                time.sleep(3)
                myTime = 0