To move the articles I've published on Jianshu over to my personal site, and to keep the two in sync as they're updated later, I sketched out a rough crawler. It mainly uses the bs4 and requests libraries plus the MySQL driver (MySQLdb) to crawl the article list for a given user ID, fetch each article's title and content, patch the img tags for compatibility, and write or update the results in a MySQL database.
import requests
from bs4 import BeautifulSoup
import MySQLdb
import bs4

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Return an empty string on any network/HTTP error
        return ""
def crawlArticleByUrl(url):
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    article = soup.find_all('div', class_="article")[0]
    title = article('h1', class_="title")[0].string
    print(title)
    content = article('div', class_="show-content")[0]
    # Rewrite Jianshu's lazy-loaded image markup into plain <img> tags
    imgs = content('div', class_="image-package")
    strContent = str(content)
    for img in imgs:
        imgUrl = 'http:' + img('img')[0].attrs['data-original-src']
        caption = img('div', class_='image-caption')[0].string
        new = '<div class="image-package"><img src="' + imgUrl + '">'
        if caption is not None:
            new = new + '<div class="image-caption">' + str(caption) + '</div>'
        new = new + '</div>'
        strContent = strContent.replace(str(img), new)
    # Slice off the wrapping <div class="show-content"> tag and its closing </div>
    return title, strContent[48:-7]
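The strContent[48:-7] slice strips the opening <div class="show-content"> tag and the trailing </div> by character offset, which breaks as soon as Jianshu changes the wrapper markup. A less brittle alternative, if you prefer it, is bs4's Tag.decode_contents(), which returns the inner HTML directly (the image rewriting would then operate on this string instead):

inner = content.decode_contents()  # inner HTML, without the wrapping <div> tags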
def writeIntoDB(title, content):
    DBKWARGS = {'db': 'myblog', 'user': 'root', 'passwd': 'root',
                'host': 'localhost', 'use_unicode': True, 'charset': 'utf8'}
    con = MySQLdb.connect(**DBKWARGS)
    cur = con.cursor()
    sql = "insert into article(article_title,article_content) values(%s,%s)"
    lis = (title, content)
    try:
        cur.execute(sql, lis)
    except Exception as e:
        print("Insert Error:", e)
        con.rollback()
    else:
        con.commit()
    cur.close()
    con.close()
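The script assumes the article table already exists; the post doesn't show its schema, but judging from the columns used above, a minimal version would look roughly like this (the column types are my guess):

CREATE_SQL = """
create table if not exists article (
    article_id int auto_increment primary key,
    article_title varchar(255),
    article_content longtext
) default charset=utf8;
"""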
def updateArticleById(title, content, id):
    DBKWARGS = {'db': 'myblog', 'user': 'root', 'passwd': 'root',
                'host': 'localhost', 'use_unicode': True, 'charset': 'utf8'}
    con = MySQLdb.connect(**DBKWARGS)
    cur = con.cursor()
    sql = "update article set article_title = %s, article_content = %s where article_id = %s;"
    lis = (title, content, id)
    try:
        cur.execute(sql, lis)
    except Exception as e:
        print("Update Error:", e)
        con.rollback()
    else:
        con.commit()
    cur.close()
    con.close()
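writeIntoDB and updateArticleById duplicate the connect/execute/commit boilerplate and differ only in their SQL. One way to fold them together is a small shared helper that both delegate to; this is just a sketch, and it assumes DBKWARGS has been lifted to module level:

def execDB(sql, params):
    # Shared helper (sketch): connect, execute, then commit or roll back
    con = MySQLdb.connect(**DBKWARGS)
    cur = con.cursor()
    try:
        cur.execute(sql, params)
    except Exception as e:
        print("DB Error:", e)
        con.rollback()
    else:
        con.commit()
    cur.close()
    con.close()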
def GetArticleList(userId):
    url = "http://www.jianshu.com/u/" + userId
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    num = int(soup.find_all('div', class_="meta-block")[2]('p')[0].string)  # total article count
    page = (num + 8) // 9  # 9 articles per list page, rounded up
    for i in range(page):
        newUrl = url + "?order_by=shared_at&page=" + str(page - i)
        newHtml = getHTMLText(newUrl)
        newSoup = BeautifulSoup(newHtml, "html.parser")
        ul = newSoup.find_all('ul', class_="note-list")[0]
        lis = []
        for li in ul.children:
            if isinstance(li, bs4.element.Tag):
                lis.append(li)
        # Walk each page's entries in reverse so the oldest article is written first
        for li in lis[::-1]:
            articleUrl = "http://www.jianshu.com" + li("a", class_="title")[0].attrs['href']
            title, content = crawlArticleByUrl(articleUrl)
            writeIntoDB(title, content)
def main():
    # GetArticleList("9f29e0217f4d")
    title, content = crawlArticleByUrl('https://www.jianshu.com/p/326d5e75fa2d')
    # writeIntoDB(title, content)
    updateArticleById(title, content, 513)
    print("Finish!")

if __name__ == '__main__':
    main()
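As written, main() hard-codes whether a run inserts or updates, and against which article_id. For the keep-both-sides-in-sync goal from the intro, MySQL's insert ... on duplicate key update is one option; this sketch assumes you add a unique key on article_title, and it reuses the hypothetical execDB helper sketched earlier:

sql = ("insert into article(article_title, article_content) values (%s, %s) "
       "on duplicate key update article_content = values(article_content)")
execDB(sql, (title, content))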