Crawling my Jianshu articles into a personal MySQL database

To move the articles I have already published on Jianshu onto my personal website, and to keep the two in sync as I publish new ones, I put together a rough crawler. It uses the bs4 and requests libraries together with a MySQL driver (MySQLdb) to fetch the article list for a given user ID, crawl each article's title and body, rewrite the img tags so the images display outside Jianshu, and insert or update the results in a MySQL database.
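The database functions below expect an article table with an auto-increment article_id plus title and content columns. My real schema is not shown here, but a minimal table that satisfies the INSERT and UPDATE statements would look roughly like this (the column types are my assumption; adjust them to your own setup):

import MySQLdb

def createArticleTable():
    # One-off helper: create a minimal article table compatible with
    # writeIntoDB() and updateArticleById() below (assumed schema).
    DBKWARGS = {'db':'myblog','user':'root','passwd':'root', 'host':'localhost','use_unicode':True,'charset':'utf8'}
    con = MySQLdb.connect(**DBKWARGS)
    cur = con.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS article (
            article_id      INT AUTO_INCREMENT PRIMARY KEY,
            article_title   VARCHAR(255) NOT NULL,
            article_content MEDIUMTEXT
        ) DEFAULT CHARSET = utf8
    """)
    con.commit()
    cur.close()
    con.close()

With that table in place, the crawler itself is below.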

import requests
from bs4 import BeautifulSoup
import MySQLdb
import bs4

def getHTMLText(url):
    # Fetch a page and return its HTML text; return "" on any request failure.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

def crawlArticleByUrl(url):
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    article = soup.find_all('div', class_="article")[0]
    title = article('h1', class_="title")[0].string
    print(title)
    content = article('div', class_="show-content")[0]

    # Fix the img tags: Jianshu wraps each image roughly like
    #   <div class="image-package">
    #     <img data-original-src="//upload-images.jianshu.io/...">
    #     <div class="image-caption">caption, possibly empty</div>
    #   </div>
    # and lazy-loads the real URL from data-original-src, so rebuild a plain
    # <img src="http://..."> tag that works outside Jianshu.
    imgs = content('div', class_="image-package")
    strContent = str(content)
    for img in imgs:
        imgUrl = 'http:' + img('img')[0].attrs['data-original-src']
        caption = img('div', class_='image-caption')[0].string
        new = '<div class="image-package"><img src="' + imgUrl + '">'
        if caption is not None:
            new = new + '<div class="image-caption">' + str(caption) + '</div>'
        new = new + '</div>'
        strContent = strContent.replace(str(img), new)

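    # Strip the outer <div class="show-content" ...> wrapper by fixed character
    # offsets (fragile: tied to the exact markup Jianshu emitted at the time).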
    return title, strContent[48:-7]

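# Insert a freshly crawled article as a new row in the article table.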
def writeIntoDB(title, content):
    DBKWARGS = {'db':'myblog','user':'root','passwd':'root', 'host':'localhost','use_unicode':True,'charset':'utf8'}
    con = MySQLdb.connect(**DBKWARGS)
    cur = con.cursor()
    sql = "insert into article(article_title,article_content) values(%s,%s)"
    lis = (title, content)
    try:
        cur.execute(sql, lis)
    except Exception as e:
        print("Insert Error:", e)
        con.rollback()
    else:
        con.commit()
    cur.close()
    con.close()

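# Overwrite the title and content of an existing row, keyed by article_id.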
def updateArticleById(title, content,id):
    DBKWARGS = {'db':'myblog','user':'root','passwd':'root', 'host':'localhost','use_unicode':True,'charset':'utf8'}
    con = MySQLdb.connect(**DBKWARGS)
    cur = con.cursor()
    sql = "update article set article_title = %s, article_content = %s where article_id = %s;"
    lis = (title, content, id)
    try:
        cur.execute(sql, lis)
    except Exception as e:
        print("Update Error:", e)
        con.rollback()
    else:
        con.commit()
    cur.close()
    con.close()

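# Walk a user's article list page by page, oldest page first, and store every article.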
def GetArticleList(userId):
    url = "http://www.jianshu.com/u/" + userId
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    num = int(soup.find_all('div', class_="meta-block")[2]('p')[0].string) # total number of articles (third meta block on the profile page)
    page = (num + 8) // 9 # number of list pages, 9 articles per page

    for i in range(page):
        newUrl = url + "?order_by=shared_at&page=" + str(page - i)
        newHtml = getHTMLText(newUrl)
        newSoup = BeautifulSoup(newHtml,"html.parser")
        ul = newSoup.find_all('ul', class_="note-list")[0]
        lis = []
        # collect the <li> entries, then walk them in reverse so older articles are written to the database first
        for li in ul.children:
            if isinstance(li,bs4.element.Tag):
                lis.append(li)
        for li in lis[::-1]:
            articleUrl = "http://www.jianshu.com" + (li("a", class_="title")[0].attrs['href'])
            title, content = crawlArticleByUrl(articleUrl)
            writeIntoDB(title, content)

def main():
    #GetArticleList("9f29e0217f4d")
    title, content = crawlArticleByUrl('https://www.jianshu.com/p/326d5e75fa2d')
    #writeIntoDB(title, content)
    updateArticleById(title, content, 513)
    print("Finish!")

if __name__ == '__main__':
    main()
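For a first full import, uncomment the GetArticleList call in main() and pass your own Jianshu user id (the string after /u/ in your profile URL). The updateArticleById call shows how I refresh a single post that already exists in the database; 513 is simply that row's article_id.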