Scraping a Jianshu User's Article Titles, View Counts, and Links with Python
1. In the browser's developer tools (the original used the 360 Speed browser), open the "Network" -> "XHR" tab and scroll the page to discover the pattern behind the paginated request URLs, e.g. https://www.jianshu.com/u/55b597320c4e?order_by=shared_at&page=2
(Screenshot of the Network/XHR panel omitted.)
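You can confirm the pattern with a single request before writing the full crawler. A minimal sketch, assuming the X-Requested-With header mimics the page's own XHR call (on Jianshu at the time of writing, this returned just the article-list fragment rather than a full HTML document):

import requests

headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}
url = 'https://www.jianshu.com/u/55b597320c4e?order_by=shared_at&page=2'
resp = requests.get(url, headers=headers)
print(resp.status_code)   # expect 200
print(resp.text[:200])    # expect <li> entries from the article list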
2. Based on the total number of articles and the number shown per page, build the list of page URLs.
urls = ['https://www.jianshu.com/u/55b597320c4e?order_by=shared_at&page={}'.format(i) for i in range(1, 13)]
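The upper bound 13 is hard-coded for this particular user. To derive it instead, divide the article count by the page size and round up; a sketch in which total_articles and per_page are assumed values you would read off the profile page and the XHR response:

import math

total_articles = 110  # assumption: read from the user's profile page
per_page = 9          # assumption: entries returned per XHR page
last_page = math.ceil(total_articles / per_page)
urls = ['https://www.jianshu.com/u/55b597320c4e?order_by=shared_at&page={}'.format(i)
        for i in range(1, last_page + 1)]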
3. Use the lxml library to extract each article's title, view count, and link, and store them in MongoDB.
The full script:
# -*- coding: utf-8 -*-
import time

import requests
from lxml import etree
import pymongo
from multiprocessing import Pool  # process pool for parallel page fetches (multiprocessing, not threading)

client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
jianshu_user_dy = mydb['jianshu_user_dy']

headers = {
    'X-Requested-With': 'XMLHttpRequest',  # request the XHR fragment instead of the full page
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Referer': 'https://www.jianshu.com/u/9104ebf5e177'
}

def get_infos(url):
    try:
        html = requests.get(url, headers=headers)
        selector = etree.HTML(html.text)
        try:
            links = selector.xpath('//*[@id="list-container"]/ul/li')
            for link in links:
                title = link.xpath('div/a/text()')[0]
                view = link.xpath('div/div/a[1]/text()')[-1].strip()
                title_url = 'https://www.jianshu.com' + link.xpath('div/a/@href')[0]
                print(title, view)
                infos = {
                    'title': title,
                    'url': title_url,
                    'view': view
                }
                jianshu_user_dy.insert_one(infos)
        except IndexError:
            print("No matching content found on this page")
    except requests.ConnectionError:
        print("Request failed for {}".format(url))

urls = ['https://www.jianshu.com/u/55b597320c4e?order_by=shared_at&page={}'.format(i)
        for i in range(1, 13)]

if __name__ == '__main__':
    start = time.time()
    pool = Pool(processes=4)
    pool.map(get_infos, urls)
    pool.close()
    pool.join()
    print("Total time: {:.2f} seconds".format(time.time() - start))