The publish time of a WeChat article cannot be extracted directly with XPath, because it is rendered into the page by JavaScript; instead, we grab it from the inline script with a regular expression.
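For illustration, here is that regex run against a made-up fragment shaped like the inline script found on article pages (the timestamp value is invented):

import re

sample = 'var publish_time = "2018-08-20 11:15" || "";'
match = re.search(r'var publish_time = "(.*?)"', sample)
print(match.group(1) if match else None)  # prints: 2018-08-20 11:15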
from urllib.parse import urlencode

import re
import requests
from requests.exceptions import ConnectionError, ReadTimeout
from lxml import etree
import pymongo
from pymongo.errors import PyMongoError

from config import *

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

baseurl = 'http://weixin.sogou.com/weixin?'
# Paste the Cookie of a logged-in Sogou session here; anonymous clients get
# redirected to the anti-spider page very quickly.
headers = {
    'Cookie': '',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
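# Optional sketch: keep the logged-in Cookie out of the source. This assumes the
# cookie string was exported beforehand as a SOGOU_COOKIE environment variable
# (the variable name is made up for illustration).
import os
headers['Cookie'] = os.environ.get('SOGOU_COOKIE', headers['Cookie'])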
proxy = None  # current proxy, refreshed whenever Sogou answers with a 302

def get_proxy():
    # Ask the proxy pool for one 'host:port' string; None means the pool is empty.
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('proxy pool is empty')
        return None
def get_html(url, count=1):
    print('crawling', url)
    print('trying count', count)
    global proxy
    if count >= MAX_COUNT:
        print('tried too many times')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies, timeout=10)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # A 302 means Sogou has flagged this client; switch to a fresh proxy and retry.
            print('got 302, switching proxy')
            proxy = get_proxy()
            if proxy:
                print('using proxy', proxy)
                return get_html(url, count + 1)
            print('get proxy failed')
            return None
    except (ConnectionError, ReadTimeout):
        proxy = get_proxy()
        count += 1
        return get_html(url, count)
def get_index(keyword, pagenumber):
    # Build the search-results URL for one page; type=2 asks Sogou for articles.
    data = {
        'query': keyword,
        'type': 2,
        'page': pagenumber
    }
    url = baseurl + urlencode(data)
    return get_html(url)
def parse_index(html):
    doc = etree.HTML(html)
    for url in doc.xpath("//div[@class='news-box']/ul/li/div/h3/a/@href"):
        yield url
def get_detail(url):
    # Article pages live on mp.weixin.qq.com, so the Sogou-specific headers are not sent.
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except (ConnectionError, ReadTimeout):
        return None
def parse_detail(html):
    doc = etree.HTML(html)
    # Ordinary articles carry an <h2> title; video pages use a span instead.
    title = doc.xpath("//h2[@class='rich_media_title']/text()")
    if title:
        title = title[0].strip()
    else:
        title = ''.join(doc.xpath("//span[@id='video_title']/text()"))
    nodes = doc.xpath("//div[@id='js_content']")
    content = nodes[0].xpath('string(.)').strip() if nodes else ''
    # The publish time only exists inside an inline <script>, hence the regex.
    match = re.search(r'var publish_time = "(.*?)"', html)
    date = match.group(1) if match else ''
    nickname = doc.xpath("//span[@class='rich_media_meta rich_media_meta_nickname']/a/text()")
    if nickname:
        nickname = nickname[0].strip()
    else:
        nickname = ''.join(doc.xpath("//strong[@class='account_nickname_inner']/text()")).strip()
    wechat = ''.join(doc.xpath("//div[@id='js_profile_qrcode']/div/p[1]/span/text()"))
    return {
        'title': title,
        'content': content,
        'date': date,
        'nickname': nickname,
        'wechat': wechat
    }
def save_to_mongo(item):
    # Upsert on title so a re-crawled article updates the stored copy instead of duplicating it.
    try:
        db['articles'].update_one({'title': item['title']}, {'$set': item}, upsert=True)
        print('saved to mongo')
    except PyMongoError:
        print('save to mongo failed')
def main():
    for page in range(1, 101):
        html = get_index(KEYWORD, page)
        if html:
            for url in parse_index(html):
                article_html = get_detail(url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    save_to_mongo(article_data)

if __name__ == '__main__':
    main()
config.py
KEYWORD = '风景'         # the search keyword ('scenery')
MONGO_URI = 'localhost'
MONGO_DB = 'weixin'
PROXY_POOL_URL = ''      # URL of a proxy pool that returns one host:port per request
MAX_COUNT = 5            # max retries per URL before giving up
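Before launching the full 100-page crawl, it helps to smoke-test the cookie and proxy settings against a single results page. A minimal sketch, assuming the crawler above is saved as weixin_spider.py (that module name is only for illustration):

from weixin_spider import get_index, parse_index

html = get_index('风景', 1)
if html:
    for url in parse_index(html):
        print(url)
else:
    print('blocked or cookie expired; refresh the Cookie header')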