# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request, FormRequest
from scrapy.selector import Selector
import json
import os
from tutorial.items import ZhihuItem
class ZhihuSpider(scrapy.Spider):
    """Crawl the followers of one Zhihu topic and scrape each follower's profile.

    The spider reuses browser cookies stored in a ``cookies.txt`` file located
    next to this module, checks that the session is logged in, pages through
    the topic's follower list and requests the profile page of every follower
    found, yielding one ``ZhihuItem`` per profile.
    """

    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    # Directory containing this module; the cookie file is looked up beside it.
    pwd = os.path.dirname(os.path.realpath(__file__))
    cookiesfilename = os.path.join(pwd, "cookies.txt")
    start_urls = ["https://www.zhihu.com"]
    # Endpoint that is POSTed to repeatedly to page through the followers.
    followers_url = "https://www.zhihu.com/topic/19552832/followers"
    # Amount by which the "offset" form field grows between two pages.
    followers_page_size = 20
    # Item field -> CSS class of the profile element whose @title holds it.
    profile_title_classes = {
        "location": "location item",
        "business": "business item",
        "employment": "employment item",
        "position": "position item",
        "education": "education item",
        "major": "education-extra item",
    }

    def start_requests(self):
        """Yield one request per start URL, each carrying the saved cookies."""
        cookies = self.load_cookies_from_mozilla(self.cookiesfilename)
        # Log only the count: the cookie values are session credentials and
        # should not end up in log files.
        self.logger.info(
            "Loaded %d cookies from %s", len(cookies), self.cookiesfilename
        )
        for url in self.start_urls:
            yield Request(url, cookies=cookies, callback=self.after_login)

    def after_login(self, response):
        """Check that the session is logged in and request the first page.

        Returns:
            ``None`` when the page shows no sign of a logged-in session (an
            interactive Scrapy shell is opened on the response first),
            otherwise the ``FormRequest`` for the first page of followers.
        """
        # The presence of a /people/* link indicates a successful login.
        if not response.xpath('//a[re:test(@href, "/people/")]'):
            self.logger.info("登录失败")
            # Import the submodule explicitly: ``import scrapy`` alone does
            # not guarantee that ``scrapy.shell`` is loaded as an attribute.
            from scrapy.shell import inspect_response

            inspect_response(response, self)
            return None
        self.logger.info("登录成功")

        _xsrf = response.xpath('//*[@name="_xsrf"]/@value').extract_first()
        # Work on a copy so the already-sent request's headers stay untouched.
        headers = response.request.headers.copy()
        headers["X-Xsrftoken"] = _xsrf
        return FormRequest(
            url=self.followers_url,
            headers=headers,
            formdata={"offset": "0"},
            meta={"offset": 0, "headers": headers},
            callback=self.get_followers,
        )

    def get_followers(self, response):
        """Parse one page of followers and schedule the next page.

        The JSON body carries a ``msg`` list; its first element being ``0`` is
        treated as "no more followers" and its last element is parsed as an
        HTML fragment (presumably the first element is the number of followers
        on the page -- confirm against the live API).

        Yields:
            One ``Request`` per follower profile link, followed by the
            ``FormRequest`` for the next page when the current page contained
            at least one ``zm-person-item`` entry.
        """
        msg = json.loads(response.text)["msg"]
        if msg[0] == 0:
            return

        sel = Selector(text=msg[-1])
        for link in sel.xpath('//a[@class="zg-link author-link"]'):
            href = link.xpath("@href").extract_first()
            if not href:
                # A link without a target cannot be turned into a URL.
                continue
            name = link.xpath("text()").extract_first()
            yield Request(
                url="https://www.zhihu.com" + href,
                meta={"name": name, "href": href},
                callback=self.get_about,
            )

        mi_ids = sel.xpath('//*[@class="zm-person-item"]/@id').extract()
        if not mi_ids:
            # Nothing to continue from: the "start" field needs the id of the
            # last listed person, so stop paging instead of raising IndexError.
            return

        offset = response.meta["offset"] + self.followers_page_size
        headers = response.meta["headers"]
        yield FormRequest(
            url=self.followers_url,
            headers=headers,
            formdata={
                "offset": str(offset),
                # The id's text after its last "-" is sent as the cursor.
                "start": mi_ids[-1].split("-")[-1],
            },
            meta={"offset": offset, "headers": headers},
            callback=self.get_followers,
        )

    def get_about(self, response):
        """Build a ``ZhihuItem`` from a follower's profile page.

        ``name`` and ``href`` come from the request meta; the remaining fields
        are read from ``@title`` attributes on the page and are ``None`` when
        the corresponding element is absent. ``gender`` is "male", "female" or
        "unknown" depending on which profile icon is present.
        """
        sel = Selector(text=response.text)
        item = ZhihuItem()
        item["name"] = response.meta["name"]
        item["href"] = response.meta["href"]

        for field, css_class in self.profile_title_classes.items():
            item[field] = sel.xpath(
                '//*[@class="%s"]/@title' % css_class
            ).extract_first()

        if sel.xpath('//*[@class="icon icon-profile-male"]'):
            item["gender"] = "male"
        elif sel.xpath('//*[@class="icon icon-profile-female"]'):
            item["gender"] = "female"
        else:
            item["gender"] = "unknown"
        yield item

    def load_cookies_from_mozilla(self, filename):
        """Read a Mozilla/Netscape-style cookie file into Scrapy cookie dicts.

        The last two fields of every data line are taken as the cookie name
        and value. Blank lines and ``#`` comment lines are skipped, except
        lines starting with ``#HttpOnly_``, which are real cookies in this
        format. Lines containing tabs are split on tabs only, so a cookie
        with an empty value keeps its name; other lines fall back to
        splitting on any whitespace.

        Args:
            filename: Path of the UTF-8 encoded cookie file.

        Returns:
            A list of ``{"name": ..., "value": ...}`` dicts.

        Raises:
            OSError: If the file cannot be opened.
        """
        cookies = []
        with open(filename, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.rstrip("\r\n")
                stripped = line.strip()
                if not stripped:
                    continue
                if stripped.startswith("#") and not stripped.startswith(
                    "#HttpOnly_"
                ):
                    continue
                fields = line.split("\t") if "\t" in line else line.split()
                if len(fields) < 2:
                    # Not enough fields for a name/value pair; ignore the line.
                    continue
                cookies.append({"name": fields[-2], "value": fields[-1]})
        return cookies
Scrapy to zhihu
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- 2015-11-07 14:43:43+0800 [meizitu] DEBUG: Filtered offsit...
- It is really painful if you try to install Scrapy on Wind...
- 听说scrapy已经可以用在python3版本上了,今天下午捣腾了两个小时想安装scrapy,但是却一直显示err...
- 前言 在activity中,经常需要获取view的width和height,但是在onCreate()获取view...