Crawling Zhihu with Scrapy

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request, FormRequest
from scrapy.selector import Selector
import json
import os
from tutorial.items import ZhihuItem


class ZhihuSpider(scrapy.Spider):
    """Crawl the followers of one Zhihu topic and scrape each follower's profile.

    The spider does not log in by itself: it replays cookies stored in a
    ``cookies.txt`` file located next to this module, checks that the home
    page looks logged-in, then pages through the topic's followers endpoint
    and yields one ``ZhihuItem`` per follower profile.
    """

    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    # Directory containing this module; the cookie file is expected beside it.
    pwd = os.path.dirname(os.path.realpath(__file__))
    cookiesfilename = os.path.join(pwd, "cookies.txt")
    start_urls = [
        "https://www.zhihu.com"
    ]
    # Endpoint that is POSTed to for every page of topic followers.
    followers_url = "https://www.zhihu.com/topic/19552832/followers"
    # Amount by which the "offset" form field advances for each page.
    page_size = 20

    # Item field -> class attribute of the profile element whose @title holds it.
    _PROFILE_TITLE_CLASSES = (
        ("location", "location item"),
        ("business", "business item"),
        ("employment", "employment item"),
        ("position", "position item"),
        ("education", "education item"),
        ("major", "education-extra item"),
    )
    # Gender label -> class attribute of the icon that marks it, in check order.
    _GENDER_ICON_CLASSES = (
        ("male", "icon icon-profile-male"),
        ("female", "icon icon-profile-female"),
    )

    def start_requests(self):
        """Yield one request per start URL, carrying the saved cookies."""
        cookies = self.load_cookies_from_mozilla(self.cookiesfilename)
        # Log only the count: the cookie values are live session credentials.
        self.logger.info(
            "Loaded %d cookies from %s", len(cookies), self.cookiesfilename)
        for url in self.start_urls:
            yield Request(url, cookies=cookies, callback=self.after_login)

    def after_login(self, response):
        """Verify the session is logged in and request the first followers page.

        Returns:
            A ``FormRequest`` for the first page of followers, or ``None``
            when the page does not look logged-in (an interactive shell is
            opened on the response in that case).
        """
        # A /people/* link on the page indicates that the login succeeded.
        if not response.xpath('//a[re:test(@href, "/people/")]'):
            self.logger.info("登录失败")
            # Import explicitly, as the Scrapy docs do, rather than relying on
            # `scrapy.shell` being reachable as an attribute of the package.
            from scrapy.shell import inspect_response
            inspect_response(response, self)
            return None
        self.logger.info("登录成功")

        _xsrf = response.xpath('///*[@name="_xsrf"]/@value').extract_first()
        # Work on a copy so the headers of the already-sent request stay intact.
        headers = response.request.headers.copy()
        if _xsrf is None:
            self.logger.warning("No _xsrf token found on %s", response.url)
        else:
            headers["X-Xsrftoken"] = _xsrf
        return FormRequest(
            url=self.followers_url,
            headers=headers,
            formdata={"offset": "0"},
            meta={"offset": 0, "headers": headers},
            callback=self.get_followers)

    def get_followers(self, response):
        """Parse one page of followers and schedule the next page.

        Yields:
            A ``Request`` for each follower profile found on the page, then a
            ``FormRequest`` for the following page when one can be built.
        """
        msg = json.loads(response.text)["msg"]
        # msg[0] == 0 is treated as "nothing more to fetch" (presumably it is
        # the number of followers in this page -- confirm against the API).
        if not msg or msg[0] == 0:
            return
        # The last element of `msg` is the HTML fragment listing the followers.
        sel = Selector(text=msg[-1])

        for link in sel.xpath('//a[@class="zg-link author-link"]'):
            href = link.xpath("@href").extract_first()
            if href is None:
                # Without a target there is no profile page to visit.
                continue
            name = link.xpath('text()').extract_first()
            yield Request(
                url=response.urljoin(href),
                meta={"name": name, "href": href},
                callback=self.get_about)

        mi_ids = sel.xpath('///*[@class="zm-person-item"]/@id').extract()
        if not mi_ids:
            # No person entries means there is no "start" cursor for a next page.
            return
        offset = response.meta["offset"] + self.page_size
        headers = response.meta["headers"]
        yield FormRequest(
            url=self.followers_url,
            headers=headers,
            # "start" is the numeric suffix of the last person element's id.
            formdata={"offset": str(offset), "start": mi_ids[-1].split('-')[-1]},
            meta={"offset": offset, "headers": headers},
            callback=self.get_followers)

    def get_about(self, response):
        """Build a ``ZhihuItem`` from a follower's profile page.

        ``name`` and ``href`` come from the request meta; the remaining fields
        are read from ``title`` attributes on the page and are ``None`` when
        the corresponding element is absent. ``gender`` falls back to
        ``"unknown"`` when neither gender icon is present.
        """
        item = ZhihuItem()
        item["name"] = response.meta["name"]
        item["href"] = response.meta["href"]
        for field, css_class in self._PROFILE_TITLE_CLASSES:
            item[field] = response.xpath(
                '///*[@class="%s"]/@title' % css_class).extract_first()

        item["gender"] = "unknown"
        for gender, css_class in self._GENDER_ICON_CLASSES:
            if response.xpath('///*[@class="%s"]' % css_class):
                item["gender"] = gender
                break

        yield item

    def load_cookies_from_mozilla(self, filename):
        """Read a Mozilla/Netscape ``cookies.txt`` file into Scrapy's cookie list.

        Args:
            filename: Path of the UTF-8 encoded cookie file.

        Returns:
            A list of ``{"name": ..., "value": ...}`` dicts, one per cookie
            line. Blank lines and ``#`` comment lines are skipped; lines
            prefixed with ``#HttpOnly_`` are kept because they are cookies.
        """
        cookies = []
        with open(filename, "r", encoding="utf-8") as f:
            for raw_line in f:
                # Strip only the line ending: a trailing tab marks an empty value.
                line = raw_line.rstrip("\r\n")
                if not line.strip():
                    continue
                if line.startswith("#") and not line.startswith("#HttpOnly_"):
                    continue
                # The standard layout is seven tab-separated fields ending with
                # name and value; otherwise fall back to whitespace splitting.
                fields = line.split("\t")
                if len(fields) < 7:
                    fields = line.split()
                if len(fields) < 2:
                    self.logger.warning(
                        "Skipping malformed cookie line: %r", line)
                    continue
                cookies.append({"name": fields[-2], "value": fields[-1]})
        return cookies
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容