内容提取的源码

# -*- coding: utf-8 -*-
import scrapy
import re

class JobboleSpider(scrapy.Spider):
    """Spider that extracts the metadata and body of one blog.jobbole.com post.

    The only page crawled is the one listed in ``start_urls``; ``parse``
    turns it into a single item dict.
    """

    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/113560/']

    # Every article selector is anchored to the element id of the post in
    # ``start_urls``. NOTE(review): this ties the spider to that single page;
    # derive the id from the response before crawling other posts.
    _POST_XPATH = '//*[@id="post-113560"]'

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or 0 if none.

        ``re.search`` with ``\\d+`` is used instead of the greedy
        ``.*(\\d+).*`` pattern, which backtracks only far enough to leave a
        single digit for the group and therefore returns just the last digit
        of a multi-digit number.
        """
        found = re.search(r"\d+", text)
        return int(found.group()) if found else 0

    def parse(self, response):
        """Extract the article fields from *response* and yield them as a dict.

        Yielded keys:
            title: text of the article heading.
            create_date: publication date text with the "·" separator and
                surrounding whitespace removed.
            praise_nums, fav_nums, comment_nums: like / bookmark / comment
                counters as ints (0 when the page shows no number).
            content: raw HTML of the article body container.
            tags: comma-joined texts of the meta links, excluding the
                comment-count link.

        Missing nodes produce empty strings / zero counters instead of
        raising ``IndexError``.
        """
        post = self._POST_XPATH

        def first(xpath):
            # extract_first("") returns "" rather than failing on no match.
            return response.xpath(xpath).extract_first("")

        title = first(post + '/div[1]/h1/text()')
        create_date = first(post + '/div[2]/p/text()').strip().replace("·", "").strip()

        praise_nums = self._first_int(first(post + '/div[3]/div[3]/span[1]/h10/text()'))
        fav_nums = self._first_int(first(post + '/div[3]/div[3]/span[2]/text()'))
        comment_nums = self._first_int(first(post + '/div[3]/div[3]/a/span/text()'))

        content = first(post + '/div[3]')

        # The meta line mixes tag links with a "N 评论" (comments) link;
        # drop the latter so only real tags remain.
        tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        tags = ",".join(tag_list)

        yield {
            "title": title,
            "create_date": create_date,
            "praise_nums": praise_nums,
            "fav_nums": fav_nums,
            "comment_nums": comment_nums,
            "content": content,
            "tags": tags,
        }

©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成，浏览时请结合常识与多方信息审慎甄别。
平台声明：文章内容（如有图片或视频亦包括在内）由作者上传并发布，文章内容仅代表作者本人观点，简书系信息发布平台，仅提供信息存储服务。

内容提取的源码

内容提取的源码

相关阅读更多精彩内容

友情链接更多精彩内容