搜狗新闻与清华新闻的文章都偏新闻类别;若需要商品类别的文本,则需自行爬取。代码如下(参考 https://www.cnblogs.com/sirkevin/p/5805795.html):
1, spider.py:
import os
import time

import scrapy

from book_project.items import BookItem
class BookInfoSpider(scrapy.Spider):
    """Crawl article pages listed on bzbzc.com and save their text.

    The crawl runs in three stages:

    1. ``parse`` requests the paginated listing pages.
    2. ``parse_page`` follows every article link found on a listing page.
    3. ``parse_text_info`` extracts the article fields, writes the body
       text to a file under ``output_dir`` and yields a ``BookItem``.
    """

    name = "textinfo"
    allowed_domains = ["bzbzc.com"]
    start_urls = [
        "http://www.bzbzc.com/lvxing",
    ]

    # Listing-page URL; ``{0}`` is replaced by the 1-based page number.
    list_url_template = "http://www.bzbzc.com/lvxing/list_61_{0}.html"
    # Number of listing pages to request (pages 1..page_count).
    page_count = 1
    # Directory that receives one text file per scraped article.
    output_dir = "./data"

    def parse(self, response):
        """Yield one request per listing page.

        The start-page response itself is not inspected; it only triggers
        the requests for the paginated listing URLs.
        """
        for page in range(1, self.page_count + 1):
            list_url = self.list_url_template.format(page)
            self.logger.debug("Requesting listing page %s", list_url)
            # dont_filter=True bypasses Scrapy's duplicate-request filter
            # for the listing pages.
            yield scrapy.Request(
                list_url, dont_filter=True, callback=self.parse_page
            )

    def parse_page(self, response):
        """Yield one request per article link found under ``<h2>`` tags."""
        for href in response.xpath('//h2/a/@href').extract():
            # Resolve against the page URL: scrapy.Request rejects
            # relative URLs, while absolute hrefs pass through unchanged.
            yield scrapy.Request(
                response.urljoin(href), callback=self.parse_text_info
            )

    def parse_text_info(self, response):
        """Extract one article, save its text to disk and yield the item.

        ``title``, ``origin`` and ``content`` are stored as the lists of
        strings returned by ``extract()``; ``url`` is the response URL.
        """
        content = response.xpath(
            '//div[@class="article-content"]/p/text()'
        ).extract()
        origin = response.xpath(
            '//div[@class="single-time text-center"]/text()'
        ).extract()
        title = response.xpath(
            '//h1[@class="single-title text-center"]/text()'
        ).extract()

        item = BookItem()
        item['title'] = title
        item['origin'] = origin
        item['content'] = content
        item['url'] = response.url

        # Create the output directory on demand so the first write does
        # not fail when it is missing.
        os.makedirs(self.output_dir, exist_ok=True)
        # Files are named after the current timestamp.
        file_path = os.path.join(
            self.output_dir, "{0}.txt".format(time.time())
        )
        # Explicit UTF-8 so the text does not depend on the platform's
        # default encoding.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(','.join(content))

        yield item
2, run.py(运行此文件 python run.py)
import os
# Launch the "textinfo" spider through the Scrapy command line;
# "-o data.csv" exports the scraped items to data.csv.
crawl_command = 'scrapy crawl textinfo -o data.csv'
os.system(crawl_command)
3, pipelines.py
class BookProjectPipeline(object):
    """Pass-through item pipeline that performs no processing."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        Args:
            item: The scraped item handed to the pipeline.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.
        """
        return item
4, items.py
import scrapy
class BookItem(scrapy.Item):
    """Container for the fields scraped from one article page."""

    # Text of the article's <h1> heading.
    title = scrapy.Field()
    # Text of the "single-time" block on the article page.
    origin = scrapy.Field()
    # Paragraph texts of the article body.
    content = scrapy.Field()
    # URL of the page the item was scraped from.
    url = scrapy.Field()