When the data to be scraped comes from multiple websites and the only thing that changes is the XPath expressions, writing a separate spider for each site is overkill. Instead, a single spider can read each site's URL and XPath from a CSV file:
import csv

import scrapy
from lxml import etree


class FromcsvSpider(scrapy.Spider):
    name = "fromcsv"

    def start_requests(self):
        # Each CSV row holds a target url plus that site's XPath expression;
        # the whole row is passed along in meta so parse() can pick the right XPath.
        with open("xx.csv", "r", encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f)
            for line in reader:
                yield scrapy.Request(line['url'], callback=self.parse, meta={'fields': line})

    def parse(self, response):
        html = response.text
        f_html = ''.join(html.split())  # whitespace-stripped copy, only used for debugging
        # print(f_html)
        html = etree.HTML(html)
        # strip script/noscript/style nodes so they do not pollute the extracted text
        ele = html.xpath('//script | //noscript | //style')
        for e in ele:
            e.getparent().remove(e)
        # print(response.url)
        # look up this site's XPath from the CSV row (column 'record_xpath')
        record = html.xpath(response.meta['fields']['record_xpath'])[0]
        record = record.xpath('string(.)')
        # collapse all whitespace in the extracted text
        record = ''.join(record.split())
        yield {'url': response.url, 'record': record}
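For reference, a minimal xx.csv might look like the sketch below. The column names url and record_xpath are assumptions inferred from how the spider reads each row; any extra columns would simply ride along in meta['fields']. The URLs and XPath expressions are placeholders.

url,record_xpath
https://example.com/news/1,//div[@id='main']
https://example.org/article/2,//article

The spider then runs like any other, e.g. scrapy crawl fromcsv -o records.json. Passing the whole CSV row through meta is what lets one generic spider serve every site; on newer Scrapy versions cb_kwargs is an alternative way to hand the row to the callback.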