scrapy框架的使用

创建一个新的爬虫：
scrapy genspider tencent "tencent.com"

编写items.py
获取职位名称、详情链接、职位类别、招聘人数、工作地点、发布时间：

class TencentItem(scrapy.Item):
    """Container for one row of the job listing table scraped by the spider.

    Each field is filled from one cell of a table row; the column mapping
    below mirrors the XPath expressions used in ``TencentSpider.parse``.
    """

    # Text of the link in the first cell (the position name).
    name = scrapy.Field()
    # ``href`` of the link in the first cell (link to the detail page).
    detailLink = scrapy.Field()
    # Text of the second cell.
    positionInfo = scrapy.Field()
    # Text of the third cell.
    peopleNumber = scrapy.Field()
    # Text of the fourth cell.
    workLocation = scrapy.Field()
    # Text of the fifth cell.
    publishTime = scrapy.Field()
编写tencent.py

tencent.py

from mySpider.items import TencentItem
import scrapy
import re

class TencentSpider(scrapy.Spider):
    """Spider that walks the job listing pages of hr.tencent.com.

    Every response is parsed into ``TencentItem`` objects (one per matched
    table row) and, as long as the page contained rows, a request for the
    next page is scheduled by increasing the ``start`` offset in the URL.
    """

    name = "tencent"
    allowed_domains = ["hr.tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?&start=0#a"
    ]

    # Amount added to the ``start`` query offset to reach the next page.
    page_step = 10

    # (item field, XPath relative to a table row) pairs, in column order.
    _FIELD_XPATHS = (
        ('name', './td[1]/a/text()'),
        ('detailLink', './td[1]/a/@href'),
        ('positionInfo', './td[2]/text()'),
        ('peopleNumber', './td[3]/text()'),
        ('workLocation', './td[4]/text()'),
        ('publishTime', './td[5]/text()'),
    )

    @staticmethod
    def _first_text(row, query):
        """Return the first match of *query* under *row*, UTF-8 encoded.

        An empty byte string is returned when the XPath matches nothing, so
        a row with an empty cell no longer aborts the page with IndexError.
        """
        values = row.xpath(query).extract()
        return values[0].encode('utf-8') if values else b''

    def parse(self, response):
        """Yield one item per listing row, then a request for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``TencentItem`` objects for the pipeline, followed by at most one
            ``scrapy.Request`` for the following page.
        """
        # NOTE(review): only elements with class "even" are selected; rows
        # carrying any other class are skipped - confirm this is intended.
        rows = response.xpath('//*[@class="even"]')

        for each in rows:
            item = TencentItem()
            for field, query in self._FIELD_XPATHS:
                # Values stay UTF-8 encoded, as the JSON pipeline expects.
                item[field] = self._first_text(each, query)
            # Hand the scraped data over to the item pipeline.
            yield item

        # A page without rows ends the crawl; otherwise the offset would
        # keep growing forever.
        if not rows:
            return

        # Only the ``start=`` offset is touched, so other digits in the URL
        # are left alone.
        match = re.search(r'start=(\d+)', response.url)
        if match is None:
            return
        page = int(match.group(1)) + self.page_step
        url = re.sub(r'start=\d+', 'start=%d' % page, response.url, count=1)

        # Queue the next page once per response (not once per row) and
        # parse it with this same callback.
        yield scrapy.Request(url, callback=self.parse)

编写pipelines.py文件
import json

# class ItcastJsonPipeline(object):

class TencentJsonPipeline(object):
    """Item pipeline that stores every item as one JSON object per line.

    The output file ``tencent.json`` is opened when the pipeline is created
    and closed in ``close_spider``.
    """

    def __init__(self):
        # self.file = open('teacher.json', 'wb')
        # Binary mode: process_item() encodes each line to UTF-8 itself, so
        # the bytes on disk do not depend on the platform default encoding.
        self.file = open('tencent.json', 'wb')

    def process_item(self, item, spider):
        """Serialize *item* to a JSON line, write it, and return the item.

        Byte-string values (the spider stores UTF-8 encoded bytes) are
        decoded first because ``json.dumps`` cannot serialize ``bytes`` on
        Python 3; other values are passed through unchanged.

        Args:
            item: mapping-like scraped item; converted with ``dict(item)``.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so that later pipelines can process it.
        """
        record = {}
        for key, value in dict(item).items():
            if isinstance(value, bytes):
                value = value.decode('utf-8')
            record[key] = value

        content = json.dumps(record, ensure_ascii=False) + "\n"
        # The file is binary, so text must be encoded before writing.
        if not isinstance(content, bytes):
            content = content.encode('utf-8')
        self.file.write(content)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

在 settings.py 里设置 ITEM_PIPELINES
# Item pipelines enabled for this project, each mapped to the integer
# Scrapy uses to order them.
# Disabled entries kept for reference:
#   'mySpider.pipelines.SomePipeline': 300,
#   "mySpider.pipelines.ItcastJsonPipeline": 300
ITEM_PIPELINES = {
    "mySpider.pipelines.TencentJsonPipeline": 300,
}
执行爬虫：scrapy crawl tencent

©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成，浏览时请结合常识与多方信息审慎甄别。
平台声明：文章内容（如有图片或视频亦包括在内）由作者上传并发布，文章内容仅代表作者本人观点，简书系信息发布平台，仅提供信息存储服务。

scrapy框架的使用

scrapy框架的使用

tencent.py

class ItcastJsonPipeline(object):

推荐阅读更多精彩内容

友情链接更多精彩内容