效果

序

本次爬取房天下官网的全国658个城市的房源信息，主要是新房与二手房的信息抓取！
Scrapy 使用了Twisted['twɪstɪd]异步网络框架，可以加快我们的下载速度。

文档地址：http://scrapy-chs.readthedocs.io/zh_CN/1.0/intro/overview.html

Scrapy 流程：

其流程可以描述如下：

调度器把requests-->引擎-->下载中间件--->下载器
下载器发送请求，获取响应---->下载中间件---->引擎--->爬虫中间件--->爬虫
爬虫提取url地址，组装成request对象---->爬虫中间件--->引擎--->调度器
爬虫提取数据--->引擎--->管道
管道进行数据的处理和保存

说明

正

创建一个爬虫Scrapy
首先在middlewares文件中，设置随机请求头（因为我只是加了一个请求头信息，所以后期测试的时候ip受到了限制，难受。。。），这个链接可以添加更多的请求头!
随后在settings文件中进行配置

settings

别忘了把robots协议（ROBOTSTXT_OBEY）打开，文明获取数据！
在items文件中定义数据对象！

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

# 定义要获取的信息名称对象
# 新房
class NewHouseItem(scrapy.Item):
    """Container for the fields scraped from one new-home listing."""

    # --- where the development is ---
    # Province (the key is spelled "procince" throughout the project)
    procince = scrapy.Field()
    # City
    city = scrapy.Field()
    # District within the city
    district = scrapy.Field()
    # Address
    address = scrapy.Field()

    # --- what is being offered ---
    # Name of the residential development
    name = scrapy.Field()
    # Room layouts on offer (number of bedrooms)
    room = scrapy.Field()
    # Floor area
    area = scrapy.Field()
    # Sale status
    sale = scrapy.Field()
    # Price
    price = scrapy.Field()

    # --- provenance ---
    # Link to the listing's detail page
    origin_url = scrapy.Field()


# 二手房
class ESFHouseItem(scrapy.Item):
    """Container for the fields scraped from one second-hand home listing."""

    # --- where the home is ---
    # Province (the key is spelled "procince" throughout the project)
    procince = scrapy.Field()
    # City
    city = scrapy.Field()
    # Address
    address = scrapy.Field()
    # Name of the residential development
    name = scrapy.Field()

    # --- description of the home ---
    # Layout (bedrooms / living rooms)
    room = scrapy.Field()
    # Built-up floor area
    area = scrapy.Field()
    # Floor / storey information
    floor = scrapy.Field()
    # Orientation (which way the home faces)
    toward = scrapy.Field()
    # Construction year
    year = scrapy.Field()

    # --- pricing ---
    # Price per unit of area
    unit = scrapy.Field()
    # Total price
    price = scrapy.Field()

    # --- provenance ---
    # Link to the listing's detail page
    origin_url = scrapy.Field()

关键部分

在spiders文件夹中，创建py文件编写代码（已注释）

import scrapy,re
from myspider.items import NewHouseItem,ESFHouseItem
from scrapy_redis.spiders import RedisSpider

class FangSpider(scrapy.Spider):
    """Crawl new-home and second-hand home listings from fang.com.

    The crawl starts at the city index page, derives a new-home URL and a
    second-hand URL for every listed city, and then walks the paginated
    listing pages of each, yielding ``NewHouseItem`` / ``ESFHouseItem``
    objects.
    """

    name = 'fang'  # spider name
    allowed_domains = ['fang.com']  # crawl scope
    # URL the crawl starts from
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        """Parse the city index and schedule the listing pages of each city.

        Args:
            response: response for the city index page in ``start_urls``.

        Yields:
            Two ``scrapy.Request`` objects per city: one handled by
            ``parse_newhouse`` and one by ``parse_esf``. Each carries
            ``meta={"info": (procince, city)}``.
        """
        # A province name appears only on the first row of its group; rows
        # with a blank province cell inherit the last seen value.
        procince = None
        for tr in response.xpath("//div[@class='outCont']//tr"):
            tds = tr.xpath(".//td[not(@class)]")
            if len(tds) < 2:
                # Row without both a province cell and a city cell.
                continue
            procince_text = re.sub(r"\s", "", tds[0].xpath(".//text()").get() or "")
            if procince_text:
                procince = procince_text
            # Skip the overseas group
            if procince == "其它":
                continue
            for city_link in tds[1].xpath(".//a"):
                city = city_link.xpath(".//text()").get()
                city_url = city_link.xpath(".//@href").get()
                if not city_url:
                    continue
                # Split "scheme://domain/" into its scheme and domain parts.
                url_module = city_url.split("//")
                if len(url_module) < 2:
                    continue
                scheme = url_module[0]
                domain = url_module[1]
                # Beijing is a special case with fixed URLs
                if "bj." in domain:
                    newhouse_url = 'https://newhouse.fang.com/house/s/'
                    esf_url = 'https://esf.fang.com/'
                else:
                    # Build the new-home and second-hand URLs for the city
                    newhouse_url = scheme + "//" + "newhouse." + domain + "house/s/"
                    esf_url = scheme + "//" + "esf." + domain

                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                     meta={"info": (procince, city)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                                     meta={"info": (procince, city)})

    def parse_newhouse(self, response):
        """Extract new-home listings from one listing page and follow paging.

        Args:
            response: a new-home listing page; ``response.meta['info']`` must
                hold the ``(procince, city)`` tuple set by ``parse``.

        Yields:
            One ``NewHouseItem`` per listing, then a ``scrapy.Request`` for
            the next page when a "next" link is present.
        """
        procince, city = response.meta.get('info')
        # contains() is used because the class attribute holds several classes
        for li in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name:
                name = name.strip()
            # Strip whitespace from every layout label
            house_type_list = [
                re.sub(r"\s", "", text)
                for text in li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            ]
            # Keep only the room-count labels
            room = [text for text in house_type_list if text.endswith("居")]
            # Floor area: the bare text of the same container, minus separators
            area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            address = li.xpath(".//div[@class='address']/a/@title").get()
            # The district is the bracketed part of the address link text
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            district_match = re.search(r".*\[(.+)\].*", district_text)
            district = district_match.group(1) if district_match else None
            # Sale status
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            # Price, with whitespace and the advertisement marker removed
            price = ''.join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)
            # Detail-page link, made absolute as parse_esf does
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            if origin_url:
                origin_url = response.urljoin(origin_url)

            yield NewHouseItem(name=name, procince=procince, city=city, room=room,
                               area=area, sale=sale, price=price, address=address,
                               district=district, origin_url=origin_url)

        # Next page
        next_url = response.xpath("//div[@class='page']//a[@class='next']//@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_newhouse,
                                 meta={'info': (procince, city)})

    def parse_esf(self, response):
        """Extract second-hand listings from one listing page and follow paging.

        Args:
            response: a second-hand listing page; ``response.meta['info']``
                must hold the ``(procince, city)`` tuple set by ``parse``.

        Yields:
            One ``ESFHouseItem`` per listing, then a ``scrapy.Request`` for
            the next page when a paging link is present.
        """
        procince, city = response.meta.get('info')
        for dl in response.xpath("//div[@class='shop_list shop_list_4']/dl"):
            item = ESFHouseItem(procince=procince, city=city)
            # Development name
            name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
            if name:
                item['name'] = name.strip()
            # Listing facts arrive as unlabelled text fragments; each one is
            # classified by a marker character it contains.
            infors = [
                re.sub(r"\s", "", text)
                for text in dl.xpath(".//p[@class='tel_shop']/text()").getall()
            ]
            for infor in infors:
                if '厅' in infor:
                    item['room'] = infor
                elif '层' in infor:
                    item['floor'] = infor
                elif '向' in infor:
                    item['toward'] = infor
                elif '㎡' in infor:
                    item['area'] = infor
                elif infor:
                    # Any remaining non-empty fragment is stored as the year
                    item['year'] = infor.replace('建', '')
            # Address
            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            # Total price
            item['price'] = ''.join(
                dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall()
            ).replace('\r\n', '').replace(' ', '')
            # Unit price
            item['unit'] = ''.join(
                dl.xpath(".//dd[@class='price_right']/span[2]//text()").getall()
            ).replace('\r\n', '').replace(' ', '')
            # Detail-page link
            detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            if detail_url:
                item['origin_url'] = response.urljoin(detail_url)
            yield item

        # Only follow paging when a link exists; urljoin(None) would just
        # re-request the current page.
        next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf,
                                 meta={'info': (procince, city)})

爬取的数据是存储在字典之中，我们也可以存储在json文件之中
可在管道文件中进行编写代码存储到json文件之中

from scrapy.exporters import JsonLinesItemExporter


# 将获取的信息存入json文件
class MyspiderPipeline(object):
    """Write scraped items to JSON-lines files, one file per item type.

    New-home items go to ``newhouse.json`` and second-hand items go to
    ``esfhouse.json``.
    """

    def __init__(self):
        # Binary mode: the exporter writes encoded bytes to the file object.
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):
        """Export ``item`` to the file that matches its type.

        Args:
            item: the scraped item.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # Dispatch on the item class name so this module does not need to
        # import the item classes; items of any other type pass through
        # without being exported.
        item_type = type(item).__name__
        if item_type == 'NewHouseItem':
            self.newhouse_exporter.export_item(item)
        elif item_type == 'ESFHouseItem':
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider=None):
        """Close both output files.

        Args:
            spider: the spider being closed. Scrapy passes it positionally;
                it defaults to ``None`` so argument-less calls still work.
        """
        self.newhouse_fp.close()
        self.esfhouse_fp.close()

大概这个样子：