First, let's walk through the logic; that's step one of building any scraper: figure out where the things we want actually live on Lianjia's pages.
This is Lianjia's homepage. After picking a city (for example, Shanghai):
Lianjia - Shanghai: https://sh.lianjia.com/
Click 二手房 (second-hand listings), and /ershoufang/ is appended to the URL:
Lianjia - Shanghai - ershoufang: https://sh.lianjia.com/ershoufang/
All listings can then be filtered by district. To make sure not a single little fish slips through the net (we're aiming to be a proper 海王 here), we narrow the filter down.
Clicking 浦东 (Pudong) reveals a second level of sub-districts, and /pudong/ is appended:
Lianjia - Shanghai - ershoufang - Pudong: https://sh.lianjia.com/ershoufang/pudong/
Likewise, clicking 北蔡 (Beicai) underneath appends /beicai/:
Lianjia - Shanghai - ershoufang - Pudong - Beicai: https://sh.lianjia.com/ershoufang/beicai/
Fortunately there are no further levels below that.
So by now we know the listings we want sit at the [Lianjia - Shanghai - ershoufang - Pudong - Beicai] level, each sub-district has n pages of results, and every page needs to be swept.
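In other words, the listing URLs nest like path segments, and (as we'll see in step ② below) each sub-district paginates as .../pg2, .../pg3, and so on. A quick sketch of the pattern using glue (which the script loads anyway); city, area, sub_area and page are just placeholders taken from the example above:

library(glue)
city <- "sh"; area <- "pudong"; sub_area <- "beicai"; page <- 2
glue("https://{city}.lianjia.com/ershoufang/")                     # all of Shanghai
glue("https://{city}.lianjia.com/ershoufang/{area}/")              # Pudong
glue("https://{city}.lianjia.com/ershoufang/{sub_area}/")          # Beicai
glue("https://{city}.lianjia.com/ershoufang/{sub_area}/pg{page}")  # page 2 of Beicai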
So the job breaks down into three steps:
① scrape the sub-districts
② scrape the page counts
③ scrape the listings
With the logic sorted, let's get to it.
print('Initialising >>> clearing workspace and memory...'); rm(list = ls()); gc()
environment_path <- 'D:/Code/lianjia'
setwd(environment_path)
library(bitops)
library(RCurl)
library(XML)
library(plyr)
library(rjson)
library(data.table)
library(stringr)
library(xml2)
library(rvest)
library(glue)
#memory.limit: up to about 4000 MB on 32-bit R, up to 102400 MB on 64-bit
memory.limit(4000)
#Request headers (copy them from Chrome > Inspect > Network > the main document's Request Headers; noting it here so I don't forget next time)
myheader=c(
"user-agent"="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
"path"="/sitemap/spotdis/c0",
"accept"="*/*",
"accept-encoding"="deflate, br",
"accept-language"="zh-CN,zh;q=0.9",
"cache-control"="max-age=0",
"upgrade-insecure-requests"='1',
"scheme"="https"
)
#record the start time
s_time <- Sys.time()
#entry page: Shanghai second-hand (ershoufang) listings
url <- "https://sh.lianjia.com/ershoufang/"
webpage <- getURL(url,httpheader=myheader,.encoding="utf-8")
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
#Level-1 district table (name + link)
district_name <- xpathSApply (pagexml, '//*[@data-role="ershoufang"]//a',xmlValue ) %>% trim()
district_name <- gsub(' ','',district_name)
href <- xpathSApply (pagexml, '//*[@data-role="ershoufang"]//a',xmlGetAttr,name = 'href')
href <- paste0('https://sh.lianjia.com',href)
district_ershoufang_table <- data.table(district_name=district_name,district_href = href)
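#district_ershoufang_table now holds one row per district, e.g. (from the Pudong example above):
#  district_name = "浦东", district_href = "https://sh.lianjia.com/ershoufang/pudong/"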
#Level-2 sub-district table
d=1
district_table_all <- NULL
for (d in d:dim(district_ershoufang_table)[1]) {
url <- district_ershoufang_table$district_href[d]
webpage <- getURL(url,httpheader=myheader,.encoding="utf-8")
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
name <- xpathSApply (pagexml, '//*[@data-role="ershoufang"]//div[2]//a',xmlValue) %>% trim()
name <- gsub(' ','',name)
href <- xpathSApply (pagexml, '//*[@data-role="ershoufang"]//div[2]//a',xmlGetAttr,name = 'href')
href <- paste0('https://sh.lianjia.com',href)
table <- data.table(district_name=name,district_href = href,area = district_ershoufang_table$district_name[d])
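#Lianjia's '不限' ('no filter') entry gets relabelled with the parent district's name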
table$district_name[table$district_name=='不限'] <- district_ershoufang_table$district_name[d]
district_table_all <- rbind(district_table_all,table)
print(table)
}
save(district_table_all,file = 'district_table_ershoufang_all.rda')
load('district_table_ershoufang_all.rda')
#Now for the listings themselves, i.e. scraping every page of every level-2 sub-district.
#This splits into two sub-steps: ① get the total page count per sub-district, ② scrape each page.
#① total page count per sub-district
dis=1
HOUSE_TABLE <- NULL
for (dis in dis:length(district_table_all$district_name)) {
#how many pages does this sub-district have?
url <- district_table_all$district_href[dis]
webpage <- getURL(url,httpheader=myheader,.encoding="utf-8")
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
#The chunk of the page source that holds the total page count looks like this:
#<div class="page-box fr">
#<div class="page-box house-lst-page-box" comp-module='page' page-url="/ershoufang/beicai/pg{page}" page-data='{"totalPage":40,"curPage":1}'></div>
page_all <- xpathSApply (pagexml, '//*[@class="page-box fr"]//div',xmlGetAttr,name = 'page-data')
#If a sub-district has zero listings the xpath below returns nothing (and the code after it would error);
#if there are listings we get a string like {"totalPage":40,"curPage":1},
#so guard with an if and pull the total page count out with a look-around regex:
#a(?=b) matches an 'a' followed by 'b'; a(?!b) matches an 'a' not followed by 'b';
#(?<=b)a matches an 'a' preceded by 'b'; (?<!b)a matches an 'a' not preceded by 'b'.
if(length(page_all)==0) {
next
} else {
page_all <- str_extract(page_all,"(?<=:)[0-9]*(?=,)")
}
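#quick check of the look-around pattern against the sample string above (run interactively):
#  str_extract('{"totalPage":40,"curPage":1}', "(?<=:)[0-9]*(?=,)")   # "40"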
page = 1
for (page in page:page_all) {
page_url <-paste0(district_table_all$district_href[dis],'pg',page)
webpage <- getURL(page_url,httpheader=myheader,.encoding="utf-8")
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
title <- xpathSApply (pagexml, '//*[@class="info clear"]//*[@class="title"]//a',xmlValue) %>% trim()
# title <- str_conv(title,'utf-8')
house_link <- xpathSApply (pagexml, '//*[@class="info clear"]//*[@class="title"]//a',xmlGetAttr,name = 'href') %>% trim()
id <- gsub('https://sh.lianjia.com/ershoufang/|.html','',house_link)
#community (小区) name
community <- xpathSApply (pagexml, '//*[@class="flood"]//*[@class="positionInfo"]//a[1]',xmlValue) %>% trim()
community_link <- xpathSApply (pagexml, '//*[@class="flood"]//*[@class="positionInfo"]//a[1]',xmlGetAttr,name='href') %>% trim()
#sub-area ('circle') name (these are Lianjia's own zones, not ours)
circle <- xpathSApply (pagexml, '//*[@class="flood"]//*[@class="positionInfo"]//a[2]',xmlValue) %>% trim()
circle_link <- xpathSApply (pagexml, '//*[@class="flood"]//*[@class="positionInfo"]//a[2]',xmlGetAttr,name='href') %>% trim()
#listing info
#the scraped string looks like this: "2室2厅|99.25平米|南西|精装|中楼层(共7层)|1999年建|板楼"
info <- xpathSApply (pagexml, '//*[@class="address"]//div',xmlValue) %>% trim()
info <- gsub(' ','',info)
#now dig each field out of it one by one!
#R regex notes
#[:print:]{1,2}+[室]+[:print:]{1}+[厅]+[:print:]{1}+[卫]
#[:print:] any printable character; [:digit:] any single digit
#[\u4e00-\u9fa5] any Chinese character
#[] matches any one of the characters inside the brackets ([0-2] and [012] are equivalent; [Rr] matches R or r)
#{} how many times the preceding character/expression repeats, e.g. {5,12} means at least 5 and at most 12 times
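#illustrative check of the patterns against the sample string above (run interactively):
#  info_example <- "2室2厅|99.25平米|南西|精装|中楼层(共7层)|1999年建|板楼"
#  str_extract(info_example, "[:digit:]{1}+[室]+[:digit:]{1}+[厅]")   # "2室2厅"
#  str_extract(info_example, "[共][:digit:]{1,}+[层]")                # "共7层"
#  str_extract(info_example, "[:digit:]{1,}+[年建]")                  # "1999年"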
area_count_int <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1}+[室]+[:digit:]{1}+[厅]")[[x]]}))
roomcount <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1}+[室]")[[x]]}))
livingcount <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1}+[厅]")[[x]]}))
area <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1,}[.]{0,1}[:digit:]{0,}平米")[[x]]}))
floor <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[\u4e00-\u9fa5]{1}+[楼]+[层]")[[x]]}))
height <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[共][:digit:]{1,}+[层]")[[x]]}))
year <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1,}+[年建]")[[x]]}))
type <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[\u4e00-\u9fa5]{1}+[楼]\\b")[[x]]}))
if(length(type)==0){type <- 'other'}
if(length(floor)==0){floor <- 'other'}
totalprice <- xpathSApply (pagexml, '//*[@class="totalPrice"]//span',xmlValue) %>% trim()
unitprice <- xpathSApply (pagexml, '//*[@class="priceInfo"]//div[2]',xmlGetAttr,name='data-price') %>% trim()
district <- district_table_all$area[dis]
house_table <- data.table(title=title,
id=id,
house_link=house_link,
district=district,
circle=circle,
circle_link=circle_link,
community=community,
community_link=community_link,
area=area,
room=roomcount,
living=livingcount,
height=height,
floor=floor,
year=year,
type=type,
totalprice=totalprice,
unitprice=unitprice)
HOUSE_TABLE <- rbind(HOUSE_TABLE,house_table)
print(house_table)
Sys.sleep(runif(1,0,3))
#runif(1, 0, 3): one random pause of 0-3 seconds per page (I got my IP banned once and learned my lesson... slower is fine... though honestly it is painfully slow, and an upper bound of about 2 is probably enough)
}
save.image(file = '链家租房房源_列表_202002.RData')
}
HOUSE_TABLE_unique <- unique(HOUSE_TABLE,by=c('id'))
save(HOUSE_TABLE_unique,file = paste0('HOUSE_TABLE_ershou_unique',Sys.Date(),'.rda'))
③ Scrape the listing details, i.e. the full content behind every link collected in the previous step.
The code below scrapes the details of every community (小区) on Lianjia Chengdu;
the principle is exactly the same, and this is the most thoroughly revised version of the script.
It is extremely memory-hungry and I still haven't worked out why. While revising it I cut down on rbind and cleared unused objects aggressively, but something still feels wrong: roughly 0.5 MB of memory per record scraped? I'll update this once someone more expert has had a look.
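If the repeated rbind turns out to be the culprit (each rbind copies the whole accumulated table), the usual workaround is to collect every batch into a list and bind once at the end with data.table::rbindlist. A minimal sketch of that idea, not a verified fix for this script; scrape_batch and n_batches are hypothetical stand-ins for the 50-link batch loop below:

batches <- vector("list", n_batches)     # n_batches: however many 50-link batches there are
for (i in seq_len(n_batches)) {
  batches[[i]] <- scrape_batch(i)        # hypothetical helper returning one data.table per batch
}
Detail_table <- data.table::rbindlist(batches)   # one bind at the end instead of a copy per batch

Anyway, here is the script as it currently stands.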
print('Initialising >>> clearing workspace and memory...'); rm(list = ls()); gc()
library(bitops)
library(RCurl)
library(XML)
library(plyr)
library(rjson)
library(data.table)
library(stringr)
library(xml2)
library(rvest)
library(glue)
environment_path <- 'D:/Code/lianjia'
setwd(environment_path)
myheader=c(
"user-agent"="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
"path"="/sitemap/spotdis/c0",
"accept"="*/*",
# "accept"="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"accept-encoding"="deflate, br",
"accept-language"="zh-CN,zh;q=0.9",
"cache-control"="max-age=0",
"upgrade-insecure-requests"='1',
# "authority"="brand.yoka.com/brandlist.htm",
"scheme"="https"
)
s_time <- Sys.time()
load('D:/Code/lianjia/xiaoqu_table_chengdu_unique2020-02-19.rda')
id=XIAOQU_TABLE_unique$id
link=XIAOQU_TABLE_unique$xiaoqu_link
rm(XIAOQU_TABLE_unique)
gc()
i= 1
Detail_table <- NULL
total=length(id)
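#work through the links in batches of 50: each batch is bound into one data.table, with a short random pause per link and a longer pause plus gc() between batches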
for (i in i:(round(total/50)+1)) {
d = (i-1)*50+1
Lng <- NULL
Lat <- NULL
ID <- NULL
Building <- NULL
Family <- NULL
Address <- NULL
for (d in d:(i*50)){
if(d<=total) {
xiaoqu_id <- id[d]
url <- link[d]
webpage <- getURL(url,httpheader=myheader,.encoding="utf-8")
msg.load <- tryCatch({
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
building <- xpathSApply (pagexml, '//*[@class="xiaoquInfo"]//div[6]//span[2]',xmlValue) %>% trim()
family <- xpathSApply (pagexml, '//*[@class="xiaoquInfo"]//div[7]//span[2]',xmlValue) %>% trim()
address <- xpathSApply (pagexml, '//*[@class="detailHeader fl"]//div',xmlValue) %>% trim()
address <- gsub('\\([\u4e00-\u9fa5]{1,}\\)','',address)
information <- xpathSApply (pagexml, '//script',xmlValue)[22]
location_int <- unlist(str_extract_all(information,"[resblockPosition]+[:']+[:digit:]{1,}+[.]+[:digit:]{1,}+[,]+[:digit:]{1,}+[.]+[:digit:]{1,}"))
location_int <- gsub("resblockPosition:'","",location_int)
location_int <- str_split(location_int,',') %>% unlist()
lng <- location_int[1]
lat <- location_int[2]
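#illustrative check of the coordinate regex (the numbers here are made up):
#  str_extract_all("resblockPosition:'104.06,30.67'",
#                  "[resblockPosition]+[:']+[:digit:]{1,}+[.]+[:digit:]{1,}+[,]+[:digit:]{1,}+[.]+[:digit:]{1,}")
#  # -> "resblockPosition:'104.06,30.67", then strip the prefix and split on ','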
msg.load <- "true"
}, error = function(e) {
"error"
}
)
if(msg.load=='error'){
lng <- NA
lat <- NA
xiaoqu_id <- xiaoqu_id
building <- NA
family <- NA
address <- NA
}
Lng <- c(Lng,lng)
Lat <- c(Lat,lat)
ID <- c(ID,xiaoqu_id)
Building <- c(Building,building)
Family <- c(Family,family)
Address <- c(Address,address)
Sys.sleep(runif(1,0,1))
}
else{
break
}
}
detail_table <- data.table(xiaoquid=ID,
lng=Lng,
lat=Lat,
address = Address,
building_count = Building,
family_count = Family)
Detail_table <- rbind(Detail_table,detail_table)
print(detail_table)
Sys.sleep(runif(1,0,5))
gc()
}
save(Detail_table,file = paste0('xiaoqu_table_chengdu_detail_',Sys.Date(),'.rda'))