First, let's walk through the logic; that's step one of building any scraper: figure out where the things we want actually live on Lianjia's pages.
This is Lianjia's homepage. After picking a city (for example, Shanghai):
Lianjia - Shanghai: https://sh.lianjia.com/
Click 二手房 (second-hand listings), and /ershoufang/ is appended to the URL:
Lianjia - Shanghai - ershoufang: https://sh.lianjia.com/ershoufang/
All listings can then be filtered by district. To make sure not a single little fish slips through the net (we're aiming to be a proper 海王 here), we narrow the filter down.
Clicking 浦东 (Pudong) reveals a second level of sub-districts, and /pudong/ is appended:
Lianjia - Shanghai - ershoufang - Pudong: https://sh.lianjia.com/ershoufang/pudong/
Likewise, clicking 北蔡 (Beicai) underneath appends /beicai/:
Lianjia - Shanghai - ershoufang - Pudong - Beicai: https://sh.lianjia.com/ershoufang/beicai/
Fortunately there are no further levels below that.
So by now we know the listings we want sit at the [Lianjia - Shanghai - ershoufang - Pudong - Beicai] level, each sub-district has n pages of results, and every page needs to be swept.
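In other words, the listing URLs nest like path segments, and (as we'll see in step ② below) each sub-district paginates as .../pg2, .../pg3, and so on. A quick sketch of the pattern using glue (which the script loads anyway); city, area, sub_area and page are just placeholders taken from the example above:

library(glue)
city <- "sh"; area <- "pudong"; sub_area <- "beicai"; page <- 2
glue("https://{city}.lianjia.com/ershoufang/")                     # all of Shanghai
glue("https://{city}.lianjia.com/ershoufang/{area}/")              # Pudong
glue("https://{city}.lianjia.com/ershoufang/{sub_area}/")          # Beicai
glue("https://{city}.lianjia.com/ershoufang/{sub_area}/pg{page}")  # page 2 of Beicai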
So the job breaks down into three steps:
① scrape the sub-districts
② scrape the page counts
③ scrape the listings
With the logic sorted, let's get to it.
print('Initialising >>> clearing workspace and memory...'); rm(list = ls()); gc()
environment_path <- 'D:/Code/lianjia'
setwd(environment_path)
library(bitops)
library(RCurl)
library(XML)
library(plyr)
library(rjson)
library(data.table)
library(stringr)
library(xml2)
library(rvest)
library(glue)
#memory.limit: up to about 4000 MB on 32-bit R, up to 102400 MB on 64-bit
memory.limit(4000)
#Request headers (copy them from Chrome > Inspect > Network > the main document's Request Headers; noting it here so I don't forget next time)
myheader=c(
"user-agent"="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
"path"="/sitemap/spotdis/c0",
"accept"="*/*",
"accept-encoding"="deflate, br",
"accept-language"="zh-CN,zh;q=0.9",
"cache-control"="max-age=0",
"upgrade-insecure-requests"='1',
"scheme"="https"
)
#record the start time
s_time <- Sys.time()
#entry page: Shanghai second-hand (ershoufang) listings
url <- "https://sh.lianjia.com/ershoufang/"
webpage <- getURL(url,httpheader=myheader,.encoding="utf-8")
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
#Level-1 district table (name + link)
district_name <- xpathSApply (pagexml, '//*[@data-role="ershoufang"]//a',xmlValue ) %>% trim()
district_name <- gsub(' ','',district_name)
href <- xpathSApply (pagexml, '//*[@data-role="ershoufang"]//a',xmlGetAttr,name = 'href')
href <- paste0('https://sh.lianjia.com',href)
district_ershoufang_table <- data.table(district_name=district_name,district_href = href)
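#district_ershoufang_table now holds one row per district, e.g. (from the Pudong example above):
#  district_name = "浦东", district_href = "https://sh.lianjia.com/ershoufang/pudong/"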
#Level-2 sub-district table
d=1
district_table_all <- NULL
for (d in d:dim(district_ershoufang_table)[1]) {
url <- district_ershoufang_table$district_href[d]
webpage <- getURL(url,httpheader=myheader,.encoding="utf-8")
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
name <- xpathSApply (pagexml, '//*[@data-role="ershoufang"]//div[2]//a',xmlValue) %>% trim()
name <- gsub(' ','',name)
href <- xpathSApply (pagexml, '//*[@data-role="ershoufang"]//div[2]//a',xmlGetAttr,name = 'href')
href <- paste0('https://sh.lianjia.com',href)
table <- data.table(district_name=name,district_href = href,area = district_ershoufang_table$district_name[d])
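#Lianjia's '不限' ('no filter') entry gets relabelled with the parent district's name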
table$district_name[table$district_name=='不限'] <- district_ershoufang_table$district_name[d]
district_table_all <- rbind(district_table_all,table)
print(table)
}
save(district_table_all,file = 'district_table_ershoufang_all.rda')
load('district_table_ershoufang_all.rda')
#Now for the listings themselves, i.e. scraping every page of every level-2 sub-district.
#This splits into two sub-steps: ① get the total page count per sub-district, ② scrape each page.
#① total page count per sub-district
dis=1
HOUSE_TABLE <- NULL
for (dis in dis:length(district_table_all$district_name)) {
#how many pages does this sub-district have?
url <- district_table_all$district_href[dis]
webpage <- getURL(url,httpheader=myheader,.encoding="utf-8")
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
#The chunk of the page source that holds the total page count looks like this:
#<div class="page-box fr">
#<div class="page-box house-lst-page-box" comp-module='page' page-url="/ershoufang/beicai/pg{page}" page-data='{"totalPage":40,"curPage":1}'></div>
page_all <- xpathSApply (pagexml, '//*[@class="page-box fr"]//div',xmlGetAttr,name = 'page-data')
#If a sub-district has zero listings the xpath below returns nothing (and the code after it would error);
#if there are listings we get a string like {"totalPage":40,"curPage":1},
#so guard with an if and pull the total page count out with a look-around regex:
#a(?=b) matches an 'a' followed by 'b'; a(?!b) matches an 'a' not followed by 'b';
#(?<=b)a matches an 'a' preceded by 'b'; (?<!b)a matches an 'a' not preceded by 'b'.
if(length(page_all)==0) {
next
} else {
page_all <- str_extract(page_all,"(?<=:)[0-9]*(?=,)")
}
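#quick check of the look-around pattern against the sample string above (run interactively):
#  str_extract('{"totalPage":40,"curPage":1}', "(?<=:)[0-9]*(?=,)")   # "40"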
page = 1
for (page in page:page_all) {
page_url <-paste0(district_table_all$district_href[dis],'pg',page)
webpage <- getURL(page_url,httpheader=myheader,.encoding="utf-8")
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
title <- xpathSApply (pagexml, '//*[@class="info clear"]//*[@class="title"]//a',xmlValue) %>% trim()
# title <- str_conv(title,'utf-8')
house_link <- xpathSApply (pagexml, '//*[@class="info clear"]//*[@class="title"]//a',xmlGetAttr,name = 'href') %>% trim()
id <- gsub('https://sh.lianjia.com/ershoufang/|.html','',house_link)
#community (小区) name
community <- xpathSApply (pagexml, '//*[@class="flood"]//*[@class="positionInfo"]//a[1]',xmlValue) %>% trim()
community_link <- xpathSApply (pagexml, '//*[@class="flood"]//*[@class="positionInfo"]//a[1]',xmlGetAttr,name='href') %>% trim()
#sub-area ('circle') name (these are Lianjia's own zones, not ours)
circle <- xpathSApply (pagexml, '//*[@class="flood"]//*[@class="positionInfo"]//a[2]',xmlValue) %>% trim()
circle_link <- xpathSApply (pagexml, '//*[@class="flood"]//*[@class="positionInfo"]//a[2]',xmlGetAttr,name='href') %>% trim()
#listing info
#the scraped string looks like this: "2室2厅|99.25平米|南西|精装|中楼层(共7层)|1999年建|板楼"
info <- xpathSApply (pagexml, '//*[@class="address"]//div',xmlValue) %>% trim()
info <- gsub(' ','',info)
#now dig each field out of it one by one!
#R regex notes
#[:print:]{1,2}+[室]+[:print:]{1}+[厅]+[:print:]{1}+[卫]
#[:print:] any printable character; [:digit:] any single digit
#[\u4e00-\u9fa5] any Chinese character
#[] matches any one of the characters inside the brackets ([0-2] and [012] are equivalent; [Rr] matches R or r)
#{} how many times the preceding character/expression repeats, e.g. {5,12} means at least 5 and at most 12 times
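#illustrative check of the patterns against the sample string above (run interactively):
#  info_example <- "2室2厅|99.25平米|南西|精装|中楼层(共7层)|1999年建|板楼"
#  str_extract(info_example, "[:digit:]{1}+[室]+[:digit:]{1}+[厅]")   # "2室2厅"
#  str_extract(info_example, "[共][:digit:]{1,}+[层]")                # "共7层"
#  str_extract(info_example, "[:digit:]{1,}+[年建]")                  # "1999年"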
area_count_int <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1}+[室]+[:digit:]{1}+[厅]")[[x]]}))
roomcount <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1}+[室]")[[x]]}))
livingcount <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1}+[厅]")[[x]]}))
area <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1,}[.]{0,1}[:digit:]{0,}平米")[[x]]}))
floor <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[\u4e00-\u9fa5]{1}+[楼]+[层]")[[x]]}))
height <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[共][:digit:]{1,}+[层]")[[x]]}))
year <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[:digit:]{1,}+[年建]")[[x]]}))
type <- unlist(lapply(1:length(info), function(x) {str_extract_all(info,"[\u4e00-\u9fa5]{1}+[楼]\\b")[[x]]}))
if(length(type)==0){type <- 'other'}
if(length(floor)==0){floor <- 'other'}
totalprice <- xpathSApply (pagexml, '//*[@class="totalPrice"]//span',xmlValue) %>% trim()
unitprice <- xpathSApply (pagexml, '//*[@class="priceInfo"]//div[2]',xmlGetAttr,name='data-price') %>% trim()
district <- district_table_all$area[dis]
house_table <- data.table(title=title,
id=id,
house_link=house_link,
district=district,
circle=circle,
circle_link=circle_link,
community=community,
community_link=community_link,
area=area,
room=roomcount,
living=livingcount,
height=height,
floor=floor,
year=year,
type=type,
totalprice=totalprice,
unitprice=unitprice)
HOUSE_TABLE <- rbind(HOUSE_TABLE,house_table)
print(house_table)
Sys.sleep(runif(1,0,3))
#runif(1, 0, 3): one random pause of 0-3 seconds per page (I got my IP banned once and learned my lesson... slower is fine... though honestly it is painfully slow, and an upper bound of about 2 is probably enough)
}
save.image(file = '链家租房房源_列表_202002.RData')
}
HOUSE_TABLE_unique <- unique(HOUSE_TABLE,by=c('id'))
save(HOUSE_TABLE_unique,file = paste0('HOUSE_TABLE_ershou_unique',Sys.Date(),'.rda'))
③ Scrape the listing details, i.e. the full content behind every link collected in the previous step.
The code below scrapes the details of every community (小区) on Lianjia Chengdu;
the principle is exactly the same, and this is the most thoroughly revised version of the script.
It is extremely memory-hungry and I still haven't worked out why. While revising it I cut down on rbind and cleared unused objects aggressively, but something still feels wrong: roughly 0.5 MB of memory per record scraped? I'll update this once someone more expert has had a look.
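If the repeated rbind turns out to be the culprit (each rbind copies the whole accumulated table), the usual workaround is to collect every batch into a list and bind once at the end with data.table::rbindlist. A minimal sketch of that idea, not a verified fix for this script; scrape_batch and n_batches are hypothetical stand-ins for the 50-link batch loop below:

batches <- vector("list", n_batches)     # n_batches: however many 50-link batches there are
for (i in seq_len(n_batches)) {
  batches[[i]] <- scrape_batch(i)        # hypothetical helper returning one data.table per batch
}
Detail_table <- data.table::rbindlist(batches)   # one bind at the end instead of a copy per batch

Anyway, here is the script as it currently stands.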
print('Initialising >>> clearing workspace and memory...'); rm(list = ls()); gc()
library(bitops)
library(RCurl)
library(XML)
library(plyr)
library(rjson)
library(data.table)
library(stringr)
library(xml2)
library(rvest)
library(glue)
environment_path <- 'D:/Code/lianjia'
setwd(environment_path)
myheader=c(
"user-agent"="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
"path"="/sitemap/spotdis/c0",
"accept"="*/*",
# "accept"="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"accept-encoding"="deflate, br",
"accept-language"="zh-CN,zh;q=0.9",
"cache-control"="max-age=0",
"upgrade-insecure-requests"='1',
# "authority"="brand.yoka.com/brandlist.htm",
"scheme"="https"
)
s_time <- Sys.time()
load('D:/Code/lianjia/xiaoqu_table_chengdu_unique2020-02-19.rda')
id=XIAOQU_TABLE_unique$id
link=XIAOQU_TABLE_unique$xiaoqu_link
rm(XIAOQU_TABLE_unique)
gc()
i= 1
Detail_table <- NULL
total=length(id)
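#work through the links in batches of 50: each batch is bound into one data.table, with a short random pause per link and a longer pause plus gc() between batches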
for (i in i:(round(total/50)+1)) {
d = (i-1)*50+1
Lng <- NULL
Lat <- NULL
ID <- NULL
Building <- NULL
Family <- NULL
Address <- NULL
for (d in d:(i*50)){
if(d<=total) {
xiaoqu_id <- id[d]
url <- link[d]
webpage <- getURL(url,httpheader=myheader,.encoding="utf-8")
msg.load <- tryCatch({
pagexml <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE)
building <- xpathSApply (pagexml, '//*[@class="xiaoquInfo"]//div[6]//span[2]',xmlValue) %>% trim()
family <- xpathSApply (pagexml, '//*[@class="xiaoquInfo"]//div[7]//span[2]',xmlValue) %>% trim()
address <- xpathSApply (pagexml, '//*[@class="detailHeader fl"]//div',xmlValue) %>% trim()
address <- gsub('\\([\u4e00-\u9fa5]{1,}\\)','',address)
information <- xpathSApply (pagexml, '//script',xmlValue)[22]
location_int <- unlist(str_extract_all(information,"[resblockPosition]+[:']+[:digit:]{1,}+[.]+[:digit:]{1,}+[,]+[:digit:]{1,}+[.]+[:digit:]{1,}"))
location_int <- gsub("resblockPosition:'","",location_int)
location_int <- str_split(location_int,',') %>% unlist()
lng <- location_int[1]
lat <- location_int[2]
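#illustrative check of the coordinate regex (the numbers here are made up):
#  str_extract_all("resblockPosition:'104.06,30.67'",
#                  "[resblockPosition]+[:']+[:digit:]{1,}+[.]+[:digit:]{1,}+[,]+[:digit:]{1,}+[.]+[:digit:]{1,}")
#  # -> "resblockPosition:'104.06,30.67", then strip the prefix and split on ','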
msg.load <- "true"
}, error = function(e) {
"error"
}
)
if(msg.load=='error'){
lng <- NA
lat <- NA
xiaoqu_id <- xiaoqu_id
building <- NA
family <- NA
address <- NA
}
Lng <- c(Lng,lng)
Lat <- c(Lat,lat)
ID <- c(ID,xiaoqu_id)
Building <- c(Building,building)
Family <- c(Family,family)
Address <- c(Address,address)
Sys.sleep(runif(1,0,1))
}
else{
break
}
}
detail_table <- data.table(xiaoquid=ID,
lng=Lng,
lat=Lat,
address = Address,
building_count = Building,
family_count = Family)
Detail_table <- rbind(Detail_table,detail_table)
print(detail_table)
Sys.sleep(runif(1,0,5))
gc()
}
save(Detail_table,file = paste0('xiaoqu_table_chengdu_detail_',Sys.Date(),'.rda'))