########################################################
#-------------------------------------------------------
# Topic:模拟浏览器访问alibaba爬虫
# Author:
# Date:Sun Mar 08 19:00:35 2020
# Mail:
#-------------------------------------------------------
########################################################
#-------------------------------------------------------
#Function1:从百度进入Alibaba的主页
#-------------------------------------------------------
library(RSelenium)
library(rvest)
library(stringr)
# Attach to a Selenium server (Chrome) already running on localhost:4445.
remDr <- remoteDriver(browserName = "chrome",
                      remoteServerAddr = "localhost",
                      port = 4445L)
remDr$open()
remDr$navigate("http://www.baidu.com")
# Type the query into Baidu's search box and submit with Enter.
baidu_box <- remDr$findElement(using = "xpath", '//*[@id="kw"]')
remDr$mouseMoveToLocation(webElement = baidu_box)
baidu_box$sendKeysToElement(list("阿里巴巴", key = "enter"))
# Click the second organic result.  NOTE(review): this XPath matches the
# result layout at the time of writing; a later search may rank differently.
result_link <- remDr$findElement(using = "xpath", '//*[@id="2"]/h3/a')
remDr$mouseMoveToLocation(webElement = result_link)
remDr$click()
#-------------------------------------------------------
# Function2: run a product search on alibaba.com
#-------------------------------------------------------
# `myswitch` is not part of RSelenium and was never defined in this file,
# so the original script crashed here with "could not find function
# 'myswitch'".  This is the standard workaround for switchToWindow()
# failing against newer Selenium servers: POST the window handle directly.
myswitch <- function(remDr, windowId) {
  qpath <- sprintf("%s/session/%s/window",
                   remDr$serverURL, remDr$sessionInfo[["id"]])
  remDr$queryRD(qpath, "POST", qdata = list(handle = windowId))
}
# The click on the Baidu result opened Alibaba in a new window: switch to it.
myswitch(remDr, remDr$getWindowHandles()[[2]])
remDr$getCurrentUrl()
# Type the product query into Alibaba's search box and submit.
ali_search <- '//*[@id="J_SC_header"]/header/div[2]/div[3]/div/div/form/div[2]/input'
ali_search <- remDr$findElement(using = "xpath", ali_search)
ali_search$sendKeysToElement(list("25mm mink eyelashes", key = "enter"))
# Switch the result page to list view.
list_vis <- '//*[@id="root"]/div/div[3]/div[1]/div[2]/div[2]/div[2]/a[2]'
list_vis <- remDr$findElement("xpath", list_vis)
remDr$mouseMoveToLocation(webElement = list_vis)
remDr$click()
#-------------------------------------------------------
#Function3: crawl the information found by the search
#-------------------------------------------------------
# Use rvest to parse the pages rendered by the browser.
library(rvest) # already attached at the top of the file; harmless repeat
# XPaths of the per-item fields scraped from every gallery page.
# FIX: the original used `<-` inside list() (list(title<-'...')), which
# produces an *unnamed* list and leaks `title`/`price` into the global
# environment; `=` builds the intended named list.
info_list <- list(title = '//p[@class="organic-gallery-title__content"]',
                  price = '//p[@class="gallery-offer-price"]')
# Ladder-price rows on a product detail page (used only by the disabled
# detail-crawl code inside the loop below).
ranges <- '//ul[contains(@class,"ma-ladder-price ma-ladder-price-count-")]/li'
detail_url_dir <- '//div[@class="organic-gallery-offer-section__title"]/h4/a'
# Page-number input box and its "go" button on the result list.
input_num <- '//*[@id="root"]/div/div[3]/div/div[2]/input'
input_go <- '//*[@id="root"]/div/div[3]/div/div[2]/span[2]'
# Anchor that carries each product's (protocol-relative) URL.
sub_dir_url <- '//a[contains(@class,"organic-gallery-title")]'
#current_url1<-remDr$getCurrentUrl()
remDr$navigate("https://www.alibaba.com/products/25mm_mink_eyelashes.html?spm=a2700.galleryofferlist.0.0.3615221bsbyKgv&IndexArea=product_en&viewtype=G")
# Result containers filled by the crawl loop below.
detail_info_list <- list()
information_all <- list()
sub_dir <- character(0)
url_all <- list()
for (i in 1:100) {
  # Jump to result page i by typing the page number and clicking "go".
  input_num1 <- remDr$findElement("xpath", input_num)
  input_num1$sendKeysToElement(list(as.character(i)))
  input_go1 <- remDr$findElement("xpath", input_go)
  remDr$mouseMoveToLocation(webElement = input_go1)
  remDr$click()
  # Scroll to the bottom repeatedly until the document height stops
  # growing, so lazily-loaded items are rendered before parsing.
  last_height <- 0
  repeat {
    remDr$executeScript("window.scrollTo(0,document.body.scrollHeight);", list(remDr$findElement("css", "body")))
    Sys.sleep(1)
    new_height <- remDr$executeScript("return document.body.scrollHeight", list(remDr$findElement("css", "body")))
    if (unlist(last_height) == unlist(new_height)) break
    last_height <- new_height
  }
  Sys.sleep(3)
  # Parse the fully rendered page.
  web <- read_html(remDr$getPageSource()[[1]])
  print(i)
  # FIX: lapply (not sapply(simplify = T)) so the per-page result is always
  # a list of character vectors.  sapply silently collapses to a matrix
  # whenever titles and prices happen to have equal length, which breaks
  # the downstream list access information_all$page1[[1]].
  information <- lapply(info_list, function(x) {
    html_text(html_nodes(web, xpath = x), trim = TRUE)
  })
  information_all[[i]] <- information
  names(information_all)[i] <- paste0("page", i)
  # Collect every product URL on the page; hrefs are protocol-relative
  # ("//www...."), so prepend the scheme.
  sub_dir1 <- html_nodes(web, xpath = sub_dir_url)
  sub_dir1 <- html_attr(sub_dir1, "href")
  sub_dir <- c(sub_dir, paste0("http:", sub_dir1))
  # Also keep the raw page source for each page.
  url_all[[i]] <- remDr$getPageSource()[[1]]
  names(url_all)[[i]] <- paste0("page", i)
  # The per-product detail crawl below was abandoned because the dynamic
  # detail pages loaded too slowly (network too slow); kept for reference.
  # current_url<-remDr$getCurrentUrl()
  # detail_url<-html_nodes(web,xpath=detail_url_dir)
  # detail_url<-html_attr(detail_url,"href")
  # detail_url<-paste("http:",detail_url,sep = "")
  # detail_info_list[[i]]<-list()
  # names(detail_info_list)[[i]]<-paste("page",i,sep = "")
  # for (j in 1:length(detail_url)) {
  #
  #   remDr$navigate(detail_url[j])
  #   Sys.sleep(10)
  #   web<-read_html(remDr$getPageSource()[[1]])
  #   detail_info<-html_nodes(web,xpath = ranges)
  #   detail_info<-html_text(detail_info,trim = T)
  #   if(length(detail_info)==0){
  #     detail_info="non_detected"
  #   }else{detail_info<-gsub("\n","",detail_info)}
  #   detail_info_list[[i]][[j]]<-detail_info
  #   names(detail_info_list[[i]])[j]<-remDr$getTitle()[[1]]
  #   print(j)
  # }
  # remDr$navigate(current_url[[1]])
  Sys.sleep(5)
}
#-------------------------------------------------------
# Function4: tidy the results and save them to disk
#-------------------------------------------------------
# The first title node on page 1 is an extra (non-product) element:
# drop it so the title and price vectors line up.
information_all$page1[[1]] <- information_all$page1[[1]][-1]
information_all$page1
information_all$page1 <- data.frame(title = information_all$page1[[1]],
                                    price = information_all$page1[[2]])
# FIX: bind all pages at once with do.call(rbind, ...) instead of growing
# the data frame with rbind() inside the loop (quadratic copying).
page_frames <- lapply(seq_along(information_all), function(i) {
  information1 <- as.data.frame(information_all[[i]])
  colnames(information1) <- c("title", "price")
  information1
})
information <- do.call(rbind, page_frames)
# Split e.g. "US$1.80-2.50 / piece" into price = "US$1.80-2.50",
# unit = "piece".
information2 <- str_split(as.character(information$price), " / ", simplify = TRUE)
information$price <- information2[, 1]
information$unit <- information2[, 2]
information
# Split the price range into its lower and upper bound.
information[, c(4, 5)] <- str_split(as.character(information$price), "-", simplify = TRUE)
information[1:6, 1:5]
colnames(information)[c(4, 5)] <- c("low", "high")
# Attach the product URL collected during the crawl.
information$url <- sub_dir
colnames(information)
information <- information[, c(1, 4, 5, 3, 6)]
head(information)
library(xlsx)
# FIX: spell out FALSE — T/F are reassignable and unsafe.
write.xlsx(information, "information.xlsx", row.names = FALSE)
# --- Trailing residue pasted from the blog page this script was copied from;
# --- commented out so the file remains valid R.  Original text preserved:
# 使用RSelenium+rvest模拟浏览器爬取阿里巴巴商品信息
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
# 推荐阅读更多精彩内容
# - 马化腾:你们可以膜拜阿里巴巴和马云,为何要吐槽我的腾讯? 吐槽马化腾 阿里巴巴和腾讯两大科技公司,都是在中国或者世...
# - 前言 今天为大家介绍一个Python利用selenium打开浏览器的方式来爬取淘宝商品的信息,下面就来看看,关于s...
# - 前言 今天为大家介绍一个Python利用selenium打开浏览器的方式来爬取淘宝商品的信息,下面就来看看,关于s...