不唠闲嗑,直接来干的。需求是在京东搜索关键词,然后将商品列表导出。看了一下市面上的解决方案:一个是解析接口返回的内容,一个是通过 selenium 抓取页面。两套方案试了一下都不能满足需求——前者频繁请求会触发风控,后者解析页面需要人工干预。想了一下,决定尝试用 Chrome 插件的方式来获取想要的内容。
先大概整理一下思路:
设置关键词--->触发搜索--->滚动页面--->解析内容--->数据过滤--->处理分页
根据页面元素确定输入框和搜索按钮的标识,将搜索功能搞定:
// Search: fill the keyword into the search box and submit it.
let search = (keyword) => {
  const searchBar = $("#search-2014");
  const keywordInput = searchBar.find("#key");
  // Focus the input first, write the keyword, then press the form's button.
  keywordInput.focus();
  keywordInput.val(keyword);
  searchBar.find(".form").find("button").click();
};
拿到结果后将页面内容进行解析。页面内容解析分两步:一个是列表内容提取,一个是每条商品信息提取。由于京东的商品列表不会一次全部展示,需要不断滚动页面才会继续加载,所以需要先做一下滚动处理:
/**
 * Scrolls the window down in fixed steps, pausing before each step, and then
 * parses the page content.
 *
 * The returned promise settles only after `parse_content()` has finished, so
 * callers can await the whole scroll-and-parse cycle and see its errors.
 *
 * @returns {Promise<void>}
 */
let scrollToBottom = async () => {
  const scroll_count = 15; // number of scroll steps
  const offset = 400; // vertical distance passed to window.scrollBy per step
  const timeOut = 2; // pause before each step; unit is defined by the external sleep() helper

  for (let i = 0; i < scroll_count; i++) {
    await sleep(timeOut);
    window.scrollBy(0, offset);
    console.log(`第${i}次滚动`);
  }
  console.log("滚动完成");

  // Awaited on purpose: a floating promise would hide parse failures.
  await parse_content();
};
// Parse the page content.
/**
 * Reads the current search keyword and the goods list from the page, parses
 * every list entry with `parse_item`, keeps the entries whose name contains
 * the keyword, and hands the kept entries to `post_data` in one batch.
 *
 * `post_data` is not called when nothing matched.
 *
 * @returns {Promise<void>}
 */
let parse_content = async () => {
  const keyword = $("#search-2014").find("#key").val();
  const items = $("#J_goodsList ul").children();
  await sleep(10);

  const list = [];
  for (const it of items.toArray()) {
    const data = parse_item(it);
    // Results are not guaranteed to mention the keyword; keep only those that do.
    if (data.goods_name.includes(keyword)) {
      list.push(data);
    }
  }

  if (list.length > 0) {
    await post_data(list);
  }
};
// Extract the fields of one goods entry.
/**
 * Builds a plain record from a single goods list element.
 *
 * @param {*} it - list element (anything accepted by `$()`).
 * @returns {{is_ad: boolean, pic_url: (string|undefined), price: string,
 *   deal_num: number, goods_name: string, goods_detail_url: string,
 *   goods_id: *, shop_name: string, shop_link: string, location: string,
 *   keyword: *}} `is_ad`, `deal_num` and `location` are fixed placeholder
 *   values; `goods_detail_url` and `shop_link` are "" when the link is absent.
 */
let parse_item = (it) => {
  // Protocol-relative links ("//host/path") get an explicit https: scheme;
  // links that already carry a scheme are returned untouched and a missing
  // link becomes "" instead of the string "https:undefined".
  const to_absolute_url = (href) => {
    if (!href) {
      return "";
    }
    return href.startsWith("//") ? `https:${href}` : href;
  };

  const item = $(it);
  const is_ad = false;
  const goods_id = item.data("sku");

  const pic_dom = item.find(".p-img").find("img");
  // Fall back to the lazy-load attribute when src is empty or missing.
  const pic_url = pic_dom.attr("src") || pic_dom.attr("data-lazy-img");

  // Drop the first yen sign (half-width U+00A5 or full-width U+FFE5).
  const price = item.find(".p-price").text().trim().replace(/[¥¥]/, "");
  const deal_num = 0;

  // Remove the first "拍拍" and "广告词" markers from the title text.
  const goods_name = item
    .find(".p-name")
    .text()
    .trim()
    .replace("拍拍", "")
    .replace("广告词", "")
    .trim();

  const goods_detail_url = to_absolute_url(item.find(".p-name a").attr("href"));
  const shop_name = item.find(".p-shopnum").text().trim();
  const shop_link = to_absolute_url(item.find(".p-shopnum a").attr("href"));
  const location = "";
  const keyword = $("#search-2014").find("#key").val();

  return {
    is_ad,
    pic_url,
    price,
    deal_num,
    goods_name,
    goods_detail_url,
    goods_id,
    shop_name,
    shop_link,
    location,
    keyword,
  };
};
由于返回的商品信息中不一定会存在我们想要的关键字,所以还需要对数据进行一次过滤,例如判断一下标题中是否存在关键字等。这里我只是简单处理一下:
// Excerpt of the keyword filter: the branch runs only when the goods name
// contains the search keyword. `data` and `keyword` come from the surrounding code.
if(data.goods_name.indexOf(keyword) != -1){
//TODO: send the collected item to the backend
}
最后就是处理分页问题了。京东的分页参数规则比较简单:page 采用 2*n-1 这种模式,s 采用 (n-1)*60+1,其中 n 为下一页的页数。
// Handle pagination.
/**
 * Advances the crawl to the next result page.
 *
 * Steps:
 *  1. Increment the run counter kept in localStorage under "page".
 *  2. Stop and clear the counter when the external `limit` is set and exceeded.
 *  3. Read the current and total page numbers from "#J_bottomPage".
 *  4. If a further page exists, rewrite the `page` and `s` query parameters
 *     (page = 2n - 1, s = (n - 1) * 60 + 1 for next page n) and navigate there.
 *     Otherwise clear the counter.
 *
 * When the pager cannot be read (no total page number, or the current page is
 * not a number) the crawl is treated as finished rather than throwing.
 *
 * @returns {Promise<void>}
 */
let parse_page = async () => {
  // Replaces the value of query parameter `name` in `url`, or appends the
  // parameter when it is absent. Only the parameter itself is touched, so
  // equal digits elsewhere in the URL (e.g. in the keyword) stay intact.
  const set_query_param = (url, name, value) => {
    const pattern = new RegExp(`([?&])${name}=[^&#]*`);
    if (pattern.test(url)) {
      return url.replace(pattern, `$1${name}=${value}`);
    }
    const separator = url.includes("?") ? "&" : "?";
    return `${url}${separator}${name}=${value}`;
  };

  await sleep(10);

  // A missing or non-numeric stored value counts as 0 runs so far.
  const page = (parseInt(localStorage.getItem("page") || "0", 10) || 0) + 1;
  console.log(`第${page}次执行`);
  localStorage.setItem("page", page);

  // TODO: decide how many leading pages to crawl (`limit` is defined elsewhere).
  if (limit && page > limit) {
    localStorage.removeItem("page");
    return;
  }

  const pager = $("#J_bottomPage");
  const current_page = parseInt(pager.find(".curr").text().trim(), 10);
  const total_match = pager.find(".p-skip").text().trim().match(/(\d+)/);
  const total_page = total_match ? parseInt(total_match[1], 10) : NaN;

  // Any comparison with NaN is false, so an unreadable pager also ends here.
  if (!(current_page < total_page)) {
    // Crawl finished.
    localStorage.removeItem("page");
    return;
  }

  // Prefer the number shown after the current page marker; fall back to
  // current + 1 when that sibling is not a page number.
  const sibling_page = parseInt(pager.find(".curr").next().text().trim(), 10);
  const nextId = Number.isNaN(sibling_page) ? current_page + 1 : sibling_page;

  let url = location.href;
  console.log("获取下一页", url);

  const had_offset = /[?&]s=\d+/.test(url);
  url = set_query_param(url, "page", 2 * nextId - 1);
  url = set_query_param(url, "s", (nextId - 1) * 60 + 1);
  if (!had_offset) {
    // URLs without an `s` offset also get click=0.
    url = set_query_param(url, "click", 0);
  }

  await sleep(30);
  console.log("下一页地址", url);
  location.href = url;
};
由于只是分析测试,所以代码写的比较粗糙,大家将就着看吧,后续有时间整理成完整的工程代码