# -*- coding: utf-8 -*-
# @Time : 2018/6/20 8:57
# @Author :
# @File : jd_phone_spider.py
# @Description : 京东的手机畅销榜爬取
import requests
import pymongo
import time
import json
import codecs
from lxml import etree
from datetime import datetime
from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from jd_phone.config import MONGO_URI, MONGO_DB, MONGO_TABLE
class JdPhone:
    """Crawl JD.com's mobile-phone ranking pages and persist every phone found.

    Three rankings are visited: hot-sale, hot-search and preferred.  The
    ranking pages are rendered in a headless Chrome session; each product
    detail page is fetched with ``requests``.  Every phone is appended to
    ``jd_phones.json`` (one JSON document per line) and inserted into the
    MongoDB collection named by ``MONGO_TABLE``.

    The instance owns a browser, a MongoDB client and an open file.  Call
    ``close()`` (or use the instance as a context manager) to release them;
    ``__del__`` remains as a last-resort fallback.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'}

    # Seconds to wait for a product detail page before giving up on it.
    _REQUEST_TIMEOUT = 10

    # (stored field name, key in the scraped detail dict), in output order.
    # These entries are lifted out of the scraped dict into top-level fields;
    # everything else ends up under ``key_param``.
    _PROMOTED_FIELDS = (
        ('insert_time', 'insert_time'),    # time the record was stored
        ('source_type', 'source_type'),    # 0: hot-sale, 1: hot-search, 2: preferred
        ('model', '型号'),                  # phone model
        ('brand', '品牌'),                  # brand
        ('public_year', '上市年份'),         # release year
        ('public_month', '上市月份'),        # release month
        ('price', 'price'),                # price text from the ranking page
        ('phone_name', 'phone_name'),      # product name
    )

    def __init__(self):
        """Start headless Chrome, connect to MongoDB and open the output file."""
        self.base_url = 'https://item.jd.com/'
        self.image_base_url = 'http://img10.360buyimg.com/n1/s450x450_'  # prefix for large product images
        self.sale_url = 'https://top.jd.com/sale?cateId=655'  # hot-sale ranking entry page
        self.search_url = 'https://top.jd.com/search?cateId=655'  # hot-search ranking entry page
        self.preferred_url = 'https://top.jd.com/preferred?cateId=655'  # preferred ranking entry page
        self.options = webdriver.ChromeOptions()
        self.options.add_argument("--headless")
        # NOTE(review): `chrome_options` is kept as written; newer Selenium
        # releases spell this keyword `options` - confirm against the
        # installed version before upgrading.
        self.browser = webdriver.Chrome(chrome_options=self.options)
        self.browser.set_window_size(1500, 1000)
        self.wait = WebDriverWait(self.browser, 10)
        self.client = pymongo.MongoClient(MONGO_URI)
        self.db = self.client[MONGO_DB]
        self.file = codecs.open('jd_phones.json', 'wb+', encoding='utf-8')

    def __enter__(self):
        """Return the spider itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release all resources; exceptions from the block propagate."""
        self.close()

    def __del__(self):
        """Fallback cleanup for callers that never call ``close()``."""
        self.close()
        print('执行完成!')

    def close(self):
        """Release the browser, the MongoDB client and the output file.

        Safe to call more than once, and safe on an instance whose
        ``__init__`` failed part-way (missing attributes are skipped).
        A failure to release one resource does not prevent releasing the
        others.
        """
        # quit() rather than close(): close() only closes the current window,
        # quit() also ends the driver session.
        for attr, method in (('browser', 'quit'), ('client', 'close'), ('file', 'close')):
            resource = getattr(self, attr, None)
            if resource is None:
                continue
            setattr(self, attr, None)
            try:
                getattr(resource, method)()
            except Exception as exc:  # cleanup is best-effort by design
                print('Failed to release %s: %r' % (attr, exc))

    def get_sale_search_html(self, url):
        """Load a hot-sale or hot-search ranking page and return its full HTML.

        The page is scrolled to the bottom and the "加载更多" (load more)
        link is clicked repeatedly; the loop ends when the link can no
        longer be found.

        :param url: ranking page to open
        :return: page source after all items have been loaded
        """
        self.browser.get(url)
        try:
            while True:
                self.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                self.browser.find_element_by_link_text('加载更多').click()
                time.sleep(2)
        except NoSuchElementException:
            print('加载完成!')
        return self.browser.page_source

    def get_preferred_html(self, url):
        """Load the preferred ranking page and return its HTML.

        The page is scrolled to the bottom 20 times, one second apart,
        before the source is read.

        :param url: ranking page to open
        :return: page source after scrolling
        """
        self.browser.get(url)
        for _ in range(20):
            self.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(1)
        print('好物榜加载完成!')
        return self.browser.page_source

    @classmethod
    def _build_record(cls, data, insert_time):
        """Turn a scraped detail dict into the stored record layout.

        The input dict is not modified.

        :param data: dict produced by ``parse_detail``
        :param insert_time: formatted timestamp to store with the record
        :return: dict with the promoted fields followed by ``key_param``,
            which holds every remaining entry of ``data``
        :raises KeyError: if one of the promoted fields is absent
        """
        remaining = dict(data)
        remaining['insert_time'] = insert_time
        record = {}
        for field, source_key in cls._PROMOTED_FIELDS:
            record[field] = remaining.pop(source_key)
        record['key_param'] = remaining  # remaining specification entries
        return record

    def save_data(self, data):
        """Store one phone in the JSON-lines file and in MongoDB.

        A phone that lacks one of the promoted fields is skipped, and a
        storage failure is reported instead of aborting the crawl; in both
        cases a message is printed so the loss is visible.

        :param data: dict produced by ``parse_detail``; left unmodified
        :return: None
        """
        insert_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            record = self._build_record(data, insert_time)
        except KeyError as exc:
            print('Skipped %s: missing field %s' % (data.get('data_sku'), exc))
            return
        sku = record['key_param'].get('data_sku')
        try:
            # Write the JSON line first: insert_one() adds an ``_id`` to the
            # dict, which json.dumps could not serialise.
            self.file.write(json.dumps(record, ensure_ascii=False) + "\n")
            self.db[MONGO_TABLE].insert_one(record)
        except Exception as exc:  # per-item boundary: report and keep crawling
            print('Failed to store %s: %r' % (sku, exc))
            return
        print(sku, '入库成功!')

    def _fetch_detail_html(self, data_sku):
        """Download a product detail page; return None if the request fails."""
        url = self.base_url + data_sku + '.html'
        try:
            return requests.get(url, headers=self.headers, timeout=self._REQUEST_TIMEOUT).text
        except requests.RequestException as exc:
            print('Failed to fetch %s: %r' % (url, exc))
            return None

    def parse_detail(self, data_sku, price, source_type):
        """Scrape one product detail page.

        :param data_sku: product id
        :param price: price text taken from the ranking page, so the detail
            page does not need to be rendered dynamically
        :param source_type: data source; 0: hot-sale, 1: hot-search, 2: preferred
        :return: dict with every specification entry found on the page plus
            ``model_pic_address``, ``phone_name``, ``data_sku``, ``price`` and
            ``source_type``.  If the page cannot be fetched the specification
            entries are absent, the image list is empty and ``phone_name``
            is None.
        """
        phone_detail = {}
        image_list = []
        phone_name = None
        html = self._fetch_detail_html(data_sku)
        if html:
            doc = pq(html)
            for item in doc('.Ptable-item').items():
                # Drop the tooltip nodes so their text is not read as a value.
                item.find('.Ptable-tips').remove()
                for dt, dd in zip(item('dt').items(), item('dd').items()):
                    phone_detail[dt.text().strip()] = dd.text().strip()
            for thumb in doc('#spec-list li').items():
                image_path = thumb('img').attr('data-url')
                if image_path:  # skip thumbnails without a data-url
                    image_list.append(self.image_base_url + image_path)
            phone_name = doc('.parameter2.p-parameter-list li:first-child').attr('title')  # product name
        phone_detail['model_pic_address'] = image_list
        phone_detail['phone_name'] = phone_name
        phone_detail['data_sku'] = data_sku
        phone_detail['price'] = price
        phone_detail['source_type'] = source_type
        return phone_detail

    def _crawl_phone(self, data_sku, price, source_type):
        """Scrape one product and store it."""
        self.save_data(self.parse_detail(data_sku, price, source_type))

    def parse_sale_html(self):
        """Crawl every phone listed on the hot-sale ranking."""
        html = self.get_sale_search_html(self.sale_url)
        tree = etree.HTML(html)
        items = tree.xpath('//li[contains(@class, "saleitem") and @data-price-item="1"]')
        for item in items:
            sku_values = item.xpath('.//p[@class="saleitem_info_price"]/@data-price-id')
            price_values = item.xpath('.//p[@class="saleitem_info_price"]/text()')
            if not sku_values or not price_values:
                continue  # ranking entry without id or price: nothing to crawl
            self._crawl_phone(sku_values[0], price_values[0], source_type=0)

    def parse_search_html(self):
        """Crawl every phone listed on the hot-search ranking."""
        html = self.get_sale_search_html(self.search_url)
        tree = etree.HTML(html)
        items = tree.xpath('//li[contains(@class, "toplanding_search_floor")]')
        for item in items:
            data_skus = item.xpath('.//div[@class="toplanding_search_goods"]//@data-sku')
            prices = item.xpath('.//div[@class="toplanding_search_goods"]//'
                                'p[@class="toplanding_search_goods_price"]/text()')
            for data_sku, price in zip(data_skus, prices):
                self._crawl_phone(data_sku, price, source_type=1)

    def parse_preferred_html(self):
        """Crawl the preferred ranking by following each "view all" link."""
        html = self.get_preferred_html(self.preferred_url)
        tree = etree.HTML(html)
        hrefs = tree.xpath('//div[@class="preferred_list_item pli"]//a[@class="pli_more"]/@href')
        print(hrefs)
        for href in hrefs:
            self.parse_preferred_more(href)

    @staticmethod
    def _sku_from_href(href):
        """Return the product id from a link such as ``//item.jd.com/123.html``.

        Only a literal ``.html`` suffix is removed.  ``str.rstrip('.html')``
        would instead strip any trailing run of the characters ``.``, ``h``,
        ``t``, ``m`` and ``l`` and could eat part of the id.
        """
        name = href.split('/')[-1]
        suffix = '.html'
        if name.endswith(suffix):
            name = name[:-len(suffix)]
        return name

    def parse_preferred_more(self, href):
        """Crawl every phone on one "view all" page of the preferred ranking.

        :param href: link to the "view all" page
        """
        self.browser.get(href)
        html = self.browser.page_source
        tree = etree.HTML(html)
        links = tree.xpath('//li[contains(@class, "preferred_detail_item")]/a/@href')
        prices = tree.xpath('//div[@class="preferred_detail_item_price"]/text()')
        for link, price in zip(links, prices):
            self._crawl_phone(self._sku_from_href(link), price, source_type=2)

    def main(self):
        """Crawl the hot-sale, hot-search and preferred rankings in turn."""
        self.parse_sale_html()
        self.parse_search_html()
        self.parse_preferred_html()
if __name__ == '__main__':
    # Script entry point: build the spider and crawl every ranking page.
    spider = JdPhone()
    spider.main()
jd phone
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- 登录Apple ID 回答密保问题 进入账户资料 邮箱那里有一个添加资料 加上你的手机号码就行了
- 原题 给一个不包含 0 和 1 的数字字符串,每个数字代表一个字母,请返回其所有可能的字母组合。 下图的手机按键图,就表示了...
- Given a digit string, return all possible letter combinat...
- 综合实力iPhone X会更强,但是或许在针对游戏方面,Razer Phone有自己独到的地方~ 每当有新的智能手...