"""
filename: jd/spider.py
python: 3.7.0
description: 使用selenium搜索京东书籍
"""
from selenium.webdriver import Chrome
from config import *
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException
def next_page(client, wait, page_num):
# 确认完整加载网页,下拉到底部
while len(client.find_elements_by_class_name('gl-item')) < 60:
client.execute_script('window.scrollTo(0, document.body.scrollHeight)')
sleep(1)
print("[+] 第{}加载完成".format(page_num))
# 解析数据
parse_page(page_num, client)
# 下一页
page_num += 1
if page_num > END_PAGE:
print('前{}页爬取成功'.format(END_PAGE))
return
# 等待下一页输入框加载完成
wait.until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')
)
)
# print("[+] 下一页输入框加载完成")
# 等待下一页输入框跳页按钮加载完成
wait.until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')
)
)
# print("[+] 跳页按钮加载完成")
# 输入页码
input_ = client.find_element_by_css_selector('#J_bottomPage > span.p-skip > input')
input_.clear()
input_.send_keys(page_num)
# print("[+] 输入页码完成")
# 点击跳页
input_.send_keys(Keys.ENTER)
# print("[+] 点击跳页完成")
# 等待下一页加载完成
wait.until(
EC.text_to_be_present_in_element(
(By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'),
str(page_num)
)
)
# print("[+] 下一页加载完成")
# 跳下一页
next_page(client, wait, page_num)
def parse_page(page_num, client):
print("[+] 开始解析第{}页数据".format(page_num))
items = client.find_elements_by_class_name('gl-item')
index = 1
for item in items:
print("[{}] ".format(index), end="")
# 在前面的基础上继续解析, 如果那个属性没有提取到,保证其他属性可以正常提取
try:
title = item.find_element_by_css_selector("div.p-name > a > em").text
except NoSuchElementException:
title = None
try:
price = item.find_element_by_css_selector("div.p-price > strong > i").text
except NoSuchElementException:
price = None
try:
store = item.find_element_by_css_selector("div.p-shopnum > a").text
except NoSuchElementException:
store = None
try:
url = item.find_element_by_css_selector("div.p-img > a").get_attribute("href")
except NoSuchElementException:
url = None
try:
comment = item.find_element_by_css_selector(".p-commit a").text
except NoSuchElementException:
comment = None
print("{} >>> {} >>> {} >>> {} >>> {}".format(title, price, store, url, comment))
index += 1
print("[+] 解析第{}页数据完成".format(page_num))
def search(client, url, keyword,wait):
# 打开链接
client.get(url)
# 等待加载输入框完成,等待id为q的加载
wait.until(
EC.presence_of_element_located(
(By.ID, 'key')
)
)
# print("[+] 搜索框加载完成")
# 等待加载搜索按钮完成,等待css选择器满足条件,
wait.until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, '#search > div > div.form > button > i')
)
)
# print("[+] 搜索按钮加载完成")
# 输入关键字
input_ = client.find_element_by_id('key')
input_.send_keys(keyword)
# print("[+] 输入关键字完成")
# 点击搜索
botton = client.find_element_by_css_selector('#search > div > div.form > button > i')
botton.click()
print("[+] 点击搜索完成")
# 翻页
page_num = 1
next_page(client, wait, page_num)
def main():
# 创建一个浏览器
client = Chrome()
url = "http://www.jd.com"
# 等待对象
wait = WebDriverWait(client, 10)
search(client, url, KEYWORD, wait)
if __name__ == '__main__':
main()
"""
filename: jd/config.py
python: 3.7.0
description: 配置文件
"""
KEYWORD = 'python 书籍'
END_PAGE = 3
结果: