总结
一. selenium基础
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# Demo script: basic Selenium usage against the JD home page.
# The find_element_by_* helpers were removed in Selenium 4.3; the
# find_element(By.<strategy>, value) form works on old and new releases.
from selenium.webdriver.common.by import By

# 1. Create the browser object.
# NOTE (from the original author): if the browser object is a local variable,
# the browser closes automatically when the function returns; if it is a
# global variable, the browser has to be closed manually.
b = webdriver.Chrome()

# 2. Open a page.
b.get('https://www.jd.com')

# 3. Read the page content.
# NOTE: page_source only contains what the browser has loaded so far.
print(b.page_source)

# 4. Locate and operate on elements.
# 1) Input box: locate it -> type text -> press Enter.
# Locate the search box by its id.
search_box = b.find_element(By.CSS_SELECTOR, '#key')
# Type the keyword into the search box.
search_box.send_keys('电脑')
# Press Enter inside the search box.
search_box.send_keys(Keys.ENTER)
time.sleep(1)

# The page has changed after the search, so the box is looked up again.
search_box2 = b.find_element(By.CSS_SELECTOR, '#key')
# Clear the previous keyword before typing the new one.
search_box2.clear()
search_box2.send_keys('鼠标')
# 2) Button: locate the search button and click it.
search_btn = b.find_element(By.CSS_SELECTOR, '.button.cw-icon')
search_btn.click()

# 5. Walk the browser history: back twice, then forward twice.
time.sleep(1)
b.back()
time.sleep(1)
b.back()
time.sleep(1)
b.forward()
time.sleep(1)
b.forward()

# Close the browser (left disabled in the original).
# b.close()
二. selenium选项卡
from selenium import webdriver
import time
# Demo script: open a link on the JD home page and switch between tabs.
# find_element_by_css_selector was removed in Selenium 4.3; use the
# find_element(By.CSS_SELECTOR, ...) form, which older releases also accept.
from selenium.webdriver.common.by import By

b = webdriver.Chrome()
b.get('https://www.jd.com')

# Locate the <a> element of the flash-sale ("秒杀") navigation entry and click it.
miaosha = b.find_element(By.CSS_SELECTOR, '#navitems-group1>li>a')
miaosha.click()

# Print the handles of all open tabs.
print(b.window_handles)
time.sleep(2)

# Switch the driver to a tab; index 0 selects the first handle in the list.
b.switch_to.window(b.window_handles[0])
三. selenium获取网页
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# Demo script: search on Taobao, wait for a manual login, then save the
# session cookies to a local file.
# find_element_by_css_selector was removed in Selenium 4.3; use the
# find_element(By.CSS_SELECTOR, ...) form, which older releases also accept.
from selenium.webdriver.common.by import By

url = 'https://www.taobao.com'
b = webdriver.Chrome()
b.get(url)

search_box = b.find_element(By.CSS_SELECTOR, '#q')
search_box.send_keys('电脑')
search_box.send_keys(Keys.ENTER)

# The search leads to the login page; wait while a person logs in by hand.
time.sleep(10)
print('人工操作结束')

# After the manual login, read the cookies and save them to a local file.
# They are stored as the text of the Python list (str(cookies)), which is the
# format the cookie-loading script reads back.
cookies = b.get_cookies()
with open('files/taobao_cookies.txt', 'w', encoding='utf-8') as f:
    f.write(str(cookies))
四. selenium使用cookie
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
# Demo script: restore previously saved Taobao cookies, then search.
import ast

# find_element_by_css_selector was removed in Selenium 4.3; use the
# find_element(By.CSS_SELECTOR, ...) form, which older releases also accept.
from selenium.webdriver.common.by import By

b = webdriver.Chrome()
# The site is opened once before the cookies are added.
b.get('https://www.taobao.com')

# Load the saved cookies.
with open(r'./files/taobao_cookies.txt', 'r', encoding='utf-8') as f:
    # The file holds the text of a Python list of dicts. literal_eval parses
    # Python literals only, so unlike eval() it cannot execute code that
    # might have been placed in the file.
    py_obj = ast.literal_eval(f.read())

for obj in py_obj:
    # Original author's note: only cookies whose 'secure' value is True
    # support https requests, so the others are skipped.
    if obj['secure']:
        b.add_cookie(obj)

# NOTE(review): the original indentation was lost; the pause is assumed to
# come after the loop rather than after every cookie.
time.sleep(1)
# Load the page again now that the cookies are set.
b.get('https://www.taobao.com')
# b.refresh()

search_box = b.find_element(By.CSS_SELECTOR, '#q')
search_box.send_keys('电脑')
search_box.send_keys(Keys.ENTER)
五. 页面滚动
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
def get_net_data():
    """Search JD for '电脑', scroll down the result page, and return its HTML.

    The browser is stored in the module-level name ``b`` and is not closed
    here.

    Returns:
        The ``page_source`` of the result page after scrolling.
    """
    global b
    b = webdriver.Chrome()
    b.get('https://www.jd.com')

    # find_element_by_css_selector was removed in Selenium 4.3; the
    # find_element(By.CSS_SELECTOR, ...) form works on old and new releases.
    search_box = b.find_element(By.CSS_SELECTOR, '#key')
    search_box.send_keys('电脑')
    search_box.send_keys(Keys.ENTER)
    time.sleep(1)

    # ================== scrolling ==================
    # Upper bound for the scroll position, chosen in advance.
    max_height = 7000
    # Current scroll position.
    y = 0
    while True:
        y += 500
        b.execute_script(f'window.scrollTo(0, {y})')
        # Stop right after the first scroll that passes the bound.
        if y > max_height:
            break
        # Pause between scroll steps.
        time.sleep(1)

    return b.page_source
def an_data(data):
    """Parse the page HTML and print the product list items.

    Prints the number of ``<li>`` nodes matched under ``#J_goodsList > ul``,
    followed by the matched nodes themselves.

    Args:
        data: HTML text of the search result page.
    """
    # Selector of a single item, for reference: #J_goodsList > ul > li:nth-child(1)
    items = BeautifulSoup(data, 'lxml').select('#J_goodsList > ul > li')
    print(len(items))
    print(items)
if __name__ == '__main__':
    # Fetch the scrolled result page first, then hand its HTML to the parser.
    page_html = get_net_data()
    an_data(page_html)
作业
"""
Time:2021/5/28 14:38
Author:Second
"""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from lxml import etree
import csv
def get_net_data():
    """Open JD in the module-level browser ``b`` and search for '电脑'.

    ``b`` is only read here; it must have been created before this is called.
    """
    # find_element_by_css_selector was removed in Selenium 4.3; the
    # find_element(By.CSS_SELECTOR, ...) form works on old and new releases.
    from selenium.webdriver.common.by import By

    b.get('https://www.jd.com')
    search_box = b.find_element(By.CSS_SELECTOR, '#key')
    search_box.send_keys('电脑')
    search_box.send_keys(Keys.ENTER)
    # Pause after submitting the search.
    time.sleep(1)
def roll():
    """Scroll the current page down in fixed steps and return its HTML.

    Uses the module-level browser ``b``. The page is scrolled 500 pixels at a
    time with a one-second pause between steps, and scrolling stops right
    after the first position that passes the preset bound.

    Returns:
        The ``page_source`` of the page after scrolling.
    """
    # Upper bound for the scroll position, chosen in advance.
    max_height = 10000
    # Distance covered by each scroll step.
    step = 500
    # The stop value is chosen so the range ends on the first position that is
    # strictly greater than max_height.
    for position in range(step, max_height + step + 1, step):
        b.execute_script(f'window.scrollTo(0, {position})')
        # No pause after the final scroll (the one past the bound).
        if position <= max_height:
            time.sleep(1)
    return b.page_source
# Fields: name, link, image URL, price, comment count, shop name, shop link, tags
def _first_or(nodes, default=''):
    """Return the first XPath result, or ``default`` when nothing matched."""
    return nodes[0] if nodes else default


def an_data(data):
    """Extract one row per product from the search result HTML.

    Args:
        data: HTML text of the search result page.

    Returns:
        A list of rows, each
        ``[name, URL, img, price, comments, shop_name, shop_URL, label]``.
        A field missing from the page becomes ``''`` (``'无图片'`` for the
        image) instead of raising IndexError.
    """
    html = etree.HTML(data)
    # Guard against a document with no parsable root.
    if html is None:
        return []

    rows = []
    for li in html.xpath('//li[@class="gl-item"]'):
        name = _first_or(li.xpath('div/div[3]/a/em/text()'))
        URL = _first_or(li.xpath('div/div[3]/a/@href'))
        img = _first_or(li.xpath('div/div[1]/a/img/@src'), '无图片')
        price = _first_or(li.xpath('div/div[2]/strong/i/text()'))
        comments = _first_or(li.xpath('div/div[4]/strong/a/text()'))
        shop_name = _first_or(li.xpath('div/div[5]/span/a/text()'))
        # Take the href from the same <a> that supplies the shop name; the
        # original read div[1] (the image block) here.
        shop_URL = _first_or(li.xpath('div/div[5]/span/a/@href'))
        # Join all tag texts; fall back to a placeholder when there are none.
        label = ','.join(li.xpath('div/div[6]/i/text()')) or '该店铺暂无标签'
        rows.append([name, URL, img, price, comments, shop_name, shop_URL, label])
    return rows
def page_turning():
    """Send the RIGHT arrow key to the page body to move to the next page.

    Uses the module-level browser ``b``.
    """
    # find_element_by_tag_name was removed in Selenium 4.3; the
    # find_element(By.TAG_NAME, ...) form works on old and new releases.
    from selenium.webdriver.common.by import By

    body = b.find_element(By.TAG_NAME, 'body')
    body.send_keys(Keys.RIGHT)
def save(list1):
    """Write the product rows to ``files/jd_computer1.csv``.

    The file is overwritten on every call and starts with a header row.

    Args:
        list1: Iterable of rows, each a sequence of the eight column values
            in header order.
    """
    import os

    # Make sure the output directory exists so open() does not fail.
    os.makedirs('files', exist_ok=True)
    # The context manager closes the file even if writing raises.
    with open('files/jd_computer1.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['名称', '链接', '图片地址', '价格', '评论数', '店铺名称', '店铺链接', '标签'])
        # Write each row exactly once; the original wrote every row twice
        # (a writerow loop followed by writerows).
        writer.writerows(list1)
if __name__ == '__main__':
    # Rows collected from every processed page.
    data_list = []
    # Module-level browser shared by the helper functions above.
    b = webdriver.Chrome()
    get_net_data()
    # One pass per result page; only a single page is processed.
    for _ in range(1):
        page_rows = an_data(roll())
        data_list.extend(page_rows)
        page_turning()
    save(data_list)