1.Xpath
from selenium import webdriver
# Locator constants for find_element()/find_elements(); the
# find_element_by_* helpers used previously are gone in Selenium 4.
from selenium.webdriver.common.by import By

# Reference copy of the markup the XPath expressions below are written for.
'''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
</div>
</body>
</html>
'''

# Create the Chrome driver.
driver = webdriver.Chrome()
try:
    # Maximize the browser window (inside the try so a failure here still
    # reaches the cleanup below).
    driver.maximize_window()

    # The implicit wait is configured before the GET request.
    # (An explicit wait -- wait.until(...) -- would be placed after it.)
    driver.implicitly_wait(5)

    # Send the GET request.
    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

    # "/" selects starting from the document root.
    html = driver.find_element(By.XPATH, '/html')
    print(html.tag_name)

    # "//" matches nodes anywhere in the document; find_element returns
    # the first match.
    div = driver.find_element(By.XPATH, '//div')
    print(div.tag_name)

    # "@" tests an attribute: the div whose id is "images".
    div_image = driver.find_element(By.XPATH, '//div[@id="images"]')
    print(div_image.tag_name)
    print(div_image.text)

    # First <a> element.
    a = driver.find_element(By.XPATH, '//a')
    print(a.tag_name)

    # All <a> elements (a list of WebElement objects).
    a_s = driver.find_elements(By.XPATH, '//a')
    print(a_s)

    # get_attribute() reads one attribute of a node: here the href of the
    # first <a> element.
    a_s_attr = driver.find_element(By.XPATH, '//a').get_attribute('href')
    print(a_s_attr)
finally:
    # quit() ends the whole WebDriver session (every window plus the driver
    # process); close() would only close the current window.
    driver.quit()
2.元素交互
from selenium import webdriver
# 键盘按键操作
from selenium.webdriver.common.keys import Keys
import time
# Locator constants for find_element(); the find_element_by_* helpers used
# previously are gone in Selenium 4.
from selenium.webdriver.common.by import By

# Create the Chrome driver.
driver = webdriver.Chrome()
try:
    # Maximize the browser window (inside the try so a failure here still
    # reaches the cleanup below).
    driver.maximize_window()

    # Implicit wait.
    driver.implicitly_wait(10)

    # Send the GET request.
    driver.get('https://www.jd.com/')

    # Type the first query into the search box and submit it with ENTER.
    input_tag = driver.find_element(By.ID, 'key')
    input_tag.send_keys('围城')
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)

    # Look the search box up again after the search was submitted, replace
    # its content with the second query and submit via the search button.
    input_tag = driver.find_element(By.ID, 'key')
    input_tag.clear()
    input_tag.send_keys('墨菲定律')
    button = driver.find_element(By.CLASS_NAME, 'button')
    button.click()

    # Keep the result page visible for a while before shutting down.
    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session (every window plus the driver
    # process); close() would only close the current window.
    driver.quit()
'''
获取cookies(了解)
'''
# Create the Chrome driver.
driver = webdriver.Chrome()
try:
    # Maximize the browser window (inside the try so a failure here still
    # reaches the cleanup below).
    driver.maximize_window()

    # Implicit wait.
    driver.implicitly_wait(10)

    # Send the GET request, then print the cookies of the current session.
    driver.get('https://www.zhihu.com/explore')
    print(driver.get_cookies())

    # Example of adding a cookie (left disabled):
    # driver.add_cookie({'name': 'hy', 'value': '123'})
    # print(driver.get_cookies())

    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session (every window plus the driver
    # process); close() would only close the current window.
    driver.quit()
3.选项卡
# 选项卡管理:切换选项卡,有js的方式window.open,有windows快捷键:ctrl+t等,最通用的就是js的方式
import time
from selenium import webdriver
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    # execute_script() runs JavaScript in the current page.
    # Pop-up example (left disabled):
    # browser.execute_script('alert("warn")')

    # Open a new browser window/tab via JavaScript.
    browser.execute_script('window.open()')
    time.sleep(1)

    # Handles of all open tabs; no tab is opened or closed after this point,
    # so the list is fetched once and reused.
    handles = browser.window_handles
    print(handles)

    # Switch to the second tab and load Taobao in it.
    browser.switch_to.window(handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(5)

    # Switch back to the first tab and load Sina in it.
    browser.switch_to.window(handles[0])
    browser.get('https://www.sina.com.cn')
    time.sleep(10)
finally:
    # Two tabs are open at this point: close() would only close the current
    # one, so quit() is used to end the whole session and the driver process.
    browser.quit()
'''
页面前进、回退
'''
import time
from selenium import webdriver
browser = webdriver.Chrome()
try:
    # Visit three pages in a row to build up the browsing history.
    for url in (
        'https://www.jd.com',
        'https://www.baidu.com',
        'https://www.cnblogs.com',
    ):
        browser.get(url)
    time.sleep(2)

    # Go back one page in the history.
    browser.back()
    time.sleep(2)

    # Go forward one page in the history.
    browser.forward()
    time.sleep(2)

    # Go back once more and keep the page visible for a while.
    browser.back()
    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session (every window plus the driver
    # process); close() would only close the current window.
    browser.quit()
4.动作链,滑动验证
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
# Locator constants for find_element(); the find_element_by_* helpers used
# previously are gone in Selenium 4.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    # Page setup happens inside the try so that a failed page load still
    # reaches the cleanup below.
    driver.implicitly_wait(3)
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    driver.maximize_window()

    # Switch into the frame whose id is "iframeResult".
    driver.switch_to.frame('iframeResult')

    # Element to drag, and the element to drop it on.
    source = driver.find_element(By.ID, 'draggable')
    target = driver.find_element(By.ID, 'droppable')

    # Approach 1 (left disabled): a single drag_and_drop action, which moves
    # the element in one jump.
    # actions = ActionChains(driver)
    # actions.drag_and_drop(source, target)
    # actions.perform()

    # Approach 2: a separate action chain per small movement, to imitate a
    # human dragging. Every chain must be run with perform().
    # Press and hold the source element.
    ActionChains(driver).click_and_hold(source).perform()

    # Horizontal distance between the two elements.
    distance = target.location['x'] - source.location['x']
    moved = 0
    while moved < distance:
        # Move at most 2px at a time; the last step is clamped so an odd
        # distance does not overshoot the target.
        step = min(2, distance - moved)
        ActionChains(driver).move_by_offset(xoffset=step, yoffset=0).perform()
        moved += step

    # Release the mouse button.
    ActionChains(driver).release().perform()
    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session (every window plus the driver
    # process); close() would only close the current window.
    driver.quit()
5.破解登陆
import time
from selenium import webdriver
from selenium.webdriver import ChromeOptions
'''
1. 删除C:/Users/administortra/AppData/Local/Google/Chrome/User Data文件夹下的Default文件
2. 打开谷歌浏览器,登录百度账号
--此时会创建一个新的Default缓存文件夹
3. 添加cookies信息
4. 关闭谷歌浏览器后执行程序
'''
# Options object that carries the Chrome start-up arguments.
options = ChromeOptions()

# Chrome user-data directory, where the browser keeps its cookies.
profile_dir = '--user-data-dir=C:/Users/administortra/AppData/Local/Google/Chrome/User Data'

# Register the user-data directory as a start-up argument.
options.add_argument(profile_dir)

# Hand the options to the driver. The "options" keyword replaces the
# deprecated "chrome_options" keyword, which newer Selenium releases reject.
browser = webdriver.Chrome(options=options)
try:
    browser.implicitly_wait(10)
    browser.get('https://www.baidu.com')

    # Cookie to look up in a logged-in Baidu session:
    #   BDUSS:*******
    # The dictionary keys "name" and "value" must be lower case.
    browser.add_cookie({'name': 'BDUSS', 'value': '*********'})

    # Reload the page so the added cookie is sent.
    browser.refresh()
    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session (every window plus the driver
    # process); close() would only close the current window.
    browser.quit()
6.爬取京东商品信息
'''
爬取京东商品信息
https://www.jd.com/
提取商品信息:
1、商品详情页
2、商品评价
3、商品价格
4、评价人数
'''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
'''
爬取单个页面
'''
# driver = webdriver.Chrome()
# driver.maximize_window()
#
# try:
# driver.implicitly_wait(10)
#
# driver.get('https://www.jd.com/')
#
# # 查找输入框,搜索macbook
# input_tag = driver.find_element_by_id('key')
# input_tag.send_keys('macbook')
# input_tag.send_keys(Keys.ENTER)
#
# # 通过js控制滚轮滑动获取所有商品信息
# js_code = 'window.scrollTo(0,5000);'
#
# # 执行js代码
# driver.execute_script(js_code)
#
# time.sleep(3)
#
# # 获取所有商品信息
# good_list = driver.find_elements_by_class_name('gl-item')
#
# for good in good_list:
# # 1、商品详情页
# good_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
#
# # 2、商品评价
# good_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '--')
#
# # 3.商品价格
# good_price = good.find_element_by_class_name('p-price').text.replace('\n', ':')
#
# # 4.评价人数
# num = good.find_element_by_class_name('p-commit').text.replace('\n', ' ')
# good_detail = '''
# 商品详情页:{}
# 商品评价:{}
# 商品价格:{}
# 评价人数:{}
# '''.format(good_url, good_name, good_price, num)
#
# print(good_detail)
#
# with open('jd_goods.txt', 'a', encoding='utf-8') as f:
# f.write(good_detail)
#
# time.sleep(10)
#
# finally:
# driver.close()
'''
爬取所有页面商品信息
'''
def get_goods(driver):
    """Scrape every result page of the current JD search into weicheng.txt.

    For each page the function scrolls down so the product list is loaded,
    reads url / name / price / comment count from every ``gl-item`` element,
    appends them to ``weicheng.txt`` and then clicks the ``pn-next`` element
    to move to the following page. It stops when no ``pn-next`` element is
    found.

    The pages are walked in a loop rather than by recursion, so the driver
    is shut down exactly once (the recursive form ran its ``finally`` once
    per page while unwinding, against an already closed window).

    Args:
        driver: Selenium WebDriver that is already showing a search result
            page. It is always shut down before this function returns or
            raises, so the caller must not use it afterwards.
    """
    # Function-scope import: locator constants for find_element(s); the
    # find_element_by_* helpers are gone in Selenium 4.
    from selenium.webdriver.common.by import By

    # NOTE: the second field is labelled '商品评价' but is filled with the
    # text of '.p-name em'; the label is kept unchanged for existing output.
    detail_template = (
        '\n'
        '商品详情页:{}\n'
        '商品评价:{}\n'
        '商品价格:{}\n'
        '评价人数:{}\n'
    )

    try:
        while True:
            # Scroll down via JavaScript so the whole product list loads.
            driver.execute_script('window.scrollTo(0,5000);')
            time.sleep(3)

            # One file handle per page instead of one per product.
            with open('weicheng.txt', 'a', encoding='utf-8') as f:
                for good in driver.find_elements(By.CLASS_NAME, 'gl-item'):
                    # 1. Link to the product detail page.
                    good_url = good.find_element(
                        By.CSS_SELECTOR, '.p-img a').get_attribute('href')
                    # 2. Text of the product name element.
                    good_name = good.find_element(
                        By.CSS_SELECTOR, '.p-name em').text.replace('\n', '--')
                    # 3. Price.
                    good_price = good.find_element(
                        By.CLASS_NAME, 'p-price').text.replace('\n', ':')
                    # 4. Number of comments.
                    num = good.find_element(
                        By.CLASS_NAME, 'p-commit').text.replace('\n', ' ')
                    f.write(detail_template.format(
                        good_url, good_name, good_price, num))

            # find_elements() returns an empty list instead of raising, which
            # gives the loop a normal way to end on the last page.
            # NOTE(review): if the site keeps a disabled 'pn-next' element on
            # the last page this check will not stop the loop -- confirm
            # against the live markup.
            next_tags = driver.find_elements(By.CLASS_NAME, 'pn-next')
            if not next_tags:
                break
            next_tags[0].click()
    finally:
        # quit() ends the whole WebDriver session (every window plus the
        # driver process); close() would only close the current window.
        driver.quit()
if __name__ == '__main__':
    # Locator constants for find_element(); the find_element_by_* helpers
    # used previously are gone in Selenium 4.
    from selenium.webdriver.common.by import By

    # Ask for the product to search for.
    good_name = input('请输入爬取的商品:').strip()

    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.implicitly_wait(10)
        driver.get('https://www.jd.com/')

        # Find the search box, type the product name and submit with ENTER.
        input_tag = driver.find_element(By.ID, 'key')
        input_tag.send_keys(good_name)
        input_tag.send_keys(Keys.ENTER)
        time.sleep(2)

        # Walk the result pages and write the product data to disk.
        get_goods(driver)
    finally:
        # get_goods() shuts the driver down itself; this quit() covers a
        # failure before that call is reached, so the browser and driver
        # process are not left running.
        driver.quit()