今日内容:
1.selenium相关操作
2.selenium登录破解
3.爬取京东商品信息
4.破解滑动验证码的逻辑
1.selenium相关操作
(1)搭配使用xpath
from selenium import webdriver
import time
# Sample markup kept as a bare string for reference; presumably the markup of
# the page requested below -- TODO confirm.
'''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
</div>
</body>
</html>
'''
driver = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
try:
    # The implicit wait is configured before the page is requested.
    driver.implicitly_wait(5)
    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')
    # (An explicit wait, by contrast, would be written after the get request.)

    # Absolute XPath: the <html> element itself.
    root_node = driver.find_element_by_xpath('/html')
    print(root_node.tag_name)

    # '//' searches from the root downwards; here it looks for a <div>.
    first_div = driver.find_element_by_xpath('//div')
    print(first_div.tag_name)

    # Example kept disabled: look up the node whose id is "images".
    # div = driver.find_element_by_xpath('div[@id="images"]')
    # print(div.tag_name)
    # print(div.text)

    # The first <a> tag (prints the element object).
    first_link = driver.find_element_by_xpath('//a')
    print(first_link)

    # Every <a> tag (prints the list of element objects).
    all_links = driver.find_elements_by_xpath('//a')
    print(all_links)

    # The href attribute of the first <a> tag.
    first_href = driver.find_element_by_xpath('//a').get_attribute('href')
    print(first_href)

    time.sleep(5)
finally:
    driver.close()
(2)元素交互操作
-1点击,清除,搜索
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
driver = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
try:
    driver.implicitly_wait(5)
    driver.get('https://www.jd.com/')

    # First search: type a keyword into the element with id "key" and
    # submit it with the Enter key.
    search_box = driver.find_element_by_id('key')
    search_box.send_keys('围城')
    search_box.send_keys(Keys.ENTER)
    time.sleep(2)

    # Second search: look the box up again, clear what it contains, type a
    # new keyword and submit by clicking the element with class "button".
    search_box = driver.find_element_by_id('key')
    search_box.clear()
    search_box.send_keys('航海王')
    search_button = driver.find_element_by_class_name('button')
    search_button.click()

    # Keep the browser open for a while so the result can be inspected.
    time.sleep(10)
finally:
    driver.close()
-2获取cookies
from selenium import webdriver
import time
driver = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
try:
    driver.implicitly_wait(10)
    driver.get('https://www.zhihu.com/explore')

    # Dump the cookies the browser holds after loading the page.
    page_cookies = driver.get_cookies()
    print(page_cookies)

    time.sleep(10)
finally:
    driver.close()
-3 选项卡
import time
from selenium import webdriver
browser = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
try:
    browser.get('https://www.baidu.com')

    # Open a second, blank tab through JavaScript.
    browser.execute_script('window.open()')

    # window_handles lists one handle per open tab.
    handles = browser.window_handles
    print(handles)

    # Work in the new tab first ...
    browser.switch_to.window(handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(10)

    # ... then go back to the original tab.
    browser.switch_to.window(handles[0])
    browser.get('https://www.sina.com.cn')
finally:
    # quit() rather than close(): close() only closes the tab that currently
    # has focus, which would leave the other tab and the session running.
    browser.quit()
-4动作链
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
driver = webdriver.Chrome()
try:
    # Page setup lives inside the try block so that a failed page load still
    # reaches the cleanup in `finally`.
    driver.implicitly_wait(10)
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # The demo elements are looked up inside the frame "iframeResult"
    # (switch_to.frame replaces the older switch_to_frame spelling).
    driver.switch_to.frame('iframeResult')

    # Element to drag and element to drop it on.
    draggable = driver.find_element_by_id('draggable')
    droppable = driver.find_element_by_id('droppable')

    # Approach 1 ("robot"): move the source onto the target in one step.
    # An ActionChains object must be given the driver; actions are queued on
    # it and only executed by perform().
    #     ActionChains(driver).drag_and_drop(draggable, droppable).perform()

    # Approach 2: imitate a person by dragging in small steps.
    source = draggable.location['x']
    target = droppable.location['x']
    print(source, target)
    distance = target - source
    print(distance)

    # Every step builds its own chain and must call perform() to run.
    # Press and hold the source element.
    ActionChains(driver).click_and_hold(draggable).perform()

    # Move 2 units to the right per step until the horizontal gap is covered.
    s = 0
    while s < distance:
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        s += 2

    # Let go of the mouse button.
    ActionChains(driver).release().perform()
    time.sleep(10)
finally:
    driver.close()
-5前进,后退
from selenium import webdriver
import time
driver = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
try:
    driver.implicitly_wait(10)

    # Visit three pages in a row to build up some browsing history.
    for page_url in (
        'https://www.jd.com/',
        'https://www.baidu.com/',
        'https://www.cnblogs.com/',
    ):
        driver.get(page_url)
    time.sleep(2)

    # Step back one page in the history.
    driver.back()
    time.sleep(1)

    # Step forward again.
    driver.forward()
    time.sleep(1)

    # And back once more before the final pause.
    driver.back()
    time.sleep(10)
finally:
    driver.close()
2.selenium登录破解
# selenium 驱动的浏览器默认不带任何缓存(cookie),因此每次启动都需要重新登录。
# 下面演示如何给 selenium 驱动的浏览器加上缓存(cookie),从而避免重复登录。
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import time
options = ChromeOptions()
# Placeholder: put the directory where Chrome stores its cookies/profile
# after "--user-data-dir=" before running.
profile_directory = r'--user-data-dir=(填写chrome存放cookie的位置)'
options.add_argument(profile_directory)
driver = webdriver.Chrome(chrome_options=options)
try:
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')

    # Placeholder: replace with the real BDUSS value, taken from the response
    # headers of a logged-in session. The original line held an unquoted note
    # here, which was a syntax error.
    bduss_value = '<paste the BDUSS value here>'

    # Add the user's cookie; the keys "name" and "value" must be lowercase.
    driver.add_cookie({"name": "BDUSS", "value": bduss_value})

    # Reload so the page is requested again with the cookie attached.
    driver.refresh()
    time.sleep(10)
finally:
    driver.close()
3.爬取京东商品信息
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
def get_good(driver):
    """Print link, name, price and review count of every product, page by page.

    Starting from the result page currently loaded in ``driver``, the function
    scrolls down by script, prints one record per element with class
    ``gl-item``, clicks the element with class ``pn-next`` and repeats.
    The crawl stops when no ``pn-next`` element is found (or when it is marked
    disabled).

    Args:
        driver: Selenium WebDriver already showing a search-result page.

    The driver's current window is closed exactly once when the crawl ends,
    whether it finished normally or an exception was raised.
    """
    scroll_to_bottom = 'window.scrollTo(0,50000);'
    try:
        # A loop instead of recursion: the recursive version nested one
        # try/finally per page and called driver.close() once per level
        # while unwinding.
        while True:
            # Scroll far down so that all products get loaded.
            driver.execute_script(scroll_to_bottom)
            time.sleep(2)

            for good in driver.find_elements_by_class_name('gl-item'):
                print(_format_good(good))

            # find_elements returns an empty list instead of raising when
            # nothing matches, which marks the end of the pagination.
            next_tags = driver.find_elements_by_class_name('pn-next')
            if not next_tags:
                break
            next_tag = next_tags[0]
            # NOTE(review): assumes the last page keeps a "pn-next" control
            # flagged with a "disabled" class -- confirm against the live page.
            if 'disabled' in (next_tag.get_attribute('class') or ''):
                break
            next_tag.click()
            time.sleep(2)
        time.sleep(5)
    finally:
        driver.close()


def _format_good(good):
    """Build the printable record for one product element."""
    # Every lookup is scoped to `good`; searching from the driver would
    # return the first product of the page on every iteration.
    # NOTE(review): assumes the link is an <a> nested inside ".p-img"
    # (the wrapper itself would carry no href) -- confirm against the page.
    good_link = good.find_element_by_css_selector('.p-img a').get_attribute('href')
    good_name = good.find_element_by_css_selector('.p-name em').text.replace("\n", '--')
    good_price = good.find_element_by_class_name('p-price').text.replace('\n', ':')
    comment_num = good.find_element_by_class_name('p-commit').text.replace('\n', ' ')
    return (
        f'\n商品链接:{good_link}'
        f'\n商品名称:{good_name}'
        f'\n商品价格:{good_price}'
        f'\n评价人数:{comment_num}\n'
    )
if __name__ == "__main__":
    # Ask which product to look for before the browser is started.
    good_tag = input("请输入想查找的商品:")

    driver = webdriver.Chrome(r'/Users/nadia/Downloads/chromedriver')
    driver.implicitly_wait(10)

    # Step 1: open the JD home page.
    driver.get("https://www.jd.com/")

    # Step 2: type the product name into the element with id "key" and
    # press Enter to run the search.
    search_box = driver.find_element_by_id('key')
    search_box.send_keys(good_tag)
    search_box.send_keys(Keys.ENTER)
    time.sleep(2)

    # Step 3: hand the driver over to the crawler.
    get_good(driver)