今天写了一个抓取某大厂图片(TP)的小程序,就以 yyds 的小哥来做实验吧。这个网页是随着鼠标下滑动态加载的,而且要下载原图,必须先把鼠标悬停在图片上,才能看到原图地址。
于是……我就直接上代码了。
```python
import os
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
import time
import requests
from selenium.webdriver.chrome.options import Options
import base64
# Start page URL; kept base64-encoded in the source and decoded at import time.
rootrurl = base64.b64decode(b'aHR0cHM6Ly93d3cuYmFpZHUuY29t').decode()

# Directory the downloaded images are written to.
save_dir = 'D:/estimages/'

# Location of the chromedriver executable passed to Selenium.
chromeExeLoc = 'D:/software/chrome/chromedriver_win32/chromedriver.exe'

# Request headers that make the client look like a desktop browser.
headers = {
    "Referer": rootrurl,
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    ),
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}
def del_file(path):
    """Remove every file found under ``path``, descending into sub-directories.

    Only files are removed; the directories themselves (including ``path``)
    are left in place.
    """
    for entry_name in os.listdir(path):
        target = os.path.join(path, entry_name)
        if not os.path.isdir(target):
            os.remove(target)
            continue
        # Sub-directory: clear its contents recursively.
        del_file(target)
def saveOneImg(dir, img_url, idx):
    """Download one image and store it as ``<dir>/<idx>.jpg``.

    Args:
        dir: Destination directory. The name shadows the builtin but is kept
            so existing callers keep working.
        img_url: URL of the image; it is also sent as the ``Referer`` header.
        idx: Value used as the file name stem.

    Returns:
        True when the server answered with status 200 and the file was
        written; False for any other status or when an error occurred.
    """
    # A fresh header set per image, with Referer set to the image URL itself,
    # is meant to avoid HTTP 403 responses from anti-hotlinking checks.
    new_headers = {
        "Referer": img_url,
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive'
    }
    try:
        # (connect timeout, read timeout) in seconds.
        img = requests.get(img_url, headers=new_headers, timeout=(10, 30))
        # Check the numeric status directly instead of searching the
        # response object's repr for the text "200".
        if img.status_code != 200:
            return False
        # The context manager closes the file; no explicit close() needed.
        with open('{}/{}.jpg'.format(dir, idx), 'wb') as jpg:
            jpg.write(img.content)
            print(img_url, 'downloaded...')
        return True
    except Exception as e:
        # Best-effort boundary: one failed image must not stop the whole run.
        # format() is used so a non-string img_url (e.g. None) cannot raise
        # a TypeError from inside this handler.
        print('exception occurs: {}'.format(img_url))
        print(e)
        return False
def test():
    """Drive Chrome to the image results page and download the hovered originals.

    Steps: open ``rootrurl``, type the search phrase, open the first result,
    scroll to the bottom repeatedly so lazily-loaded items appear, then hover
    over each ``imgitem`` element to reveal its ``down`` link and hand that
    link's ``href`` to ``saveOneImg``. The browser session is always shut
    down, even when a step fails.
    """
    # Make sure the output directory exists.
    # del_file(save_dir)
    os.makedirs(save_dir, exist_ok=True)

    # Headless options are prepared here but not handed to the driver below,
    # so a visible Chrome window opens. Pass ``options=options`` to
    # webdriver.Chrome to run without a window.
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(chromeExeLoc)
    # driver = webdriver.Chrome(chromeExeLoc, options=options)

    try:
        driver.get(rootrurl)

        # Locate the search box, type the query and submit it.
        elem = driver.find_element_by_xpath('//*[@id="kw"]')
        elem.send_keys("肖宇梁版张起灵")
        time.sleep(5)
        elem.send_keys(Keys.RETURN)
        time.sleep(4)

        # Click the first result link (the image page).
        driver.find_element_by_xpath('//*[@id="1"]/div[1]/h3/a').click()
        time.sleep(4)

        # The link is expected to open a second window; switch to the most
        # recently listed handle. With two windows this is handles[1]; if no
        # new window was opened we simply stay on the current one instead of
        # raising IndexError.
        handles = driver.window_handles
        driver.switch_to.window(handles[-1])

        # Scroll to the bottom repeatedly so dynamically loaded items appear.
        for _ in range(50):
            driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(2)

        # Walk over all image items. idx is bumped before each save, so the
        # saved files are numbered starting from 2.
        idx = 1
        for elem in driver.find_elements_by_class_name('imgitem'):
            # Hover over the item so the download link becomes available.
            print(elem)
            ActionChains(driver).move_to_element(elem).perform()
            time.sleep(3)
            try:
                item = elem.find_element_by_class_name('down')
            except Exception:
                # No download link for this item; move on to the next one.
                print('skip....')
                continue
            href = item.get_attribute('href')
            print('herf : {}'.format(href))
            if not href:
                # Nothing to download without a link target.
                continue
            # Save the image.
            idx = idx + 1
            saveOneImg(save_dir, href, idx)
    finally:
        # quit() ends the whole session (all windows and the chromedriver
        # process); close() would only close the current window.
        driver.quit()
# Script entry point: run the scraper only when this file is executed directly.
if __name__ == "__main__":
    test()
```