一、Mac环境下相关配置
1、下载指定版本的 Firefox 54.0.1:
http://ftp.mozilla.org/pub/firefox/releases/54.0.1/mac/zh-CN/
2、下载 geckodriver(亲测可用版本:v0.20.1),并将该可执行文件复制到 /usr/local/bin:
https://github.com/mozilla/geckodriver/releases
3、sudo easy_install selenium
4、如果 easy_install 命令不存在,先运行以下命令进行安装,再重新执行第 3 步的命令:
curl https://bootstrap.pypa.io/ez_setup.py -o - | python
5、运行脚本
https://github.com/liyuzero/pyWebDriver/blob/master/search.py
二、脚本
修改以下两项配置,即可让脚本改为抓取其他搜索引擎的图片结果:
1、xpath:搜索结果页 HTML 中图片标签(img)所在的 XPath 路径
2、url:搜索请求的 URL
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os,shutil
import re
from selenium import webdriver
import time
import urllib
# Output directory: output() writes one "<keyword>.txt" file here per keyword.
OUTPUT_DIR = '/Users/xxxx/Documents/运动'

# Search keywords; each keyword also names the txt file created in OUTPUT_DIR.
SEARCH_KEY_WORDS = [
    '羽毛球',
    '羽毛球拍',
    '篮球',
    '足球',
    '乒乓球拍',
    '网球拍',
    '台球桌',
    '弓箭',
    '跑步机',
    '动感单车',
]

# Maximum number of scroll steps ("pages") performed per keyword.
PAGE_NUM = 15

# Early-stop bookkeeping that output() updates through `global`:
#   repeateNum - consecutive scroll steps that produced no new image URL
#   preLen     - number of URLs collected as of the previous scroll step
repeateNum = 0
preLen = 0
def getSearchUrl(keyWord):
    """Build the Google image-search URL for ``keyWord``.

    The keyword is percent-encoded before being placed in the query string,
    so spaces and reserved characters such as ``&``, ``#`` or ``+`` cannot
    break or truncate the ``q`` parameter.

    Keywords that are not pure ASCII (as decided by ``isEn``) additionally
    get ``hl=zh-CN`` so the result page is requested in Chinese.

    :param keyWord: the search keyword (text).
    :return: the complete search URL as a string.
    """
    # Imported locally so the function works on both Python 2 and Python 3
    # without touching the file-level imports.
    try:
        from urllib.parse import quote_plus  # Python 3
    except ImportError:
        from urllib import quote_plus  # Python 2

    lang_param = '' if isEn(keyWord) else '&hl=zh-CN'
    return ('https://www.google.com.hk/search?q=' + quote_plus(keyWord)
            + '&safe=strict' + lang_param + '&source=lnms&tbm=isch')
def isEn(keyWord):
    """Return True when every character of ``keyWord`` has a code point < 128.

    An empty keyword yields True, since no character violates the condition.
    """
    for ch in keyWord:
        if ord(ch) >= 128:
            return False
    return True
# Launch the Firefox browser instance that output() drives.
driver = webdriver.Firefox()

# Create the output directory (and any missing parents) if it is not there yet.
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
def output(SEARCH_KEY_WORD):
    """Collect image URLs for one keyword and write them to a text file.

    Opens the search page for ``SEARCH_KEY_WORD`` in the module-level
    ``driver``, scrolls it up to ``PAGE_NUM`` times, and after each scroll
    gathers the ``src`` attribute of every ``<img>`` matched by the XPath
    whose value starts with ``http``. The de-duplicated URLs are written,
    one per line, to ``<OUTPUT_DIR>/<SEARCH_KEY_WORD>.txt``.

    Scrolling stops early once three consecutive scroll steps add no new URL.

    :param SEARCH_KEY_WORD: keyword to search for; also the output file name.
    """
    global repeateNum
    global preLen
    # Start every keyword from a clean slate. Without this, counters left
    # over from a previous keyword whose loop ran to completion would leak
    # into this search's early-stop decision.
    repeateNum = 0
    preLen = 0

    print('搜索' + SEARCH_KEY_WORD + '图片中,请稍后...')

    # Page to crawl: the Google image-search URL for this keyword.
    # For Sogou image search (keyword 郁金香) this could instead be e.g.:
    # http://pic.sogou.com/pics?query=%D3%F4%BD%F0%CF%E3&di=2&_asf=pic.sogou.com&w=05009900&sut=9420&sst0=1523883106480
    url = getSearchUrl(SEARCH_KEY_WORD)

    # XPath of the <img> tags inside the Google image-search result page.
    # For Sogou this would be: '//div[@id="imgid"]/ul/li/a/img'
    xpath = '//div[@id="rg"]/div/div/a/img'

    # Open the page to crawl in the browser.
    driver.get(url)

    outputFile = os.path.join(OUTPUT_DIR, SEARCH_KEY_WORD + '.txt')
    outputSet = set()

    # Simulate scrolling so the page loads more images.
    pos = 0
    for i in range(PAGE_NUM):
        # The offset grows by i*600 on step i, so each step scrolls further
        # than the previous one (0, 600, 1800, 3600, ...).
        pos += i * 600
        driver.execute_script("document.documentElement.scrollTop=%d" % pos)
        time.sleep(1)  # give the page time to load the newly revealed images

        # find_elements('xpath', ...) is available in both Selenium 3 and 4;
        # the find_elements_by_xpath shortcut was removed in Selenium 4.
        for element in driver.find_elements('xpath', xpath):
            img_url = element.get_attribute('src')
            if img_url is not None and img_url.startswith('http'):
                outputSet.add(img_url)

        if preLen != len(outputSet):
            # New URLs appeared during this step: restart the stall counter.
            repeateNum = 0
            preLen = len(outputSet)
        elif repeateNum == 2:
            # Third consecutive step without new URLs: stop scrolling.
            repeateNum = 0
            preLen = 0
            break
        else:
            repeateNum += 1

    print('写入' + SEARCH_KEY_WORD + '图片中,请稍后...')
    # 'w' is the valid write mode ('wr' raises ValueError on Python 3); the
    # context manager closes the file even if a write fails.
    with open(outputFile, 'w') as out_file:
        # Sorted so that repeated runs produce the same line order.
        for img_url in sorted(outputSet):
            out_file.write(img_url + '\n')
    print(SEARCH_KEY_WORD+'图片搜索写入完毕')
    print(len(outputSet))
# Run the search for every configured keyword, and always shut the browser
# down afterwards, even when one of the searches raises.
try:
    for val in SEARCH_KEY_WORDS:
        output(val)
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()