Overview
- Uses the Firefox browser
- With Selenium, only QR-code login works; logging in with a username and password fails
- A username/password login attempt shows the error "哎呀,出错了,点击刷新再来一次(error:9PiETg)" ("Oops, something went wrong, click refresh to try again")
- After a QR-code login the cookies can be saved, so later runs can log in from the cookie file instead of scanning again
- Saved cookies expire after a while; expired entries can be filtered out before replaying them, as shown in the sketch after this list
- Supports Taobao links, Tmall links, and short links (m.tb.cn)
- Element lookups on the page are still unreliable; no good fix has been found yet, so the only workaround is to retry a few times
- Review listings are capped at 99 pages; requesting more than 99 pages triggers an error
- Paging too quickly triggers a CAPTCHA, and the CAPTCHA shows the same "哎呀,出错了,点击刷新再来一次(error:9PiETg)" error
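Selenium serializes cookies as dicts whose optional `expiry` field is a Unix timestamp, so stale entries can be dropped before they are replayed. A minimal sketch, assuming the `cookies_tao.json` file written by main.py below (the helper itself is hypothetical, not part of the repo):

```python
import json
import time

def load_valid_cookies(path="cookies_tao.json"):
    """Load saved cookies and drop any whose 'expiry' timestamp has passed."""
    with open(path, "r", encoding="utf8") as fp:
        cookies = json.load(fp)
    now = time.time()
    # Cookies without an 'expiry' key are session cookies; keep them as-is.
    return [c for c in cookies if c.get("expiry", now + 1) > now]
```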
Implementation
Source files
File | Description |
---|---|
main.py | Entry point: creates the image directory, logs in via QR code or saved cookies, and starts the crawl |
core.py | Crawler dispatch: inspects the URL and creates a Tmall or Taobao spider |
spider/taobaoSpider.py, spider/tmallSpider.py | Review scraping for Taobao and Tmall item pages |
browser.py | Thin wrapper around the Firefox WebDriver |
util.py | Utility helpers |
settings.py | Configuration: QR-code vs. cookie login, image storage directory, target URLs, etc. (see the sketch after this table) |
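settings.py itself is not reproduced in the post; a minimal sketch covering only the names the code below reads (all values are placeholders, including the login URL):

```python
# coding=utf-8

# Directory where downloaded review images are stored.
STORE_PATH = "store"

# True: log in by scanning a QR code and save the cookies.
# False: reuse the cookies saved in cookies_tao.json.
QRCODE = True

# Taobao login page.
LOGIN_URL = "https://login.taobao.com/member/login.jhtml"

# Item pages to crawl (Tmall, Taobao, or m.tb.cn short links).
URLS = [
    "https://detail.tmall.com/item.htm?id=...",
    "https://item.taobao.com/item.htm?id=...",
]
```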
main.py
# coding=utf-8
import json
import os
import settings
import util
from core import Crawler
from browser import FirefoxBrowser

# Create the directory where review images are stored.
util.mkStorePath(settings.STORE_PATH)
firefox = FirefoxBrowser()
# Scan the QR code to log in and save the cookies for later runs...
if settings.QRCODE:
    cookies = firefox.get_cookies(settings.LOGIN_URL)
    jsonCookies = json.dumps(cookies)
    with open("cookies_tao.json", "w") as fp:
        fp.write(jsonCookies)
    print("cookie file done")
# ...otherwise log in with the previously saved cookies.
else:
    firefox.get(settings.LOGIN_URL)
    if os.path.exists('cookies_tao.json'):
        with open("cookies_tao.json", "r", encoding="utf8") as fp:
            cookies = json.loads(fp.read())
        firefox.set_cookies(cookies)
# Crawl the product reviews; the first pass usually has a few failures.
failedList = []
for url in settings.URLS:
    isSuccess = Crawler(url, firefox).start()
    if not isSuccess:
        failedList.append(url)
# Retry the failed URLs once.
for url in failedList:
    Crawler(url, firefox).start()
firefox.close()
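main.py retries each failed URL exactly once. For the flaky element lookups noted in the overview, a generic retry wrapper is one way to try more than twice (a hypothetical helper, not part of this repo):

```python
import time

def retry(func, attempts=3, delay=5):
    """Call func() until it returns truthy or the attempts run out."""
    for i in range(attempts):
        if func():
            return True
        print('attempt %d failed, retrying' % (i + 1))
        time.sleep(delay)
    return False

# Usage: retry(lambda: Crawler(url, firefox).start())
```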
core.py
# coding=utf-8
from spider.tmallSpider import TmallSpider
from spider.taobaoSpider import TaobaoSpider

class Crawler(object):
    def __init__(self, target_url, firefoxBrowser):
        # TODO validate that the url is reachable
        self._firefox = firefoxBrowser
        self._type = -1
        if target_url.find('detail.tmall.com') != -1:
            self._type = 1
        elif target_url.find('item.taobao.com') != -1:
            self._type = 2
        elif target_url.find('m.tb.cn') != -1:
            self._type = 0
        self._firefox.get(target_url)
        if self._type == 0:
            # A short link redirects; wait until the browser has left it.
            self._firefox._wait_url(target_url, 300)
        self._url = self._firefox.driver().current_url

    def start(self):
        """
        :return: True on success, False on failure
        """
        # Dispatch to the right spider based on the final URL.
        if self._url.find('detail.tmall.com') != -1:
            return TmallSpider(self._firefox).start()
        elif self._url.find('item.taobao.com') != -1:
            return TaobaoSpider(self._firefox).start()
        return False
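browser.py is not reproduced in the post. Below is a minimal sketch of the interface the other modules rely on, assuming ElementFilter values are (By, selector) pairs and that QR login can be detected by waiting for the browser to leave the login URL; the real implementation may differ:

```python
# coding=utf-8
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class FirefoxBrowser(object):
    def __init__(self):
        self._driver = webdriver.Firefox()

    def driver(self):
        return self._driver

    def get(self, url):
        self._driver.get(url)

    def _wait_url(self, url, timeout):
        # Block until the browser navigates away from `url`
        # (used for short-link redirects and the QR-code login).
        WebDriverWait(self._driver, timeout).until(EC.url_changes(url))

    def get_cookies(self, login_url):
        # Open the login page, wait for the QR scan to finish
        # (the URL changes once login succeeds), then dump the cookies.
        self._driver.get(login_url)
        self._wait_url(login_url, 300)
        return self._driver.get_cookies()

    def set_cookies(self, cookies):
        # add_cookie only applies to the currently loaded domain,
        # so the caller must open the login page first.
        for cookie in cookies:
            self._driver.add_cookie(cookie)
        self._driver.refresh()

    def get_element(self, locator, timeout=30):
        # locator is assumed to be a (By.*, selector) pair.
        return WebDriverWait(self._driver, timeout).until(
            EC.presence_of_element_located(locator))

    def get_element_without_wait(self, locator):
        elements = self._driver.find_elements(*locator)
        return elements[0] if elements else None

    def get_next_page_tmall(self, link_text):
        # Click the "next page" link; return False once there is none.
        try:
            self._driver.find_element(By.PARTIAL_LINK_TEXT, link_text).click()
            return True
        except NoSuchElementException:
            return False

    def close(self):
        self._driver.quit()
```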
spider/tmallSpider.py
# coding=utf-8
import time
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import settings
import util
from util import ElementFilter

class TmallSpider(object):
    def __init__(self, firefoxBrowser):
        self._firefox = firefoxBrowser
        self._surl = self._firefox.driver().current_url
        self._sid = util.getIdAndMkdir(self._surl, settings.STORE_PATH)
        self._item = {}
        self._rate = []

    def start(self):
        """
        :return: True - crawl finished, False - crawl failed
        """
        print('start tmallSpider ' + str(self._sid))
        # Grab the item title.
        print('get Title')
        self._item['title'] = self._firefox.get_element(ElementFilter.tm_dict['Title']).text
        # Scroll the tab bar that contains the review tab into view.
        print('get JTabBarBox')
        element = self._firefox.get_element(ElementFilter.tm_dict['JTabBarBox'])
        self._firefox.driver().execute_script("arguments[0].scrollIntoView()", element)
        # Find the container with the radio buttons and open the review tab.
        print('get JTabbar')
        jtabbar = self._firefox.get_element_without_wait(ElementFilter.tm_dict['JTabbar'])
        if jtabbar is None:
            print('JTabbar not found')
            return False
        jtabbar.click()
        time.sleep(5)
        # Click the radio button that filters for reviews with photos.
        print('get JReviews')
        jreviews = self._firefox.get_element_without_wait(ElementFilter.tm_dict['JReviews'])
        if jreviews is None:
            print('JReviews not found')
            return False
        jreviews.click()
        time.sleep(5)
        # Walk through the photo reviews, 99 pages at most.
        for num in range(1, 100):
            self._rate.append(self.parse(self._firefox.driver().page_source))
            print('page' + str(num))
            hasNext = self._firefox.get_next_page_tmall('下一页>>')
            if not hasNext:
                break
            time.sleep(5)
        self._item['rates'] = self._rate
        return True
    def parse(self, html):
        bs4 = BeautifulSoup(html, "html.parser")
        div_rate = bs4.find("div", class_="rate-grid")
        items = []
        # Each <tr> is one review.
        trs = div_rate.select('tr')
        for tr in trs:
            item = {}
            # td class="col-author": reviewer name.
            td3 = tr.select_one('td.col-author')
            contents = td3.select_one('div.rate-user-info').contents
            item['author'] = contents[0].strip() + "***" + contents[2].strip()
            item['rauthor'] = contents[0].strip() + contents[2].strip()
            # td class="tm-col-master": review text and image URLs.
            td1 = tr.select_one('td.tm-col-master')
            # tm-rate-premiere marks a review with a follow-up, otherwise tm-rate-content.
            premiere = td1.select_one('div.tm-rate-premiere')
            if premiere is not None:
                print('premiere')
                # Initial review text.
                fulltxt = premiere.select_one('div.tm-rate-fulltxt').contents
                if len(fulltxt) > 1:
                    item['tm-rate-fulltxt'] = fulltxt[1].strip()
                else:
                    item['tm-rate-fulltxt'] = fulltxt[0].strip()
                # Review date.
                date = premiere.select_one('div.tm-rate-date').contents[0].strip()
                item['tm-rate-date'] = date
                # Review image URLs.
                lis = premiere.select('li')
                datasrc = []
                for li in lis:
                    srcLi = li.attrs['data-src']
                    if srcLi.endswith(".png"):
                        continue
                    imgUrl = self.parseImg(srcLi, item['rauthor'])
                    datasrc.append(imgUrl)
                # Follow-up review text.
                append = td1.select_one('div.tm-rate-append')
                fulltxt = append.select_one('div.tm-rate-fulltxt').contents
                if len(fulltxt) > 1:
                    item['append-rate-fulltxt'] = fulltxt[1].strip()
                else:
                    item['append-rate-fulltxt'] = fulltxt[0].strip()
                # Follow-up review image URLs.
                alis = append.select('li')
                for li in alis:
                    srcLi = li.attrs['data-src']
                    if srcLi.endswith(".png"):
                        continue
                    imgUrl = self.parseImg(srcLi, item['rauthor'])
                    datasrc.append(imgUrl)
                item['tm-m-photos'] = datasrc
            else:
                # tm-rate-content: a plain review without a follow-up.
                content = td1.select_one('div.tm-rate-content')
                # Review text.
                fulltxt = content.select_one('div.tm-rate-fulltxt').contents
                if len(fulltxt) > 1:
                    item['tm-rate-fulltxt'] = fulltxt[1].strip()
                else:
                    item['tm-rate-fulltxt'] = fulltxt[0].strip()
                # Review image URLs.
                lis = content.select('li')
                datasrc = []
                for li in lis:
                    srcLi = li.attrs['data-src']
                    if srcLi.endswith(".png"):
                        continue
                    imgUrl = self.parseImg(srcLi, item['rauthor'])
                    datasrc.append(imgUrl)
                item['tm-m-photos'] = datasrc
                # Review date.
                date = td1.select_one('div.tm-rate-date').contents[0].strip()
                item['tm-rate-date'] = date
            # td class="col-meta": SKU info such as color and size.
            td2 = tr.select_one('td.col-meta div.rate-sku')
            ps = td2.select('p')
            item['color'] = ps[0]['title']
            item['size'] = ps[1]['title']
            items.append(item)
        return items
    def parseImg(self, picUrl, author):
        picTemp = picUrl.rpartition('/')[2]
        # Strip the 12-character thumbnail suffix (e.g. "_400x400.jpg")
        # to get the full-size image, then download it.
        picDes = settings.STORE_PATH + '/' + self._sid + "/" + author + '_' + picTemp[:-12]
        picAll = "http:" + picUrl[:-12]
        urlretrieve(picAll, picDes)
        return picAll
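For orientation, here is the markup shape parse() expects, reconstructed from the selectors above; it is an illustrative fragment, not actual Tmall output:

```python
from bs4 import BeautifulSoup

# One review row, reduced to the classes parse() selects on.
sample = """
<div class="rate-grid"><table><tr>
  <td class="tm-col-master">
    <div class="tm-rate-content">
      <div class="tm-rate-fulltxt">质量很好</div>
      <li data-src="//img.example.com/pic_400x400.jpg"></li>
    </div>
    <div class="tm-rate-date">2019年1月1日</div>
  </td>
  <td class="col-meta"><div class="rate-sku">
    <p title="黑色"></p><p title="42"></p>
  </div></td>
  <td class="col-author"><div class="rate-user-info">张<img/>三</div></td>
</tr></table></div>
"""

bs4 = BeautifulSoup(sample, "html.parser")
tr = bs4.find("div", class_="rate-grid").select('tr')[0]
contents = tr.select_one('td.col-author div.rate-user-info').contents
print(contents[0].strip() + "***" + contents[2].strip())        # 张***三
print(tr.select_one('div.tm-rate-fulltxt').contents[0].strip()) # 质量很好
```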
References
Selenium documentation
Beautiful Soup 4.4.0 documentation
CSS selectors reference
selenium爬取淘宝评论信息 (Scraping Taobao review data with Selenium)
python + Selenium 爬取淘宝商品评论 (Scraping Taobao product reviews with Python and Selenium)