@前言:
工作闲暇之余自学Python,想找个项目练练手,于是瞄准了我们客户阿里给我们下发任务的网站,往常同事都是手动登录网站,手动复制粘贴Case内容到Excel。Kanshan震惊,都9102年了,怎么还要做这么低效(无脑)的工作,于是自学python尝试自动化获取case内容并且保存到本地,想一想,能有多难???
然而:人生第一次认真爬的网页有万万个没想到...
@问题和方法
-
万万没想到①:不是所有的网站都随便逛的,遇到这种拦路虎怎么办,盘他? AVMS网址
方法①:先登录网站,拿到cookies,放到headers里面请求网页,发现网页是Ajax渲染的,而且提交方式为post,此路不通。
方法②:selenium模拟登录后获取cookies,保存到本地,每次使用时再调用。先上模拟登录的代码:
@模拟登录
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
def login():
    """Log in to the AVMS site and return (task_name, case_name).

    Relies on module-level globals: driver, WAIT, url.
    Returns:
        (task_name, case_name) -- used later as the Excel file name and
        the sheet name.
    """
    driver.get(url)  # load the login page
    # Locate the username input field
    username = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#exampleInputUser")))
    # Locate the password input field
    password = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#exampleInputPassword")))
    # Locate the login button
    submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="login-button"]')))
    username.send_keys("XXXX")  # replace "XXXX" with real credentials
    password.send_keys("XXXX")
    submit.click()  # simulate the mouse click
    driver.refresh()  # reload so the post-login page renders
    # BUG FIX: the caller does `task_name, case_name = login()`, but this
    # version returned None. Return both values, as the full script does.
    task_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div/div/div[4]/div[2]/div/div[2]/div/table/tbody/tr/td[2]/span'))).text
    case_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '//*[@id="app"]/div/div/div[2]/table/tr[1]/td[2]/span'))).text
    return task_name, case_name
if __name__ == '__main__':
    # Ask the user which task to crawl and how many listing pages it spans
    task_id = input("请输入需要抓取的task_id:")
    url = 'http://www.aliavms.cn:7001/tsmanager/index.html#/detail?task_id=' + task_id
    pages_string = input("请输入需要抓取得页数:")
    pages = int(pages_string)
    # Headless Chrome alternative (swap in for the Firefox line below):
    #   options = webdriver.ChromeOptions()
    #   options.add_argument('headless')
    #   options.add_argument('disable-gpu')
    #   driver = webdriver.Chrome(options=options)
    driver = webdriver.Firefox()
    WAIT = WebDriverWait(driver, 10)  # shared explicit wait, 10 s timeout
    # Task/case names are used for the Excel file name and sheet name
    task_name, case_name = login()
(下面的获取cookies、保存、读取后来都没有用到)
import os
import json

def get_cookies():
    """Fetch the current session cookies straight from the webdriver."""
    # BUG FIX: the original fetched the cookies but neither returned nor
    # saved them, so read_cookie()'s fallback recursed forever.
    return driver.get_cookies()

def save_cookies(cookies):
    """Persist *cookies* (list of dicts from the webdriver) to cookies.txt as JSON."""
    with open("cookies.txt", "w") as fp:
        json.dump(cookies, fp)

def read_cookie():
    """Return the cached cookies as a {name: value} dict.

    On first use the cookies are fetched from the driver and cached to disk.
    """
    # BUG FIX: the original tested os.path.exists('cookies.text') (typo)
    # while writing "cookies.txt", so the cache was never found.
    if not os.path.exists('cookies.txt'):
        save_cookies(get_cookies())
    cookies_dict = dict()
    with open("cookies.txt", "r") as fp:
        cookies = json.load(fp)
    for cookie in cookies:
        cookies_dict[cookie['name']] = cookie['value']
    return cookies_dict
- 如果是静态网页,那就很简单了
import requests

headers = {
    # Pretend to be a real browser
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.75 Chrome/73.0.3683.75 Safari/537.36',
    # Paste the Cookie you just captured here
    'Cookie': 'eda38d470a662ef3606390ac3b84b86f9; Hm_lvt_f1d3b035c559e31c390733e79e080736=1553503899; biihu__user_login=omvZVatKKSlcXbJGmXXew9BmqediJ4lzNoYGzLQjTR%2Fjw1wOz3o4lIacanmcNncX1PsRne5tXpE9r1sqrkdhAYQrugGVfaBICYp8BAQ7yBKnMpAwicq7pZgQ2pg38ZzFyEZVUvOvFHYj3cChZFEWqQ%3D%3D; Hm_lpvt_f1d3b035c559e31c390733e79e080736=1553505597',
}
session = requests.Session()
url = "https://......."
# BUG FIX: the original never passed `headers` to the request, so the
# User-Agent/Cookie it carefully built were ignored by the server.
response = session.get(url, headers=headers)
print(response.text)
- 万万没想到②:driver.page_source抓取的html只有部分代码,因为是基于Ajax渲染的(虽然kanshan很菜,但是kanshan不会这么容易屈服的...)最终使用selenium+xpath定位获取到需要抓取的每页项数。
# Collect the <tr> rows of the task table on the current listing page
table_body = driver.find_element_by_xpath('/html/body/div[2]/div/div/div[8]/div[2]/div/table/tbody')
data = table_body.find_elements_by_tag_name('tr')
# len - 1: presumably one row is a header/footer row — TODO confirm
length = len(data) - 1
- 接着抓取每一项的内容,每一项又是单独的一个页面,所以规则是:点击抓取项,跳转到新的页面,因为第一次玩爬虫,所以这里也踩到坑了,因为driver的定位还在主页面,虽然另外加载了一个标签页,获取的仍然是主页面的信息,所以要做如下操作:
import time

def new_page(button1):
    """Open one item's detail tab, scrape it, then return to the listing page.

    button1 -- XPath of the clickable entry on the main page.
    """
    detail_link = WAIT.until(EC.element_to_be_clickable((By.XPATH, button1)))
    detail_link.click()
    time.sleep(2)  # give the new tab enough time to load
    # driver.window_handles lists the handles of every open tab
    child_tab = driver.window_handles[-1]  # the freshly opened detail tab
    main_tab = driver.window_handles[0]    # the main listing page
    driver.switch_to.window(child_tab)     # focus the detail tab
    save_to_excel()
    time.sleep(1)
    driver.close()                         # close the detail tab when done
    driver.switch_to.window(main_tab)      # switch focus back to the listing
- 万万没想到③:紧接着遇到新的问题:子页面里面有框架iframe的嵌套...
最终解决办法是先定位到iframe,然后再跳出,进入下一个iframe,再跳出,所以Kanshan还写了个循环。
tc_data = []  # one text blob per iframe section (description / steps / criteria)
for i in range(1, 4):
    # XPath of the i-th iframe section on the detail page
    button2 = "/html/body/div/div[2]/div/div[4]/div[%d]/div[2]/div/div/div/iframe" % i
    iframe = WAIT.until(EC.presence_of_element_located((By.XPATH, button2)))
    driver.switch_to.frame(iframe)  # enter the iframe
    try:
        data = WAIT.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
    except Exception:
        # BUG FIX: presence_of_all_elements_located raises TimeoutException on
        # an empty iframe instead of returning [], so the len()==0 branch was
        # unreachable; treat a timeout as "no data".
        data = []
    if len(data) == 0:
        # BUG FIX: the original did `tc_data = ""`, clobbering the list and
        # crashing the next .append(); record an empty cell to keep positions.
        tc_data.append("")
    else:
        text = ""
        for item in data:
            text = text + item.text + '\n'  # separate sub-elements with newlines
        tc_data.append(text)  # list append (original comment wrongly said "dict")
    # Return to the initial frame so we can enter the next iframe
    driver.switch_to.default_content()
- 万万没想到④:此外还遇到本身内容为空的情况,程序会报错,解决方法如下:
try:
    tc_class = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[4]/span'))).text
except Exception:
    # Some cases simply lack this field; fall back to an empty string
    # so the crawl keeps going instead of aborting.
    tc_class = ""
- 万万没想到⑤:接着是保存到excel的部分:这里我写了个循环,是为了解决在excel中追加新的sheet而不是覆盖。
import xlwt
import xlrd
from xlutils.copy import copy as xl_copy

if os.path.exists(u'%s.xls' % task_name):
    # Workbook already exists: read it and copy so a new sheet is APPENDED
    # rather than the file being overwritten
    read_book = xlrd.open_workbook((u'%s.xls' % task_name), formatting_info=True)
    write_book = xl_copy(read_book)
    # Add the new sheet
    sheet = write_book.add_sheet(case_name, cell_overwrite_ok=True)
else:
    # First run: create a fresh workbook.
    # BUG FIX: encoding was misspelled 'utft-8', which makes xlwt raise a
    # LookupError for the unknown codec when saving non-ASCII text.
    write_book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = write_book.add_sheet(case_name, cell_overwrite_ok=True)
# Header row
sheet.write(0, 0, '用例名称')
sheet.write(0, 1, '用例描述')
sheet.write(0, 2, '用例步骤')
sheet.write(0, 3, 'Pass/Fail标准')
sheet.write(0, 4, '用例类别')
sheet.write(0, 5, '备注说明')
sheet.write(0, 6, '结果')
n = 1       # next data row to write (row 0 is the header)
tc_num = 1  # NOTE(review): appears unused — confirm before removing
- 接上部分
def save_to_excel():
    """Scrape one case-detail page and write it as row *n* of the sheet.

    Uses globals: driver, WAIT, sheet, n.
    """
    global n  # row counter shared with page_detail(); must be declared global
    tc_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[2]/span'))).text
    print("爬取第%d项 tc_name: %s" % (n, tc_name))
    try:
        tc_class = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[4]/span'))).text
    except Exception:
        tc_class = ""  # field can be absent; keep crawling
    try:
        tc_comment = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/div[2]/div[2]'))).text
    except Exception:
        tc_comment = ""
    tc_data = []  # [description, steps, pass/fail criteria]
    for i in range(1, 4):
        button2 = "/html/body/div/div[2]/div/div[4]/div[%d]/div[2]/div/div/div/iframe" % i
        iframe = WAIT.until(EC.presence_of_element_located((By.XPATH, button2)))
        driver.switch_to.frame(iframe)
        try:
            data = WAIT.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
        except Exception:
            # BUG FIX: an empty iframe makes the WAIT raise TimeoutException
            # rather than yield []; treat a timeout as "no data".
            data = []
        if len(data) == 0:
            # BUG FIX: the original did `tc_data = ""`, clobbering the list and
            # crashing the next .append()/indexing; append an empty cell instead.
            tc_data.append("")
        else:
            text = ""
            for item in data:
                text = text + item.text + '\n'
            tc_data.append(text)
        driver.switch_to.default_content()  # leave the iframe before the next one
    tc_description = tc_data[0]
    tc_step = tc_data[1]
    tc_criteria = tc_data[2]
    sheet.write(n, 0, tc_name)
    sheet.write(n, 1, tc_description)
    sheet.write(n, 2, tc_step)
    sheet.write(n, 3, tc_criteria)
    sheet.write(n, 4, tc_class)
    sheet.write(n, 5, tc_comment)
    n += 1  # advance to the next row (column 6 is filled later by page_detail)
- 全部代码:
# -*- coding:utf-8 -*-
# Copyright (c)2019, KanShan,All rightsreserved
# Author:KanShan
#Description:输入阿里avms的task_id和页面数,自动抓取Case_info并保存...
import time
import xlwt
import xlrd
import os
from xlutils.copy import copy as xl_copy
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
def login():
    """Sign in to AVMS and return (task_name, case_name) for naming the workbook."""
    driver.get(url)
    # Form fields and submit button (explicit waits guard against slow rendering)
    user_field = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#exampleInputUser")))
    pass_field = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#exampleInputPassword")))
    login_btn = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="login-button"]')))
    user_field.send_keys("XXXX")  # "XXXX" is a placeholder for real credentials
    pass_field.send_keys("XXXX")
    login_btn.click()
    driver.refresh()
    # task_name -> Excel file name (task_name.xls)
    task_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div/div/div[4]/div[2]/div/div[2]/div/table/tbody/tr/td[2]/span'))).text
    # case_name -> sheet name inside the workbook
    case_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '//*[@id="app"]/div/div/div[2]/table/tr[1]/td[2]/span'))).text
    return task_name, case_name
def save_to_excel():
    """Scrape one case-detail page and write it as row *n* of the sheet.

    Uses globals: driver, WAIT, sheet, n.
    """
    global n  # row counter shared with page_detail(); must be declared global
    tc_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[2]/span'))).text
    print("爬取第%d项 tc_name: %s" % (n, tc_name))
    try:
        tc_class = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[4]/span'))).text
    except Exception:
        tc_class = ""  # field can be absent; keep crawling
    try:
        tc_comment = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/div[2]/div[2]'))).text
    except Exception:
        tc_comment = ""
    tc_data = []  # [description, steps, pass/fail criteria]
    for i in range(1, 4):
        button2 = "/html/body/div/div[2]/div/div[4]/div[%d]/div[2]/div/div/div/iframe" % i
        iframe = WAIT.until(EC.presence_of_element_located((By.XPATH, button2)))
        driver.switch_to.frame(iframe)
        try:
            data = WAIT.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
        except Exception:
            # BUG FIX: an empty iframe makes the WAIT raise TimeoutException
            # rather than yield [], so the len()==0 branch never ran; treat a
            # timeout as "no data".
            data = []
        if len(data) == 0:
            # BUG FIX: the original did `tc_data = ""`, clobbering the list and
            # crashing the next .append()/indexing; append an empty cell instead.
            tc_data.append("")
        else:
            text = ""
            for item in data:
                text = text + item.text + '\n'
            tc_data.append(text)
        driver.switch_to.default_content()  # leave the iframe before the next one
    tc_description = tc_data[0]
    tc_step = tc_data[1]
    tc_criteria = tc_data[2]
    sheet.write(n, 0, tc_name)
    sheet.write(n, 1, tc_description)
    sheet.write(n, 2, tc_step)
    sheet.write(n, 3, tc_criteria)
    sheet.write(n, 4, tc_class)
    sheet.write(n, 5, tc_comment)
    n += 1  # advance to the next row (column 6 is filled later by page_detail)
def new_page(button1):
    """Open one item's detail tab, scrape it via save_to_excel(), then return.

    button1 -- XPath of the clickable entry on the listing page.
    Uses globals: driver, WAIT.
    """
    detail_link = WAIT.until(EC.element_to_be_clickable((By.XPATH, button1)))
    detail_link.click()
    time.sleep(2)  # give the new tab enough time to load
    # FIX: renamed locals — the originals were `page_detail` and `new_page`,
    # shadowing the functions of the same names and inviting confusing bugs.
    child_tab = driver.window_handles[-1]  # handle of the freshly opened tab
    main_tab = driver.window_handles[0]    # handle of the listing page
    driver.switch_to.window(child_tab)     # focus the detail tab
    save_to_excel()
    time.sleep(1)
    driver.close()                         # close the detail tab when done
    driver.switch_to.window(main_tab)      # switch focus back to the listing
def page_detail():
    # Scrape every row of the current listing page: open each row's detail tab
    # (new_page -> save_to_excel) and record its test result in column 6.
    # Uses globals: driver, WAIT, sheet, n.
    data = driver.find_element_by_xpath('/html/body/div[2]/div/div/div[8]/div[2]/div/table/tbody').find_elements_by_tag_name('tr')
    # Rows left to scrape (len - 1: presumably one row is a header/footer — TODO confirm)
    length = len(data) - 1
    indexs = length
    # Table rows are 1-indexed in the XPath and the data starts at tr[2]
    for index in range(2, indexs + 2):
        if length <= 0:
            break
        else:
            # Link that opens this row's detail page in a new tab
            button1 = ('//*[@id="app"]/div/div/div[8]/div[2]/div/table/tbody/tr[%d]/td[3]/div/div/a' % index)
            try:
                new_page(button1)
                length -= 1
            except Exception as e:
                # Row could not be opened/scraped; skip it (best-effort crawl)
                pass
            # Result cell shown directly in the listing row
            button2 = (
                '/html/body/div[2]/div/div/div[8]/div[2]/div/table/tbody/tr[%d]/td[2]/table/tr/td[2]/div/div/span' % index)
            result = WAIT.until(EC.presence_of_element_located((By.XPATH, button2))).text
            print('抓取测试结果:%s' % result)
            global n
            # save_to_excel() already advanced n; step back one row to write the
            # result into column 6 of the row it just filled, then restore n.
            n -= 1
            sheet.write(n, 6, result)
            n += 1
    # NOTE(review): indentation was lost in the source; this final decrement is
    # assumed to sit after the loop (where it has no effect) — confirm intent.
    length -= 3
def main():
    # Drive the whole crawl: scrape page 1 directly, then click through the
    # paginator for pages 2..N, calling page_detail() on each page.
    # Uses globals: driver, WAIT, pages, task_name, case_name.
    print("爬取Task_name: %s" % task_name)
    print("爬取Case_name: %s" % case_name)
    if pages == 1:
        print("爬取第1页")
        page_detail()
        print("爬取完成:共1页,保存中")
        driver.close()
    elif pages >= 2:
        try:
            page_detail()
            print("爬取完成:第1页")
            for page in range(2, pages + 1):
                print("爬取第%d页" % page)
                # NOTE(review): beyond page 6 a fixed li[7] entry is clicked —
                # presumably the paginator shows at most 7 items and li[7]
                # advances the window; confirm against the live widget.
                if page > 6:
                    next_page = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div/div/div[8]/div[2]/div/div[2]/div/div/ul/li[7]')))
                else:
                    next_page = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div/div/div[8]/div[2]/div/div[2]/div/div/ul/li[%d]' % (page + 1))))
                next_page.click()
                time.sleep(3)  # allow the next page to render
                page_detail()
                print("爬取完成:第%d页" % page)
        finally:
            # Runs even if a page fails mid-crawl so the browser always closes
            driver.close()
            print("爬取完成:共%d页,保存中" % pages)
    else:
        print("页数输入错误,请输入大于等于1的整数")
        exit()
if __name__ == '__main__':
    # Ask the user which task to crawl and how many listing pages it spans
    task_id = input("请输入需要抓取的task_id:")
    url = 'http://www.aliavms.cn:7001/tsmanager/index.html#/detail?task_id=' + task_id
    pages_string = input("请输入需要抓取得页数:")
    pages = int(pages_string)
    # Headless Chrome variant, if preferred:
    #   chrome_options = webdriver.ChromeOptions()
    #   chrome_options.add_argument('headless')
    #   chrome_options.add_argument('disable-gpu')
    #   driver = webdriver.Chrome(options=chrome_options)
    driver = webdriver.Firefox()
    WAIT = WebDriverWait(driver, 10)  # shared explicit wait, 10 s timeout
    task_name, case_name = login()    # workbook name / sheet name
    if os.path.exists(u'%s.xls' % task_name):
        # Existing workbook: read + copy so the new sheet is appended,
        # not the file overwritten
        read_book = xlrd.open_workbook((u'%s.xls' % task_name), formatting_info=True)
        write_book = xl_copy(read_book)
        sheet = write_book.add_sheet(case_name, cell_overwrite_ok=True)
    else:
        # BUG FIX: encoding was misspelled 'utft-8'; xlwt would raise a
        # LookupError for the unknown codec when saving non-ASCII text.
        write_book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = write_book.add_sheet(case_name, cell_overwrite_ok=True)
    # Header row
    sheet.write(0, 0, '测试用例名称')
    sheet.write(0, 1, '测试用例描述')
    sheet.write(0, 2, '测试用例步骤')
    sheet.write(0, 3, '测试Pass/Fail标准')
    sheet.write(0, 4, '测试用例类别')
    sheet.write(0, 5, '备注说明')
    sheet.write(0, 6, '测试结果')
    n = 1       # next data row to write (row 0 is the header)
    tc_num = 1  # NOTE(review): appears unused — confirm before removing
    main()
    # Persist the workbook to disk
    write_book.save(u'%s.xls' % task_name)