翻页
- 对于这个需求,我们有两种方法。第一种是解析页面源码(page source):
import time
from selenium import webdriver
from selenium.webdriver.support.select import Select
import random
from lxml import etree
def extract_content(item):
    """Handle the nodes scraped from one result page (placeholder).

    This stub intentionally does nothing; fill in the real extraction
    logic here.

    Args:
        item: The value the paging loop passes in. In this snippet that is
            the list returned by the lxml ``xpath`` query on the page source.

    Returns:
        None.
    """
    return None
# Approach 1: walk the pager and parse each page's HTML source with lxml.
#
# NOTE(review): webdriver.PhantomJS and find_element_by_xpath are only
# available in Selenium 3.x and earlier - confirm the pinned version.
PAGE_SIZE = 10  # divisor used to turn the reported total into a page count

driver = webdriver.PhantomJS()
# webdriver.Chrome() can be substituted for the line above.
url = ''  # placeholder: set the listing URL before running
try:
    driver.get(url)
    doc = etree.HTML(driver.page_source)

    total_text = doc.xpath('//*[@id="PageTotalSpan"]/text()')
    if not total_text:
        raise RuntimeError('PageTotalSpan not found; cannot determine the page count')
    # NOTE(review): assumes PageTotalSpan holds the total number of records
    # and that each page shows PAGE_SIZE of them - confirm against the site.
    # Ceiling division: the previous "total // 10 + 1" produced one page too
    # many whenever the total was an exact multiple of the page size.
    page = (int(total_text[0]) + PAGE_SIZE - 1) // PAGE_SIZE

    for i in range(1, page + 1):
        # Page 1 is already loaded by driver.get(); only later pages need
        # the pager. Navigating BEFORE extracting ensures every page is
        # parsed exactly once, including the last one.
        if i > 1:
            page_box = driver.find_element_by_xpath('//*[@id="_PageBar_Index_list1"]')
            page_box.clear()  # clear the old page number
            page_box.send_keys(str(i))  # type the target page number
            driver.find_element_by_xpath(
                '//*[@id="PageBarDiv"]/table/tbody/tr/td/table/tbody/tr/td[7]/a/img').click()  # go to page
            # Randomised pause after the click so the new page can render
            # before its source is read (best effort, not an explicit wait).
            time.sleep(random.uniform(1, 2))

        response = etree.HTML(driver.page_source)
        contents = response.xpath('//td[@valign="top"]/table/tbody/tr/td')
        extract_content(contents)
finally:
    # Always shut the browser down, even if a lookup above raised.
    driver.quit()
- 第二种是获取加载完成后的动态内容再翻页,针对 Elements 与 page source 不一致的情况:
import random
import time
from selenium import webdriver
def extract_content(item):
    """Handle the elements captured from one result page (placeholder).

    This stub intentionally does nothing; fill in the real extraction
    logic here.

    Args:
        item: The value the paging loop passes in. In this snippet that is
            the list returned by ``driver.find_elements_by_xpath``.

    Returns:
        None.
    """
    return None
# Approach 2: read the live DOM through WebDriver after each page has loaded,
# for pages whose rendered elements differ from driver.page_source.
#
# NOTE(review): webdriver.PhantomJS and find_element(s)_by_xpath are only
# available in Selenium 3.x and earlier - confirm the pinned version.
LAST_PAGE = 9  # highest page number visited (the loop was range(1, 10))

url = 'xxxx'  # placeholder: set the listing URL before running
driver = webdriver.PhantomJS()
try:
    driver.get(url)
    for i in range(1, LAST_PAGE + 1):
        print(i)
        # Look the page-number input up once per iteration and reuse it.
        page_box = driver.find_element_by_xpath('//*[@id="_PageBar_Index_list1"]')
        page_box.clear()
        page_box.send_keys(str(i))
        driver.find_element_by_xpath(
            '//*[@id="PageBarDiv"]/table/tbody/tr/td/table/tbody/tr/td[7]/a/img').click()
        # Randomised pause giving the dynamic content time to load.
        time.sleep(random.uniform(8, 10))
        # Capture every dynamically loaded table body from the live DOM.
        contents = driver.find_elements_by_xpath('//table[@id="illExampleDataTable"]/tbody')
        extract_content(contents)
finally:
    # Always shut the browser down, even if a lookup above raised.
    driver.quit()