Scraping pages that involve user interaction and asynchronously loaded content requires simulating clicks and waiting for the response; the browser-automation tool Selenium handles this well.
First download a browser driver: geckodriver for Firefox, or chromedriver for Chrome. For example, to install chromedriver on Linux:
unzip chromedriver_linux64.zip
chmod +x chromedriver
sudo mv chromedriver /usr/bin/
pip install selenium
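To confirm the driver is set up, a quick smoke test helps (a minimal sketch; it assumes chromedriver is on the PATH and a matching version of Chrome is installed):
# Smoke test: launch Chrome, load a page, print its title, then quit.
from selenium import webdriver

browser = webdriver.Chrome()
browser.get("https://www.baidu.com/")
print(browser.title)
browser.quit()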
Scraping the court defaulter ("shixin") blacklist (Python)
# coding=utf-8
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import os
# Launch the browser (requires geckodriver for Firefox on the PATH)
#----------------------------------------------------
browser = webdriver.Firefox()
# browser = webdriver.PhantomJS()  # deprecated; modern Selenium prefers headless Chrome/Firefox
print("Scraping...")
browser.get("http://www.baidu.com/")
# Search Baidu for the enforcement-records widget (the query must stay in Chinese)
elem = browser.find_element(By.NAME, 'wd')
elem.send_keys("全国法院被执行人信息查询 - 被执行人查询")
browser.find_element(By.XPATH, '//*[@id="su"]').click()
# initialize an empty frame to collect results
data = pd.DataFrame()
print("正在爬取...")
soup = BeautifulSoup(browser.page_source, "lxml")
id_card = soup.find_all("span",attrs={"class":"op_trust_fl op_trust_papers"})
name = soup.find_all("span",attrs={"class":"op_trust_name"})
names = [x.get_text() for x in name]
id_cards = [x.get_text() for x in id_card]
tmp = pd.DataFrame({'names': names, 'id_card': id_cards})
data = pd.concat([data, tmp])  # DataFrame.append was removed in pandas 2.x
# Write the header only if the output file does not exist yet
data.to_csv("shixin.txt", mode='a', index=False, header=not os.path.exists("shixin.txt"))
time.sleep(1)
def scrapef():
    # Go to the next page of results and scrape it
    browser.find_element(By.XPATH, '//p/span[@class="op_trust_page_next OP_LOG_BTN"]').click()
    time.sleep(1.5)
    print("Scraping...")
    soup = BeautifulSoup(browser.page_source, "lxml")
    id_card = soup.find_all("span", attrs={"class": "op_trust_fl op_trust_papers"})
    name = soup.find_all("span", attrs={"class": "op_trust_name"})
    names = [x.get_text() for x in name]
    id_cards = [x.get_text() for x in id_card]
    data = pd.DataFrame({'names': names, 'id_card': id_cards})
    data.to_csv("shixin.txt", mode='a', index=False, header=False)
    print(data)

# Keep paging until the "next page" button can no longer be found
while True:
    try:
        scrapef()
    except NoSuchElementException:
        break
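The script imports WebDriverWait and expected_conditions but only uses fixed time.sleep() pauses. An explicit wait is usually more robust against slow page loads. A minimal sketch of swapping one in (the XPath is the same one used above; the 10-second timeout is an arbitrary choice):
# Wait up to 10s for the "next page" button to become clickable, then click it.
wait = WebDriverWait(browser, 10)
next_btn = wait.until(
    EC.element_to_be_clickable(
        (By.XPATH, '//p/span[@class="op_trust_page_next OP_LOG_BTN"]')
    )
)
next_btn.click()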
For R, there is no driver binding bundled the way the Java and Python clients have, so you need to download the standalone Selenium server, start the Selenium service, and download the Chrome or Firefox driver separately.
install.packages('RSelenium')
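After installing, a typical session connects to the running Selenium server and drives the browser through it. A minimal sketch (it assumes a standalone Selenium server is already listening on localhost:4444 with Firefox and geckodriver available):
library(RSelenium)
# Connect to the Selenium server started separately on port 4444
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4444L,
                      browserName = "firefox")
remDr$open()
remDr$navigate("http://www.baidu.com/")
elem <- remDr$findElement(using = "name", value = "wd")
elem$sendKeysToElement(list("全国法院被执行人信息查询"))
page_source <- remDr$getPageSource()[[1]]
remDr$close()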