Scraping judicial administration cases with Python + Selenium

The target URL is: http://alk.12348.gov.cn/LawMultiSearch?checkDatabaseID=28%2C29%2C30%2C31%2C67%2C36%2C68%2C69%2C70%2C71. The script below uses Selenium to page through the search results, saves every case link to case.txt, then requests each detail page with urllib and writes the extracted case text to essay.txt.

import unittest
import time
import re
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

class SeleniumTest(unittest.TestCase):
    def setUp(self):
        # Firefox is easier to watch while debugging; PhantomJS runs headless
        # self.driver = webdriver.PhantomJS()
        try:
            self.driver = webdriver.Firefox()
        except Exception as e:
            print(e)
            # Fall back to Chrome if Firefox is not available
            try:
                self.driver = webdriver.Chrome()
            except Exception as e:
                print(e)

    def testEle(self):
        driver = self.driver
        # Maximize the browser window
        driver.maximize_window()
        driver.get("http://alk.12348.gov.cn/LawMultiSearch?checkDatabaseID=28%2C29%2C30%2C31%2C67%2C36%2C68%2C69%2C70%2C71")
        time.sleep(3)  # give the results page time to load

        # case.txt collects the raw <a> tags; essay.txt collects the extracted case text
        file = open('C:/Users/ergou/Documents/PycharmProject/Community_corrections/case.txt', 'w', encoding='utf-8')
        fh = open('C:/Users/ergou/Documents/PycharmProject/Community_corrections/essay.txt', 'w', encoding='utf-8')
        while True:
            # Parse the current results page; it is HTML, so use the HTML parser
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            tab = soup.find_all('a', {'target': '_blank'})  # links to the individual cases
            for url in tab:
                file.write(str(url) + '\n')
            # Stop when there is no "next page" button left
            if 'page-next' not in driver.page_source:
                break
            # Locate the "next page" button and click it
            elem = driver.find_element(By.CLASS_NAME, 'page-next')
            elem.click()
            time.sleep(1)
        file.close()
        file = open('C:/Users/ergou/Documents/PycharmProject/Community_corrections/case.txt', 'r', encoding='utf-8')
        data = file.read()
        # print(data)
        # The serialized tags may contain either a raw '&' or an escaped '&amp;', so accept both
        dbid = r'dbID=(.*?)&(?:amp;)?dbName='
        dbname = r'&(?:amp;)?dbName=(.*?)&(?:amp;)?sysID='
        sysid = r'&(?:amp;)?sysID=(.*?)" target="_blank'
        dbidlist = re.compile(dbid).findall(data)
        dbnamelist = re.compile(dbname).findall(data)
        sysidlist = re.compile(sysid).findall(data)
        # Rebuild each detail-page URL from the captured parameters
        for i in range(0, len(dbidlist)):
            try:
                url = "http://alk.12348.gov.cn/Detail?dbID=" + dbidlist[i] + "&dbName=" + dbnamelist[i] + "&sysID=" + \
                      sysidlist[i]
                print(url)
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'}
                req = urllib.request.Request(url, headers=headers)
                html_data = urllib.request.urlopen(req, timeout=1).read()  # 1 s timeout; slow pages are skipped below
                content = html_data.decode("utf-8", "ignore")
                # print(content)
                # The case body sits between these two marker comments in the page source
                start = content.find("<!--案例内文开始-->")
                end = content.find("<!--案例内文结束-->")
                text = content[start:end]
                # Strip the HTML tags, leaving plain text
                pat = re.compile(r'<!?/?\w+[^>]*>')
                essay = pat.sub('', text)
                # print(essay)
                fh.write(str(essay) + '\n' * 3)
            except Exception as e:  # timeouts and other request errors land here
                print('request failed:', e)
        fh.close()
        file.close()

    def tearDown(self):
        # Close the browser when the test finishes
        self.driver.quit()
        print('down')

if __name__ == "__main__":
    unittest.main()
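
As a quick sanity check of the extraction step, here is a minimal standalone sketch of the detail-URL reconstruction and tag stripping. The sample anchor string below is made up for illustration (real lines come from case.txt); only the regular expressions mirror the script above.

import re

# A made-up line in the format the script writes to case.txt
sample = '<a href="/Detail?dbID=28&dbName=CaseLib&sysID=ABC123" target="_blank">some case title</a>'

dbid = re.compile(r'dbID=(.*?)&(?:amp;)?dbName=').findall(sample)[0]               # '28'
dbname = re.compile(r'&(?:amp;)?dbName=(.*?)&(?:amp;)?sysID=').findall(sample)[0]  # 'CaseLib'
sysid = re.compile(r'&(?:amp;)?sysID=(.*?)" target="_blank').findall(sample)[0]    # 'ABC123'

url = 'http://alk.12348.gov.cn/Detail?dbID=' + dbid + '&dbName=' + dbname + '&sysID=' + sysid
print(url)

# The same tag-stripping pattern applied to a fragment of case markup
fragment = '<p>first paragraph</p><br/><p>second paragraph</p>'
print(re.compile(r'<!?/?\w+[^>]*>').sub('', fragment))
# -> first paragraphsecond paragraph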