from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
import requests
from bs4 import BeautifulSoup
import csv
import ssl
import re
import time
# Swap the ssl module's default HTTPS context factory for the unverified one,
# so HTTPS connections that go through that factory skip certificate checks.
# NOTE(review): this is a process-wide change that weakens TLS. The only URL
# literal visible in this file is plain http:// and page fetches go through
# requests/selenium -- confirm this patch is still required.
ssl._create_default_https_context = ssl._create_unverified_context
def get_newURL(surname,
               driver_path=r"C:\Users\weimengxin\Desktop\geckodriver.exe",
               timeout=10):
    """Submit ``surname`` to the catalogue search form and return the result URL.

    A headless Firefox opens http://search.library.sh.cn/jiapu/bSearch.htm,
    types ``surname`` into the ``expr`` field and clicks the button whose
    value is '检索'. The URL is read from the second browser window
    (presumably the results open in a new window -- that is what the
    ``window_handles[1]`` switch implies).

    Args:
        surname: Text typed into the search box.
        driver_path: Location of the geckodriver executable.
        timeout: Seconds to wait for the result window to appear and load.

    Returns:
        The URL of the result window. For backward compatibility the same
        value is also stored in the module-level ``newurl``, and the driver
        object in the module-level ``browser``.

    Raises:
        Whatever Selenium raises when an element is missing, or a timeout
        error when no second window shows up within ``timeout`` seconds.
    """
    # Function-scope import: keeps the file's top-level import block untouched.
    from selenium.webdriver.support.ui import WebDriverWait

    global browser, newurl

    # The original wrapped the body in ``if __name__ == "__main__"``, which
    # silently disabled the function whenever the module was imported.
    options = Options()
    options.add_argument('-headless')
    # NOTE: executable_path / firefox_options / find_element_by_* are the
    # Selenium 3 spellings used throughout this file; kept for consistency.
    browser = webdriver.Firefox(executable_path=driver_path,
                                firefox_options=options)
    try:
        browser.get('http://search.library.sh.cn/jiapu/bSearch.htm')
        browser.find_element_by_name('expr').send_keys(surname)
        browser.find_element_by_xpath("//*[@value='检索']").click()

        # Wait for the second window instead of sleeping a fixed second and
        # hoping it exists (the original raised IndexError on a slow page).
        wait = WebDriverWait(browser, timeout)
        wait.until(lambda drv: len(drv.window_handles) > 1)
        browser.switch_to.window(browser.window_handles[1])
        # A freshly opened window can still report a blank URL for a moment.
        wait.until(lambda drv: drv.current_url not in ("", "about:blank"))

        newurl = browser.current_url
    finally:
        # Always shut Firefox down, even when a lookup above failed.
        browser.quit()
    return newurl
def get_next_page(new_url,
                  driver_path=r"C:\Users\weimengxin\Desktop\geckodriver.exe"):
    """Open ``new_url``, click the '下页' (next page) button, return the new URL.

    Each call advances exactly one page. The original ended with an
    unconditional ``return get_next_page(url_new)``, so it could only ever
    finish by raising; the code after each call in ``get_total_url``
    (``urls.append(url_new)``) was therefore unreachable.

    Args:
        new_url: URL of the result page to start from.
        driver_path: Location of the geckodriver executable.

    Returns:
        The browser's current URL after clicking the next-page button. The
        same value is also stored in the module-level ``url_new``, and the
        driver object in the module-level ``browser_1``, as before.

    Raises:
        Whatever Selenium raises when no element with value '下页' exists.
        ``get_total_url`` treats any exception from this function as
        "no further pages".
    """
    global browser_1, url_new

    options = Options()
    options.add_argument('-headless')
    # NOTE: executable_path / firefox_options / find_element_by_* are the
    # Selenium 3 spellings used throughout this file; kept for consistency.
    browser_1 = webdriver.Firefox(executable_path=driver_path,
                                  firefox_options=options)
    try:
        browser_1.get(new_url)
        browser_1.find_element_by_xpath("//*[@value='下页']").click()
        browser_1.switch_to.window(browser_1.window_handles[0])
        url_new = browser_1.current_url
    finally:
        # Quit on the failure path too; the original leaked a Firefox process
        # every time the next-page button was missing.
        browser_1.quit()
    return url_new
def get_current_data(url, timeout=30):
    """Download ``url`` and collect the text of every ``<td>`` cell.

    The response is decoded as GBK and parsed with ``html.parser``. Cells
    whose text is exactly ``''``, ``'*'`` or ``' '`` are dropped.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of remaining cell texts. The same list is also stored in the
        module-level ``data`` (the original communicated only through that
        global and returned ``None``).
    """
    global data

    headers = {
        # Reproduced byte-for-byte: the two adjacent literals are joined
        # without a separating space.
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)"
                      "AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }
    # The context manager releases the session's connections when done.
    with requests.Session() as session:
        html = session.get(url, headers=headers, timeout=timeout)
    html.encoding = "GBK"
    bs = BeautifulSoup(html.text, "html.parser")

    # BeautifulSoup lower-cases tag names while parsing, so the original
    # lookup for "H3" could never match anything.
    heading = bs.find("h3")

    data = []
    try:
        cell_texts = [cell.get_text() for cell in bs.find_all("td")]
        # Build a filtered list instead of calling data.remove() while
        # iterating data, which skipped the element after every removal and
        # left consecutive blank cells behind.
        data.extend(text for text in cell_texts if text not in ('', '*', ' '))
    except Exception:
        # Best-effort fallback kept from the original: record the heading
        # element when the cells cannot be read.
        data.append(heading)
    return data
# Load the surname table. Every row must have exactly two columns because
# dict() consumes the rows as (key, value) pairs; get_total_url() later uses
# the values as the surnames to search for.
# newline="" is what the csv module asks for when reading from a file object.
# NOTE(review): no encoding is given, so the platform default is used --
# confirm that matches how surname.csv was saved.
with open(r"C:\Users\weimengxin\Desktop\surname.csv", "rt", newline="") as sur:
    cin = csv.reader(sur)
    surname = list(cin)
surname_dict = dict(surname)
# Drop the entry keyed by an empty string if there is one; the original
# pop("") raised KeyError when the file had no such row.
surname_dict.pop("", None)
def _quit_leftover_driver(name):
    """Best-effort quit of the module-level WebDriver called ``name``, if any."""
    driver = globals().get(name)
    if driver is None:
        return
    try:
        driver.quit()
    except Exception:
        # Already closed or never fully started; nothing left to clean up.
        pass


def get_total_url():
    """Collect the result-page URLs for every surname in ``surname_dict``.

    For each value in ``surname_dict`` the search term is the value plus
    "氏". ``get_newURL`` supplies the first page (read from the global
    ``newurl``); ``get_next_page`` is then called repeatedly (reading the
    global ``url_new``) until it raises, which is taken to mean there are no
    further pages.

    Fixes over the original:
      * the page loop now ends with ``break``; the original ``while True``
        had no exit and spun forever once the last page was reached;
      * each surname gets its own URL list instead of all surnames sharing
        one ever-growing list;
      * surnames with a single page are stored too (the original skipped
        them with ``continue`` before writing to ``urls_dict``);
      * ``except Exception`` replaces bare ``except`` so Ctrl-C still works.

    Returns:
        A dict mapping each search term to its list of page URLs. The same
        dict is also stored in the module-level ``urls_dict``.
    """
    global urls_dict
    urls_dict = {}

    for t in surname_dict:
        index = surname_dict[t] + "氏"
        urls = []

        print("查询%s第1页数据" % index)
        get_newURL(index)
        urls.append(newurl)

        count = 1
        current = newurl
        while True:
            count += 1
            print("查询%s第%d页数据" % (index, count))
            try:
                get_next_page(current)
            except Exception:
                # No next-page button (or the browser failed): stop paging.
                if count == 2:
                    print("%s仅一页数据!" % index)
                else:
                    print("%s收集完毕!" % index)
                break
            if url_new in urls:
                # Clicking "next" led to a page already recorded; stop rather
                # than loop over the same pages forever.
                print("%s收集完毕!" % index)
                break
            current = url_new
            urls.append(current)

        urls_dict[index] = urls
        # Close any driver a failed helper call may have left running.
        _quit_leftover_driver("browser")
        _quit_leftover_driver("browser_1")

    return urls_dict
# surname_set = {}
# for t in surname_dict:
# get_newURL(surname_dict[t] + "氏")
# print("现在自动检索" + surname_dict[t] + "氏数据")
# print("-------------------------------------")
# get_current_data(newurl)
# all_data = data.copy()
#
# # 计算需要爬取的网页数
# try:
# total = all_data[1]
# pattern = re.compile('[0-9]+')
# match = pattern.search(total)
# total_true = int(match.group())
# sheets = total_true//10 + 1
# except:
# print("Notice: 本次检索未命中记录!")
# continue
# print("正在获取第1页数据... (总共%d页)" % sheets)
#
# try:
# get_next_page(newurl)
# print("正在获取第2页数据... (总共%d页)" % sheets)
# get_current_data(url_new)
# all_data.extend(data)
# except:
# print("Notice: 仅1页数据")
# surname_set[surname_dict[t]] = all_data
# browser_1.close()
# continue
#
# count = 2
# while True:
# try:
# get_next_page(url_new)
# get_current_data(url_new)
# count += 1
# print("正在获取第%d的数据... (总共%d页)" % (count, sheets))
# all_data.extend(data)
# except:
# surname_set[surname_dict[t]] = all_data
# break
# print("爬取" + surname_dict[t] + "氏完成 !!!")
# print("--------------------------")
# continue
2018-01-17
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- 马太福音第17章 耶稣登山变像,向三位门徒显荣,是为了归正他们的信心。门徒因为只专注基督要受难,而忽略基督预言自己...
- 南怀瑾:“能控制早晨的人,方可控制人生。 富兰克林:“我未曾见过一个早起勤奋谨慎诚实的人抱怨命运不好。” 你沉醉于...