1. Requirements
Scrape the four-level administrative divisions of every province and city, down to the township / sub-district level, from the administrative-division site http://www.xzqy.net, and save them as Excel spreadsheets.
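For concreteness, each province becomes one workbook in which column A holds the city, column B the district or county, and column C all of that district's townships joined into a single cell. A hypothetical row (the division names below are placeholders; the layout is inferred from the code in step 2):

    广州市 | 天河区 | XX街道、XX镇、…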
2. Steps
- Set up proxy IPs
I assumed the site would not rate-limit by IP, but it blocked me right after I had finished scraping a single province. So the first step is to scrape usable IPs from http://www.xicidaili.com/nn/ to use as proxies, and save them to a file.
from bs4 import BeautifulSoup
import requests

def get_ip_list(url, headers):
    # Each proxy sits in a <tr>; the IP is in the 2nd <td>, the port in the 3rd.
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # skip the table header row
        tds = ips[i].find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

if __name__ == '__main__':
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    ip_list = get_ip_list(url, headers=headers)
    # proxies = get_random_ip(ip_list)
    file = open('G://ip.txt', 'w+')  # file that collects the usable proxy IPs
    urlTest = 'https://www.baidu.com'
    for i in range(len(ip_list)):
        ip = 'http://' + ip_list[i]          # was 'http:\\', which requests cannot parse
        proxies = {'http': ip, 'https': ip}  # requests keys proxies by scheme, not 'proxy'
        try:  # keep a proxy only if it can actually fetch a page
            ss = requests.get(urlTest, proxies=proxies, timeout=5)
            if ss.status_code == 200:
                file.write(ip_list[i] + '\n')
        except Exception as e:
            print(e)
    file.close()
    print(ip_list)
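The commented-out call above refers to a get_random_ip helper that the snippet never defines. A minimal sketch of what it presumably did, returning the proxies mapping in the format requests expects:

import random

def get_random_ip(ip_list):
    # Draw one validated "ip:port" entry at random and wrap it in the
    # proxies mapping that requests.get(..., proxies=...) understands.
    ip = 'http://' + random.choice(ip_list)
    return {'http': ip, 'https': ip}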
- Scrape the data
With a pool of working proxies on disk, the crawler walks the site top-down: the home page lists the provinces, each province page lists its cities, each city page lists its districts and counties, and each district page lists its townships and sub-districts. One Excel file is written per province.
from bs4 import BeautifulSoup
import requests
import re
import xlwt
import random

def findCity(url):  # get the city- or district-level divisions listed on a page
    Target = url
    Target0 = "http://www.xzqy.net/"
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}
    ip = getip()
    rep = requests.get(url=Target, headers=headers, proxies=ip)
    soup = BeautifulSoup(rep.text, "html5lib")
    # Stringify the matching cells and split on each link.
    city = str(soup.find_all('td', class_="parent")).split('<a')
    pattern = re.compile('"./(.*htm)"')  # relative link to the child page
    pattern1 = re.compile('>(.*)</a>')   # visible division name
    del city[0]  # discard the fragment before the first link
    cityUrl = []
    cityName = []
    for i in range(len(city)):
        cityUrl.append(Target0 + pattern.findall(city[i])[0])
        cityName.append(pattern1.findall(city[i])[0])  # city or district name
    return [cityUrl, cityName]

def getMessage(url):  # get the township / sub-district names on a district page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}
    ip = getip()
    rep = requests.get(url=url, headers=headers, proxies=ip)
    soup = BeautifulSoup(rep.text, "html5lib")
    city = str(soup.find_all('td', class_="parent")).split('<a')
    pattern1 = re.compile('>(.*)</a>')
    del city[0]
    message = []
    for i in range(len(city)):
        message.append(pattern1.findall(city[i])[0])  # township / street name
    return message

def getip():  # draw a random usable proxy saved by step 1
    with open("G://ip.txt", 'r') as f:
        lines = f.readlines()
    proxys = []
    for line in lines:
        proxy = 'http://' + line.strip()  # strip the trailing newline
        proxys.append({'http': proxy, 'https': proxy})
    return random.choice(proxys)

def province(url, Pname):  # scrape one province and save it as its own workbook
    [city, name] = findCity(url)
    wrd = xlwt.Workbook()
    sheet = wrd.add_sheet(Pname)
    CityRow = 0  # row of the current city name (column 0)
    row = 0      # row of the current district (columns 1 and 2)
    print("About to scrape " + Pname + " *************************************")
    for i in range(len(name)):
        sheet.write(CityRow, 0, name[i])
        [countryU, CountryName] = findCity(city[i])
        print("Scraping " + name[i])
        for j in range(len(CountryName)):
            sheet.write(row, 1, CountryName[j])
            message = getMessage(countryU[j])
            print("Scraping " + CountryName[j])
            sheet.write(row, 2, "、".join(message))  # townships joined into one cell
            print(CountryName[j] + " written")
            row = row + 1
        print(name[i] + " written")
        CityRow = CityRow + len(CountryName)
    wrd.save("G://" + Pname + ".xls")
    print("Done")

Target = "http://www.xzqy.net/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}
ip = getip()
rep = requests.get(url=Target, headers=headers, proxies=ip)
soup = BeautifulSoup(rep.text, "html5lib")
# The home-page navigation bar holds each province's name and URL.
province0 = str(soup.find_all('div', class_="navi")).split('</a>')
pattern = re.compile('./(.*)"')
pattern1 = re.compile('>(.*)')
del province0[0]   # drop the two leading non-province fragments
del province0[0]
del province0[-1]  # and the trailing fragment after the last link
for i in range(0, len(province0)):
    target1 = Target + pattern.findall(province0[i])[0]  # province URL
    target2 = pattern1.findall(province0[i])[0]          # province name
    province(target1, target2)
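Free proxies die constantly, and a dead proxy mid-crawl is the same failure mode that interrupted the first run. A minimal sketch of a retry wrapper (the fetch helper is my own addition, not part of the original) through which every requests.get above could be routed; it re-draws a proxy from getip() after each failure:

import requests

def fetch(url, headers, tries=5):
    # Hypothetical helper: retry a request up to `tries` times,
    # drawing a fresh proxy from getip() after each failure.
    for _ in range(tries):
        try:
            rep = requests.get(url, headers=headers, proxies=getip(), timeout=10)
            if rep.status_code == 200:
                return rep
        except requests.RequestException:
            pass  # dead proxy; try another one
    raise RuntimeError("all retries failed for " + url)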
- Results
Downloading the data for all 34 of China's provinces, autonomous regions, and municipalities took about an hour and a half.
The data and source code have been uploaded to my GitHub: https://github.com/nixuanhui/spider