1. Scraping province-level addresses
The latest statistical division and urban-rural classification data is for 2019; click the link above to view it. Inspecting the page shows that each province's link and name are stored in tags of the following form.
<a href="11.html">北京市<br></a>
Because the National Bureau of Statistics pages have a simple structure, the links can be extracted directly with a regular expression. Note that the snippet above (as rendered by browser dev tools) shows double-quoted href attributes while the raw page source uses single quotes, so the pattern below accepts both:
pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
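As a quick sanity check, the pattern can be run against a sample tag (a minimal sketch; the expected output is shown in the comment):

import re

pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
sample = "<a href='11.html'>北京市<br></a>"
# each match is a (relative link, name) tuple
print(pattern.findall(sample))  # [('11.html', '北京市')]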
The code for scraping the 31 provinces is shown below. Because scraping the fifth-level data later requires hitting the server frequently, a pool of user agents is prepared to rotate through. The url is the link above, and the response encoding is set explicitly to avoid garbled text.
import requests
import re
import random
import time
import os
import pandas as pd
# build request headers, rotating through a pool of User-Agent strings
def get_headers():
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
        "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25"
    ]
    headers = {
        'Cookie': '_trs_uv=kfp3v12j_6_8t0e; SF_cookie_1=37059734; _trs_ua_s_1=kfxdjigi_6_4w48',
        'Host': 'www.stats.gov.cn',
        'Referer': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/',
        'User-Agent': random.choice(user_agent),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }
    return headers
# fetch the 31 provinces
def get_province():
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
    response = requests.get(url, headers=get_headers())
    response.raise_for_status()
    # let requests guess the encoding; the site serves GBK
    response.encoding = response.apparent_encoding
    # response.encoding = 'gbk'
    response.close()
    pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
    result = list(set(re.findall(pattern, response.text)))
    return result
# write the results to a csv file
def write_province():
    province = get_province()
    tem = []
    for i in province:
        tem.append([i[0], i[1]])
    df_province = pd.DataFrame(tem)
    df_province.to_csv('省.csv', index=False)
    return None
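A quick run illustrates the shape of the data: each entry pairs a relative link with a province name (illustrative output, based on the tag format shown above; the order is arbitrary because of the set() deduplication):

provinces = get_province()
print(len(provinces))  # 31
print(provinces[0])    # e.g. ('11.html', '北京市')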
2. Scraping city-level addresses
Comparing URLs for Hebei shows that http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html changes to http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/13.html, i.e. the suffix becomes "13.html", where "13" comes from the province tag scraped earlier:
<a href="13.html">河北省<br></a>
City-level data is stored a little differently from province-level data. It still sits in tags of the following form, but reusing the province-level regular expression also captures an extra division-code entry, e.g. "130100000000". The fix is simple: since the codes are pure digits, they are easy to filter out of the results (see the sketch after the sample tags).
<a href="13/1301.html">130100000000</a>
<a href="13/1301.html">石家庄市</a>
To safeguard the quality of the crawl, each level is written to disk immediately after it is scraped, and the saved file keeps both the link and the name, so the Hebei row is stored as ['13.html', '河北省']. Scraping the city level only requires adjusting the url and the Referer header, as the code below shows.
# fetch the 31 provinces
write_province()
province = pd.read_csv('省.csv').values
# fetch the 342 cities
def get_city(province_code):
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'gbk'
    response.close()
    pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
    result = list(set(re.findall(pattern, response.text)))
    res = []
    for j in result:
        # drop the pure-numeric code entries; every 12-digit code contains a '0'
        if '0' not in j[1]:
            res.append(j)
    return res
def write_city():
    tem = []
    for i in province:
        city = get_city(i[0])
        print('scraping:', i[1], '{} cities in total'.format(len(city)))
        time.sleep(random.random())
        for j in city:
            tem.append([i[0], i[1], j[0], j[1]])
    pd.DataFrame(tem).to_csv('市.csv', index=False)
    return None
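Once write_city() has run, each row read back via .values has the positional layout [province link, province name, city link, city name], which the later stages index as i[0]..i[3] (illustrative row):

city = pd.read_csv('市.csv').values
print(city[0])  # e.g. ['13.html' '河北省' '13/1301.html' '石家庄市']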
3. Scraping level-three and level-four (district and street) addresses
Levels three and four are scraped in much the same way as the city level; the code below is nearly a copy of the code above, differing only in how the url and Referer are built, as the short sketch below illustrates.
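To make the url/Referer construction concrete, here is how the relative links compose into page URLs (a minimal sketch; '13/1301.html' is the Shijiazhuang link scraped above, while '01/130102.html' is an illustrative district link in the relative form the pages use):

base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
# district list page: base + city link
print(base + '13/1301.html')
# street list page: base + province prefix + '/' + district link
print(base + '13.html'.split('.')[0] + '/' + '01/130102.html')
# -> .../2019/13/01/130102.html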
# fetch the 3068 districts
def get_district(city_code):
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + city_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/{}.html'.format(city_code.split('/')[0])
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'gbk'
    response.close()
    pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
    result = list(set(re.findall(pattern, response.text)))
    res = []
    for j in result:
        # keep only the name entries, not the numeric codes
        if '0' not in j[1]:
            res.append(j)
    return res
def write_district():
    tem = []
    for i in city:
        district = get_district(i[2])
        print('scraping:', i[1], i[3], '{} districts in total'.format(len(district)))
        time.sleep(random.random())
        for j in district:
            tem.append([i[0], i[1], i[2], i[3], j[0], j[1]])
            print(tem[-1], '\n')
    pd.DataFrame(tem).to_csv('区.csv', index=False)
    return None
# fetch the 43027 streets (towns/townships)
def get_road(province_code, city_code, district_code):
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code.split('.')[0] + '/' + district_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + city_code
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'gbk'
    response.close()
    pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
    result = list(set(re.findall(pattern, response.text)))
    res = []
    for j in result:
        # keep only the name entries, not the numeric codes
        if '0' not in j[1]:
            res.append(j)
    return res
def write_road():
    tem = []
    for i in district:
        success = False
        # retry until the request succeeds
        while not success:
            try:
                road = get_road(i[0], i[2], i[4])
                print(i[1], i[3], i[5], 'scraped, {} streets in total'.format(len(road)))
                time.sleep(random.random() / 2)
                success = True
            except Exception as e:
                print(e)
                print(i[1], i[3], i[5], 'failed, retrying')
        for j in road:
            tem.append([i[0], i[1], i[2], i[3], i[4], i[5], j[0], j[1]])
            print(tem[-1], '\n')
    pd.DataFrame(tem).to_csv('路.csv', index=False)
    return None
# fetch the 342 cities
write_city()
city = pd.read_csv('市.csv').values
# fetch the 3068 districts
write_district()
district = pd.read_csv('区.csv').values
# fetch the 43027 streets
write_road()
df = pd.read_csv('路.csv')
4. Scraping fifth-level addresses
Scraping the fifth level differs in two respects:
- the tags holding fifth-level addresses change, and
- the number of fifth-level records is much larger, so some optimization is needed.
Each fifth-level row carries two code cells (the division code and the urban-rural classification code), and the tag changes from <a> to <td>, so the regular expression is adjusted and the numeric codes are filtered out of the results (a quick check follows the snippet):
<td>130202002001</td>
<td>111</td>
<td>友谊里社区居委会</td>
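A quick check of the adjusted pattern against the snippet above (a minimal sketch; the table header cell '名称' also matches on the real pages and is removed in the code below):

import re

tds = '<td>130202002001</td><td>111</td><td>友谊里社区居委会</td>'
names = [t for t in re.findall('<td>(.*?)</td>', tds) if not t.isdigit()]
print(names)  # ['友谊里社区居委会']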
In addition, the fifth-level scrape keeps the try/except retry mechanism: whenever a record fails, it is re-fetched until it succeeds. To keep the crawl stable, the data is scraped and saved province by province, and the per-province files are finally merged with pandas. The code is shown below.
# fetch the 656781 fifth-level addresses
def get_community(province_code, district_code, road_code):
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code.split('.')[0] + '/' + district_code.split('/')[0] + '/' + road_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code.split('.')[0] + '/' + district_code
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'gbk'
    response.close()
    pattern = re.compile('<td>(.*?)</td>')
    result = list(set(re.findall(pattern, response.text)))
    res = []
    for j in result:
        # keep only non-numeric cells, i.e. the names
        if not re.findall(r'^\d*$', j):
            res.append(j)
    # drop the table header cell
    res.remove('名称')
    return res
def write_community(filename):
    tem = []
    # `road` is the per-province slice set by the driver loop below
    for i in road:
        success = False
        # retry until the request succeeds
        while not success:
            try:
                community = get_community(i[0], i[4], i[6])
                print(i[1], i[3], i[5], i[7], '\t------> scraped, {} committees in total'.format(len(community)))
                time.sleep(random.random() / 4)
                success = True
            except Exception as e:
                print(e)
                print(i[1], i[3], i[5], i[7], '\t------> failed, retrying')
        for j in community:
            tem.append([i[1], i[3], i[5], i[7], j])
            # print(tem[-1], '\n')
    pd.DataFrame(tem).to_csv('address/' + filename, index=False)
    return None
# merge the per-province fifth-level files
def merge():
    file_list = os.listdir('address/')
    data = pd.DataFrame()
    for i in file_list:
        # pd.concat replaces the deprecated DataFrame.append
        data = pd.concat([data, pd.read_csv('address/' + i)], ignore_index=True)
    data.rename(columns={'0': 'level_1', '1': 'level_2', '2': 'level_3', '3': 'level_4', '4': 'level_5'}, inplace=True)
    return data
# fetch the 656781 fifth-level addresses, one province at a time
os.makedirs('address', exist_ok=True)
# column '1' of 路.csv holds the province name
lis = df['1'].unique()
for i in lis:
    road = df[df['1'] == i].values
    write_community(i + '.csv')
# merge the per-province fifth-level files
address = merge()
address.to_csv('address.csv', index=False)
address.head()
5. Data preview
Full data: https://pan.baidu.com/s/1BAkVbjkJHipEArIrE7Ntwg
Extraction code: z6dx