1. Scraping province-level addresses
The latest statistical division and urban-rural classification data is for 2019; click the link above to view it. Inspecting the page shows that each province's link and name are stored in tags of the following form.
<a href="11.html">北京市<br></a>
Because the National Bureau of Statistics pages have a simple structure, the links can be extracted directly with a regular expression. Note that the snippet above (as rendered by browser dev tools) shows double-quoted href attributes while the raw page source uses single quotes, so the pattern below accepts both:
pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
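As a quick sanity check, the pattern can be run against a sample tag (a minimal sketch; the expected output is shown in the comment):

import re

pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
sample = "<a href='11.html'>北京市<br></a>"
# each match is a (relative link, name) tuple
print(pattern.findall(sample))  # [('11.html', '北京市')]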
The code for scraping the 31 provinces is shown below. Because scraping the fifth-level data later requires hitting the server frequently, a pool of user agents is prepared to rotate through. The url is the link above, and the response encoding is set explicitly to avoid garbled text.
import requests
import re
import random
import time
import os
import pandas as pd
# build request headers, rotating through a pool of User-Agent strings
def get_headers():
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
        "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25"
    ]
    headers = {
        'Cookie': '_trs_uv=kfp3v12j_6_8t0e; SF_cookie_1=37059734; _trs_ua_s_1=kfxdjigi_6_4w48',
        'Host': 'www.stats.gov.cn',
        'Referer': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/',
        'User-Agent': random.choice(user_agent),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }
    return headers
# fetch the 31 provinces
def get_province():
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
    response = requests.get(url, headers=get_headers())
    response.raise_for_status()
    # let requests guess the encoding; the site serves GBK
    response.encoding = response.apparent_encoding
    # response.encoding = 'gbk'
    response.close()
    pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
    result = list(set(re.findall(pattern, response.text)))
    return result
# write the results to a csv file
def write_province():
    province = get_province()
    tem = []
    for i in province:
        tem.append([i[0], i[1]])
    df_province = pd.DataFrame(tem)
    df_province.to_csv('省.csv', index=False)
    return None
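A quick run illustrates the shape of the data: each entry pairs a relative link with a province name (illustrative output, based on the tag format shown above; the order is arbitrary because of the set() deduplication):

provinces = get_province()
print(len(provinces))  # 31
print(provinces[0])    # e.g. ('11.html', '北京市')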
2. Scraping city-level addresses
Comparing URLs for Hebei shows that http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html changes to http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/13.html, i.e. the suffix becomes "13.html", where "13" comes from the province tag scraped earlier:
<a href="13.html">河北省<br></a>
City-level data is stored a little differently from province-level data. It still sits in tags of the following form, but reusing the province-level regular expression also captures an extra division-code entry, e.g. "130100000000". The fix is simple: since the codes are pure digits, they are easy to filter out of the results (see the sketch after the sample tags).
<a href="13/1301.html">130100000000</a>
<a href="13/1301.html">石家庄市</a>
To safeguard the quality of the crawl, each level is written to disk immediately after it is scraped, and the saved file keeps both the link and the name, so the Hebei row is stored as ['13.html', '河北省']. Scraping the city level only requires adjusting the url and the Referer header, as the code below shows.
# fetch the 31 provinces
write_province()
province = pd.read_csv('省.csv').values
# fetch the 342 cities
def get_city(province_code):
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'gbk'
    response.close()
    pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
    result = list(set(re.findall(pattern, response.text)))
    res = []
    for j in result:
        # drop the pure-numeric code entries; every 12-digit code contains a '0'
        if '0' not in j[1]:
            res.append(j)
    return res
def write_city():
    tem = []
    for i in province:
        city = get_city(i[0])
        print('scraping:', i[1], '{} cities in total'.format(len(city)))
        time.sleep(random.random())
        for j in city:
            tem.append([i[0], i[1], j[0], j[1]])
    pd.DataFrame(tem).to_csv('市.csv', index=False)
    return None
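Once write_city() has run, each row read back via .values has the positional layout [province link, province name, city link, city name], which the later stages index as i[0]..i[3] (illustrative row):

city = pd.read_csv('市.csv').values
print(city[0])  # e.g. ['13.html' '河北省' '13/1301.html' '石家庄市']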
3. Scraping level-three and level-four (district and street) addresses
Levels three and four are scraped in much the same way as the city level; the code below is nearly a copy of the code above, differing only in how the url and Referer are built, as the short sketch below illustrates.
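To make the url/Referer construction concrete, here is how the relative links compose into page URLs (a minimal sketch; '13/1301.html' is the Shijiazhuang link scraped above, while '01/130102.html' is an illustrative district link in the relative form the pages use):

base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
# district list page: base + city link
print(base + '13/1301.html')
# street list page: base + province prefix + '/' + district link
print(base + '13.html'.split('.')[0] + '/' + '01/130102.html')
# -> .../2019/13/01/130102.html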
# fetch the 3068 districts
def get_district(city_code):
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + city_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/{}.html'.format(city_code.split('/')[0])
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'gbk'
    response.close()
    pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
    result = list(set(re.findall(pattern, response.text)))
    res = []
    for j in result:
        # keep only the name entries, not the numeric codes
        if '0' not in j[1]:
            res.append(j)
    return res
def write_district():
    tem = []
    for i in city:
        district = get_district(i[2])
        print('scraping:', i[1], i[3], '{} districts in total'.format(len(district)))
        time.sleep(random.random())
        for j in district:
            tem.append([i[0], i[1], i[2], i[3], j[0], j[1]])
            print(tem[-1], '\n')
    pd.DataFrame(tem).to_csv('区.csv', index=False)
    return None
# fetch the 43027 streets (towns/townships)
def get_road(province_code, city_code, district_code):
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code.split('.')[0] + '/' + district_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + city_code
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'gbk'
    response.close()
    pattern = re.compile("<a href=['\"](.*?)['\"]>(.*?)<")
    result = list(set(re.findall(pattern, response.text)))
    res = []
    for j in result:
        # keep only the name entries, not the numeric codes
        if '0' not in j[1]:
            res.append(j)
    return res
def write_road():
    tem = []
    for i in district:
        success = False
        # retry until the request succeeds
        while not success:
            try:
                road = get_road(i[0], i[2], i[4])
                print(i[1], i[3], i[5], 'scraped, {} streets in total'.format(len(road)))
                time.sleep(random.random() / 2)
                success = True
            except Exception as e:
                print(e)
                print(i[1], i[3], i[5], 'failed, retrying')
        for j in road:
            tem.append([i[0], i[1], i[2], i[3], i[4], i[5], j[0], j[1]])
            print(tem[-1], '\n')
    pd.DataFrame(tem).to_csv('路.csv', index=False)
    return None
# fetch the 342 cities
write_city()
city = pd.read_csv('市.csv').values
# fetch the 3068 districts
write_district()
district = pd.read_csv('区.csv').values
# fetch the 43027 streets
write_road()
df = pd.read_csv('路.csv')
4. Scraping fifth-level addresses
Scraping the fifth level differs in two respects:
- the tags holding fifth-level addresses change, and
- the number of fifth-level records is much larger, so some optimization is needed.
Each fifth-level row carries two code cells (the division code and the urban-rural classification code), and the tag changes from <a> to <td>, so the regular expression is adjusted and the numeric codes are filtered out of the results (a quick check follows the snippet):
<td>130202002001</td>
<td>111</td>
<td>友谊里社区居委会</td>
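A quick check of the adjusted pattern against the snippet above (a minimal sketch; the table header cell '名称' also matches on the real pages and is removed in the code below):

import re

tds = '<td>130202002001</td><td>111</td><td>友谊里社区居委会</td>'
names = [t for t in re.findall('<td>(.*?)</td>', tds) if not t.isdigit()]
print(names)  # ['友谊里社区居委会']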
In addition, the fifth-level scrape keeps the try/except retry mechanism: whenever a record fails, it is re-fetched until it succeeds. To keep the crawl stable, the data is scraped and saved province by province, and the per-province files are finally merged with pandas. The code is shown below.
# fetch the 656781 fifth-level addresses
def get_community(province_code, district_code, road_code):
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code.split('.')[0] + '/' + district_code.split('/')[0] + '/' + road_code
    headers = get_headers()
    headers['Referer'] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/' + province_code.split('.')[0] + '/' + district_code
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = 'gbk'
    response.close()
    pattern = re.compile('<td>(.*?)</td>')
    result = list(set(re.findall(pattern, response.text)))
    res = []
    for j in result:
        # keep only non-numeric cells, i.e. the names
        if not re.findall(r'^\d*$', j):
            res.append(j)
    # drop the table header cell
    res.remove('名称')
    return res
def write_community(filename):
    tem = []
    # `road` is the per-province slice set by the driver loop below
    for i in road:
        success = False
        # retry until the request succeeds
        while not success:
            try:
                community = get_community(i[0], i[4], i[6])
                print(i[1], i[3], i[5], i[7], '\t------> scraped, {} committees in total'.format(len(community)))
                time.sleep(random.random() / 4)
                success = True
            except Exception as e:
                print(e)
                print(i[1], i[3], i[5], i[7], '\t------> failed, retrying')
        for j in community:
            tem.append([i[1], i[3], i[5], i[7], j])
            # print(tem[-1], '\n')
    pd.DataFrame(tem).to_csv('address/' + filename, index=False)
    return None
# merge the per-province fifth-level files
def merge():
    file_list = os.listdir('address/')
    data = pd.DataFrame()
    for i in file_list:
        # pd.concat replaces the deprecated DataFrame.append
        data = pd.concat([data, pd.read_csv('address/' + i)], ignore_index=True)
    data.rename(columns={'0': 'level_1', '1': 'level_2', '2': 'level_3', '3': 'level_4', '4': 'level_5'}, inplace=True)
    return data
# fetch the 656781 fifth-level addresses, one province at a time
os.makedirs('address', exist_ok=True)
# column '1' of 路.csv holds the province name
lis = df['1'].unique()
for i in lis:
    road = df[df['1'] == i].values
    write_community(i + '.csv')
# merge the per-province fifth-level files
address = merge()
address.to_csv('address.csv', index=False)
address.head()
5. Data preview
Full data: https://pan.baidu.com/s/1BAkVbjkJHipEArIrE7Ntwg
Extraction code: z6dx