1.简介
case study: openstreetmap
- available for download
- xml
- human edited
- relatable
3.迭代解析
找到数据的所有顶层标签
遍历数据集,创建一个字典
解析方式:
- 树形解析:
将数据读入内存,将它当成树结构上的节点来处理
- SAX解析器/迭代解析:
一次解析一个标签
每次看到一个标签时,都把它当成事件来处理
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many.
Fill out the count_tags function. It should return a dictionary with the
tag name as the key and number of times this tag can be encountered in
the map as value.
import xml.etree.cElementTree as ET
import pprint
def count_tags(filename):
    """Iteratively parse an OSM XML file and count every tag.

    Returns a dict mapping tag name -> number of occurrences.  Uses
    iterparse 'start' events so the file is streamed rather than loaded
    into memory as a whole tree.
    """
    tags = {}
    for _event, elem in ET.iterparse(filename, events=('start',)):
        # dict.get folds the original membership-test/else branches into one.
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
    return tags
6.循环访问道路标签
# NOTE(review): lecture fragment, not runnable as-is — the indentation was
# lost in these notes and it relies on names defined elsewhere in the file
# (filename, street_types, is_street_name, audit_street_type; the complete
# version appears in section 9).
def audit():
# Stream the file one element at a time ('start' events).
tree = ET.iterparse(filename,events=('start',))
for event,elem in tree:
# Only 'way' elements are inspected for street tags here.
if elem.tag == 'way':
for tag in elem.iter('tag'):
if is_street_name(tag):
# Collect street names grouped by their unexpected trailing type.
audit_street_type(street_types,tag.attrib['v'])
pprint.pprint(dict(street_types))
7.标签类型
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to check the "k" value for each "<tag>" and see if there are any potential problems.
We have provided you with 3 regular expressions to check for certain patterns
in the tags.
Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
"lower", for tags that contain only lowercase letters and are valid,
"lower_colon", for otherwise valid tags with a colon in their names,
"problemchars", for tags with problematic characters, and
"other", for other tags that do not fall into the other three categories.
"""
# Category patterns for second-level tag "k" values.
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify a <tag> element's "k" value and bump the matching counter.

    Categories: "lower" (only lowercase letters/underscores),
    "lower_colon" (two lowercase parts joined by one colon),
    "problemchars" (contains a problematic character), and "other".
    Elements that are not <tag> leave keys untouched.  Returns keys.
    """
    if element.tag == "tag":
        k = element.attrib['k']
        # Call .search on the pre-compiled patterns directly instead of
        # routing through re.search (same behavior, clearer intent).
        if lower.search(k):
            keys['lower'] += 1
        elif lower_colon.search(k):
            keys['lower_colon'] += 1
        elif problemchars.search(k):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    # Dead "# YOUR CODE HERE" / "pass" placeholder removed.
    return keys
def process_map(filename):
    """Tally the "k"-value categories of every <tag> in the map file.

    Feeds each parsed element through key_type and returns the final
    counts dictionary.
    """
    counts = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _event, elem in ET.iterparse(filename):
        counts = key_type(elem, counts)
    return counts
正则表达式匹配:match = re.search(pattern, string)
8.探索用户
import xml.etree.cElementTree as ET
import pprint
import re
"""
The task is find out how many unique users
have contributed to the map in this particular area!
The function process_map should return a set of unique user IDs ("uid")
"""
def get_user(element):
    """Return the element's "uid" attribute, or None when it has none.

    Same contract as the original (which fell through to an implicit
    None), but a single dict lookup instead of a membership test plus
    a second lookup.
    """
    return element.attrib.get('uid')
def process_map(filename):
    """Return the set of unique contributor user IDs ("uid") in the file."""
    users = set()
    for _, element in ET.iterparse(filename):
        # Call get_user once per element (the original called it twice:
        # once to test, once to add).
        uid = get_user(element)
        if uid:
            users.add(uid)
    return users
def test():
# NOTE(review): expects an 'example.osm' fixture next to the script and
# asserts the course's sample area has exactly 6 distinct contributors.
users = process_map('example.osm')
pprint.pprint(users)
assert len(users) == 6
if __name__ == "__main__":
test()
9.审查街道名
import xml.etree.cElementTree as ET
import pprint
import re
from collections import defaultdict
osm_file= 'example.xml'
street_type_re = re.compile(r'\b\S+\.?$',re.IGNORECASE) # extracts/matches the last word of a street name (optional trailing '.')
# Maps each unexpected street type to the set of full street names using it.
street_types = defaultdict(set)
# Street types considered already correct.
expected=['Street','Avenue','Boulevard','Drive','Court','Place']
def audit_street_type(street_types,street_name):
    """Record street_name under its trailing street type when that type
    is not in the expected list (uses module-level street_type_re and
    expected)."""
    match = street_type_re.search(street_name)
    if not match:
        return
    found_type = match.group()
    if found_type not in expected:
        street_types[found_type].add(street_name)
def is_street_name(elem):
    """True when this <tag> element's "k" attribute is 'addr:street'."""
    key = elem.attrib['k']
    return key == 'addr:street'
def audit():
    """Audit 'way' elements in the module-level osm_file and pretty-print
    the mapping of unexpected street types to the names using them."""
    for _event, element in ET.iterparse(osm_file):
        if element.tag != 'way':
            continue
        for tag in element.iter('tag'):
            if is_street_name(tag):
                audit_street_type(street_types, tag.attrib['v'])
    pprint.pprint(dict(street_types))
process/blueprint:
-- audit
-- develop plan for cleaning
-- write code to clean
11.完善街道名
"""
Your task in this exercise has two steps:
- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix
the unexpected street types to the appropriate ones in the expected list.
You have to add mappings only for the actual problems you find in this OSMFILE,
not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
The function takes a string with street name as an argument and should return the fixed name
We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
OSMFILE = "example.osm"
# Captures the final word of a street name (optional trailing period),
# e.g. "St." in "West Lexington St."
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Street types considered already correct; anything else is audited.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
"Trail", "Parkway", "Commons"]
# Abbreviation -> canonical street type; built from problems actually
# found while auditing OSMFILE (not a generalized mapping).
mapping = { "St.": "Street",
"Ave":"Avenue",
"Rd.":"Road"
}
def audit_street_type(street_types, street_name):
    """Add street_name to street_types[<type>] when its trailing word is
    not one of the expected street types."""
    matched = street_type_re.search(street_name)
    if matched and matched.group() not in expected:
        street_types[matched.group()].add(street_name)
def is_street_name(elem):
    """Return True iff this <tag> element's key is "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"
def audit(osmfile):
    """Scan osmfile and return a defaultdict(set) mapping each unexpected
    street type to the set of street names that use it.

    Uses iterparse's default 'end' events so every node/way element is
    fully built (child <tag> elements attached) before it is inspected;
    the original 'start' events can fire before children are parsed.
    The file handle is managed with a context manager instead of a bare
    open()/close() pair.
    """
    street_types = defaultdict(set)
    with open(osmfile, "rb") as osm_file:
        for _event, elem in ET.iterparse(osm_file):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                elem.clear()  # free processed elements on large files
    return street_types
def update_name(name, mapping):
    """Return name with its abbreviated street-type suffix expanded.

    Fixes two defects in the original: it returned None when no mapping
    entry matched (now the name is returned unchanged), and it used
    str.replace, which could rewrite a matching substring anywhere in
    the name rather than only the trailing street type.
    """
    for abbrev, full in mapping.items():
        if name.endswith(abbrev):
            # Replace only the suffix, not every occurrence.
            return name[:-len(abbrev)] + full
    return name
7.准备数据库 - MongoDB
你的任务是处理数据并将数据形状变成我们之前提到的模型。输出应该是字典列表,如下所示:
{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
"version":"2",
"changeset":"17206049",
"timestamp":"2013-08-03T16:43:42Z",
"user":"linuxUser16",
"uid":"1219059"
},
"pos": [41.9757030, -87.6921867],
"address": {
"housenumber": "5157",
"postcode": "60625",
"street": "North Lincoln Ave"
},
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}
你需要完成函数 shape_element。
我们提供了用于解析地图文件的函数,并调用该函数,将元素作为参数。你应该返回一个字典,其中包含该元素的已调整形状的数据。我们还提供了将数据保存到文件中的方式,使你能够稍后使用 mongoimport 将已调整形状的数据导入到 MongoDB。
注意,在此练习中,我们不使用你在上一道练习中用到的“更新街道名称”流程。如果你在最终项目中使用该代码,强烈建议你使用上一道练习中的代码更新街道名称,然后将其保存到 JSON 中。
具体来说,你应该完成以下任务:
- 你应该只处理两种类型的顶级标记:“节点”和“道路”
- “节点”和“道路”应该转换为常规键值对,以下情况除外:
--CREATED 数组中的属性应该添加到键“created”下
--经纬度属性应该添加到“pos”数组中,以用于地理空间索引编制。确保“pos”数组中的值是浮点型,不是字符串。
- 如果二级标记“k”值包含存在问题的字符,则应忽略
- 如果二级标记“k”值以“addr:”开头,则应添加到字典“address”中
- 如果二级标记“k”值不是以“addr:”开头,但是包含“:”,你可以按照自己认为最合适的方式进行处理。例如,你可以将其拆分为二级字典,例如包含“addr:”,或者转换“:”以创建有效的键。
- 如果有第二个用于区分街道类型/方向的“:”,则应该忽略该标记,例如:
<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>
应该改写为:
{...
"address": {
"housenumber": 5158,
"street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}
对于道路(ways)
<nd ref="305896090"/>
<nd ref="1719825889"/>
应该改写为:
"node_refs": ["305896090", "1719825889"]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:
{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
"version":"2",
"changeset":"17206049",
"timestamp":"2013-08-03T16:43:42Z",
"user":"linuxUser16",
"uid":"1219059"
},
"pos": [41.9757030, -87.6921867],
"address": {
"housenumber": "5157",
"postcode": "60625",
"street": "North Lincoln Ave"
},
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}
You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB.
Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to
update the street names before you save them to JSON.
In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
- attributes in the CREATED array should be added under a key "created"
- attributes for latitude and longitude should be added to a "pos" array,
for use in geospacial indexing. Make sure the values inside "pos" array are floats
and not strings.
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
process it in a way that you feel is best. For example, you might split it into a two-level
dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
the tag should be ignored, for example:
<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>
should be turned into:
{...
"address": {
"housenumber": 5158,
"street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}
- for "way" specifically:
<nd ref="305896090"/>
<nd ref="1719825889"/>
should be turned into
"node_refs": ["305896090", "1719825889"]
"""
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Top-level attributes grouped under the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    """Reshape one OSM element into the document model, or return None.

    Only "node" and "way" elements are processed.  CREATED attributes are
    nested under "created"; lat/lon become a float "pos" pair; "addr:*"
    child tags (exactly one colon, no problem characters) go under
    "address"; other colon-free "k" tags become top-level keys; a way's
    <nd> refs are collected into "node_refs".

    Fixes over the original: "visible" is only emitted when present
    (previously a None entry polluted every document), missing CREATED
    attributes no longer raise KeyError, an empty "address" dict is no
    longer attached (which broke the provided test()), and plain
    second-level tags such as "amenity" are now included per the spec.
    """
    if element.tag not in ("node", "way"):
        return None

    attr = element.attrib
    node = {'id': attr['id'], 'type': element.tag}

    if 'visible' in attr:
        node['visible'] = attr['visible']

    # Tolerate elements missing some of the CREATED attributes.
    created = {key: attr[key] for key in CREATED if key in attr}
    if created:
        node['created'] = created

    if element.tag == 'node':
        # Floats (not strings) so "pos" can back a geospatial index.
        node['pos'] = [float(attr['lat']), float(attr['lon'])]
    else:  # way
        node['node_refs'] = [nd.attrib['ref'] for nd in element.iter('nd')]

    address = {}
    for tag in element.iter('tag'):
        k = tag.attrib['k']
        if problemchars.search(k):
            continue  # spec: ignore keys with problematic characters
        if k.startswith('addr:'):
            if k.count(':') == 1:
                # addr:street:type etc. (second colon) are ignored.
                address[k[5:]] = tag.attrib['v']
        elif ':' not in k and k not in node:
            # Plain second-level tags become regular key/value pairs;
            # never clobber the structural keys already set above.
            node[k] = tag.attrib['v']
    if address:
        node['address'] = address

    return node
def process_map(file_in, pretty = False):
    """Shape every element of file_in and write one JSON document per
    line to '<file_in>.json'; return the list of shaped documents.

    You do not need to change this function.
    """
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            if not shaped:
                continue
            data.append(shaped)
            if pretty:
                fo.write(json.dumps(shaped, indent=2)+"\n")
            else:
                fo.write(json.dumps(shaped) + "\n")
    return data
def test():
# NOTE: if you are running this code on your computer, with a larger dataset,
# call the process_map procedure with pretty=False. The pretty=True option adds
# additional spaces to the output, making it significantly larger.
# NOTE(review): requires 'example.osm' on disk; the literals below are
# specific to the course's sample extract.
data = process_map('example.osm', True)
#pprint.pprint(data)
# Expected first shaped document: a bare node with no second-level tags.
correct_first_elem = {
"id": "261114295",
"visible": "true",
"type": "node",
"pos": [41.9730791, -87.6866303],
"created": {
"changeset": "11129782",
"user": "bbmiller",
"version": "7",
"uid": "451048",
"timestamp": "2012-03-28T18:31:23Z"
}
}
assert data[0] == correct_first_elem
# The last element in the sample is a way carrying an address and refs.
assert data[-1]["address"] == {
"street": "West Lexington St.",
"housenumber": "1412"
}
assert data[-1]["node_refs"] == [ "2199822281", "2199822390", "2199822392", "2199822369",
"2199822370", "2199822284", "2199822281"]
if __name__ == "__main__":
test()