数据提取基础(csv&xls&json)

1.解析csv文件

逐行读取输入的 datafile 文件,对于前 10 行数据(不包括标题行),根据“,”拆分每行,然后为每行创建一个字典,键是该字段的标题,值是该字段在该行中的值。函数 parse_file 应该返回一个字典列表,文件中的每个数据行对应列表中的一项。字段名称和值都不应该包含多余的空白,例如空格或换行符;可以使用 Python 字符串方法 strip() 删除多余的空白。
对于这道练习,只需解析前 10 行数据,所以返回的列表应该有 10 项!

datafile='beatles-diskography.csv'

def parse_file(datafile, max_rows=10):
    """Hand-parse the first ``max_rows`` data rows of a CSV file.

    Reads the header line, then builds one dict per data row that maps
    each stripped header name to the stripped field value in that row.

    Args:
        datafile: path to the CSV file.
        max_rows: number of data rows (after the header) to parse;
            defaults to 10, as the exercise requires.

    Returns:
        A list of dicts, one per parsed data row.
    """
    data = []

    # Text mode ('r') so each line is a str and split(',') works under
    # Python 3 as well; the original 'rb' mode yields bytes there and
    # splitting with a str separator raises TypeError.
    with open(datafile, 'r') as f:
        header = f.readline().split(',')

        for counter, line in enumerate(f):
            if counter == max_rows:
                break

            fields = line.split(',')
            entry = {}

            for i, value in enumerate(fields):
                entry[header[i].strip()] = value.strip()

            data.append(entry)
    return data
parse_file(datafile) 

2.使用csv模块

import csv
filename = 'beatles-diskography.csv'
def parse_file(filename):
    """Read a CSV file into a list of row dicts using csv.DictReader.

    DictReader consumes the first line as the header; every subsequent
    row becomes a dict mapping header field -> string value.

    Returns:
        A list of dict rows.
    """
    # Text mode: under Python 3 the csv module requires str input;
    # the original binary 'rb' mode fails there.
    with open(filename, 'r') as f:
        return list(csv.DictReader(f))
parse_file(filename)

3.xlrd简介

import xlrd
datafile = "2013_ERCOT_Hourly_Load_Data.xls"                     #设置要载入的文件


def parse_file(datafile):
    """Tour of basic xlrd features (Python 2 print-statement syntax).

    Loads an .xls workbook, reads the whole first sheet into a list of
    lists, and prints example cell values, cell types, a column slice,
    and an Excel-float-to-datetime-tuple conversion.

    Returns the list-of-lists representation of the sheet.
    """
    workbook = xlrd.open_workbook(datafile)                      # open the workbook
    sheet = workbook.sheet_by_index(0)                           # select the sheet to process (the first one)

    data = [[sheet.cell_value(r, col)                            # walk all rows and columns, reading everything into a Python list
                for col in range(sheet.ncols)] 
                    for r in range(sheet.nrows)]

    print "\nList Comprehension"                                 # print the value at row 3, column 2 of the list built above
    print "data[3][2]:",
    print data[3][2]

    print "\nCells in a nested loop:"                             # traverse the whole sheet one row at a time, then move across the columns
    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            if row == 50:                                        # when we reach row 50, print every value in that row, one column at a time
                print sheet.cell_value(row, col),


    ### other useful methods:
    print "\nROWS, COLUMNS, and CELLS:"
    print "Number of rows in the sheet:", 
    print sheet.nrows                                             # number of rows in the worksheet
    print "Type of data in cell (row 3, col 2):", 
    print sheet.cell_type(3, 2)                                   # type code of a given cell's value
                                                                  # 2 means float, 3 means date
    print "Value in cell (row 3, col 2):", 
    print sheet.cell_value(3, 2)                                 # get the cell's value as an appropriate Python value
    print "Get a slice of values in column 3, from rows 1-3:"
    print sheet.col_values(3, start_rowx=1, end_rowx=4)           # slice of column 3:
                                                                  # three values, starting at row 1 and running
                                                                  # up to (but not including) row 4

    print "\nDATES:"
    print "Type of data in cell (row 1, col 0):", 
    print sheet.cell_type(1, 0)                                                        
    exceltime = sheet.cell_value(1, 0)                                                 # read the cell's value (a date); .xls stores dates as floats
    print "Time in Excel format:",
    print exceltime
    print "Convert time to a Python datetime tuple, from the Excel float:",
    print xlrd.xldate_as_tuple(exceltime, 0)                                           # turn the Excel float into a value Python can treat as a time
                                                                                       # convert to python datetime tuple
    return data

data = parse_file(datafile)

4.练习:读取excel文件

import xlrd
import pprint 

datafile = '2013_ERCOT_Hourly_Load_Data.xls'

def parse_file(datafile):
    """Summarize column 1 (the COAST load) of the ERCOT hourly-load sheet.

    Finds the maximum and minimum values in column 1 (skipping the
    header row), the times at which they occur (column 0, converted
    from Excel float dates to datetime tuples), and the column average.

    Returns:
        dict with keys 'maxtime', 'maxvalue', 'mintime', 'minvalue',
        'avgcoast'.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # NOTE: the original also read the entire sheet into a list of
    # lists here and then immediately overwrote it -- dead work, removed.

    # Column 1 values, skipping the header row.
    cv = sheet.col_values(1, start_rowx=1, end_rowx=None)
    maxval = max(cv)
    minval = min(cv)

    # +1 converts an index within cv back to a sheet row number,
    # because cv excludes the header row (row 0).
    maxpos = cv.index(maxval) + 1
    minpos = cv.index(minval) + 1

    # Column 0 holds Excel float dates; mode 0 = 1900-based date system.
    realmaxtime = xlrd.xldate_as_tuple(sheet.cell_value(maxpos, 0), 0)
    realmintime = xlrd.xldate_as_tuple(sheet.cell_value(minpos, 0), 0)

    return {
        'maxtime': realmaxtime,
        'maxvalue': maxval,
        'mintime': realmintime,
        'minvalue': minval,
        'avgcoast': sum(cv) / float(len(cv)),
    }
data = parse_file(datafile)
pprint.pprint(data)

5.探索json

5.1主函数

import json
import requests

BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"


# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    """
    This is the main function for making queries to the musicbrainz API. The
    query should return a json document.

    url:    base endpoint URL (e.g. ARTIST_URL).
    params: dict of query parameters; NOTE: mutated in place to add 'fmt'.
    uid:    optional entity id appended directly to the url.
    fmt:    response format requested from the API (default json).
    """
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "requesting", r.url

    # Return the parsed JSON body on HTTP 200; otherwise raise the
    # corresponding HTTPError instead of returning None silently.
    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    """
    Add an artist-name search term to the query parameters, then delegate
    the actual API request to query_site above.

    NOTE: mutates `params` in place by setting its 'query' key.
    """
    params["query"] = "artist:%s" % name
    return query_site(url, params)


def pretty_print(data, indent=4):
    """
    After we get our output, we can use this function to format it to be more
    readable.

    Dicts are dumped as indented, key-sorted JSON; anything else is
    printed as-is.
    """
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data


def main():
    """
    Below is an example investigation to help you get started in your
    exploration. Modify the function calls and indexing below to answer the
    questions on the next quiz.

    HINT: Note how the output we get from the site is a multi-level JSON
    document, so try making print statements to step through the structure one
    level at a time or copy the output to a separate output file. Experimenting
    and iteration will be key to understand the structure of the data!
    """

    # Query for information in the database about bands named Nirvana
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    pretty_print(results)

    # Isolate information from the 4th band returned (index 3)
    print "\nARTIST:"
    pretty_print(results["artists"][3])

    # Query for releases from that band using the artist_id
    artist_id = results["artists"][3]["id"]
    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]

    # Print information about releases from the selected band
    print "\nONE RELEASE:"
    pretty_print(releases[0], indent=2)

    # Collect every release title and print them one per line.
    release_titles = [r["title"] for r in releases]
    print "\nALL TITLES:"
    for t in release_titles:
        print t

if __name__ == '__main__':
    main()

5.2练习

5.2.1 how many bands named 'FIRST AID KIT'?

# 5.2.1: count how many of the returned artists are literally named
# "First Aid Kit" (case-insensitive comparison on the 'name' field).
results_1 = query_by_name(ARTIST_URL, query_type["simple"], "FIRST AID KIT")
pretty_print(results_1)
n=0
for result_1 in results_1['artists']:
    if result_1['name'].lower()=='first aid kit':
        n+=1
print n

5.2.2 begin-area name for queen?

# 5.2.2: print the 'begin-area' name for the band Queen; only hits that
# actually carry a 'begin-area' key are considered, to avoid a KeyError.
results_2 = query_by_name(ARTIST_URL, query_type["simple"], "QUEEN")
for result_2 in results_2['artists']:
    if result_2['name'].lower()=='queen' and 'begin-area' in result_2.keys():
        print result_2['begin-area']['name']

5.2.3 spanish alias for beatles?

# 5.2.3: find the Spanish alias for The Beatles. First list every hit
# with its index, then inspect the aliases of hit 7 (the index was
# picked by eyeballing the listing printed below) and print the alias
# whose locale is 'es'.
results_3 = query_by_name(ARTIST_URL, query_type["aliases"], "BEATLES")
result_3 = results_3['artists']
for i,value in enumerate(result_3):
    print(i,value['name'])

# show the locale of every alias carried by hit 7
for aliase in results_3['artists'][7]['aliases']:
    print aliase['locale']

# print the alias whose locale is Spanish ('es')
for aliase in results_3['artists'][7]['aliases']:
    if aliase['locale'] =='es':
        print aliase['name']

5.2.4 nirvana disambiguation?

# 5.2.4: print the disambiguation string of the US artist named 'Nirvana'
# (guarding against hits that have no 'country' key).
results_4 = query_by_name(ARTIST_URL, query_type["simple"], "NIRVANA")
for result_4 in results_4['artists']:
    if result_4['name'] =='Nirvana' and 'country' in result_4.keys() and result_4['country']=='US':
        print result_4['disambiguation']

5.2.5 when was one direction formed ?

# 5.2.5: print the 'life-span' begin date for One Direction, i.e. when
# the band was formed (guarding against hits lacking 'life-span').
results_5 = query_by_name(ARTIST_URL, query_type["simple"], "one direction")
for result_5 in results_5['artists']:
    if result_5['name'].lower()=='one direction' and 'life-span' in result_5.keys():
        print result_5['life-span']['begin']

6.习题集

6.1 使用csv模块

csv文件第一行如下:
745090,"MOUNTAIN VIEW MOFFETT FLD NAS",CA,-8.0,37.400,-122.050,12
第一行既不是数据项,也不是标题。它是用来描述数据来源的行。你应该从该行中提取气象站名称。
数据应该以“包含列表的列表”的形式返回(而不是字典)。可以使用 csv 模块的 reader 方法获取此类格式的数据。另一个有用的函数是 next(),可以用来获取迭代器中的下一行。

import csv

datafile = "745090.csv"


def parse_file(datafile):
    """Parse a weather-station CSV whose first line describes the source.

    The first line is not data and not a header; its second field is the
    station name. The second line (the column headers) is skipped, and
    every remaining line is collected as a list of field strings.

    Returns:
        (name, data): the station name and a list of row lists.
    """
    data = []
    # Text mode: the csv module requires str input under Python 3.
    with open(datafile, 'r') as f:
        csv_reader = csv.reader(f)
        # First row is the source-description line; field 1 is the name.
        name = next(csv_reader)[1]
        # Skip the header row. next(csv_reader) works on both Python 2
        # and 3, unlike the Python-2-only csv_reader.next() used before.
        next(csv_reader)
        for line in csv_reader:
            data.append(line)
    # Do not change the line below
    return (name, data)
parse_file(datafile)

6.2 Excel 至 CSV

计算以下每个地区的最大加载时间和值:COAST、EAST、FAR_WEST、NORTH、NORTH_C、SOUTHERN、SOUTH_C、WEST,并将结果写到 csv 文件中,使用竖线字符“|”作为分隔符。

import xlrd

def parse_file(datafile):
    """For each of the 8 region columns in the ERCOT hourly-load sheet,
    find the maximum load and the time it occurred.

    Returns:
        dict mapping station name -> {"maxval": float,
                                      "maxtime": datetime tuple}.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = {}
    # Columns 1..8 hold the regions (COAST ... WEST); row 0 holds their names.
    for n in range (1, 9):
        station = sheet.cell_value(0, n)
        cv = sheet.col_values(n, start_rowx=1, end_rowx=None)

        maxval = max(cv)
        maxpos = cv.index(maxval) + 1                  # +1: cv excludes the header row
        maxtime = sheet.cell_value(maxpos, 0)          # column 0 is the Excel float date
        realtime = xlrd.xldate_as_tuple(maxtime, 0)    # mode 0 = 1900-based date system
        data[station] = {"maxval": maxval,
                         "maxtime": realtime}

    print data
    return data

def save_file(data, filename):
    """Write the per-station max-load summary to a pipe-delimited CSV.

    Args:
        data: mapping station name -> {"maxval": float,
              "maxtime": (year, month, day, hour, minute, second)},
              as produced by parse_file above.
        filename: output path.
    """
    # Function-scope import: this section only imports xlrd at the top,
    # so csv would otherwise be a NameError when run standalone.
    import csv

    with open(filename, "w") as f:
        w = csv.writer(f, delimiter='|')
        w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
        # Iterate items() directly instead of keys() + repeated lookups.
        for station, info in data.items():
            year, month, day, hour, _, _ = info["maxtime"]
            w.writerow([station, year, month, day, hour, info["maxval"]])
            
parse_file('2013_ERCOT_Hourly_Load_Data.xls')
save_file(parse_file('2013_ERCOT_Hourly_Load_Data.xls'),'2013_Max_Loads.csv')
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 218,284评论 6 506
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 93,115评论 3 395
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 164,614评论 0 354
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 58,671评论 1 293
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 67,699评论 6 392
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,562评论 1 305
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 40,309评论 3 418
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 39,223评论 0 276
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,668评论 1 314
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,859评论 3 336
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,981评论 1 348
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,705评论 5 347
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 41,310评论 3 330
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,904评论 0 22
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 33,023评论 1 270
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 48,146评论 3 370
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,933评论 2 355

推荐阅读更多精彩内容