2.1 基础Python与pandas
读写csv文件
基础Python,不使用csv
import sys
input_file = sys.argv[1]
output_file = sys.argv[2]
with open(input_file, 'r', newline='') as filereader:
with open(output_file, 'w', newline='') as filewriter:
header = filereader.readline()
header = header.strip()
header_list = header.split(',')
print(header_list)
filewriter.write(','.join(map(str, header_list))+'\n')
for row in filereader:
row = row.strip()
row_list = row.split(',')
print(row_list)
filewriter.write(','.join(map(str, row_list))+'\n')
-
join()
方法将列表元素以指定字符串相连。
seq = ['a', 'b', 'c', 'd']
return(','.join(seq))
>>>a,b,c,d
-
map()
函数根据提供函数对指定序列做映射。
def square(x):
return x ** 2
return(map(square, [1, 2, 3, 4, 5]))
>>>[1, 4, 9, 16, 25]
-
lambda
匿名函数。
f = lambda a,b,c: a+b+c
return(f(1, 2, 3))
>>>6
使用pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
print(data_frame)
data_frame.to_csv(output_file, index=False)
- 在
pandas
中,使用数据框保留了“表格”的形式。
基础Python,使用csv模块
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv, delimiter=',')
filewriter = csv.writer(output_csv, delimiter=',')
for row_list in filereader:
print(row_list)
filewriter.writerow(row_list)
- 使用csv模块时候,尽管在csv的数字内增加了 ‘ , ’ 以表示数字位数,但csv会自动使用 “ ” 将内容括起。
2.2 筛选特定的行
行中的值满足某个条件
基础Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv, delimiter=',')
filewriter = csv.writer(output_csv, delimiter=',')
header = next(filereader)
filewriter.writerow(header)
for row in filereader:
supplier = str(row[0]).strip()
cost = str(row[3]).strip('$').replace(',', '')
if supplier.lower() == 'supplier Z'.lower() or float(cost) > 600.0:
filewriter.writerow(row)
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
data_frame['Cost'] = data_frame['Cost'].str.strip('$').astype(float)
data_frame_value_meets_condition = data_frame.loc[(data_frame['Supplier Name']
.str.contains('Z')) | (data_frame['Cost'] > 600.0), :]
data_frame_value_meets_condition.to_csv(output_file, index=False)
-
contains()
函数,搜索包含参数的字符串。 -
loc()
函数接受两个参数,第一个为行第二个为列。 - DataFrame中,如果需要设定多个条件,将需要的逻辑顺序用"$"和"|"连接,分别为两个都必须为真和一个为真就可以。
行中的值属于某个集合
基础Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
important_dates = ['1/20/14', '1/30/14']
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
header = next(filereader)
filewriter.writerow(header)
for row in filereader:
a_date = row[4]
if a_date in important_dates:
filewriter.writerow(row)
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
important_dates = ['1/20/14', '1/30/14']
data_frame_value_in_set = data_frame.loc[(data_frame['Purchase Date'].isin(important_dates)), :]
data_frame_value_in_set.to_csv(output_file, index=False)
-
isin()
函数,搜索在集合内的参数。
行中的值匹配于某个模式/正则表达式
基础Python
import sys
import csv
import re
input_file = sys.argv[1]
output_file = sys.argv[2]
my_pattern = re.compile(r'(?P<pattern_group>^001-.*)', re.I)
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv, delimiter=',')
filewriter = csv.writer(output_csv, delimiter=',')
header = next(filereader)
filewriter.writerow(header)
for row in filereader:
invoice_number = row[1]
if my_pattern.search(invoice_number):
filewriter.writerow(row)
- 正则表达式
(r'(?P<pattern_group>^001-.*)', re.I)
内,^
为插入符号,代表只在开头搜索;.
句点表示可匹配除换行符外的任何字符;*
表示重复前面字符0次或更多次,.*
组合在一起表示除换行符外的任意字符可以在“001-”
后面出现任意次。
Pandas
import sys
import re
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
data_frame_value_matches_pattern = data_frame.loc[data_frame['Invoice Number'].
str.startswith('001-'), :]
data_frame_value_matches_pattern.to_csv(output_file, index=False)
-
startswith()
函数,搜索以参数为开头的内容。
2.3 选取特定的列
列索引值
基础Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
my_column = [0, 3]
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
for row in filereader:
output_row = []
for index_value in my_column:
output_row.append(row[index_value])
filewriter.writerow(output_row)
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
data_frame_column_by_index = data_frame.iloc[:, [0,3]]
data_frame_column_by_index.to_csv(output_file, index=False)
-
iloc()
函数只能根据行号和列号进行索引(0、1、-1),loc()
函数可以根据切片、名称等,但不能使用不存在的索引如-1。
列标题
基础Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
my_columns = ['Invoice Number', 'Purchase Date']
my_columns_index = []
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
header = next(filereader, None)
for index_value in range(len(header)):
if header[index_value] in my_columns:
my_columns_index.append(index_value)
filewriter.writerow(my_columns)
for row in filereader:
row_list_output = []
for index_value in my_columns_index:
row_list_output.append(row[index_value])
filewriter.writerow(row_list_output)
- 使用
next()
读取文件第一行,获取对应名称的索引值。
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
data_frame_column_by_name = data_frame.loc[:, ['Invoice Number', 'Purchase Date']]
data_frame_column_by_name.to_csv(output_file, index=False)
2.4 选取连续的行
基础Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
row_count = 0
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
for row in filereader:
if row_count >= 3 and row_count <= 15:
filewriter.writerow(row)
row_count += 1
- 使用
row_count
变量进行计数,以选取索引值为3-15(表格中4-16)行的内容。
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file, header=None)
data_frame = data_frame.drop([0, 1, 2, 16, 17, 18])
data_frame.columns = data_frame.iloc[0]
data_frame = data_frame.reindex(data_frame.index.drop(3))
data_frame.to_csv(output_file, index=False)
难点在于头行被注释占据,因此直接使用
iloc()
和loc()
函数选取4-16行的内容会出现IndexError: positional indexers are out-of-bounds
的错误内容。导入csv时给予参数
header=None
,使用drop()
函数丢弃前三行和后三行。data_frame.columns = data_frame.iloc[0]
根据此时索引值为0的行作为列索引,最后使用data_frame = data_frame.reindex(data_frame.index.drop(3))
将标签为3的行丢弃并重新生成索引。
2.5 添加标题行
基础Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
headers = ['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
filewriter.writerow(headers)
for row in filereader:
filewriter.writerow(row)
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
headers = ['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
data_frame = pd.read_csv(input_file, header=None, names=headers)
data_frame.to_csv(output_file, index=False)
-
data_frame = pd.read_csv(input_file, header=None, names=headers)
直接将现有列表设置为列标题行。
2.6 读取多个csv文件
文件计数与文件中的行列计算
import sys
import glob
import csv
import os
input_path = sys.argv[1]
file_counter = 0
for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
row_counter = 1
with open(input_file, 'r', newline='') as csv_file:
filereader = csv.reader(csv_file)
header = next(filereader, None)
for row in filereader:
row_counter += 1
print(f"{os.path.basename(input_file)}:\t{row_counter} rows \t{len(header)} columns")
file_counter += 1
print(f"Number of files: {file_counter}.")
- 使用
file_counter
变量计算文件数,使用row_counter
计算文件内行数。 -
os
模块包含一系列路径名函数,os.path.join
函数将这个文件夹路径和所有符合特定模式的文件名连接起来。代码中是以“sales_*
"为开头的文件,实际情况中要获取路径内所有csv文件则为“*.csv
"。打印(os.path.join(input_path, 'sales_*')
的内容,结果为字符串:>>>input_files\sales_*
。 -
glob
模块可以找出与特定模式匹配的所有路径名,glob.glob()
函数扩展os.path.join
函数获取的内容,生成一个包含每个文件的列表。 -
os.path.basename()
函数从文件的完整路径名中抽出基本文件名。
2.7 从多个文件中连接数据
基础Python
import sys
import glob
import csv
import os
input_path = sys.argv[1]
output_file = sys.argv[2]
first_file = True
for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
print(os.path.basename(input_file))
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'a', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
if first_file:
for row in filereader:
filewriter.writerow(row)
first_file = False
else:
header = next(filereader, None)
for row in filereader:
filewriter.writerow(row)
- 使用
first_file
布尔值变量,仅仅获取第一个文件的第一行,后续文件使用next()
函数处理掉文件首行并追加进文件。
Pandas
import sys
import glob
import os
import pandas as pd
input_path = sys.argv[1]
output_file = sys.argv[2]
all_files = glob.glob(os.path.join(input_path, 'sales_*'))
all_data_frame = []
for file in all_files:
data_frame = pd.read_csv(file, index_col=None)
all_data_frame.append(data_frame)
data_frame_concat = pd.concat(all_data_frame, axis=0, ignore_index=True)
data_frame_concat.to_csv(output_file, index=False)
concat()
方法用于连接两个或多个数组。该方法不会改变现有的数组,而仅仅会返回被连接数组的一个副本。代码中将所有数据框连接成为一个数据框,axis
参数用于设置方法,0表示垂直堆叠,1表示并排平行堆叠NumPy
模块也提供了一些函数用于连接数据,通常导入为np
。垂直连接使用np.concatenate([array1, array2], axis=0)
,np.vstack((array1, array2))
或np.r_[array1, array2]
实现;平行连接使用np.concatenate([array1, array2], axis=1)
,np.hstack((array1, array2))
或np.c_[array1, array2]
。==Pandas提供了类似SQL join操作的
merge()
函数:pd.merge(DataFrame1, DataFrame2, on='key', how='inner')
。==
2.8 计算每个文件中值的总和与均值
基础Python
import sys
import glob
import csv
import os
input_path = sys.argv[1]
output_file = sys.argv[2]
output_headers = ['file_name', 'total_sales', 'average_sales']
with open(output_file, 'a', newline='') as output_csv:
filewriter = csv.writer(output_csv)
filewriter.writerow(output_headers)
for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
with open(input_file, 'r', newline='') as input_csv:
filereader = csv.reader(input_csv)
header = next(filereader, None)
amounts = []
number_of_sales = 0.0
for row in filereader:
amount = float(str(row[3]).strip('$').replace(',', ''))
amounts.append(amount)
number_of_sales += 1.0
total_amount = sum(amounts)
print(type(total_amount))
print(type(number_of_sales))
average_amount = total_amount / number_of_sales
filewriter.writerow([os.path.basename(input_file), total_amount, average_amount])
- 主要是将
').replace(',', ''))``转化为浮点数进行运算。
Pandas
import sys
import glob
import os
import pandas as pd
input_path = sys.argv[1]
output_file = sys.argv[2]
all_files = glob.glob(os.path.join(input_path, 'sales_*'))
all_data_frames = []
for input_file in all_files:
data_frame = pd.read_csv(input_file, index_col=None)
total_sales = pd.DataFrame([float(str(value).strip('$').replace(',', ''))\
for value in data_frame.loc[:, 'Sale Amount']]).sum()
average_sales = pd.DataFrame([float(str(value).strip('$').replace(',', ''\
)) for value in data_frame.loc[:, 'Sale Amount']]).mean()
data = {'file_name': os.path.basename(input_file),
'total_sales': total_sales,
'average_sales': average_sales}
all_data_frames.append(pd.DataFrame(data,\
columns=['file_name', 'total_sales', 'average_sales']))
data_frames_concat = pd.concat(all_data_frames, axis=0, ignore_index=True)
data_frames_concat.to_csv(output_file, index=False)
- 注意17-21行的建立DataFrame的代码,通过字典的形式。