十九、数据整理(2)
作者:Chris Albon
译者:飞龙
协议:CC BY-NC-SA 4.0
将变量转换为时间序列
# 导入库
import pandas as pd
# 创建索引为一组名称的数据集
raw_data = {'date': ['2014-06-01T01:21:38.004053', '2014-06-02T01:21:38.004053', '2014-06-03T01:21:38.004053'],
'score': [25, 94, 57]}
df = pd.DataFrame(raw_data, columns = ['date', 'score'])
df
|
date |
score |
0 |
2014-06-01T01:21:38.004053 |
25 |
1 |
2014-06-02T01:21:38.004053 |
94 |
2 |
2014-06-03T01:21:38.004053 |
57 |
# 转置数据集,使索引(在本例中为名称)为列
df["date"] = pd.to_datetime(df["date"])
df = df.set_index(df["date"])
df
|
date |
score |
date |
|
|
--- |
--- |
--- |
2014-06-01 01:21:38.004053 |
2014-06-01 01:21:38.004053 |
25 |
2014-06-02 01:21:38.004053 |
2014-06-02 01:21:38.004053 |
94 |
2014-06-03 01:21:38.004053 |
2014-06-03 01:21:38.004053 |
57 |
在 Pandas 数据帧中计数
# 导入库
import pandas as pd
year = pd.Series([1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884,
1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894])
guardCorps = pd.Series([0,2,2,1,0,0,1,1,0,3,0,2,1,0,0,1,0,1,0,1])
corps1 = pd.Series([0,0,0,2,0,3,0,2,0,0,0,1,1,1,0,2,0,3,1,0])
corps2 = pd.Series([0,0,0,2,0,2,0,0,1,1,0,0,2,1,1,0,0,2,0,0])
corps3 = pd.Series([0,0,0,1,1,1,2,0,2,0,0,0,1,0,1,2,1,0,0,0])
corps4 = pd.Series([0,1,0,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,0,0])
corps5 = pd.Series([0,0,0,0,2,1,0,0,1,0,0,1,0,1,1,1,1,1,1,0])
corps6 = pd.Series([0,0,1,0,2,0,0,1,2,0,1,1,3,1,1,1,0,3,0,0])
corps7 = pd.Series([1,0,1,0,0,0,1,0,1,1,0,0,2,0,0,2,1,0,2,0])
corps8 = pd.Series([1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,1,0,1])
corps9 = pd.Series([0,0,0,0,0,2,1,1,1,0,2,1,1,0,1,2,0,1,0,0])
corps10 = pd.Series([0,0,1,1,0,1,0,2,0,2,0,0,0,0,2,1,3,0,1,1])
corps11 = pd.Series([0,0,0,0,2,4,0,1,3,0,1,1,1,1,2,1,3,1,3,1])
corps14 = pd.Series([ 1,1,2,1,1,3,0,4,0,1,0,3,2,1,0,2,1,1,0,0])
corps15 = pd.Series([0,1,0,0,0,0,0,1,0,1,1,0,0,0,2,2,0,0,0,0])
variables = dict(guardCorps = guardCorps, corps1 = corps1,
corps2 = corps2, corps3 = corps3, corps4 = corps4,
corps5 = corps5, corps6 = corps6, corps7 = corps7,
corps8 = corps8, corps9 = corps9, corps10 = corps10,
corps11 = corps11 , corps14 = corps14, corps15 = corps15)
horsekick = pd.DataFrame(variables, columns = ['guardCorps',
'corps1', 'corps2',
'corps3', 'corps4',
'corps5', 'corps6',
'corps7', 'corps8',
'corps9', 'corps10',
'corps11', 'corps14',
'corps15'])
horsekick.index = [1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884,
1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894]
horsekick
|
guardCorps |
corps1 |
corps2 |
corps3 |
corps4 |
corps5 |
corps6 |
corps7 |
corps8 |
corps9 |
corps10 |
corps11 |
corps14 |
corps15 |
1875 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
1 |
0 |
1876 |
2 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
1877 |
2 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
1 |
0 |
2 |
0 |
1878 |
1 |
2 |
2 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
1879 |
0 |
0 |
0 |
1 |
1 |
2 |
2 |
0 |
1 |
0 |
0 |
2 |
1 |
0 |
1880 |
0 |
3 |
2 |
1 |
1 |
1 |
0 |
0 |
0 |
2 |
1 |
4 |
3 |
0 |
1881 |
1 |
0 |
0 |
2 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
1882 |
1 |
2 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
1 |
2 |
1 |
4 |
1 |
1883 |
0 |
0 |
1 |
2 |
0 |
1 |
2 |
1 |
0 |
1 |
0 |
3 |
0 |
0 |
1884 |
3 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
2 |
0 |
1 |
1 |
1885 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
2 |
0 |
1 |
0 |
1 |
1886 |
2 |
1 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
3 |
0 |
1887 |
1 |
1 |
2 |
1 |
0 |
0 |
3 |
2 |
1 |
1 |
0 |
1 |
2 |
0 |
1888 |
0 |
1 |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
1889 |
0 |
0 |
1 |
1 |
0 |
1 |
1 |
0 |
0 |
1 |
2 |
2 |
0 |
2 |
1890 |
1 |
2 |
0 |
2 |
0 |
1 |
1 |
2 |
0 |
2 |
1 |
1 |
2 |
2 |
1891 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
1 |
1 |
0 |
3 |
3 |
1 |
0 |
1892 |
1 |
3 |
2 |
0 |
1 |
1 |
3 |
0 |
1 |
1 |
0 |
1 |
1 |
0 |
1893 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
2 |
0 |
0 |
1 |
3 |
0 |
0 |
1894 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
1 |
0 |
0 |
# 计算每个团队中每个死亡人数的次数
result = horsekick.apply(pd.value_counts).fillna(0); result
| | guardCorps | corps1 | corps2 | corps3 | corps4 | corps5 | corps6 | corps7 | corps8 | corps9 | corps10 | corps11 | corps14 | corps15 |
| 0 | 9.0 | 11.0 | 12.0 | 11.0 | 12.0 | 10.0 | 9.0 | 11.0 | 13.0 | 10.0 | 10.0 | 6 | 6 | 14.0 |
| 1 | 7.0 | 4.0 | 4.0 | 6.0 | 8.0 | 9.0 | 7.0 | 6.0 | 7.0 | 7.0 | 6.0 | 8 | 8 | 4.0 |
| 2 | 3.0 | 3.0 | 4.0 | 3.0 | 0.0 | 1.0 | 2.0 | 3.0 | 0.0 | 3.0 | 3.0 | 2 | 3 | 2.0 |
| 3 | 1.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3 | 2 | 0.0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 0.0 |
# 计算每个月死亡总数出现在 guardCorps 的次数
pd.value_counts(horsekick['guardCorps'].values, sort=False)
'''
0 9
1 7
2 3
3 1
dtype: int64
'''
horsekick['guardCorps'].unique()
# array([0, 2, 1, 3])
在 Pandas 中创建流水线
Pandas 的流水线功能允许你将 Python 函数串联在一起,来构建数据处理流水线。
import pandas as pd
# 创建空数据帧
df = pd.DataFrame()
# Create a column
df['name'] = ['John', 'Steve', 'Sarah']
df['gender'] = ['Male', 'Male', 'Female']
df['age'] = [31, 32, 19]
# 查看数据帧
df
|
name |
gender |
age |
0 |
John |
Male |
31 |
1 |
Steve |
Male |
32 |
2 |
Sarah |
Female |
19 |
# 创建函数,
def mean_age_by_group(dataframe, col):
# 它按列分组数据,并返回每组的均值
return dataframe.groupby(col).mean()
# 创建函数,
def uppercase_column_name(dataframe):
# 它大写所有列标题
dataframe.columns = dataframe.columns.str.upper()
# 并返回它
return dataframe
# 创建流水线,它应用 mean_age_by_group 函数
(df.pipe(mean_age_by_group, col='gender')
# 之后应用 uppercase_column_name 函数
.pipe(uppercase_column_name)
)
|
AGE |
gender |
|
Female |
19.0 |
Male |
31.5 |
使用for
循环创建 Pandas 列
import pandas as pd
import numpy as np
raw_data = {'student_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
'test_score': [76, 88, 84, 67, 53, 96, 64, 91, 77, 73, 52, np.NaN]}
df = pd.DataFrame(raw_data, columns = ['student_name', 'test_score'])
# 创建列表来储存数据
grades = []
# 对于列中的每一行
for row in df['test_score']:
# 如果大于某个值
if row > 95:
# 添加字母分数
grades.append('A')
# 或者,如果大于某个值
elif row > 90:
# 添加字母分数
grades.append('A-')
# 或者,如果大于某个值
elif row > 85:
# 添加字母分数
grades.append('B')
# 或者,如果大于某个值
elif row > 80:
# 添加字母分数
grades.append('B-')
# 或者,如果大于某个值
elif row > 75:
# 添加字母分数
grades.append('C')
# 或者,如果大于某个值
elif row > 70:
# 添加字母分数
grades.append('C-')
# 或者,如果大于某个值
elif row > 65:
# 添加字母分数
grades.append('D')
# 或者,如果大于某个值
elif row > 60:
# 添加字母分数
grades.append('D-')
# 否则
else:
# 添加不及格分数
grades.append('Failed')
# 从列表创建一列
df['grades'] = grades
# 查看新数据帧
df
|
student_name |
test_score |
grades |
0 |
Miller |
76.0 |
C |
1 |
Jacobson |
88.0 |
B |
2 |
Ali |
84.0 |
B- |
3 |
Milner |
67.0 |
D |
4 |
Cooze |
53.0 |
Failed |
5 |
Jacon |
96.0 |
A |
6 |
Ryaner |
64.0 |
D- |
7 |
Sone |
91.0 |
A- |
8 |
Sloan |
77.0 |
C |
9 |
Piger |
73.0 |
C- |
10 |
Riani |
52.0 |
Failed |
11 |
Ali |
NaN |
Failed |
创建项目计数
from collections import Counter
# 创建一个今天吃的水果的计数器
fruit_eaten = Counter(['Apple', 'Apple', 'Apple', 'Banana', 'Pear', 'Pineapple'])
# 查看计数器
fruit_eaten
# Counter({'Apple': 3, 'Banana': 1, 'Pear': 1, 'Pineapple': 1})
# 更新菠萝的计数(因为你只吃菠萝)
fruit_eaten.update(['Pineapple'])
# 查看计数器
fruit_eaten
# Counter({'Apple': 3, 'Banana': 1, 'Pear': 1, 'Pineapple': 2})
# 查看计数最大的三个项目
fruit_eaten.most_common(3)
# [('Apple', 3), ('Pineapple', 2), ('Banana', 1)]
基于条件创建一列
# 导入所需模块
import pandas as pd
import numpy as np
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'age': [42, 52, 36, 24, 73],
'preTestScore': [4, 24, 31, 2, 3],
'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, columns = ['name', 'age', 'preTestScore', 'postTestScore'])
df
|
name |
age |
preTestScore |
postTestScore |
0 |
Jason |
42 |
4 |
25 |
1 |
Molly |
52 |
24 |
94 |
2 |
Tina |
36 |
31 |
57 |
3 |
Jake |
24 |
2 |
62 |
4 |
Amy |
73 |
3 |
70 |
# 创建一个名为 df.elderly 的新列
# 如果 df.age 大于 50 则值为 yes,否则为 no
df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
# 查看数据帧
df
|
name |
age |
preTestScore |
postTestScore |
elderly |
0 |
Jason |
42 |
4 |
25 |
no |
1 |
Molly |
52 |
24 |
94 |
yes |
2 |
Tina |
36 |
31 |
57 |
no |
3 |
Jake |
24 |
2 |
62 |
no |
4 |
Amy |
73 |
3 |
70 |
yes |
从词典键和值创建列表
# 创建字典
dict = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
'year': [2012, 2012, 2013, 2014, 2014],
'fireReports': [4, 24, 31, 2, 3]}
# 创建键的列表
list(dict.keys())
# ['fireReports', 'year', 'county']
# 创建值的列表
list(dict.values())
'''
[[4, 24, 31, 2, 3],
[2012, 2012, 2013, 2014, 2014],
['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma']]
'''
Pandas 中的交叉表
# 导入库
import pandas as pd
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
'company': ['infantry', 'infantry', 'cavalry', 'cavalry', 'infantry', 'infantry', 'cavalry', 'cavalry','infantry', 'infantry', 'cavalry', 'cavalry'],
'experience': ['veteran', 'rookie', 'veteran', 'rookie', 'veteran', 'rookie', 'veteran', 'rookie','veteran', 'rookie', 'veteran', 'rookie'],
'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'experience', 'name', 'preTestScore', 'postTestScore'])
df
|
regiment |
company |
experience |
name |
preTestScore |
postTestScore |
0 |
Nighthawks |
infantry |
veteran |
Miller |
4 |
25 |
1 |
Nighthawks |
infantry |
rookie |
Jacobson |
24 |
94 |
2 |
Nighthawks |
cavalry |
veteran |
Ali |
31 |
57 |
3 |
Nighthawks |
cavalry |
rookie |
Milner |
2 |
62 |
4 |
Dragoons |
infantry |
veteran |
Cooze |
3 |
70 |
5 |
Dragoons |
infantry |
rookie |
Jacon |
4 |
25 |
6 |
Dragoons |
cavalry |
veteran |
Ryaner |
24 |
94 |
7 |
Dragoons |
cavalry |
rookie |
Sone |
31 |
57 |
8 |
Scouts |
infantry |
veteran |
Sloan |
2 |
62 |
9 |
Scouts |
infantry |
rookie |
Piger |
3 |
70 |
10 |
Scouts |
cavalry |
veteran |
Riani |
2 |
62 |
11 |
Scouts |
cavalry |
rookie |
Ali |
3 |
70 |
按公司和团队创建交叉表。按公司和团队计算观测数量。
pd.crosstab(df.regiment, df.company, margins=True)
company |
cavalry |
infantry |
All |
regiment |
|
|
|
Dragoons |
2 |
2 |
4 |
Nighthawks |
2 |
2 |
4 |
Scouts |
2 |
2 |
4 |
All |
6 |
6 |
12 |
# 为每个团队创建公司和经验的交叉表
pd.crosstab([df.company, df.experience], df.regiment, margins=True)
|
regiment |
Dragoons |
Nighthawks |
Scouts |
All |
company |
experience |
|
|
|
|
cavalry |
rookie |
1 |
1 |
1 |
3 |
|
veteran |
1 |
1 |
1 |
3 |
infantry |
rookie |
1 |
1 |
1 |
3 |
|
veteran |
1 |
1 |
1 |
3 |
All |
|
4 |
4 |
4 |
12 |
删除重复
# 导入模块
import pandas as pd
raw_data = {'first_name': ['Jason', 'Jason', 'Jason','Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Miller', 'Miller','Ali', 'Milner', 'Cooze'],
'age': [42, 42, 1111111, 36, 24, 73],
'preTestScore': [4, 4, 4, 31, 2, 3],
'postTestScore': [25, 25, 25, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
0 |
Jason |
Miller |
42 |
4 |
25 |
1 |
Jason |
Miller |
42 |
4 |
25 |
2 |
Jason |
Miller |
1111111 |
4 |
25 |
3 |
Tina |
Ali |
36 |
31 |
57 |
4 |
Jake |
Milner |
24 |
2 |
62 |
5 |
Amy |
Cooze |
73 |
3 |
70 |
# 确定哪些观测是重复的
df.duplicated()
'''
0 False
1 True
2 False
3 False
4 False
5 False
dtype: bool
'''
df.drop_duplicates()
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
0 |
Jason |
Miller |
42 |
4 |
25 |
2 |
Jason |
Miller |
1111111 |
4 |
25 |
3 |
Tina |
Ali |
36 |
31 |
57 |
4 |
Jake |
Milner |
24 |
2 |
62 |
5 |
Amy |
Cooze |
73 |
3 |
70 |
# 删除 first_name 列中的重复项
# 但保留重复集中的最后一个观测
df.drop_duplicates(['first_name'], keep='last')
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
2 |
Jason |
Miller |
1111111 |
4 |
25 |
3 |
Tina |
Ali |
36 |
31 |
57 |
4 |
Jake |
Milner |
24 |
2 |
62 |
5 |
Amy |
Cooze |
73 |
3 |
70 |
Pandas 数据帧的描述性统计
# 导入模块
import pandas as pd
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'age': [42, 52, 36, 24, 73],
'preTestScore': [4, 24, 31, 2, 3],
'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, columns = ['name', 'age', 'preTestScore', 'postTestScore'])
df
|
name |
age |
preTestScore |
postTestScore |
0 |
Jason |
42 |
4 |
25 |
1 |
Molly |
52 |
24 |
94 |
2 |
Tina |
36 |
31 |
57 |
3 |
Jake |
24 |
2 |
62 |
4 |
Amy |
73 |
3 |
70 |
5 rows × 4 columns
# 所有年龄之和
df['age'].sum()
# 227
df['preTestScore'].mean()
# 12.800000000000001
df['preTestScore'].cumsum()
'''
0 4
1 28
2 59
3 61
4 64
Name: preTestScore, dtype: int64
'''
df['preTestScore'].describe()
'''
count 5.000000
mean 12.800000
std 13.663821
min 2.000000
25% 3.000000
50% 4.000000
75% 24.000000
max 31.000000
Name: preTestScore, dtype: float64
'''
df['preTestScore'].count()
# 5
df['preTestScore'].min()
# 2
df['preTestScore'].max()
# 31
df['preTestScore'].median()
# 4.0
df['preTestScore'].var()
# 186.69999999999999
df['preTestScore'].std()
# 13.663820841916802
df['preTestScore'].skew()
# 0.74334524573267591
df['preTestScore'].kurt()
# -2.4673543738411525
df.corr()
|
age |
preTestScore |
postTestScore |
age |
1.000000 |
-0.105651 |
0.328852 |
preTestScore |
-0.105651 |
1.000000 |
0.378039 |
postTestScore |
0.328852 |
0.378039 |
1.000000 |
3 rows × 3 columns
# 协方差矩阵
df.cov()
|
age |
preTestScore |
postTestScore |
age |
340.80 |
-26.65 |
151.20 |
preTestScore |
-26.65 |
186.70 |
128.65 |
postTestScore |
151.20 |
128.65 |
620.30 |
3 rows × 3 columns
丢弃行或者列
# 导入模块
import pandas as pd
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|
name |
reports |
year |
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Santa Cruz |
Tina |
31 |
2013 |
Maricopa |
Jake |
2 |
2014 |
Yuma |
Amy |
3 |
2014 |
# 丢弃观测(行)
df.drop(['Cochice', 'Pima'])
|
name |
reports |
year |
Santa Cruz |
Tina |
31 |
2013 |
Maricopa |
Jake |
2 |
2014 |
Yuma |
Amy |
3 |
2014 |
# 丢弃变量(列)
# 注意:`axis = 1`表示我们指的是列,而不是行
df.drop('reports', axis=1)
|
name |
year |
Cochice |
Jason |
2012 |
Pima |
Molly |
2012 |
Santa Cruz |
Tina |
2013 |
Maricopa |
Jake |
2014 |
Yuma |
Amy |
2014 |
如果它包含某个值(这里是Tina
),丢弃一行。
具体来说:创建一个名为df
的新数据框,名称列中的单元格的值不等于Tina
。
df[df.name != 'Tina']
|
name |
reports |
year |
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Maricopa |
Jake |
2 |
2014 |
Yuma |
Amy |
3 |
2014 |
按照行号丢弃一行(在本例中为第 3 行)。
请注意,Pandas使用从零开始的编号,因此 0 是第一行,1 是第二行,等等。
df.drop(df.index[2])
|
name |
reports |
year |
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Maricopa |
Jake |
2 |
2014 |
Yuma |
Amy |
3 |
2014 |
可以扩展到范围。
df.drop(df.index[[2,3]])
|
name |
reports |
year |
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Yuma |
Amy |
3 |
2014 |
或相对于 DF 的末尾来丢弃。
df.drop(df.index[-2])
|
name |
reports |
year |
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Santa Cruz |
Tina |
31 |
2013 |
Yuma |
Amy |
3 |
2014 |
你也可以选择相对于起始或末尾的范围。
df[:3] # 保留前三个
|
name |
reports |
year |
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
Santa Cruz |
Tina |
31 |
2013 |
df[:-3] # 丢掉后三个
|
name |
reports |
year |
Cochice |
Jason |
4 |
2012 |
Pima |
Molly |
24 |
2012 |
枚举列表
# 创建字符串列表
data = ['One','Two','Three','Four','Five']
# 对于 enumerate(data) 中的每个项目
for item in enumerate(data):
# 打印整个枚举的元素
print(item)
# 只打印值(没有索引)
print(item[1])
'''
(0, 'One')
One
(1, 'Two')
Two
(2, 'Three')
Three
(3, 'Four')
Four
(4, 'Five')
Five
'''
在 Pandas 中将包含列表的单元扩展为自己的变量
# 导入 pandas
import pandas as pd
# 创建数据集
raw_data = {'score': [1,2,3],
'tags': [['apple','pear','guava'],['truck','car','plane'],['cat','dog','mouse']]}
df = pd.DataFrame(raw_data, columns = ['score', 'tags'])
# 查看数据集
df
|
score |
tags |
0 |
1 |
[apple, pear, guava] |
1 |
2 |
[truck, car, plane] |
2 |
3 |
[cat, dog, mouse] |
# 将 df.tags 扩展为自己的数据帧
tags = df['tags'].apply(pd.Series)
# 将每个变量重命名为标签
tags = tags.rename(columns = lambda x : 'tag_' + str(x))
# 查看 tags 数据帧
tags
|
tag_0 |
tag_1 |
tag_2 |
0 |
apple |
pear |
guava |
1 |
truck |
car |
plane |
2 |
cat |
dog |
mouse |
# 将 tags 数据帧添加回原始数据帧
pd.concat([df[:], tags[:]], axis=1)
|
score |
tags |
tag_0 |
tag_1 |
tag_2 |
0 |
1 |
[apple, pear, guava] |
apple |
pear |
guava |
1 |
2 |
[truck, car, plane] |
truck |
car |
plane |
2 |
3 |
[cat, dog, mouse] |
cat |
dog |
mouse |
过滤 pandas 数据帧
# 导入模块
import pandas as pd
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3],
'coverage': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|
coverage |
name |
reports |
year |
Cochice |
25 |
Jason |
4 |
2012 |
Pima |
94 |
Molly |
24 |
2012 |
Santa Cruz |
57 |
Tina |
31 |
2013 |
Maricopa |
62 |
Jake |
2 |
2014 |
Yuma |
70 |
Amy |
3 |
2014 |
# 查看列
df['name']
'''
Cochice Jason
Pima Molly
Santa Cruz Tina
Maricopa Jake
Yuma Amy
Name: name, dtype: object
'''
df[['name', 'reports']]
|
name |
reports |
Cochice |
Jason |
4 |
Pima |
Molly |
24 |
Santa Cruz |
Tina |
31 |
Maricopa |
Jake |
2 |
Yuma |
Amy |
3 |
# 查看前两行
df[:2]
|
coverage |
name |
reports |
year |
Cochice |
25 |
Jason |
4 |
2012 |
Pima |
94 |
Molly |
24 |
2012 |
# 查看 Coverage 大于 50 的行
df[df['coverage'] > 50]
|
coverage |
name |
reports |
year |
Pima |
94 |
Molly |
24 |
2012 |
Santa Cruz |
57 |
Tina |
31 |
2013 |
Maricopa |
62 |
Jake |
2 |
2014 |
Yuma |
70 |
Amy |
3 |
2014 |
# 查看 Coverage 大于 50 并且 Reports 小于 4 的行
df[(df['coverage'] > 50) & (df['reports'] < 4)]
|
coverage |
name |
reports |
year |
Maricopa |
62 |
Jake |
2 |
2014 |
Yuma |
70 |
Amy |
3 |
2014 |
寻找数据帧的列中的最大值
# 导入模块
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# 创建数据帧
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
'age': [42, 52, 36, 24, 73],
'preTestScore': [4, 24, 31, 2, 3],
'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
|
first_name |
last_name |
age |
preTestScore |
postTestScore |
0 |
Jason |
Miller |
42 |
4 |
25 |
1 |
Molly |
Jacobson |
52 |
24 |
94 |
2 |
Tina |
Ali |
36 |
31 |
57 |
3 |
Jake |
Milner |
24 |
2 |
62 |
4 |
Amy |
Cooze |
73 |
3 |
70 |
# 获取 preTestScore 列中的最大值的索引
df['preTestScore'].idxmax()
# 2