from pandas import Series, DataFrame
import pandas as pd
import numpy as np
# --- Series basics ---
obj = Series([1, 2, 3, 4])
obj.values  # array([1, 2, 3, 4]) -- the underlying numpy array
obj.index  # the default integer index 0..3
# An explicit index needs exactly one label per value; passing five values
# for four labels raises ValueError.
obj2 = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
obj2['a']  # look a value up by its label -> 1
# NumPy-style operations keep the link between index labels and values.
obj2 > 2
obj2 * obj2
np.exp(obj2)
# When built from a dict, the dict keys become the index.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
}
obj3 = Series(sdata)
# Ohio      35000
# Texas     71000
# Oregon    16000
# dtype: int64
# pandas isna/notna (aliases: isnull/notnull) detect missing values.
states = ['California', 'Ohio', 'Oregon', 'Texas']
# 'California' has no entry in sdata, so it becomes NaN.
obj4 = Series(sdata, index=states)
obj4
# California        NaN
# Ohio          35000.0
# Oregon        16000.0
# Texas         71000.0
# dtype: float64
pd.isna(obj4)
# California     True
# Ohio          False
# Oregon        False
# Texas         False
# dtype: bool
pd.notna(obj4)
# Series align automatically on index labels.
obj3 + obj4  # values with the same label are added; NA on either side gives NA
# A Series has a name attribute, which ties in with other key pandas features.
obj4.name = 'population'  # name the Series
obj4.index.name = 'state'  # name the index
# The index of a Series can be replaced in place by assignment.
obj4.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
# --- DataFrame basics ---
# A DataFrame is a tabular structure holding an ordered collection of columns,
# each of which may have a different value type (numeric, string, boolean).
# A DataFrame has both a row index and a column index.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
#     state  year  pop
# 0    Ohio  2000  1.5
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# As with Series, a requested column that is not in the data ('debt' below)
# comes out as NA.
# A column can be retrieved as a Series with dict-style or attribute-style
# access; the returned Series has the same index as the DataFrame.
frame = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)
#        year   state  pop debt
# one    2000    Ohio  1.5  NaN
# two    2001    Ohio  1.7  NaN
# three  2002    Ohio  3.6  NaN
# four   2001  Nevada  2.4  NaN
# five   2002  Nevada  2.9  NaN
frame['state']
# one        Ohio
# two        Ohio
# three      Ohio
# four     Nevada
# five     Nevada
# Name: state, dtype: object
# Columns can be modified by assignment; here the empty 'debt' column is
# given a scalar.
frame['debt'] = 16.5
#        year   state  pop  debt
# one    2000    Ohio  1.5  16.5
# two    2001    Ohio  1.7  16.5
# three  2002    Ohio  3.6  16.5
# four   2001  Nevada  2.4  16.5
# five   2002  Nevada  2.9  16.5
# When a list is assigned to a column its length must match the DataFrame.
# When a Series is assigned it is aligned on the DataFrame's index, and every
# label it does not cover is filled with a missing value.
val = Series([1, 2, 3], index=['two', 'three', 'five'])
frame['debt'] = val
#        year   state  pop  debt
# one    2000    Ohio  1.5   NaN
# two    2001    Ohio  1.7   1.0
# three  2002    Ohio  3.6   2.0
# four   2001  Nevada  2.4   NaN
# five   2002  Nevada  2.9   3.0
# The del keyword removes a whole column.
del frame['debt']
# Another input form is a nested dict: the outer keys become the columns and
# the inner keys become the row index.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    # The year key was mistyped as 20011, which created a stray extra row.
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(pop)
#       Nevada  Ohio
# 2000     NaN   1.5
# 2001     2.4   1.7
# 2002     2.9   3.6
# (the order of the rows may differ between pandas versions)
# Index objects
# pandas Index objects hold the axis labels and other metadata (axis names etc.).
'Ohio' in frame3.columns  # True
2003 in frame3.index  # False
# --- Basic functionality: reindex ---
# reindex creates a new object whose data is conformed to a new index.
# On a Series the values are rearranged to follow the new index; labels that
# were not present before get a missing value.
obj = Series([4.5, 7.2, 5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj.reindex(['a', 'b', 'c', 'd', 'e'])
# a    5.3
# b    7.2
# c    3.6
# d    4.5
# e    NaN
# dtype: float64
# For ordered data such as time series, some filling may be wanted while
# reindexing; the method option does that:
# 'ffill' fills forward, 'bfill' fills backward.
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')
# 0      blue
# 1      blue
# 2    purple
# 3    purple
# 4    yellow
# 5    yellow
# dtype: object
# On a DataFrame, reindex can change the row index, the columns, or both.
# When only a sequence is passed, the rows are reindexed.
frame = DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'b', 'c'],
    columns=['Ohio', 'Texas', 'California'],
)
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
# The original frame is not modified; reindex returns a new object.
#    Ohio  Texas  California
# a   0.0    1.0         2.0
# b   3.0    4.0         5.0
# c   6.0    7.0         8.0
# d   NaN    NaN         NaN
# Columns are reindexed with the columns keyword.
state = ['Texas', 'Utah', 'California']
frame.reindex(columns=state)
#    Texas  Utah  California
# a      1   NaN           2
# b      4   NaN           5
# c      7   NaN           8
# Rows and columns can be reindexed in one call. The `.ix` indexer used here
# originally was removed in pandas 1.0, and `.loc` raises KeyError for labels
# that do not exist, so reindex is used. The `state` list defined just above
# appears to be the intended column list (the original referenced `states`).
frame.reindex(index=['a', 'b', 'c'], columns=state)
#    Texas  Utah  California
# a      1   NaN           2
# b      4   NaN           5
# c      7   NaN           8
# --- Dropping entries from an axis ---
# Dropping one or more entries only needs a label or a list of labels.
# drop returns a new object with the given labels removed from the chosen axis.
# Note: drop removes rows unless an axis is given, whereas del removes a column.
obj = Series(np.arange(5, dtype=float), index=list('abcde'))
new_obj = obj.drop(labels='c')
# a    0.0
# b    1.0
# d    3.0
# e    4.0
# dtype: float64
# A DataFrame can drop labels from either axis.
data = DataFrame(
    np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)
data.drop(index=['Colorado', 'Ohio'])
#           one  two  three  four
# Utah        8    9     10    11
# New York   12   13     14    15
data.drop(columns='two')
#           one  three  four
# Ohio        0      2     3
# Colorado    4      6     7
# Utah        8     10    11
# New York   12     14    15
# --- Indexing, selection and filtering ---
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj['b']  # 1.0 -- by label
# Positional access goes through .iloc; a bare integer key on a Series with
# string labels is ambiguous and deprecated in recent pandas.
obj.iloc[1]  # 1.0
obj.iloc[2:4]
# c    2.0
# d    3.0
# dtype: float64
# Several labels are selected with a list. A bare `obj['c', 'd']` passes a
# tuple as a single key and raises KeyError.
obj[['c', 'd']]
obj['c':'d']  # slicing with labels includes the end point
obj['c':'d'] = 5  # assigning through a label slice modifies obj
# Indexing into a DataFrame with a label retrieves one or more columns.
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'NewYork'],
    columns=['one', 'two', 'three', 'four'],
)
data['two']  # selects a column
# Ohio         1
# Colorado     5
# Utah         9
# NewYork     13
# Name: two, dtype: int64
# Two special cases: a slice or a boolean array selects rows instead.
data[:2]  # selects rows
data[data['three'] > 5]  # selects rows
# `.ix` was removed in pandas 1.0. `.loc` takes the row labels first and the
# column labels second; the column labels are taken by position here to keep
# the original 1:3 selection ('two' and 'three').
data.loc[['Utah', 'NewYork'], data.columns[1:3]]
# --- Arithmetic and data alignment ---
s1 = Series([7.3, 2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([2.1, 3.6, 1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1 + s2
# The operands are aligned automatically; labels that are not present in both
# produce NA.
# a    9.4
# c    6.1
# d    NaN
# e    3.0
# f    NaN
# g    NaN
# dtype: float64
# Adding two DataFrames returns a new DataFrame whose index and columns are
# the unions of those of the two operands.
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1 + df2
#             b   c     d   e
# Colorado  NaN NaN   NaN NaN
# Ohio      3.0 NaN   6.0 NaN
# Oregon    NaN NaN   NaN NaN
# Texas     9.0 NaN  12.0 NaN
# Utah      NaN NaN   NaN NaN
# Arithmetic methods with fill values
# In the example above every non-overlapping position became NA; fill_value
# substitutes a value for the operand that is missing instead.
df1 = DataFrame(np.arange(12).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20).reshape((4, 5)), columns=list('abcde'))
df1.add(df2, fill_value=0)
#       a     b     c     d     e
# 0   0.0   2.0   4.0   6.0   4.0
# 1   9.0  11.0  13.0  15.0   9.0
# 2  18.0  20.0  22.0  24.0  14.0
# 3  15.0  16.0  17.0  18.0  19.0
# Operations between a DataFrame and a Series
arr = np.arange(12).reshape((3, 4))
arr - arr[0]
# array([[0, 0, 0, 0],
#        [4, 4, 4, 4],
#        [8, 8, 8, 8]])
# By default, arithmetic between a DataFrame and a Series matches the Series
# index against the DataFrame's columns and broadcasts down the rows.
# To match on the row index and broadcast across the columns instead, use one
# of the arithmetic methods with axis=0.
# The frame is built here so that the example does not rely on a variable left
# over from a much earlier section of the script (same values as before).
frame = DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'b', 'c'],
    columns=['Ohio', 'Texas', 'California'],
)
series3 = frame['Ohio']
frame.sub(series3, axis=0)
# The 'Ohio' value of each row is subtracted from every column of that row.
#    Ohio  Texas  California
# a     0      1           2
# b     0      1           2
# c     0      1           2
# --- Function application and mapping ---
# NumPy ufuncs (element-wise array functions) also operate on pandas objects.
frame = DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Teax', 'Oregon'],
    columns=['b', 'd', 'e'],
)
# Sample output (the values are random):
#                b         d         e
# Utah    0.627102 -0.258646  1.602441
# Ohio   -1.608849 -0.188285  0.860116
# Teax    0.331142  1.957922 -0.067314
# Oregon  1.077519 -0.289323 -0.959457
np.absolute(frame)
#                b         d         e
# Utah    0.627102  0.258646  1.602441
# Ohio    1.608849  0.188285  0.860116
# Teax    0.331142  1.957922  0.067314
# Oregon  1.077519  0.289323  0.959457
# --- Sorting and ranking ---
obj = Series(np.arange(4), index=['a', 'b', 'c', 'd'])
obj.sort_index()
# a    0
# b    1
# c    2
# d    3
# dtype: int64
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()  # sort by the row labels
#        d  a  b  c
# one    4  5  6  7
# three  0  1  2  3
frame.sort_index(axis=1, ascending=False)  # sort by the column labels, descending
#        d  c  b  a
# three  0  3  2  1
# one    4  7  6  5
# Sorting a Series by its values
obj = Series([1, 5, 6, 8, 3])
obj.sort_values()
# 0    1
# 4    3
# 1    5
# 2    6
# 3    8
# dtype: int64
# On a DataFrame, sort by the values of one or more columns with sort_values.
# (The `by` argument of sort_index was removed from pandas.)
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b')
#    b  a
# 2 -3  0
# 3  2  1
# 0  4  0
# 1  7  1
frame.sort_values(by=['a', 'b'])
#    b  a
# 2 -3  0
# 0  4  0
# 3  2  1
# 1  7  1
# --- Axis indexes with duplicate labels ---
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj.index.is_unique  # False -- tells whether the index labels are unique
# The same applies to a DataFrame: a duplicated label selects every matching
# row. (`.ix` was removed in pandas 1.0; `.loc` is the label-based indexer.)
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.loc['b']
# Sample output (the values are random):
#           0         1         2
# b -2.301444  1.332480  1.519970
# b  1.411372  0.157736  0.228123
# --- Summarizing and computing descriptive statistics ---
df = DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
df.sum(axis='columns')  # row totals; NA values are skipped automatically
# a    1.40
# b    2.60
# c    0.00
# d   -0.55
# dtype: float64
df.sum(axis='index')  # column totals
# one    9.25
# two   -5.80
# dtype: float64
df.idxmax()  # index label at which each column reaches its maximum
# one    b
# two    d
# dtype: object
df.describe()
#             one       two
# count  3.000000  2.000000
# mean   3.083333 -2.900000
# std    3.493685  2.262742
# min    0.750000 -4.500000
# 25%    1.075000 -3.700000
# 50%    1.400000 -2.900000
# 75%    4.250000 -2.100000
# max    7.100000 -1.300000
# Correlation and covariance
# (see the book)
# --- Unique values and membership ---
obj = Series(list('cadaabbcc'))
uniques = obj.unique()
# array(['c', 'a', 'd', 'b'], dtype=object) -- the distinct values
obj.value_counts()  # how often each value occurs in the Series
# c    3
# a    3
# b    2
# d    1
# dtype: int64
# isin performs a vectorized membership test; the resulting mask can be used
# to pick a subset of a Series or of a DataFrame column.
wanted = ['b', 'c']
mask = obj.isin(wanted)
# 0     True
# 1    False
# 2    False
# 3    False
# 4    False
# 5     True
# 6     True
# 7     True
# 8     True
# dtype: bool
obj.loc[mask]
# 0    c
# 5    b
# 6    b
# 7    c
# 8    c
# dtype: object
# --- Handling missing data ---
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data.isnull()
# 0    False
# 1    False
# 2     True
# 3    False
# dtype: bool
# Python's built-in None is treated as NaN as well.
string_data.fillna(0)
# 0     aardvark
# 1    artichoke
# 2            0
# 3      avocado
# dtype: object
string_data.notna()  # the negation of isnull
from numpy import nan as NA
data = Series([1, NA, 2, 3, NA, 7])
data[data.notnull()]  # keep only the non-missing values
# DataFrame objects
data = DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3]])
cleaned = data.dropna()  # by default any row containing an NA is dropped
#      0    1    2
# 0  1.0  6.5  3.0
data.dropna(how='all')  # only drop rows in which every value is NA
data.dropna(axis=1, how='all')  # only drop columns in which every value is NA
df = DataFrame(np.random.randn(7, 3))
# `.ix` was removed in pandas 1.0. `.loc` slices are label-based and include
# the end label, so rows 0..4 of column 1 and rows 0..2 of column 2 are blanked.
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA
# Sample output (the values are random):
#           0         1         2
# 0  0.611871       NaN       NaN
# 1  0.011781       NaN       NaN
# 2 -0.022545       NaN       NaN
# 3 -1.591679       NaN  1.259344
# 4 -0.116075       NaN -1.721315
# 5 -1.380649  0.715962  0.378546
# 6 -0.494893 -0.763513 -0.272571
df.dropna(thresh=3)  # keep only the rows holding at least 3 non-NA values
#           0         1         2
# 5 -1.380649  0.715962  0.378546
# 6 -0.494893 -0.763513 -0.272571
# Filling in missing data
df.fillna(0)  # fill every NA with a constant
# A dict gives a different fill value per column; the keys are column labels.
# (The original used key 3, which is not a column of this frame.)
df.fillna({1: 0.5, 2: -1})
df.fillna(0, inplace=True)  # fillna returns a new object unless inplace=True
# Forward fill. `fillna(method='ffill')` is deprecated in recent pandas in
# favour of ffill().
# NOTE: df has no NA left after the in-place fill above, so the two calls
# below return unchanged copies; they only illustrate the API.
df.ffill()
df.ffill(limit=2)  # fill at most 2 consecutive NA values
data = Series([1, NA, 3.5, NA, 7])
data.fillna(data.mean())  # fill with the mean of the non-missing values
# --- Hierarchical indexing ---
data = Series(np.random.randn(6),
              index=[['a', 'a', 'b', 'b', 'c', 'c'], [1, 2, 1, 2, 1, 2]])
# Sample output (the values are random):
# a  1   -1.177218
#    2    1.096212
# b  1   -0.995744
#    2   -0.135281
# c  1   -0.778764
#    2    0.015534
# dtype: float64
data['b']  # partial indexing on the outer level
# 1   -0.995744
# 2   -0.135281
# dtype: float64
# `.ix` was removed in pandas 1.0; `.loc` is the label-based indexer.
data.loc[['b', 'c']]
# b  1   -0.995744
#    2   -0.135281
# c  1   -0.778764
#    2    0.015534
# dtype: float64
data.loc[:, 2]  # select on the inner level
# a    1.096212
# b   -0.135281
# c    0.015534
# dtype: float64
data.unstack()  # reshape: the inner level becomes the columns
#           1         2
# a -1.177218  1.096212
# b -0.995744 -0.135281
# c -0.778764  0.015534
frame = DataFrame(np.arange(12).reshape(4, 3),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
#      Ohio     Colorado
#     Green Red    Green
# a 1     0   1        2
#   2     3   4        5
# b 1     6   7        8
#   2     9  10       11
frame.index.names = ['key1', 'key2']
#            Ohio     Colorado
#           Green Red    Green
# key1 key2
# a    1        0   1        2
#      2        3   4        5
# b    1        6   7        8
#      2        9  10       11
frame.columns.names = ['state', 'color']
# state      Ohio     Colorado
# color     Green Red    Green
# key1 key2
# a    1        0   1        2
#      2        3   4        5
# b    1        6   7        8
#      2        9  10       11
# Sorting by level. `sortlevel` was removed from pandas; sort_index(level=...)
# is the replacement.
frame.sort_index(level=0)  # sort the rows by the outer level (key1)
# state      Ohio     Colorado
# color     Green Red    Green
# key1 key2
# a    1        0   1        2
#      2        3   4        5
# b    1        6   7        8
#      2        9  10       11
frame.sort_index(level=1)  # sort the rows by the inner level (key2)
# state      Ohio     Colorado
# color     Green Red    Green
# key1 key2
# a    1        0   1        2
# b    1        6   7        8
# a    2        3   4        5
# b    2        9  10       11
# --- Using DataFrame columns as the index ---
frame = DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
})
#    a  b    c  d
# 0  0  7  one  0
# 1  1  6  one  1
# 2  2  5  one  2
# 3  3  4  two  0
# 4  4  3  two  1
# 5  5  2  two  2
# 6  6  1  two  3
# set_index returns a new DataFrame that uses the given columns as its index.
frame.set_index(keys=['c', 'd'])
#        a  b
# c   d
# one 0  0  7
#     1  1  6
#     2  2  5
# two 0  3  4
#     1  4  3
#     2  5  2
#     3  6  1
Pandas入门
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
推荐阅读更多精彩内容
- 这个是根据pandas官网文档翻译出来,文档里面是包含一切,这里只是记录一下实际会用到的东西。比如selectio...
- merge contactpandas提供一些组合数据的set方法,相当于join/merge操作吧。 joins...
- 上一节《Pandas入门3 -- Series基本操作》 如前所述,Series是用于表示一维数据的类,那带标签(...