pandas开始
代码:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
# Seed NumPy's global random generator (nothing in this section draws from it).
np.random.seed(0)
# Configure how pandas prints: cap the display at 10 rows and 10 columns.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
# A pandas Series is a one-dimensional array of indexed data.
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)
# The values attribute returns something that behaves like a NumPy array
# (the same kind of array deep-learning code works with).
print(data.values)
print(data.index)
# With the default RangeIndex the integer keys below are labels that happen
# to coincide with positions, so plain [] indexing is unambiguous here.
print(data[1])
print(data[1:3])
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
print(data)
print(data['a'])
# Positional access on a label-indexed Series goes through iloc. Plain
# data[0] relied on an integer-to-position fallback that pandas deprecated
# in 2.1, so it warns (and is slated to become a label lookup that fails).
print(data.iloc[0])
print(data['b':'d'])
# A Series can be built straight from a dict: keys become the index labels.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)
print(population)
print(population['California'])
# First element by position. iloc is used instead of population[0] because
# integer keys on a string-labelled Series are deprecated since pandas 2.1.
print(population.iloc[0])
# A DataFrame can be viewed either as a generalised NumPy array or as a
# specialised Python dictionary.
# It can also be seen as a sequence of aligned Series objects, where
# "aligned" means that they share the same index.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 11297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)
print(area)
states = pd.DataFrame({'population': population,
                       'area': area})
print(states)
print(states.values)
print(states.index)
print(states.columns)
# Note: in a two-dimensional NumPy array data[0] returns the first row,
# whereas in a DataFrame data['col0'] returns the first column.
print(states['area'])
# A pandas Index is an interesting structure in its own right: it can be
# treated as an immutable array or as an ordered set (strictly a multiset,
# because an Index may contain repeated values).
ind = pd.Index([2, 3, 5, 7, 11])
print(ind)
# Array-like behaviour: scalar indexing, slicing and NumPy-style attributes.
print(ind[1])
print(ind[::2])
print(ind.size, ind.shape, ind.ndim, ind.dtype)
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
# Set-like behaviour uses the explicit methods. The &, | and ^ operators are
# no longer set operations on an Index (pandas 2.x applies them element-wise
# as bitwise operators), so they would not produce the intersection, union
# and symmetric difference.
print(indA.intersection(indB))
print(indA.union(indB))
print(indA.symmetric_difference(indB))
# The intersection is printed a second time so the script still emits the
# same four results as before (operator form first, then method form).
print(indA.intersection(indB))
# Data indexing and selection.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
# Dictionary-style use of a Series: lookup by label, membership test against
# the labels, keys() and items(). One value is printed per line.
print(
    data['d'],
    'a' in data,
    data.keys(),
    [*data.items()],
    sep='\n',
)
# Assigning to a label that does not exist yet adds a new entry.
data['e'] = 1.25
print(data)
# Slicing is the source of most of the confusion:
#   - slicing with explicit labels (data['a':'c']) includes the last label;
#   - slicing with implicit positions (data[0:2]) excludes the last position.
print(data['a':'c'])
print(data[0:2])
# Boolean-mask selection, followed by selection with a list of labels.
print(data[data.gt(0.3) & data.lt(0.8)])
print(data[['a', 'e']])
# Indexers: loc and iloc (the historical third indexer, ix, has since been
# removed from pandas).
# Integer labels are easy to confuse with positions, so pandas exposes
# indexer attributes for selection. They are not methods of the Series;
# they are attributes that expose a slicing interface.
data = pd.Series({1: 'a', 3: 'b', 5: 'c'})
# loc:  scalar lookup and slicing both use the explicit labels.
# iloc: scalar lookup and slicing both use implicit, Python-style positions
#       (the convention deep-learning array code follows).
# The same scalar lookup and the same slice are run through each in turn.
for indexer in (data.loc, data.iloc):
    print(indexer[1])
    print(indexer[1:3])
# Build the two per-state Series on a shared list of labels, then combine
# them into a DataFrame with one column per Series.
state_names = ['California', 'Texas', 'New York', 'Florida', 'Illinois']
area = pd.Series([423967, 695662, 11297, 170312, 149995],
                 index=state_names)
pop = pd.Series([38332521, 26448193, 19651127, 19552860, 12882135],
                index=state_names)
data = pd.DataFrame({'area': area, 'pop': pop})
print(data)
# A column can be fetched dictionary-style or attribute-style.
print(data['area'])
print(data.area)
# Identity checks between the two access styles. `pop` is also the name of
# a DataFrame method, so data.pop refers to that method, not to the column.
print(data.area is data['area'])
print(data.pop is data['pop'])
# Add a derived column with dictionary-style assignment.
data['density'] = data['pop'].div(data['area'])
# Whole frame, raw value array, transpose, first row of the array, and one
# column -- one result per line.
print(
    data,
    data.values,
    data.T,
    data.values[0],
    data['area'],
    sep='\n',
)
# iloc selects by position, loc by label; loc also accepts a boolean mask
# for the rows together with a list of column labels.
print(data.iloc[:3, :2])
print(data.loc[:'Illinois', :'pop'])
print(data.loc[data['density'].gt(100), ['pop', 'density']])
# The indexers can be used for assignment as well.
data.iloc[0, 2] = 90
print(data)
# Other selection forms: plain [] with a slice or a boolean mask picks rows.
print(data['Florida':'Illinois'])
print(data[1:3])
print(data[data['density'].gt(100)])
输出:
0 0.25
1 0.50
2 0.75
3 1.00
dtype: float64
[0.25 0.5 0.75 1. ]
RangeIndex(start=0, stop=4, step=1)
0.5
1 0.50
2 0.75
dtype: float64
a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64
0.25
0.25
b 0.50
c 0.75
d 1.00
dtype: float64
California 38332521
Texas 26448193
New York 19651127
Florida 19552860
Illinois 12882135
dtype: int64
38332521
38332521
California 423967
Texas 695662
New York 11297
Florida 170312
Illinois 149995
dtype: int64
population area
California 38332521 423967
Texas 26448193 695662
New York 19651127 11297
Florida 19552860 170312
Illinois 12882135 149995
[[38332521 423967]
[26448193 695662]
[19651127 11297]
[19552860 170312]
[12882135 149995]]
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')
California 423967
Texas 695662
New York 11297
Florida 170312
Illinois 149995
Name: area, dtype: int64
Int64Index([2, 3, 5, 7, 11], dtype='int64')
3
Int64Index([2, 5, 11], dtype='int64')
5 (5,) 1 int64
Int64Index([3, 5, 7], dtype='int64')
Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Int64Index([1, 2, 9, 11], dtype='int64')
Int64Index([3, 5, 7], dtype='int64')
1.0
True
Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]
a 0.25
b 0.50
c 0.75
d 1.00
e 1.25
dtype: float64
a 0.25
b 0.50
c 0.75
dtype: float64
a 0.25
b 0.50
dtype: float64
b 0.50
c 0.75
dtype: float64
a 0.25
e 1.25
dtype: float64
a
1 a
3 b
dtype: object
b
3 b
5 c
dtype: object
area pop
California 423967 38332521
Texas 695662 26448193
New York 11297 19651127
Florida 170312 19552860
Illinois 149995 12882135
California 423967
Texas 695662
New York 11297
Florida 170312
Illinois 149995
Name: area, dtype: int64
California 423967
Texas 695662
New York 11297
Florida 170312
Illinois 149995
Name: area, dtype: int64
True
False
area pop density
California 423967 38332521 90.413926
Texas 695662 26448193 38.018740
New York 11297 19651127 1739.499602
Florida 170312 19552860 114.806121
Illinois 149995 12882135 85.883763
[[4.23967000e+05 3.83325210e+07 9.04139261e+01]
[6.95662000e+05 2.64481930e+07 3.80187404e+01]
[1.12970000e+04 1.96511270e+07 1.73949960e+03]
[1.70312000e+05 1.95528600e+07 1.14806121e+02]
[1.49995000e+05 1.28821350e+07 8.58837628e+01]]
California Texas New York Florida Illinois
area 4.239670e+05 6.956620e+05 1.129700e+04 1.703120e+05 1.499950e+05
pop 3.833252e+07 2.644819e+07 1.965113e+07 1.955286e+07 1.288214e+07
density 9.041393e+01 3.801874e+01 1.739500e+03 1.148061e+02 8.588376e+01
[4.23967000e+05 3.83325210e+07 9.04139261e+01]
California 423967
Texas 695662
New York 11297
Florida 170312
Illinois 149995
Name: area, dtype: int64
area pop
California 423967 38332521
Texas 695662 26448193
New York 11297 19651127
area pop
California 423967 38332521
Texas 695662 26448193
New York 11297 19651127
Florida 170312 19552860
Illinois 149995 12882135
pop density
New York 19651127 1739.499602
Florida 19552860 114.806121
area pop density
California 423967 38332521 90.000000
Texas 695662 26448193 38.018740
New York 11297 19651127 1739.499602
Florida 170312 19552860 114.806121
Illinois 149995 12882135 85.883763
area pop density
Florida 170312 19552860 114.806121
Illinois 149995 12882135 85.883763
area pop density
Texas 695662 26448193 38.018740
New York 11297 19651127 1739.499602
area pop density
New York 11297 19651127 1739.499602
Florida 170312 19552860 114.806121
Process finished with exit code 0