# pandas notes: exploring the Iris data set
import os
import pandas as pd
import requests
# Local folder where the downloaded data file is stored.
PATH = r'D:/learn/'
# Request the raw Iris data set from the UCI repository.
# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being saved as data.
r = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", timeout=30)
r.raise_for_status()
# Write the downloaded text to disk.
with open( PATH + 'iris.data','w') as f:
    f.write(r.text)
# Switch the working directory to the data folder.
os.chdir(PATH)
# Load the file into a DataFrame, supplying the column names explicitly.
df = pd.read_csv(PATH + 'iris.data', names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'])
# Show the first 10 rows (head() returns 5 rows when no count is given).
# pandas terminology: a single column is a Series, the whole table is a DataFrame.
print(df.head(10))
# Fetch one column by its name.
sepal_length = df['sepal length']
print(sepal_length)
# List all column labels.
print(df.columns)
# Result: Index(['sepal length', 'sepal width', 'petal length', 'petal width', 'class'], dtype='object')
# Distinct values of the class column.
species = df['class']
print(species.unique()) # ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
# Number of values in every column.
print(df.count())
# sepal length    150
# sepal width     150
# petal length    150
# petal width     150
# class           150
# dtype: int64
# Number of values in a single column.
print(species.count())
# Keep only the rows whose class is Iris-setosa.
is_setosa = species == 'Iris-setosa'
setosa = df[is_setosa]
print(setosa)
# The same Iris-setosa selection is printed a second time.
print(setosa)
# Per-column counts of the filtered rows.
print(setosa.count())
# Renumber the filtered rows; drop=False keeps the old index as an 'index' column.
print(setosa.reset_index(drop=False).head(3))
#    index  sepal length  sepal width  petal length  petal width        class
# 0      0           5.1          3.5           1.4          0.2  Iris-setosa
# 1      1           4.9          3.0           1.4          0.2  Iris-setosa
# 2      2           4.7          3.2           1.3          0.2  Iris-setosa
# Renumber the Iris-setosa rows from zero; drop=True discards the old index.
setosa_rows = df.loc[df['class'] == 'Iris-setosa']
renumbered = setosa_rows.reset_index(drop=True)
print(renumbered.head(3))
#    sepal length  sepal width  petal length  petal width        class
# 0           5.1          3.5           1.4          0.2  Iris-setosa
# 1           4.9          3.0           1.4          0.2  Iris-setosa
# 2           4.7          3.2           1.3          0.2  Iris-setosa
# Combine two conditions: Iris-setosa rows whose sepal width exceeds 3.8.
class_is_setosa = df['class'] == 'Iris-setosa'
sepal_is_wide = df['sepal width'] > 3.8
print(df[class_is_setosa & sepal_is_wide])
# Summary statistics.
summary = df.describe()
print(summary)
#        sepal length  sepal width  petal length  petal width
# count    150.000000   150.000000    150.000000   150.000000
# mean       5.843333     3.054000      3.758667     1.198667
# std        0.828066     0.433594      1.764420     0.763161
# min        4.300000     2.000000      1.000000     0.100000
# 25%        5.100000     2.800000      1.600000     0.300000
# 50%        5.800000     3.000000      4.350000     1.300000
# 75%        6.400000     3.300000      5.100000     1.800000
# max        7.900000     4.400000      6.900000     2.500000
# Summary statistics again, this time with a custom set of percentiles.
percentile_levels = [.20,.40,.60,.80,.90]
print(df.describe(percentiles=percentile_levels))
#        sepal length  sepal width  petal length  petal width
# count    150.000000   150.000000    150.000000   150.000000
# mean       5.843333     3.054000      3.758667     1.198667
# std        0.828066     0.433594      1.764420     0.763161
# min        4.300000     2.000000      1.000000     0.100000
# 20%        5.000000     2.700000      1.500000     0.200000
# 40%        5.600000     3.000000      3.900000     1.160000
# 50%        5.800000     3.000000      4.350000     1.300000
# 60%        6.100000     3.100000      4.640000     1.500000
# 80%        6.520000     3.400000      5.320000     1.900000
# 90%        6.900000     3.610000      5.800000     2.200000
# max        7.900000     4.400000      6.900000     2.500000
# Correlation between the measurements (Pearson is the default method).
# The string 'class' column is excluded explicitly: pandas 2.0 and later no
# longer drop non-numeric columns in corr() and raise an error instead.
print(df.select_dtypes(include='number').corr())
#               sepal length  sepal width  petal length  petal width
# sepal length      1.000000    -0.109369      0.871754     0.817954
# sepal width      -0.109369     1.000000     -0.420516    -0.356544
# petal length      0.871754    -0.420516      1.000000     0.962757
# petal width       0.817954    -0.356544      0.962757     1.000000
# Pearson correlation, requested explicitly.
# Only the numeric columns are passed on: pandas 2.0 and later raise an error
# when corr() meets the string 'class' column.
print(df.select_dtypes(include='number').corr(method='pearson'))
#               sepal length  sepal width  petal length  petal width
# sepal length      1.000000    -0.109369      0.871754     0.817954
# sepal width      -0.109369     1.000000     -0.420516    -0.356544
# petal length      0.871754    -0.420516      1.000000     0.962757
# petal width       0.817954    -0.356544      0.962757     1.000000
# Kendall rank correlation.
# Only the numeric columns are passed on: pandas 2.0 and later raise an error
# when corr() meets the string 'class' column.
print(df.select_dtypes(include='number').corr(method='kendall'))
#               sepal length  sepal width  petal length  petal width
# sepal length      1.000000    -0.072112      0.717624     0.654960
# sepal width      -0.072112     1.000000     -0.182391    -0.146988
# petal length      0.717624    -0.182391      1.000000     0.803014
# petal width       0.654960    -0.146988      0.803014     1.000000
# Plotting
import matplotlib.pyplot as plt
plt.style.use("ggplot")
# One histogram per measurement on a 2x2 grid of axes.
fig, ax = plt.subplots(2,2, figsize=(6,4))
measurements = ['sepal length', 'sepal width', 'petal length', 'petal width']
# ax.flat walks the grid row by row, so the panels are filled in list order.
for panel, column in zip(ax.flat, measurements):
    label = 'Iris ' + column
    panel.hist(df[column], color='black')
    panel.set_xlabel(label, fontsize = 12)
    panel.set_ylabel('Count', fontsize = 12)
    panel.set_title(label)
plt.tight_layout()
# Using seaborn: pair plot of the DataFrame, coloured by the class column.
import seaborn as sns
sns.pairplot(df, hue='class')
# Figure 2020-09-10 131749.png
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import seaborn as sns
# Violin plots: distribution of each measurement per class on a 2x2 grid.
fig, ax = plt.subplots(2,2, figsize=(10,8))
sns.set(style='white',palette='muted')
measurements = ['sepal length', 'sepal width', 'petal length', 'petal width']
# ax.flat walks the grid row by row: ax[0,0], ax[0,1], ax[1,0], ax[1,1].
for panel, column in zip(ax.flat, measurements):
    sns.violinplot(x=df['class'], y=df[column], ax = panel)
fig.suptitle('Violin Plots', fontsize = 16, y = 1.03)
# Rotate the x tick labels on every panel.
for i in ax.flat:
    plt.setp(i.get_xticklabels(), rotation = -90)
fig.tight_layout()
# Figure 2020-09-10 090529.png