# pandas

import os
import pandas as pd
import requests

PATH = r'D:/learn/'

# Request the iris data set.
# The timeout keeps the script from hanging indefinitely on a stalled connection.
r = requests.get(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    timeout=30,
)
# Stop on an HTTP error so an error page is never saved and parsed as if it were the data.
r.raise_for_status()

# Save the downloaded text to a local file.
# The encoding is explicit so the file is written the same way read_csv reads it back (UTF-8).
with open(PATH + 'iris.data', 'w', encoding='utf-8') as f:
    f.write(r.text)
# Switch the working directory to PATH.
os.chdir(PATH)

# Load the file into a DataFrame; the column labels are supplied here via names=.
df = pd.read_csv(
    PATH + 'iris.data',
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'class'],
)

# Show the first 10 rows (head() returns 5 when no count is given).
# In pandas terminology a column of data is a Series and a table is a DataFrame.
first_rows = df.head(10)
print(first_rows)


# Fetch a single column by its name.
sepal_length = df['sepal length']
print(sepal_length)

# List all column labels.
print(df.columns)
# Result: Index(['sepal length', 'sepal width', 'petal length', 'petal width', 'class'], dtype='object')

# Distinct values of a column.
print(df['class'].unique())  # ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

# Count the entries of every column.
print(df.count())
# sepal length    150
# sepal width     150
# petal length    150
# petal width     150
# class           150
# dtype: int64

# Count the entries of one column.
print(df['class'].count())

# Keep only the rows whose class is Iris-setosa.
setosa = df[df['class'] == 'Iris-setosa']
print(setosa)

# The same Iris-setosa selection, printed a second time as in the original walkthrough.
print(setosa)

# Per-column counts of the filtered rows.
print(setosa.count())

# Rebuild the index; drop=False keeps the old index as an 'index' column.
print(setosa.reset_index(drop=False).head(3))

#    index  sepal length  sepal width  petal length  petal width        class
# 0      0           5.1          3.5           1.4          0.2  Iris-setosa
# 1      1           4.9          3.0           1.4          0.2  Iris-setosa
# 2      2           4.7          3.2           1.3          0.2  Iris-setosa

# Same Iris-setosa filter, but drop=True discards the old index instead of keeping it as a column.
setosa_mask = df['class'] == 'Iris-setosa'
setosa_reindexed = df[setosa_mask].reset_index(drop=True)
print(setosa_reindexed.head(3))

#    sepal length  sepal width  petal length  petal width        class
# 0           5.1          3.5           1.4          0.2  Iris-setosa
# 1           4.9          3.0           1.4          0.2  Iris-setosa
# 2           4.7          3.2           1.3          0.2  Iris-setosa


# Filtering on several conditions: each condition is a boolean mask and the masks are combined with &.
is_setosa = df['class'] == 'Iris-setosa'
has_wide_sepal = df['sepal width'] > 3.8
print(df[is_setosa & has_wide_sepal])


# Summary statistics.
summary = df.describe()
print(summary)

#        sepal length  sepal width  petal length  petal width
# count    150.000000   150.000000    150.000000   150.000000
# mean       5.843333     3.054000      3.758667     1.198667
# std        0.828066     0.433594      1.764420     0.763161
# min        4.300000     2.000000      1.000000     0.100000
# 25%        5.100000     2.800000      1.600000     0.300000
# 50%        5.800000     3.000000      4.350000     1.300000
# 75%        6.400000     3.300000      5.100000     1.800000
# max        7.900000     4.400000      6.900000     2.500000

# Summary statistics with custom percentiles in place of the default 25% / 50% / 75%.
custom_percentiles = [.20, .40, .60, .80, .90]
percentile_summary = df.describe(percentiles=custom_percentiles)
print(percentile_summary)

#        sepal length  sepal width  petal length  petal width
# count    150.000000   150.000000    150.000000   150.000000
# mean       5.843333     3.054000      3.758667     1.198667
# std        0.828066     0.433594      1.764420     0.763161
# min        4.300000     2.000000      1.000000     0.100000
# 20%        5.000000     2.700000      1.500000     0.200000
# 40%        5.600000     3.000000      3.900000     1.160000
# 50%        5.800000     3.000000      4.350000     1.300000
# 60%        6.100000     3.100000      4.640000     1.500000
# 80%        6.520000     3.400000      5.320000     1.900000
# 90%        6.900000     3.610000      5.800000     2.200000
# max        7.900000     4.400000      6.900000     2.500000


# Correlation between the measurement columns.
# The string-valued 'class' column is excluded explicitly: pandas 2.0+ no longer
# drops non-numeric columns in DataFrame.corr() by default and raises on them.
# select_dtypes works on every pandas version.
print(df.select_dtypes(include='number').corr())

#               sepal length  sepal width  petal length  petal width
# sepal length      1.000000    -0.109369      0.871754     0.817954
# sepal width      -0.109369     1.000000     -0.420516    -0.356544
# petal length      0.871754    -0.420516      1.000000     0.962757
# petal width       0.817954    -0.356544      0.962757     1.000000

# Pearson correlation, requested explicitly.
# Restricted to numeric columns because the string-valued 'class' column makes
# DataFrame.corr() raise on pandas 2.0+, where non-numeric columns are no longer dropped by default.
print(df.select_dtypes(include='number').corr(method='pearson'))

#               sepal length  sepal width  petal length  petal width
# sepal length      1.000000    -0.109369      0.871754     0.817954
# sepal width      -0.109369     1.000000     -0.420516    -0.356544
# petal length      0.871754    -0.420516      1.000000     0.962757
# petal width       0.817954    -0.356544      0.962757     1.000000


# Kendall rank correlation.
# Restricted to numeric columns because the string-valued 'class' column makes
# DataFrame.corr() raise on pandas 2.0+, where non-numeric columns are no longer dropped by default.
print(df.select_dtypes(include='number').corr(method='kendall'))

#               sepal length  sepal width  petal length  petal width
# sepal length      1.000000    -0.072112      0.717624     0.654960
# sepal width      -0.072112     1.000000     -0.182391    -0.146988
# petal length      0.717624    -0.182391      1.000000     0.803014
# petal width       0.654960    -0.146988      0.803014     1.000000



# Plotting
import matplotlib.pyplot as plt
plt.style.use("ggplot")


# One histogram per measurement, drawn on a 2x2 grid of axes in row-major order.
fig, ax = plt.subplots(2, 2, figsize=(6, 4))
measurements = ['sepal length', 'sepal width', 'petal length', 'petal width']
for panel, column in zip(ax.flat, measurements):
    label = 'Iris ' + column  # used for both the x-axis label and the panel title
    panel.hist(df[column], color='black')
    panel.set_xlabel(label, fontsize=12)
    panel.set_ylabel('Count', fontsize=12)
    panel.set_title(label)

plt.tight_layout()

# Using seaborn: pairwise plots of the columns, coloured by the 'class' column.
import seaborn as sns
sns.pairplot(df, hue='class')

# Figure 2020-09-10 131749.png

import matplotlib.pyplot as plt
plt.style.use("ggplot")
import seaborn as sns

# 2x2 grid of axes, one violin plot per measurement.
fig, ax = plt.subplots(2, 2, figsize=(10, 8))

# NOTE(review): sns.set runs after the axes have been created -- confirm whether
# the 'white' style is meant to apply to this figure or only to later ones.
sns.set(style='white', palette='muted')
species = df['class']
measurements = ['sepal length', 'sepal width', 'petal length', 'petal width']
for panel, column in zip(ax.flat, measurements):
    sns.violinplot(x=species, y=df[column], ax=panel)

fig.suptitle('Violin Plots', fontsize=16, y=1.03)

# Rotate the class names on every x axis.
for panel in ax.flat:
    plt.setp(panel.get_xticklabels(), rotation=-90)

fig.tight_layout()

# Figure 2020-09-10 090529.png