今天我们学习一下Python的数据可视化库，常用的库主要有两个：matplotlib和seaborn。这一篇主要讲matplotlib库，seaborn我们下一篇再讲。数据可视化在数据分析中非常重要，通过图像可以直观地表示出数据的分布特点。接下来我们学习matplotlib库的使用。在学习的过程中，我们首先要明确：知道画什么，比知道怎么画更重要！也就是说，这两个库的学习主要是技术层面的学习；实际使用时，要先明确画什么图，再利用技术去实现。
Matplotlib是一个Python 2D绘图库，可以绘制出很多图形：

1、折线图

这里使用的数据集是美国1948年-2019年的失业率，如果有需要的可以私信我。

import pandas as pd

# US unemployment rate, 1948-2019 (per the file name); the columns used
# throughout this article are DATE and VALUE.
unrate = pd.read_csv(
    '/opt/jupyter_file/dataset/数据可视化/美国1948-2019失业率.csv'
)
# Parse date strings such as 1948/1/1 into real datetimes (shown as 1948-01-01).
unrate['DATE'] = pd.to_datetime(unrate['DATE'])
unrate.head(10)  # preview the first ten rows

接下来我们导入matplotlib库，

import matplotlib.pyplot as plt
# In a Jupyter notebook, enable the magic below so figures are shown inline.
# %matplotlib inline
plt.plot()  # no data passed, so only an empty set of axes is produced
# plt.show()  # uncomment to open the figure when running as a plain script

下面我们将数据加入到图像中，

# The first twelve rows of the table.
first_twelve = unrate[:12]
dates = first_twelve['DATE']
rates = first_twelve['VALUE']
plt.plot(dates, rates)  # first argument is the x axis, second is the y axis
plt.show()

我们还可以把横坐标的刻度标签旋转一个角度，

dates = first_twelve['DATE']
rates = first_twelve['VALUE']
plt.plot(dates, rates)
plt.xticks(rotation=45)  # rotate the x tick labels by 45 degrees
plt.show()

也可以添加横、纵坐标轴的标签和标题，

dates = first_twelve['DATE']
rates = first_twelve['VALUE']
plt.plot(dates, rates)
plt.xticks(rotation=90)  # vertical tick labels
# Axis captions and the figure title.
plt.xlabel('Month')
plt.ylabel('Unemployment Rate')
plt.title('Unemployment Trends , 1948')
plt.show()

2、子图操作

import matplotlib.pyplot as plt

fig = plt.figure()  # the figure is the canvas that holds every subplot
# add_subplot(rows, columns, index): the index counts grid cells from left to
# right, top to bottom, starting at 1.
# Each axes gets its own name so that no handle is overwritten.
ax1 = fig.add_subplot(2, 2, 1)  # top-left cell
ax2 = fig.add_subplot(2, 2, 2)  # top-right cell
ax4 = fig.add_subplot(2, 2, 4)  # bottom-right cell; cell 3 is left empty
plt.show()

import numpy as np

# figsize is the (width, height) of the whole figure, in inches.
# A 2 x 1 grid: ax1 on top, ax2 below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6))

# Top subplot: five random integers drawn from [1, 5) on x against 0..4 on y.
ax1.plot(np.random.randint(1, 5, 5), np.arange(5))
# Bottom subplot: 0, 3, ..., 27 on x against 0..9 on y.
ax2.plot(np.arange(10) * 3, np.arange(10))
plt.show()

也可以画两条线做对比，

# Month number of every observation: January = 1 ... December = 12.
unrate['MONTH'] = unrate['DATE'].dt.month

fig = plt.figure(figsize=(6, 3))
# Rows 0-11 are drawn in red and rows 12-23 in blue, on the same axes.
for row_slice, color in ((slice(0, 12), 'red'), (slice(12, 24), 'blue')):
    year_rows = unrate[row_slice]
    plt.plot(year_rows['MONTH'], year_rows['VALUE'], c=color)
plt.show()

还可以再加点复杂度，

fig = plt.figure(figsize=(10, 6))
colors = ['red', 'blue', 'green', 'orange', 'black']
# One line per consecutive 12-row slice; the legend labels start at 1948.
for year_offset, color in enumerate(colors):
    first_row = year_offset * 12
    subset = unrate[first_row:first_row + 12]
    plt.plot(subset['MONTH'], subset['VALUE'], c=color,
             label=str(1948 + year_offset))
plt.legend(loc='upper left')  # where the legend box is placed
plt.show()

还可以加上横纵坐标轴的标签和标题，

fig = plt.figure(figsize=(10, 6))
colors = ['red', 'blue', 'green', 'orange', 'black']
# One line per consecutive 12-row slice; the legend labels start at 1948.
for year_offset, color in enumerate(colors):
    first_row = year_offset * 12
    subset = unrate[first_row:first_row + 12]
    plt.plot(subset['MONTH'], subset['VALUE'], c=color,
             label=str(1948 + year_offset))
plt.legend(loc='upper left')  # where the legend box is placed
# Axis captions and the figure title.
plt.xlabel('Month Integer')
plt.ylabel('Unemployment Rate,Percent')
plt.title('Monthly Unemployment Trends,1948-1952')
plt.show()

3、条形图与散点图

这里的数据集是美国各大电影网站对部分电影的评分数据集，有需要的话可以私信我。

import pandas as pd

# Film ratings table used by the bar-chart and scatter-plot examples.
reviews = pd.read_csv(
    '/opt/jupyter_file/dataset/数据可视化/电影评分.csv'
)
reviews.head(10)  # preview the first ten rows

我们可以只取其中的几列数据，

# Keep only the film title and the rating columns used below.
cols = [
    'FILM',
    'RT_user_norm',
    'Metacritic_user_nom',
    'IMDB_norm',
    'Fandango_Ratingvalue',
    'Fandango_Stars',
]
norm_reviews = reviews[cols]
print(norm_reviews.head(1))  # show the first row

import matplotlib.pyplot as plt
from numpy import arange

# The rating columns charted for a single film.
num_cols = [
    'RT_user_norm',
    'Metacritic_user_nom',
    'IMDB_norm',
    'Fandango_Ratingvalue',
    'Fandango_Stars',
]
first_film = norm_reviews.loc[0, num_cols]  # the row labelled 0
bar_heights = first_film.values  # one bar height per rating column
print(bar_heights)
bar_positions = arange(5) + 0.75  # x position of each bar: 0.75, 1.75, ..., 4.75
print(bar_positions)

接下来进行画图，

# plt.subplots() with no arguments is shorthand for:
#     fig = plt.figure()
#     ax = fig.add_subplot(1, 1, 1)
# plt.subplots(1, 3) builds a grid of 1 row x 3 columns and returns the figure
# together with an array holding the three axes.
# Unlike fig.add_subplot, plt.subplots has no "index" argument: its third
# parameter is sharex, so plt.subplots(1, 3, 1) does not select a subplot.
# figsize sets the size of the whole figure, not of each subplot:
# plt.subplots(1, 3, figsize=(15, 7)) gives one 15 x 7 inch figure with three axes.
fig, ax = plt.subplots()
ax.bar(bar_positions, bar_heights, width=0.3)  # each bar is 0.3 wide
plt.show()

还可以设置得复杂一点，

# The rating columns charted for the film in the row labelled 0.
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
            'Fandango_Ratingvalue', 'Fandango_Stars']
bar_heights = norm_reviews.loc[0, num_cols].values  # one bar height per column
bar_positions = arange(5) + 0.75  # left edge of each bar: 0.75, 1.75, ..., 4.75
tick_positions = range(1, 6)  # one tick per bar: 1, 2, ..., 5
fig, ax = plt.subplots()

# align='edge' makes bar_positions the LEFT edge of each 0.5-wide bar, so the
# bars span 0.75-1.25, 1.75-2.25, ... and are centred on the ticks at 1..5.
# With the default centre alignment the labels would sit 0.25 off-centre.
ax.bar(bar_positions, bar_heights, 0.5, align='edge')
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=45)

ax.set_xlabel('Rating Source')
ax.set_ylabel('Average Rating')
ax.set_title('Average User Rating For Avengers:Age of Ultron(2015)')
plt.show()

也可以横着画，

import matplotlib.pyplot as plt
from numpy import arange

# The rating columns charted for the film in the row labelled 0.
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
            'Fandango_Ratingvalue', 'Fandango_Stars']

bar_widths = norm_reviews.loc[0, num_cols].values  # bar length per column
bar_positions = arange(5) + 0.75  # lower edge of each bar: 0.75, 1.75, ..., 4.75
tick_positions = range(1, 6)  # one tick per bar: 1, 2, ..., 5
fig, ax = plt.subplots()
# barh is the horizontal counterpart of bar. align='edge' makes bar_positions
# the lower edge of each 0.5-high bar, so the bars are centred on the ticks at
# 1..5; with the default centre alignment the labels would sit 0.25 off-centre.
ax.barh(bar_positions, bar_widths, 0.5, align='edge')

ax.set_yticks(tick_positions)
ax.set_yticklabels(num_cols)
ax.set_ylabel('Rating Source')
ax.set_xlabel('Average Rating')
ax.set_title('Average User Rating For Avengers:Age of Ultron(2015)')
plt.show()

接下来是散点图，

fig, ax = plt.subplots()
fandango_scores = norm_reviews['Fandango_Ratingvalue']
rt_scores = norm_reviews['RT_user_norm']
# Scatter plot: one point per row, Fandango on x and Rotten Tomatoes on y.
ax.scatter(fandango_scores, rt_scores)
ax.set(xlabel='Fandango', ylabel='Rotten Tomatoes')
plt.show()

两张子图，

fandango_scores = norm_reviews['Fandango_Ratingvalue']
rt_scores = norm_reviews['RT_user_norm']

# Two stacked subplots showing the same pair of columns with the axes swapped.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(5, 10))
ax1.scatter(fandango_scores, rt_scores)
ax1.set(xlabel='Fandango', ylabel='Rotten Tomatoes')
ax2.scatter(rt_scores, fandango_scores)
ax2.set(xlabel='Rotten Tomatoes', ylabel='Fandango')
plt.show()

4、直方图和盒图

这里的数据集我们还是使用之前的电影评分数据集，

import pandas as pd
import matplotlib.pyplot as plt

# Reload the film ratings table and keep the title plus four rating columns.
reviews = pd.read_csv(
    '/opt/jupyter_file/dataset/数据可视化/电影评分.csv'
)
cols = [
    'FILM',
    'RT_user_norm',
    'Metacritic_user_nom',
    'IMDB_norm',
    'Fandango_Ratingvalue',
]
norm_reviews = reviews[cols]
print(norm_reviews.head(5))  # show the first five rows

# Count how often each distinct rating occurs (like a SQL GROUP BY ... COUNT),
# then order the counts by rating value instead of by frequency.
fandango_distribution = (
    norm_reviews['Fandango_Ratingvalue'].value_counts().sort_index()
)
imdb_distribution = norm_reviews['IMDB_norm'].value_counts().sort_index()

print(fandango_distribution)
print(imdb_distribution)

fig, ax = plt.subplots()
# Draw ONE histogram per axes: calling hist() twice on the same axes overlays
# the two results and makes the figure hard to read.
# Other ways to bin the same column (swap one in to compare):
#   ax.hist(norm_reviews['Fandango_Ratingvalue'])                         # default number of bins
#   ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5), bins=20)  # only values in [4, 5]
ax.hist(norm_reviews['Fandango_Ratingvalue'], bins=20)
# Frequently used hist() parameters:
#   bins    - number of equal-width bins, i.e. how many bars are drawn
#   range   - (lower, upper) limits of the values that are binned (x extent)
#   density - True normalises the bars so their total area is 1 (default False);
#             it replaces the `normed` parameter of older Matplotlib releases
#   color   - colour of the bars
#   bottom  - baseline on the y axis from which the bars start
plt.show()

# (column, title) for each of the four stacked histograms, top to bottom.
hist_specs = [
    ('Fandango_Ratingvalue', 'Distribution of Fandango Ratings'),
    ('RT_user_norm', 'Distribution of Rotten Tomatoes Ratings'),
    ('Metacritic_user_nom', 'Distribution of Metacritic Ratings'),
    ('IMDB_norm', 'Distribution of IMDB Ratings'),
]

fig = plt.figure(figsize=(5, 20))
for row, (column, title) in enumerate(hist_specs, start=1):
    axis = fig.add_subplot(4, 1, row)
    # Same binning (20 bins over 0-5) and the same y limits on every subplot,
    # so the four distributions can be compared directly.
    axis.hist(norm_reviews[column], bins=20, range=(0, 5))
    axis.set_title(title)
    axis.set_ylim(0, 50)

plt.show()

接下来是盒图，

fig, ax = plt.subplots()
rt_scores = norm_reviews['RT_user_norm']
ax.boxplot(rt_scores)  # a single box for the Rotten Tomatoes column
ax.set_xticklabels(['Rotten Tomatoes'])
ax.set_ylim(bottom=0, top=5)
plt.show()

num_cols = [
    'RT_user_norm',
    'Metacritic_user_nom',
    'IMDB_norm',
    'Fandango_Ratingvalue',
]
fig, ax = plt.subplots()
rating_matrix = norm_reviews[num_cols].values  # 2-D array, one column per source
ax.boxplot(rating_matrix)  # draws one box per column
ax.set_xticklabels(num_cols, rotation=90)
ax.set_ylim(bottom=0, top=5)
plt.show()

5、细节处理

这里我们用到的数据集是美国各专业授予的学士学位中女性的占比情况（percent-bachelors-degrees-women-usa.csv），有需要的同学可以私信我。

import pandas as pd
import matplotlib.pyplot as plt

# One row per Year; the other columns used below are named after majors.
women_degrees = pd.read_csv(
    '/opt/jupyter_file/dataset/数据可视化/percent-bachelors-degrees-women-usa.csv'
)
years = women_degrees['Year']
plt.plot(years, women_degrees['Biology'])
plt.show()

我们进行一下男女生的对比，

years = women_degrees['Year']
women_share = women_degrees['Biology']
plt.plot(years, women_share, c='red', label='women')
# The men's curve is drawn as the complement to 100.
plt.plot(years, 100 - women_share, c='blue', label='men')
plt.legend(loc='upper right')
plt.title('Percentage of Biology Degrees Awarded By Gender')
plt.show()

接下来，添加或者去除坐标轴上的刻度线（小齿）。注释掉的部分也可以运行，为了方便发现不同，建议去掉注释分别运行一下，

fig, ax = plt.subplots()
years = women_degrees['Year']
women_share = women_degrees['Biology']
ax.plot(years, women_share, c='red', label='women')
ax.plot(years, 100 - women_share, c='blue', label='men')
plt.legend(loc='upper right')
# Hide the tick marks on all four sides. The flags must be booleans: the
# string "off" is truthy, so current Matplotlib releases leave the ticks on.
ax.tick_params(bottom=False, top=False, left=False, right=False)

# Alternative: hide the frame lines (spines) one at a time.
# ax.spines['right'].set_visible(False)
# ax.spines['left'].set_visible(False)
# ax.spines['top'].set_visible(False)
# ax.spines['bottom'].set_visible(False)


# Alternative: hide every spine with a loop.
# for key,spine in ax.spines.items():
#     spine.set_visible(False)
ax.set_title('Percentage of Biology Degrees Awarded By Gender')
plt.show()

我们画一下各科历年的男女生占比情况，

major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))
years = women_degrees['Year']

# One subplot per major on a 2 x 2 grid.
for cell, major in enumerate(major_cats, start=1):
    ax = fig.add_subplot(2, 2, cell)
    women_share = women_degrees[major]
    ax.plot(years, women_share, c='red', label='women')
    ax.plot(years, 100 - women_share, c='blue', label='men')

# plt.legend acts on the current axes, i.e. the last subplot created above.
plt.legend(loc='upper right')
plt.show()

我们加上标题，同时去掉边框，

major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))
years = women_degrees['Year']

# One subplot per major on a 2 x 2 grid.
for cell, major in enumerate(major_cats, start=1):
    ax = fig.add_subplot(2, 2, cell)
    women_share = women_degrees[major]
    ax.plot(years, women_share, c='red', label='women')
    ax.plot(years, 100 - women_share, c='blue', label='men')
    # Remove the frame around the plot.
    for spine in ax.spines.values():
        spine.set_visible(False)
    # Identical limits on every subplot so the panels are comparable.
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major)
    # Booleans are required here: the string "off" is truthy and would leave
    # the tick marks visible on current Matplotlib releases.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

# plt.legend acts on the current axes, i.e. the last subplot created above.
plt.legend(loc='upper right')
plt.show()

我们改一下曲线的颜色，

major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
# Colours given as (R, G, B) tuples with every channel scaled to 0-1.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))
years = women_degrees['Year']

# One subplot per major on a 2 x 2 grid.
for cell, major in enumerate(major_cats, start=1):
    ax = fig.add_subplot(2, 2, cell)
    women_share = women_degrees[major]
    ax.plot(years, women_share, c=cb_dark_blue, label='women')
    ax.plot(years, 100 - women_share, c=cb_orange, label='men')
    # Remove the frame around the plot.
    for spine in ax.spines.values():
        spine.set_visible(False)
    # Identical limits on every subplot so the panels are comparable.
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major)
    # Booleans are required here: the string "off" is truthy and would leave
    # the tick marks visible on current Matplotlib releases.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

# plt.legend acts on the current axes, i.e. the last subplot created above.
plt.legend(loc='upper right')
plt.show()

也可以改变一下曲线的宽度，

major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
# Colours given as (R, G, B) tuples with every channel scaled to 0-1.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))
years = women_degrees['Year']

# One subplot per major on a 2 x 2 grid.
for cell, major in enumerate(major_cats, start=1):
    ax = fig.add_subplot(2, 2, cell)
    women_share = women_degrees[major]
    # linewidth sets the thickness of each curve.
    ax.plot(years, women_share, c=cb_dark_blue, label='women', linewidth=10)
    ax.plot(years, 100 - women_share, c=cb_orange, label='men', linewidth=10)
    # Remove the frame around the plot.
    for spine in ax.spines.values():
        spine.set_visible(False)
    # Identical limits on every subplot so the panels are comparable.
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major)
    # Booleans are required here: the string "off" is truthy and would leave
    # the tick marks visible on current Matplotlib releases.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

# plt.legend acts on the current axes, i.e. the last subplot created above.
plt.legend(loc='upper right')
plt.show()

设置一下图片的排版，

major_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology',
              'Physical Sciences', 'Math and Statistics']

# A wide, short figure: six subplots side by side in a single row.
fig = plt.figure(figsize=(18, 3))
years = women_degrees['Year']

# cb_dark_blue and cb_orange are the colour tuples defined in the previous example.
for cell, major in enumerate(major_cats, start=1):
    ax = fig.add_subplot(1, 6, cell)
    women_share = women_degrees[major]
    ax.plot(years, women_share, c=cb_dark_blue, label='women', linewidth=3)
    ax.plot(years, 100 - women_share, c=cb_orange, label='men', linewidth=3)
    # Remove the frame around the plot.
    for spine in ax.spines.values():
        spine.set_visible(False)
    # Identical limits on every subplot so the panels are comparable.
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major)
    # Booleans are required here: the string "off" is truthy and would leave
    # the tick marks visible on current Matplotlib releases.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

# plt.legend acts on the current axes, i.e. the last subplot created above.
plt.legend(loc='upper right')
plt.show()

还可以在曲线上设置文字，

major_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology',
              'Physical Sciences', 'Math and Statistics']

# A wide, short figure: six subplots side by side in a single row.
fig = plt.figure(figsize=(18, 3))
years = women_degrees['Year']

# cb_dark_blue and cb_orange are the colour tuples defined in an earlier example.
for sp, major in enumerate(major_cats):
    ax = fig.add_subplot(1, 6, sp + 1)
    women_share = women_degrees[major]
    ax.plot(years, women_share, c=cb_dark_blue, label='women', linewidth=3)
    ax.plot(years, 100 - women_share, c=cb_orange, label='men', linewidth=3)
    # Remove the frame around the plot.
    for spine in ax.spines.values():
        spine.set_visible(False)
    # Identical limits on every subplot so the panels are comparable.
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major)
    # Booleans are required here: the string "off" is truthy and would leave
    # the tick marks visible on current Matplotlib releases.
    ax.tick_params(bottom=False, top=False, left=False, right=False)
    # Label the curves directly on the first and last subplots instead of
    # using a legend. ax.text(x, y, s) places the string s at data
    # coordinates (x, y).
    if sp == 0:
        ax.text(2005, 87, 'men')
        ax.text(2002, 8, 'women')
    elif sp == 5:
        ax.text(2005, 62, 'men')
        ax.text(2001, 35, 'women')

plt.show()

好了，matplotlib库就学到这里，里面其实还有很多参数是没有讲到的，如果有需要，建议去官网看一下具体细节(https://matplotlib.org/),下一篇我们学习seaborn库。

Python数据分析(五):数据可视化库之Matplotlib