kaggle-top50
top50 的数据是 Kaggle 官网上一个关于音乐的数据集。
There are 50 songs and 13 variables to be explored
新知识
数据本身是比较完美的,没有涉及到太多的数据预处理工作,主要是学习到了多种图形的绘制
直方图
直方图+折线
热力图
饼图
等高线图
属性
分析过程
导入库和包
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
import squarify as sq
from pandas.plotting import scatter_matrix
import seaborn as sns
import sklearn
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler, LabelEncoder # 预处理模块
from sklearn.linear_model import LinearRegression # 线性回归
from sklearn.model_selection import train_test_split,cross_val_score, KFold # 数据分离,交叉验证,K折验证
from sklearn import metrics # 矩阵模块
from sklearn.metrics import confusion_matrix, classification_report # 混淆矩阵,分类报告
%matplotlib inline
# Enable rendering of Chinese characters in figure text.
mpl.rcParams["font.family"] = "sans-serif"
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# CJK fonts such as SimHei typically lack the Unicode minus glyph, so use the
# ASCII hyphen for negative numbers; otherwise negative tick labels and
# annotations (e.g. loudness in dB) can render as empty boxes.
mpl.rcParams["axes.unicode_minus"] = False
数据查看
# Path of the top50 CSV file on the author's machine.
filename = '/Users/peter/data-visualization/top50.csv'

read_options = {
    "encoding": "ISO-8859-1",  # avoids the UnicodeError raised with the default encoding
    "engine": "python",
    "index_col": 0,            # use the file's first column as the index, not as an attribute
}
data = pd.read_csv(filename, **read_options)
data.head()
属性重命名rename
# Map the raw CSV headers (dotted names) to the names used in the rest of the
# script. The rename is applied in place on `data`.
column_names = {
    'Track.Name': 'track_name',
    'Artist.Name': 'artist_name',
    'Beats.Per.Minute': 'beats_per_minute',
    'Loudness..dB..': 'Loudness(dB)',
    'Valence.': 'Valence',
    'Length.': 'Length',
    'Acousticness..': 'Acousticness',
    'Speechiness.': 'Speechiness',
}
data.rename(columns=column_names, inplace=True)
Calculating the number of songs of each genre
# Group the rows by genre and count how many songs fall into each one.
genre_groups = data.groupby('Genre')
popular_genre = genre_groups.size()
print(popular_genre)

# Genre of every row as a plain Python list.
# NOTE(review): this has one entry per song, not one per distinct genre.
genre_list = data['Genre'].tolist()
Calculating the number of songs by each of the artists
# Group the rows by artist and count how many songs each artist has.
artist_groups = data.groupby('artist_name')
popular_artist = artist_groups.size()
print(popular_artist)

# Artist of every row as a plain Python list.
# NOTE(review): this has one entry per song, not one per distinct artist.
artist_list = data['artist_name'].tolist()
查看属性的统计信息
# Show at most 3 decimal places. The fully qualified option key is used
# because a bare 'precision' pattern can match more than one option in newer
# pandas releases, which makes set_option raise OptionError.
pd.set_option('display.precision', 3)
data.describe()  # summary statistics
Finding out the skew for each attribute
找出每个属性的偏度skew
# Skewness coefficient of each numeric attribute. numeric_only=True keeps the
# text columns (track/artist names, genre) out of the computation; without it
# pandas 2.x raises instead of silently dropping them.
skew = data.skew(numeric_only=True)
print(skew)
# Pull the Liveness values out as a 1-D ndarray. Selecting with a single
# label (not a list of labels) matters: stats.boxcox expects 1-D input and a
# one-column DataFrame would yield a 2-D (n, 1) array.
transform = np.asarray(data['Liveness'].values)
print(type(transform))
# boxcox returns (transformed_values, fitted_lambda); keep the values only.
data_transform = stats.boxcox(transform)[0]

# Histogram of the original data.
plt.hist(data['Liveness'], bins=10)
plt.title("original data")
plt.show()

# Histogram after the Box-Cox skew correction.
plt.hist(data_transform, bins=10)
plt.title("skew corrected data")
plt.show()
如何在直方图的基础上画出折线趋势
# 1-D array of Popularity values (stats.boxcox expects 1-D input, so select
# the column with a single label rather than a list of labels).
transform1 = np.asarray(data['Popularity'].values)
# boxcox returns (transformed_values, fitted_lambda); keep the values only.
data_transform1 = stats.boxcox(transform1)[0]

# Plain histograms, as done for Liveness, would be:
# plt.hist(data['Popularity'],bins=10) #original data
# plt.show()
# plt.hist(data_transform1,bins=10) #corrected skew data
# plt.show()

# Histogram with a KDE curve drawn on top.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider migrating to histplot/displot once the target version is known.
kde_style = {"color": "k", "lw": 2, "label": "KDE"}

sns.distplot(data['Popularity'], bins=10, kde=True, kde_kws=kde_style, color='blue')
plt.title("original data")
plt.show()

sns.distplot(data_transform1, bins=10, kde=True, kde_kws=kde_style, color='green')
plt.title("skew corrected data")
plt.show()
Bar graph to see the number of songs of each genre
# Bar chart with one bar per genre, its height being the number of songs.
fig, ax = plt.subplots(figsize=(30, 12))  # canvas size
length = np.arange(len(popular_genre))
plt.bar(length, popular_genre, color='g', edgecolor='black', alpha=0.7)
# Label each bar with the genre it actually counts: popular_genre's index is
# aligned with its values. The per-row genre list has one entry per song, so
# it neither matches the number of ticks nor their order.
plt.xticks(length, popular_genre.index)
plt.title("Most popular genre", fontsize=28)
plt.xlabel("Genre", fontsize=25)
plt.ylabel("Number On Songs", fontsize=25)
plt.show()
相关系数correlation
如何求解相关系数
# Wrap printed frames at 100 characters per line.
pd.set_option('display.width', 100)
# Show at most 3 decimal places. The fully qualified key avoids the
# ambiguous-pattern OptionError that a bare 'precision' triggers in newer
# pandas releases.
pd.set_option('display.precision', 3)

# Correlation is only defined for numeric columns; select them explicitly so
# the text columns do not make corr() raise on pandas 2.x.
numeric_data = data.select_dtypes(include='number')
# method: 'pearson' for linear relationships, 'kendall' for rank correlation
# of ordinal data, 'spearman' for rank correlation of non-linear / non-normal
# data.
correclation = numeric_data.corr(method='spearman')
print(correclation)
8.2 根据相关系数画出热力图
# Draw the correlation matrix as an annotated heat map on a square figure.
plt.figure(figsize=(10, 10))
plt.title("Correclation heatmap")
# NOTE(review): center=1 coincides with vmax; confirm this is intended rather
# than center=0 for a correlation scale of [-1, 1].
heatmap_options = dict(annot=True, vmin=-1, vmax=1, cmap="GnBu_r", center=1)
sns.heatmap(correclation, **heatmap_options)
barh of most popular artists
# Horizontal bar chart with one bar per artist, its width being the number of
# songs that artist has in the data set.
fig, ax = plt.subplots(figsize=(12, 12))
length = np.arange(len(popular_artist))
# Signature reminder: plt.barh(y, width, height=0.8, left=None, *, align='center', **kwargs)
plt.barh(length, popular_artist, color='r', edgecolor='black', alpha=0.7)
# Label each bar with the artist it actually counts: popular_artist's index is
# aligned with its values. The per-row artist list has one entry per song, so
# it neither matches the number of ticks nor their order.
plt.yticks(length, popular_artist.index)
plt.title("Most popular artists", fontsize=18)
plt.ylabel("Artists", fontsize=18)
plt.xlabel("Number of songs", fontsize=16)
plt.show()
Analysing the relationship between energy and loudness
# Scatter plot with a fitted regression line: Energy vs. Loudness(dB).
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` really is the
# figure and the plot is drawn on the axes that was just created.
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(x='Energy', y='Loudness(dB)', data=data, color='black', ax=ax)
Dependence between energy and popularity
# Regression plot of Energy vs. Popularity with bivariate KDE contours
# overlaid on the same axes.
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` really is the
# figure.
fig, ax = plt.subplots(figsize=(10, 10))
plt.title('Dependence between energy and popularity')
sns.regplot(x='Energy', y='Popularity', ci=None, data=data)
# Pass the two variables by keyword: current seaborn accepts only `data`
# positionally, so kdeplot(a, b) no longer means "x and y".
sns.kdeplot(x=data.Energy, y=data.Popularity)
# Treemap of genres, rectangle area proportional to the number of songs.
plt.figure(figsize=(14, 8))
# Take sizes and labels from the same value_counts() result so they stay
# aligned. value_counts() is ordered by frequency while unique() is ordered
# by first appearance, so mixing the two attaches labels to the wrong
# rectangles.
genre_counts = data.Genre.value_counts()
sq.plot(sizes=genre_counts.tolist(), label=genre_counts.index.tolist(), alpha=0.8)
plt.axis('off')
plt.show()
Pie charts 饼图
通过每个歌手和其歌曲数目制作饼图
# Pie chart of the share of songs per artist.
artist_counts = data.artist_name.value_counts()
labels = artist_counts.index   # label of each wedge
sizes = artist_counts.values   # size of each wedge
colors = ['red', 'yellowgreen', 'lightcoral', 'lightskyblue', 'cyan', 'green', 'black', 'yellow']
# Percentage format for the wedge annotations. It must be handed to plt.pie;
# assigning it to a variable after the call has no effect on the chart.
autopct = "%1.1f%%"

plt.figure(figsize=(10, 10))
plt.pie(sizes, labels=labels, colors=colors, autopct=autopct)
plt.axis('equal')
plt.show()
Linear Regression
数据构建和TTS
# Build the feature matrix and target vector, hold out 30% of the rows as a
# test set, then fit an ordinary linear regression on the training part.
feature_columns = ['Energy', 'Danceability', 'Length', 'Loudness(dB)', 'Acousticness']
x = data[feature_columns].values
y = data['Popularity'].values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

reg = LinearRegression()
reg.fit(X_train, y_train)
预测
# Predict on the held-out rows and show actual vs. predicted side by side.
y_pred = reg.predict(X_test)
comparison_columns = {'Actual': y_test, 'Predicted': y_pred}
data_output = pd.DataFrame(comparison_columns)
print(data_output)
# Error metrics of the linear regression on the test set.
# MAE: mean absolute error; MSE: mean squared error; Root MSE: sqrt of MSE.
test_mae = metrics.mean_absolute_error(y_test, y_pred)
test_mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE", test_mae)
print("MSE", test_mse)
print("Root MSE:", np.sqrt(test_mse))
# Plot predicted values (x axis) against the true test values (y axis).
plt.figure(figsize=(10, 10))
line_style = dict(
    color='black',
    linestyle='dashed',
    marker='*',
    markerfacecolor='red',
    markersize=10,
)
plt.plot(y_pred, y_test, **line_style)
plt.title("Error analsis")
plt.xlabel("Predicted values")
plt.ylabel("Test values")
[图片上传失败...(image-d45de2-1579152436836)]
交叉验证
# 5-fold cross-validation of a fresh linear regression, scored with negative
# mean squared error, followed by a comparison with the test-set MSE.
# NOTE(review): x and y are rebuilt here with two features, but the
# cross-validation below runs on X_train / y_train from the earlier split, so
# these two assignments do not influence the result. Confirm which was meant.
x = data[['Energy', 'Danceability']].values
y = data['Popularity'].values

reg = LinearRegression()
mse = cross_val_score(reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
mean_mse = mse.mean()
print(mean_mse)

# The scorer returns negated MSE values, hence abs() before comparing with
# the MSE measured on the held-out test set.
test_set_mse = metrics.mean_squared_error(y_test, y_pred)
diff = test_set_mse - abs(mean_mse)
print(diff)