31. 加载电信客户流失数据集
# Read the Telco customer churn CSV from the working directory and preview
# the first five rows.
csv_path = "Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
preview = df.head(n=5)
print(preview)
32. 统计每一列数据的缺失值
# Build the boolean "is missing" mask once, show it cell by cell, then show
# the per-column count of missing cells.
missing_mask = df.isna()
print(missing_mask)
missing_per_column = missing_mask.sum()
print(missing_per_column)
33. 正确设置数据列的类型
# Inspect the column dtypes. DataFrame.info() prints its report itself and
# returns None, so it is called directly instead of being wrapped in print().
df.info()
print(df["TotalCharges"].value_counts())

# Some TotalCharges cells hold a single blank string (" "), which prevents a
# direct conversion to float. Replace those cells with the median of the
# remaining values, parsing them as numbers first so the median is computed
# on floats rather than on text.
blank_rows = df["TotalCharges"] == " "
median = pd.to_numeric(df.loc[~blank_rows, "TotalCharges"]).median()
df.loc[blank_rows, "TotalCharges"] = median
df["TotalCharges"] = df["TotalCharges"].astype(float)

print()
print(df["TotalCharges"].value_counts())
34. 将类别字段转换成category类型
print(df.columns)

# Columns that carry continuous numeric values; everything else is treated as
# a categorical field.
number_columns = ["tenure", "MonthlyCharges", "TotalCharges"]
for column in number_columns:
    df[column] = df[column].astype(float)

# Walk the remaining columns in their original order (a set difference would
# iterate in arbitrary order) and store each one as a pandas Categorical.
category_columns = [name for name in df.columns if name not in number_columns]
for column in category_columns:
    df[column] = pd.Categorical(df[column])

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
df.info()
35. 对category类型字段进行数据统计
# Summary statistics restricted to the columns of "category" dtype.
category_summary = df.describe(include=["category"])
print(category_summary)
36. Churn字段的数据分布
# Count how many rows fall under each distinct Churn value.
churn_counts = df["Churn"].value_counts()
print(churn_counts)
37. 多维度查看MonthlyCharges字段统计
print(df.columns)

# Average MonthlyCharges for every (Churn, PaymentMethod) combination.
group_keys = ["Churn", "PaymentMethod"]
monthly_charges_grouped = df.groupby(group_keys)["MonthlyCharges"]
print(monthly_charges_grouped.mean())
38. Churn字段的数据映射
# Show the Churn distribution, recode its "Yes"/"No" labels as 1/0, then show
# the distribution again for comparison.
churn_codes = {"Yes": 1, "No": 0}
print(df["Churn"].value_counts())
recoded_churn = df["Churn"].map(churn_codes)
df["Churn"] = recoded_churn
print()
print(df["Churn"].value_counts())
39. 查看字段相关矩阵
print(df.head(3))
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
df.info()
print()

# Correlate only the numeric columns. Older pandas releases silently dropped
# non-numeric columns in DataFrame.corr(), while newer releases try to
# convert them and raise, so the selection is made explicit here.
numeric_frame = df.select_dtypes(include="number")
print(numeric_frame.corr())
40. 从数据集中采样数据行
# Draw ten rows at random from the dataset and display them.
sampled_rows = df.sample(n=10)
print(sampled_rows)
课程参考链接:https://ke.qq.com/course/4000626#term_id=104152097