数据探查
- 读取数据
# Load the car-price dataset from a CSV file in the working directory.
df = pd.read_csv("./car_price.csv")
# Preview the first three rows.
df.head(3)
- 数据信息
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()
- 空值查看
# Count the missing values in each column.
df.isnull().sum()
- 删除空值
# Drop every row that has at least one missing value, modifying df in place.
df.dropna(inplace=True)
清洗处理
- 字段提取数字
只保留第一个空格之前的数字部分(去掉空格后面的单位字符串)
def _leading_number(text):
    """Return the part of ``text`` before its first space, converted to float."""
    head = text.split(" ")[0]
    return float(head)


# Each of these columns holds strings of the form "<number> <unit>"; keep only
# the numeric part.
df["mileage"] = df["mileage"].map(_leading_number)
df["mileage"].head(3)
df["engine"] = df["engine"].map(_leading_number)
df["engine"].head(3)
df["max_power"] = df["max_power"].map(_leading_number)
df["max_power"].head(3)
用正则表达式提取 torque 字段中出现的最大数值
import re
def parse_rpm(torque):
    """Return the largest number found in a torque description, as a float.

    Thousands separators (",") are removed first, then every run of digits is
    read as a number and the maximum is returned. Only digit runs are matched,
    so a decimal such as "22.4" contributes 22 and 4 separately.

    Args:
        torque: Free-form text such as "190Nm@ 2000rpm".

    Returns:
        The maximum digit run in ``torque`` converted to float.

    Raises:
        ValueError: if ``torque`` contains no digits.
    """
    without_separators = torque.replace(",", "")
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    numbers = [float(token) for token in re.findall(r"\d+", without_separators)]
    return max(numbers)
# Replace each torque string with the number returned by parse_rpm.
df["torque"] = df["torque"].map(parse_rpm)
df["torque"].head(3)
统计分析
- 删除name字段
# Remove the "name" column in place, then preview the result.
df.drop(columns=["name"], inplace=True)
df.head(3)
- object类统计
# Summary statistics for the object-dtype columns only.
df.select_dtypes(include=["object"]).describe()
- float类统计
# Summary statistics for the float-dtype columns only.
df.select_dtypes(include=["float"]).describe()
- 相关性矩阵
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(8, 8))
# df still contains object columns (fuel, seller_type, transmission, owner);
# pandas >= 2.0 no longer drops them silently in corr(), so restrict the
# correlation matrix to the numeric columns explicitly.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)
可以看出,销售价格(selling_price)与最大功率(max_power)强相关,其次是发动机排量(engine)和年份(year)。
特征工程
- 类别字段one-hot
# Categorical columns to be one-hot encoded.
cat_columns = ["fuel", "seller_type", "transmission", "owner"]
from sklearn.preprocessing import OneHotEncoder
# drop='first' leaves the first category of each column out of the encoded output.
oneHotEncoder = OneHotEncoder(drop='first')
# NOTE(review): the encoder is fitted on the whole dataset, before the train/test split.
# .toarray() converts the encoder output into a dense array.
cat_features = oneHotEncoder.fit_transform(df[cat_columns]).toarray()
cat_features
- 数值字段标准化
# Numeric columns to be standardised.
num_columns = ["mileage", "engine", "max_power", "torque", "seats"]
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
# NOTE(review): the scaler is fitted on the whole dataset, before the train/test split.
num_features = standardScaler.fit_transform(df[num_columns])
num_features
- 构建x和y
# Feature matrix: encoded categorical columns first, then the scaled numeric
# columns. The serving code (get_predict_result) stacks them in the same order.
X = np.hstack([cat_features, num_features])
X
# Regression target.
y = df["selling_price"].to_numpy()
y
模型训练
- 数据集划分
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The seed is fixed (same value as the
# random forest's random_state) so the split and the reported scores are
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
投票回归器(本任务是回归问题):线性回归、SVM、kNN回归、随机森林
Bagging
Adaboost回归
XGBoost
stacking
- 随机森林
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
# Random forest with 300 trees and a fixed seed; n_jobs=-1 asks for all CPU cores.
random_model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
random_model.fit(X_train, y_train)
# Predictions for the held-out rows.
y_pred = random_model.predict(X_test)
y_pred
# Score on the training split, then on the held-out split, to compare the two.
random_model.score(X_train, y_train)
random_model.score(X_test, y_test)
交叉验证
网格搜索
模型评估
将之前训练的随机森林模型与线性回归(LR)模型进行对比
- 训练线性回归模型
from sklearn.linear_model import LinearRegression
# Baseline: ordinary linear regression fitted on the same training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
# Score on the training split, then on the held-out split.
lr_model.score(X_train, y_train)
lr_model.score(X_test, y_test)
模型存储
- joblib.dump,存储random_model、oneHotEncoder、standardScaler
# Directory inside the Flask project where the fitted artefacts are stored.
model_dir = "./flask-carprice/models"
import os

import joblib

# joblib.dump does not create missing directories; create the target first so
# a fresh checkout does not fail with FileNotFoundError.
os.makedirs(model_dir, exist_ok=True)
# Persist the predictor together with the fitted encoder and scaler, since the
# serving code needs all three to reproduce the training-time features.
joblib.dump(random_model, f"{model_dir}/random_model.joblib")
joblib.dump(oneHotEncoder, f"{model_dir}/oneHotEncoder.joblib")
joblib.dump(standardScaler, f"{model_dir}/standardScaler.joblib")
在线预估
- 初始化flask
from flask import Flask, render_template, request
import pandas as pd
import numpy as np
import model
# Flask application object used by the route decorator and by app.run.
# NOTE(review): Flask's documentation recommends passing __name__ as the import
# name so templates are located relative to this file — confirm before changing.
app = Flask("carprice_predict")
def get_predict_result(df):
    """Predict selling prices for the rows of ``df`` with the loaded model.

    The categorical columns are one-hot encoded and the numeric columns are
    scaled with the transformers held by ``model.carPriceModel``. The two
    feature groups are stacked (categorical first) and passed to the predictor.

    Example row: fuel="Diesel", seller_type="Individual",
    transmission="Automatic", owner="Second Owner", mileage=17.9,
    engine=2143.0, max_power=136.0, torque=3000.0, seats=5.0.

    Args:
        df: DataFrame containing at least the columns ``fuel``,
            ``seller_type``, ``transmission``, ``owner``, ``mileage``,
            ``engine``, ``max_power``, ``torque`` and ``seats``. Numeric
            columns may hold numbers or numeric strings (e.g. HTML form values).

    Returns:
        The value returned by ``predictor.predict`` for the assembled features.

    Raises:
        KeyError: if a required column is missing from ``df``.
        ValueError: if a numeric column cannot be converted to float.
    """
    cat_columns = ["fuel", "seller_type", "transmission", "owner"]
    num_columns = ["mileage", "engine", "max_power", "torque", "seats"]

    cat_features = model.carPriceModel.onehot.transform(df[cat_columns]).toarray()
    # Form values arrive as strings; convert explicitly instead of relying on
    # implicit coercion inside the scaler.
    numeric_values = df[num_columns].astype(float)
    num_features = model.carPriceModel.scaler.transform(numeric_values)

    features = np.hstack([cat_features, num_features])
    result = model.carPriceModel.predictor.predict(features)
    print(result)
    return result
@app.route("/predict", methods=["get", "post"])
def predict():
    """Render the prediction form and, on POST, the predicted price.

    On GET the template is rendered with ``result=None``. On POST the submitted
    form fields become a one-row DataFrame that is passed to
    ``get_predict_result``; the prediction is shown as a string.
    """
    result = None
    if request.method == "POST":
        # to_dict() returns one plain string per field on every Werkzeug
        # version, whereas dict(request.form) yields lists on older releases.
        data = request.form.to_dict()
        df = pd.DataFrame([data])
        result = str(get_predict_result(df))
    return render_template("predict.html", result=result)
if __name__ == "__main__":
    # Start the development server only when this file is run as a script
    # (e.g. `python3 app.py`), not when the module is imported.
    app.run(host="0.0.0.0", port=5010)
- 加载模型
import joblib
class CarPriceModel:
    """Holder for the fitted encoder, scaler and predictor used at serving time.

    All three attributes are ``None`` until ``load_models`` is called.
    """

    def __init__(self):
        self.onehot = None  # fitted one-hot encoder for the categorical columns
        self.scaler = None  # fitted scaler for the numeric columns
        self.predictor = None  # fitted regression model

    def load_models(self, model_dir="./models"):
        """Load the three joblib artefacts from ``model_dir``.

        Args:
            model_dir: Directory containing ``oneHotEncoder.joblib``,
                ``standardScaler.joblib`` and ``random_model.joblib``. The
                default is relative to the current working directory.
        """
        print("load_models")
        # NOTE: joblib.load unpickles the files, so only load artefacts that
        # come from a trusted source.
        self.onehot = joblib.load(f"{model_dir}/oneHotEncoder.joblib")
        self.scaler = joblib.load(f"{model_dir}/standardScaler.joblib")
        self.predictor = joblib.load(f"{model_dir}/random_model.joblib")
# Module-level instance: the artefacts are loaded once, when this module is
# first imported, and are then reused for every prediction.
carPriceModel = CarPriceModel()
carPriceModel.load_models()
- html页面
<!DOCTYPE html>
{# Prediction form: posts the nine model inputs to /predict and shows the result when present. #}
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>二手车价格预估</title>
</head>
<body>
{% if result %}
<h1>预估结果:{{ result }}</h1>
{% endif %}
<form action="/predict" method="post">
    <table>
        {# Categorical fields: free text, must be filled in. #}
        <tr>
            <th>请输入fuel:</th>
            <td><input type="text" name="fuel" required></td>
        </tr>
        <tr>
            <th>请输入seller_type:</th>
            <td><input type="text" name="seller_type" required></td>
        </tr>
        <tr>
            <th>请输入transmission:</th>
            <td><input type="text" name="transmission" required></td>
        </tr>
        <tr>
            <th>请输入owner:</th>
            <td><input type="text" name="owner" required></td>
        </tr>
        {# Numeric fields: the browser rejects non-numeric or empty values before submitting. #}
        <tr>
            <th>请输入mileage:</th>
            <td><input type="number" step="any" name="mileage" required></td>
        </tr>
        <tr>
            <th>请输入engine:</th>
            <td><input type="number" step="any" name="engine" required></td>
        </tr>
        <tr>
            <th>请输入max_power:</th>
            <td><input type="number" step="any" name="max_power" required></td>
        </tr>
        <tr>
            <th>请输入torque:</th>
            <td><input type="number" step="any" name="torque" required></td>
        </tr>
        <tr>
            <th>请输入seats:</th>
            <td><input type="number" step="any" name="seats" required></td>
        </tr>
        <tr>
            <th>提交:</th>
            <td><input type="submit" name="submit" value="提交"></td>
        </tr>
    </table>
</form>
</body>
</html>
服务部署
- 将项目打包上传到linux服务器
# Create a gzip-compressed archive of the flask-carprice directory.
tar zcvf flask-carprice.tar.gz flask-carprice
- 上传后解压
# Extract the archive on the server.
tar zxvf flask-carprice.tar.gz
- 安装依赖
# Install the packages listed in requirements.txt, using the Tsinghua PyPI mirror as the index.
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
- 启动命令
# Run the app in the background, immune to hangups; stdout and stderr both go to log.log.
nohup python3 app.py 1>log.log 2>&1 &
- 配置安全组(放行 5010 端口)
- 访问页面
http://公网ip:5010/predict