15、Poisson回归的梯度提升回归树
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_openml
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
df = fetch_openml(data_id=41214, as_frame=True).frame
#泊松分布模拟频率
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
print("Average Frequency = {}"
.format(np.average(df["Frequency"], weights=df["Exposure"])))
print("Fraction of exposure with zero claims = {0:.1%}"
.format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() /
df["Exposure"].sum()))
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
log_scale_transformer = make_pipeline(
FunctionTransformer(np.log, validate=False),
StandardScaler()
)
linear_model_preprocessor = ColumnTransformer(
[
("passthrough_numeric", "passthrough",
["BonusMalus"]),
("binned_numeric", KBinsDiscretizer(n_bins=10),
["VehAge", "DrivAge"]),
("log_scaled_numeric", log_scale_transformer,
["Density"]),
("onehot_categorical", OneHotEncoder(),
["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
],
remainder="drop",
)
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.33, random_state=0)
dummy = Pipeline([
("preprocessor", linear_model_preprocessor),
("regressor", DummyRegressor(strategy='mean')),
]).fit(df_train, df_train["Frequency"],
regressor__sample_weight=df_train["Exposure"])
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_poisson_deviance
def score_estimator(estimator, df_test):
#在测试集上给一个估计量打分
y_pred = estimator.predict(df_test)
print("MSE: %.3f" %
mean_squared_error(df_test["Frequency"], y_pred,
sample_weight=df_test["Exposure"]))
print("MAE: %.3f" %
mean_absolute_error(df_test["Frequency"], y_pred,
sample_weight=df_test["Exposure"]))
# 忽略非积极预测,因为它们对泊松越轨。
mask = y_pred > 0
if (~mask).any():
n_masked, n_samples = (~mask).sum(), mask.shape[0]
print(f"WARNING: Estimator yields invalid, non-positive predictions "
f" for {n_masked} samples out of {n_samples}. These predictions "
f"are ignored when computing the Poisson deviance.")
print("mean Poisson deviance: %.3f" %
mean_poisson_deviance(df_test["Frequency"][mask],
y_pred[mask],
sample_weight=df_test["Exposure"][mask]))
print("Constant mean frequency evaluation:")
score_estimator(dummy, df_test)
from sklearn.linear_model import Ridge
ridge_glm = Pipeline([
("preprocessor", linear_model_preprocessor),
("regressor", Ridge(alpha=1e-6)),
]).fit(df_train, df_train["Frequency"],
regressor__sample_weight=df_train["Exposure"])
from sklearn.linear_model import PoissonRegressor
n_samples = df_train.shape[0]
poisson_glm = Pipeline([
("preprocessor", linear_model_preprocessor),
("regressor", PoissonRegressor(alpha=1e-12, max_iter=300))
])
poisson_glm.fit(df_train, df_train["Frequency"],
regressor__sample_weight=df_train["Exposure"])
print("PoissonRegressor evaluation:")
score_estimator(poisson_glm, df_test)
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder
tree_preprocessor = ColumnTransformer(
[
("categorical", OrdinalEncoder(),
["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
("numeric", "passthrough",
["VehAge", "DrivAge", "BonusMalus", "Density"]),
],
remainder="drop",
)
poisson_gbrt = Pipeline([
("preprocessor", tree_preprocessor),
("regressor", HistGradientBoostingRegressor(loss="poisson",
max_leaf_nodes=128)),
])
poisson_gbrt.fit(df_train, df_train["Frequency"],
regressor__sample_weight=df_train["Exposure"])
print("Poisson Gradient Boosted Trees evaluation:")
score_estimator(poisson_gbrt, df_test)
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True)
fig.subplots_adjust(bottom=0.2)
n_bins = 20
for row_idx, label, df in zip(range(2),
["train", "test"],
[df_train, df_test]):
df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins),
ax=axes[row_idx, 0])
axes[row_idx, 0].set_title("Data")
axes[row_idx, 0].set_yscale('log')
axes[row_idx, 0].set_xlabel("y (observed Frequency)")
axes[row_idx, 0].set_ylim([1e1, 5e5])
axes[row_idx, 0].set_ylabel(label + " samples")
for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]):
y_pred = model.predict(df)
pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins),
ax=axes[row_idx, idx+1])
axes[row_idx, idx + 1].set(
title=model[-1].__class__.__name__,
yscale='log',
xlabel="y_pred (predicted expected Frequency)"
)
plt.tight_layout()