18. Severity Model - Gamma Distribution
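
The severity model below fits a GammaRegressor, i.e. a generalized linear model with a Gamma distribution and log link, to the average claim amount per claim (AvgClaimAmount) on the French Motor Third-Party Liability data. Only policies with at least one claim are used, and each policy is weighted by its number of claims, so the target stays strictly positive as the Gamma deviance requires.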

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

from sklearn.datasets import fetch_openml

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import GammaRegressor

from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

from sklearn.preprocessing import StandardScaler, KBinsDiscretizer

def load_mtpl2(n_samples=100000):

    """Fetch the French Motor Third-Party Liability Claims dataset.

    Parameters

    ----------

    n_samples: int, default=100000

      number of samples to select (for faster run time). Full dataset has

      678013 samples.

    """

    # freMTPL2freq dataset from https://www.openml.org/d/41214

    df_freq = fetch_openml(data_id=41214, as_frame=True)['data']

    df_freq['IDpol'] = df_freq['IDpol'].astype(np.int64)

    df_freq.set_index('IDpol', inplace=True)

    # freMTPL2sev dataset from https://www.openml.org/d/41215

    df_sev = fetch_openml(data_id=41215, as_frame=True)['data']

    # sum ClaimAmount over identical IDs

    df_sev = df_sev.groupby('IDpol').sum()

    df = df_freq.join(df_sev, how="left")

    df["ClaimAmount"].fillna(0, inplace=True)

    # unquote string fields

    for column_name in df.columns[df.dtypes.values == object]:

        df[column_name] = df[column_name].str.strip("'")

    return df.iloc[:n_samples]
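
# Optional usage sketch (not in the original code): peek at a few rows of the
# joined result -- the frequency columns plus the summed ClaimAmount, indexed
# by IDpol. fetch_openml caches the downloads, so this does not fetch twice.
print(load_mtpl2(n_samples=5)[["Exposure", "ClaimNb", "ClaimAmount"]])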

def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None,

                  title=None, ax=None, fill_legend=False):

    """Plot observed and predicted - aggregated per feature level.

    Parameters

    ----------

    df : DataFrame

        input data

    feature: str

        a column name of df for the feature to be plotted

    weight : str

        column name of df with the values of weights or exposure

    observed : str

        a column name of df with the observed target

    predicted : DataFrame

        a dataframe, with the same index as df, with the predicted target

    fill_legend : bool, default=False

        whether to show fill_between legend

    """

    # aggregate observed and predicted variables by feature level

    df_ = df.loc[:, [feature, weight]].copy()

    df_["observed"] = df[observed] * df[weight]

    df_["predicted"] = predicted * df[weight]

    df_ = (

        df_.groupby([feature])[[weight, "observed", "predicted"]]

        .sum()

        .assign(observed=lambda x: x["observed"] / x[weight])

        .assign(predicted=lambda x: x["predicted"] / x[weight])

    )

    ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax)

    y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8

    p2 = ax.fill_between(

        df_.index,

        0,

        y_max * df_[weight] / df_[weight].values.max(),

        color="g",

        alpha=0.1,

    )

    if fill_legend:

        ax.legend([p2], ["{} distribution".format(feature)])

    ax.set(

        ylabel=y_label if y_label is not None else None,

        title=title if title is not None else "Train: Observed vs Predicted",

    )

def score_estimator(
    estimator, X_train, X_test, df_train, df_test, target, weights,
    tweedie_powers=None,
):
    """Evaluate an estimator on train and test sets with different metrics"""
    # Minimal working body: report the weighted D² score (fraction of deviance
    # explained) on both subsets. The tweedie_powers argument is accepted but
    # unused in this simplified version.
    res = {}
    for label, X, df in [("train", X_train, df_train), ("test", X_test, df_test)]:
        res[label] = estimator.score(X, df[target], sample_weight=df[weights])
    return pd.Series(res, name="D² explained").round(4)

df = load_mtpl2(n_samples=60000)

# Note: filter out claims with zero amount, as the severity model

# requires strictly positive target values.

df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0

# Correct for unreasonable observations (that might be data error)

# and a few exceptionally large claim amounts

df["ClaimNb"] = df["ClaimNb"].clip(upper=4)

df["Exposure"] = df["Exposure"].clip(upper=1)

df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)

log_scale_transformer = make_pipeline(

    FunctionTransformer(func=np.log),

    StandardScaler()

)

column_trans = ColumnTransformer(

    [

        ("binned_numeric", KBinsDiscretizer(n_bins=10),

            ["VehAge", "DrivAge"]),

        ("onehot_categorical", OneHotEncoder(),

            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),

        ("passthrough_numeric", "passthrough",

            ["BonusMalus"]),

        ("log_scaled_numeric", log_scale_transformer,

            ["Density"]),

    ],

    remainder="drop",

)

X = column_trans.fit_transform(df)
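
# The binning and one-hot encoding expand the feature space considerably; a
# quick look at the size of the encoded design matrix (a small sketch, not in
# the original code):
print("raw columns: {} -> encoded features: {}".format(df.shape[1], X.shape[1]))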

# Insurances companies are interested in modeling the Pure Premium, that is

# the expected total claim amount per unit of exposure for each policyholder

# in their portfolio:

df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]

# This can be indirectly approximated by a 2-step modeling: the product of the

# Frequency times the average claim amount per claim:

df["Frequency"] = df["ClaimNb"] / df["Exposure"]

df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)

with pd.option_context("display.max_columns", 15):

    print(df[df.ClaimAmount > 0].head())


df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)

mask_train = df_train["ClaimAmount"] > 0

mask_test = df_test["ClaimAmount"] > 0

glm_sev = GammaRegressor(alpha=10., max_iter=10000)

glm_sev.fit(

    X_train[mask_train.values],

    df_train.loc[mask_train, "AvgClaimAmount"],

    sample_weight=df_train.loc[mask_train, "ClaimNb"],

)
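
# Rough sanity check (a small sketch, not in the original code): with an
# intercept and log link, the claim-number-weighted mean of the predictions
# should land close to the weighted mean of the observed average claim amounts
# on the training data.
print("weighted mean observed AvgClaimAmount: %.2f" % np.average(
    df_train.loc[mask_train, "AvgClaimAmount"],
    weights=df_train.loc[mask_train, "ClaimNb"]))
print("weighted mean predicted AvgClaimAmount: %.2f" % np.average(
    glm_sev.predict(X_train[mask_train.values]),
    weights=df_train.loc[mask_train, "ClaimNb"]))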

scores = score_estimator(

    glm_sev,

    X_train[mask_train.values],

    X_test[mask_test.values],

    df_train[mask_train],

    df_test[mask_test],

    target="AvgClaimAmount",

    weights="ClaimNb",

)

print("Evaluation of GammaRegressor on target AvgClaimAmount")

print(scores)

fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6))

plot_obs_pred(

    df=df_train.loc[mask_train],

    feature="DrivAge",

    weight="Exposure",

    observed="AvgClaimAmount",

    predicted=glm_sev.predict(X_train[mask_train.values]),

    y_label="Average Claim Severity",

    title="train data",

    ax=ax[0],

)

plot_obs_pred(

    df=df_test.loc[mask_test],

    feature="DrivAge",

    weight="Exposure",

    observed="AvgClaimAmount",

    predicted=glm_sev.predict(X_test[mask_test.values]),

    y_label="Average Claim Severity",

    title="test data",

    ax=ax[1],

    fill_legend=True

)

plt.tight_layout()

plt.show()


