Tweedie regression on insurance claims#

This example illustrates the use of Poisson, Gamma and Tweedie regression on the French Motor Third-Party Liability Claims dataset, and is inspired by an R tutorial [1].

In this dataset, each sample corresponds to an insurance policy, i.e. a contract between an insurance company and an individual (the policyholder). Available features include driver age, vehicle age, vehicle power, etc.

A few definitions: a claim is the request made by a policyholder to the insurer to compensate for a loss covered by the insurance. The claim amount is the amount of money that the insurer must pay. The exposure is the duration of the insurance coverage of a given policy, in years.

Here our goal is to predict the expected value, i.e. the mean, of the total claim amount per exposure unit, also referred to as the pure premium.

There are several possibilities to do that, two of which are:

  1. Model the number of claims with a Poisson distribution, and the average claim amount per claim, also known as severity, with a Gamma distribution, and multiply the predictions of both in order to get the total claim amount.

  2. Model the total claim amount per exposure directly, typically with a Tweedie distribution of Tweedie power \(p \in (1, 2)\).
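
For reference, both approaches target the same quantity through the following decomposition (a sketch, under the usual assumption, not stated explicitly above, that the claim frequency and the claim severity are modelled as independent given the features):

\[
\underbrace{\mathrm{E}\left[\tfrac{\mathrm{ClaimAmount}}{\mathrm{Exposure}}\right]}_{\text{pure premium}}
= \underbrace{\mathrm{E}\left[\tfrac{\mathrm{ClaimNb}}{\mathrm{Exposure}}\right]}_{\text{frequency (Poisson)}}
\times \underbrace{\mathrm{E}\left[\tfrac{\mathrm{ClaimAmount}}{\mathrm{ClaimNb}}\right]}_{\text{severity (Gamma)}}
\]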

In this example we will illustrate both approaches. We start by defining a few helper functions for loading the data and visualizing results.

from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_tweedie_deviance,
)


def load_mtpl2(n_samples=None):
    """Fetch the French Motor Third-Party Liability Claims dataset.

    Parameters
    ----------
    n_samples: int, default=None
      number of samples to select (for faster run time). Full dataset has
      678013 samples.
    """
    # freMTPL2freq dataset from https://www.openml.org/d/41214
    df_freq = fetch_openml(data_id=41214, as_frame=True).data
    df_freq["IDpol"] = df_freq["IDpol"].astype(int)
    df_freq.set_index("IDpol", inplace=True)

    # freMTPL2sev dataset from https://www.openml.org/d/41215
    df_sev = fetch_openml(data_id=41215, as_frame=True).data

    # sum ClaimAmount over identical IDs
    df_sev = df_sev.groupby("IDpol").sum()

    df = df_freq.join(df_sev, how="left")
    df["ClaimAmount"] = df["ClaimAmount"].fillna(0)

    # unquote string fields
    for column_name in df.columns[[t is object for t in df.dtypes.values]]:
        df[column_name] = df[column_name].str.strip("'")
    return df.iloc[:n_samples]


def plot_obs_pred(
    df,
    feature,
    weight,
    observed,
    predicted,
    y_label=None,
    title=None,
    ax=None,
    fill_legend=False,
):
    """Plot observed and predicted - aggregated per feature level.

    Parameters
    ----------
    df : DataFrame
        input data
    feature: str
        a column name of df for the feature to be plotted
    weight : str
        column name of df with the values of weights or exposure
    observed : str
        a column name of df with the observed target
    predicted : DataFrame
        a dataframe, with the same index as df, with the predicted target
    fill_legend : bool, default=False
        whether to show fill_between legend
    """
    # aggregate observed and predicted variables by feature level
    df_ = df.loc[:, [feature, weight]].copy()
    df_["observed"] = df[observed] * df[weight]
    df_["predicted"] = predicted * df[weight]
    df_ = (
        df_.groupby([feature])[[weight, "observed", "predicted"]]
        .sum()
        .assign(observed=lambda x: x["observed"] / x[weight])
        .assign(predicted=lambda x: x["predicted"] / x[weight])
    )

    ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax)
    y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8
    p2 = ax.fill_between(
        df_.index,
        0,
        y_max * df_[weight] / df_[weight].values.max(),
        color="g",
        alpha=0.1,
    )
    if fill_legend:
        ax.legend([p2], ["{} distribution".format(feature)])
    ax.set(
        ylabel=y_label if y_label is not None else None,
        title=title if title is not None else "Train: Observed vs Predicted",
    )


def score_estimator(
    estimator,
    X_train,
    X_test,
    df_train,
    df_test,
    target,
    weights,
    tweedie_powers=None,
):
    """Evaluate an estimator on train and test sets with different metrics"""

    metrics = [
        ("D² explained", None),  # Use default scorer if it exists
        ("mean abs. error", mean_absolute_error),
        ("mean squared error", mean_squared_error),
    ]
    if tweedie_powers:
        metrics += [
            (
                "mean Tweedie dev p={:.4f}".format(power),
                partial(mean_tweedie_deviance, power=power),
            )
            for power in tweedie_powers
        ]

    res = []
    for subset_label, X, df in [
        ("train", X_train, df_train),
        ("test", X_test, df_test),
    ]:
        y, _weights = df[target], df[weights]
        for score_label, metric in metrics:
            if isinstance(estimator, tuple) and len(estimator) == 2:
                # Score the model consisting of the product of frequency and
                # severity models.
                est_freq, est_sev = estimator
                y_pred = est_freq.predict(X) * est_sev.predict(X)
            else:
                y_pred = estimator.predict(X)

            if metric is None:
                if not hasattr(estimator, "score"):
                    continue
                score = estimator.score(X, y, sample_weight=_weights)
            else:
                score = metric(y, y_pred, sample_weight=_weights)

            res.append({"subset": subset_label, "metric": score_label, "score": score})

    res = (
        pd.DataFrame(res)
        .set_index(["metric", "subset"])
        .score.unstack(-1)
        .round(4)
        .loc[:, ["train", "test"]]
    )
    return res

Loading datasets, basic feature extraction and target definitions#

We construct the freMTPL2 dataset by joining the freMTPL2freq table, containing the number of claims (ClaimNb), with the freMTPL2sev table, containing the claim amount (ClaimAmount) for the same policy ids (IDpol).

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    KBinsDiscretizer,
    OneHotEncoder,
    StandardScaler,
)

df = load_mtpl2()


# Correct for unreasonable observations (that might be data error)
# and a few exceptionally large claim amounts
df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
df["Exposure"] = df["Exposure"].clip(upper=1)
df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)
# If the claim amount is 0, then we do not count it as a claim. The loss function
# used by the severity model needs strictly positive claim amounts. This way
# frequency and severity are more consistent with each other.
df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0

log_scale_transformer = make_pipeline(
    FunctionTransformer(func=np.log), StandardScaler()
)

column_trans = ColumnTransformer(
    [
        (
            "binned_numeric",
            KBinsDiscretizer(n_bins=10, random_state=0),
            ["VehAge", "DrivAge"],
        ),
        (
            "onehot_categorical",
            OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
        ),
        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
    ],
    remainder="drop",
)
X = column_trans.fit_transform(df)

# Insurances companies are interested in modeling the Pure Premium, that is
# the expected total claim amount per unit of exposure for each policyholder
# in their portfolio:
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]

# This can be indirectly approximated by a 2-step modeling: the product of the
# Frequency times the average claim amount per claim:
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)

with pd.option_context("display.max_columns", 15):
    print(df[df.ClaimAmount > 0].head())
       ClaimNb  Exposure Area  VehPower  VehAge  DrivAge  BonusMalus VehBrand  \
IDpol
139          1      0.75    F         7       1       61          50      B12
190          1      0.14    B        12       5       50          60      B12
414          1      0.14    E         4       0       36          85      B12
424          2      0.62    F        10       0       51         100      B12
463          1      0.31    A         5       0       45          50      B12

          VehGas  Density Region  ClaimAmount   PurePremium  Frequency  \
IDpol
139    'Regular'    27000    R11       303.00    404.000000   1.333333
190     'Diesel'       56    R25      1981.84  14156.000000   7.142857
414    'Regular'     4792    R11      1456.55  10403.928571   7.142857
424    'Regular'    27000    R11     10834.00  17474.193548   3.225806
463    'Regular'       12    R73      3986.67  12860.225806   3.225806

       AvgClaimAmount
IDpol
139            303.00
190           1981.84
414           1456.55
424           5417.00
463           3986.67

Frequency model – Poisson distribution#

The number of claims (ClaimNb) is a positive integer (0 included). Thus, this target can be modelled by a Poisson distribution. It is then assumed to be the number of discrete events occurring with a constant rate in a given time interval (Exposure, in units of years). Here we model the frequency y = ClaimNb / Exposure, which is still a (scaled) Poisson distribution, and use Exposure as sample_weight.

from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import train_test_split

df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)

Let us keep in mind that, despite the seemingly large number of data points in this dataset, the number of evaluation points where the claim amount is non-zero is quite small:

len(df_test)
169504
len(df_test[df_test["ClaimAmount"] > 0])
6237

As a consequence, we expect a significant variability in our evaluation upon random resampling of the train-test split.

The parameters of the model are estimated by minimizing the Poisson deviance on the training set via a Newton solver. Some of the features are collinear (e.g. because we did not drop any categorical level in the OneHotEncoder); we use a weak L2 penalization to avoid numerical issues.

glm_freq = PoissonRegressor(alpha=1e-4, solver="newton-cholesky")
glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"])

scores = score_estimator(
    glm_freq,
    X_train,
    X_test,
    df_train,
    df_test,
    target="Frequency",
    weights="Exposure",
)
print("Evaluation of PoissonRegressor on target Frequency")
print(scores)
Evaluation of PoissonRegressor on target Frequency
subset               train    test
metric
D² explained        0.0448  0.0427
mean abs. error     0.1379  0.1378
mean squared error  0.2441  0.2246

Note that the score measured on the test set is surprisingly better than on the training set. This might be specific to this random train-test split. Proper cross-validation could help us assess the sampling variability of these results.
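
As a rough illustration of this point, here is a minimal sketch (not part of the original example) of such a cross-validated estimate of the variability. It reuses the X and df objects defined above (the preprocessing is therefore fit once on the full data, for simplicity) and refits a fresh PoissonRegressor on each fold:

from sklearn.model_selection import KFold

cv_scores = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    # refit the frequency model on each fold and store the exposure-weighted
    # D² explained (PoissonRegressor.score) on the held-out fold
    est = PoissonRegressor(alpha=1e-4, solver="newton-cholesky")
    est.fit(
        X[train_idx],
        df["Frequency"].iloc[train_idx],
        sample_weight=df["Exposure"].iloc[train_idx],
    )
    cv_scores.append(
        est.score(
            X[test_idx],
            df["Frequency"].iloc[test_idx],
            sample_weight=df["Exposure"].iloc[test_idx],
        )
    )
print("D² explained: %.4f ± %.4f" % (np.mean(cv_scores), np.std(cv_scores)))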

We can visually compare observed and predicted values, aggregated by the drivers age (DrivAge), vehicle age (VehAge) and the insurance bonus/malus (BonusMalus).

fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8))
fig.subplots_adjust(hspace=0.3, wspace=0.2)

plot_obs_pred(
    df=df_train,
    feature="DrivAge",
    weight="Exposure",
    observed="Frequency",
    predicted=glm_freq.predict(X_train),
    y_label="Claim Frequency",
    title="train data",
    ax=ax[0, 0],
)

plot_obs_pred(
    df=df_test,
    feature="DrivAge",
    weight="Exposure",
    observed="Frequency",
    predicted=glm_freq.predict(X_test),
    y_label="Claim Frequency",
    title="test data",
    ax=ax[0, 1],
    fill_legend=True,
)

plot_obs_pred(
    df=df_test,
    feature="VehAge",
    weight="Exposure",
    observed="Frequency",
    predicted=glm_freq.predict(X_test),
    y_label="Claim Frequency",
    title="test data",
    ax=ax[1, 0],
    fill_legend=True,
)

plot_obs_pred(
    df=df_test,
    feature="BonusMalus",
    weight="Exposure",
    observed="Frequency",
    predicted=glm_freq.predict(X_test),
    y_label="Claim Frequency",
    title="test data",
    ax=ax[1, 1],
    fill_legend=True,
)
[Figure: observed vs. predicted claim frequency by DrivAge (train and test data), VehAge and BonusMalus (test data)]

According to the observed data, the frequency of accidents is higher for drivers younger than 30 years old, and is positively correlated with the BonusMalus variable. Our model is able to mostly correctly model this behaviour.

Severity model – Gamma distribution#

The mean claim amount or severity (AvgClaimAmount) can be empirically shown to follow approximately a Gamma distribution. We fit a GLM model for the severity with the same features as the frequency model.

Note

  • We filter out ClaimAmount == 0 as the Gamma distribution has support on \((0, \infty)\), not \([0, \infty)\).

  • We use ClaimNb as sample_weight to account for policies that contain more than one claim.

from sklearn.linear_model import GammaRegressor

mask_train = df_train["ClaimAmount"] > 0
mask_test = df_test["ClaimAmount"] > 0

glm_sev = GammaRegressor(alpha=10.0, solver="newton-cholesky")

glm_sev.fit(
    X_train[mask_train.values],
    df_train.loc[mask_train, "AvgClaimAmount"],
    sample_weight=df_train.loc[mask_train, "ClaimNb"],
)

scores = score_estimator(
    glm_sev,
    X_train[mask_train.values],
    X_test[mask_test.values],
    df_train[mask_train],
    df_test[mask_test],
    target="AvgClaimAmount",
    weights="ClaimNb",
)
print("Evaluation of GammaRegressor on target AvgClaimAmount")
print(scores)
Evaluation of GammaRegressor on target AvgClaimAmount
subset                     train          test
metric
D² explained        3.900000e-03  4.400000e-03
mean abs. error     1.756746e+03  1.744042e+03
mean squared error  5.801770e+07  5.030677e+07

Those values of the metrics are not necessarily easy to interpret. It can be insightful to compare them with a model that does not use any input features and always predicts a constant value, i.e. the average claim amount, in the same setting:

from sklearn.dummy import DummyRegressor

dummy_sev = DummyRegressor(strategy="mean")
dummy_sev.fit(
    X_train[mask_train.values],
    df_train.loc[mask_train, "AvgClaimAmount"],
    sample_weight=df_train.loc[mask_train, "ClaimNb"],
)

scores = score_estimator(
    dummy_sev,
    X_train[mask_train.values],
    X_test[mask_test.values],
    df_train[mask_train],
    df_test[mask_test],
    target="AvgClaimAmount",
    weights="ClaimNb",
)
print("Evaluation of a mean predictor on target AvgClaimAmount")
print(scores)
Evaluation of a mean predictor on target AvgClaimAmount
subset                     train          test
metric
D² explained        0.000000e+00 -0.000000e+00
mean abs. error     1.756687e+03  1.744497e+03
mean squared error  5.803882e+07  5.033764e+07

We conclude that the claim amount is very challenging to predict. Still, the GammaRegressor is able to leverage some information from the input features to slightly improve upon the mean baseline in terms of D².

Note that the resulting model is the average claim amount per claim. As such, it is conditional on having at least one claim, and cannot be used to predict the average claim amount per policy. For this, it needs to be combined with a claims frequency model.

print(
    "Mean AvgClaim Amount per policy:              %.2f "
    % df_train["AvgClaimAmount"].mean()
)
print(
    "Mean AvgClaim Amount | NbClaim > 0:           %.2f"
    % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean()
)
print(
    "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f"
    % glm_sev.predict(X_train).mean()
)
print(
    "Predicted Mean AvgClaim Amount (dummy) | NbClaim > 0: %.2f"
    % dummy_sev.predict(X_train).mean()
)
Mean AvgClaim Amount per policy:              71.78
Mean AvgClaim Amount | NbClaim > 0:           1951.21
Predicted Mean AvgClaim Amount | NbClaim > 0: 1940.95
Predicted Mean AvgClaim Amount (dummy) | NbClaim > 0: 1978.59

We can visually compare observed and predicted values, aggregated by the drivers age (DrivAge).

fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6))

plot_obs_pred(
    df=df_train.loc[mask_train],
    feature="DrivAge",
    weight="Exposure",
    observed="AvgClaimAmount",
    predicted=glm_sev.predict(X_train[mask_train.values]),
    y_label="Average Claim Severity",
    title="train data",
    ax=ax[0],
)

plot_obs_pred(
    df=df_test.loc[mask_test],
    feature="DrivAge",
    weight="Exposure",
    observed="AvgClaimAmount",
    predicted=glm_sev.predict(X_test[mask_test.values]),
    y_label="Average Claim Severity",
    title="test data",
    ax=ax[1],
    fill_legend=True,
)
plt.tight_layout()
[Figure: observed vs. predicted average claim severity by DrivAge on train and test data]

Overall, the drivers age (DrivAge) has a weak impact on the claim severity, both in observed and predicted data.

Pure Premium Modeling via a Product Model vs. a single TweedieRegressor#

As mentioned in the introduction, the total claim amount per unit of exposure can be modelled as the product of the prediction of the frequency model by the prediction of the severity model.

Alternatively, one can directly model the total loss with a unique Compound Poisson Gamma generalized linear model (with a log link function). This model is a special case of the Tweedie GLM with a "power" parameter \(p \in (1, 2)\). Here, we fix apriori the power parameter of the Tweedie model to some arbitrary value (1.9) in the valid range. Ideally one would select this value via grid-search by minimizing the negative log-likelihood of the Tweedie model, but unfortunately the current implementation does not allow for this (yet).

We will compare the performance of both approaches. To quantify the performance of both models, one can compute the mean deviance of the train and test data assuming a Compound Poisson-Gamma distribution of the total claim amount. This is equivalent to a Tweedie distribution with a power parameter between 1 and 2.

sklearn.metrics.mean_tweedie_deviance depends on a power parameter. As we do not know the true value of the power parameter, we here compute the mean deviances for a grid of possible values, and compare the models side by side, i.e. we compare them at identical values of power. Ideally, we hope that one model will be consistently better than the other, regardless of power.

from sklearn.linear_model import TweedieRegressor

glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, solver="newton-cholesky")
glm_pure_premium.fit(
    X_train, df_train["PurePremium"], sample_weight=df_train["Exposure"]
)

tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999]

scores_product_model = score_estimator(
    (glm_freq, glm_sev),
    X_train,
    X_test,
    df_train,
    df_test,
    target="PurePremium",
    weights="Exposure",
    tweedie_powers=tweedie_powers,
)

scores_glm_pure_premium = score_estimator(
    glm_pure_premium,
    X_train,
    X_test,
    df_train,
    df_test,
    target="PurePremium",
    weights="Exposure",
    tweedie_powers=tweedie_powers,
)

scores = pd.concat(
    [scores_product_model, scores_glm_pure_premium],
    axis=1,
    sort=True,
    keys=("Product Model", "TweedieRegressor"),
)
print("Evaluation of the Product Model and the Tweedie Regressor on target PurePremium")
with pd.option_context("display.expand_frame_repr", False):
    print(scores)
Evaluation of the Product Model and the Tweedie Regressor on target PurePremium
                          Product Model               TweedieRegressor
subset                            train          test            train          test
metric
D² explained                        NaN           NaN     1.640000e-02  1.370000e-02
mean Tweedie dev p=1.5000  7.669930e+01  7.617050e+01     7.640770e+01  7.640880e+01
mean Tweedie dev p=1.7000  3.695740e+01  3.683980e+01     3.682880e+01  3.692270e+01
mean Tweedie dev p=1.8000  3.046010e+01  3.040530e+01     3.037600e+01  3.045390e+01
mean Tweedie dev p=1.9000  3.387580e+01  3.385000e+01     3.382120e+01  3.387830e+01
mean Tweedie dev p=1.9900  2.015716e+02  2.015414e+02     2.015347e+02  2.015587e+02
mean Tweedie dev p=1.9990  1.914573e+03  1.914370e+03     1.914538e+03  1.914387e+03
mean Tweedie dev p=1.9999  1.904751e+04  1.904556e+04     1.904747e+04  1.904558e+04
mean abs. error            2.730119e+02  2.722128e+02     2.739865e+02  2.731249e+02
mean squared error         3.295040e+07  3.212197e+07     3.295505e+07  3.213056e+07

In this example, both modeling approaches yield comparable performance metrics. For implementation reasons, the percentage of explained variance \(D^2\) is not available for the product model.
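
If a single comparable \(D^2\) figure is nevertheless desired, a minimal workaround (a sketch, not part of the original example) is to compute it for both sets of predictions with sklearn.metrics.d2_tweedie_score at a fixed power, for instance the power used for fitting the TweedieRegressor:

from sklearn.metrics import d2_tweedie_score

pp_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)
pp_tweedie = glm_pure_premium.predict(X_test)
for name, pp_pred in [("Product Model", pp_product), ("TweedieRegressor", pp_tweedie)]:
    # D² of the pure premium predictions on the test set, evaluated with the
    # Tweedie deviance at power=1.9 and weighted by the exposure
    d2 = d2_tweedie_score(
        df_test["PurePremium"],
        pp_pred,
        sample_weight=df_test["Exposure"],
        power=1.9,
    )
    print("%s: D² (power=1.9) = %.4f" % (name, d2))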

We can additionally validate these models by comparing observed and predicted total claim amount on the test and train subsets. We see that, on average, both models tend to underestimate the total claim (but this behavior depends on the amount of regularization).

res = []
for subset_label, X, df in [
    ("train", X_train, df_train),
    ("test", X_test, df_test),
]:
    exposure = df["Exposure"].values
    res.append(
        {
            "subset": subset_label,
            "observed": df["ClaimAmount"].values.sum(),
            "predicted, frequency*severity model": np.sum(
                exposure * glm_freq.predict(X) * glm_sev.predict(X)
            ),
            "predicted, tweedie, power=%.2f"
            % glm_pure_premium.power: np.sum(exposure * glm_pure_premium.predict(X)),
        }
    )

print(pd.DataFrame(res).set_index("subset").T)
subset                                      train          test
observed                             3.917618e+07  1.299546e+07
predicted, frequency*severity model  3.916555e+07  1.313276e+07
predicted, tweedie, power=1.90       3.951751e+07  1.325198e+07

Finally, we can compare the two models using a plot of cumulative claims: for each model, the policyholders are ranked from safest to riskiest based on the model predictions, and the cumulative proportion of claim amounts is plotted against the cumulative proportion of exposure. This plot is often called the ordered Lorenz curve of the model.

The Gini coefficient (based on the area between the curve and the diagonal) can be used as a model selection metric to quantify the ability of the model to rank policyholders. Note that this metric does not reflect the ability of the models to make accurate predictions in terms of absolute value of total claim amounts, but only in terms of relative amounts as a ranking metric. The Gini coefficient is upper bounded by 1.0, but even an oracle model that ranks the policyholders by the observed claim amounts cannot reach a score of 1.0.

We observe that both models are able to rank policyholders by riskiness significantly better than chance, although they are also both far from the oracle model, due to the natural difficulty of the prediction problem from a few features: most accidents are not predictable and can be caused by environmental circumstances that are not described at all by the input features of the models.

Note that the Gini index only characterizes the ranking performance of the model, but not its calibration: any monotonic transformation of the predictions leaves the Gini index of the model unchanged.

Finally, one should highlight that the Compound Poisson Gamma model that is directly fit on the pure premium is operationally simpler to develop and maintain, as it consists of a single scikit-learn estimator instead of a pair of models, each with its own set of hyperparameters.

from sklearn.metrics import auc


def lorenz_curve(y_true, y_pred, exposure):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    exposure = np.asarray(exposure)

    # order samples by increasing predicted risk:
    ranking = np.argsort(y_pred)
    ranked_exposure = exposure[ranking]
    ranked_pure_premium = y_true[ranking]
    cumulative_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
    cumulative_claim_amount /= cumulative_claim_amount[-1]
    cumulative_exposure = np.cumsum(ranked_exposure)
    cumulative_exposure /= cumulative_exposure[-1]
    return cumulative_exposure, cumulative_claim_amount


fig, ax = plt.subplots(figsize=(8, 8))

y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)
y_pred_total = glm_pure_premium.predict(X_test)

for label, y_pred in [
    ("Frequency * Severity model", y_pred_product),
    ("Compound Poisson Gamma", y_pred_total),
]:
    cum_exposure, cum_claims = lorenz_curve(
        df_test["PurePremium"], y_pred, df_test["Exposure"]
    )
    gini = 1 - 2 * auc(cum_exposure, cum_claims)
    label += " (Gini index: {:.3f})".format(gini)
    ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)

# Oracle model: y_pred == y_test
cum_exposure, cum_claims = lorenz_curve(
    df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]
)
gini = 1 - 2 * auc(cum_exposure, cum_claims)
label = "Oracle (Gini index: {:.3f})".format(gini)
ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)

# Random baseline
ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
ax.set(
    title="Lorenz Curves",
    xlabel=(
        "Cumulative proportion of exposure\n"
        "(ordered by model from safest to riskiest)"
    ),
    ylabel="Cumulative proportion of claim amounts",
)
ax.legend(loc="upper left")
plt.plot()
[Figure: Lorenz Curves of the two models, the oracle model and the random baseline]
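
As a quick check of the note above about calibration (a sketch that is not part of the original example), applying a monotonic transformation to the predictions, e.g. squaring the strictly positive Tweedie predictions, leaves the ranking, and therefore the Gini index, unchanged:

for label, y_pred in [("original", y_pred_total), ("squared", y_pred_total**2)]:
    cum_exposure, cum_claims = lorenz_curve(
        df_test["PurePremium"], y_pred, df_test["Exposure"]
    )
    # the ranking induced by np.argsort is the same for both variants, so the
    # Gini index computed from the Lorenz curve does not change
    gini = 1 - 2 * auc(cum_exposure, cum_claims)
    print("Gini index (%s predictions): %.3f" % (label, gini))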

Total running time of the script: (0 minutes 8.864 seconds)

Related examples

Poisson regression and non-normal loss

Comparison of Calibration of Classifiers

Map data to a normal distribution

Statistical comparison of models using grid search

Gallery generated by Sphinx-Gallery