## 
## 载入程序包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 载入需要的程序包:ggplot2
## 载入需要的程序包:lattice
## 载入需要的程序包:Matrix
## Loaded glmnet 4.1-9

数据的探索性分析

首先我们进行数据的探索,大致看一下我们用的两个数据结构。下面这张图是Power的数据结构。

par(mfrow=c(1,2))
dataori$Power <- as.factor(dataori$Power)
datapred$Power <- as.factor(datapred$Power)
countsorg1 <- table(dataori$Power)
barplot(countsorg1,main="Power")
countspred1 <- table(datapred$Power)
barplot(countspred1,main="Power")

par(mfrow=c(1,2))
countsorg1 <- table(dataori$Power)
barplot(countsorg1,main="Power")
countspred1 <- table(datapred$Power)
barplot(countspred1,main="Power")

对于CarAge,在训练集和预测集中都出现了非常离谱的数据,比如100、78,这是显然不可能的。因此在实际处理的时候我认为应该进行限制,比如将超过30的都限制在30。以下是频率密度直方图:

dataori$CarAge <- pmin(dataori$CarAge,30)
datapred$CarAge <- pmin(datapred$CarAge,30)
par(mfrow=c(1,2))
hist(dataori$CarAge,freq = FALSE,breaks = 20,main="CarAge")
lines(density(dataori$CarAge),col="red",lwd=2)
hist(datapred$CarAge,freq=FALSE,breaks = 20,main="CarAge")
lines(density(datapred$CarAge),col="red",lwd=2)

对于DriverAge,这里应该指的是驾驶员的年龄,而非驾驶汽车的时间,虽然99岁有点离谱,但应该还是可以理解的。它们的频率密度直方图如下:

par(mfrow=c(1,2))
hist(dataori$DriverAge,freq = FALSE,breaks = 20,main="DriverAge")
lines(density(dataori$DriverAge),col="red",lwd=2)
hist(datapred$DriverAge,freq=FALSE,breaks = 20,main="DriverAge")
lines(density(datapred$DriverAge),col="red",lwd=2)

下面是关于汽车品牌的条形图,从左到右分别是”Fiat”,“Japanese (except Nissan) or Korean”,“Mercedes, Chrysler or BMW”,“Opel, General Motors or Ford”,“other”,“Renault, Nissan or Citroen”,“Volkswagen, Audi, Skoda or Seat”:

dataori$Brand <- as.factor(dataori$Brand)
datapred$Brand <- as.factor(datapred$Brand)
par(mfrow=c(1,2))
countsorg2 <- table(dataori$Brand)
barplot(countsorg2,main="Brand")
countspred2 <- table(datapred$Brand)
barplot(countspred2,main="Brand")

下面是关于汽车燃油的条形图:

dataori$Gas <- as.factor(dataori$Gas)
datapred$Gas <- as.factor(datapred$Gas)
par(mfrow=c(1,2))
countsorg3 <- table(dataori$Gas)
barplot(countsorg3,main="Gas")
countspred3 <- table(datapred$Gas)
barplot(countspred3,main="Gas")

下面是关于各个地区车辆的条形图:

dataori$Region <- as.factor(dataori$Region)
datapred$Region <- as.factor(datapred$Region)
par(mfrow=c(1,2))
countsorg4 <- table(dataori$Region)
barplot(countsorg4,main="Region")
countspred4 <- table(datapred$Region)
barplot(countspred4,main="Region")

下面是人口密度的直方图,可以看出大部分都是在人口密度小于5000的地区,但是也存在超过25000的地区,这部分可能得特别注意:

par(mfrow=c(1,2))
hist(dataori$Density,freq = FALSE,breaks = 20,main="Density")
lines(density(dataori$Density),col="red",lwd=1)
hist(datapred$Density,freq=FALSE,breaks = 20,main="Density")
lines(density(datapred$Density),col="red",lwd=1)

对人口密度取对数,得到下图:

dataori$logDensity <- log(dataori$Density+1)
par(mfrow=c(1,2))
hist(dataori$logDensity,freq = FALSE,breaks = 20,main="logDensity")
lines(density(dataori$logDensity),col="red",lwd=1)
datapred$logDensity <- log(datapred$Density+1)
hist(datapred$logDensity,freq=FALSE,breaks = 20,main="logDensity")
lines(density(datapred$logDensity),col="red",lwd=1)

我们可以看出两个数据集有着类似的结构,用训练用数据得出的模型去计算保费是可行的。

接下来我们对训练用数据进行分析。 首先是对于ClaimNb。它的均值为0.051,方差为0.052,适合用泊松连接函数,大约有95%的数据没有索赔。

table(dataori$ClaimNb)
## 
##      0      1      2      3 
## 106203   5181    229      4

对于有赔案的数据,呈现很明显的右偏特征:

dataori1 <- dataori[dataori$ClaimNb>0,]
par(mfrow=c(1,1))
hist(dataori$ClaimAmount[dataori$ClaimNb>0],breaks = 20,
     freq = FALSE,main="ClaimAmount")
lines(density(dataori$ClaimAmount[dataori$ClaimNb>0]),col="red",lwd=1)

这是对有赔案数据取对数后的结果:

dataori1$logClaimAmount <- log(dataori1$ClaimAmount+1)
hist(dataori1$logClaimAmount,breaks = 20,
     freq = FALSE,main="logClaimAmount")
lines(density(dataori1$logClaimAmount),col="red",lwd=1)

接下来分析各个变量之间的相关性,可以看出各个连续型变量对于索赔次数的影响不大(感觉我这里这个做得有问题):

library(psych)
## 
## 载入程序包:'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
pairs.panels(dataori[c("CarAge","DriverAge","Density","logDensity","ClaimNb")],pch=".")

对于连续型变量和ClaimAmount之间的关系,可以看出这些连续型变量与索赔额的关系不大。

pairs.panels(dataori1[c("CarAge","DriverAge","Density","logDensity","logClaimAmount")],pch=".")

接下来是分析Brand,GasRegion对于索赔额的影响,但看上去好像这些变量取不同值对于索赔额的影响不大。

boxplot(logClaimAmount~Brand,data=dataori1)

boxplot(logClaimAmount~Gas,data=dataori1)

boxplot(logClaimAmount~Region,data=dataori1)

boxplot(logClaimAmount~Power,data=dataori1)

最后分析Brand,GasRegion对于索赔次数的影响,可以看出BrandGasRegion对于索赔次数的影响比较大。

a1 <- aggregate(dataori$ClaimNb,by=list(dataori$Brand),FUN=mean)
colnames(a1) <- c("Brand","ClaimNb")
a1
##                                Brand    ClaimNb
## 1                               Fiat 0.04922760
## 2 Japanese (except Nissan) or Korean 0.06763590
## 3          Mercedes, Chrysler or BMW 0.05995589
## 4       Opel, General Motors or Ford 0.05805375
## 5                              other 0.05474328
## 6         Renault, Nissan or Citroen 0.04741631
## 7    Volkswagen, Audi, Skoda or Seat 0.05648065
a2 <- aggregate(dataori$ClaimNb,by=list(dataori$Gas),FUN=mean)
colnames(a2) <- c("Gas","ClaimNb")
a2
##       Gas   ClaimNb
## 1  Diesel 0.0561098
## 2 Regular 0.0457035
a3 <- aggregate(dataori$ClaimNb,by=list(dataori$Region),FUN=mean)
colnames(a3) <- c("Region","ClaimNb")
a3
##                Region    ClaimNb
## 1           Aquitaine 0.06509946
## 2     Basse-Normandie 0.05882353
## 3            Bretagne 0.05117324
## 4              Centre 0.04407949
## 5     Haute-Normandie 0.05177665
## 6       Ile-de-France 0.07506434
## 7            Limousin 0.06548857
## 8  Nord-Pas-de-Calais 0.06499080
## 9    Pays-de-la-Loire 0.05670484
## 10   Poitou-Charentes 0.04639175

或许我们还需要检查不同Power的车辆是否存在等级,也就是ClaimNb是否存在区别。

a4 <- aggregate(dataori$ClaimNb,by=list(dataori$Power),FUN=mean)
colnames(a4) <- c("Power","ClaimNb")
a4
##    Power    ClaimNb
## 1      d 0.04224806
## 2      e 0.05237672
## 3      f 0.05218180
## 4      g 0.04865891
## 5      h 0.05102383
## 6      i 0.05506131
## 7      j 0.06282579
## 8      k 0.06516724
## 9      l 0.06099291
## 10     m 0.06813187
## 11     n 0.08695652
## 12     o 0.05952381

模型构建

数据处理

建立一个变量,看看是否发生索赔。

dataori$isClaim <- ifelse(dataori$ClaimNb==0,0,1)
table(dataori$isClaim)
## 
##      0      1 
## 106203   5414

判断哪些连续型变量和是否索赔有关。考虑到相对于一次索赔来说,多次索赔的比例较小,因此直接用是否索赔来处理。这里我采用的是似然比检验方法。 首先我们判断CarAge

full_model1 <- glm(isClaim~CarAge,data=dataori,family=binomial)
null_model1 <- glm(isClaim~1,data=dataori,family=binomial)
anova_result1 <- anova(null_model1, full_model1, test = "LRT")
print(anova_result1)
## Analysis of Deviance Table
## 
## Model 1: isClaim ~ 1
## Model 2: isClaim ~ CarAge
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1    111616      43327                          
## 2    111615      43089  1   238.65 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

可以看出P值接近0,说明CarAge对于索赔频率是有显著影响的。

然后我们判断Density对于索赔频率是否有影响。

full_model2 <- glm(isClaim~logDensity,data=dataori,family=binomial)
null_model2 <- glm(isClaim~1,data=dataori,family=binomial)
anova_result2 <- anova(null_model2, full_model2, test = "LRT")
print(anova_result2)
## Analysis of Deviance Table
## 
## Model 1: isClaim ~ 1
## Model 2: isClaim ~ logDensity
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1    111616      43327                          
## 2    111615      43089  1   238.47 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

可以看出Density对于索赔频率也是有影响的。

对于DriverAge,为了更贴近精算实务,我对它进行分箱操作。

dataori <- within(dataori,{
  DriverAgeGroup <- NA
  DriverAgeGroup[DriverAge<=29] <- "18-29"
  DriverAgeGroup[DriverAge>=30&DriverAge<=34] <- "30-34" 
  DriverAgeGroup[DriverAge>=35&DriverAge<=39] <- "35-39" 
  DriverAgeGroup[DriverAge>=40&DriverAge<=44] <- "40-44" 
  DriverAgeGroup[DriverAge>=45&DriverAge<=49] <- "45-49" 
  DriverAgeGroup[DriverAge>=50&DriverAge<=54] <- "50-54" 
  DriverAgeGroup[DriverAge>=55&DriverAge<=64] <- "55-64"
  DriverAgeGroup[DriverAge>=65&DriverAge<=74] <- "65-74"
  DriverAgeGroup[DriverAge>=75] <- "75及以上"
})
dataori$DriverAgeGroup <- as.factor(dataori$DriverAgeGroup)

然后验证分箱的合理性。

dataori %>% 
  count(DriverAgeGroup) %>% 
  mutate(Prop = n/sum(n))
## # A tibble: 9 × 3
##   DriverAgeGroup     n   Prop
##   <fct>          <int>  <dbl>
## 1 18-29           7612 0.0682
## 2 30-34          10316 0.0924
## 3 35-39          12665 0.113 
## 4 40-44          13190 0.118 
## 5 45-49          13726 0.123 
## 6 50-54          15371 0.138 
## 7 55-64          18276 0.164 
## 8 65-74          12832 0.115 
## 9 75及以上        7629 0.0683

然后检验不同的DriverAge对于索赔频率的影响。 展示不同的年龄组对于索赔频率的影响。

a5 <- aggregate(dataori$ClaimNb,by=list(dataori$DriverAgeGroup),FUN=mean)
colnames(a5) <- c("DriverAgeGroup","ClaimNb")
a5
##   DriverAgeGroup    ClaimNb
## 1          18-29 0.06305833
## 2          30-34 0.04953470
## 3          35-39 0.05005922
## 4          40-44 0.05329795
## 5          45-49 0.05340230
## 6          50-54 0.05061479
## 7          55-64 0.04771285
## 8          65-74 0.04301746
## 9       75及以上 0.05085857
library(gmodels)
tb <- table(dataori$DriverAgeGroup,dataori$isClaim)
chisq.test(tb)
## 
##  Pearson's Chi-squared test
## 
## data:  tb
## X-squared = 42.855, df = 8, p-value = 9.355e-07

其中p值近似为0,说明DriverAge对于是否索赔是有很强的影响的。

已经做了的数据处理有将CarAge限制在30年以内,对Density取对数,即logDensity=log(Density+1)

合并出现频率小于100的BrandRegion

brand_counts <- table(dataori$Brand)
rare_brands <- names(brand_counts[brand_counts < 100]) 
dataori$BrandAdj <- ifelse(dataori$Brand %in% rare_brands, 
                          "Other", 
                          as.character(dataori$Brand))
dataori$BrandAdj <- as.factor(dataori$BrandAdj)
Region_counts <- table(dataori$Region)
rare_Region <- names(Region_counts[Region_counts < 100]) 
dataori$RegionAdj <- ifelse(dataori$Region %in% rare_Region, 
                          "Other", 
                          as.character(dataori$Region))
dataori$RegionAdj <- as.factor(dataori$RegionAdj)

划分训练集和测试集

采用了分层抽样的方法,按照DriverAgeClaimNb进行分层抽样,抽样比例为\(训练集:测试集=8:2\)(如果低于这个值零膨胀泊松模型会出问题).

set.seed(2023)
# 分层抽样:按年龄组+是否理赔分层(保持分布一致性)
train_index <- createDataPartition(
  paste(dataori$DriverAgeGroup), 
  p = 0.8, 
  list = FALSE
)
train_data <- dataori[train_index, ]
test_data <- dataori[-train_index, ]

接下来进行零膨胀泊松模型对索赔频率的拟合。

library(pscl)
zeropoisson_model <- zeroinfl(ClaimNb~CarAge+DriverAgeGroup+Gas+logDensity+Power|BrandAdj+RegionAdj,data = train_data,dist = "poisson")

查看模型的摘要

summary(zeropoisson_model)
## 
## Call:
## zeroinfl(formula = ClaimNb ~ CarAge + DriverAgeGroup + Gas + logDensity + 
##     Power | BrandAdj + RegionAdj, data = train_data, dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4126 -0.2407 -0.2144 -0.1896 12.5517 
## 
## Count model coefficients (poisson with log link):
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.684266   0.117848 -22.777  < 2e-16 ***
## CarAge                 -0.034021   0.003125 -10.887  < 2e-16 ***
## DriverAgeGroup30-34    -0.228632   0.071306  -3.206 0.001344 ** 
## DriverAgeGroup35-39    -0.236794   0.068481  -3.458 0.000545 ***
## DriverAgeGroup40-44    -0.189020   0.067398  -2.805 0.005039 ** 
## DriverAgeGroup45-49    -0.164464   0.066580  -2.470 0.013505 *  
## DriverAgeGroup50-54    -0.215479   0.065692  -3.280 0.001038 ** 
## DriverAgeGroup55-64    -0.284139   0.064601  -4.398 1.09e-05 ***
## DriverAgeGroup65-74    -0.337284   0.070948  -4.754 1.99e-06 ***
## DriverAgeGroup75及以上 -0.093240   0.077829  -1.198 0.230912    
## GasRegular             -0.215415   0.033421  -6.446 1.15e-10 ***
## logDensity              0.105860   0.009863  10.733  < 2e-16 ***
## Powere                  0.088403   0.054191   1.631 0.102824    
## Powerf                  0.109742   0.053224   2.062 0.039217 *  
## Powerg                  0.086449   0.053377   1.620 0.105318    
## Powerh                  0.062921   0.080429   0.782 0.434026    
## Poweri                  0.252372   0.085746   2.943 0.003248 ** 
## Powerj                  0.303782   0.087733   3.463 0.000535 ***
## Powerk                  0.351526   0.117959   2.980 0.002882 ** 
## Powerl                  0.184787   0.179679   1.028 0.303749    
## Powerm                  0.453359   0.201180   2.253 0.024228 *  
## Powern                  0.585547   0.245770   2.383 0.017196 *  
## Powero                  0.134225   0.361848   0.371 0.710680    
## 
## Zero-inflation model coefficients (binomial with logit link):
##                                            Estimate Std. Error z value Pr(>|z|)
## (Intercept)                                 -1.1888     0.5181  -2.295   0.0218
## BrandAdjJapanese (except Nissan) or Korean  -0.7803     0.4716  -1.654   0.0981
## BrandAdjMercedes, Chrysler or BMW           -0.4164     0.3360  -1.239   0.2153
## BrandAdjOpel, General Motors or Ford        -0.6686     0.3046  -2.194   0.0282
## BrandAdjother                               -0.2794     0.3677  -0.760   0.4474
## BrandAdjRenault, Nissan or Citroen          -0.1703     0.2159  -0.789   0.4301
## BrandAdjVolkswagen, Audi, Skoda or Seat     -0.2764     0.2702  -1.023   0.3063
## RegionAdjBasse-Normandie                     0.3470     0.4665   0.744   0.4570
## RegionAdjBretagne                            0.6994     0.3869   1.808   0.0706
## RegionAdjCentre                              0.7681     0.3787   2.028   0.0426
## RegionAdjHaute-Normandie                     0.5411     0.6294   0.860   0.3900
## RegionAdjIle-de-France                       0.3783     0.4069   0.930   0.3525
## RegionAdjLimousin                           -4.9271    91.5057  -0.054   0.9571
## RegionAdjNord-Pas-de-Calais                  0.4219     0.4596   0.918   0.3587
## RegionAdjPays-de-la-Loire                    0.3889     0.3892   0.999   0.3177
## RegionAdjPoitou-Charentes                    0.7322     0.4231   1.731   0.0835
##                                             
## (Intercept)                                *
## BrandAdjJapanese (except Nissan) or Korean .
## BrandAdjMercedes, Chrysler or BMW           
## BrandAdjOpel, General Motors or Ford       *
## BrandAdjother                               
## BrandAdjRenault, Nissan or Citroen          
## BrandAdjVolkswagen, Audi, Skoda or Seat     
## RegionAdjBasse-Normandie                    
## RegionAdjBretagne                          .
## RegionAdjCentre                            *
## RegionAdjHaute-Normandie                    
## RegionAdjIle-de-France                      
## RegionAdjLimousin                           
## RegionAdjNord-Pas-de-Calais                 
## RegionAdjPays-de-la-Loire                   
## RegionAdjPoitou-Charentes                  .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 73 
## Log-likelihood: -1.799e+04 on 39 Df

检验模型效能

在训练集上验证预测出的数据的合理性 首先查看分布。

mean(train_data$ClaimNb)
## [1] 0.05112211
train_pred <- predict(zeropoisson_model,train_data,type = "response")
summary(train_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01267 0.03843 0.04833 0.05112 0.06062 0.17635

可以看出预测的索赔次数均值和实际的索赔次数均值是基本上相等的。预测分布具有合理性。

然后在测试集上验证。可能存在高估索赔频率的问题。

这个是测试集中有索赔的比例。

sum(test_data$isClaim)/nrow(test_data)
## [1] 0.04614489

这个是平均索赔次数。

mean(test_data$ClaimNb)
## [1] 0.04865373

这个是测试集中估计的索赔次数的摘要。

test_pred <- predict(zeropoisson_model,test_data,type = "response")
summary(test_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01214 0.03846 0.04836 0.05105 0.06053 0.17259

可以看出估计索赔次数的中位数与有索赔的比例接近,均值也接近,说明估计结果是合理的。

预测索赔次数展现。

set.seed(123)  
predicted_counts <- rpois(n = length(test_pred), lambda = test_pred)
table(predicted_counts) 
## predicted_counts
##     0     1     2 
## 21264  1030    27

实际的索赔次数

table(test_data$ClaimNb)
## 
##     0     1     2     3 
## 21291   976    52     2

接下来评估模型的效能:

##测试集上MSE, RMSE和MAE
library(Metrics)
MSE_zeropoisson <- mse(test_data$ClaimNb,test_pred)
RMSE_zeropoisson <- rmse(test_data$ClaimNb,test_pred)
MAE_zeropoisson <- mae(test_data$ClaimNb,test_pred)
AIC_zeropoisson <- AIC(zeropoisson_model)
BIC_zeropoisson <- BIC(zeropoisson_model)
cat("零膨胀泊松分布的MSE",MSE_zeropoisson,"\n")
## 零膨胀泊松分布的MSE 0.05117727
cat("零膨胀泊松分布的RMSE",RMSE_zeropoisson,"\n")
## 零膨胀泊松分布的RMSE 0.2262239
cat("零膨胀泊松分布的MAE",MAE_zeropoisson,"\n")
## 零膨胀泊松分布的MAE 0.09442513
cat("零膨胀泊松分布的AIC",AIC_zeropoisson,"\n")
## 零膨胀泊松分布的AIC 36061.74
cat("零膨胀泊松分布的BIC",BIC_zeropoisson)
## 零膨胀泊松分布的BIC 36428.33
##模型调优,加入高阶项和二次项
zeropoisson_model2 <- zeroinfl(ClaimNb~CarAge+I(CarAge^2)+DriverAgeGroup+DriverAgeGroup:logDensity+Gas+logDensity+Power|BrandAdj+RegionAdj,data = train_data,dist = "poisson")
summary(zeropoisson_model2)
## 
## Call:
## zeroinfl(formula = ClaimNb ~ CarAge + I(CarAge^2) + DriverAgeGroup + 
##     DriverAgeGroup:logDensity + Gas + logDensity + Power | BrandAdj + 
##     RegionAdj, data = train_data, dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4563 -0.2400 -0.2147 -0.1903 12.4652 
## 
## Count model coefficients (poisson with log link):
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -3.1340635  0.1968436 -15.922  < 2e-16 ***
## CarAge                            -0.0182730  0.0092282  -1.980 0.047690 *  
## I(CarAge^2)                       -0.0008240  0.0004503  -1.830 0.067260 .  
## DriverAgeGroup30-34                0.1211981  0.2316722   0.523 0.600874    
## DriverAgeGroup35-39               -0.0298133  0.2237205  -0.133 0.893987    
## DriverAgeGroup40-44                0.2071409  0.2202241   0.941 0.346914    
## DriverAgeGroup45-49                0.3296192  0.2157756   1.528 0.126611    
## DriverAgeGroup50-54                0.2185877  0.2136984   1.023 0.306365    
## DriverAgeGroup55-64                0.2170944  0.2100762   1.033 0.301413    
## DriverAgeGroup65-74                0.2267456  0.2269123   0.999 0.317666    
## DriverAgeGroup75及以上             0.3549136  0.2466372   1.439 0.150147    
## GasRegular                        -0.2125605  0.0334360  -6.357 2.05e-10 ***
## logDensity                         0.1713680  0.0272963   6.278 3.43e-10 ***
## Powere                             0.0906521  0.0541971   1.673 0.094399 .  
## Powerf                             0.1080798  0.0532600   2.029 0.042429 *  
## Powerg                             0.0886266  0.0533949   1.660 0.096948 .  
## Powerh                             0.0706172  0.0804779   0.877 0.380230    
## Poweri                             0.2597168  0.0857980   3.027 0.002469 ** 
## Powerj                             0.3035031  0.0877441   3.459 0.000542 ***
## Powerk                             0.3661325  0.1181331   3.099 0.001940 ** 
## Powerl                             0.2041233  0.1798889   1.135 0.256493    
## Powerm                             0.4740123  0.2014455   2.353 0.018620 *  
## Powern                             0.6000029  0.2457649   2.441 0.014632 *  
## Powero                             0.1419812  0.3619187   0.392 0.694836    
## DriverAgeGroup30-34:logDensity    -0.0589490  0.0373892  -1.577 0.114881    
## DriverAgeGroup35-39:logDensity    -0.0340342  0.0359378  -0.947 0.343623    
## DriverAgeGroup40-44:logDensity    -0.0668625  0.0356901  -1.873 0.061010 .  
## DriverAgeGroup45-49:logDensity    -0.0843960  0.0352178  -2.396 0.016557 *  
## DriverAgeGroup50-54:logDensity    -0.0734539  0.0346834  -2.118 0.034189 *  
## DriverAgeGroup55-64:logDensity    -0.0853705  0.0341903  -2.497 0.012528 *  
## DriverAgeGroup65-74:logDensity    -0.0973854  0.0377434  -2.580 0.009874 ** 
## DriverAgeGroup75及以上:logDensity -0.0758297  0.0409357  -1.852 0.063967 .  
## 
## Zero-inflation model coefficients (binomial with logit link):
##                                            Estimate Std. Error z value Pr(>|z|)
## (Intercept)                                 -1.2103     0.5279  -2.293   0.0219
## BrandAdjJapanese (except Nissan) or Korean  -0.8182     0.4882  -1.676   0.0937
## BrandAdjMercedes, Chrysler or BMW           -0.4319     0.3405  -1.269   0.2046
## BrandAdjOpel, General Motors or Ford        -0.6807     0.3085  -2.207   0.0273
## BrandAdjother                               -0.2887     0.3712  -0.778   0.4366
## BrandAdjRenault, Nissan or Citroen          -0.1761     0.2169  -0.812   0.4169
## BrandAdjVolkswagen, Audi, Skoda or Seat     -0.2887     0.2727  -1.059   0.2898
## RegionAdjBasse-Normandie                     0.3415     0.4782   0.714   0.4751
## RegionAdjBretagne                            0.7095     0.3969   1.788   0.0738
## RegionAdjCentre                              0.7871     0.3893   2.022   0.0432
## RegionAdjHaute-Normandie                     0.5434     0.6429   0.845   0.3979
## RegionAdjIle-de-France                       0.4032     0.4163   0.969   0.3327
## RegionAdjLimousin                           -4.8892    88.7288  -0.055   0.9561
## RegionAdjNord-Pas-de-Calais                  0.4332     0.4695   0.923   0.3562
## RegionAdjPays-de-la-Loire                    0.4024     0.3987   1.009   0.3129
## RegionAdjPoitou-Charentes                    0.7463     0.4329   1.724   0.0847
##                                             
## (Intercept)                                *
## BrandAdjJapanese (except Nissan) or Korean .
## BrandAdjMercedes, Chrysler or BMW           
## BrandAdjOpel, General Motors or Ford       *
## BrandAdjother                               
## BrandAdjRenault, Nissan or Citroen          
## BrandAdjVolkswagen, Audi, Skoda or Seat     
## RegionAdjBasse-Normandie                    
## RegionAdjBretagne                          .
## RegionAdjCentre                            *
## RegionAdjHaute-Normandie                    
## RegionAdjIle-de-France                      
## RegionAdjLimousin                           
## RegionAdjNord-Pas-de-Calais                 
## RegionAdjPays-de-la-Loire                   
## RegionAdjPoitou-Charentes                  .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 91 
## Log-likelihood: -1.798e+04 on 48 Df

利用新模型进行预测

test_pred2 <- predict(zeropoisson_model2,test_data,type = "response")
summary(test_pred2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00959 0.03871 0.04838 0.05102 0.06022 0.20412
##各个索赔次数的预测
set.seed(123)  
predicted_counts2 <- rpois(n = length(test_pred2), lambda = test_pred2)
table(predicted_counts2) 
## predicted_counts2
##     0     1     2 
## 21264  1031    26
##MSE,MAE和RMSE
MSE_zeropoisson2 <- mse(test_data$ClaimNb,test_pred2)
RMSE_zeropoisson2 <- rmse(test_data$ClaimNb,test_pred2)
MAE_zeropoisson2 <- mae(test_data$ClaimNb,test_pred2)
AIC_zeropoisson2 <- AIC(zeropoisson_model2)
BIC_zeropoisson2 <- BIC(zeropoisson_model2)
cat("新零膨胀泊松分布的MSE",MSE_zeropoisson2,"\n")
## 新零膨胀泊松分布的MSE 0.05116148
cat("新零膨胀泊松分布的RMSE",RMSE_zeropoisson2,"\n")
## 新零膨胀泊松分布的RMSE 0.226189
cat("新零膨胀泊松分布的MAE",MAE_zeropoisson2,"\n")
## 新零膨胀泊松分布的MAE 0.09437108
cat("新零膨胀泊松分布的AIC",AIC_zeropoisson2,"\n")
## 新零膨胀泊松分布的AIC 36065.68
cat("新零膨胀泊松分布的BIC",BIC_zeropoisson2,"\n")
## 新零膨胀泊松分布的BIC 36516.87

或许原来的模型更好。

##Vuong检验
poisson_model <- glm(ClaimNb~CarAge+DriverAgeGroup+Gas+logDensity+Power+BrandAdj+RegionAdj, data = train_data, family = poisson)
summary(poisson_model)
## 
## Call:
## glm(formula = ClaimNb ~ CarAge + DriverAgeGroup + Gas + logDensity + 
##     Power + BrandAdj + RegionAdj, family = poisson, data = train_data)
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                -2.969301   0.129729 -22.889
## CarAge                                     -0.034054   0.003093 -11.011
## DriverAgeGroup30-34                        -0.228757   0.070382  -3.250
## DriverAgeGroup35-39                        -0.236822   0.067597  -3.503
## DriverAgeGroup40-44                        -0.189844   0.066513  -2.854
## DriverAgeGroup45-49                        -0.165448   0.065693  -2.519
## DriverAgeGroup50-54                        -0.216691   0.064827  -3.343
## DriverAgeGroup55-64                        -0.284728   0.063777  -4.464
## DriverAgeGroup65-74                        -0.338921   0.070083  -4.836
## DriverAgeGroup75及以上                     -0.095150   0.076809  -1.239
## GasRegular                                 -0.212901   0.033033  -6.445
## logDensity                                  0.104527   0.009845  10.617
## Powere                                      0.089026   0.053646   1.660
## Powerf                                      0.109952   0.052661   2.088
## Powerg                                      0.084435   0.052869   1.597
## Powerh                                      0.062151   0.079634   0.780
## Poweri                                      0.251292   0.084812   2.963
## Powerj                                      0.305121   0.086895   3.511
## Powerk                                      0.347805   0.116534   2.985
## Powerl                                      0.176310   0.178215   0.989
## Powerm                                      0.437145   0.198714   2.200
## Powern                                      0.577049   0.241933   2.385
## Powero                                      0.132685   0.358201   0.370
## BrandAdjJapanese (except Nissan) or Korean  0.192456   0.108767   1.769
## BrandAdjMercedes, Chrysler or BMW           0.156784   0.101571   1.544
## BrandAdjOpel, General Motors or Ford        0.184868   0.085583   2.160
## BrandAdjother                               0.093004   0.116262   0.800
## BrandAdjRenault, Nissan or Citroen          0.065057   0.076212   0.854
## BrandAdjVolkswagen, Audi, Skoda or Seat     0.089096   0.089532   0.995
## RegionAdjBasse-Normandie                   -0.080404   0.100509  -0.800
## RegionAdjBretagne                          -0.182412   0.073606  -2.478
## RegionAdjCentre                            -0.212788   0.067239  -3.165
## RegionAdjHaute-Normandie                   -0.101761   0.160310  -0.635
## RegionAdjIle-de-France                     -0.083390   0.082771  -1.007
## RegionAdjLimousin                           0.253958   0.154421   1.645
## RegionAdjNord-Pas-de-Calais                -0.077428   0.099168  -0.781
## RegionAdjPays-de-la-Loire                  -0.099847   0.077170  -1.294
## RegionAdjPoitou-Charentes                  -0.204945   0.092846  -2.207
##                                            Pr(>|z|)    
## (Intercept)                                 < 2e-16 ***
## CarAge                                      < 2e-16 ***
## DriverAgeGroup30-34                        0.001153 ** 
## DriverAgeGroup35-39                        0.000459 ***
## DriverAgeGroup40-44                        0.004314 ** 
## DriverAgeGroup45-49                        0.011785 *  
## DriverAgeGroup50-54                        0.000830 ***
## DriverAgeGroup55-64                        8.03e-06 ***
## DriverAgeGroup65-74                        1.32e-06 ***
## DriverAgeGroup75及以上                     0.215428    
## GasRegular                                 1.16e-10 ***
## logDensity                                  < 2e-16 ***
## Powere                                     0.097011 .  
## Powerf                                     0.036804 *  
## Powerg                                     0.110253    
## Powerh                                     0.435120    
## Poweri                                     0.003047 ** 
## Powerj                                     0.000446 ***
## Powerk                                     0.002840 ** 
## Powerl                                     0.322511    
## Powerm                                     0.027816 *  
## Powern                                     0.017072 *  
## Powero                                     0.711069    
## BrandAdjJapanese (except Nissan) or Korean 0.076821 .  
## BrandAdjMercedes, Chrysler or BMW          0.122688    
## BrandAdjOpel, General Motors or Ford       0.030765 *  
## BrandAdjother                              0.423739    
## BrandAdjRenault, Nissan or Citroen         0.393308    
## BrandAdjVolkswagen, Audi, Skoda or Seat    0.319676    
## RegionAdjBasse-Normandie                   0.423725    
## RegionAdjBretagne                          0.013203 *  
## RegionAdjCentre                            0.001553 ** 
## RegionAdjHaute-Normandie                   0.525575    
## RegionAdjIle-de-France                     0.313707    
## RegionAdjLimousin                          0.100057    
## RegionAdjNord-Pas-de-Calais                0.434932    
## RegionAdjPays-de-la-Loire                  0.195713    
## RegionAdjPoitou-Charentes                  0.027288 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 27652  on 89295  degrees of freedom
## Residual deviance: 27125  on 89258  degrees of freedom
## AIC: 36079
## 
## Number of Fisher Scoring iterations: 6
cat("\n")
vuong(zeropoisson_model,poisson_model)
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A  p-value
## Raw                   1.9893442 model1 > model2 0.023332
## AIC-corrected         1.7841867 model1 > model2 0.037197
## BIC-corrected         0.8199758 model1 > model2 0.206115
##直接用泊松分布进行拟合的结果
test_pred_poisson <- predict(poisson_model,test_data,type = "response")
summary(test_pred_poisson)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01219 0.03849 0.04830 0.05105 0.06042 0.17720
cat("\n")
MSE_zeropoisson_poisson <- mse(test_data$ClaimNb,test_pred_poisson)
RMSE_zeropoisson_poisson <- rmse(test_data$ClaimNb,test_pred_poisson)
MAE_zeropoisson_poisson <- mae(test_data$ClaimNb,test_pred_poisson)
AIC_zeropoisson_poisson <- AIC(poisson_model)
BIC_zeropoisson_poisson <- BIC(poisson_model)
cat("泊松分布的MSE",MSE_zeropoisson_poisson,"\n")
## 泊松分布的MSE 0.0511785
cat("泊松分布的RMSE",RMSE_zeropoisson_poisson,"\n")
## 泊松分布的RMSE 0.2262267
cat("泊松分布的MAE",MAE_zeropoisson_poisson,"\n")
## 泊松分布的MAE 0.09442018
cat("泊松分布的AIC",AIC_zeropoisson_poisson,"\n")
## 泊松分布的AIC 36079.13
cat("泊松分布的BIC",BIC_zeropoisson_poisson,"\n")
## 泊松分布的BIC 36436.32

从Vuong可以看出零膨胀泊松模型比泊松模型要好,但是从测试集结果来看好像差不多。

##使用负二项模型
library(MASS)
## 
## 载入程序包:'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
negbio_model <- glm.nb(ClaimNb~CarAge+DriverAgeGroup+Gas+logDensity+Power+BrandAdj+RegionAdj,data = train_data)
summary(negbio_model)
## 
## Call:
## glm.nb(formula = ClaimNb ~ CarAge + DriverAgeGroup + Gas + logDensity + 
##     Power + BrandAdj + RegionAdj, data = train_data, init.theta = 2.363465756, 
##     link = log)
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                -2.968464   0.131328 -22.604
## CarAge                                     -0.034130   0.003126 -10.916
## DriverAgeGroup30-34                        -0.228504   0.071333  -3.203
## DriverAgeGroup35-39                        -0.237223   0.068522  -3.462
## DriverAgeGroup40-44                        -0.189204   0.067428  -2.806
## DriverAgeGroup45-49                        -0.164902   0.066603  -2.476
## DriverAgeGroup50-54                        -0.216413   0.065718  -3.293
## DriverAgeGroup55-64                        -0.284242   0.064635  -4.398
## DriverAgeGroup65-74                        -0.338341   0.070962  -4.768
## DriverAgeGroup75及以上                     -0.094743   0.077829  -1.217
## GasRegular                                 -0.213113   0.033431  -6.375
## logDensity                                  0.104585   0.009966  10.494
## Powere                                      0.088607   0.054234   1.634
## Powerf                                      0.109994   0.053233   2.066
## Powerg                                      0.084475   0.053425   1.581
## Powerh                                      0.062042   0.080520   0.771
## Poweri                                      0.251742   0.085855   2.932
## Powerj                                      0.304782   0.088089   3.460
## Powerk                                      0.348274   0.118163   2.947
## Powerl                                      0.175786   0.180800   0.972
## Powerm                                      0.436792   0.202058   2.162
## Powern                                      0.575228   0.246590   2.333
## Powero                                      0.136020   0.362837   0.375
## BrandAdjJapanese (except Nissan) or Korean  0.190802   0.110213   1.731
## BrandAdjMercedes, Chrysler or BMW           0.157656   0.102814   1.533
## BrandAdjOpel, General Motors or Ford        0.184218   0.086577   2.128
## BrandAdjother                               0.092410   0.117657   0.785
## BrandAdjRenault, Nissan or Citroen          0.064536   0.077047   0.838
## BrandAdjVolkswagen, Audi, Skoda or Seat     0.088406   0.090574   0.976
## RegionAdjBasse-Normandie                   -0.079684   0.101876  -0.782
## RegionAdjBretagne                          -0.182623   0.074626  -2.447
## RegionAdjCentre                            -0.213122   0.068186  -3.126
## RegionAdjHaute-Normandie                   -0.101951   0.162408  -0.628
## RegionAdjIle-de-France                     -0.082194   0.084023  -0.978
## RegionAdjLimousin                           0.255657   0.156644   1.632
## RegionAdjNord-Pas-de-Calais                -0.077564   0.100622  -0.771
## RegionAdjPays-de-la-Loire                  -0.100690   0.078265  -1.287
## RegionAdjPoitou-Charentes                  -0.205538   0.094025  -2.186
##                                            Pr(>|z|)    
## (Intercept)                                 < 2e-16 ***
## CarAge                                      < 2e-16 ***
## DriverAgeGroup30-34                        0.001358 ** 
## DriverAgeGroup35-39                        0.000536 ***
## DriverAgeGroup40-44                        0.005016 ** 
## DriverAgeGroup45-49                        0.013291 *  
## DriverAgeGroup50-54                        0.000991 ***
## DriverAgeGroup55-64                        1.09e-05 ***
## DriverAgeGroup65-74                        1.86e-06 ***
## DriverAgeGroup75及以上                     0.223481    
## GasRegular                                 1.83e-10 ***
## logDensity                                  < 2e-16 ***
## Powere                                     0.102303    
## Powerf                                     0.038801 *  
## Powerg                                     0.113834    
## Powerh                                     0.440996    
## Poweri                                     0.003366 ** 
## Powerj                                     0.000540 ***
## Powerk                                     0.003204 ** 
## Powerl                                     0.330918    
## Powerm                                     0.030640 *  
## Powern                                     0.019662 *  
## Powero                                     0.707750    
## BrandAdjJapanese (except Nissan) or Korean 0.083414 .  
## BrandAdjMercedes, Chrysler or BMW          0.125176    
## BrandAdjOpel, General Motors or Ford       0.033355 *  
## BrandAdjother                              0.432210    
## BrandAdjRenault, Nissan or Citroen         0.402243    
## BrandAdjVolkswagen, Audi, Skoda or Seat    0.329035    
## RegionAdjBasse-Normandie                   0.434118    
## RegionAdjBretagne                          0.014398 *  
## RegionAdjCentre                            0.001774 ** 
## RegionAdjHaute-Normandie                   0.530173    
## RegionAdjIle-de-France                     0.327959    
## RegionAdjLimousin                          0.102660    
## RegionAdjNord-Pas-de-Calais                0.440796    
## RegionAdjPays-de-la-Loire                  0.198260    
## RegionAdjPoitou-Charentes                  0.028816 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(2.3635) family taken to be 1)
## 
##     Null deviance: 25935  on 89295  degrees of freedom
## Residual deviance: 25418  on 89258  degrees of freedom
## AIC: 36062
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  2.363 
##           Std. Err.:  0.604 
## 
##  2 x log-likelihood:  -35983.578
pred_ori1 <- predict(negbio_model,dataori,type = "response")
sum(pred_ori1)
## [1] 5704.832
summary(pred_ori1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01217 0.03843 0.04828 0.05111 0.06052 0.17792
##对负二项模型结果进行检验
pred_ori <- predict(zeropoisson_model,dataori,type="response")
test_pred_negbio <- predict(negbio_model,test_data,type = "response")
summary(test_pred_negbio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01217 0.03846 0.04831 0.05105 0.06042 0.17792
cat("\n")
MSE_zeronegbio_negbio <- mse(test_data$ClaimNb,test_pred_negbio)
RMSE_zeronegbio_negbio <- rmse(test_data$ClaimNb,test_pred_negbio)
MAE_zeronegbio_negbio <- mae(test_data$ClaimNb,test_pred_negbio)
AIC_zeronegbio_negbio <- AIC(negbio_model)
BIC_zeronegbio_negbio <- BIC(negbio_model)
cat("负二项分布的MSE",MSE_zeronegbio_negbio,"\n")
## 负二项分布的MSE 0.05117871
cat("负二项分布的RMSE",RMSE_zeronegbio_negbio,"\n")
## 负二项分布的RMSE 0.2262271
cat("负二项分布的MAE",MAE_zeronegbio_negbio,"\n")
## 负二项分布的MAE 0.09442322
cat("负二项分布的AIC",AIC_zeronegbio_negbio,"\n")
## 负二项分布的AIC 36061.58
cat("负二项分布的BIC",BIC_zeronegbio_negbio,"\n")
## 负二项分布的BIC 36428.17

对损失分布进行拟合

首先选取出有损失的

train_data_Amount <- train_data[train_data$ClaimNb>0,]

然后进行GLM建模,使用Gamma连接函数

Gamma_model <- glm(ClaimAmount~CarAge+DriverAgeGroup+Gas+logDensity+Power+BrandAdj+RegionAdj,data = train_data_Amount,family = Gamma((link = "log")))
summary(Gamma_model)
## 
## Call:
## glm(formula = ClaimAmount ~ CarAge + DriverAgeGroup + Gas + logDensity + 
##     Power + BrandAdj + RegionAdj, family = Gamma((link = "log")), 
##     data = train_data_Amount)
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                 7.286776   0.462560  15.753
## CarAge                                     -0.002974   0.011132  -0.267
## DriverAgeGroup30-34                        -0.417048   0.250475  -1.665
## DriverAgeGroup35-39                        -0.112840   0.241141  -0.468
## DriverAgeGroup40-44                        -0.435221   0.237147  -1.835
## DriverAgeGroup45-49                        -0.500002   0.233898  -2.138
## DriverAgeGroup50-54                        -0.501311   0.231374  -2.167
## DriverAgeGroup55-64                        -0.286989   0.228076  -1.258
## DriverAgeGroup65-74                        -0.325786   0.250127  -1.302
## DriverAgeGroup75及以上                     -0.003913   0.274616  -0.014
## GasRegular                                  0.069843   0.119025   0.587
## logDensity                                 -0.008464   0.034438  -0.246
## Powere                                      0.072938   0.191680   0.381
## Powerf                                      0.138251   0.188272   0.734
## Powerg                                     -0.009519   0.186439  -0.051
## Powerh                                     -0.113840   0.281530  -0.404
## Poweri                                      0.233846   0.296658   0.788
## Powerj                                      0.540524   0.308500   1.752
## Powerk                                     -0.043508   0.407927  -0.107
## Powerl                                      0.313698   0.627140   0.500
## Powerm                                      1.071542   0.696423   1.539
## Powern                                     -0.071288   0.883098  -0.081
## Powero                                      0.027639   1.240119   0.022
## BrandAdjJapanese (except Nissan) or Korean  0.022635   0.383150   0.059
## BrandAdjMercedes, Chrysler or BMW          -0.035546   0.356668  -0.100
## BrandAdjOpel, General Motors or Ford        0.106812   0.304392   0.351
## BrandAdjother                              -0.104197   0.412816  -0.252
## BrandAdjRenault, Nissan or Citroen          0.219640   0.270672   0.811
## BrandAdjVolkswagen, Audi, Skoda or Seat     0.053851   0.319271   0.169
## RegionAdjBasse-Normandie                    0.053366   0.359512   0.148
## RegionAdjBretagne                           0.249496   0.262797   0.949
## RegionAdjCentre                             0.236696   0.239296   0.989
## RegionAdjHaute-Normandie                    0.286535   0.575736   0.498
## RegionAdjIle-de-France                     -0.040007   0.293682  -0.136
## RegionAdjLimousin                           0.030556   0.563497   0.054
## RegionAdjNord-Pas-de-Calais                 0.157209   0.356420   0.441
## RegionAdjPays-de-la-Loire                   0.095198   0.274082   0.347
## RegionAdjPoitou-Charentes                   0.159846   0.328690   0.486
##                                            Pr(>|t|)    
## (Intercept)                                  <2e-16 ***
## CarAge                                       0.7893    
## DriverAgeGroup30-34                          0.0960 .  
## DriverAgeGroup35-39                          0.6398    
## DriverAgeGroup40-44                          0.0665 .  
## DriverAgeGroup45-49                          0.0326 *  
## DriverAgeGroup50-54                          0.0303 *  
## DriverAgeGroup55-64                          0.2084    
## DriverAgeGroup65-74                          0.1928    
## DriverAgeGroup75及以上                       0.9886    
## GasRegular                                   0.5574    
## logDensity                                   0.8059    
## Powere                                       0.7036    
## Powerf                                       0.4628    
## Powerg                                       0.9593    
## Powerh                                       0.6860    
## Poweri                                       0.4306    
## Powerj                                       0.0798 .  
## Powerk                                       0.9151    
## Powerl                                       0.6170    
## Powerm                                       0.1240    
## Powern                                       0.9357    
## Powero                                       0.9822    
## BrandAdjJapanese (except Nissan) or Korean   0.9529    
## BrandAdjMercedes, Chrysler or BMW            0.9206    
## BrandAdjOpel, General Motors or Ford         0.7257    
## BrandAdjother                                0.8007    
## BrandAdjRenault, Nissan or Citroen           0.4171    
## BrandAdjVolkswagen, Audi, Skoda or Seat      0.8661    
## RegionAdjBasse-Normandie                     0.8820    
## RegionAdjBretagne                            0.3425    
## RegionAdjCentre                              0.3227    
## RegionAdjHaute-Normandie                     0.6187    
## RegionAdjIle-de-France                       0.8917    
## RegionAdjLimousin                            0.9568    
## RegionAdjNord-Pas-de-Calais                  0.6592    
## RegionAdjPays-de-la-Loire                    0.7284    
## RegionAdjPoitou-Charentes                    0.6268    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 11.99684)
## 
##     Null deviance: 5596.7  on 4383  degrees of freedom
## Residual deviance: 5234.5  on 4346  degrees of freedom
## AIC: 73266
## 
## Number of Fisher Scoring iterations: 10

检验模型的合理性

train_pred_Amount <- predict(Gamma_model,train_data_Amount,type = "response")
summary(train_pred_Amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   653.9  1295.6  1509.6  1598.6  1835.1  6862.0
summary(train_data_Amount$ClaimAmount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2     695    1150    1607    1223  254944

很明显对于小额算是和大额损失建模并不理想

upper_bound <- quantile(train_data_Amount$ClaimAmount, 0.99)
train_clean <- subset(train_data_Amount, ClaimAmount < upper_bound)
summary(train_clean$ClaimAmount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2     681    1148    1175    1219    8903
model_clean <- glm(ClaimAmount~CarAge+DriverAgeGroup+Gas+logDensity+Power+BrandAdj+RegionAdj,data = train_clean,family = Gamma(link = "log"))
summary(model_clean)
## 
## Call:
## glm(formula = ClaimAmount ~ CarAge + DriverAgeGroup + Gas + logDensity + 
##     Power + BrandAdj + RegionAdj, family = Gamma(link = "log"), 
##     data = train_clean)
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                 7.0957448  0.1066508  66.532
## CarAge                                     -0.0063351  0.0025687  -2.466
## DriverAgeGroup30-34                        -0.0364012  0.0577943  -0.630
## DriverAgeGroup35-39                        -0.0574252  0.0557395  -1.030
## DriverAgeGroup40-44                         0.0287509  0.0546935   0.526
## DriverAgeGroup45-49                        -0.0131701  0.0539641  -0.244
## DriverAgeGroup50-54                        -0.0198147  0.0533901  -0.371
## DriverAgeGroup55-64                        -0.0130412  0.0526658  -0.248
## DriverAgeGroup65-74                         0.0159976  0.0576650   0.277
## DriverAgeGroup75及以上                      0.1251510  0.0636797   1.965
## GasRegular                                 -0.0405220  0.0274470  -1.476
## logDensity                                  0.0051268  0.0079489   0.645
## Powere                                      0.0915564  0.0441485   2.074
## Powerf                                      0.0490266  0.0434099   1.129
## Powerg                                      0.0353665  0.0429824   0.823
## Powerh                                     -0.0332626  0.0647947  -0.513
## Poweri                                      0.0455577  0.0689136   0.661
## Powerj                                      0.1223444  0.0713591   1.714
## Powerk                                      0.0154525  0.0941853   0.164
## Powerl                                     -0.0009286  0.1461375  -0.006
## Powerm                                      0.1810393  0.1624593   1.114
## Powern                                      0.0505307  0.2027080   0.249
## Powero                                      0.1500743  0.2846490   0.527
## BrandAdjJapanese (except Nissan) or Korean  0.1307475  0.0884079   1.479
## BrandAdjMercedes, Chrysler or BMW           0.0381039  0.0822158   0.463
## BrandAdjOpel, General Motors or Ford        0.0064714  0.0702297   0.092
## BrandAdjother                               0.0091774  0.0951726   0.096
## BrandAdjRenault, Nissan or Citroen          0.0312653  0.0625135   0.500
## BrandAdjVolkswagen, Audi, Skoda or Seat     0.0669240  0.0737098   0.908
## RegionAdjBasse-Normandie                   -0.1260140  0.0829448  -1.519
## RegionAdjBretagne                          -0.0249851  0.0604742  -0.413
## RegionAdjCentre                            -0.0779555  0.0550752  -1.415
## RegionAdjHaute-Normandie                   -0.0191123  0.1335339  -0.143
## RegionAdjIle-de-France                     -0.1247256  0.0675743  -1.846
## RegionAdjLimousin                           0.0242156  0.1293909   0.187
## RegionAdjNord-Pas-de-Calais                -0.0162660  0.0821808  -0.198
## RegionAdjPays-de-la-Loire                  -0.0658295  0.0631368  -1.043
## RegionAdjPoitou-Charentes                  -0.0726122  0.0757043  -0.959
##                                            Pr(>|t|)    
## (Intercept)                                  <2e-16 ***
## CarAge                                       0.0137 *  
## DriverAgeGroup30-34                          0.5288    
## DriverAgeGroup35-39                          0.3030    
## DriverAgeGroup40-44                          0.5991    
## DriverAgeGroup45-49                          0.8072    
## DriverAgeGroup50-54                          0.7106    
## DriverAgeGroup55-64                          0.8044    
## DriverAgeGroup65-74                          0.7815    
## DriverAgeGroup75及以上                       0.0494 *  
## GasRegular                                   0.1399    
## logDensity                                   0.5190    
## Powere                                       0.0382 *  
## Powerf                                       0.2588    
## Powerg                                       0.4107    
## Powerh                                       0.6077    
## Poweri                                       0.5086    
## Powerj                                       0.0865 .  
## Powerk                                       0.8697    
## Powerl                                       0.9949    
## Powerm                                       0.2652    
## Powern                                       0.8032    
## Powero                                       0.5981    
## BrandAdjJapanese (except Nissan) or Korean   0.1392    
## BrandAdjMercedes, Chrysler or BMW            0.6431    
## BrandAdjOpel, General Motors or Ford         0.9266    
## BrandAdjother                                0.9232    
## BrandAdjRenault, Nissan or Citroen           0.6170    
## BrandAdjVolkswagen, Audi, Skoda or Seat      0.3640    
## RegionAdjBasse-Normandie                     0.1288    
## RegionAdjBretagne                            0.6795    
## RegionAdjCentre                              0.1570    
## RegionAdjHaute-Normandie                     0.8862    
## RegionAdjIle-de-France                       0.0650 .  
## RegionAdjLimousin                            0.8516    
## RegionAdjNord-Pas-de-Calais                  0.8431    
## RegionAdjPays-de-la-Loire                    0.2972    
## RegionAdjPoitou-Charentes                    0.3375    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.6319334)
## 
##     Null deviance: 3128.3  on 4339  degrees of freedom
## Residual deviance: 3099.0  on 4302  degrees of freedom
## AIC: 69675
## 
## Number of Fisher Scoring iterations: 6
train_pred_Amount1 <- predict(model_clean,train_clean,type = "response")
summary(train_pred_Amount1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   892.3  1109.0  1169.3  1175.5  1234.2  1554.9
par(mfrow=c(1,1))
hist(dataori$ClaimAmount[dataori$ClaimNb>0&dataori$ClaimAmount<5000],breaks = 20,
     freq = FALSE,main="ClaimAmount")
lines(density(dataori$ClaimAmount[dataori$ClaimNb>0&dataori$ClaimAmount<5000]),col="red",lwd=1)

使用tweedie模型看看

library(statmod)
tweedie_model <- glm(ClaimAmount~ns(CarAge,3)+DriverAgeGroup+Gas+logDensity+Power+BrandAdj+RegionAdj,
             data = train_clean, family = tweedie(var.power = 1.5, link.power = 0))
summary(tweedie_model)
## 
## Call:
## glm(formula = ClaimAmount ~ ns(CarAge, 3) + DriverAgeGroup + 
##     Gas + logDensity + Power + BrandAdj + RegionAdj, family = tweedie(var.power = 1.5, 
##     link.power = 0), data = train_clean)
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                 7.1028947  0.1160541  61.203
## ns(CarAge, 3)1                             -0.1674943  0.0656113  -2.553
## ns(CarAge, 3)2                             -0.0552235  0.1450321  -0.381
## ns(CarAge, 3)3                              0.0521085  0.1679896   0.310
## DriverAgeGroup30-34                        -0.0376334  0.0581383  -0.647
## DriverAgeGroup35-39                        -0.0536680  0.0561281  -0.956
## DriverAgeGroup40-44                         0.0275997  0.0546564   0.505
## DriverAgeGroup45-49                        -0.0148583  0.0541236  -0.275
## DriverAgeGroup50-54                        -0.0224819  0.0535782  -0.420
## DriverAgeGroup55-64                        -0.0146246  0.0528267  -0.277
## DriverAgeGroup65-74                         0.0112031  0.0577619   0.194
## DriverAgeGroup75及以上                      0.1200067  0.0629136   1.907
## GasRegular                                 -0.0391757  0.0275289  -1.423
## logDensity                                  0.0047375  0.0079753   0.594
## Powere                                      0.0901775  0.0444654   2.028
## Powerf                                      0.0498820  0.0439126   1.136
## Powerg                                      0.0329894  0.0435817   0.757
## Powerh                                     -0.0342054  0.0662446  -0.516
## Poweri                                      0.0442652  0.0695846   0.636
## Powerj                                      0.1199580  0.0709156   1.692
## Powerk                                      0.0069533  0.0955835   0.073
## Powerl                                     -0.0129740  0.1472630  -0.088
## Powerm                                      0.1373273  0.1602180   0.857
## Powern                                      0.0643217  0.2022682   0.318
## Powero                                      0.1523082  0.2794901   0.545
## BrandAdjJapanese (except Nissan) or Korean  0.1267815  0.0879535   1.441
## BrandAdjMercedes, Chrysler or BMW           0.0321598  0.0830805   0.387
## BrandAdjOpel, General Motors or Ford        0.0020067  0.0710803   0.028
## BrandAdjother                              -0.0006022  0.0964146  -0.006
## BrandAdjRenault, Nissan or Citroen          0.0269032  0.0634093   0.424
## BrandAdjVolkswagen, Audi, Skoda or Seat     0.0605535  0.0742773   0.815
## RegionAdjBasse-Normandie                   -0.1252347  0.0832112  -1.505
## RegionAdjBretagne                          -0.0254981  0.0596325  -0.428
## RegionAdjCentre                            -0.0775929  0.0543373  -1.428
## RegionAdjHaute-Normandie                   -0.0164831  0.1323895  -0.125
## RegionAdjIle-de-France                     -0.1263059  0.0673984  -1.874
## RegionAdjLimousin                           0.0193881  0.1277755   0.152
## RegionAdjNord-Pas-de-Calais                -0.0140926  0.0809588  -0.174
## RegionAdjPays-de-la-Loire                  -0.0652617  0.0625392  -1.044
## RegionAdjPoitou-Charentes                  -0.0716682  0.0753007  -0.952
##                                            Pr(>|t|)    
## (Intercept)                                  <2e-16 ***
## ns(CarAge, 3)1                               0.0107 *  
## ns(CarAge, 3)2                               0.7034    
## ns(CarAge, 3)3                               0.7564    
## DriverAgeGroup30-34                          0.5175    
## DriverAgeGroup35-39                          0.3390    
## DriverAgeGroup40-44                          0.6136    
## DriverAgeGroup45-49                          0.7837    
## DriverAgeGroup50-54                          0.6748    
## DriverAgeGroup55-64                          0.7819    
## DriverAgeGroup65-74                          0.8462    
## DriverAgeGroup75及以上                       0.0565 .  
## GasRegular                                   0.1548    
## logDensity                                   0.5525    
## Powere                                       0.0426 *  
## Powerf                                       0.2560    
## Powerg                                       0.4491    
## Powerh                                       0.6056    
## Poweri                                       0.5247    
## Powerj                                       0.0908 .  
## Powerk                                       0.9420    
## Powerl                                       0.9298    
## Powerm                                       0.3914    
## Powern                                       0.7505    
## Powero                                       0.5858    
## BrandAdjJapanese (except Nissan) or Korean   0.1495    
## BrandAdjMercedes, Chrysler or BMW            0.6987    
## BrandAdjOpel, General Motors or Ford         0.9775    
## BrandAdjother                                0.9950    
## BrandAdjRenault, Nissan or Citroen           0.6714    
## BrandAdjVolkswagen, Audi, Skoda or Seat      0.4150    
## RegionAdjBasse-Normandie                     0.1324    
## RegionAdjBretagne                            0.6690    
## RegionAdjCentre                              0.1534    
## RegionAdjHaute-Normandie                     0.9009    
## RegionAdjIle-de-France                       0.0610 .  
## RegionAdjLimousin                            0.8794    
## RegionAdjNord-Pas-de-Calais                  0.8618    
## RegionAdjPays-de-la-Loire                    0.2968    
## RegionAdjPoitou-Charentes                    0.3413    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Tweedie family taken to be 21.7472)
## 
##     Null deviance: 84911  on 4339  degrees of freedom
## Residual deviance: 83858  on 4300  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4
train_pred_tweedie <- predict(tweedie_model,train_clean,type = "response")
summary(train_pred_tweedie)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   876.2  1106.8  1168.6  1175.4  1237.1  1538.8
plot(residuals(tweedie_model, type = "deviance") ~ fitted(tweedie_model))