R Markdown

GLM’s for pricing…

install.packages("CASdatasets", repos = "http://cas.uqam.ca/pub/", type="source")
library("CASdatasets")
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sp
data(freMTPLfreq)
data<-data.frame(freMTPLfreq)
remove(freMTPLfreq)

Now, we can explore the dataset:

names(data)
##  [1] "PolicyID"  "ClaimNb"   "Exposure"  "Power"     "CarAge"    "DriverAge"
##  [7] "Brand"     "Gas"       "Region"    "Density"
table(data$ClaimNb)
## 
##      0      1      2      3      4 
## 397779  14633    726     28      3
data$Claim<- ifelse(data$ClaimNb>0,1,0)
table(data$Claim)
## 
##      0      1 
## 397779  15390
prop.table(table(data$Claim))
## 
##          0          1 
## 0.96275132 0.03724868
length(data$Claim)
## [1] 413169

Contingency tables

table(data$Claim,data$Gas)
##    
##     Diesel Regular
##   0 197904  199875
##   1   8041    7349
prop.table(table(data$Claim,data$Gas))
##    
##         Diesel    Regular
##   0 0.47899044 0.48376088
##   1 0.01946177 0.01778691

Chi-square test for proportions:

table(data$Claim,data$Gas)
##    
##     Diesel Regular
##   0 197904  199875
##   1   8041    7349
prop.table(table(data$Claim,data$Gas))
##    
##         Diesel    Regular
##   0 0.47899044 0.48376088
##   1 0.01946177 0.01778691
chisq.test(table(data$Claim,data$Gas))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$Claim, data$Gas)
## X-squared = 36.823, df = 1, p-value = 1.294e-09

Relative risks, odds and odds ratio:

t1<-data.frame(table(data$Claim,data$Gas))
n11<- t1[1,3]
n12<- t1[3,3]
n21<- t1[2,3]
n22<- t1[4,3]

pi1<-n21/(n21+n11)
pi2<-n22/(n12+n22)
pi1
## [1] 0.03904441
pi2
## [1] 0.03546404

Relative risk:

rr<-pi1/pi2

Odds:

odds1<- pi1/(1-pi1)
odds2<- pi2/(1-pi2)
odds1
## [1] 0.04063081
odds2
## [1] 0.03676798

Odds ratio:

theta<- odds1/odds2
theta
## [1] 1.10506

Logistic regression for at least one claim:

We start with a null model, this is … just including the dependent variable.

names(data)
##  [1] "PolicyID"  "ClaimNb"   "Exposure"  "Power"     "CarAge"    "DriverAge"
##  [7] "Brand"     "Gas"       "Region"    "Density"   "Claim"
table(data$Claim)
## 
##      0      1 
## 397779  15390
model1<- glm(Claim ~1, data=data,family="binomial")
summary(model1)
## 
## Call:
## glm(formula = Claim ~ 1, family = "binomial", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2755  -0.2755  -0.2755  -0.2755   2.5652  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.252179   0.008215  -395.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131470  on 413168  degrees of freedom
## Residual deviance: 131470  on 413168  degrees of freedom
## AIC: 131472
## 
## Number of Fisher Scoring iterations: 6

We can obtain the odds and then the probability of at least one claim:

exp(coef(model1))
## (Intercept) 
##  0.03868983
odds<-exp(coef(model1))
odds/(odds+1)
## (Intercept) 
##  0.03724868
prop.table(table(data$Claim))
## 
##          0          1 
## 0.96275132 0.03724868

A logistic regression controling for type of Gas(0=Diesel, 1=Regular)

model2<-glm(Claim~Gas, family="binomial", data=data)
summary(model2)
## 
## Call:
## glm(formula = Claim ~ Gas, family = "binomial", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2822  -0.2822  -0.2687  -0.2687   2.5843  
## 
## Coefficients:
##             Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -3.20323    0.01138 -281.575  < 2e-16 ***
## GasRegular  -0.09990    0.01645   -6.074 1.25e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131470  on 413168  degrees of freedom
## Residual deviance: 131433  on 413167  degrees of freedom
## AIC: 131437
## 
## Number of Fisher Scoring iterations: 6
exp(coef(model2))
## (Intercept)  GasRegular 
##  0.04063081  0.90492853
odds<-exp(coef(model2))

And again, we can estimate the odds:

odds<-exp(coef(model2))
odd1<-odds[1]
odd2<-odds[2]
odd1
## (Intercept) 
##  0.04063081
odd2
## GasRegular 
##  0.9049285
odd1*odd2
## (Intercept) 
##  0.03676798
p<-(odd1*odd2)/(odd1*odd2+1)
p
## (Intercept) 
##  0.03546404

Now, a new model with more covariates:

model3<- glm(Claim~Exposure+CarAge+DriverAge+Gas+Density, 
             family="binomial",data=data)
summary(model3)
## 
## Call:
## glm(formula = Claim ~ Exposure + CarAge + DriverAge + Gas + Density, 
##     family = "binomial", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7024  -0.3219  -0.2573  -0.2088   3.0029  
## 
## Coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -3.641e+00  3.258e-02 -111.746  < 2e-16 ***
## Exposure     1.281e+00  2.500e-02   51.220  < 2e-16 ***
## CarAge      -3.682e-03  1.517e-03   -2.428   0.0152 *  
## DriverAge   -8.095e-03  5.977e-04  -13.544  < 2e-16 ***
## GasRegular  -1.217e-01  1.689e-02   -7.207 5.74e-13 ***
## Density      1.507e-05  1.652e-06    9.119  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131470  on 413168  degrees of freedom
## Residual deviance: 128559  on 413163  degrees of freedom
## AIC: 128571
## 
## Number of Fisher Scoring iterations: 6
exp(coef(model3))
## (Intercept)    Exposure      CarAge   DriverAge  GasRegular     Density 
##  0.02622766  3.59867946  0.99632436  0.99193789  0.88540484  1.00001507