R Markdown

GLM’s for pricing…

install.packages("CASdatasets", repos = "http://cas.uqam.ca/pub/", type="source")
library("CASdatasets")

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sp

data(freMTPLfreq)
data<-data.frame(freMTPLfreq)
remove(freMTPLfreq)

Now, we can explore the dataset:

names(data)

##  [1] "PolicyID"  "ClaimNb"   "Exposure"  "Power"     "CarAge"    "DriverAge"
##  [7] "Brand"     "Gas"       "Region"    "Density"

table(data$ClaimNb)

## 
##      0      1      2      3      4 
## 397779  14633    726     28      3

data$Claim<- ifelse(data$ClaimNb>0,1,0)
table(data$Claim)

## 
##      0      1 
## 397779  15390

prop.table(table(data$Claim))

## 
##          0          1 
## 0.96275132 0.03724868

length(data$Claim)

## [1] 413169

Contingency tables

table(data$Claim,data$Gas)

##    
##     Diesel Regular
##   0 197904  199875
##   1   8041    7349

prop.table(table(data$Claim,data$Gas))

##    
##         Diesel    Regular
##   0 0.47899044 0.48376088
##   1 0.01946177 0.01778691

Chi-square test for proportions:

table(data$Claim,data$Gas)

##    
##     Diesel Regular
##   0 197904  199875
##   1   8041    7349

prop.table(table(data$Claim,data$Gas))

##    
##         Diesel    Regular
##   0 0.47899044 0.48376088
##   1 0.01946177 0.01778691

chisq.test(table(data$Claim,data$Gas))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$Claim, data$Gas)
## X-squared = 36.823, df = 1, p-value = 1.294e-09

Relative risks, odds and odds ratio:

t1<-data.frame(table(data$Claim,data$Gas))
n11<- t1[1,3]
n12<- t1[3,3]
n21<- t1[2,3]
n22<- t1[4,3]

pi1<-n21/(n21+n11)
pi2<-n22/(n12+n22)
pi1

## [1] 0.03904441

pi2

## [1] 0.03546404

Relative risk:

rr<-pi1/pi2

Odds:

odds1<- pi1/(1-pi1)
odds2<- pi2/(1-pi2)
odds1

## [1] 0.04063081

odds2

## [1] 0.03676798

Odds ratio:

theta<- odds1/odds2
theta

## [1] 1.10506

Logistic regression for at least one claim:

We start with a null model, this is … just including the dependent variable.

names(data)

##  [1] "PolicyID"  "ClaimNb"   "Exposure"  "Power"     "CarAge"    "DriverAge"
##  [7] "Brand"     "Gas"       "Region"    "Density"   "Claim"

table(data$Claim)

## 
##      0      1 
## 397779  15390

model1<- glm(Claim ~1, data=data,family="binomial")
summary(model1)

## 
## Call:
## glm(formula = Claim ~ 1, family = "binomial", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2755  -0.2755  -0.2755  -0.2755   2.5652  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.252179   0.008215  -395.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131470  on 413168  degrees of freedom
## Residual deviance: 131470  on 413168  degrees of freedom
## AIC: 131472
## 
## Number of Fisher Scoring iterations: 6

We can obtain the odds and then the probability of at least one claim:

exp(coef(model1))

## (Intercept) 
##  0.03868983

odds<-exp(coef(model1))
odds/(odds+1)

## (Intercept) 
##  0.03724868

prop.table(table(data$Claim))

## 
##          0          1 
## 0.96275132 0.03724868

A logistic regression controling for type of Gas(0=Diesel, 1=Regular)

model2<-glm(Claim~Gas, family="binomial", data=data)
summary(model2)

## 
## Call:
## glm(formula = Claim ~ Gas, family = "binomial", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2822  -0.2822  -0.2687  -0.2687   2.5843  
## 
## Coefficients:
##             Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -3.20323    0.01138 -281.575  < 2e-16 ***
## GasRegular  -0.09990    0.01645   -6.074 1.25e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131470  on 413168  degrees of freedom
## Residual deviance: 131433  on 413167  degrees of freedom
## AIC: 131437
## 
## Number of Fisher Scoring iterations: 6

exp(coef(model2))

## (Intercept)  GasRegular 
##  0.04063081  0.90492853

odds<-exp(coef(model2))

And again, we can estimate the odds:

odds<-exp(coef(model2))
odd1<-odds[1]
odd2<-odds[2]
odd1

## (Intercept) 
##  0.04063081

odd2

## GasRegular 
##  0.9049285

odd1*odd2

## (Intercept) 
##  0.03676798

p<-(odd1*odd2)/(odd1*odd2+1)
p

## (Intercept) 
##  0.03546404

Now, a new model with more covariates:

model3<- glm(Claim~Exposure+CarAge+DriverAge+Gas+Density, 
             family="binomial",data=data)
summary(model3)

## 
## Call:
## glm(formula = Claim ~ Exposure + CarAge + DriverAge + Gas + Density, 
##     family = "binomial", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7024  -0.3219  -0.2573  -0.2088   3.0029  
## 
## Coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -3.641e+00  3.258e-02 -111.746  < 2e-16 ***
## Exposure     1.281e+00  2.500e-02   51.220  < 2e-16 ***
## CarAge      -3.682e-03  1.517e-03   -2.428   0.0152 *  
## DriverAge   -8.095e-03  5.977e-04  -13.544  < 2e-16 ***
## GasRegular  -1.217e-01  1.689e-02   -7.207 5.74e-13 ***
## Density      1.507e-05  1.652e-06    9.119  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131470  on 413168  degrees of freedom
## Residual deviance: 128559  on 413163  degrees of freedom
## AIC: 128571
## 
## Number of Fisher Scoring iterations: 6

exp(coef(model3))

## (Intercept)    Exposure      CarAge   DriverAge  GasRegular     Density 
##  0.02622766  3.59867946  0.99632436  0.99193789  0.88540484  1.00001507

LAT4082_S12

Mauricio Rodriguez Abreu

3/10/2022

R Markdown

GLM’s for pricing…

Contingency tables