install.packages("CASdatasets", repos = "http://cas.uqam.ca/pub/", type="source")
library("CASdatasets")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sp
data(freMTPLfreq)
data<-data.frame(freMTPLfreq)
remove(freMTPLfreq)
Now, we can explore the dataset:
names(data)
## [1] "PolicyID" "ClaimNb" "Exposure" "Power" "CarAge" "DriverAge"
## [7] "Brand" "Gas" "Region" "Density"
table(data$ClaimNb)
##
## 0 1 2 3 4
## 397779 14633 726 28 3
data$Claim<- ifelse(data$ClaimNb>0,1,0)
table(data$Claim)
##
## 0 1
## 397779 15390
prop.table(table(data$Claim))
##
## 0 1
## 0.96275132 0.03724868
length(data$Claim)
## [1] 413169
table(data$Claim,data$Gas)
##
## Diesel Regular
## 0 197904 199875
## 1 8041 7349
prop.table(table(data$Claim,data$Gas))
##
## Diesel Regular
## 0 0.47899044 0.48376088
## 1 0.01946177 0.01778691
Chi-square test for proportions:
table(data$Claim,data$Gas)
##
## Diesel Regular
## 0 197904 199875
## 1 8041 7349
prop.table(table(data$Claim,data$Gas))
##
## Diesel Regular
## 0 0.47899044 0.48376088
## 1 0.01946177 0.01778691
chisq.test(table(data$Claim,data$Gas))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(data$Claim, data$Gas)
## X-squared = 36.823, df = 1, p-value = 1.294e-09
Relative risks, odds and odds ratio:
t1<-data.frame(table(data$Claim,data$Gas))
n11<- t1[1,3]
n12<- t1[3,3]
n21<- t1[2,3]
n22<- t1[4,3]
pi1<-n21/(n21+n11)
pi2<-n22/(n12+n22)
pi1
## [1] 0.03904441
pi2
## [1] 0.03546404
Relative risk:
rr<-pi1/pi2
Odds:
odds1<- pi1/(1-pi1)
odds2<- pi2/(1-pi2)
odds1
## [1] 0.04063081
odds2
## [1] 0.03676798
Odds ratio:
theta<- odds1/odds2
theta
## [1] 1.10506
Logistic regression for at least one claim:
We start with a null model, this is … just including the dependent variable.
names(data)
## [1] "PolicyID" "ClaimNb" "Exposure" "Power" "CarAge" "DriverAge"
## [7] "Brand" "Gas" "Region" "Density" "Claim"
table(data$Claim)
##
## 0 1
## 397779 15390
model1<- glm(Claim ~1, data=data,family="binomial")
summary(model1)
##
## Call:
## glm(formula = Claim ~ 1, family = "binomial", data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2755 -0.2755 -0.2755 -0.2755 2.5652
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.252179 0.008215 -395.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 131470 on 413168 degrees of freedom
## Residual deviance: 131470 on 413168 degrees of freedom
## AIC: 131472
##
## Number of Fisher Scoring iterations: 6
We can obtain the odds and then the probability of at least one claim:
exp(coef(model1))
## (Intercept)
## 0.03868983
odds<-exp(coef(model1))
odds/(odds+1)
## (Intercept)
## 0.03724868
prop.table(table(data$Claim))
##
## 0 1
## 0.96275132 0.03724868
A logistic regression controling for type of Gas(0=Diesel, 1=Regular)
model2<-glm(Claim~Gas, family="binomial", data=data)
summary(model2)
##
## Call:
## glm(formula = Claim ~ Gas, family = "binomial", data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2822 -0.2822 -0.2687 -0.2687 2.5843
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.20323 0.01138 -281.575 < 2e-16 ***
## GasRegular -0.09990 0.01645 -6.074 1.25e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 131470 on 413168 degrees of freedom
## Residual deviance: 131433 on 413167 degrees of freedom
## AIC: 131437
##
## Number of Fisher Scoring iterations: 6
exp(coef(model2))
## (Intercept) GasRegular
## 0.04063081 0.90492853
odds<-exp(coef(model2))
And again, we can estimate the odds:
odds<-exp(coef(model2))
odd1<-odds[1]
odd2<-odds[2]
odd1
## (Intercept)
## 0.04063081
odd2
## GasRegular
## 0.9049285
odd1*odd2
## (Intercept)
## 0.03676798
p<-(odd1*odd2)/(odd1*odd2+1)
p
## (Intercept)
## 0.03546404
Now, a new model with more covariates:
model3<- glm(Claim~Exposure+CarAge+DriverAge+Gas+Density,
family="binomial",data=data)
summary(model3)
##
## Call:
## glm(formula = Claim ~ Exposure + CarAge + DriverAge + Gas + Density,
## family = "binomial", data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7024 -0.3219 -0.2573 -0.2088 3.0029
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.641e+00 3.258e-02 -111.746 < 2e-16 ***
## Exposure 1.281e+00 2.500e-02 51.220 < 2e-16 ***
## CarAge -3.682e-03 1.517e-03 -2.428 0.0152 *
## DriverAge -8.095e-03 5.977e-04 -13.544 < 2e-16 ***
## GasRegular -1.217e-01 1.689e-02 -7.207 5.74e-13 ***
## Density 1.507e-05 1.652e-06 9.119 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 131470 on 413168 degrees of freedom
## Residual deviance: 128559 on 413163 degrees of freedom
## AIC: 128571
##
## Number of Fisher Scoring iterations: 6
exp(coef(model3))
## (Intercept) Exposure CarAge DriverAge GasRegular Density
## 0.02622766 3.59867946 0.99632436 0.99193789 0.88540484 1.00001507