Preparing Data
CCI0<-read.csv("c:/R/CanadaCarInsurance.csv")
colnames(CCI0)<-c("Kilometres","Zone","Bonus","Make","Insured","Claims","Payment")
CCI0$Kilometres<-as.factor(CCI0$Kilometres)
CCI0$Zone<-as.factor(CCI0$Zone)
CCI0$Make<-as.factor(CCI0$Make)
CCI<-CCI0[-which(CCI0$Insured==0),]
CCI<-CCI0[-which(CCI0$Claims==0),]
EDA
head(CCI);str(CCI)
## Kilometres Zone Bonus Make Insured Claims Payment
## 1 1 1 1 1 455 108 392491
## 2 1 1 1 2 69 19 46221
## 3 1 1 1 3 73 13 15694
## 4 1 1 1 4 1292 124 422201
## 5 1 1 1 5 191 40 119373
## 6 1 1 1 6 478 57 170913
## 'data.frame': 1797 obs. of 7 variables:
## $ Kilometres: Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Zone : Factor w/ 7 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Bonus : int 1 1 1 1 1 1 1 1 1 2 ...
## $ Make : Factor w/ 9 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 1 ...
## $ Insured : int 455 69 73 1292 191 478 106 33 9998 315 ...
## $ Claims : int 108 19 13 124 40 57 23 14 1704 45 ...
## $ Payment : int 392491 46221 15694 422201 119373 170913 56940 77487 6805992 214011 ...
library(ggplot2);require(MASS)
## Loading required package: MASS
ggplot(CCI,aes(x=CCI$Claims))+geom_histogram(binwidth = 100) #large 0

Fitting
##Bonus as a Continuous
fit1<-glm(Payment/Claims~Kilometres+Zone+Bonus+Make+Kilometres:Zone+Kilometres:Make+Zone:Make,
family=Gamma("log"),weights = Claims,data=CCI)
#selection with AIC
stepAIC(fit1,direction="both")$anova #AIC 1870714
## Start: AIC=1870714
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
##
## Df Deviance AIC
## - Zone:Make 48 4383.2 1870671
## - Kilometres:Make 32 4327.3 1870683
## - Kilometres:Zone 24 4319.3 1870696
## <none> 4237.7 1870714
## - Bonus 1 4411.5 1870775
##
## Step: AIC=1874465
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1673 4237.691 1870714
stepAIC(fit1,direction="forward")$anova
## Start: AIC=1870714
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1673 4237.691 1870714
stepAIC(fit1,direction="backward")$anova
## Start: AIC=1870714
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
##
## Df Deviance AIC
## - Zone:Make 48 4383.2 1870671
## - Kilometres:Make 32 4327.3 1870683
## - Kilometres:Zone 24 4319.3 1870696
## <none> 4237.7 1870714
## - Bonus 1 4411.5 1870775
##
## Step: AIC=1874465
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1673 4237.691 1870714
##Bonus as a Factor =>Better
fit2<-glm(Payment/Claims~Kilometres+Zone+factor(Bonus)+Make+Kilometres:Zone+
Kilometres:Make+Zone:Make+factor(Bonus):Kilometres+factor(Bonus):Zone+
factor(Bonus):Make,
family=Gamma("log"),weights = Claims,data=CCI)
#selection with AIC
stepAIC(fit2,direction="both")$anova
## Start: AIC=1859773
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
##
## Df Deviance AIC
## - Zone:Make 48 3984.4 1859730
## - factor(Bonus):Make 48 3985.9 1859731
## - Kilometres:Make 32 3932.0 1859743
## - Kilometres:factor(Bonus) 24 3912.7 1859752
## - Kilometres:Zone 24 3915.8 1859753
## - Zone:factor(Bonus) 36 4000.8 1859760
## <none> 3841.8 1859773
##
## Step: AIC=1863828
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) +
## factor(Bonus):Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) +
## factor(Bonus):Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1560 3841.752 1859773
stepAIC(fit2,direction="forward")$anova
## Start: AIC=1859773
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1560 3841.752 1859773
stepAIC(fit2,direction="backward")$anova
## Start: AIC=1859773
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
##
## Df Deviance AIC
## - Zone:Make 48 3984.4 1859730
## - factor(Bonus):Make 48 3985.9 1859731
## - Kilometres:Make 32 3932.0 1859743
## - Kilometres:factor(Bonus) 24 3912.7 1859752
## - Kilometres:Zone 24 3915.8 1859753
## - Zone:factor(Bonus) 36 4000.8 1859760
## <none> 3841.8 1859773
##
## Step: AIC=1863828
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) +
## factor(Bonus):Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) +
## factor(Bonus):Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1560 3841.752 1859773
#without interaction (bonus as a factor)
fit3<-glm(Payment/Claims~Kilometres+Zone+factor(Bonus)+Make,
family=Gamma("log"),weights = Claims,data=CCI)
#without interaction (bonus as a factor) (with grouping)
CCI[which(CCI$Kilometres==2 | CCI$Kilometres==3),"Kilometres"]<-2
CCI[which(CCI$Zone==2 | CCI$Zone==7),"Zone"]<-2
CCI[which(CCI$Bonus==3 | CCI$Bonus==6),"Bonus"]<-3
CCI[which(CCI$Make==2 | CCI$Make==6),"Bonus"]<-2
CCI2<-CCI[,c("Kilometres","Zone","Bonus","Make","Claims","Payment")]
fit4<-glm(Payment/Claims~.*.,
family=Gamma("log"),weights = Claims,data=CCI2)
step(fit4,direction="both")$anova
## Start: AIC=1872750
## Payment/Claims ~ (Kilometres + Zone + Bonus + Make) * (Kilometres +
## Zone + Bonus + Make)
##
## Df Deviance AIC
## - Zone:Make 40 4420.3 1872707
## - Kilometres:Make 24 4377.7 1872724
## - Kilometres:Zone 15 4353.0 1872733
## - Zone:Bonus 5 4320.3 1872742
## - Bonus:Make 6 4329.8 1872743
## - Kilometres:Bonus 3 4322.7 1872747
## <none> 4315.1 1872750
##
## Step: AIC=1875414
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Bonus + Kilometres:Make + Zone:Bonus + Bonus:Make
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 1686 4315.114 1872750
Histogram of coef
# Kilo<-data.frame("name"=c("K1","K2","K3","K4","K5"),"Kilo"=summary(fit3)$coef[1:5,1])
# ggplot(Kilo,aes(x=name,y=Kilo))+geom_col(fill="#8491B4B2")+labs(title="Kilometres")
# Zone<-data.frame("name"=c("Z1","Z2","Z3","Z4","Z5","Z6"),"Kilo"=summary(fit3)$coef[6:11,1])
# ggplot(Kilo,aes(x=name,y=Kilo))+geom_col(fill="#8491B4B2")+labs(title="Kilometres")
# Bonus<-data.frame("name"=c("B1","B2","B3","B4","B5"),"Kilo"=summary(fit3)$coef[12:17,1])
# ggplot(Kilo,aes(x=name,y=Kilo))+geom_col(fill="#8491B4B2")+labs(title="Kilometres")
# Make<-data.frame("name"=c("K1","K2","K3","K4","K5"),"Kilo"=summary(fit3)$coef[1:5,1])
# ggplot(Kilo,aes(x=name,y=Kilo))+geom_col(fill="#8491B4B2")+labs(title="Kilometres")
Compare
#Bonus as a Continuous
stepAIC(fit1,direction="both")$anova
## Start: AIC=1870714
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
##
## Df Deviance AIC
## - Zone:Make 48 4383.2 1870671
## - Kilometres:Make 32 4327.3 1870683
## - Kilometres:Zone 24 4319.3 1870696
## <none> 4237.7 1870714
## - Bonus 1 4411.5 1870775
##
## Step: AIC=1876136
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1673 4237.691 1870714
#Bonus as a Factor
stepAIC(fit2,direction="both")$anova
## Start: AIC=1859773
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
##
## Df Deviance AIC
## - Zone:Make 48 3984.4 1859730
## - factor(Bonus):Make 48 3985.9 1859731
## - Kilometres:Make 32 3932.0 1859743
## - Kilometres:factor(Bonus) 24 3912.7 1859752
## - Kilometres:Zone 24 3915.8 1859753
## - Zone:factor(Bonus) 36 4000.8 1859760
## <none> 3841.8 1859773
##
## Step: AIC=1868697
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) +
## factor(Bonus):Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Zone:Make + factor(Bonus):Kilometres +
## factor(Bonus):Zone + factor(Bonus):Make
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone +
## Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) +
## factor(Bonus):Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1560 3841.752 1859773
#without interaction (bonus as a factor)
stepAIC(fit3,direction="both")$anova
## Start: AIC=1878028
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make
##
## Df Deviance AIC
## - Kilometres 4 4547.3 1878027
## <none> 4526.6 1878028
## - Make 8 4712.6 1878075
## - factor(Bonus) 6 4734.3 1878087
## - Zone 6 4866.0 1878131
##
## Step: AIC=1878573
## Payment/Claims ~ Zone + factor(Bonus) + Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make
##
## Final Model:
## Payment/Claims ~ Zone + factor(Bonus) + Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1772 4526.591 1878028
#without interaction (bonus as a factor) (with grouping)
stepAIC(fit4,direction="both")$anova
## Start: AIC=1872750
## Payment/Claims ~ (Kilometres + Zone + Bonus + Make) * (Kilometres +
## Zone + Bonus + Make)
##
## Df Deviance AIC
## - Zone:Make 40 4420.3 1872707
## - Kilometres:Make 24 4377.7 1872724
## - Kilometres:Zone 15 4353.0 1872733
## - Zone:Bonus 5 4320.3 1872742
## - Bonus:Make 6 4329.8 1872743
## - Kilometres:Bonus 3 4322.7 1872747
## <none> 4315.1 1872750
##
## Step: AIC=1875414
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Bonus + Kilometres:Make + Zone:Bonus + Bonus:Make
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Payment/Claims ~ (Kilometres + Zone + Bonus + Make) * (Kilometres +
## Zone + Bonus + Make)
##
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone +
## Kilometres:Bonus + Kilometres:Make + Zone:Bonus + Bonus:Make
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1686 4315.114 1872750
Plot
library(reshape2)
fit.final<-glm(Payment/Claims~Kilometres+Zone+factor(Bonus)+Make+Kilometres:Zone+
Kilometres:Make+factor(Bonus):Kilometres+factor(Bonus):Zone+
factor(Bonus):Make,
family=Gamma("log"),weights = Claims,data=CCI)
res<-data.frame("pearson"=residuals(fit.final,type="pearson"),"deviance"=summary(fit.final)$deviance.resid)
res2<-melt(res)
## No id variables; using all as measure variables
#boxplot
ggplot(res2,aes(x="",y=value,fill=variable))+geom_boxplot()+labs(title="Boxplot")

#Histogram
ggplot(res,aes(pearson))+geom_histogram(aes(y=..density..),fill="#8491B4B2")+geom_density(color="#3C5488B2")+labs(title="Histogram for pearson")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(res,aes(deviance))+geom_histogram(aes(y=..density..),fill="#8491B4B2")+geom_density(color="#3C5488B2")+labs(title="Histogram for deviance")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Normal Q-Q plot
pear<-data.frame("pearson.q"=qnorm(rank(res$pearson)/(NROW(res$pearson)+1)),"pearson.res"=res$pearson)
dev<-data.frame("deviance.q"=qnorm(rank(res$deviance)/(NROW(res$deviance)+1)),"deviance.res"=res$deviance)
ggplot(pear,aes(x=pearson.q,y=pearson.res))+geom_point(color="#F39B7FB2",size=1)+geom_smooth(method="lm",color="#DC0000B2")+
labs(title=paste("Normal Q-Q plot of Pearson, R^2=",round(summary(lm(pearson.q~pearson.res,data=pear))$adj.r.squared,3)))

ggplot(dev,aes(x=deviance.q,y=deviance.res))+geom_point(color="#F39B7FB2",size=1)+geom_smooth(method="lm",color="#DC0000B2")+
labs(title=paste("Normal Q-Q plot of deviance, R^2=",round(summary(lm(deviance.q~deviance.res,data=dev))$adj.r.squared,3)))
