Preparing Data

CCI0<-read.csv("c:/R/CanadaCarInsurance.csv")
colnames(CCI0)<-c("Kilometres","Zone","Bonus","Make","Insured","Claims","Payment")
CCI0$Kilometres<-as.factor(CCI0$Kilometres)
CCI0$Zone<-as.factor(CCI0$Zone)
CCI0$Make<-as.factor(CCI0$Make)
CCI<-CCI0[-which(CCI0$Insured==0),]
CCI<-CCI0[-which(CCI0$Claims==0),]

EDA

head(CCI);str(CCI)
##   Kilometres Zone Bonus Make Insured Claims Payment
## 1          1    1     1    1     455    108  392491
## 2          1    1     1    2      69     19   46221
## 3          1    1     1    3      73     13   15694
## 4          1    1     1    4    1292    124  422201
## 5          1    1     1    5     191     40  119373
## 6          1    1     1    6     478     57  170913
## 'data.frame':    1797 obs. of  7 variables:
##  $ Kilometres: Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Zone      : Factor w/ 7 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Bonus     : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Make      : Factor w/ 9 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 1 ...
##  $ Insured   : int  455 69 73 1292 191 478 106 33 9998 315 ...
##  $ Claims    : int  108 19 13 124 40 57 23 14 1704 45 ...
##  $ Payment   : int  392491 46221 15694 422201 119373 170913 56940 77487 6805992 214011 ...
library(ggplot2);require(MASS)
## Loading required package: MASS
ggplot(CCI,aes(x=CCI$Claims))+geom_histogram(binwidth = 100) #large 0

Fitting

##Bonus as a Continuous
fit1<-glm(Payment/Claims~Kilometres+Zone+Bonus+Make+Kilometres:Zone+Kilometres:Make+Zone:Make,
    family=Gamma("log"),weights = Claims,data=CCI)

#selection with AIC
stepAIC(fit1,direction="both")$anova #AIC 1870714
## Start:  AIC=1870714
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## 
##                   Df Deviance     AIC
## - Zone:Make       48   4383.2 1870671
## - Kilometres:Make 32   4327.3 1870683
## - Kilometres:Zone 24   4319.3 1870696
## <none>                 4237.7 1870714
## - Bonus            1   4411.5 1870775
## 
## Step:  AIC=1874465
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1673   4237.691 1870714
stepAIC(fit1,direction="forward")$anova    
## Start:  AIC=1870714
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1673   4237.691 1870714
stepAIC(fit1,direction="backward")$anova  
## Start:  AIC=1870714
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## 
##                   Df Deviance     AIC
## - Zone:Make       48   4383.2 1870671
## - Kilometres:Make 32   4327.3 1870683
## - Kilometres:Zone 24   4319.3 1870696
## <none>                 4237.7 1870714
## - Bonus            1   4411.5 1870775
## 
## Step:  AIC=1874465
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1673   4237.691 1870714
##Bonus as a Factor =>Better
fit2<-glm(Payment/Claims~Kilometres+Zone+factor(Bonus)+Make+Kilometres:Zone+
            Kilometres:Make+Zone:Make+factor(Bonus):Kilometres+factor(Bonus):Zone+
            factor(Bonus):Make,
          family=Gamma("log"),weights = Claims,data=CCI)
#selection with AIC
stepAIC(fit2,direction="both")$anova
## Start:  AIC=1859773
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## 
##                            Df Deviance     AIC
## - Zone:Make                48   3984.4 1859730
## - factor(Bonus):Make       48   3985.9 1859731
## - Kilometres:Make          32   3932.0 1859743
## - Kilometres:factor(Bonus) 24   3912.7 1859752
## - Kilometres:Zone          24   3915.8 1859753
## - Zone:factor(Bonus)       36   4000.8 1859760
## <none>                          3841.8 1859773
## 
## Step:  AIC=1863828
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) + 
##     factor(Bonus):Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) + 
##     factor(Bonus):Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1560   3841.752 1859773
stepAIC(fit2,direction="forward")$anova    
## Start:  AIC=1859773
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1560   3841.752 1859773
stepAIC(fit2,direction="backward")$anova  
## Start:  AIC=1859773
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## 
##                            Df Deviance     AIC
## - Zone:Make                48   3984.4 1859730
## - factor(Bonus):Make       48   3985.9 1859731
## - Kilometres:Make          32   3932.0 1859743
## - Kilometres:factor(Bonus) 24   3912.7 1859752
## - Kilometres:Zone          24   3915.8 1859753
## - Zone:factor(Bonus)       36   4000.8 1859760
## <none>                          3841.8 1859773
## 
## Step:  AIC=1863828
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) + 
##     factor(Bonus):Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) + 
##     factor(Bonus):Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1560   3841.752 1859773
#without interaction (bonus as a factor)
fit3<-glm(Payment/Claims~Kilometres+Zone+factor(Bonus)+Make,
          family=Gamma("log"),weights = Claims,data=CCI)
#without interaction (bonus as a factor) (with grouping)
CCI[which(CCI$Kilometres==2 | CCI$Kilometres==3),"Kilometres"]<-2
CCI[which(CCI$Zone==2 | CCI$Zone==7),"Zone"]<-2
CCI[which(CCI$Bonus==3 | CCI$Bonus==6),"Bonus"]<-3
CCI[which(CCI$Make==2 | CCI$Make==6),"Bonus"]<-2
CCI2<-CCI[,c("Kilometres","Zone","Bonus","Make","Claims","Payment")]
fit4<-glm(Payment/Claims~.*.,
          family=Gamma("log"),weights = Claims,data=CCI2)
step(fit4,direction="both")$anova
## Start:  AIC=1872750
## Payment/Claims ~ (Kilometres + Zone + Bonus + Make) * (Kilometres + 
##     Zone + Bonus + Make)
## 
##                    Df Deviance     AIC
## - Zone:Make        40   4420.3 1872707
## - Kilometres:Make  24   4377.7 1872724
## - Kilometres:Zone  15   4353.0 1872733
## - Zone:Bonus        5   4320.3 1872742
## - Bonus:Make        6   4329.8 1872743
## - Kilometres:Bonus  3   4322.7 1872747
## <none>                  4315.1 1872750
## 
## Step:  AIC=1875414
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Bonus + Kilometres:Make + Zone:Bonus + Bonus:Make
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1      NA       NA      1686   4315.114 1872750

Histogram of coef

# Kilo<-data.frame("name"=c("K1","K2","K3","K4","K5"),"Kilo"=summary(fit3)$coef[1:5,1])
# ggplot(Kilo,aes(x=name,y=Kilo))+geom_col(fill="#8491B4B2")+labs(title="Kilometres")
# Zone<-data.frame("name"=c("Z1","Z2","Z3","Z4","Z5","Z6"),"Kilo"=summary(fit3)$coef[6:11,1])
# ggplot(Kilo,aes(x=name,y=Kilo))+geom_col(fill="#8491B4B2")+labs(title="Kilometres")
# Bonus<-data.frame("name"=c("B1","B2","B3","B4","B5"),"Kilo"=summary(fit3)$coef[12:17,1])
# ggplot(Kilo,aes(x=name,y=Kilo))+geom_col(fill="#8491B4B2")+labs(title="Kilometres")
# Make<-data.frame("name"=c("K1","K2","K3","K4","K5"),"Kilo"=summary(fit3)$coef[1:5,1])
# ggplot(Kilo,aes(x=name,y=Kilo))+geom_col(fill="#8491B4B2")+labs(title="Kilometres")

Compare

#Bonus as a Continuous
stepAIC(fit1,direction="both")$anova
## Start:  AIC=1870714
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## 
##                   Df Deviance     AIC
## - Zone:Make       48   4383.2 1870671
## - Kilometres:Make 32   4327.3 1870683
## - Kilometres:Zone 24   4319.3 1870696
## <none>                 4237.7 1870714
## - Bonus            1   4411.5 1870775
## 
## Step:  AIC=1876136
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1673   4237.691 1870714
#Bonus as a Factor
stepAIC(fit2,direction="both")$anova
## Start:  AIC=1859773
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## 
##                            Df Deviance     AIC
## - Zone:Make                48   3984.4 1859730
## - factor(Bonus):Make       48   3985.9 1859731
## - Kilometres:Make          32   3932.0 1859743
## - Kilometres:factor(Bonus) 24   3912.7 1859752
## - Kilometres:Zone          24   3915.8 1859753
## - Zone:factor(Bonus)       36   4000.8 1859760
## <none>                          3841.8 1859773
## 
## Step:  AIC=1868697
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) + 
##     factor(Bonus):Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Zone:Make + factor(Bonus):Kilometres + 
##     factor(Bonus):Zone + factor(Bonus):Make
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make + Kilometres:Zone + 
##     Kilometres:Make + Kilometres:factor(Bonus) + Zone:factor(Bonus) + 
##     factor(Bonus):Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1560   3841.752 1859773
#without interaction (bonus as a factor)
stepAIC(fit3,direction="both")$anova
## Start:  AIC=1878028
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make
## 
##                 Df Deviance     AIC
## - Kilometres     4   4547.3 1878027
## <none>               4526.6 1878028
## - Make           8   4712.6 1878075
## - factor(Bonus)  6   4734.3 1878087
## - Zone           6   4866.0 1878131
## 
## Step:  AIC=1878573
## Payment/Claims ~ Zone + factor(Bonus) + Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ Kilometres + Zone + factor(Bonus) + Make
## 
## Final Model:
## Payment/Claims ~ Zone + factor(Bonus) + Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1772   4526.591 1878028
#without interaction (bonus as a factor) (with grouping)
stepAIC(fit4,direction="both")$anova
## Start:  AIC=1872750
## Payment/Claims ~ (Kilometres + Zone + Bonus + Make) * (Kilometres + 
##     Zone + Bonus + Make)
## 
##                    Df Deviance     AIC
## - Zone:Make        40   4420.3 1872707
## - Kilometres:Make  24   4377.7 1872724
## - Kilometres:Zone  15   4353.0 1872733
## - Zone:Bonus        5   4320.3 1872742
## - Bonus:Make        6   4329.8 1872743
## - Kilometres:Bonus  3   4322.7 1872747
## <none>                  4315.1 1872750
## 
## Step:  AIC=1875414
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Bonus + Kilometres:Make + Zone:Bonus + Bonus:Make
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Payment/Claims ~ (Kilometres + Zone + Bonus + Make) * (Kilometres + 
##     Zone + Bonus + Make)
## 
## Final Model:
## Payment/Claims ~ Kilometres + Zone + Bonus + Make + Kilometres:Zone + 
##     Kilometres:Bonus + Kilometres:Make + Zone:Bonus + Bonus:Make
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev     AIC
## 1                       1686   4315.114 1872750

Plot

library(reshape2)
fit.final<-glm(Payment/Claims~Kilometres+Zone+factor(Bonus)+Make+Kilometres:Zone+
            Kilometres:Make+factor(Bonus):Kilometres+factor(Bonus):Zone+
            factor(Bonus):Make,
          family=Gamma("log"),weights = Claims,data=CCI)
res<-data.frame("pearson"=residuals(fit.final,type="pearson"),"deviance"=summary(fit.final)$deviance.resid)
res2<-melt(res)
## No id variables; using all as measure variables
#boxplot
ggplot(res2,aes(x="",y=value,fill=variable))+geom_boxplot()+labs(title="Boxplot")

#Histogram
ggplot(res,aes(pearson))+geom_histogram(aes(y=..density..),fill="#8491B4B2")+geom_density(color="#3C5488B2")+labs(title="Histogram for pearson")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(res,aes(deviance))+geom_histogram(aes(y=..density..),fill="#8491B4B2")+geom_density(color="#3C5488B2")+labs(title="Histogram for deviance")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Normal Q-Q plot
pear<-data.frame("pearson.q"=qnorm(rank(res$pearson)/(NROW(res$pearson)+1)),"pearson.res"=res$pearson)
dev<-data.frame("deviance.q"=qnorm(rank(res$deviance)/(NROW(res$deviance)+1)),"deviance.res"=res$deviance)
ggplot(pear,aes(x=pearson.q,y=pearson.res))+geom_point(color="#F39B7FB2",size=1)+geom_smooth(method="lm",color="#DC0000B2")+
  labs(title=paste("Normal Q-Q plot of Pearson, R^2=",round(summary(lm(pearson.q~pearson.res,data=pear))$adj.r.squared,3)))

ggplot(dev,aes(x=deviance.q,y=deviance.res))+geom_point(color="#F39B7FB2",size=1)+geom_smooth(method="lm",color="#DC0000B2")+
  labs(title=paste("Normal Q-Q plot of deviance, R^2=",round(summary(lm(deviance.q~deviance.res,data=dev))$adj.r.squared,3)))