Load the data

library(insuranceData)
data(dataCar)

summary(dataCar)
##    veh_value         exposure             clm            numclaims      
##  Min.   : 0.000   Min.   :0.002738   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.: 1.010   1st Qu.:0.219028   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median : 1.500   Median :0.446270   Median :0.00000   Median :0.00000  
##  Mean   : 1.777   Mean   :0.468651   Mean   :0.06814   Mean   :0.07276  
##  3rd Qu.: 2.150   3rd Qu.:0.709103   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :34.560   Max.   :0.999316   Max.   :1.00000   Max.   :4.00000  
##                                                                         
##    claimcst0          veh_body        veh_age      gender    area     
##  Min.   :    0.0   SEDAN  :22233   Min.   :1.000   F:38603   A:16312  
##  1st Qu.:    0.0   HBACK  :18915   1st Qu.:2.000   M:29253   B:13341  
##  Median :    0.0   STNWG  :16261   Median :3.000             C:20540  
##  Mean   :  137.3   UTE    : 4586   Mean   :2.674             D: 8173  
##  3rd Qu.:    0.0   TRUCK  : 1750   3rd Qu.:4.000             E: 5912  
##  Max.   :55922.1   HDTOP  : 1579   Max.   :4.000             F: 3578  
##                    (Other): 2532                                      
##      agecat                     X_OBSTAT_    
##  Min.   :1.000   01101    0    0    0:67856  
##  1st Qu.:2.000                               
##  Median :3.000                               
##  Mean   :3.485                               
##  3rd Qu.:5.000                               
##  Max.   :6.000                               
## 

deal with questionable variables and rows

# remove some variables
dataCar$numclaims <- NULL
dataCar$claimcst0 <- NULL
dataCar$X_OBSTAT_ <- NULL

# remove the rows with the veh_value equals 0
dataCar <- dataCar[dataCar$veh_value > 0, ]
summary(dataCar)
##    veh_value         exposure             clm             veh_body    
##  Min.   : 0.180   Min.   :0.002738   Min.   :0.00000   SEDAN  :22232  
##  1st Qu.: 1.010   1st Qu.:0.219028   1st Qu.:0.00000   HBACK  :18915  
##  Median : 1.500   Median :0.446270   Median :0.00000   STNWG  :16234  
##  Mean   : 1.778   Mean   :0.468481   Mean   :0.06811   UTE    : 4586  
##  3rd Qu.: 2.150   3rd Qu.:0.709103   3rd Qu.:0.00000   TRUCK  : 1742  
##  Max.   :34.560   Max.   :0.999316   Max.   :1.00000   HDTOP  : 1579  
##                                                        (Other): 2515  
##     veh_age      gender    area          agecat     
##  Min.   :1.000   F:38584   A:16302   Min.   :1.000  
##  1st Qu.:2.000   M:29219   B:13328   1st Qu.:2.000  
##  Median :3.000             C:20530   Median :3.000  
##  Mean   :2.673             D: 8161   Mean   :3.485  
##  3rd Qu.:4.000             E: 5907   3rd Qu.:5.000  
##  Max.   :4.000             F: 3575   Max.   :6.000  
## 

change the agecat and veh_age to factors and change the baseline level

dataCar$agecat <- as.factor(dataCar$agecat)
dataCar$veh_age <- as.factor(dataCar$veh_age)

categorical <- c('veh_body', 'veh_age', 'gender', 'area', 'agecat')
for (col in categorical) {
  table.cnt <- as.data.frame(table(dataCar[, col]))
  max.cnt <- which.max(table.cnt[, 2])
  baseline.level <- as.character(table.cnt[max.cnt, 1])
  dataCar[, col] <- relevel(dataCar[, col], ref = baseline.level)
}

summary(dataCar)
##    veh_value         exposure             clm             veh_body    
##  Min.   : 0.180   Min.   :0.002738   Min.   :0.00000   SEDAN  :22232  
##  1st Qu.: 1.010   1st Qu.:0.219028   1st Qu.:0.00000   HBACK  :18915  
##  Median : 1.500   Median :0.446270   Median :0.00000   STNWG  :16234  
##  Mean   : 1.778   Mean   :0.468481   Mean   :0.06811   UTE    : 4586  
##  3rd Qu.: 2.150   3rd Qu.:0.709103   3rd Qu.:0.00000   TRUCK  : 1742  
##  Max.   :34.560   Max.   :0.999316   Max.   :1.00000   HDTOP  : 1579  
##                                                        (Other): 2515  
##  veh_age   gender    area      agecat   
##  3:20060   F:38584   C:20530   4:16179  
##  1:12254   M:29219   A:16302   1: 5734  
##  2:16582             B:13328   2:12868  
##  4:18907             D: 8161   3:15757  
##                      E: 5907   5:10722  
##                      F: 3575   6: 6543  
## 

dive into veh_value

visualization the veh_value

library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
dataCar %>% 
  ggplot(aes(x = veh_value)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

log transformation on veh_value

dataCar$log_veh_value <- log(dataCar$veh_value)
dataCar$veh_value <- NULL

dataCar %>% 
  ggplot(aes(x = log_veh_value)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

explore the relationship between predictors and clm

log_veh_value and clm

dataCar %>% 
  ggplot(aes(y = log_veh_value, x = as.factor(clm))) +
  geom_boxplot()

exposure and clm

dataCar %>% 
  ggplot(aes(x = factor(clm), y = exposure)) +
  geom_boxplot()

categorical predictors with clm

for (col in categorical) {
  plot <- dataCar %>% ggplot(aes(x = dataCar[, col], fill = factor(clm))) +
    geom_bar(position = 'fill') +
    labs(x = col, y = 'percent') +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  print(plot)
}

a table summarise

for (col in categorical) {
  print(col)
  x <- dataCar %>% 
    group_by_(col) %>% 
    summarise(mean = mean(clm), n = n())
  print(x)
}
## [1] "veh_body"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 13 × 3
##    veh_body   mean     n
##    <fct>     <dbl> <int>
##  1 SEDAN    0.0664 22232
##  2 BUS      0.184     38
##  3 CONVT    0.0370    81
##  4 COUPE    0.0872   780
##  5 HBACK    0.0668 18915
##  6 HDTOP    0.0823  1579
##  7 MCARA    0.116    121
##  8 MIBUS    0.0601   716
##  9 PANVN    0.0824   752
## 10 RDSTR    0.0741    27
## 11 STNWG    0.0721 16234
## 12 TRUCK    0.0683  1742
## 13 UTE      0.0567  4586
## [1] "veh_age"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 4 × 3
##   veh_age   mean     n
##   <fct>    <dbl> <int>
## 1 3       0.0679 20060
## 2 1       0.0672 12254
## 3 2       0.0759 16582
## 4 4       0.0620 18907
## [1] "gender"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 2 × 3
##   gender   mean     n
##   <fct>   <dbl> <int>
## 1 F      0.0686 38584
## 2 M      0.0675 29219
## [1] "area"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 6 × 3
##   area    mean     n
##   <fct>  <dbl> <int>
## 1 C     0.0688 20530
## 2 A     0.0664 16302
## 3 B     0.0723 13328
## 4 D     0.0607  8161
## 5 E     0.0652  5907
## 6 F     0.0783  3575
## [1] "agecat"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 6 × 3
##   agecat   mean     n
##   <fct>   <dbl> <int>
## 1 4      0.0682 16179
## 2 1      0.0863  5734
## 3 2      0.0724 12868
## 4 3      0.0705 15757
## 5 5      0.0572 10722
## 6 6      0.0556  6543

combine some observations

library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
# veh_body
var <- "veh_body"
var.levels <- levels(dataCar[, var])
dataCar[, var] <- mapvalues(dataCar[, var], var.levels,
                            c("HYBRID", "BUS", "CONVT", "HYBRID", "HYBRID",
                              "HYBRID", "MCARA", "HYBRID", "HYBRID", "HYBRID",
                              "HYBRID", "HYBRID", "HYBRID"))

# area
var <- "area"
var.levels <- levels(dataCar[, var])
dataCar[, var] <- mapvalues(dataCar[, var], var.levels,
                            c("BASE", "BASE", "OTHER", "BASE", "BASE", "OTHER"))

# agecat
var <- "agecat"
var.levels <- levels(dataCar[, var])
dataCar[, var] <- mapvalues(dataCar[, var], var.levels,
                            c("2", "1", "2", "2", "3", "3"))

# Relevel
for (i in categorical){
  table <- as.data.frame(table(dataCar[, i]))
  max <- which.max(table[, 2])
  level.name <- as.character(table[max, 1])
  dataCar[, i] <- relevel(dataCar[, i], ref = level.name)
}

summary(dataCar)
##     exposure             clm            veh_body     veh_age   gender   
##  Min.   :0.002738   Min.   :0.00000   HYBRID:67563   3:20060   F:38584  
##  1st Qu.:0.219028   1st Qu.:0.00000   BUS   :   38   1:12254   M:29219  
##  Median :0.446270   Median :0.00000   CONVT :   81   2:16582            
##  Mean   :0.468481   Mean   :0.06811   MCARA :  121   4:18907            
##  3rd Qu.:0.709103   3rd Qu.:0.00000                                     
##  Max.   :0.999316   Max.   :1.00000                                     
##     area       agecat    log_veh_value     
##  BASE :50900   2:44804   Min.   :-1.71480  
##  OTHER:16903   1: 5734   1st Qu.: 0.00995  
##                3:17265   Median : 0.40547  
##                          Mean   : 0.38675  
##                          3rd Qu.: 0.76547  
##                          Max.   : 3.54270

explore an interaction

ggplot(dataCar, aes(x = factor(clm), y = log_veh_value)) +
  geom_boxplot() +
  facet_wrap(~ veh_body)

creation of training and test sets

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(4769)
partition <- createDataPartition(dataCar$clm, p = 0.75, list = FALSE)
train <- dataCar[partition, ]
test <- dataCar[-partition, ]

mean(train$clm)
## [1] 0.06870784
mean(test$clm)
## [1] 0.06631268

build the logit and probit models

logit <- glm(clm ~ . - exposure + log_veh_value:veh_body, data = train, family = binomial(link='logit'))

probit <- glm(clm ~ . - exposure + log_veh_value:veh_body, data = train, family = binomial(link='probit'))

summary(logit)
## 
## Call:
## glm(formula = clm ~ . - exposure + log_veh_value:veh_body, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1054  -0.3962  -0.3710  -0.3442   2.5928  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -2.666221   0.040753 -65.425  < 2e-16 ***
## veh_bodyBUS                  1.893682   0.614643   3.081  0.00206 ** 
## veh_bodyCONVT                0.344542   1.097079   0.314  0.75348    
## veh_bodyMCARA               -0.676286   1.037468  -0.652  0.51449    
## veh_age1                    -0.126600   0.056819  -2.228  0.02587 *  
## veh_age2                     0.063206   0.048253   1.310  0.19023    
## veh_age4                    -0.008732   0.052120  -0.168  0.86695    
## genderM                     -0.036598   0.036311  -1.008  0.31350    
## areaOTHER                    0.140962   0.039342   3.583  0.00034 ***
## agecat1                      0.240163   0.057624   4.168 3.08e-05 ***
## agecat3                     -0.236353   0.044210  -5.346 8.98e-08 ***
## log_veh_value                0.184019   0.038525   4.777 1.78e-06 ***
## veh_bodyBUS:log_veh_value   -2.120656   1.262239  -1.680  0.09294 .  
## veh_bodyCONVT:log_veh_value -0.580723   0.642281  -0.904  0.36591    
## veh_bodyMCARA:log_veh_value  1.135352   0.860314   1.320  0.18694    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25455  on 50852  degrees of freedom
## Residual deviance: 25319  on 50838  degrees of freedom
## AIC: 25349
## 
## Number of Fisher Scoring iterations: 5
summary(probit)
## 
## Call:
## glm(formula = clm ~ . - exposure + log_veh_value:veh_body, family = binomial(link = "probit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1137  -0.3965  -0.3713  -0.3444   2.5943  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.514209   0.019676 -76.957  < 2e-16 ***
## veh_bodyBUS                  1.061052   0.370979   2.860 0.004235 ** 
## veh_bodyCONVT                0.197288   0.537099   0.367 0.713380    
## veh_bodyMCARA               -0.474145   0.535292  -0.886 0.375743    
## veh_age1                    -0.061956   0.027459  -2.256 0.024049 *  
## veh_age2                     0.030565   0.023584   1.296 0.194982    
## veh_age4                    -0.002869   0.025106  -0.114 0.909019    
## genderM                     -0.018266   0.017570  -1.040 0.298530    
## areaOTHER                    0.068524   0.019242   3.561 0.000369 ***
## agecat1                      0.118946   0.028861   4.121 3.77e-05 ***
## agecat3                     -0.112495   0.020910  -5.380 7.45e-08 ***
## log_veh_value                0.090207   0.018679   4.829 1.37e-06 ***
## veh_bodyBUS:log_veh_value   -1.237413   0.695959  -1.778 0.075404 .  
## veh_bodyCONVT:log_veh_value -0.291791   0.303498  -0.961 0.336338    
## veh_bodyMCARA:log_veh_value  0.692255   0.457376   1.514 0.130143    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25455  on 50852  degrees of freedom
## Residual deviance: 25318  on 50838  degrees of freedom
## AIC: 25348
## 
## Number of Fisher Scoring iterations: 5

add the exposure variable as the offset

logit.offset <- glm(clm ~ . - exposure + log_veh_value:veh_body,
                    data = train,
                    family = binomial(link='logit'),
                    offset = log(exposure))

summary(logit.offset)
## 
## Call:
## glm(formula = clm ~ . - exposure + log_veh_value:veh_body, family = binomial(link = "logit"), 
##     data = train, offset = log(exposure))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1703  -0.4491  -0.3469  -0.2233   3.9391  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.86688    0.04144 -45.052  < 2e-16 ***
## veh_bodyBUS                  2.20290    0.63434   3.473 0.000515 ***
## veh_bodyCONVT                0.60617    1.12344   0.540 0.589498    
## veh_bodyMCARA               -0.32226    1.04546  -0.308 0.757894    
## veh_age1                    -0.01809    0.05743  -0.315 0.752804    
## veh_age2                     0.07601    0.04877   1.558 0.119165    
## veh_age4                    -0.02796    0.05297  -0.528 0.597649    
## genderM                     -0.04855    0.03678  -1.320 0.186847    
## areaOTHER                    0.13262    0.03989   3.325 0.000884 ***
## agecat1                      0.27453    0.05862   4.683 2.82e-06 ***
## agecat3                     -0.27413    0.04466  -6.138 8.37e-10 ***
## log_veh_value                0.14231    0.03913   3.636 0.000277 ***
## veh_bodyBUS:log_veh_value   -2.86477    1.36959  -2.092 0.036465 *  
## veh_bodyCONVT:log_veh_value -0.64058    0.65592  -0.977 0.328760    
## veh_bodyMCARA:log_veh_value  0.84156    0.86913   0.968 0.332903    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24567  on 50852  degrees of freedom
## Residual deviance: 24426  on 50838  degrees of freedom
## AIC: 24456
## 
## Number of Fisher Scoring iterations: 6

build up confusion matrix

cutoff <- 0.06811

pred.train <- predict(logit.offset, type="response")
class.train <- 1 * (pred.train > cutoff)
confusionMatrix(factor(class.train), factor(train$clm), positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 25424  1001
##          1 21935  2493
##                                           
##                Accuracy : 0.549           
##                  95% CI : (0.5446, 0.5533)
##     No Information Rate : 0.9313          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0663          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.71351         
##             Specificity : 0.53684         
##          Pos Pred Value : 0.10206         
##          Neg Pred Value : 0.96212         
##              Prevalence : 0.06871         
##          Detection Rate : 0.04902         
##    Detection Prevalence : 0.48036         
##       Balanced Accuracy : 0.62517         
##                                           
##        'Positive' Class : 1               
## 
pred.test <- predict(logit.offset, newdata = test, type = 'response')
class.test <- ifelse(pred.test > cutoff, 1, 0)
confusionMatrix(as.factor(class.test), as.factor(test$clm), positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 8494  328
##          1 7332  796
##                                           
##                Accuracy : 0.5481          
##                  95% CI : (0.5406, 0.5556)
##     No Information Rate : 0.9337          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0629          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.70819         
##             Specificity : 0.53671         
##          Pos Pred Value : 0.09793         
##          Neg Pred Value : 0.96282         
##              Prevalence : 0.06631         
##          Detection Rate : 0.04696         
##    Detection Prevalence : 0.47953         
##       Balanced Accuracy : 0.62245         
##                                           
##        'Positive' Class : 1               
## 

build ROC curve and get the AUC

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc.train <- roc(train$clm, pred.train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
par(pty='s')
plot(roc.train)

auc(roc.train)
## Area under the curve: 0.6643
roc.test <- roc(test$clm, pred.test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc.test)

auc(roc.test)
## Area under the curve: 0.6588

binarized some variables

library(caret)

binarizer <- dummyVars(~ log_veh_value * veh_body + veh_age + agecat,
                       data = dataCar,
                       fullRank = TRUE)
binarized.vars <- as.data.frame(predict(binarizer, newdata=dataCar))

data.bin <- cbind(dataCar, binarized.vars)
data.bin$log_veh_value <- NULL
data.bin$veh_body <- NULL
data.bin$veh_age <- NULL
data.bin$agecat <- NULL
head(data.bin)
##    exposure clm gender area log_veh_value veh_body.BUS veh_body.CONVT
## 1 0.3039014   0      F BASE    0.05826891            0              0
## 2 0.6488706   0      F BASE    0.02955880            0              0
## 3 0.5694730   0      F BASE    1.18172720            0              0
## 4 0.3175907   0      F BASE    1.42069579            0              0
## 5 0.6488706   0      F BASE   -0.32850407            0              0
## 6 0.8542094   0      M BASE    0.69813472            0              0
##   veh_body.MCARA veh_age.1 veh_age.2 veh_age.4 agecat.1 agecat.3
## 1              0         0         0         0        0        0
## 2              0         0         1         0        0        0
## 3              0         0         1         0        0        0
## 4              0         0         1         0        0        0
## 5              0         0         0         1        0        0
## 6              0         0         0         0        0        0
##   log_veh_value:veh_bodyBUS log_veh_value:veh_bodyCONVT
## 1                         0                           0
## 2                         0                           0
## 3                         0                           0
## 4                         0                           0
## 5                         0                           0
## 6                         0                           0
##   log_veh_value:veh_bodyMCARA
## 1                           0
## 2                           0
## 3                           0
## 4                           0
## 5                           0
## 6                           0

split the binarized data into training and test set

train.bin <- data.bin[partition, ]
test.bin <- data.bin[-partition, ]

logit.full <- glm(clm ~ . - exposure,
                  data = train.bin,
                  family = binomial,
                  offset = log(exposure))

summary(logit.full)
## 
## Call:
## glm(formula = clm ~ . - exposure, family = binomial, data = train.bin, 
##     offset = log(exposure))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1703  -0.4491  -0.3469  -0.2233   3.9391  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.86688    0.04144 -45.052  < 2e-16 ***
## genderM                       -0.04855    0.03678  -1.320 0.186847    
## areaOTHER                      0.13262    0.03989   3.325 0.000884 ***
## log_veh_value                  0.14231    0.03913   3.636 0.000277 ***
## veh_body.BUS                   2.20290    0.63434   3.473 0.000515 ***
## veh_body.CONVT                 0.60617    1.12344   0.540 0.589498    
## veh_body.MCARA                -0.32226    1.04546  -0.308 0.757894    
## veh_age.1                     -0.01809    0.05743  -0.315 0.752804    
## veh_age.2                      0.07601    0.04877   1.558 0.119165    
## veh_age.4                     -0.02796    0.05297  -0.528 0.597649    
## agecat.1                       0.27453    0.05862   4.683 2.82e-06 ***
## agecat.3                      -0.27413    0.04466  -6.138 8.37e-10 ***
## `log_veh_value:veh_bodyBUS`   -2.86477    1.36959  -2.092 0.036465 *  
## `log_veh_value:veh_bodyCONVT` -0.64058    0.65592  -0.977 0.328760    
## `log_veh_value:veh_bodyMCARA`  0.84156    0.86913   0.968 0.332903    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24567  on 50852  degrees of freedom
## Residual deviance: 24426  on 50838  degrees of freedom
## AIC: 24456
## 
## Number of Fisher Scoring iterations: 6

using the stepAIC function to feature selection

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
logit.AIC <- stepAIC(logit.full)
## Start:  AIC=24455.94
## clm ~ (exposure + gender + area + log_veh_value + veh_body.BUS + 
##     veh_body.CONVT + veh_body.MCARA + veh_age.1 + veh_age.2 + 
##     veh_age.4 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + 
##     `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`) - 
##     exposure
## 
##                                 Df Deviance   AIC
## - veh_body.MCARA                 1    24426 24454
## - veh_age.1                      1    24426 24454
## - veh_body.CONVT                 1    24426 24454
## - veh_age.4                      1    24426 24454
## - `log_veh_value:veh_bodyCONVT`  1    24427 24455
## - `log_veh_value:veh_bodyMCARA`  1    24427 24455
## - gender                         1    24428 24456
## <none>                                24426 24456
## - veh_age.2                      1    24428 24456
## - `log_veh_value:veh_bodyBUS`    1    24432 24460
## - veh_body.BUS                   1    24435 24463
## - area                           1    24437 24465
## - log_veh_value                  1    24439 24467
## - agecat.1                       1    24447 24475
## - agecat.3                       1    24465 24493
## 
## Step:  AIC=24454.04
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT + 
##     veh_age.1 + veh_age.2 + veh_age.4 + agecat.1 + agecat.3 + 
##     `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyCONVT` + 
##     `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - veh_age.1                      1    24426 24452
## - veh_body.CONVT                 1    24426 24452
## - veh_age.4                      1    24426 24452
## - `log_veh_value:veh_bodyCONVT`  1    24427 24453
## - gender                         1    24428 24454
## <none>                                24426 24454
## - veh_age.2                      1    24429 24455
## - `log_veh_value:veh_bodyMCARA`  1    24430 24456
## - `log_veh_value:veh_bodyBUS`    1    24432 24458
## - veh_body.BUS                   1    24435 24461
## - area                           1    24437 24463
## - log_veh_value                  1    24439 24465
## - agecat.1                       1    24447 24473
## - agecat.3                       1    24465 24491
## 
## Step:  AIC=24452.14
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT + 
##     veh_age.2 + veh_age.4 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + 
##     `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - veh_age.4                      1    24426 24450
## - veh_body.CONVT                 1    24426 24450
## - `log_veh_value:veh_bodyCONVT`  1    24427 24451
## - gender                         1    24428 24452
## <none>                                24426 24452
## - `log_veh_value:veh_bodyMCARA`  1    24430 24454
## - veh_age.2                      1    24430 24454
## - `log_veh_value:veh_bodyBUS`    1    24432 24456
## - veh_body.BUS                   1    24435 24459
## - area                           1    24437 24461
## - log_veh_value                  1    24440 24464
## - agecat.1                       1    24447 24471
## - agecat.3                       1    24466 24490
## 
## Step:  AIC=24450.38
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT + 
##     veh_age.2 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + 
##     `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - veh_body.CONVT                 1    24427 24449
## - `log_veh_value:veh_bodyCONVT`  1    24427 24449
## - gender                         1    24428 24450
## <none>                                24426 24450
## - `log_veh_value:veh_bodyMCARA`  1    24430 24452
## - veh_age.2                      1    24431 24453
## - `log_veh_value:veh_bodyBUS`    1    24432 24454
## - veh_body.BUS                   1    24435 24457
## - area                           1    24437 24459
## - agecat.1                       1    24447 24469
## - log_veh_value                  1    24449 24471
## - agecat.3                       1    24466 24488
## 
## Step:  AIC=24448.64
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_age.2 + 
##     agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyCONVT` + 
##     `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - `log_veh_value:veh_bodyCONVT`  1    24428 24448
## - gender                         1    24429 24449
## <none>                                24427 24449
## - `log_veh_value:veh_bodyMCARA`  1    24430 24450
## - veh_age.2                      1    24431 24451
## - `log_veh_value:veh_bodyBUS`    1    24432 24452
## - veh_body.BUS                   1    24436 24456
## - area                           1    24438 24458
## - agecat.1                       1    24448 24468
## - log_veh_value                  1    24449 24469
## - agecat.3                       1    24466 24486
## 
## Step:  AIC=24447.94
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_age.2 + 
##     agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - gender                         1    24430 24448
## <none>                                24428 24448
## - `log_veh_value:veh_bodyMCARA`  1    24431 24449
## - veh_age.2                      1    24433 24451
## - `log_veh_value:veh_bodyBUS`    1    24434 24452
## - veh_body.BUS                   1    24437 24455
## - area                           1    24439 24457
## - agecat.1                       1    24449 24467
## - log_veh_value                  1    24450 24468
## - agecat.3                       1    24467 24485
## 
## Step:  AIC=24447.81
## clm ~ area + log_veh_value + veh_body.BUS + veh_age.2 + agecat.1 + 
##     agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## <none>                                24430 24448
## - `log_veh_value:veh_bodyMCARA`  1    24433 24449
## - veh_age.2                      1    24435 24451
## - `log_veh_value:veh_bodyBUS`    1    24435 24451
## - veh_body.BUS                   1    24439 24455
## - area                           1    24441 24457
## - agecat.1                       1    24451 24467
## - log_veh_value                  1    24451 24467
## - agecat.3                       1    24470 24486
summary(logit.AIC)
## 
## Call:
## glm(formula = clm ~ area + log_veh_value + veh_body.BUS + veh_age.2 + 
##     agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`, 
##     family = binomial, data = train.bin, offset = log(exposure))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1746  -0.4494  -0.3468  -0.2235   3.9478  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.90098    0.02866 -66.317  < 2e-16 ***
## areaOTHER                      0.13220    0.03986   3.316 0.000913 ***
## log_veh_value                  0.13876    0.03047   4.554 5.27e-06 ***
## veh_body.BUS                   2.17744    0.63456   3.431 0.000600 ***
## veh_age.2                      0.09403    0.04155   2.263 0.023643 *  
## agecat.1                       0.27304    0.05849   4.668 3.04e-06 ***
## agecat.3                      -0.27806    0.04451  -6.247 4.18e-10 ***
## `log_veh_value:veh_bodyBUS`   -2.81962    1.36626  -2.064 0.039041 *  
## `log_veh_value:veh_bodyMCARA`  0.58442    0.29180   2.003 0.045196 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24567  on 50852  degrees of freedom
## Residual deviance: 24430  on 50844  degrees of freedom
## AIC: 24448
## 
## Number of Fisher Scoring iterations: 5

stepAIC using BIC

logit.BIC <- stepAIC(logit.full, k = log(nrow(train.bin)))
## Start:  AIC=24588.49
## clm ~ (exposure + gender + area + log_veh_value + veh_body.BUS + 
##     veh_body.CONVT + veh_body.MCARA + veh_age.1 + veh_age.2 + 
##     veh_age.4 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + 
##     `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`) - 
##     exposure
## 
##                                 Df Deviance   AIC
## - veh_body.MCARA                 1    24426 24578
## - veh_age.1                      1    24426 24578
## - veh_body.CONVT                 1    24426 24578
## - veh_age.4                      1    24426 24578
## - `log_veh_value:veh_bodyCONVT`  1    24427 24579
## - `log_veh_value:veh_bodyMCARA`  1    24427 24579
## - gender                         1    24428 24579
## - veh_age.2                      1    24428 24580
## - `log_veh_value:veh_bodyBUS`    1    24432 24583
## - veh_body.BUS                   1    24435 24587
## <none>                                24426 24589
## - area                           1    24437 24589
## - log_veh_value                  1    24439 24591
## - agecat.1                       1    24447 24599
## - agecat.3                       1    24465 24617
## 
## Step:  AIC=24577.76
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT + 
##     veh_age.1 + veh_age.2 + veh_age.4 + agecat.1 + agecat.3 + 
##     `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyCONVT` + 
##     `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - veh_age.1                      1    24426 24567
## - veh_body.CONVT                 1    24426 24567
## - veh_age.4                      1    24426 24567
## - `log_veh_value:veh_bodyCONVT`  1    24427 24568
## - gender                         1    24428 24569
## - veh_age.2                      1    24429 24569
## - `log_veh_value:veh_bodyMCARA`  1    24430 24570
## - `log_veh_value:veh_bodyBUS`    1    24432 24573
## - veh_body.BUS                   1    24435 24576
## <none>                                24426 24578
## - area                           1    24437 24578
## - log_veh_value                  1    24439 24580
## - agecat.1                       1    24447 24588
## - agecat.3                       1    24465 24606
## 
## Step:  AIC=24567.02
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT + 
##     veh_age.2 + veh_age.4 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + 
##     `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - veh_age.4                      1    24426 24556
## - veh_body.CONVT                 1    24426 24556
## - `log_veh_value:veh_bodyCONVT`  1    24427 24557
## - gender                         1    24428 24558
## - `log_veh_value:veh_bodyMCARA`  1    24430 24560
## - veh_age.2                      1    24430 24560
## - `log_veh_value:veh_bodyBUS`    1    24432 24562
## - veh_body.BUS                   1    24435 24565
## <none>                                24426 24567
## - area                           1    24437 24567
## - log_veh_value                  1    24440 24570
## - agecat.1                       1    24447 24577
## - agecat.3                       1    24466 24596
## 
## Step:  AIC=24556.42
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT + 
##     veh_age.2 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + 
##     `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - veh_body.CONVT                 1    24427 24546
## - `log_veh_value:veh_bodyCONVT`  1    24427 24547
## - gender                         1    24428 24548
## - `log_veh_value:veh_bodyMCARA`  1    24430 24549
## - veh_age.2                      1    24431 24550
## - `log_veh_value:veh_bodyBUS`    1    24432 24551
## - veh_body.BUS                   1    24435 24555
## <none>                                24426 24556
## - area                           1    24437 24556
## - agecat.1                       1    24447 24567
## - log_veh_value                  1    24449 24568
## - agecat.3                       1    24466 24585
## 
## Step:  AIC=24545.84
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_age.2 + 
##     agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyCONVT` + 
##     `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - `log_veh_value:veh_bodyCONVT`  1    24428 24536
## - gender                         1    24429 24537
## - `log_veh_value:veh_bodyMCARA`  1    24430 24538
## - veh_age.2                      1    24431 24540
## - `log_veh_value:veh_bodyBUS`    1    24432 24541
## - veh_body.BUS                   1    24436 24544
## <none>                                24427 24546
## - area                           1    24438 24546
## - agecat.1                       1    24448 24556
## - log_veh_value                  1    24449 24558
## - agecat.3                       1    24466 24574
## 
## Step:  AIC=24536.3
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_age.2 + 
##     agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - gender                         1    24430 24527
## - `log_veh_value:veh_bodyMCARA`  1    24431 24529
## - veh_age.2                      1    24433 24530
## - `log_veh_value:veh_bodyBUS`    1    24434 24531
## - veh_body.BUS                   1    24437 24534
## <none>                                24428 24536
## - area                           1    24439 24536
## - agecat.1                       1    24449 24546
## - log_veh_value                  1    24450 24547
## - agecat.3                       1    24467 24565
## 
## Step:  AIC=24527.34
## clm ~ area + log_veh_value + veh_body.BUS + veh_age.2 + agecat.1 + 
##     agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`
## 
##                                 Df Deviance   AIC
## - `log_veh_value:veh_bodyMCARA`  1    24433 24520
## - veh_age.2                      1    24435 24522
## - `log_veh_value:veh_bodyBUS`    1    24435 24522
## - veh_body.BUS                   1    24439 24525
## - area                           1    24441 24527
## <none>                                24430 24527
## - agecat.1                       1    24451 24537
## - log_veh_value                  1    24451 24537
## - agecat.3                       1    24470 24557
## 
## Step:  AIC=24519.91
## clm ~ area + log_veh_value + veh_body.BUS + veh_age.2 + agecat.1 + 
##     agecat.3 + `log_veh_value:veh_bodyBUS`
## 
##                               Df Deviance   AIC
## - veh_age.2                    1    24438 24514
## - `log_veh_value:veh_bodyBUS`  1    24439 24515
## - veh_body.BUS                 1    24442 24518
## - area                         1    24444 24520
## <none>                              24433 24520
## - agecat.1                     1    24454 24530
## - log_veh_value                1    24455 24531
## - agecat.3                     1    24473 24549
## 
## Step:  AIC=24514
## clm ~ area + log_veh_value + veh_body.BUS + agecat.1 + agecat.3 + 
##     `log_veh_value:veh_bodyBUS`
## 
##                               Df Deviance   AIC
## - `log_veh_value:veh_bodyBUS`  1    24444 24509
## - veh_body.BUS                 1    24447 24512
## - area                         1    24449 24514
## <none>                              24438 24514
## - agecat.1                     1    24459 24524
## - log_veh_value                1    24469 24534
## - agecat.3                     1    24478 24543
## 
## Step:  AIC=24508.7
## clm ~ area + log_veh_value + veh_body.BUS + agecat.1 + agecat.3
## 
##                 Df Deviance   AIC
## - veh_body.BUS   1    24447 24501
## - area           1    24454 24508
## <none>                24444 24509
## - agecat.1       1    24465 24519
## - log_veh_value  1    24474 24528
## - agecat.3       1    24483 24537
## 
## Step:  AIC=24501.2
## clm ~ area + log_veh_value + agecat.1 + agecat.3
## 
##                 Df Deviance   AIC
## - area           1    24458 24501
## <none>                24447 24501
## - agecat.1       1    24468 24511
## - log_veh_value  1    24477 24520
## - agecat.3       1    24486 24530
## 
## Step:  AIC=24500.88
## clm ~ log_veh_value + agecat.1 + agecat.3
## 
##                 Df Deviance   AIC
## <none>                24458 24501
## - agecat.1       1    24479 24511
## - log_veh_value  1    24488 24520
## - agecat.3       1    24498 24531
summary(logit.BIC)
## 
## Call:
## glm(formula = clm ~ log_veh_value + agecat.1 + agecat.3, family = binomial, 
##     data = train.bin, offset = log(exposure))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7242  -0.4502  -0.3480  -0.2237   3.9341  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.84841    0.02532 -72.998  < 2e-16 ***
## log_veh_value  0.16016    0.02912   5.499 3.81e-08 ***
## agecat.1       0.27479    0.05845   4.701 2.59e-06 ***
## agecat.3      -0.27735    0.04443  -6.242 4.33e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24567  on 50852  degrees of freedom
## Residual deviance: 24458  on 50849  degrees of freedom
## AIC: 24466
## 
## Number of Fisher Scoring iterations: 5

plot the ROC of logit.full, logit.AIC, logit.BIC

pred.full <- predict(logit.full, newdata = test.bin, type = 'response')
roc.full <- roc(test.bin$clm, pred.full)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc.full)
## Area under the curve: 0.6588
pred.AIC <- predict(logit.AIC, newdata = test.bin, type = 'response')
roc.AIC <- roc(test.bin$clm, pred.AIC)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc.AIC)
## Area under the curve: 0.6587
pred.BIC <- predict(logit.BIC, newdata = test.bin, type = 'response')
roc.BIC <- roc(test.bin$clm, pred.BIC)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc.BIC)
## Area under the curve: 0.6598

run the final model on full dataset

logit.final <- glm(clm ~ log_veh_value + agecat.1 + agecat.3, data = data.bin,
                  offset = log(exposure), family = binomial)
summary(logit.final)
## 
## Call:
## glm(formula = clm ~ log_veh_value + agecat.1 + agecat.3, family = binomial, 
##     data = data.bin, offset = log(exposure))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7127  -0.4490  -0.3471  -0.2236   3.9375  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.86168    0.02204 -84.483  < 2e-16 ***
## log_veh_value  0.15807    0.02527   6.255 3.97e-10 ***
## agecat.1       0.25731    0.05138   5.008 5.51e-07 ***
## agecat.3      -0.24851    0.03825  -6.496 8.24e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32572  on 67802  degrees of freedom
## Residual deviance: 32445  on 67799  degrees of freedom
## AIC: 32453
## 
## Number of Fisher Scoring iterations: 5
exp(logit.final$coefficients)
##   (Intercept) log_veh_value      agecat.1      agecat.3 
##     0.1554116     1.1712493     1.2934434     0.7799632

predictions on new data

new.data <- data.frame(
  exposure = c(0.468481, 0.515329, 0.468481, 0.468481, 0.468481),
  log_veh_value = c(0.38675, 0.38675, 0.42543, 0.38675, 0.38675),
  agecat.1 = c(0, 0, 0, 1, 0),
  agecat.3 = c(0, 0, 0, 0, 1)
  )
new.data
##   exposure log_veh_value agecat.1 agecat.3
## 1 0.468481       0.38675        0        0
## 2 0.515329       0.38675        0        0
## 3 0.468481       0.42543        0        0
## 4 0.468481       0.38675        1        0
## 5 0.468481       0.38675        0        1
predict(logit.final, newdata = new.data, type = "response")
##          1          2          3          4          5 
## 0.07183724 0.07845733 0.07224598 0.09099912 0.05693029