Load the data
library(insuranceData)
data(dataCar)
summary(dataCar)
## veh_value exposure clm numclaims
## Min. : 0.000 Min. :0.002738 Min. :0.00000 Min. :0.00000
## 1st Qu.: 1.010 1st Qu.:0.219028 1st Qu.:0.00000 1st Qu.:0.00000
## Median : 1.500 Median :0.446270 Median :0.00000 Median :0.00000
## Mean : 1.777 Mean :0.468651 Mean :0.06814 Mean :0.07276
## 3rd Qu.: 2.150 3rd Qu.:0.709103 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :34.560 Max. :0.999316 Max. :1.00000 Max. :4.00000
##
## claimcst0 veh_body veh_age gender area
## Min. : 0.0 SEDAN :22233 Min. :1.000 F:38603 A:16312
## 1st Qu.: 0.0 HBACK :18915 1st Qu.:2.000 M:29253 B:13341
## Median : 0.0 STNWG :16261 Median :3.000 C:20540
## Mean : 137.3 UTE : 4586 Mean :2.674 D: 8173
## 3rd Qu.: 0.0 TRUCK : 1750 3rd Qu.:4.000 E: 5912
## Max. :55922.1 HDTOP : 1579 Max. :4.000 F: 3578
## (Other): 2532
## agecat X_OBSTAT_
## Min. :1.000 01101 0 0 0:67856
## 1st Qu.:2.000
## Median :3.000
## Mean :3.485
## 3rd Qu.:5.000
## Max. :6.000
##
deal with questionable variables and rows
# remove some variables
dataCar$numclaims <- NULL
dataCar$claimcst0 <- NULL
dataCar$X_OBSTAT_ <- NULL
# remove the rows with the veh_value equals 0
dataCar <- dataCar[dataCar$veh_value > 0, ]
summary(dataCar)
## veh_value exposure clm veh_body
## Min. : 0.180 Min. :0.002738 Min. :0.00000 SEDAN :22232
## 1st Qu.: 1.010 1st Qu.:0.219028 1st Qu.:0.00000 HBACK :18915
## Median : 1.500 Median :0.446270 Median :0.00000 STNWG :16234
## Mean : 1.778 Mean :0.468481 Mean :0.06811 UTE : 4586
## 3rd Qu.: 2.150 3rd Qu.:0.709103 3rd Qu.:0.00000 TRUCK : 1742
## Max. :34.560 Max. :0.999316 Max. :1.00000 HDTOP : 1579
## (Other): 2515
## veh_age gender area agecat
## Min. :1.000 F:38584 A:16302 Min. :1.000
## 1st Qu.:2.000 M:29219 B:13328 1st Qu.:2.000
## Median :3.000 C:20530 Median :3.000
## Mean :2.673 D: 8161 Mean :3.485
## 3rd Qu.:4.000 E: 5907 3rd Qu.:5.000
## Max. :4.000 F: 3575 Max. :6.000
##
change the agecat and veh_age to factors and change the baseline
level
dataCar$agecat <- as.factor(dataCar$agecat)
dataCar$veh_age <- as.factor(dataCar$veh_age)
categorical <- c('veh_body', 'veh_age', 'gender', 'area', 'agecat')
for (col in categorical) {
table.cnt <- as.data.frame(table(dataCar[, col]))
max.cnt <- which.max(table.cnt[, 2])
baseline.level <- as.character(table.cnt[max.cnt, 1])
dataCar[, col] <- relevel(dataCar[, col], ref = baseline.level)
}
summary(dataCar)
## veh_value exposure clm veh_body
## Min. : 0.180 Min. :0.002738 Min. :0.00000 SEDAN :22232
## 1st Qu.: 1.010 1st Qu.:0.219028 1st Qu.:0.00000 HBACK :18915
## Median : 1.500 Median :0.446270 Median :0.00000 STNWG :16234
## Mean : 1.778 Mean :0.468481 Mean :0.06811 UTE : 4586
## 3rd Qu.: 2.150 3rd Qu.:0.709103 3rd Qu.:0.00000 TRUCK : 1742
## Max. :34.560 Max. :0.999316 Max. :1.00000 HDTOP : 1579
## (Other): 2515
## veh_age gender area agecat
## 3:20060 F:38584 C:20530 4:16179
## 1:12254 M:29219 A:16302 1: 5734
## 2:16582 B:13328 2:12868
## 4:18907 D: 8161 3:15757
## E: 5907 5:10722
## F: 3575 6: 6543
##
dive into veh_value
visualization the veh_value
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
dataCar %>%
ggplot(aes(x = veh_value)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

explore the relationship between predictors and clm
log_veh_value and clm
dataCar %>%
ggplot(aes(y = log_veh_value, x = as.factor(clm))) +
geom_boxplot()

exposure and clm
dataCar %>%
ggplot(aes(x = factor(clm), y = exposure)) +
geom_boxplot()

a table summarise
for (col in categorical) {
print(col)
x <- dataCar %>%
group_by_(col) %>%
summarise(mean = mean(clm), n = n())
print(x)
}
## [1] "veh_body"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 13 × 3
## veh_body mean n
## <fct> <dbl> <int>
## 1 SEDAN 0.0664 22232
## 2 BUS 0.184 38
## 3 CONVT 0.0370 81
## 4 COUPE 0.0872 780
## 5 HBACK 0.0668 18915
## 6 HDTOP 0.0823 1579
## 7 MCARA 0.116 121
## 8 MIBUS 0.0601 716
## 9 PANVN 0.0824 752
## 10 RDSTR 0.0741 27
## 11 STNWG 0.0721 16234
## 12 TRUCK 0.0683 1742
## 13 UTE 0.0567 4586
## [1] "veh_age"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 4 × 3
## veh_age mean n
## <fct> <dbl> <int>
## 1 3 0.0679 20060
## 2 1 0.0672 12254
## 3 2 0.0759 16582
## 4 4 0.0620 18907
## [1] "gender"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 2 × 3
## gender mean n
## <fct> <dbl> <int>
## 1 F 0.0686 38584
## 2 M 0.0675 29219
## [1] "area"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 6 × 3
## area mean n
## <fct> <dbl> <int>
## 1 C 0.0688 20530
## 2 A 0.0664 16302
## 3 B 0.0723 13328
## 4 D 0.0607 8161
## 5 E 0.0652 5907
## 6 F 0.0783 3575
## [1] "agecat"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## # A tibble: 6 × 3
## agecat mean n
## <fct> <dbl> <int>
## 1 4 0.0682 16179
## 2 1 0.0863 5734
## 3 2 0.0724 12868
## 4 3 0.0705 15757
## 5 5 0.0572 10722
## 6 6 0.0556 6543
combine some observations
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
# veh_body
var <- "veh_body"
var.levels <- levels(dataCar[, var])
dataCar[, var] <- mapvalues(dataCar[, var], var.levels,
c("HYBRID", "BUS", "CONVT", "HYBRID", "HYBRID",
"HYBRID", "MCARA", "HYBRID", "HYBRID", "HYBRID",
"HYBRID", "HYBRID", "HYBRID"))
# area
var <- "area"
var.levels <- levels(dataCar[, var])
dataCar[, var] <- mapvalues(dataCar[, var], var.levels,
c("BASE", "BASE", "OTHER", "BASE", "BASE", "OTHER"))
# agecat
var <- "agecat"
var.levels <- levels(dataCar[, var])
dataCar[, var] <- mapvalues(dataCar[, var], var.levels,
c("2", "1", "2", "2", "3", "3"))
# Relevel
for (i in categorical){
table <- as.data.frame(table(dataCar[, i]))
max <- which.max(table[, 2])
level.name <- as.character(table[max, 1])
dataCar[, i] <- relevel(dataCar[, i], ref = level.name)
}
summary(dataCar)
## exposure clm veh_body veh_age gender
## Min. :0.002738 Min. :0.00000 HYBRID:67563 3:20060 F:38584
## 1st Qu.:0.219028 1st Qu.:0.00000 BUS : 38 1:12254 M:29219
## Median :0.446270 Median :0.00000 CONVT : 81 2:16582
## Mean :0.468481 Mean :0.06811 MCARA : 121 4:18907
## 3rd Qu.:0.709103 3rd Qu.:0.00000
## Max. :0.999316 Max. :1.00000
## area agecat log_veh_value
## BASE :50900 2:44804 Min. :-1.71480
## OTHER:16903 1: 5734 1st Qu.: 0.00995
## 3:17265 Median : 0.40547
## Mean : 0.38675
## 3rd Qu.: 0.76547
## Max. : 3.54270
explore an interaction
ggplot(dataCar, aes(x = factor(clm), y = log_veh_value)) +
geom_boxplot() +
facet_wrap(~ veh_body)

creation of training and test sets
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(4769)
partition <- createDataPartition(dataCar$clm, p = 0.75, list = FALSE)
train <- dataCar[partition, ]
test <- dataCar[-partition, ]
mean(train$clm)
## [1] 0.06870784
mean(test$clm)
## [1] 0.06631268
build the logit and probit models
logit <- glm(clm ~ . - exposure + log_veh_value:veh_body, data = train, family = binomial(link='logit'))
probit <- glm(clm ~ . - exposure + log_veh_value:veh_body, data = train, family = binomial(link='probit'))
summary(logit)
##
## Call:
## glm(formula = clm ~ . - exposure + log_veh_value:veh_body, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1054 -0.3962 -0.3710 -0.3442 2.5928
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.666221 0.040753 -65.425 < 2e-16 ***
## veh_bodyBUS 1.893682 0.614643 3.081 0.00206 **
## veh_bodyCONVT 0.344542 1.097079 0.314 0.75348
## veh_bodyMCARA -0.676286 1.037468 -0.652 0.51449
## veh_age1 -0.126600 0.056819 -2.228 0.02587 *
## veh_age2 0.063206 0.048253 1.310 0.19023
## veh_age4 -0.008732 0.052120 -0.168 0.86695
## genderM -0.036598 0.036311 -1.008 0.31350
## areaOTHER 0.140962 0.039342 3.583 0.00034 ***
## agecat1 0.240163 0.057624 4.168 3.08e-05 ***
## agecat3 -0.236353 0.044210 -5.346 8.98e-08 ***
## log_veh_value 0.184019 0.038525 4.777 1.78e-06 ***
## veh_bodyBUS:log_veh_value -2.120656 1.262239 -1.680 0.09294 .
## veh_bodyCONVT:log_veh_value -0.580723 0.642281 -0.904 0.36591
## veh_bodyMCARA:log_veh_value 1.135352 0.860314 1.320 0.18694
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25455 on 50852 degrees of freedom
## Residual deviance: 25319 on 50838 degrees of freedom
## AIC: 25349
##
## Number of Fisher Scoring iterations: 5
summary(probit)
##
## Call:
## glm(formula = clm ~ . - exposure + log_veh_value:veh_body, family = binomial(link = "probit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1137 -0.3965 -0.3713 -0.3444 2.5943
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.514209 0.019676 -76.957 < 2e-16 ***
## veh_bodyBUS 1.061052 0.370979 2.860 0.004235 **
## veh_bodyCONVT 0.197288 0.537099 0.367 0.713380
## veh_bodyMCARA -0.474145 0.535292 -0.886 0.375743
## veh_age1 -0.061956 0.027459 -2.256 0.024049 *
## veh_age2 0.030565 0.023584 1.296 0.194982
## veh_age4 -0.002869 0.025106 -0.114 0.909019
## genderM -0.018266 0.017570 -1.040 0.298530
## areaOTHER 0.068524 0.019242 3.561 0.000369 ***
## agecat1 0.118946 0.028861 4.121 3.77e-05 ***
## agecat3 -0.112495 0.020910 -5.380 7.45e-08 ***
## log_veh_value 0.090207 0.018679 4.829 1.37e-06 ***
## veh_bodyBUS:log_veh_value -1.237413 0.695959 -1.778 0.075404 .
## veh_bodyCONVT:log_veh_value -0.291791 0.303498 -0.961 0.336338
## veh_bodyMCARA:log_veh_value 0.692255 0.457376 1.514 0.130143
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25455 on 50852 degrees of freedom
## Residual deviance: 25318 on 50838 degrees of freedom
## AIC: 25348
##
## Number of Fisher Scoring iterations: 5
add the exposure variable as the offset
logit.offset <- glm(clm ~ . - exposure + log_veh_value:veh_body,
data = train,
family = binomial(link='logit'),
offset = log(exposure))
summary(logit.offset)
##
## Call:
## glm(formula = clm ~ . - exposure + log_veh_value:veh_body, family = binomial(link = "logit"),
## data = train, offset = log(exposure))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1703 -0.4491 -0.3469 -0.2233 3.9391
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.86688 0.04144 -45.052 < 2e-16 ***
## veh_bodyBUS 2.20290 0.63434 3.473 0.000515 ***
## veh_bodyCONVT 0.60617 1.12344 0.540 0.589498
## veh_bodyMCARA -0.32226 1.04546 -0.308 0.757894
## veh_age1 -0.01809 0.05743 -0.315 0.752804
## veh_age2 0.07601 0.04877 1.558 0.119165
## veh_age4 -0.02796 0.05297 -0.528 0.597649
## genderM -0.04855 0.03678 -1.320 0.186847
## areaOTHER 0.13262 0.03989 3.325 0.000884 ***
## agecat1 0.27453 0.05862 4.683 2.82e-06 ***
## agecat3 -0.27413 0.04466 -6.138 8.37e-10 ***
## log_veh_value 0.14231 0.03913 3.636 0.000277 ***
## veh_bodyBUS:log_veh_value -2.86477 1.36959 -2.092 0.036465 *
## veh_bodyCONVT:log_veh_value -0.64058 0.65592 -0.977 0.328760
## veh_bodyMCARA:log_veh_value 0.84156 0.86913 0.968 0.332903
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24567 on 50852 degrees of freedom
## Residual deviance: 24426 on 50838 degrees of freedom
## AIC: 24456
##
## Number of Fisher Scoring iterations: 6
build up confusion matrix
cutoff <- 0.06811
pred.train <- predict(logit.offset, type="response")
class.train <- 1 * (pred.train > cutoff)
confusionMatrix(factor(class.train), factor(train$clm), positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 25424 1001
## 1 21935 2493
##
## Accuracy : 0.549
## 95% CI : (0.5446, 0.5533)
## No Information Rate : 0.9313
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0663
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.71351
## Specificity : 0.53684
## Pos Pred Value : 0.10206
## Neg Pred Value : 0.96212
## Prevalence : 0.06871
## Detection Rate : 0.04902
## Detection Prevalence : 0.48036
## Balanced Accuracy : 0.62517
##
## 'Positive' Class : 1
##
pred.test <- predict(logit.offset, newdata = test, type = 'response')
class.test <- ifelse(pred.test > cutoff, 1, 0)
confusionMatrix(as.factor(class.test), as.factor(test$clm), positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8494 328
## 1 7332 796
##
## Accuracy : 0.5481
## 95% CI : (0.5406, 0.5556)
## No Information Rate : 0.9337
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0629
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.70819
## Specificity : 0.53671
## Pos Pred Value : 0.09793
## Neg Pred Value : 0.96282
## Prevalence : 0.06631
## Detection Rate : 0.04696
## Detection Prevalence : 0.47953
## Balanced Accuracy : 0.62245
##
## 'Positive' Class : 1
##
build ROC curve and get the AUC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc.train <- roc(train$clm, pred.train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
par(pty='s')
plot(roc.train)

auc(roc.train)
## Area under the curve: 0.6643
roc.test <- roc(test$clm, pred.test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc.test)

auc(roc.test)
## Area under the curve: 0.6588
binarized some variables
library(caret)
binarizer <- dummyVars(~ log_veh_value * veh_body + veh_age + agecat,
data = dataCar,
fullRank = TRUE)
binarized.vars <- as.data.frame(predict(binarizer, newdata=dataCar))
data.bin <- cbind(dataCar, binarized.vars)
data.bin$log_veh_value <- NULL
data.bin$veh_body <- NULL
data.bin$veh_age <- NULL
data.bin$agecat <- NULL
head(data.bin)
## exposure clm gender area log_veh_value veh_body.BUS veh_body.CONVT
## 1 0.3039014 0 F BASE 0.05826891 0 0
## 2 0.6488706 0 F BASE 0.02955880 0 0
## 3 0.5694730 0 F BASE 1.18172720 0 0
## 4 0.3175907 0 F BASE 1.42069579 0 0
## 5 0.6488706 0 F BASE -0.32850407 0 0
## 6 0.8542094 0 M BASE 0.69813472 0 0
## veh_body.MCARA veh_age.1 veh_age.2 veh_age.4 agecat.1 agecat.3
## 1 0 0 0 0 0 0
## 2 0 0 1 0 0 0
## 3 0 0 1 0 0 0
## 4 0 0 1 0 0 0
## 5 0 0 0 1 0 0
## 6 0 0 0 0 0 0
## log_veh_value:veh_bodyBUS log_veh_value:veh_bodyCONVT
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## log_veh_value:veh_bodyMCARA
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
split the binarized data into training and test set
train.bin <- data.bin[partition, ]
test.bin <- data.bin[-partition, ]
logit.full <- glm(clm ~ . - exposure,
data = train.bin,
family = binomial,
offset = log(exposure))
summary(logit.full)
##
## Call:
## glm(formula = clm ~ . - exposure, family = binomial, data = train.bin,
## offset = log(exposure))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1703 -0.4491 -0.3469 -0.2233 3.9391
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.86688 0.04144 -45.052 < 2e-16 ***
## genderM -0.04855 0.03678 -1.320 0.186847
## areaOTHER 0.13262 0.03989 3.325 0.000884 ***
## log_veh_value 0.14231 0.03913 3.636 0.000277 ***
## veh_body.BUS 2.20290 0.63434 3.473 0.000515 ***
## veh_body.CONVT 0.60617 1.12344 0.540 0.589498
## veh_body.MCARA -0.32226 1.04546 -0.308 0.757894
## veh_age.1 -0.01809 0.05743 -0.315 0.752804
## veh_age.2 0.07601 0.04877 1.558 0.119165
## veh_age.4 -0.02796 0.05297 -0.528 0.597649
## agecat.1 0.27453 0.05862 4.683 2.82e-06 ***
## agecat.3 -0.27413 0.04466 -6.138 8.37e-10 ***
## `log_veh_value:veh_bodyBUS` -2.86477 1.36959 -2.092 0.036465 *
## `log_veh_value:veh_bodyCONVT` -0.64058 0.65592 -0.977 0.328760
## `log_veh_value:veh_bodyMCARA` 0.84156 0.86913 0.968 0.332903
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24567 on 50852 degrees of freedom
## Residual deviance: 24426 on 50838 degrees of freedom
## AIC: 24456
##
## Number of Fisher Scoring iterations: 6
using the stepAIC function to feature selection
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
logit.AIC <- stepAIC(logit.full)
## Start: AIC=24455.94
## clm ~ (exposure + gender + area + log_veh_value + veh_body.BUS +
## veh_body.CONVT + veh_body.MCARA + veh_age.1 + veh_age.2 +
## veh_age.4 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` +
## `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`) -
## exposure
##
## Df Deviance AIC
## - veh_body.MCARA 1 24426 24454
## - veh_age.1 1 24426 24454
## - veh_body.CONVT 1 24426 24454
## - veh_age.4 1 24426 24454
## - `log_veh_value:veh_bodyCONVT` 1 24427 24455
## - `log_veh_value:veh_bodyMCARA` 1 24427 24455
## - gender 1 24428 24456
## <none> 24426 24456
## - veh_age.2 1 24428 24456
## - `log_veh_value:veh_bodyBUS` 1 24432 24460
## - veh_body.BUS 1 24435 24463
## - area 1 24437 24465
## - log_veh_value 1 24439 24467
## - agecat.1 1 24447 24475
## - agecat.3 1 24465 24493
##
## Step: AIC=24454.04
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT +
## veh_age.1 + veh_age.2 + veh_age.4 + agecat.1 + agecat.3 +
## `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyCONVT` +
## `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - veh_age.1 1 24426 24452
## - veh_body.CONVT 1 24426 24452
## - veh_age.4 1 24426 24452
## - `log_veh_value:veh_bodyCONVT` 1 24427 24453
## - gender 1 24428 24454
## <none> 24426 24454
## - veh_age.2 1 24429 24455
## - `log_veh_value:veh_bodyMCARA` 1 24430 24456
## - `log_veh_value:veh_bodyBUS` 1 24432 24458
## - veh_body.BUS 1 24435 24461
## - area 1 24437 24463
## - log_veh_value 1 24439 24465
## - agecat.1 1 24447 24473
## - agecat.3 1 24465 24491
##
## Step: AIC=24452.14
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT +
## veh_age.2 + veh_age.4 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` +
## `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - veh_age.4 1 24426 24450
## - veh_body.CONVT 1 24426 24450
## - `log_veh_value:veh_bodyCONVT` 1 24427 24451
## - gender 1 24428 24452
## <none> 24426 24452
## - `log_veh_value:veh_bodyMCARA` 1 24430 24454
## - veh_age.2 1 24430 24454
## - `log_veh_value:veh_bodyBUS` 1 24432 24456
## - veh_body.BUS 1 24435 24459
## - area 1 24437 24461
## - log_veh_value 1 24440 24464
## - agecat.1 1 24447 24471
## - agecat.3 1 24466 24490
##
## Step: AIC=24450.38
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT +
## veh_age.2 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` +
## `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - veh_body.CONVT 1 24427 24449
## - `log_veh_value:veh_bodyCONVT` 1 24427 24449
## - gender 1 24428 24450
## <none> 24426 24450
## - `log_veh_value:veh_bodyMCARA` 1 24430 24452
## - veh_age.2 1 24431 24453
## - `log_veh_value:veh_bodyBUS` 1 24432 24454
## - veh_body.BUS 1 24435 24457
## - area 1 24437 24459
## - agecat.1 1 24447 24469
## - log_veh_value 1 24449 24471
## - agecat.3 1 24466 24488
##
## Step: AIC=24448.64
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_age.2 +
## agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyCONVT` +
## `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - `log_veh_value:veh_bodyCONVT` 1 24428 24448
## - gender 1 24429 24449
## <none> 24427 24449
## - `log_veh_value:veh_bodyMCARA` 1 24430 24450
## - veh_age.2 1 24431 24451
## - `log_veh_value:veh_bodyBUS` 1 24432 24452
## - veh_body.BUS 1 24436 24456
## - area 1 24438 24458
## - agecat.1 1 24448 24468
## - log_veh_value 1 24449 24469
## - agecat.3 1 24466 24486
##
## Step: AIC=24447.94
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_age.2 +
## agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - gender 1 24430 24448
## <none> 24428 24448
## - `log_veh_value:veh_bodyMCARA` 1 24431 24449
## - veh_age.2 1 24433 24451
## - `log_veh_value:veh_bodyBUS` 1 24434 24452
## - veh_body.BUS 1 24437 24455
## - area 1 24439 24457
## - agecat.1 1 24449 24467
## - log_veh_value 1 24450 24468
## - agecat.3 1 24467 24485
##
## Step: AIC=24447.81
## clm ~ area + log_veh_value + veh_body.BUS + veh_age.2 + agecat.1 +
## agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## <none> 24430 24448
## - `log_veh_value:veh_bodyMCARA` 1 24433 24449
## - veh_age.2 1 24435 24451
## - `log_veh_value:veh_bodyBUS` 1 24435 24451
## - veh_body.BUS 1 24439 24455
## - area 1 24441 24457
## - agecat.1 1 24451 24467
## - log_veh_value 1 24451 24467
## - agecat.3 1 24470 24486
summary(logit.AIC)
##
## Call:
## glm(formula = clm ~ area + log_veh_value + veh_body.BUS + veh_age.2 +
## agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`,
## family = binomial, data = train.bin, offset = log(exposure))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1746 -0.4494 -0.3468 -0.2235 3.9478
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.90098 0.02866 -66.317 < 2e-16 ***
## areaOTHER 0.13220 0.03986 3.316 0.000913 ***
## log_veh_value 0.13876 0.03047 4.554 5.27e-06 ***
## veh_body.BUS 2.17744 0.63456 3.431 0.000600 ***
## veh_age.2 0.09403 0.04155 2.263 0.023643 *
## agecat.1 0.27304 0.05849 4.668 3.04e-06 ***
## agecat.3 -0.27806 0.04451 -6.247 4.18e-10 ***
## `log_veh_value:veh_bodyBUS` -2.81962 1.36626 -2.064 0.039041 *
## `log_veh_value:veh_bodyMCARA` 0.58442 0.29180 2.003 0.045196 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24567 on 50852 degrees of freedom
## Residual deviance: 24430 on 50844 degrees of freedom
## AIC: 24448
##
## Number of Fisher Scoring iterations: 5
stepAIC using BIC
logit.BIC <- stepAIC(logit.full, k = log(nrow(train.bin)))
## Start: AIC=24588.49
## clm ~ (exposure + gender + area + log_veh_value + veh_body.BUS +
## veh_body.CONVT + veh_body.MCARA + veh_age.1 + veh_age.2 +
## veh_age.4 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` +
## `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`) -
## exposure
##
## Df Deviance AIC
## - veh_body.MCARA 1 24426 24578
## - veh_age.1 1 24426 24578
## - veh_body.CONVT 1 24426 24578
## - veh_age.4 1 24426 24578
## - `log_veh_value:veh_bodyCONVT` 1 24427 24579
## - `log_veh_value:veh_bodyMCARA` 1 24427 24579
## - gender 1 24428 24579
## - veh_age.2 1 24428 24580
## - `log_veh_value:veh_bodyBUS` 1 24432 24583
## - veh_body.BUS 1 24435 24587
## <none> 24426 24589
## - area 1 24437 24589
## - log_veh_value 1 24439 24591
## - agecat.1 1 24447 24599
## - agecat.3 1 24465 24617
##
## Step: AIC=24577.76
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT +
## veh_age.1 + veh_age.2 + veh_age.4 + agecat.1 + agecat.3 +
## `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyCONVT` +
## `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - veh_age.1 1 24426 24567
## - veh_body.CONVT 1 24426 24567
## - veh_age.4 1 24426 24567
## - `log_veh_value:veh_bodyCONVT` 1 24427 24568
## - gender 1 24428 24569
## - veh_age.2 1 24429 24569
## - `log_veh_value:veh_bodyMCARA` 1 24430 24570
## - `log_veh_value:veh_bodyBUS` 1 24432 24573
## - veh_body.BUS 1 24435 24576
## <none> 24426 24578
## - area 1 24437 24578
## - log_veh_value 1 24439 24580
## - agecat.1 1 24447 24588
## - agecat.3 1 24465 24606
##
## Step: AIC=24567.02
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT +
## veh_age.2 + veh_age.4 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` +
## `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - veh_age.4 1 24426 24556
## - veh_body.CONVT 1 24426 24556
## - `log_veh_value:veh_bodyCONVT` 1 24427 24557
## - gender 1 24428 24558
## - `log_veh_value:veh_bodyMCARA` 1 24430 24560
## - veh_age.2 1 24430 24560
## - `log_veh_value:veh_bodyBUS` 1 24432 24562
## - veh_body.BUS 1 24435 24565
## <none> 24426 24567
## - area 1 24437 24567
## - log_veh_value 1 24440 24570
## - agecat.1 1 24447 24577
## - agecat.3 1 24466 24596
##
## Step: AIC=24556.42
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_body.CONVT +
## veh_age.2 + agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` +
## `log_veh_value:veh_bodyCONVT` + `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - veh_body.CONVT 1 24427 24546
## - `log_veh_value:veh_bodyCONVT` 1 24427 24547
## - gender 1 24428 24548
## - `log_veh_value:veh_bodyMCARA` 1 24430 24549
## - veh_age.2 1 24431 24550
## - `log_veh_value:veh_bodyBUS` 1 24432 24551
## - veh_body.BUS 1 24435 24555
## <none> 24426 24556
## - area 1 24437 24556
## - agecat.1 1 24447 24567
## - log_veh_value 1 24449 24568
## - agecat.3 1 24466 24585
##
## Step: AIC=24545.84
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_age.2 +
## agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyCONVT` +
## `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - `log_veh_value:veh_bodyCONVT` 1 24428 24536
## - gender 1 24429 24537
## - `log_veh_value:veh_bodyMCARA` 1 24430 24538
## - veh_age.2 1 24431 24540
## - `log_veh_value:veh_bodyBUS` 1 24432 24541
## - veh_body.BUS 1 24436 24544
## <none> 24427 24546
## - area 1 24438 24546
## - agecat.1 1 24448 24556
## - log_veh_value 1 24449 24558
## - agecat.3 1 24466 24574
##
## Step: AIC=24536.3
## clm ~ gender + area + log_veh_value + veh_body.BUS + veh_age.2 +
## agecat.1 + agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - gender 1 24430 24527
## - `log_veh_value:veh_bodyMCARA` 1 24431 24529
## - veh_age.2 1 24433 24530
## - `log_veh_value:veh_bodyBUS` 1 24434 24531
## - veh_body.BUS 1 24437 24534
## <none> 24428 24536
## - area 1 24439 24536
## - agecat.1 1 24449 24546
## - log_veh_value 1 24450 24547
## - agecat.3 1 24467 24565
##
## Step: AIC=24527.34
## clm ~ area + log_veh_value + veh_body.BUS + veh_age.2 + agecat.1 +
## agecat.3 + `log_veh_value:veh_bodyBUS` + `log_veh_value:veh_bodyMCARA`
##
## Df Deviance AIC
## - `log_veh_value:veh_bodyMCARA` 1 24433 24520
## - veh_age.2 1 24435 24522
## - `log_veh_value:veh_bodyBUS` 1 24435 24522
## - veh_body.BUS 1 24439 24525
## - area 1 24441 24527
## <none> 24430 24527
## - agecat.1 1 24451 24537
## - log_veh_value 1 24451 24537
## - agecat.3 1 24470 24557
##
## Step: AIC=24519.91
## clm ~ area + log_veh_value + veh_body.BUS + veh_age.2 + agecat.1 +
## agecat.3 + `log_veh_value:veh_bodyBUS`
##
## Df Deviance AIC
## - veh_age.2 1 24438 24514
## - `log_veh_value:veh_bodyBUS` 1 24439 24515
## - veh_body.BUS 1 24442 24518
## - area 1 24444 24520
## <none> 24433 24520
## - agecat.1 1 24454 24530
## - log_veh_value 1 24455 24531
## - agecat.3 1 24473 24549
##
## Step: AIC=24514
## clm ~ area + log_veh_value + veh_body.BUS + agecat.1 + agecat.3 +
## `log_veh_value:veh_bodyBUS`
##
## Df Deviance AIC
## - `log_veh_value:veh_bodyBUS` 1 24444 24509
## - veh_body.BUS 1 24447 24512
## - area 1 24449 24514
## <none> 24438 24514
## - agecat.1 1 24459 24524
## - log_veh_value 1 24469 24534
## - agecat.3 1 24478 24543
##
## Step: AIC=24508.7
## clm ~ area + log_veh_value + veh_body.BUS + agecat.1 + agecat.3
##
## Df Deviance AIC
## - veh_body.BUS 1 24447 24501
## - area 1 24454 24508
## <none> 24444 24509
## - agecat.1 1 24465 24519
## - log_veh_value 1 24474 24528
## - agecat.3 1 24483 24537
##
## Step: AIC=24501.2
## clm ~ area + log_veh_value + agecat.1 + agecat.3
##
## Df Deviance AIC
## - area 1 24458 24501
## <none> 24447 24501
## - agecat.1 1 24468 24511
## - log_veh_value 1 24477 24520
## - agecat.3 1 24486 24530
##
## Step: AIC=24500.88
## clm ~ log_veh_value + agecat.1 + agecat.3
##
## Df Deviance AIC
## <none> 24458 24501
## - agecat.1 1 24479 24511
## - log_veh_value 1 24488 24520
## - agecat.3 1 24498 24531
summary(logit.BIC)
##
## Call:
## glm(formula = clm ~ log_veh_value + agecat.1 + agecat.3, family = binomial,
## data = train.bin, offset = log(exposure))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7242 -0.4502 -0.3480 -0.2237 3.9341
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.84841 0.02532 -72.998 < 2e-16 ***
## log_veh_value 0.16016 0.02912 5.499 3.81e-08 ***
## agecat.1 0.27479 0.05845 4.701 2.59e-06 ***
## agecat.3 -0.27735 0.04443 -6.242 4.33e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24567 on 50852 degrees of freedom
## Residual deviance: 24458 on 50849 degrees of freedom
## AIC: 24466
##
## Number of Fisher Scoring iterations: 5
plot the ROC of logit.full, logit.AIC, logit.BIC
pred.full <- predict(logit.full, newdata = test.bin, type = 'response')
roc.full <- roc(test.bin$clm, pred.full)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc.full)
## Area under the curve: 0.6588
pred.AIC <- predict(logit.AIC, newdata = test.bin, type = 'response')
roc.AIC <- roc(test.bin$clm, pred.AIC)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc.AIC)
## Area under the curve: 0.6587
pred.BIC <- predict(logit.BIC, newdata = test.bin, type = 'response')
roc.BIC <- roc(test.bin$clm, pred.BIC)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc.BIC)
## Area under the curve: 0.6598
run the final model on full dataset
logit.final <- glm(clm ~ log_veh_value + agecat.1 + agecat.3, data = data.bin,
offset = log(exposure), family = binomial)
summary(logit.final)
##
## Call:
## glm(formula = clm ~ log_veh_value + agecat.1 + agecat.3, family = binomial,
## data = data.bin, offset = log(exposure))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7127 -0.4490 -0.3471 -0.2236 3.9375
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.86168 0.02204 -84.483 < 2e-16 ***
## log_veh_value 0.15807 0.02527 6.255 3.97e-10 ***
## agecat.1 0.25731 0.05138 5.008 5.51e-07 ***
## agecat.3 -0.24851 0.03825 -6.496 8.24e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32572 on 67802 degrees of freedom
## Residual deviance: 32445 on 67799 degrees of freedom
## AIC: 32453
##
## Number of Fisher Scoring iterations: 5
exp(logit.final$coefficients)
## (Intercept) log_veh_value agecat.1 agecat.3
## 0.1554116 1.1712493 1.2934434 0.7799632
predictions on new data
new.data <- data.frame(
exposure = c(0.468481, 0.515329, 0.468481, 0.468481, 0.468481),
log_veh_value = c(0.38675, 0.38675, 0.42543, 0.38675, 0.38675),
agecat.1 = c(0, 0, 0, 1, 0),
agecat.3 = c(0, 0, 0, 0, 1)
)
new.data
## exposure log_veh_value agecat.1 agecat.3
## 1 0.468481 0.38675 0 0
## 2 0.515329 0.38675 0 0
## 3 0.468481 0.42543 0 0
## 4 0.468481 0.38675 1 0
## 5 0.468481 0.38675 0 1
predict(logit.final, newdata = new.data, type = "response")
## 1 2 3 4 5
## 0.07183724 0.07845733 0.07224598 0.09099912 0.05693029