In this homework assignment, you will explore, analyze and model a data set containing approximately 8000 records representing a customer at an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0. A “1” means that the person was in a car crash. A zero means that the person was not in a car crash. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car. But if they did crash their car, this number will be a value greater than zero.
Your objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car. You can only use the variables given to you (or variables that you derive from the variables provided)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(psych)
##Load dataset
ins.train <- read.csv("insurance_training_data.csv", header= TRUE)
str(ins.train)
## 'data.frame': 8161 obs. of 26 variables:
## $ INDEX : int 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET_FLAG: int 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num 0 0 0 0 0 ...
## $ KIDSDRIV : int 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : int 60 43 35 51 50 34 54 37 34 50 ...
## $ HOMEKIDS : int 0 0 1 0 0 1 0 2 0 0 ...
## $ YOJ : int 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : Factor w/ 6613 levels "","$0","$1,007",..: 5033 6292 1250 1 509 746 1488 315 4765 282 ...
## $ PARENT1 : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
## $ HOME_VAL : Factor w/ 5107 levels "","$0","$100,093",..: 2 3259 348 3917 3034 2 1 4167 2 2 ...
## $ MSTATUS : Factor w/ 2 levels "Yes","z_No": 2 2 1 1 1 2 1 1 2 2 ...
## $ SEX : Factor w/ 2 levels "M","z_F": 1 1 2 1 2 2 2 1 2 1 ...
## $ EDUCATION : Factor w/ 5 levels "<High School",..: 4 5 5 1 4 2 1 2 2 2 ...
## $ JOB : Factor w/ 9 levels "","Clerical",..: 7 9 2 9 3 9 9 9 2 7 ...
## $ TRAVTIME : int 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : Factor w/ 2 levels "Commercial","Private": 2 1 2 2 2 1 2 1 2 1 ...
## $ BLUEBOOK : Factor w/ 2789 levels "$1,500","$1,520",..: 434 503 2212 553 802 746 2672 701 135 852 ...
## $ TIF : int 11 1 4 7 1 1 1 1 1 7 ...
## $ CAR_TYPE : Factor w/ 6 levels "Minivan","Panel Truck",..: 1 1 6 1 6 4 6 5 6 5 ...
## $ RED_CAR : Factor w/ 2 levels "no","yes": 2 2 1 2 1 1 1 2 1 1 ...
## $ OLDCLAIM : Factor w/ 2857 levels "$0","$1,000",..: 1449 1 1311 1 432 1 1 510 1 1 ...
## $ CLM_FREQ : int 2 0 2 0 2 0 0 1 0 0 ...
## $ REVOKED : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
## $ MVR_PTS : int 3 0 3 0 3 0 0 10 0 1 ...
## $ CAR_AGE : int 18 1 10 6 17 7 1 7 1 17 ...
## $ URBANICITY : Factor w/ 2 levels "Highly Urban/ Urban",..: 1 1 1 1 1 1 1 1 1 2 ...
There are 8161 rows and 23 attributes of actuarial data, representing behavioral and demographic information about a specific policyholder. Two response variables: TARGET_FLAG: Indicates whether a client has been in a car accident, TARGET_AMOUNT: The insurance claim payout related to that car accident
The original dataset contains 13 numeric predictors (some are erroneously defined as factors and viceversa), 7 binary categorical variables and 3 multi-categorical ones
## Preliminary data prep - Convert erroneous factor varables into numeric and viceversa so it can be plotted easily
ins.train$KIDSDRIV <- as.numeric(ins.train$KIDSDRIV)
ins.train$HOMEKIDS <- as.factor(ins.train$HOMEKIDS)
ins.train$INCOME <- as.numeric(ins.train$INCOME)
ins.train$HOME_VAL <- as.numeric(ins.train$HOME_VAL)
ins.train$BLUEBOOK <- as.numeric(ins.train$BLUEBOOK)
ins.train$OLDCLAIM <- as.numeric(ins.train$OLDCLAIM)
#Separate theoretical predictors with known effect on the dependent variables
ins_effect_coll <- select(ins.train, 2,3,4,5,7,8,10,11,13,14,15,16,18,21,22,23,24)
ins_effect_amt <- select(ins.train, 2,3,17,19,25)
par(mfrow=c(3,3))
boxplot(ins_effect_coll$AGE ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
boxplot(ins_effect_coll$YOJ ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
boxplot(ins_effect_coll$INCOME ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
boxplot(ins_effect_coll$HOME_VAL ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
boxplot(ins_effect_coll$TRAVTIME ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
boxplot(ins_effect_coll$TIF ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
boxplot(ins_effect_coll$OLDCLAIM ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
boxplot(ins_effect_coll$CLM_FREQ ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
boxplot(ins_effect_coll$MVR_PTS ~ ins_effect_coll$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_coll$TARGET_FLAG))))
c1 <- ggplot(data = ins_effect_coll, aes(x=TARGET_FLAG, fill=as.factor(KIDSDRIV))) + geom_bar(position="dodge")
c2 <- ggplot(data = ins_effect_coll, aes(x=TARGET_FLAG, fill=MSTATUS)) + geom_bar(position="dodge")
c3 <- ggplot(data = ins_effect_coll, aes(x=TARGET_FLAG, fill=EDUCATION)) + geom_bar(position="dodge")
c4 <- ggplot(data = ins_effect_coll, aes(x=TARGET_FLAG, fill=JOB)) + geom_bar(position="dodge")
c5 <- ggplot(data = ins_effect_coll, aes(x=TARGET_FLAG, fill=CAR_USE)) + geom_bar(position="dodge")
c6 <- ggplot(data = ins_effect_coll, aes(x=TARGET_FLAG, fill=REVOKED)) + geom_bar(position="dodge")
ggarrange(c1,c2,c3,c4,c5,c6, ncol = 2, nrow = 3)
par(mfrow=c(1,2))
boxplot(ins_effect_amt$BLUEBOOK ~ ins_effect_amt$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_amt$TARGET_FLAG))))
boxplot(ins_effect_amt$CAR_AGE ~ ins_effect_amt$TARGET_FLAG, horizontal=TRUE, col=rainbow(length(unique(ins_effect_amt$TARGET_FLAG))))
The following variables seem to have strong correlation to the response variable TARGET_FLAG: - HOME_VAL, OLDCLAIM, CLM_FREQ and MVR_PTS The following variables seem to have mild correlation to the response variable TARGET_FLAG: - AGE, YOJ, INCOME, TRAVTIME, TIF, EDUCATION, CAR_AGE, CAR_TYPE The following variables seem to have no representative correlation to the response variable TARGET_FLAG: - KIDSRV, MSTATUS, JOB, CAR_USE, REVOKED, BLUEBOOK
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE YOJ INCOME
## TARGET_FLAG 1.00000000 0.534246061 0.103668296 NA NA -0.033836495
## TARGET_AMT 0.53424606 1.000000000 0.055394177 NA NA -0.008419273
## KIDSDRIV 0.10366830 0.055394177 1.000000000 NA NA -0.026828869
## AGE NA NA NA 1 NA NA
## YOJ NA NA NA NA 1 NA
## INCOME -0.03383649 -0.008419273 -0.026828869 NA NA 1.000000000
## HOME_VAL -0.14857148 -0.076824577 -0.013791248 NA NA 0.026269850
## TRAVTIME 0.04836831 0.027987016 0.008447299 NA NA -0.010833149
## TIF -0.08237005 -0.046480831 -0.001988715 NA NA 0.018719317
## OLDCLAIM 0.19028750 0.097147800 0.029131208 NA NA 0.008749478
## CLM_FREQ 0.21619606 0.116419159 0.037062929 NA NA 0.016492078
## MVR_PTS 0.21919705 0.137865509 0.053566373 NA NA 0.012043161
## HOME_VAL TRAVTIME TIF OLDCLAIM
## TARGET_FLAG -0.148571484 0.048368310 -0.082370050 0.190287499
## TARGET_AMT -0.076824577 0.027987016 -0.046480831 0.097147800
## KIDSDRIV -0.013791248 0.008447299 -0.001988715 0.029131208
## AGE NA NA NA NA
## YOJ NA NA NA NA
## INCOME 0.026269850 -0.010833149 0.018719317 0.008749478
## HOME_VAL 1.000000000 -0.018648579 -0.005136827 -0.083497284
## TRAVTIME -0.018648579 1.000000000 -0.011604626 -0.008827686
## TIF -0.005136827 -0.011604626 1.000000000 -0.020576485
## OLDCLAIM -0.083497284 -0.008827686 -0.020576485 1.000000000
## CLM_FREQ -0.081004667 0.006560211 -0.023022955 0.701444946
## MVR_PTS -0.072864451 0.010598511 -0.041045734 0.382824464
## CLM_FREQ MVR_PTS
## TARGET_FLAG 0.216196061 0.21919705
## TARGET_AMT 0.116419159 0.13786551
## KIDSDRIV 0.037062929 0.05356637
## AGE NA NA
## YOJ NA NA
## INCOME 0.016492078 0.01204316
## HOME_VAL -0.081004667 -0.07286445
## TRAVTIME 0.006560211 0.01059851
## TIF -0.023022955 -0.04104573
## OLDCLAIM 0.701444946 0.38282446
## CLM_FREQ 1.000000000 0.39663837
## MVR_PTS 0.396638373 1.00000000
## TARGET_FLAG TARGET_AMT BLUEBOOK CAR_AGE
## TARGET_FLAG 1.00000000 0.53424606 0.05044526 NA
## TARGET_AMT 0.53424606 1.00000000 0.02359552 NA
## BLUEBOOK 0.05044526 0.02359552 1.00000000 NA
## CAR_AGE NA NA NA 1
The correlation matrices on all the main numeric variables confirm the graphic exploration done already, even though the correlation coefficients are not very strong. The highest overall correspond to: OLDCLAIM, CLM_FREQ and MVR_PTS (>0.19), next: KIDSDRIV, AGE, HOME_VAL (>.10)
## 'data.frame': 8161 obs. of 17 variables:
## $ TARGET_FLAG: int 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num 0 0 0 0 0 ...
## $ KIDSDRIV : num 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : int 60 43 35 51 50 34 54 37 34 50 ...
## $ YOJ : int 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : num 5033 6292 1250 1 509 ...
## $ HOME_VAL : num 2 3259 348 3917 3034 ...
## $ MSTATUS : Factor w/ 2 levels "Yes","z_No": 2 2 1 1 1 2 1 1 2 2 ...
## $ EDUCATION : Factor w/ 5 levels "<High School",..: 4 5 5 1 4 2 1 2 2 2 ...
## $ JOB : Factor w/ 9 levels "","Clerical",..: 7 9 2 9 3 9 9 9 2 7 ...
## $ TRAVTIME : int 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : Factor w/ 2 levels "Commercial","Private": 2 1 2 2 2 1 2 1 2 1 ...
## $ TIF : int 11 1 4 7 1 1 1 1 1 7 ...
## $ OLDCLAIM : num 1449 1 1311 1 432 ...
## $ CLM_FREQ : int 2 0 2 0 2 0 0 1 0 0 ...
## $ REVOKED : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
## $ MVR_PTS : int 3 0 3 0 3 0 0 10 0 1 ...
## 'data.frame': 8161 obs. of 5 variables:
## $ TARGET_FLAG: int 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num 0 0 0 0 0 ...
## $ BLUEBOOK : num 434 503 2212 553 802 ...
## $ CAR_TYPE : Factor w/ 6 levels "Minivan","Panel Truck",..: 1 1 6 1 6 4 6 5 6 5 ...
## $ CAR_AGE : int 18 1 10 6 17 7 1 7 1 17 ...
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE
## Min. :0.0000 Min. : 0 Min. :0.0000 Min. :16.00
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:0.0000 1st Qu.:39.00
## Median :0.0000 Median : 0 Median :0.0000 Median :45.00
## Mean :0.2638 Mean : 1504 Mean :0.1711 Mean :44.79
## 3rd Qu.:1.0000 3rd Qu.: 1036 3rd Qu.:0.0000 3rd Qu.:51.00
## Max. :1.0000 Max. :107586 Max. :4.0000 Max. :81.00
## NA's :6
## YOJ INCOME HOME_VAL MSTATUS
## Min. : 0.0 Min. : 1 Min. : 1 Yes :4894
## 1st Qu.: 9.0 1st Qu.: 926 1st Qu.: 2 z_No:3267
## Median :11.0 Median :2817 Median :1245
## Mean :10.5 Mean :2876 Mean :1685
## 3rd Qu.:13.0 3rd Qu.:4701 3rd Qu.:3164
## Max. :23.0 Max. :6613 Max. :5107
## NA's :454
## EDUCATION JOB TRAVTIME
## <High School :1203 z_Blue Collar:1825 Min. : 5.00
## Bachelors :2242 Clerical :1271 1st Qu.: 22.00
## Masters :1658 Professional :1117 Median : 33.00
## PhD : 728 Manager : 988 Mean : 33.49
## z_High School:2330 Lawyer : 835 3rd Qu.: 44.00
## Student : 712 Max. :142.00
## (Other) :1413
## CAR_USE TIF OLDCLAIM CLM_FREQ
## Commercial:3029 Min. : 1.000 Min. : 1.0 Min. :0.0000
## Private :5132 1st Qu.: 1.000 1st Qu.: 1.0 1st Qu.:0.0000
## Median : 4.000 Median : 1.0 Median :0.0000
## Mean : 5.351 Mean : 552.3 Mean :0.7986
## 3rd Qu.: 7.000 3rd Qu.:1015.0 3rd Qu.:2.0000
## Max. :25.000 Max. :2857.0 Max. :5.0000
##
## REVOKED MVR_PTS
## No :7161 Min. : 0.000
## Yes:1000 1st Qu.: 0.000
## Median : 1.000
## Mean : 1.696
## 3rd Qu.: 3.000
## Max. :13.000
##
## TARGET_FLAG TARGET_AMT BLUEBOOK CAR_TYPE
## Min. :0.0000 Min. : 0 Min. : 1 Minivan :2145
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.: 478 Panel Truck: 676
## Median :0.0000 Median : 0 Median :1124 Pickup :1389
## Mean :0.2638 Mean : 1504 Mean :1284 Sports Car : 907
## 3rd Qu.:1.0000 3rd Qu.: 1036 3rd Qu.:2234 Van : 750
## Max. :1.0000 Max. :107586 Max. :2789 z_SUV :2294
##
## CAR_AGE
## Min. :-3.000
## 1st Qu.: 1.000
## Median : 8.000
## Mean : 8.328
## 3rd Qu.:12.000
## Max. :28.000
## NA's :510
#Data Imputation for Missing values (NAs), wrong values or to improve model training
ins_effect_coll$AGE[is.na(ins_effect_coll$AGE)] <- median(ins_effect_coll$AGE, na.rm = T)
ins_effect_coll$YOJ[is.na(ins_effect_coll$YOJ)] <- median(ins_effect_coll$YOJ, na.rm = T)
ins_effect_amt$CAR_AGE[is.na(ins_effect_amt$CAR_AGE)] <- median(ins_effect_amt$CAR_AGE, na.rm = T)
# Binarize or reduce multiple level factor variables
ins_effect_coll$KIDSDRIV <- as.factor(ins_effect_coll$KIDSDRIV)
levels(ins_effect_coll$KIDSDRIV) <- list("0"=c("0"),"1"=c("1","2","3","4"))
levels(ins_effect_coll$EDUCATION) <- list("Low"=c("High School", "z_High School"),"High"=c("Bachelors","Masters","PhD" ))
ins_effect_amt$CAR_AGE <- as.factor(ins_effect_amt$CAR_AGE)
levels(ins_effect_amt$CAR_AGE) <- list("New"=c("0","1","2","3"),"Medium"=c("4","5","6","7"), "Old"=as.character(c(8:28)))
levels(ins_effect_coll$JOB) <- list("Blue"=c("Student", "z_Blue Collar","Home Maker",""),"White"=c("Clerical", "Doctor", "Lawyer","Manager", "Professional"))
# Join datasets
ins.train.prep <- cbind(ins_effect_coll, ins_effect_amt[,-(1:2)])
ins.train.prep <- na.omit(ins.train.prep) #avoiding errors in Step() functions
log_reg_Mod1 <- glm(formula = TARGET_FLAG ~ HOME_VAL + OLDCLAIM + CLM_FREQ + MVR_PTS +
AGE + YOJ + INCOME + TRAVTIME + TIF + EDUCATION + CAR_AGE + CAR_TYPE,
family = binomial(link = "logit"), data = ins.train.prep)
summary(log_reg_Mod1)
##
## Call:
## glm(formula = TARGET_FLAG ~ HOME_VAL + OLDCLAIM + CLM_FREQ +
## MVR_PTS + AGE + YOJ + INCOME + TRAVTIME + TIF + EDUCATION +
## CAR_AGE + CAR_TYPE, family = binomial(link = "logit"), data = ins.train.prep)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9628 -0.7694 -0.5408 0.7871 2.5075
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.311e-01 2.005e-01 -3.148 0.00164 **
## HOME_VAL -1.667e-04 1.876e-05 -8.887 < 2e-16 ***
## OLDCLAIM 1.157e-04 4.377e-05 2.645 0.00818 **
## CLM_FREQ 2.240e-01 3.263e-02 6.864 6.72e-12 ***
## MVR_PTS 1.271e-01 1.402e-02 9.070 < 2e-16 ***
## AGE -1.897e-02 3.504e-03 -5.414 6.15e-08 ***
## YOJ -1.674e-02 7.867e-03 -2.128 0.03330 *
## INCOME -2.787e-06 1.506e-05 -0.185 0.85314
## TRAVTIME 8.365e-03 1.849e-03 4.525 6.04e-06 ***
## TIF -4.627e-02 7.450e-03 -6.211 5.26e-10 ***
## EDUCATIONHigh -4.545e-01 7.195e-02 -6.317 2.67e-10 ***
## CAR_AGEMedium -1.113e-01 1.009e-01 -1.103 0.27002
## CAR_AGEOld -7.308e-02 8.288e-02 -0.882 0.37788
## CAR_TYPEPanel Truck 8.151e-01 1.193e-01 6.833 8.30e-12 ***
## CAR_TYPEPickup 8.035e-01 9.735e-02 8.254 < 2e-16 ***
## CAR_TYPESports Car 9.729e-01 1.073e-01 9.064 < 2e-16 ***
## CAR_TYPEVan 8.236e-01 1.157e-01 7.116 1.11e-12 ***
## CAR_TYPEz_SUV 7.748e-01 8.675e-02 8.931 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7884.7 on 6956 degrees of freedom
## Residual deviance: 6996.8 on 6939 degrees of freedom
## AIC: 7032.8
##
## Number of Fisher Scoring iterations: 4
## Start: AIC=7032.78
## TARGET_FLAG ~ HOME_VAL + OLDCLAIM + CLM_FREQ + MVR_PTS + AGE +
## YOJ + INCOME + TRAVTIME + TIF + EDUCATION + CAR_AGE + CAR_TYPE
##
## Df Deviance AIC
## - CAR_AGE 2 6998.1 7030.1
## - INCOME 1 6996.8 7030.8
## <none> 6996.8 7032.8
## - YOJ 1 7001.3 7035.3
## - OLDCLAIM 1 7003.7 7037.7
## - TRAVTIME 1 7017.2 7051.2
## - AGE 1 7026.3 7060.3
## - EDUCATION 1 7036.5 7070.5
## - TIF 1 7036.7 7070.7
## - CLM_FREQ 1 7042.9 7076.9
## - HOME_VAL 1 7078.1 7112.1
## - MVR_PTS 1 7079.2 7113.2
## - CAR_TYPE 5 7130.0 7156.0
##
## Step: AIC=7030.12
## TARGET_FLAG ~ HOME_VAL + OLDCLAIM + CLM_FREQ + MVR_PTS + AGE +
## YOJ + INCOME + TRAVTIME + TIF + EDUCATION + CAR_TYPE
##
## Df Deviance AIC
## - INCOME 1 6998.1 7028.1
## <none> 6998.1 7030.1
## - YOJ 1 7002.6 7032.6
## - OLDCLAIM 1 7005.2 7035.2
## - TRAVTIME 1 7018.5 7048.5
## - AGE 1 7027.6 7057.6
## - TIF 1 7037.9 7067.9
## - CLM_FREQ 1 7044.1 7074.1
## - EDUCATION 1 7056.4 7086.4
## - HOME_VAL 1 7079.2 7109.2
## - MVR_PTS 1 7080.4 7110.4
## - CAR_TYPE 5 7131.6 7153.6
##
## Step: AIC=7028.15
## TARGET_FLAG ~ HOME_VAL + OLDCLAIM + CLM_FREQ + MVR_PTS + AGE +
## YOJ + TRAVTIME + TIF + EDUCATION + CAR_TYPE
##
## Df Deviance AIC
## <none> 6998.1 7028.1
## - YOJ 1 7003.4 7031.4
## - OLDCLAIM 1 7005.2 7033.2
## - TRAVTIME 1 7018.5 7046.5
## - AGE 1 7027.6 7055.6
## - TIF 1 7037.9 7065.9
## - CLM_FREQ 1 7044.1 7072.1
## - EDUCATION 1 7056.9 7084.9
## - HOME_VAL 1 7079.2 7107.2
## - MVR_PTS 1 7080.4 7108.4
## - CAR_TYPE 5 7132.1 7152.1
##
## Call: glm(formula = TARGET_FLAG ~ HOME_VAL + OLDCLAIM + CLM_FREQ +
## MVR_PTS + AGE + YOJ + TRAVTIME + TIF + EDUCATION + CAR_TYPE,
## family = binomial(link = "logit"), data = ins.train.prep)
##
## Coefficients:
## (Intercept) HOME_VAL OLDCLAIM
## -0.6850840 -0.0001665 0.0001170
## CLM_FREQ MVR_PTS AGE
## 0.2233070 0.1269131 -0.0189255
## YOJ TRAVTIME TIF
## -0.0171008 0.0083516 -0.0461787
## EDUCATIONHigh CAR_TYPEPanel Truck CAR_TYPEPickup
## -0.4803310 0.8177697 0.8047035
## CAR_TYPESports Car CAR_TYPEVan CAR_TYPEz_SUV
## 0.9754832 0.8256120 0.7741464
##
## Degrees of Freedom: 6956 Total (i.e. Null); 6942 Residual
## Null Deviance: 7885
## Residual Deviance: 6998 AIC: 7028
Modeling approach is based on Step Backward selection optimizing for AIC. Final iteration removed the INCOME and CAR_AGE predictors, leaving 10 predictors total achieving the lowest AIC (7028).
log_reg_Mod2 <- glm(formula = TARGET_FLAG ~ HOME_VAL + OLDCLAIM + log(CLM_FREQ + 1) + log(MVR_PTS + 1) +
AGE + YOJ + log(INCOME + 1) + log(TRAVTIME) + log(TIF) + EDUCATION + CAR_AGE + CAR_TYPE,
family = binomial(link = "logit"), data = ins.train.prep)
summary(log_reg_Mod2)
##
## Call:
## glm(formula = TARGET_FLAG ~ HOME_VAL + OLDCLAIM + log(CLM_FREQ +
## 1) + log(MVR_PTS + 1) + AGE + YOJ + log(INCOME + 1) + log(TRAVTIME) +
## log(TIF) + EDUCATION + CAR_AGE + CAR_TYPE, family = binomial(link = "logit"),
## data = ins.train.prep)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9631 -0.7688 -0.5380 0.8151 2.5944
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.190e+00 2.618e-01 -4.546 5.48e-06 ***
## HOME_VAL -1.682e-04 1.874e-05 -8.976 < 2e-16 ***
## OLDCLAIM 4.294e-05 4.702e-05 0.913 0.361
## log(CLM_FREQ + 1) 6.306e-01 7.552e-02 8.351 < 2e-16 ***
## log(MVR_PTS + 1) 3.248e-01 4.264e-02 7.617 2.59e-14 ***
## AGE -1.940e-02 3.504e-03 -5.536 3.10e-08 ***
## YOJ -1.048e-02 8.760e-03 -1.196 0.232
## log(INCOME + 1) -2.081e-02 1.375e-02 -1.514 0.130
## log(TRAVTIME) 2.633e-01 5.102e-02 5.160 2.48e-07 ***
## log(TIF) -1.987e-01 3.087e-02 -6.436 1.23e-10 ***
## EDUCATIONHigh -4.529e-01 7.178e-02 -6.310 2.79e-10 ***
## CAR_AGEMedium -1.261e-01 1.010e-01 -1.248 0.212
## CAR_AGEOld -8.635e-02 8.287e-02 -1.042 0.297
## CAR_TYPEPanel Truck 8.033e-01 1.194e-01 6.729 1.71e-11 ***
## CAR_TYPEPickup 8.042e-01 9.737e-02 8.259 < 2e-16 ***
## CAR_TYPESports Car 9.760e-01 1.074e-01 9.085 < 2e-16 ***
## CAR_TYPEVan 8.284e-01 1.158e-01 7.154 8.40e-13 ***
## CAR_TYPEz_SUV 7.732e-01 8.683e-02 8.904 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7884.7 on 6956 degrees of freedom
## Residual deviance: 6982.9 on 6939 degrees of freedom
## AIC: 7018.9
##
## Number of Fisher Scoring iterations: 4
## Start: AIC=7018.95
## TARGET_FLAG ~ HOME_VAL + OLDCLAIM + log(CLM_FREQ + 1) + log(MVR_PTS +
## 1) + AGE + YOJ + log(INCOME + 1) + log(TRAVTIME) + log(TIF) +
## EDUCATION + CAR_AGE + CAR_TYPE
##
## Df Deviance AIC
## - CAR_AGE 2 6984.7 7016.7
## - OLDCLAIM 1 6983.8 7017.8
## - YOJ 1 6984.4 7018.4
## <none> 6982.9 7018.9
## - log(INCOME + 1) 1 6985.2 7019.2
## - log(TRAVTIME) 1 7010.5 7044.5
## - AGE 1 7013.8 7047.8
## - EDUCATION 1 7022.5 7056.5
## - log(TIF) 1 7024.4 7058.4
## - log(MVR_PTS + 1) 1 7041.2 7075.2
## - log(CLM_FREQ + 1) 1 7051.2 7085.2
## - HOME_VAL 1 7065.8 7099.8
## - CAR_TYPE 5 7115.9 7141.9
##
## Step: AIC=7016.7
## TARGET_FLAG ~ HOME_VAL + OLDCLAIM + log(CLM_FREQ + 1) + log(MVR_PTS +
## 1) + AGE + YOJ + log(INCOME + 1) + log(TRAVTIME) + log(TIF) +
## EDUCATION + CAR_TYPE
##
## Df Deviance AIC
## - OLDCLAIM 1 6985.6 7015.6
## - YOJ 1 6986.1 7016.1
## <none> 6984.7 7016.7
## - log(INCOME + 1) 1 6986.9 7016.9
## - log(TRAVTIME) 1 7012.2 7042.2
## - AGE 1 7015.6 7045.6
## - log(TIF) 1 7025.9 7055.9
## - log(MVR_PTS + 1) 1 7042.6 7072.6
## - EDUCATION 1 7044.0 7074.0
## - log(CLM_FREQ + 1) 1 7052.7 7082.7
## - HOME_VAL 1 7067.3 7097.3
## - CAR_TYPE 5 7118.0 7140.0
##
## Step: AIC=7015.6
## TARGET_FLAG ~ HOME_VAL + log(CLM_FREQ + 1) + log(MVR_PTS + 1) +
## AGE + YOJ + log(INCOME + 1) + log(TRAVTIME) + log(TIF) +
## EDUCATION + CAR_TYPE
##
## Df Deviance AIC
## - YOJ 1 6987.0 7015.0
## <none> 6985.6 7015.6
## - log(INCOME + 1) 1 6987.9 7015.9
## - log(TRAVTIME) 1 7012.9 7040.9
## - AGE 1 7016.5 7044.5
## - log(TIF) 1 7026.8 7054.8
## - EDUCATION 1 7044.7 7072.7
## - log(MVR_PTS + 1) 1 7045.1 7073.1
## - HOME_VAL 1 7068.8 7096.8
## - CAR_TYPE 5 7119.4 7139.4
## - log(CLM_FREQ + 1) 1 7137.2 7165.2
##
## Step: AIC=7014.96
## TARGET_FLAG ~ HOME_VAL + log(CLM_FREQ + 1) + log(MVR_PTS + 1) +
## AGE + log(INCOME + 1) + log(TRAVTIME) + log(TIF) + EDUCATION +
## CAR_TYPE
##
## Df Deviance AIC
## <none> 6987.0 7015.0
## - log(INCOME + 1) 1 6993.2 7019.2
## - log(TRAVTIME) 1 7014.4 7040.4
## - AGE 1 7019.6 7045.6
## - log(TIF) 1 7028.4 7054.4
## - EDUCATION 1 7046.3 7072.3
## - log(MVR_PTS + 1) 1 7046.9 7072.9
## - HOME_VAL 1 7072.4 7098.4
## - CAR_TYPE 5 7121.9 7139.9
## - log(CLM_FREQ + 1) 1 7138.5 7164.5
##
## Call: glm(formula = TARGET_FLAG ~ HOME_VAL + log(CLM_FREQ + 1) + log(MVR_PTS +
## 1) + AGE + log(INCOME + 1) + log(TRAVTIME) + log(TIF) + EDUCATION +
## CAR_TYPE, family = binomial(link = "logit"), data = ins.train.prep)
##
## Coefficients:
## (Intercept) HOME_VAL log(CLM_FREQ + 1)
## -1.2735595 -0.0001702 0.6781152
## log(MVR_PTS + 1) AGE log(INCOME + 1)
## 0.3282210 -0.0198132 -0.0292323
## log(TRAVTIME) log(TIF) EDUCATIONHigh
## 0.2627794 -0.1985378 -0.4830558
## CAR_TYPEPanel Truck CAR_TYPEPickup CAR_TYPESports Car
## 0.8048125 0.8076874 0.9860129
## CAR_TYPEVan CAR_TYPEz_SUV
## 0.8302137 0.7766723
##
## Degrees of Freedom: 6956 Total (i.e. Null); 6943 Residual
## Null Deviance: 7885
## Residual Deviance: 6987 AIC: 7015
Model 2 implemented some log() transformations (to normalize scales and minimize strongly skewed distributions (not normally distributed)) to the following predictors: CLM_FREQ, MVR_PTS, INCOME, TRAVTIME and TIF
Final iteration removed the OLDCLAIM, YOJ, CAR_AGE predictors, leaving 9 predictors total achieving the lowest AIC (7015)
#Considering data only for TARGET_FLAG = 1, which are the car crash incidents
ins.train.prep.positives <- filter(ins.train.prep, TARGET_FLAG == 1)
lm_Mod1 <- lm(formula = TARGET_AMT ~ HOME_VAL + OLDCLAIM + log(CLM_FREQ + 1) + log(MVR_PTS + 1) + AGE + YOJ + log(INCOME + 1) + log(TRAVTIME) + log(TIF) + EDUCATION + CAR_AGE + CAR_TYPE + log(BLUEBOOK), data = ins.train.prep)
summary(lm_Mod1)
##
## Call:
## lm(formula = TARGET_AMT ~ HOME_VAL + OLDCLAIM + log(CLM_FREQ +
## 1) + log(MVR_PTS + 1) + AGE + YOJ + log(INCOME + 1) + log(TRAVTIME) +
## log(TIF) + EDUCATION + CAR_AGE + CAR_TYPE + log(BLUEBOOK),
## data = ins.train.prep)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4061 -1628 -911 65 103666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 510.91408 554.09626 0.922 0.356526
## HOME_VAL -0.16255 0.03348 -4.855 1.23e-06 ***
## OLDCLAIM -0.08718 0.09766 -0.893 0.372064
## log(CLM_FREQ + 1) 754.25858 154.09777 4.895 1.01e-06 ***
## log(MVR_PTS + 1) 484.68112 81.95119 5.914 3.49e-09 ***
## AGE -8.24734 6.55319 -1.259 0.208245
## YOJ -0.15480 16.42355 -0.009 0.992480
## log(INCOME + 1) 9.43362 25.75735 0.366 0.714190
## log(TRAVTIME) 245.12131 89.33494 2.744 0.006088 **
## log(TIF) -221.48023 57.12097 -3.877 0.000107 ***
## EDUCATIONHigh -218.64252 138.22362 -1.582 0.113740
## CAR_AGEMedium -303.08524 193.90055 -1.563 0.118076
## CAR_AGEOld -393.46976 159.25759 -2.471 0.013511 *
## CAR_TYPEPanel Truck 1071.62400 215.29814 4.977 6.60e-07 ***
## CAR_TYPEPickup 633.14802 173.33101 3.653 0.000261 ***
## CAR_TYPESports Car 841.83176 197.57983 4.261 2.06e-05 ***
## CAR_TYPEVan 1141.84661 206.65635 5.525 3.41e-08 ***
## CAR_TYPEz_SUV 635.77059 148.43830 4.283 1.87e-05 ***
## log(BLUEBOOK) 40.07382 41.06419 0.976 0.329157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4536 on 6938 degrees of freedom
## Multiple R-squared: 0.0392, Adjusted R-squared: 0.03671
## F-statistic: 15.73 on 18 and 6938 DF, p-value: < 2.2e-16
#Removing the least significant predictors to simplify model
lm_Mod2 <- lm(formula = log(TARGET_AMT + 1) ~ log(CLM_FREQ + 1) + log(MVR_PTS + 1) + AGE + log(INCOME + 1) + CAR_TYPE + log(BLUEBOOK),
data = ins.train.prep.positives)
summary(lm_Mod2)
##
## Call:
## lm(formula = log(TARGET_AMT + 1) ~ log(CLM_FREQ + 1) + log(MVR_PTS +
## 1) + AGE + log(INCOME + 1) + CAR_TYPE + log(BLUEBOOK), data = ins.train.prep.positives)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7317 -0.3914 0.0347 0.3868 3.1546
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.880277 0.143853 54.780 <2e-16 ***
## log(CLM_FREQ + 1) -0.059759 0.034717 -1.721 0.0854 .
## log(MVR_PTS + 1) 0.071457 0.025707 2.780 0.0055 **
## AGE 0.003024 0.002010 1.504 0.1327
## log(INCOME + 1) 0.014105 0.007186 1.963 0.0498 *
## CAR_TYPEPanel Truck 0.183793 0.081185 2.264 0.0237 *
## CAR_TYPEPickup -0.008992 0.065804 -0.137 0.8913
## CAR_TYPESports Car -0.043755 0.070290 -0.622 0.5337
## CAR_TYPEVan 0.080147 0.077740 1.031 0.3027
## CAR_TYPEz_SUV -0.009831 0.059227 -0.166 0.8682
## log(BLUEBOOK) 0.019397 0.013394 1.448 0.1477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8021 on 1756 degrees of freedom
## Multiple R-squared: 0.01697, Adjusted R-squared: 0.01137
## F-statistic: 3.031 on 10 and 1756 DF, p-value: 0.0008198
#Residual Analysis
plot(lm_Mod2$residuals ~ lm_Mod2$fitted.values)
abline(h = 0, lty = 3, col="red")
#Applying a Box-Cox Transformation to the response variable on Model2
library(MASS)
b <- boxcox(log(TARGET_AMT + 1) ~ log(CLM_FREQ + 1) + log(MVR_PTS + 1) + AGE + log(INCOME + 1) + CAR_TYPE + log(BLUEBOOK),
data = ins.train.prep.positives)
## lambda lik
## [1,] 1.111111 -2483.535
## [2,] 1.151515 -2483.543
## [3,] 1.070707 -2483.622
## [4,] 1.191919 -2483.644
## [5,] 1.030303 -2483.804
lm_Mod3 <- lm(formula = (TARGET_AMT)^1.1 ~ log(CLM_FREQ + 1) + log(MVR_PTS + 1) + AGE + log(INCOME + 1) + CAR_TYPE + log(BLUEBOOK),
data = ins.train.prep.positives)
summary(lm_Mod3)
##
## Call:
## lm(formula = (TARGET_AMT)^1.1 ~ log(CLM_FREQ + 1) + log(MVR_PTS +
## 1) + AGE + log(INCOME + 1) + CAR_TYPE + log(BLUEBOOK), data = ins.train.prep.positives)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21197 -8748 -4526 227 321443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2228.57 4109.95 0.542 0.5877
## log(CLM_FREQ + 1) -204.38 991.87 -0.206 0.8368
## log(MVR_PTS + 1) 1261.37 734.47 1.717 0.0861 .
## AGE 98.84 57.44 1.721 0.0855 .
## log(INCOME + 1) 371.97 205.31 1.812 0.0702 .
## CAR_TYPEPanel Truck 5414.27 2319.48 2.334 0.0197 *
## CAR_TYPEPickup 116.35 1880.06 0.062 0.9507
## CAR_TYPESports Car 584.62 2008.21 0.291 0.7710
## CAR_TYPEVan 5582.30 2221.05 2.513 0.0120 *
## CAR_TYPEz_SUV 279.96 1692.15 0.165 0.8686
## log(BLUEBOOK) 430.88 382.67 1.126 0.2603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22920 on 1756 degrees of freedom
## Multiple R-squared: 0.01604, Adjusted R-squared: 0.01043
## F-statistic: 2.862 on 10 and 1756 DF, p-value: 0.001526
Model 1 implemented some log() transformations (to normalize scales and minimize strongly skewed distributions (not normally distributed)) to the following predictors: CLM_FREQ, MVR_PTS, INCOME, TRAVTIME, TIF and BLUEBOOK and most importantly to the response variable TARGET_AMT
Model 2, simplifying based on predictor significance shows a very low R_squared (0.01697). Reducing more decreases R_squared so the model selected is left with the minimum varaibles that maximize R_squared and keep F-Statistic significant It appears the available predictors cannot explain the varibility in the response variable, still the model could be useful if estimate predictions are enough.
Model 3, a Box Cox transformation was tried which resulted in a Lambda value of 1.1, applying that to the response variabel did not help compensating the non-normal distribution, so this alternative is discarded.
For predicting the TARGET_FLAG response variable, the logistic regression model selected is log_reg_Mod2, containing 9 predictors total achieving the lowest AIC (7015)
For predicting the TARGET_AMT response variable, the linear regression model selected is lm_Mod2, containing 6 predictors total achieving the highest R_squared (0.01697)
ins.eval <- read.csv("insurance-evaluation-data.csv", header= TRUE)
#Evaluation set data preparation
ins.eval$HOMEKIDS <- as.factor(ins.eval$HOMEKIDS)
ins.eval$INCOME <- as.numeric(ins.eval$INCOME)
ins.eval$HOME_VAL <- as.numeric(ins.eval$HOME_VAL)
ins.eval$BLUEBOOK <- as.numeric(ins.eval$BLUEBOOK)
ins.eval$OLDCLAIM <- as.numeric(ins.eval$OLDCLAIM)
ins.eval$KIDSDRIV <- as.factor(ins.eval$KIDSDRIV)
levels(ins.eval$KIDSDRIV) <- list("0"=c("0"),"1"=c("1","2","3","4"))
levels(ins.eval$EDUCATION) <- list("Low"=c("High School", "z_High School"),"High"=c("Bachelors","Masters","PhD" ))
ins.eval$CAR_AGE <- as.factor(ins.eval$CAR_AGE)
levels(ins.eval$CAR_AGE) <- list("New"=c("0","1","2","3"),"Medium"=c("4","5","6","7"), "Old"=as.character(c(8:28)))
levels(ins.eval$JOB) <- list("Blue"=c("Student", "z_Blue Collar","Home Maker",""),"White"=c("Clerical", "Doctor", "Lawyer","Manager", "Professional"))
pred_flag <- predict(log_reg_Mod2, newdata=ins.eval,type = "response")
#Binarizing model probability of an accident with 0.5 threshold
pred_flag[which(pred_flag <0.50)] <- 0
pred_flag[which(pred_flag >0.50)] <- 1
head(pred_flag,30)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 0 0 0 NA 0 0 0 NA 0 0 NA NA NA 0 0 NA NA 0 0 0 1 0 0 0 0
## 26 27 28 29 30
## NA 0 0 0 NA
ins.eval$PRED_FLAG <- as.numeric(pred_flag)
pred_amt <- predict(lm_Mod2, newdata=filter(ins.eval, PRED_FLAG == 1))
head(pred_amt)
## 1 2 3 4 5 6
## 8.272266 8.277675 8.241666 8.238499 8.302749 8.303779
ins.eval.amt <- filter(ins.eval, PRED_FLAG == 1)
ins.eval.amt$PRED_AMT <- as.numeric(exp(pred_amt))
head(ins.eval.amt$PRED_AMT)
## [1] 3913.809 3935.033 3795.861 3783.858 4034.951 4039.109