First, I will import the training and evaluation data.
itrain <- read.csv("insurance_training_data.csv")
ieval <- read.csv("insurance-evaluation-data.csv")
DATA EXPLORATION
First, I will look at the first few rows to get a sense of the data and see what type of variable each column contains.
head(itrain)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1 1 0 0 0 60 0 11 $67,349 No
## 2 2 0 0 0 43 0 11 $91,449 No
## 3 4 0 0 0 35 1 10 $16,039 No
## 4 5 0 0 0 51 0 14 No
## 5 6 0 0 0 50 0 NA $114,986 No
## 6 7 1 2946 0 34 1 12 $125,301 Yes
## HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE
## 1 $0 z_No M PhD Professional 14 Private
## 2 $257,252 z_No M z_High School z_Blue Collar 22 Commercial
## 3 $124,191 Yes z_F z_High School Clerical 5 Private
## 4 $306,251 Yes M <High School z_Blue Collar 32 Private
## 5 $243,925 Yes z_F PhD Doctor 36 Private
## 6 $0 z_No z_F Bachelors z_Blue Collar 46 Commercial
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1 $14,230 11 Minivan yes $4,461 2 No 3
## 2 $14,940 1 Minivan yes $0 0 No 0
## 3 $4,010 4 z_SUV no $38,690 2 No 3
## 4 $15,440 7 Minivan yes $0 0 No 0
## 5 $18,000 1 z_SUV no $19,217 2 Yes 3
## 6 $17,430 1 Sports Car no $0 0 No 0
## CAR_AGE URBANICITY
## 1 18 Highly Urban/ Urban
## 2 1 Highly Urban/ Urban
## 3 10 Highly Urban/ Urban
## 4 6 Highly Urban/ Urban
## 5 17 Highly Urban/ Urban
## 6 7 Highly Urban/ Urban
I will use box plots of each variable to identify outliers.
boxplot(itrain)
As we can see, TARGET_AMT has many large outliers that dominate the scale. I will drop INDEX, TARGET_FLAG, and TARGET_AMT and look again at the box plots of the remaining variables.
#drop INDEX, TARGET_FLAG, and TARGET_AMT (columns 1-3)
boxplot_itrain <- itrain[, -(1:3)]
boxplot(boxplot_itrain)
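Note that at this stage the dollar-valued columns have not yet been converted to numbers, so restricting the plot to the columns that are already numeric keeps the comparison meaningful; a small sketch:
#plot only the columns that are already numeric
boxplot(Filter(is.numeric, boxplot_itrain), las = 2)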
DATA PREPARATION
First, I will clean up the data: strip the dollar signs and commas from the currency fields, and replace spaces in the category labels with underscores.
#convert currency strings such as "$67,349" to numbers
money <- function(input) {
  out <- sub("\\$", "", input)
  out <- as.numeric(gsub(",", "", out))  #gsub removes every comma, not just the first
  return(out)
}
#replace the first space in a label with an underscore
#(later spaces, e.g. in the URBANICITY labels, are left as-is)
underscore <- function(input) {
  out <- sub(" ", "_", input)
  return(out)
}
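As a quick sanity check of the two helpers, using values visible in the head() output above:
money("$67,349")             # 67349
underscore("z_Blue Collar")  # "z_Blue_Collar"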
itrain <- as_tibble(itrain) %>%
mutate_at(c("INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM"),
money) %>%
mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
underscore) %>%
mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
as.factor) %>%
mutate(TARGET_FLAG = as.factor(TARGET_FLAG))
itrain$TARGET_AMT <- as.factor(itrain$TARGET_AMT)
#strip the leading "z_" prefix from the labelled factor levels
itrain$MSTATUS    <- as.factor(str_remove(itrain$MSTATUS, "^z_"))
itrain$SEX        <- as.factor(str_remove(itrain$SEX, "^z_"))
itrain$EDUCATION  <- as.factor(str_remove(itrain$EDUCATION, "^z_"))
itrain$JOB        <- as.factor(str_remove(itrain$JOB, "^z_"))
itrain$CAR_TYPE   <- as.factor(str_remove(itrain$CAR_TYPE, "^z_"))
itrain$URBANICITY <- as.factor(str_remove(itrain$URBANICITY, "^z_"))
Next, I will check for missing values and replace them with the mean of each variable.
#check for missing data
sapply(itrain, function(x) sum(is.na(x)))
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS
## 0 0 0 0 6 0
## YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX
## 454 445 0 464 0 0
## EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 0 0 0 0 0 0
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 0 0 0 0 0 0
## CAR_AGE URBANICITY
## 510 0
We see that YOJ, INCOME, HOME_VAL, and CAR_AGE have missing values. I will impute each with the mean of the observed values for that variable.
itrain$YOJ[is.na(itrain$YOJ)] <- mean(itrain$YOJ, na.rm = TRUE)
itrain$INCOME[is.na(itrain$INCOME)] <- mean(itrain$INCOME, na.rm = TRUE)
itrain$HOME_VAL[is.na(itrain$HOME_VAL)] <- mean(itrain$HOME_VAL, na.rm = TRUE)
itrain$CAR_AGE[is.na(itrain$CAR_AGE)] <- mean(itrain$CAR_AGE, na.rm = TRUE)
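Equivalently, the four imputations can be done in one pass; a sketch:
#impute every listed column's NAs with that column's mean
na_cols <- c("YOJ", "INCOME", "HOME_VAL", "CAR_AGE")
itrain[na_cols] <- lapply(itrain[na_cols],
                          function(x) replace(x, is.na(x), mean(x, na.rm = TRUE)))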
I will then drop any unused factor levels and convert back to a plain data frame.
itrain %>%
droplevels() %>%
as.data.frame() -> itrain
I will then log-transform the right-skewed count and claim variables, using log(x + 1) to handle the zeros.
m1_data <- itrain
#transform data using log for skewed variables
m1_data$HOMEKIDS <- log(m1_data$HOMEKIDS+1)
m1_data$MVR_PTS <- log(m1_data$MVR_PTS+1)
m1_data$OLDCLAIM <- log(m1_data$OLDCLAIM+1)
m1_data$TIF <- log(m1_data$TIF+1)
m1_data$KIDSDRIV <- log(m1_data$KIDSDRIV+1)
m1_data$CLM_FREQ <- log(m1_data$CLM_FREQ+1)
m1_data <- m1_data[, -1]  #drop the INDEX column
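To confirm that these variables were in fact right-skewed and that the transform helped, one could compare sample skewness before and after; a sketch assuming the e1071 package is available:
library(e1071)  #assumed available; provides skewness()
skewed <- c("KIDSDRIV", "HOMEKIDS", "TIF", "OLDCLAIM", "CLM_FREQ", "MVR_PTS")
sapply(itrain[skewed], skewness)   #before the log(x + 1) transform
sapply(m1_data[skewed], skewness)  #after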
BUILD MODELS
First, I will fit a binomial logistic regression on all predictors (excluding TARGET_AMT).
m1 <- glm(formula = TARGET_FLAG ~ . - TARGET_AMT, data = m1_data, family = binomial(link = "logit"))
summary(m1)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = m1_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5272 -0.7179 -0.3987 0.6466 3.1460
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.723e+00 3.468e-01 -7.851 4.12e-15 ***
## KIDSDRIV 6.918e-01 1.104e-01 6.268 3.66e-10 ***
## AGE -3.904e-04 4.084e-03 -0.096 0.923845
## HOMEKIDS 1.419e-01 8.315e-02 1.707 0.087900 .
## YOJ -1.296e-02 8.592e-03 -1.508 0.131536
## INCOME -3.465e-06 1.077e-06 -3.218 0.001289 **
## PARENT1Yes 3.245e-01 1.145e-01 2.835 0.004588 **
## HOME_VAL -1.327e-06 3.420e-07 -3.880 0.000105 ***
## MSTATUSYes -5.132e-01 8.494e-02 -6.042 1.52e-09 ***
## SEXM 9.408e-02 1.121e-01 0.840 0.401143
## EDUCATIONBachelors -3.723e-01 1.156e-01 -3.221 0.001276 **
## EDUCATIONHigh_School 2.257e-02 9.500e-02 0.238 0.812228
## EDUCATIONMasters -2.776e-01 1.786e-01 -1.554 0.120219
## EDUCATIONPhD -1.455e-01 2.136e-01 -0.681 0.495817
## JOBBlue_Collar 3.112e-01 1.853e-01 1.679 0.093131 .
## JOBClerical 3.984e-01 1.964e-01 2.029 0.042445 *
## JOBDoctor -4.225e-01 2.662e-01 -1.587 0.112475
## JOBHome_Maker 1.993e-01 2.099e-01 0.949 0.342556
## JOBLawyer 1.182e-01 1.693e-01 0.698 0.484943
## JOBManager -5.594e-01 1.712e-01 -3.268 0.001085 **
## JOBProfessional 1.698e-01 1.782e-01 0.953 0.340586
## JOBStudent 1.988e-01 2.141e-01 0.929 0.353126
## TRAVTIME 1.489e-02 1.881e-03 7.915 2.46e-15 ***
## CAR_USEPrivate -7.611e-01 9.182e-02 -8.290 < 2e-16 ***
## BLUEBOOK -2.067e-05 5.258e-06 -3.931 8.47e-05 ***
## TIF -3.256e-01 4.141e-02 -7.862 3.78e-15 ***
## CAR_TYPEPanel_Truck 5.691e-01 1.614e-01 3.526 0.000421 ***
## CAR_TYPEPickup 5.600e-01 1.008e-01 5.558 2.72e-08 ***
## CAR_TYPESports_Car 1.030e+00 1.298e-01 7.930 2.19e-15 ***
## CAR_TYPESUV 7.801e-01 1.112e-01 7.017 2.27e-12 ***
## CAR_TYPEVan 6.109e-01 1.266e-01 4.826 1.39e-06 ***
## RED_CARyes -1.620e-02 8.655e-02 -0.187 0.851499
## OLDCLAIM 6.589e-03 1.698e-02 0.388 0.698046
## CLM_FREQ 3.207e-01 1.278e-01 2.509 0.012097 *
## REVOKEDYes 7.241e-01 8.185e-02 8.847 < 2e-16 ***
## MVR_PTS 2.789e-01 4.207e-02 6.628 3.40e-11 ***
## CAR_AGE -1.822e-03 7.531e-03 -0.242 0.808846
## URBANICITYHighly_Urban/ Urban 2.365e+00 1.130e-01 20.936 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9404.0 on 8154 degrees of freedom
## Residual deviance: 7302.1 on 8117 degrees of freedom
## (6 observations deleted due to missingness)
## AIC: 7378.1
##
## Number of Fisher Scoring iterations: 5
For the second model, I will drop the predictors that were not significant at the .05 level in model 1 (AGE, YOJ, SEX, RED_CAR, OLDCLAIM, and CAR_AGE), keeping the remaining variables, including the marginal HOMEKIDS.
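The same list can be produced programmatically from the model 1 summary; a quick sketch (note it lists individual coefficient names, so multi-level factors appear once per level):
#coefficient rows of m1 with p-values below .05
coefs <- summary(m1)$coefficients
rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.05]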
#create new df
m2_data <- m1_data %>%
  select(TARGET_FLAG, TARGET_AMT, KIDSDRIV, HOMEKIDS, INCOME, PARENT1,
         HOME_VAL, MSTATUS, EDUCATION, JOB, TRAVTIME, CAR_USE, BLUEBOOK,
         TIF, CAR_TYPE, CLM_FREQ, REVOKED, MVR_PTS, URBANICITY)
#create model
m2 <- glm(formula = TARGET_FLAG ~ . - TARGET_AMT, data = m2_data, family = binomial(link = "logit"))
summary(m2)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = m2_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5310 -0.7201 -0.3974 0.6534 3.1469
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.776e+00 2.817e-01 -9.852 < 2e-16 ***
## KIDSDRIV 6.869e-01 1.079e-01 6.368 1.91e-10 ***
## HOMEKIDS 1.287e-01 7.475e-02 1.721 0.085202 .
## INCOME -3.627e-06 1.072e-06 -3.382 0.000719 ***
## PARENT1Yes 3.373e-01 1.141e-01 2.956 0.003117 **
## HOME_VAL -1.331e-06 3.409e-07 -3.904 9.45e-05 ***
## MSTATUSYes -5.249e-01 8.459e-02 -6.205 5.47e-10 ***
## EDUCATIONBachelors -3.782e-01 1.087e-01 -3.480 0.000502 ***
## EDUCATIONHigh_School 2.009e-02 9.452e-02 0.213 0.831693
## EDUCATIONMasters -2.943e-01 1.614e-01 -1.824 0.068190 .
## EDUCATIONPhD -1.615e-01 1.999e-01 -0.808 0.419125
## JOBBlue_Collar 3.068e-01 1.852e-01 1.656 0.097685 .
## JOBClerical 3.940e-01 1.962e-01 2.008 0.044688 *
## JOBDoctor -4.161e-01 2.659e-01 -1.565 0.117515
## JOBHome_Maker 2.569e-01 2.038e-01 1.260 0.207562
## JOBLawyer 1.168e-01 1.691e-01 0.691 0.489550
## JOBManager -5.617e-01 1.711e-01 -3.283 0.001028 **
## JOBProfessional 1.654e-01 1.781e-01 0.928 0.353296
## JOBStudent 2.567e-01 2.106e-01 1.219 0.222878
## TRAVTIME 1.479e-02 1.878e-03 7.877 3.36e-15 ***
## CAR_USEPrivate -7.636e-01 9.160e-02 -8.337 < 2e-16 ***
## BLUEBOOK -2.297e-05 4.717e-06 -4.869 1.12e-06 ***
## TIF -3.268e-01 4.137e-02 -7.899 2.81e-15 ***
## CAR_TYPEPanel_Truck 6.234e-01 1.506e-01 4.139 3.49e-05 ***
## CAR_TYPEPickup 5.556e-01 1.006e-01 5.524 3.31e-08 ***
## CAR_TYPESports_Car 9.732e-01 1.074e-01 9.063 < 2e-16 ***
## CAR_TYPESUV 7.186e-01 8.589e-02 8.366 < 2e-16 ***
## CAR_TYPEVan 6.485e-01 1.220e-01 5.315 1.07e-07 ***
## CLM_FREQ 3.619e-01 5.467e-02 6.620 3.60e-11 ***
## REVOKEDYes 7.313e-01 8.026e-02 9.111 < 2e-16 ***
## MVR_PTS 2.851e-01 4.139e-02 6.888 5.66e-12 ***
## URBANICITYHighly_Urban/ Urban 2.371e+00 1.128e-01 21.023 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7311.8 on 8129 degrees of freedom
## AIC: 7375.8
##
## Number of Fisher Scoring iterations: 5
I will repeat this step, this time also removing HOMEKIDS and JOB.
#create new df
m3_data <- m2_data %>%
  select(TARGET_FLAG, TARGET_AMT, KIDSDRIV, INCOME, PARENT1, HOME_VAL,
         MSTATUS, EDUCATION, TRAVTIME, CAR_USE, BLUEBOOK, TIF, CAR_TYPE,
         CLM_FREQ, REVOKED, MVR_PTS, URBANICITY)
#create model
m3 <- glm(formula = TARGET_FLAG ~ . - TARGET_AMT, data = m3_data, family = binomial(link = "logit"))
summary(m3)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = m3_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5307 -0.7293 -0.4108 0.6638 3.1409
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.297e+00 1.959e-01 -11.728 < 2e-16 ***
## KIDSDRIV 7.570e-01 9.728e-02 7.782 7.11e-15 ***
## INCOME -4.582e-06 9.750e-07 -4.699 2.61e-06 ***
## PARENT1Yes 4.509e-01 9.367e-02 4.814 1.48e-06 ***
## HOME_VAL -1.345e-06 3.298e-07 -4.078 4.54e-05 ***
## MSTATUSYes -4.706e-01 7.868e-02 -5.982 2.21e-09 ***
## EDUCATIONBachelors -5.776e-01 9.763e-02 -5.916 3.29e-09 ***
## EDUCATIONHigh_School -7.096e-02 9.149e-02 -0.776 0.437959
## EDUCATIONMasters -5.842e-01 1.092e-01 -5.348 8.92e-08 ***
## EDUCATIONPhD -5.917e-01 1.475e-01 -4.010 6.07e-05 ***
## TRAVTIME 1.526e-02 1.866e-03 8.174 2.99e-16 ***
## CAR_USEPrivate -8.579e-01 7.319e-02 -11.721 < 2e-16 ***
## BLUEBOOK -2.372e-05 4.683e-06 -5.065 4.08e-07 ***
## TIF -3.209e-01 4.107e-02 -7.812 5.63e-15 ***
## CAR_TYPEPanel_Truck 5.370e-01 1.422e-01 3.777 0.000158 ***
## CAR_TYPEPickup 4.942e-01 9.796e-02 5.046 4.52e-07 ***
## CAR_TYPESports_Car 9.566e-01 1.058e-01 9.042 < 2e-16 ***
## CAR_TYPESUV 7.195e-01 8.481e-02 8.484 < 2e-16 ***
## CAR_TYPEVan 5.992e-01 1.195e-01 5.015 5.30e-07 ***
## CLM_FREQ 3.585e-01 5.435e-02 6.597 4.20e-11 ***
## REVOKEDYes 7.431e-01 7.969e-02 9.326 < 2e-16 ***
## MVR_PTS 2.986e-01 4.119e-02 7.250 4.15e-13 ***
## URBANICITYHighly_Urban/ Urban 2.314e+00 1.125e-01 20.566 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7378.4 on 8138 degrees of freedom
## AIC: 7424.4
##
## Number of Fisher Scoring iterations: 5
This last model increased both the AIC (7424.4 vs. 7375.8) and the residual deviance, so I will proceed with model 2.
SELECT MODEL
I am selecting model 2 because it has the lowest AIC of the three models (7375.8, vs. 7378.1 for model 1 and 7424.4 for model 3), suggesting the best trade-off between fit and complexity.
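The AIC ranking can be confirmed directly:
#AIC() accepts several fitted models at once
AIC(m1, m2, m3)
I will also check the selected model's discrimination with an ROC curve.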
itrain$predict <- predict(m2, m2_data, type = 'response')
iroc <- roc(itrain$TARGET_FLAG, itrain$predict, plot = T, asp = NA,
            legacy.axes = T, main = "ROC Curve", col = "blue")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
iroc["auc"]
## $auc
## Area under the curve: 0.8131
The area under the curve is 0.8131, above the .8 threshold, so I will proceed with this model for the prediction on the evaluation data.
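As a further check, a confusion matrix at the same 0.5 cutoff that will be used for the final classification (a quick sketch):
#cross-tabulate predicted class vs. actual TARGET_FLAG
pred_class <- ifelse(itrain$predict >= 0.5, 1, 0)
table(Predicted = pred_class, Actual = itrain$TARGET_FLAG)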
Before predicting, I will apply the same cleaning steps to the evaluation data.
ieval <- as_tibble(ieval) %>%
mutate_at(c("INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM"),
money) %>%
mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
underscore) %>%
mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
as.factor)
ieval$TARGET_AMT <- as.factor(ieval$TARGET_AMT)
#strip the leading "z_" prefix, as was done for the training set
ieval$MSTATUS    <- as.factor(str_remove(ieval$MSTATUS, "^z_"))
ieval$SEX        <- as.factor(str_remove(ieval$SEX, "^z_"))
ieval$EDUCATION  <- as.factor(str_remove(ieval$EDUCATION, "^z_"))
ieval$JOB        <- as.factor(str_remove(ieval$JOB, "^z_"))
ieval$CAR_TYPE   <- as.factor(str_remove(ieval$CAR_TYPE, "^z_"))
ieval$URBANICITY <- as.factor(str_remove(ieval$URBANICITY, "^z_"))
#final data prep
ieval %>%
droplevels() %>%
as.data.frame() -> ieval
#predict probabilities for the evaluation data, then classify at a 0.5 cutoff
final_Pred <- predict(m2, newdata = ieval, type = "response")
final_Pred <- ifelse(final_Pred < .5, 0, 1)
hist(final_Pred)
The histogram above shows the distribution of the predicted classes (0 = no crash predicted, 1 = crash predicted).
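Finally, the predictions could be written out for submission; a sketch (the output file name is hypothetical):
#save class predictions keyed by the evaluation INDEX (hypothetical file name)
write.csv(data.frame(INDEX = ieval$INDEX, TARGET_FLAG = final_Pred),
          "insurance_predictions.csv", row.names = FALSE)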