First, I will import the training and evaluation data.
itrain <- read.csv("insurance_training_data.csv")
ieval <- read.csv("insurance-evaluation-data.csv")
DATA EXPLORATION
First, I will look at the first few rows to get a sense of the data and see what type of variable each column contains.
head(itrain)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1 1 0 0 0 60 0 11 $67,349 No
## 2 2 0 0 0 43 0 11 $91,449 No
## 3 4 0 0 0 35 1 10 $16,039 No
## 4 5 0 0 0 51 0 14 No
## 5 6 0 0 0 50 0 NA $114,986 No
## 6 7 1 2946 0 34 1 12 $125,301 Yes
## HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE
## 1 $0 z_No M PhD Professional 14 Private
## 2 $257,252 z_No M z_High School z_Blue Collar 22 Commercial
## 3 $124,191 Yes z_F z_High School Clerical 5 Private
## 4 $306,251 Yes M <High School z_Blue Collar 32 Private
## 5 $243,925 Yes z_F PhD Doctor 36 Private
## 6 $0 z_No z_F Bachelors z_Blue Collar 46 Commercial
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1 $14,230 11 Minivan yes $4,461 2 No 3
## 2 $14,940 1 Minivan yes $0 0 No 0
## 3 $4,010 4 z_SUV no $38,690 2 No 3
## 4 $15,440 7 Minivan yes $0 0 No 0
## 5 $18,000 1 z_SUV no $19,217 2 Yes 3
## 6 $17,430 1 Sports Car no $0 0 No 0
## CAR_AGE URBANICITY
## 1 18 Highly Urban/ Urban
## 2 1 Highly Urban/ Urban
## 3 10 Highly Urban/ Urban
## 4 6 Highly Urban/ Urban
## 5 17 Highly Urban/ Urban
## 6 7 Highly Urban/ Urban
I will use box plots of each variable to identify outliers.
boxplot(itrain)
As we can see, TARGET_AMT has many large outliers that dominate the scale. I will drop INDEX, TARGET_FLAG, and TARGET_AMT and look again at the box plots of the remaining variables.
#drop INDEX, TARGET_FLAG, and TARGET_AMT (columns 1-3)
boxplot_itrain <- itrain[, -(1:3)]
boxplot(boxplot_itrain)
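Note that at this stage the dollar-valued columns have not yet been converted to numbers, so restricting the plot to the columns that are already numeric keeps the comparison meaningful; a small sketch:
#plot only the columns that are already numeric
boxplot(Filter(is.numeric, boxplot_itrain), las = 2)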
DATA PREPARATION
First, I will clean up the data: strip the dollar signs and commas from the currency fields, and replace spaces in the category labels with underscores.
#convert currency strings such as "$67,349" to numbers
money <- function(input) {
  out <- sub("\\$", "", input)
  out <- as.numeric(gsub(",", "", out))  #gsub removes every comma, not just the first
  return(out)
}
#replace the first space in a label with an underscore
#(later spaces, e.g. in the URBANICITY labels, are left as-is)
underscore <- function(input) {
  out <- sub(" ", "_", input)
  return(out)
}
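As a quick sanity check of the two helpers, using values visible in the head() output above:
money("$67,349")             # 67349
underscore("z_Blue Collar")  # "z_Blue_Collar"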
itrain <- as_tibble(itrain) %>%
mutate_at(c("INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM"),
money) %>%
mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
underscore) %>%
mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
as.factor) %>%
mutate(TARGET_FLAG = as.factor(TARGET_FLAG))
itrain$TARGET_AMT <- as.factor(itrain$TARGET_AMT)
#strip the leading "z_" prefix from the labelled factor levels
itrain$MSTATUS    <- as.factor(str_remove(itrain$MSTATUS, "^z_"))
itrain$SEX        <- as.factor(str_remove(itrain$SEX, "^z_"))
itrain$EDUCATION  <- as.factor(str_remove(itrain$EDUCATION, "^z_"))
itrain$JOB        <- as.factor(str_remove(itrain$JOB, "^z_"))
itrain$CAR_TYPE   <- as.factor(str_remove(itrain$CAR_TYPE, "^z_"))
itrain$URBANICITY <- as.factor(str_remove(itrain$URBANICITY, "^z_"))
Next, I will check for missing values and replace them with the mean of each variable.
#check for missing data
sapply(itrain, function(x) sum(is.na(x)))
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS
## 0 0 0 0 6 0
## YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX
## 454 445 0 464 0 0
## EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 0 0 0 0 0 0
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 0 0 0 0 0 0
## CAR_AGE URBANICITY
## 510 0
We see that YOJ, INCOME, HOME_VAL, and CAR_AGE have missing values. I will impute each with the mean of the observed values for that variable.
itrain$YOJ[is.na(itrain$YOJ)] <- mean(itrain$YOJ, na.rm = TRUE)
itrain$INCOME[is.na(itrain$INCOME)] <- mean(itrain$INCOME, na.rm = TRUE)
itrain$HOME_VAL[is.na(itrain$HOME_VAL)] <- mean(itrain$HOME_VAL, na.rm = TRUE)
itrain$CAR_AGE[is.na(itrain$CAR_AGE)] <- mean(itrain$CAR_AGE, na.rm = TRUE)
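Equivalently, the four imputations can be done in one pass; a sketch:
#impute every listed column's NAs with that column's mean
na_cols <- c("YOJ", "INCOME", "HOME_VAL", "CAR_AGE")
itrain[na_cols] <- lapply(itrain[na_cols],
                          function(x) replace(x, is.na(x), mean(x, na.rm = TRUE)))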
I will then drop any unused factor levels and convert back to a plain data frame.
itrain %>%
droplevels() %>%
as.data.frame() -> itrain
I will then log-transform the right-skewed count and claim variables, using log(x + 1) to handle the zeros.
m1_data <- itrain
#transform data using log for skewed variables
m1_data$HOMEKIDS <- log(m1_data$HOMEKIDS+1)
m1_data$MVR_PTS <- log(m1_data$MVR_PTS+1)
m1_data$OLDCLAIM <- log(m1_data$OLDCLAIM+1)
m1_data$TIF <- log(m1_data$TIF+1)
m1_data$KIDSDRIV <- log(m1_data$KIDSDRIV+1)
m1_data$CLM_FREQ <- log(m1_data$CLM_FREQ+1)
m1_data <- m1_data[, -1]  #drop the INDEX column
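To confirm that these variables were in fact right-skewed and that the transform helped, one could compare sample skewness before and after; a sketch assuming the e1071 package is available:
library(e1071)  #assumed available; provides skewness()
skewed <- c("KIDSDRIV", "HOMEKIDS", "TIF", "OLDCLAIM", "CLM_FREQ", "MVR_PTS")
sapply(itrain[skewed], skewness)   #before the log(x + 1) transform
sapply(m1_data[skewed], skewness)  #after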
BUILD MODELS
First, I will fit a binomial logistic regression on all predictors (excluding TARGET_AMT).
m1 <- glm(formula = TARGET_FLAG ~ . - TARGET_AMT, data = m1_data, family = binomial(link = "logit"))
summary(m1)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = m1_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5272 -0.7179 -0.3987 0.6466 3.1460
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.723e+00 3.468e-01 -7.851 4.12e-15 ***
## KIDSDRIV 6.918e-01 1.104e-01 6.268 3.66e-10 ***
## AGE -3.904e-04 4.084e-03 -0.096 0.923845
## HOMEKIDS 1.419e-01 8.315e-02 1.707 0.087900 .
## YOJ -1.296e-02 8.592e-03 -1.508 0.131536
## INCOME -3.465e-06 1.077e-06 -3.218 0.001289 **
## PARENT1Yes 3.245e-01 1.145e-01 2.835 0.004588 **
## HOME_VAL -1.327e-06 3.420e-07 -3.880 0.000105 ***
## MSTATUSYes -5.132e-01 8.494e-02 -6.042 1.52e-09 ***
## SEXM 9.408e-02 1.121e-01 0.840 0.401143
## EDUCATIONBachelors -3.723e-01 1.156e-01 -3.221 0.001276 **
## EDUCATIONHigh_School 2.257e-02 9.500e-02 0.238 0.812228
## EDUCATIONMasters -2.776e-01 1.786e-01 -1.554 0.120219
## EDUCATIONPhD -1.455e-01 2.136e-01 -0.681 0.495817
## JOBBlue_Collar 3.112e-01 1.853e-01 1.679 0.093131 .
## JOBClerical 3.984e-01 1.964e-01 2.029 0.042445 *
## JOBDoctor -4.225e-01 2.662e-01 -1.587 0.112475
## JOBHome_Maker 1.993e-01 2.099e-01 0.949 0.342556
## JOBLawyer 1.182e-01 1.693e-01 0.698 0.484943
## JOBManager -5.594e-01 1.712e-01 -3.268 0.001085 **
## JOBProfessional 1.698e-01 1.782e-01 0.953 0.340586
## JOBStudent 1.988e-01 2.141e-01 0.929 0.353126
## TRAVTIME 1.489e-02 1.881e-03 7.915 2.46e-15 ***
## CAR_USEPrivate -7.611e-01 9.182e-02 -8.290 < 2e-16 ***
## BLUEBOOK -2.067e-05 5.258e-06 -3.931 8.47e-05 ***
## TIF -3.256e-01 4.141e-02 -7.862 3.78e-15 ***
## CAR_TYPEPanel_Truck 5.691e-01 1.614e-01 3.526 0.000421 ***
## CAR_TYPEPickup 5.600e-01 1.008e-01 5.558 2.72e-08 ***
## CAR_TYPESports_Car 1.030e+00 1.298e-01 7.930 2.19e-15 ***
## CAR_TYPESUV 7.801e-01 1.112e-01 7.017 2.27e-12 ***
## CAR_TYPEVan 6.109e-01 1.266e-01 4.826 1.39e-06 ***
## RED_CARyes -1.620e-02 8.655e-02 -0.187 0.851499
## OLDCLAIM 6.589e-03 1.698e-02 0.388 0.698046
## CLM_FREQ 3.207e-01 1.278e-01 2.509 0.012097 *
## REVOKEDYes 7.241e-01 8.185e-02 8.847 < 2e-16 ***
## MVR_PTS 2.789e-01 4.207e-02 6.628 3.40e-11 ***
## CAR_AGE -1.822e-03 7.531e-03 -0.242 0.808846
## URBANICITYHighly_Urban/ Urban 2.365e+00 1.130e-01 20.936 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9404.0 on 8154 degrees of freedom
## Residual deviance: 7302.1 on 8117 degrees of freedom
## (6 observations deleted due to missingness)
## AIC: 7378.1
##
## Number of Fisher Scoring iterations: 5
For the second model, I will drop the predictors that were not significant at the .05 level in model 1 (AGE, YOJ, SEX, RED_CAR, OLDCLAIM, and CAR_AGE), keeping the remaining variables, including the marginal HOMEKIDS.
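The same list can be produced programmatically from the model 1 summary; a quick sketch (note it lists individual coefficient names, so multi-level factors appear once per level):
#coefficient rows of m1 with p-values below .05
coefs <- summary(m1)$coefficients
rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.05]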
#create new df
m2_data <- m1_data %>%
  select(TARGET_FLAG, TARGET_AMT, KIDSDRIV, HOMEKIDS, INCOME, PARENT1,
         HOME_VAL, MSTATUS, EDUCATION, JOB, TRAVTIME, CAR_USE, BLUEBOOK,
         TIF, CAR_TYPE, CLM_FREQ, REVOKED, MVR_PTS, URBANICITY)
#create model
m2 <- glm(formula = TARGET_FLAG ~ . - TARGET_AMT, data = m2_data, family = binomial(link = "logit"))
summary(m2)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = m2_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5310 -0.7201 -0.3974 0.6534 3.1469
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.776e+00 2.817e-01 -9.852 < 2e-16 ***
## KIDSDRIV 6.869e-01 1.079e-01 6.368 1.91e-10 ***
## HOMEKIDS 1.287e-01 7.475e-02 1.721 0.085202 .
## INCOME -3.627e-06 1.072e-06 -3.382 0.000719 ***
## PARENT1Yes 3.373e-01 1.141e-01 2.956 0.003117 **
## HOME_VAL -1.331e-06 3.409e-07 -3.904 9.45e-05 ***
## MSTATUSYes -5.249e-01 8.459e-02 -6.205 5.47e-10 ***
## EDUCATIONBachelors -3.782e-01 1.087e-01 -3.480 0.000502 ***
## EDUCATIONHigh_School 2.009e-02 9.452e-02 0.213 0.831693
## EDUCATIONMasters -2.943e-01 1.614e-01 -1.824 0.068190 .
## EDUCATIONPhD -1.615e-01 1.999e-01 -0.808 0.419125
## JOBBlue_Collar 3.068e-01 1.852e-01 1.656 0.097685 .
## JOBClerical 3.940e-01 1.962e-01 2.008 0.044688 *
## JOBDoctor -4.161e-01 2.659e-01 -1.565 0.117515
## JOBHome_Maker 2.569e-01 2.038e-01 1.260 0.207562
## JOBLawyer 1.168e-01 1.691e-01 0.691 0.489550
## JOBManager -5.617e-01 1.711e-01 -3.283 0.001028 **
## JOBProfessional 1.654e-01 1.781e-01 0.928 0.353296
## JOBStudent 2.567e-01 2.106e-01 1.219 0.222878
## TRAVTIME 1.479e-02 1.878e-03 7.877 3.36e-15 ***
## CAR_USEPrivate -7.636e-01 9.160e-02 -8.337 < 2e-16 ***
## BLUEBOOK -2.297e-05 4.717e-06 -4.869 1.12e-06 ***
## TIF -3.268e-01 4.137e-02 -7.899 2.81e-15 ***
## CAR_TYPEPanel_Truck 6.234e-01 1.506e-01 4.139 3.49e-05 ***
## CAR_TYPEPickup 5.556e-01 1.006e-01 5.524 3.31e-08 ***
## CAR_TYPESports_Car 9.732e-01 1.074e-01 9.063 < 2e-16 ***
## CAR_TYPESUV 7.186e-01 8.589e-02 8.366 < 2e-16 ***
## CAR_TYPEVan 6.485e-01 1.220e-01 5.315 1.07e-07 ***
## CLM_FREQ 3.619e-01 5.467e-02 6.620 3.60e-11 ***
## REVOKEDYes 7.313e-01 8.026e-02 9.111 < 2e-16 ***
## MVR_PTS 2.851e-01 4.139e-02 6.888 5.66e-12 ***
## URBANICITYHighly_Urban/ Urban 2.371e+00 1.128e-01 21.023 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7311.8 on 8129 degrees of freedom
## AIC: 7375.8
##
## Number of Fisher Scoring iterations: 5
I will repeat this step, this time also removing HOMEKIDS and JOB.
#create new df
m3_data <- m2_data %>%
  select(TARGET_FLAG, TARGET_AMT, KIDSDRIV, INCOME, PARENT1, HOME_VAL,
         MSTATUS, EDUCATION, TRAVTIME, CAR_USE, BLUEBOOK, TIF, CAR_TYPE,
         CLM_FREQ, REVOKED, MVR_PTS, URBANICITY)
#create model
m3 <- glm(formula = TARGET_FLAG ~ . - TARGET_AMT, data = m3_data, family = binomial(link = "logit"))
summary(m3)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = m3_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5307 -0.7293 -0.4108 0.6638 3.1409
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.297e+00 1.959e-01 -11.728 < 2e-16 ***
## KIDSDRIV 7.570e-01 9.728e-02 7.782 7.11e-15 ***
## INCOME -4.582e-06 9.750e-07 -4.699 2.61e-06 ***
## PARENT1Yes 4.509e-01 9.367e-02 4.814 1.48e-06 ***
## HOME_VAL -1.345e-06 3.298e-07 -4.078 4.54e-05 ***
## MSTATUSYes -4.706e-01 7.868e-02 -5.982 2.21e-09 ***
## EDUCATIONBachelors -5.776e-01 9.763e-02 -5.916 3.29e-09 ***
## EDUCATIONHigh_School -7.096e-02 9.149e-02 -0.776 0.437959
## EDUCATIONMasters -5.842e-01 1.092e-01 -5.348 8.92e-08 ***
## EDUCATIONPhD -5.917e-01 1.475e-01 -4.010 6.07e-05 ***
## TRAVTIME 1.526e-02 1.866e-03 8.174 2.99e-16 ***
## CAR_USEPrivate -8.579e-01 7.319e-02 -11.721 < 2e-16 ***
## BLUEBOOK -2.372e-05 4.683e-06 -5.065 4.08e-07 ***
## TIF -3.209e-01 4.107e-02 -7.812 5.63e-15 ***
## CAR_TYPEPanel_Truck 5.370e-01 1.422e-01 3.777 0.000158 ***
## CAR_TYPEPickup 4.942e-01 9.796e-02 5.046 4.52e-07 ***
## CAR_TYPESports_Car 9.566e-01 1.058e-01 9.042 < 2e-16 ***
## CAR_TYPESUV 7.195e-01 8.481e-02 8.484 < 2e-16 ***
## CAR_TYPEVan 5.992e-01 1.195e-01 5.015 5.30e-07 ***
## CLM_FREQ 3.585e-01 5.435e-02 6.597 4.20e-11 ***
## REVOKEDYes 7.431e-01 7.969e-02 9.326 < 2e-16 ***
## MVR_PTS 2.986e-01 4.119e-02 7.250 4.15e-13 ***
## URBANICITYHighly_Urban/ Urban 2.314e+00 1.125e-01 20.566 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7378.4 on 8138 degrees of freedom
## AIC: 7424.4
##
## Number of Fisher Scoring iterations: 5
This last model increased both the AIC (7424.4 vs. 7375.8) and the residual deviance, so I will proceed with model 2.
SELECT MODEL
I am selecting model 2 because it has the lowest AIC of the three models (7375.8, vs. 7378.1 for model 1 and 7424.4 for model 3), suggesting the best trade-off between fit and complexity.
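The AIC ranking can be confirmed directly:
#AIC() accepts several fitted models at once
AIC(m1, m2, m3)
I will also check the selected model's discrimination with an ROC curve.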
itrain$predict <- predict(m2, m2_data, type = 'response')
iroc <- roc(itrain$TARGET_FLAG, itrain$predict, plot = T, asp = NA,
            legacy.axes = T, main = "ROC Curve", col = "blue")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
iroc["auc"]
## $auc
## Area under the curve: 0.8131
The area under the curve is 0.8131, above the .8 threshold, so I will proceed with this model for the prediction on the evaluation data.
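As a further check, a confusion matrix at the same 0.5 cutoff that will be used for the final classification (a quick sketch):
#cross-tabulate predicted class vs. actual TARGET_FLAG
pred_class <- ifelse(itrain$predict >= 0.5, 1, 0)
table(Predicted = pred_class, Actual = itrain$TARGET_FLAG)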
Before predicting, I will apply the same cleaning steps to the evaluation data.
ieval <- as_tibble(ieval) %>%
mutate_at(c("INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM"),
money) %>%
mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
underscore) %>%
mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
as.factor)
ieval$TARGET_AMT <- as.factor(ieval$TARGET_AMT)
#strip the leading "z_" prefix, as was done for the training set
ieval$MSTATUS    <- as.factor(str_remove(ieval$MSTATUS, "^z_"))
ieval$SEX        <- as.factor(str_remove(ieval$SEX, "^z_"))
ieval$EDUCATION  <- as.factor(str_remove(ieval$EDUCATION, "^z_"))
ieval$JOB        <- as.factor(str_remove(ieval$JOB, "^z_"))
ieval$CAR_TYPE   <- as.factor(str_remove(ieval$CAR_TYPE, "^z_"))
ieval$URBANICITY <- as.factor(str_remove(ieval$URBANICITY, "^z_"))
#final data prep
ieval %>%
droplevels() %>%
as.data.frame() -> ieval
#predict probabilities for the evaluation data, then classify at a 0.5 cutoff
final_Pred <- predict(m2, newdata = ieval, type = "response")
final_Pred <- ifelse(final_Pred < .5, 0, 1)
hist(final_Pred)
The histogram above shows the distribution of the predicted classes (0 = no crash predicted, 1 = crash predicted).
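Finally, the predictions could be written out for submission; a sketch (the output file name is hypothetical):
#save class predictions keyed by the evaluation INDEX (hypothetical file name)
write.csv(data.frame(INDEX = ieval$INDEX, TARGET_FLAG = final_Pred),
          "insurance_predictions.csv", row.names = FALSE)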