Section 1 - Data Exploration

The training dataset contains 8161 rows and 26 columns. The data types are of integer, numeric and character. The column values are mixed of continuous, discrete, categorical, and binary. There are 23 predictors, and 2 response variables related to a customer at an auto insurance company. The training dataset contains about 27% classifications of a person in a car crash and the corresponding amount of money it costs.

A summary and boxplots of all variables suggest some columns have incorrect formats, some have missing data, and some may contain outliers.

library(ggcorrplot)
library(car)
library(MASS)
library(dplyr)
library(ggplot2)
library(caret)
library(pROC)
library(pscl)
library(psych)
library(data.table) 

# Read the training data
train_df <- read.csv("https://raw.githubusercontent.com/L-Velasco/DATA621_FA18/master/HW4/insurance_training_data.csv", stringsAsFactors = FALSE)

1.1 Data Dimension and Structure

dim(train_df)
## [1] 8161   26
str(train_df)
## 'data.frame':    8161 obs. of  26 variables:
##  $ INDEX      : int  1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET_FLAG: int  0 0 0 0 0 1 0 1 1 0 ...
##  $ TARGET_AMT : num  0 0 0 0 0 ...
##  $ KIDSDRIV   : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ AGE        : int  60 43 35 51 50 34 54 37 34 50 ...
##  $ HOMEKIDS   : int  0 0 1 0 0 1 0 2 0 0 ...
##  $ YOJ        : int  11 11 10 14 NA 12 NA NA 10 7 ...
##  $ INCOME     : chr  "$67,349" "$91,449" "$16,039" "" ...
##  $ PARENT1    : chr  "No" "No" "No" "No" ...
##  $ HOME_VAL   : chr  "$0" "$257,252" "$124,191" "$306,251" ...
##  $ MSTATUS    : chr  "z_No" "z_No" "Yes" "Yes" ...
##  $ SEX        : chr  "M" "M" "z_F" "M" ...
##  $ EDUCATION  : chr  "PhD" "z_High School" "z_High School" "<High School" ...
##  $ JOB        : chr  "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
##  $ TRAVTIME   : int  14 22 5 32 36 46 33 44 34 48 ...
##  $ CAR_USE    : chr  "Private" "Commercial" "Private" "Private" ...
##  $ BLUEBOOK   : chr  "$14,230" "$14,940" "$4,010" "$15,440" ...
##  $ TIF        : int  11 1 4 7 1 1 1 1 1 7 ...
##  $ CAR_TYPE   : chr  "Minivan" "Minivan" "z_SUV" "Minivan" ...
##  $ RED_CAR    : chr  "yes" "yes" "no" "yes" ...
##  $ OLDCLAIM   : chr  "$4,461" "$0" "$38,690" "$0" ...
##  $ CLM_FREQ   : int  2 0 2 0 2 0 0 1 0 0 ...
##  $ REVOKED    : chr  "No" "No" "No" "No" ...
##  $ MVR_PTS    : int  3 0 3 0 3 0 0 10 0 1 ...
##  $ CAR_AGE    : int  18 1 10 6 17 7 1 7 1 17 ...
##  $ URBANICITY : chr  "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...

Displaying first few rows of the dataset

head(train_df)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ   INCOME PARENT1
## 1     1           0          0        0  60        0  11  $67,349      No
## 2     2           0          0        0  43        0  11  $91,449      No
## 3     4           0          0        0  35        1  10  $16,039      No
## 4     5           0          0        0  51        0  14               No
## 5     6           0          0        0  50        0  NA $114,986      No
## 6     7           1       2946        0  34        1  12 $125,301     Yes
##   HOME_VAL MSTATUS SEX     EDUCATION           JOB TRAVTIME    CAR_USE
## 1       $0    z_No   M           PhD  Professional       14    Private
## 2 $257,252    z_No   M z_High School z_Blue Collar       22 Commercial
## 3 $124,191     Yes z_F z_High School      Clerical        5    Private
## 4 $306,251     Yes   M  <High School z_Blue Collar       32    Private
## 5 $243,925     Yes z_F           PhD        Doctor       36    Private
## 6       $0    z_No z_F     Bachelors z_Blue Collar       46 Commercial
##   BLUEBOOK TIF   CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1  $14,230  11    Minivan     yes   $4,461        2      No       3
## 2  $14,940   1    Minivan     yes       $0        0      No       0
## 3   $4,010   4      z_SUV      no  $38,690        2      No       3
## 4  $15,440   7    Minivan     yes       $0        0      No       0
## 5  $18,000   1      z_SUV      no  $19,217        2     Yes       3
## 6  $17,430   1 Sports Car      no       $0        0      No       0
##   CAR_AGE          URBANICITY
## 1      18 Highly Urban/ Urban
## 2       1 Highly Urban/ Urban
## 3      10 Highly Urban/ Urban
## 4       6 Highly Urban/ Urban
## 5      17 Highly Urban/ Urban
## 6       7 Highly Urban/ Urban

1.2 Initial Data Conversion and Cleanup

Some columns need to be removed, converted, and cleaned prior to providing some meaningful description of the data.

# Exclude the INDEX column
tr <- train_df[-1]

#  Convert to numeric
tr$INCOME <- as.numeric(gsub('[$,]', '', tr$INCOME))
tr$HOME_VAL <- as.numeric(gsub('[$,]', '', tr$HOME_VAL))
tr$BLUEBOOK <- as.numeric(gsub('[$,]', '', tr$BLUEBOOK))
tr$OLDCLAIM <- as.numeric(gsub('[$,]', '', tr$OLDCLAIM))

# Remove irrelevant characters
tr$MSTATUS  <- gsub("z_", "", tr$MSTATUS)
tr$SEX  <- gsub("z_", "", tr$SEX)
tr$EDUCATION  <- gsub("z_", "", tr$EDUCATION)
tr$JOB  <- gsub("z_", "", tr$JOB)
tr$CAR_USE  <- gsub("z_", "", tr$CAR_USE)
tr$CAR_TYPE  <- gsub("z_", "", tr$CAR_TYPE)
tr$URBANICITY  <- gsub("z_", "", tr$URBANICITY)


# Reorder columns -- predictor categorical, predictor numeric, target

indx <- c(8, 10:13, 15, 18:19, 22, 25, 3:7, 9, 14, 16:17, 20:21, 23:24, 1:2)
tr_ordered <- tr
setcolorder(tr_ordered,indx)
## categorical 1:10 / numeric 11:23 / response 24:25

1.3 Summary of Categorical Columns and Relationship with TARGET_FLAG

table(tr$PARENT1)
## 
##   No  Yes 
## 7084 1077
table(tr$MSTATUS)
## 
##   No  Yes 
## 3267 4894
table(tr$SEX)
## 
##    F    M 
## 4375 3786
table(tr$EDUCATION)
## 
## <High School    Bachelors  High School      Masters          PhD 
##         1203         2242         2330         1658          728
table(tr$JOB)
## 
##               Blue Collar     Clerical       Doctor   Home Maker 
##          526         1825         1271          246          641 
##       Lawyer      Manager Professional      Student 
##          835          988         1117          712
table(tr$CAR_USE)
## 
## Commercial    Private 
##       3029       5132
table(tr$CAR_TYPE)
## 
##     Minivan Panel Truck      Pickup  Sports Car         SUV         Van 
##        2145         676        1389         907        2294         750
table(tr$RED_CAR)
## 
##   no  yes 
## 5783 2378
table(tr$REVOKED)
## 
##   No  Yes 
## 7161 1000
table(tr$URBANICITY)
## 
## Highly Rural/ Rural Highly Urban/ Urban 
##                1669                6492

1.4 Summary of Numeric Columns and Relationship with TARGET_FLAG

##     KIDSDRIV           AGE           HOMEKIDS           YOJ      
##  Min.   :0.0000   Min.   :16.00   Min.   :0.0000   Min.   : 0.0  
##  1st Qu.:0.0000   1st Qu.:39.00   1st Qu.:0.0000   1st Qu.: 9.0  
##  Median :0.0000   Median :45.00   Median :0.0000   Median :11.0  
##  Mean   :0.1711   Mean   :44.79   Mean   :0.7212   Mean   :10.5  
##  3rd Qu.:0.0000   3rd Qu.:51.00   3rd Qu.:1.0000   3rd Qu.:13.0  
##  Max.   :4.0000   Max.   :81.00   Max.   :5.0000   Max.   :23.0  
##                   NA's   :6                        NA's   :454   
##      INCOME          HOME_VAL         TRAVTIME         BLUEBOOK    
##  Min.   :     0   Min.   :     0   Min.   :  5.00   Min.   : 1500  
##  1st Qu.: 28097   1st Qu.:     0   1st Qu.: 22.00   1st Qu.: 9280  
##  Median : 54028   Median :161160   Median : 33.00   Median :14440  
##  Mean   : 61898   Mean   :154867   Mean   : 33.49   Mean   :15710  
##  3rd Qu.: 85986   3rd Qu.:238724   3rd Qu.: 44.00   3rd Qu.:20850  
##  Max.   :367030   Max.   :885282   Max.   :142.00   Max.   :69740  
##  NA's   :445      NA's   :464                                      
##       TIF            OLDCLAIM        CLM_FREQ         MVR_PTS      
##  Min.   : 1.000   Min.   :    0   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 1.000   1st Qu.:    0   1st Qu.:0.0000   1st Qu.: 0.000  
##  Median : 4.000   Median :    0   Median :0.0000   Median : 1.000  
##  Mean   : 5.351   Mean   : 4037   Mean   :0.7986   Mean   : 1.696  
##  3rd Qu.: 7.000   3rd Qu.: 4636   3rd Qu.:2.0000   3rd Qu.: 3.000  
##  Max.   :25.000   Max.   :57037   Max.   :5.0000   Max.   :13.000  
##                                                                    
##     CAR_AGE      
##  Min.   :-3.000  
##  1st Qu.: 1.000  
##  Median : 8.000  
##  Mean   : 8.328  
##  3rd Qu.:12.000  
##  Max.   :28.000  
##  NA's   :510

Summary plot of numeric predictors against target flag response (0=Car was not in a crash, 1=Car was in a crash)

Section 2 - Data Preparation

To prepare the data for modeling, variables will be added, removed, imputed, and treated for outliers.

tr_prep <- tr_ordered

2.1 Impute missing data

Columns with Missing Data

M <- sapply(tr_prep, function(x) sum(x=="") | sum(is.na(x))); names(M[(M>0)])
## [1] "JOB"      "AGE"      "YOJ"      "INCOME"   "HOME_VAL" "CAR_AGE"

For the numeric variables with missing data below, they will be populated with the variable’s mean or median (if skewed).

#colnames(tr_prep)[colSums(is.na(tr_prep)) > 0]

x <- c(12, 14, 15, 16, 23)
par(mfrow=c(2,3))
for (val in x) {
  hist(tr_prep[,val],xlab=names(tr_prep[val]), main="")
}
par(mfrow=c(1,1))

For missing categorical data (JOB), it will be imputed based on the most common value based on Education.

#impute

tr_prep = tr_prep %>% 
  mutate(AGE = 
           ifelse(is.na(AGE), 
                  mean(AGE, na.rm=TRUE), AGE)) %>% 

  mutate(YOJ = 
           ifelse(is.na(YOJ), 
                  mean(YOJ, na.rm=TRUE), YOJ)) %>% 

  mutate(INCOME = 
           ifelse(is.na(INCOME), 
                  median(INCOME, na.rm=TRUE), INCOME)) %>% 

  mutate(HOME_VAL = 
           ifelse(is.na(HOME_VAL), 
                  mean(HOME_VAL, na.rm=TRUE), HOME_VAL)) %>% 

  mutate(CAR_AGE = 
           ifelse(is.na(CAR_AGE), 
                  mean(CAR_AGE, na.rm=TRUE), CAR_AGE)) %>% 

  mutate(JOB = 
           ifelse((JOB == "" & EDUCATION == 'PhD'),
                  "Doctor", JOB)) %>% 

  mutate(JOB = 
           ifelse((JOB == "" & EDUCATION == 'Masters'),
                  "Lawyer", JOB))

Check that no columns has missing data

M <- sapply(tr_prep, function(x) sum(x=="") | sum(is.na(x))); names(M[(M>0)])
## character(0)

2.2 Cap Outliers

Any outliers outside of lower 1.5IQR would be capped at 5th %ile, and observations above the upper 1.5IQR would be capped at 95th %ile.

# Outlier Capping

tr_prep2 <- tr_prep

id <- c(11:23)
for (val in id) {
  qnt <- quantile(tr_prep2[,val], probs=c(.25, .75), na.rm = T)
  caps <- quantile(tr_prep2[,val], probs=c(.05, .95), na.rm = T)
  H <- 1.5 * IQR(tr_prep2[,val], na.rm = T)
  tr_prep2[,val][tr_prep2[,val] < (qnt[1] - H)] <- caps[1]
  tr_prep2[,val][tr_prep2[,val] > (qnt[2] + H)] <- caps[2]

}

Section 3 - Build and Evaluate Models

Each model will have its logistics and multiple linear regression version.

For logistic regression, the training data set will be split such that 80% of the observations will be used to train the model and 20% will be used to test the model to derive performance metrics.

nTrain <- createDataPartition(tr_prep2$TARGET_FLAG, p=0.8, list=FALSE)
ntraining <- tr_prep2[nTrain,]
ntesting <- tr_prep2[-nTrain,]

set.seed(123)

3.1 Model with all predictor variables

# Logistic Regression build the model using training set
full.model_FLAG <- glm(TARGET_FLAG ~.-TARGET_AMT, data = ntraining , family = binomial)
summary(full.model_FLAG)
## 
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial, 
##     data = ntraining)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2073  -0.7169  -0.3970   0.6696   3.1598  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -2.532e+00  3.195e-01  -7.927 2.25e-15 ***
## PARENT1Yes                     2.931e-01  1.246e-01   2.353 0.018604 *  
## MSTATUSYes                    -4.692e-01  9.432e-02  -4.974 6.55e-07 ***
## SEXM                           9.600e-02  1.247e-01   0.770 0.441300    
## EDUCATIONBachelors            -3.992e-01  1.295e-01  -3.082 0.002056 ** 
## EDUCATIONHigh School           2.464e-02  1.052e-01   0.234 0.814770    
## EDUCATIONMasters              -3.847e-01  2.052e-01  -1.875 0.060780 .  
## EDUCATIONPhD                  -1.230e-01  2.576e-01  -0.477 0.633054    
## JOBClerical                    1.055e-01  1.186e-01   0.890 0.373675    
## JOBDoctor                     -7.330e-01  2.793e-01  -2.624 0.008682 ** 
## JOBHome Maker                 -1.202e-01  1.698e-01  -0.708 0.479005    
## JOBLawyer                     -1.293e-01  1.952e-01  -0.662 0.507894    
## JOBManager                    -7.869e-01  1.533e-01  -5.133 2.85e-07 ***
## JOBProfessional               -1.201e-01  1.341e-01  -0.895 0.370688    
## JOBStudent                    -1.050e-01  1.450e-01  -0.724 0.469077    
## CAR_USEPrivate                -7.674e-01  9.742e-02  -7.877 3.36e-15 ***
## CAR_TYPEPanel Truck            5.309e-01  1.759e-01   3.019 0.002539 ** 
## CAR_TYPEPickup                 5.769e-01  1.121e-01   5.146 2.67e-07 ***
## CAR_TYPESports Car             1.045e+00  1.446e-01   7.227 4.95e-13 ***
## CAR_TYPESUV                    7.752e-01  1.245e-01   6.224 4.86e-10 ***
## CAR_TYPEVan                    6.807e-01  1.400e-01   4.864 1.15e-06 ***
## RED_CARyes                    -2.540e-02  9.669e-02  -0.263 0.792779    
## REVOKEDYes                     9.430e-01  1.016e-01   9.284  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.390e+00  1.252e-01  19.094  < 2e-16 ***
## KIDSDRIV                       6.144e-01  1.091e-01   5.629 1.81e-08 ***
## AGE                           -5.143e-03  4.529e-03  -1.136 0.256101    
## HOMEKIDS                       5.039e-02  4.463e-02   1.129 0.258872    
## YOJ                           -6.208e-03  9.527e-03  -0.652 0.514621    
## INCOME                        -4.383e-06  1.369e-06  -3.202 0.001364 ** 
## HOME_VAL                      -1.403e-06  3.877e-07  -3.619 0.000296 ***
## TRAVTIME                       1.573e-02  2.174e-03   7.238 4.54e-13 ***
## BLUEBOOK                      -1.626e-05  6.035e-06  -2.694 0.007062 ** 
## TIF                           -6.038e-02  8.589e-03  -7.030 2.07e-12 ***
## OLDCLAIM                      -1.761e-05  5.318e-06  -3.312 0.000928 ***
## CLM_FREQ                       2.193e-01  3.268e-02   6.712 1.92e-11 ***
## MVR_PTS                        1.116e-01  1.640e-02   6.807 9.94e-12 ***
## CAR_AGE                        2.768e-03  8.500e-03   0.326 0.744733    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7568.3  on 6528  degrees of freedom
## Residual deviance: 5859.8  on 6492  degrees of freedom
## AIC: 5933.8
## 
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(full.model_FLAG))),2)
##                               Estimate
## (Intercept)                       0.08
## PARENT1Yes                        1.34
## MSTATUSYes                        0.63
## SEXM                              1.10
## EDUCATIONBachelors                0.67
## EDUCATIONHigh School              1.02
## EDUCATIONMasters                  0.68
## EDUCATIONPhD                      0.88
## JOBClerical                       1.11
## JOBDoctor                         0.48
## JOBHome Maker                     0.89
## JOBLawyer                         0.88
## JOBManager                        0.46
## JOBProfessional                   0.89
## JOBStudent                        0.90
## CAR_USEPrivate                    0.46
## CAR_TYPEPanel Truck               1.70
## CAR_TYPEPickup                    1.78
## CAR_TYPESports Car                2.84
## CAR_TYPESUV                       2.17
## CAR_TYPEVan                       1.98
## RED_CARyes                        0.97
## REVOKEDYes                        2.57
## URBANICITYHighly Urban/ Urban    10.92
## KIDSDRIV                          1.85
## AGE                               0.99
## HOMEKIDS                          1.05
## YOJ                               0.99
## INCOME                            1.00
## HOME_VAL                          1.00
## TRAVTIME                          1.02
## BLUEBOOK                          1.00
## TIF                               0.94
## OLDCLAIM                          1.00
## CLM_FREQ                          1.25
## MVR_PTS                           1.12
## CAR_AGE                           1.00
# evaluate the model by predicting using the testing set
m1_prob <- predict(full.model_FLAG, ntesting, type = "response")
m1_pclass <- ifelse(m1_prob >= 0.5, 1, 0)

# create confusion matrix
pclass <- factor(m1_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1  171   92
##          0  243 1126
##                                           
##                Accuracy : 0.7947          
##                  95% CI : (0.7743, 0.8141)
##     No Information Rate : 0.7463          
##     P-Value [Acc > NIR] : 2.504e-06       
##                                           
##                   Kappa : 0.3837          
##  Mcnemar's Test P-Value : 2.498e-16       
##                                           
##             Sensitivity : 0.4130          
##             Specificity : 0.9245          
##          Pos Pred Value : 0.6502          
##          Neg Pred Value : 0.8225          
##              Prevalence : 0.2537          
##          Detection Rate : 0.1048          
##    Detection Prevalence : 0.1612          
##       Balanced Accuracy : 0.6688          
##                                           
##        'Positive' Class : 1               
## 
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m1_prob),print.auc=TRUE)

# get McFadden
m1_mcFadden <- pR2(full.model_FLAG); m1_mcFadden["McFadden"]
##  McFadden 
## 0.2257429
## --------------------------------
# Linear Regression - TARGET_AMT
full.model.AMT <- lm(TARGET_AMT ~. -TARGET_FLAG,  data = tr_prep2)
summary(full.model.AMT)
## 
## Call:
## lm(formula = TARGET_AMT ~ . - TARGET_FLAG, data = tr_prep2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5169  -1704   -754    344 103686 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    9.821e+01  4.779e+02   0.205 0.837204    
## PARENT1Yes                     5.364e+02  2.043e+02   2.626 0.008654 ** 
## MSTATUSYes                    -5.759e+02  1.453e+02  -3.963 7.47e-05 ***
## SEXM                           3.471e+02  1.833e+02   1.894 0.058294 .  
## EDUCATIONBachelors            -2.659e+02  2.055e+02  -1.294 0.195735    
## EDUCATIONHigh School          -9.582e+01  1.716e+02  -0.558 0.576571    
## EDUCATIONMasters              -4.219e+01  3.061e+02  -0.138 0.890376    
## EDUCATIONPhD                   3.898e+02  3.757e+02   1.038 0.299462    
## JOBClerical                    1.773e+01  1.917e+02   0.093 0.926295    
## JOBDoctor                     -1.046e+03  4.009e+02  -2.608 0.009117 ** 
## JOBHome Maker                 -1.745e+02  2.684e+02  -0.650 0.515478    
## JOBLawyer                     -2.484e+02  2.928e+02  -0.848 0.396242    
## JOBManager                    -9.855e+02  2.328e+02  -4.233 2.33e-05 ***
## JOBProfessional               -2.882e+01  2.117e+02  -0.136 0.891724    
## JOBStudent                    -2.275e+02  2.358e+02  -0.965 0.334779    
## CAR_USEPrivate                -8.000e+02  1.578e+02  -5.071 4.05e-07 ***
## CAR_TYPEPanel Truck            2.668e+02  2.724e+02   0.979 0.327500    
## CAR_TYPEPickup                 3.707e+02  1.706e+02   2.172 0.029869 *  
## CAR_TYPESports Car             1.014e+03  2.178e+02   4.656 3.28e-06 ***
## CAR_TYPESUV                    7.434e+02  1.793e+02   4.145 3.43e-05 ***
## CAR_TYPEVan                    5.220e+02  2.127e+02   2.454 0.014162 *  
## RED_CARyes                    -3.741e+01  1.491e+02  -0.251 0.801877    
## REVOKEDYes                     5.906e+02  1.743e+02   3.387 0.000709 ***
## URBANICITYHighly Urban/ Urban  1.675e+03  1.392e+02  12.032  < 2e-16 ***
## KIDSDRIV                       6.104e+02  1.789e+02   3.411 0.000650 ***
## AGE                            6.643e+00  7.158e+00   0.928 0.353395    
## HOMEKIDS                       8.003e+01  6.981e+01   1.146 0.251677    
## YOJ                           -5.048e+00  1.503e+01  -0.336 0.737007    
## INCOME                        -4.652e-03  2.081e-03  -2.236 0.025408 *  
## HOME_VAL                      -6.471e-04  5.961e-04  -1.085 0.277733    
## TRAVTIME                       1.285e+01  3.340e+00   3.847 0.000120 ***
## BLUEBOOK                       1.357e-02  8.954e-03   1.515 0.129769    
## TIF                           -5.120e+01  1.294e+01  -3.958 7.62e-05 ***
## OLDCLAIM                      -1.685e-02  9.119e-03  -1.848 0.064627 .  
## CLM_FREQ                       1.667e+02  5.686e+01   2.932 0.003381 ** 
## MVR_PTS                        1.731e+02  2.790e+01   6.203 5.79e-10 ***
## CAR_AGE                       -2.708e+01  1.286e+01  -2.105 0.035304 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4544 on 8124 degrees of freedom
## Multiple R-squared:  0.0709, Adjusted R-squared:  0.06679 
## F-statistic: 17.22 on 36 and 8124 DF,  p-value: < 2.2e-16
vif(full.model.AMT)
##                 GVIF Df GVIF^(1/(2*Df))
## PARENT1     1.888802  1        1.374337
## MSTATUS     2.003847  1        1.415573
## SEX         3.302562  1        1.817295
## EDUCATION  15.755173  4        1.411490
## JOB        29.731384  7        1.274177
## CAR_USE     2.296181  1        1.515316
## CAR_TYPE    5.324276  5        1.182023
## RED_CAR     1.813330  1        1.346599
## REVOKED     1.291470  1        1.136429
## URBANICITY  1.245363  1        1.115958
## KIDSDRIV    1.338024  1        1.156730
## AGE         1.462748  1        1.209441
## HOMEKIDS    2.140216  1        1.462948
## YOJ         1.416953  1        1.190358
## INCOME      2.863846  1        1.692290
## HOME_VAL    2.143594  1        1.464102
## TRAVTIME    1.034456  1        1.017082
## BLUEBOOK    2.032225  1        1.425561
## TIF         1.006054  1        1.003022
## OLDCLAIM    1.827781  1        1.351954
## CLM_FREQ    1.714439  1        1.309366
## MVR_PTS     1.229565  1        1.108858
## CAR_AGE     1.975294  1        1.405451
par(mfrow=c(2,2))
plot(full.model.AMT)

par(mfrow=c(1,1))

3.2 Stepwise variable selection

# Logistic Regression build the model using training set
step.model_FLAG <- full.model_FLAG %>% stepAIC(trace = FALSE)
summary(step.model_FLAG)
## 
## Call:
## glm(formula = TARGET_FLAG ~ PARENT1 + MSTATUS + EDUCATION + JOB + 
##     CAR_USE + CAR_TYPE + REVOKED + URBANICITY + KIDSDRIV + AGE + 
##     INCOME + HOME_VAL + TRAVTIME + BLUEBOOK + TIF + OLDCLAIM + 
##     CLM_FREQ + MVR_PTS, family = binomial, data = ntraining)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2373  -0.7173  -0.3962   0.6727   3.1660  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -2.413e+00  2.874e-01  -8.397  < 2e-16 ***
## PARENT1Yes                     3.498e-01  1.114e-01   3.140 0.001692 ** 
## MSTATUSYes                    -4.482e-01  8.998e-02  -4.981 6.32e-07 ***
## EDUCATIONBachelors            -3.871e-01  1.217e-01  -3.181 0.001467 ** 
## EDUCATIONHigh School           2.544e-02  1.048e-01   0.243 0.808155    
## EDUCATIONMasters              -3.584e-01  1.854e-01  -1.933 0.053188 .  
## EDUCATIONPhD                  -9.788e-02  2.432e-01  -0.402 0.687353    
## JOBClerical                    1.070e-01  1.184e-01   0.904 0.366156    
## JOBDoctor                     -7.340e-01  2.792e-01  -2.629 0.008553 ** 
## JOBHome Maker                 -1.020e-01  1.607e-01  -0.635 0.525620    
## JOBLawyer                     -1.293e-01  1.952e-01  -0.663 0.507617    
## JOBManager                    -7.891e-01  1.532e-01  -5.150 2.61e-07 ***
## JOBProfessional               -1.255e-01  1.340e-01  -0.937 0.348867    
## JOBStudent                    -7.319e-02  1.386e-01  -0.528 0.597453    
## CAR_USEPrivate                -7.698e-01  9.730e-02  -7.911 2.55e-15 ***
## CAR_TYPEPanel Truck            5.756e-01  1.653e-01   3.482 0.000497 ***
## CAR_TYPEPickup                 5.729e-01  1.120e-01   5.117 3.10e-07 ***
## CAR_TYPESports Car             9.912e-01  1.201e-01   8.254  < 2e-16 ***
## CAR_TYPESUV                    7.228e-01  9.668e-02   7.476 7.64e-14 ***
## CAR_TYPEVan                    7.074e-01  1.351e-01   5.236 1.64e-07 ***
## REVOKEDYes                     9.462e-01  1.015e-01   9.323  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.389e+00  1.252e-01  19.089  < 2e-16 ***
## KIDSDRIV                       6.648e-01  9.828e-02   6.765 1.34e-11 ***
## AGE                           -6.861e-03  4.149e-03  -1.654 0.098213 .  
## INCOME                        -4.437e-06  1.361e-06  -3.259 0.001117 ** 
## HOME_VAL                      -1.415e-06  3.873e-07  -3.653 0.000259 ***
## TRAVTIME                       1.572e-02  2.172e-03   7.235 4.67e-13 ***
## BLUEBOOK                      -1.826e-05  5.464e-06  -3.343 0.000830 ***
## TIF                           -6.027e-02  8.585e-03  -7.020 2.21e-12 ***
## OLDCLAIM                      -1.780e-05  5.314e-06  -3.350 0.000809 ***
## CLM_FREQ                       2.204e-01  3.265e-02   6.751 1.47e-11 ***
## MVR_PTS                        1.122e-01  1.637e-02   6.851 7.32e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7568.3  on 6528  degrees of freedom
## Residual deviance: 5862.0  on 6497  degrees of freedom
## AIC: 5926
## 
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(step.model_FLAG))),2)
##                               Estimate
## (Intercept)                       0.09
## PARENT1Yes                        1.42
## MSTATUSYes                        0.64
## EDUCATIONBachelors                0.68
## EDUCATIONHigh School              1.03
## EDUCATIONMasters                  0.70
## EDUCATIONPhD                      0.91
## JOBClerical                       1.11
## JOBDoctor                         0.48
## JOBHome Maker                     0.90
## JOBLawyer                         0.88
## JOBManager                        0.45
## JOBProfessional                   0.88
## JOBStudent                        0.93
## CAR_USEPrivate                    0.46
## CAR_TYPEPanel Truck               1.78
## CAR_TYPEPickup                    1.77
## CAR_TYPESports Car                2.69
## CAR_TYPESUV                       2.06
## CAR_TYPEVan                       2.03
## REVOKEDYes                        2.58
## URBANICITYHighly Urban/ Urban    10.90
## KIDSDRIV                          1.94
## AGE                               0.99
## INCOME                            1.00
## HOME_VAL                          1.00
## TRAVTIME                          1.02
## BLUEBOOK                          1.00
## TIF                               0.94
## OLDCLAIM                          1.00
## CLM_FREQ                          1.25
## MVR_PTS                           1.12
# evaluate the model by predicting using the testing set
m2_prob <- predict(step.model_FLAG, ntesting, type = "response")
m2_pclass <- ifelse(m2_prob >= 0.5, 1, 0)

# create confusion matrix
pclass <- factor(m2_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1  174   89
##          0  240 1129
##                                           
##                Accuracy : 0.7984          
##                  95% CI : (0.7781, 0.8176)
##     No Information Rate : 0.7463          
##     P-Value [Acc > NIR] : 4.255e-07       
##                                           
##                   Kappa : 0.3947          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4203          
##             Specificity : 0.9269          
##          Pos Pred Value : 0.6616          
##          Neg Pred Value : 0.8247          
##              Prevalence : 0.2537          
##          Detection Rate : 0.1066          
##    Detection Prevalence : 0.1612          
##       Balanced Accuracy : 0.6736          
##                                           
##        'Positive' Class : 1               
## 
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m2_prob),print.auc=TRUE)

# get McFadden
m2_mcFadden <- pR2(step.model_FLAG); m2_mcFadden["McFadden"]
##  McFadden 
## 0.2254604
## --------------------------------

# Linear Regression - TARGET_AMT
step.model.AMT <- full.model.AMT %>% stepAIC(trace = FALSE)
summary(step.model.AMT)
## 
## Call:
## lm(formula = TARGET_AMT ~ PARENT1 + MSTATUS + SEX + JOB + CAR_USE + 
##     CAR_TYPE + REVOKED + URBANICITY + KIDSDRIV + INCOME + TRAVTIME + 
##     BLUEBOOK + TIF + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE, 
##     data = tr_prep2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5204  -1696   -761    339 103637 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    1.990e+02  3.427e+02   0.581 0.561477    
## PARENT1Yes                     5.940e+02  1.783e+02   3.331 0.000869 ***
## MSTATUSYes                    -6.201e+02  1.196e+02  -5.184 2.23e-07 ***
## SEXM                           3.182e+02  1.604e+02   1.984 0.047322 *  
## JOBClerical                    3.265e+01  1.907e+02   0.171 0.864042    
## JOBDoctor                     -5.451e+02  2.860e+02  -1.906 0.056685 .  
## JOBHome Maker                 -1.140e+02  2.465e+02  -0.462 0.643874    
## JOBLawyer                     -1.610e+02  2.166e+02  -0.743 0.457219    
## JOBManager                    -9.717e+02  2.116e+02  -4.592 4.46e-06 ***
## JOBProfessional               -1.250e+02  1.967e+02  -0.635 0.525251    
## JOBStudent                    -1.356e+02  2.218e+02  -0.611 0.541042    
## CAR_USEPrivate                -7.477e+02  1.510e+02  -4.951 7.51e-07 ***
## CAR_TYPEPanel Truck            3.094e+02  2.684e+02   1.153 0.249058    
## CAR_TYPEPickup                 3.965e+02  1.693e+02   2.342 0.019200 *  
## CAR_TYPESports Car             1.029e+03  2.164e+02   4.755 2.02e-06 ***
## CAR_TYPESUV                    7.487e+02  1.785e+02   4.194 2.77e-05 ***
## CAR_TYPEVan                    5.400e+02  2.114e+02   2.555 0.010648 *  
## REVOKEDYes                     5.992e+02  1.742e+02   3.439 0.000587 ***
## URBANICITYHighly Urban/ Urban  1.667e+03  1.391e+02  11.985  < 2e-16 ***
## KIDSDRIV                       7.014e+02  1.620e+02   4.330 1.51e-05 ***
## INCOME                        -5.499e-03  1.838e-03  -2.991 0.002785 ** 
## TRAVTIME                       1.274e+01  3.338e+00   3.816 0.000137 ***
## BLUEBOOK                       1.391e-02  8.854e-03   1.571 0.116175    
## TIF                           -5.065e+01  1.293e+01  -3.917 9.05e-05 ***
## OLDCLAIM                      -1.693e-02  9.114e-03  -1.858 0.063258 .  
## CLM_FREQ                       1.688e+02  5.681e+01   2.972 0.002968 ** 
## MVR_PTS                        1.739e+02  2.786e+01   6.241 4.56e-10 ***
## CAR_AGE                       -2.765e+01  1.130e+01  -2.447 0.014435 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4544 on 8133 degrees of freedom
## Multiple R-squared:  0.06987,    Adjusted R-squared:  0.06678 
## F-statistic: 22.63 on 27 and 8133 DF,  p-value: < 2.2e-16
vif(step.model.AMT)
##                GVIF Df GVIF^(1/(2*Df))
## PARENT1    1.439404  1        1.199751
## MSTATUS    1.357795  1        1.165245
## SEX        2.529123  1        1.590322
## JOB        4.392786  7        1.111501
## CAR_USE    2.103321  1        1.450283
## CAR_TYPE   5.046346  5        1.175703
## REVOKED    1.289951  1        1.135760
## URBANICITY 1.243742  1        1.115232
## KIDSDRIV   1.096511  1        1.047144
## INCOME     2.234825  1        1.494933
## TRAVTIME   1.033216  1        1.016472
## BLUEBOOK   1.986772  1        1.409529
## TIF        1.005226  1        1.002610
## OLDCLAIM   1.825934  1        1.351271
## CLM_FREQ   1.711751  1        1.308339
## MVR_PTS    1.226375  1        1.107418
## CAR_AGE    1.524544  1        1.234724
par(mfrow=c(2,2))
plot(step.model.AMT)

par(mfrow=c(1,1))

3.3 Significant value variable selection

# Logistic Regression build the model using training set
select.model_FLAG <- glm(TARGET_FLAG ~.
                       -TARGET_AMT
                       -EDUCATION
                       -SEX
                       -RED_CAR
                       -KIDSDRIV
                       -AGE
                       -HOMEKIDS
                       -YOJ
                       -HOME_VAL
                       -OLDCLAIM
                       -BLUEBOOK 

                         , data = ntraining , family = binomial)
summary(select.model_FLAG)
## 
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT - EDUCATION - SEX - 
##     RED_CAR - KIDSDRIV - AGE - HOMEKIDS - YOJ - HOME_VAL - OLDCLAIM - 
##     BLUEBOOK, family = binomial, data = ntraining)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2107  -0.7298  -0.4161   0.6879   3.0244  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -2.942e+00  1.998e-01 -14.728  < 2e-16 ***
## PARENT1Yes                     5.937e-01  1.013e-01   5.860 4.63e-09 ***
## MSTATUSYes                    -4.950e-01  7.459e-02  -6.636 3.23e-11 ***
## JOBClerical                    1.235e-01  1.169e-01   1.056 0.290784    
## JOBDoctor                     -6.608e-01  1.900e-01  -3.477 0.000507 ***
## JOBHome Maker                 -2.078e-01  1.514e-01  -1.373 0.169867    
## JOBLawyer                     -3.432e-01  1.359e-01  -2.526 0.011531 *  
## JOBManager                    -9.475e-01  1.375e-01  -6.890 5.59e-12 ***
## JOBProfessional               -3.489e-01  1.229e-01  -2.838 0.004535 ** 
## JOBStudent                     5.239e-02  1.341e-01   0.391 0.695921    
## CAR_USEPrivate                -7.046e-01  9.120e-02  -7.725 1.12e-14 ***
## CAR_TYPEPanel Truck            3.845e-01  1.494e-01   2.574 0.010057 *  
## CAR_TYPEPickup                 6.319e-01  1.088e-01   5.808 6.32e-09 ***
## CAR_TYPESports Car             1.018e+00  1.176e-01   8.653  < 2e-16 ***
## CAR_TYPESUV                    7.800e-01  9.443e-02   8.259  < 2e-16 ***
## CAR_TYPEVan                    6.278e-01  1.307e-01   4.803 1.56e-06 ***
## REVOKEDYes                     7.982e-01  8.819e-02   9.051  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.315e+00  1.237e-01  18.707  < 2e-16 ***
## INCOME                        -7.539e-06  1.217e-06  -6.195 5.82e-10 ***
## TRAVTIME                       1.545e-02  2.145e-03   7.204 5.87e-13 ***
## TIF                           -5.976e-02  8.511e-03  -7.021 2.20e-12 ***
## CLM_FREQ                       1.746e-01  2.804e-02   6.226 4.78e-10 ***
## MVR_PTS                        1.084e-01  1.611e-02   6.728 1.72e-11 ***
## CAR_AGE                       -9.808e-03  7.267e-03  -1.350 0.177145    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7568.3  on 6528  degrees of freedom
## Residual deviance: 5970.6  on 6505  degrees of freedom
## AIC: 6018.6
## 
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(select.model_FLAG))),2)
##                               Estimate
## (Intercept)                       0.05
## PARENT1Yes                        1.81
## MSTATUSYes                        0.61
## JOBClerical                       1.13
## JOBDoctor                         0.52
## JOBHome Maker                     0.81
## JOBLawyer                         0.71
## JOBManager                        0.39
## JOBProfessional                   0.71
## JOBStudent                        1.05
## CAR_USEPrivate                    0.49
## CAR_TYPEPanel Truck               1.47
## CAR_TYPEPickup                    1.88
## CAR_TYPESports Car                2.77
## CAR_TYPESUV                       2.18
## CAR_TYPEVan                       1.87
## REVOKEDYes                        2.22
## URBANICITYHighly Urban/ Urban    10.12
## INCOME                            1.00
## TRAVTIME                          1.02
## TIF                               0.94
## CLM_FREQ                          1.19
## MVR_PTS                           1.11
## CAR_AGE                           0.99
# evaluate the model by predicting using the testing set
m3_prob <- predict(select.model_FLAG, ntesting, type = "response")
m3_pclass <- ifelse(m3_prob >= 0.5, 1, 0)

# create confusion matrix
pclass <- factor(m3_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1  160   89
##          0  254 1129
##                                           
##                Accuracy : 0.7898          
##                  95% CI : (0.7692, 0.8094)
##     No Information Rate : 0.7463          
##     P-Value [Acc > NIR] : 2.172e-05       
##                                           
##                   Kappa : 0.3609          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.38647         
##             Specificity : 0.92693         
##          Pos Pred Value : 0.64257         
##          Neg Pred Value : 0.81634         
##              Prevalence : 0.25368         
##          Detection Rate : 0.09804         
##    Detection Prevalence : 0.15257         
##       Balanced Accuracy : 0.65670         
##                                           
##        'Positive' Class : 1               
## 
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m3_prob),print.auc=TRUE)

# get McFadden
m3_mcFadden <- pR2(select.model_FLAG); m3_mcFadden["McFadden"]
##  McFadden 
## 0.2111078
## --------------------------------

# Linear Regression - TARGET_AMT
select.model.AMT <- lm(TARGET_AMT ~. 
                       -TARGET_FLAG
                       -EDUCATION
                       -SEX
                       -RED_CAR
                       -KIDSDRIV
                       -AGE
                       -HOMEKIDS
                       -YOJ
                       -HOME_VAL
                       -OLDCLAIM
                       -BLUEBOOK 
                       ,  data = tr_prep2)
summary(select.model.AMT)
## 
## Call:
## lm(formula = TARGET_AMT ~ . - TARGET_FLAG - EDUCATION - SEX - 
##     RED_CAR - KIDSDRIV - AGE - HOMEKIDS - YOJ - HOME_VAL - OLDCLAIM - 
##     BLUEBOOK, data = tr_prep2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4919  -1694   -764    329 103709 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    6.323e+02  2.756e+02   2.294 0.021833 *  
## PARENT1Yes                     7.893e+02  1.716e+02   4.600 4.29e-06 ***
## MSTATUSYes                    -5.333e+02  1.180e+02  -4.521 6.23e-06 ***
## JOBClerical                    1.389e+01  1.908e+02   0.073 0.941986    
## JOBDoctor                     -5.647e+02  2.861e+02  -1.974 0.048433 *  
## JOBHome Maker                 -1.995e+02  2.438e+02  -0.818 0.413208    
## JOBLawyer                     -1.830e+02  2.167e+02  -0.844 0.398476    
## JOBManager                    -9.853e+02  2.118e+02  -4.653 3.32e-06 ***
## JOBProfessional               -1.459e+02  1.967e+02  -0.742 0.458352    
## JOBStudent                    -1.712e+02  2.216e+02  -0.773 0.439748    
## CAR_USEPrivate                -7.361e+02  1.511e+02  -4.871 1.13e-06 ***
## CAR_TYPEPanel Truck            5.732e+02  2.388e+02   2.400 0.016396 *  
## CAR_TYPEPickup                 3.797e+02  1.680e+02   2.260 0.023828 *  
## CAR_TYPESports Car             7.770e+02  1.831e+02   4.243 2.23e-05 ***
## CAR_TYPESUV                    5.240e+02  1.389e+02   3.771 0.000164 ***
## CAR_TYPEVan                    6.602e+02  2.038e+02   3.239 0.001202 ** 
## REVOKEDYes                     4.733e+02  1.550e+02   3.054 0.002266 ** 
## URBANICITYHighly Urban/ Urban  1.646e+03  1.392e+02  11.822  < 2e-16 ***
## INCOME                        -5.164e-03  1.805e-03  -2.860 0.004242 ** 
## TRAVTIME                       1.292e+01  3.341e+00   3.866 0.000111 ***
## TIF                           -5.077e+01  1.295e+01  -3.921 8.89e-05 ***
## CLM_FREQ                       1.235e+02  4.891e+01   2.524 0.011616 *  
## MVR_PTS                        1.696e+02  2.775e+01   6.115 1.01e-09 ***
## CAR_AGE                       -2.778e+01  1.131e+01  -2.455 0.014105 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4551 on 8137 degrees of freedom
## Multiple R-squared:  0.06681,    Adjusted R-squared:  0.06418 
## F-statistic: 25.33 on 23 and 8137 DF,  p-value: < 2.2e-16
vif(select.model.AMT)
##                GVIF Df GVIF^(1/(2*Df))
## PARENT1    1.329311  1        1.152957
## MSTATUS    1.316334  1        1.147316
## JOB        4.219698  7        1.108314
## CAR_USE    2.100340  1        1.449255
## CAR_TYPE   1.803986  5        1.060775
## REVOKED    1.017562  1        1.008743
## URBANICITY 1.242415  1        1.114637
## INCOME     2.149521  1        1.466124
## TRAVTIME   1.032640  1        1.016189
## TIF        1.005153  1        1.002573
## CLM_FREQ   1.265076  1        1.124756
## MVR_PTS    1.212759  1        1.101253
## CAR_AGE    1.524066  1        1.234531
par(mfrow=c(2,2))
plot(select.model.AMT)

par(mfrow=c(1,1))

Section 4 - Select Models

Although not a big difference in the performance of all 3 models, the stepwise model performs slightly better with lowest AIC and lowest difference between multiple and adjusted R-squared, with acceptable accuracy, recall, precision, and roc measure.

The second model will be used to predict the evaluation dataset.

# Read the evaluation dataset
 eval_df <- read.csv("https://raw.githubusercontent.com/L-Velasco/DATA621_FA18/master/HW4/insurance-evaluation-data.csv", stringsAsFactors = FALSE)


# Remove columns not selected in 2nd model
#eval_df <- dplyr::select(eval_df, -YOJ, -MSTATUS, -RED_CAR)

#  Convert to numeric
eval_df$INCOME <- as.numeric(gsub('[$,]', '', eval_df$INCOME))
eval_df$HOME_VAL <- as.numeric(gsub('[$,]', '', eval_df$HOME_VAL))
eval_df$BLUEBOOK <- as.numeric(gsub('[$,]', '', eval_df$BLUEBOOK))
eval_df$OLDCLAIM <- as.numeric(gsub('[$,]', '', eval_df$OLDCLAIM))

# Remove irrelevant characters
eval_df$MSTATUS  <- gsub("z_", "", eval_df$MSTATUS)
eval_df$SEX  <- gsub("z_", "", eval_df$SEX)
eval_df$EDUCATION  <- gsub("z_", "", eval_df$EDUCATION)
eval_df$JOB  <- gsub("z_", "", eval_df$JOB)
eval_df$CAR_USE  <- gsub("z_", "", eval_df$CAR_USE)
eval_df$CAR_TYPE  <- gsub("z_", "", eval_df$CAR_TYPE)
eval_df$URBANICITY  <- gsub("z_", "", eval_df$URBANICITY)

#impute
eval_df = eval_df %>% 
  mutate(AGE = 
           ifelse(is.na(AGE), 
                  mean(AGE, na.rm=TRUE), AGE)) %>% 

  mutate(YOJ = 
           ifelse(is.na(YOJ), 
                  mean(YOJ, na.rm=TRUE), YOJ)) %>% 

  mutate(INCOME = 
           ifelse(is.na(INCOME), 
                  median(INCOME, na.rm=TRUE), INCOME)) %>% 

  mutate(HOME_VAL = 
           ifelse(is.na(HOME_VAL), 
                  mean(HOME_VAL, na.rm=TRUE), HOME_VAL)) %>% 

  mutate(CAR_AGE = 
           ifelse(is.na(CAR_AGE), 
                  mean(CAR_AGE, na.rm=TRUE), CAR_AGE)) %>% 

  mutate(JOB = 
           ifelse((JOB == "" & EDUCATION == 'PhD'),
                  "Doctor", JOB)) %>% 

  mutate(JOB = 
           ifelse((JOB == "" & EDUCATION == 'Masters'),
                  "Lawyer", JOB))


 eval_prob <- predict(step.model_FLAG, eval_df, type = "response")
 eval_pclass <- ifelse(eval_prob >= 0.5, 1, 0)
 
 eval_amt <- ifelse(eval_pclass == 1, predict(step.model.AMT, eval_df, type = "response"), 0)
 
 
 eval_df$TARGET_FLAG <- eval_pclass
 eval_df$TARGET_AMT <- eval_amt
  
# Export
  write.csv(eval_df,file="HW4_Auto Insurance.csv")