Introduction

This homework exercise is to build a logistic regression model and a multiple regression model that will estimate the likelihood of car accident, and if so, we try to predict the cost when such accidents happen. We have two response variables, i.e. TARGET_FLAG and TARGET_AMT. TARGET_FLAG is a binary field where 1 is equal to crash, and 0 is equal to no crash. TARGET_AMT, on the other hand, is the amount of time spent on repairs given there is a car crash accident. TARGET_FLAG is the response variable for our logistic regression model, whereas TARGET_AMT is the response variable for multiple regression.

# load packages
if(!require(pacman)){install.packages("pacman"); require(pacman)}
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.6.2
packages <- c('tidyverse', 'sqldf', 'broom', 'caret', 'kableExtra', 'janitor', 'Hmisc', 'MASS', 'corrplot', 'Metrics')
pacman::p_load(char = packages)

EDA

# read data
dfTrain <- read.csv("insurance_training_data.csv", header = TRUE)
dfEval <- read.csv("insurance-evaluation-data.csv", header = TRUE)

# check dim
dim(dfTrain); dim(dfEval)
## [1] 8161   26
## [1] 2141   26

# are they compatible?
if(!any(names(dfTrain) == names(dfEval))){print("the two data sets are different, please check for consistency")}

# clean names
dfTrain <- dfTrain %>% janitor::clean_names()
dfEval <- dfEval %>% janitor::clean_names()

# head
head(dfTrain) %>% kable()
index target_flag target_amt kidsdriv age homekids yoj income parent1 home_val mstatus sex education job travtime car_use bluebook tif car_type red_car oldclaim clm_freq revoked mvr_pts car_age urbanicity
1 0 0 0 60 0 11 $67,349 No $0 z_No M PhD Professional 14 Private $14,230 11 Minivan yes $4,461 2 No 3 18 Highly Urban/ Urban
2 0 0 0 43 0 11 $91,449 No $257,252 z_No M z_High School z_Blue Collar 22 Commercial $14,940 1 Minivan yes $0 0 No 0 1 Highly Urban/ Urban
4 0 0 0 35 1 10 $16,039 No $124,191 Yes z_F z_High School Clerical 5 Private $4,010 4 z_SUV no $38,690 2 No 3 10 Highly Urban/ Urban
5 0 0 0 51 0 14 No $306,251 Yes M <High School z_Blue Collar 32 Private $15,440 7 Minivan yes $0 0 No 0 6 Highly Urban/ Urban
6 0 0 0 50 0 NA $114,986 No $243,925 Yes z_F PhD Doctor 36 Private $18,000 1 z_SUV no $19,217 2 Yes 3 17 Highly Urban/ Urban
7 1 2946 0 34 1 12 $125,301 Yes $0 z_No z_F Bachelors z_Blue Collar 46 Commercial $17,430 1 Sports Car no $0 0 No 0 7 Highly Urban/ Urban

# target flag - proportion
with(dfTrain, prop.table(ftable(target_flag), 1))
## target_flag         0         1
##                                
##             0.7361843 0.2638157

# target amt - distribution
hist(dfTrain$target_amt)


# target amt - log transformed
boxplot(log(dfTrain$target_amt) ~ dfTrain$target_flag)
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z$group
## == : Outlier (-Inf) in boxplot 1 is not drawn

DATA PREP

# length of unique values of each variable from dfTrain
sapply(dfTrain, function(x) length(unique(x)))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##        8161           2        1949           5          61           6 
##         yoj      income     parent1    home_val     mstatus         sex 
##          22        6613           2        5107           2           2 
##   education         job    travtime     car_use    bluebook         tif 
##           5           9          97           2        2789          23 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           6           2        2857           6           2          13 
##     car_age  urbanicity 
##          31           2

# quickly glance at the class of each variable
fields <- data.frame(fields = names(dfTrain),
                     class = sapply(dfTrain, class) %>% unlist %>% as.vector)
fields
##         fields   class
## 1        index integer
## 2  target_flag integer
## 3   target_amt numeric
## 4     kidsdriv integer
## 5          age integer
## 6     homekids integer
## 7          yoj integer
## 8       income  factor
## 9      parent1  factor
## 10    home_val  factor
## 11     mstatus  factor
## 12         sex  factor
## 13   education  factor
## 14         job  factor
## 15    travtime integer
## 16     car_use  factor
## 17    bluebook  factor
## 18         tif integer
## 19    car_type  factor
## 20     red_car  factor
## 21    oldclaim  factor
## 22    clm_freq integer
## 23     revoked  factor
## 24     mvr_pts integer
## 25     car_age integer
## 26  urbanicity  factor

# mutate target_flag
# mutate these 4 factors - "income", "home_val", "bluebook", "oldclaim"
# these should be numeric variables, not factor
dfTrain <- dfTrain %>%
        dplyr::mutate(target_flag = dplyr::case_when(target_flag == 1 ~ "Y", 
                                              TRUE ~ "N") %>%
                              factor(., levels = c("Y", "N"), labels = c("Yes", "No")),
                      income = as.numeric(income),
                      home_val = as.numeric(home_val),
                      bluebook = as.numeric(bluebook),
                      oldclaim = as.numeric(oldclaim))

dfEval <- dfEval %>%
        dplyr::mutate(target_flag = dplyr::case_when(target_flag == 1 ~ "Y", 
                                              TRUE ~ "N") %>%
                              factor(., levels = c("Y", "N"), labels = c("Yes", "No")),
                      income = as.numeric(income),
                      home_val = as.numeric(home_val),
                      bluebook = as.numeric(bluebook),
                      oldclaim = as.numeric(oldclaim))

# rerun
fields <- data.frame(fields = names(dfTrain),
                     class = sapply(dfTrain, class) %>% unlist %>% as.vector)
fields
##         fields   class
## 1        index integer
## 2  target_flag  factor
## 3   target_amt numeric
## 4     kidsdriv integer
## 5          age integer
## 6     homekids integer
## 7          yoj integer
## 8       income numeric
## 9      parent1  factor
## 10    home_val numeric
## 11     mstatus  factor
## 12         sex  factor
## 13   education  factor
## 14         job  factor
## 15    travtime integer
## 16     car_use  factor
## 17    bluebook numeric
## 18         tif integer
## 19    car_type  factor
## 20     red_car  factor
## 21    oldclaim numeric
## 22    clm_freq integer
## 23     revoked  factor
## 24     mvr_pts integer
## 25     car_age integer
## 26  urbanicity  factor

# set variables
binary_var <- "target_flag"
quant_var <- "target_amt"
variables <- names(dfTrain)[names(dfTrain) %nin% c(binary_var, quant_var, "index")]

# check missing
colSums(is.na(dfTrain))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##           0           0           0           0           6           0 
##         yoj      income     parent1    home_val     mstatus         sex 
##         454           0           0           0           0           0 
##   education         job    travtime     car_use    bluebook         tif 
##           0           0           0           0           0           0 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           0           0           0           0           0           0 
##     car_age  urbanicity 
##         510           0
colSums(is.na(dfEval))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##           0           0        2141           0           1           0 
##         yoj      income     parent1    home_val     mstatus         sex 
##          94           0           0           0           0           0 
##   education         job    travtime     car_use    bluebook         tif 
##           0           0           0           0           0           0 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           0           0           0           0           0           0 
##     car_age  urbanicity 
##         129           0

# fix missing - train
preProcValuesTrain <- caret::preProcess(dfTrain[, variables], method = c("medianImpute", "center", "scale"))
dfTrain <- predict(preProcValuesTrain, dfTrain)
colSums(is.na(dfTrain))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##           0           0           0           0           0           0 
##         yoj      income     parent1    home_val     mstatus         sex 
##           0           0           0           0           0           0 
##   education         job    travtime     car_use    bluebook         tif 
##           0           0           0           0           0           0 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           0           0           0           0           0           0 
##     car_age  urbanicity 
##           0           0

# fix missing - eval
preProcValuesEval <- caret::preProcess(dfEval[, variables], method = c("medianImpute", "center", "scale"))
dfEval <- predict(preProcValuesEval, dfEval)
colSums(is.na(dfEval))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##           0           0        2141           0           0           0 
##         yoj      income     parent1    home_val     mstatus         sex 
##           0           0           0           0           0           0 
##   education         job    travtime     car_use    bluebook         tif 
##           0           0           0           0           0           0 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           0           0           0           0           0           0 
##     car_age  urbanicity 
##           0           0

# split data into train (70%), test (30%) sets
set.seed(1234)
index <- caret::createDataPartition(dfTrain$target_flag, p = 0.7, list = FALSE)
trainSet <- dfTrain[index, ]
testSet <- dfTrain[-index, ]

# trainSetControl - let's do 10-fold cross-validation 
trainSet.control <- caret::trainControl(method = "cv", number = 10, savePredictions = 'final', classProbs = TRUE)

MODEL - CLASSIFICATION PROBLEM (TARGET_FLAG)

We will first build three classification models and then two multi-linear regression models. Besides logistic regression, let’s try two different classification algorithms, i.e. naivey bayes and random forest. In addition, let’s try to improve our accuracy by 1) doing 10-fold cross-validation, and 2) ensembling our models, i.e. using glm (generalized linear model), nb (naive bayes) and rf (random forest) as our base layer and stacking on top by building a top layer using glm, nb, in addition gbm (gradient boosting machine).

# turn off warning
options(warn = -1)

# count time - start
start <- Sys.time()

# set seed
set.seed(1234)

# methods
baseTrainMethods <- c("glm", "nb", "rf")
topTrainMethods <- c("glm", "nb", "gbm")

# output
modelSummaryList <- vector(mode = "list")

# train base layer
for(baseLayer in baseTrainMethods){
        
        # set parameters
        ml = baseLayer
        model = paste0("model_base_", ml)
        OOF_prediction = paste0("OOF_pred_", ml)
        prediction = paste0("pred_", ml)
        result = paste0("result_", ml)
        
        # model
        assign(bquote(.(model)), caret::train(trainSet[, variables], trainSet[, binary_var], 
                                              method = ml, 
                                              trControl = trainSet.control,
                                              trace = FALSE))
        
        # Out-Of-Fold probability predictions - trainSet    
        if(ml == "glm"){trainSet$OOF_pred_glm = eval(sym(model))$pred$Y[order(eval(sym(model))$pred$rowIndex)]}        
        if(ml == "nb"){trainSet$OOF_pred_nb = eval(sym(model))$pred$Y[order(eval(sym(model))$pred$rowIndex)]}        
        if(ml == "rf"){trainSet$OOF_pred_rf = eval(sym(model))$pred$Y[order(eval(sym(model))$pred$rowIndex)]}
        
        # Out-Of-Fold probability predictions - testSet 
        assign(bquote(.(OOF_prediction)), predict(eval(sym(model)), testSet[, variables], type = "prob")$Y)
        if(ml == "glm"){testSet$OOF_pred_glm = eval(sym(OOF_prediction))}        
        if(ml == "nb"){testSet$OOF_pred_nb = eval(sym(OOF_prediction))}        
        if(ml == "rf"){testSet$OOF_pred_rf = eval(sym(OOF_prediction))}
        
        # Y/N predictions for Confusion Matrix - testSet    
        assign(bquote(.(prediction)), predict(eval(sym(model)), testSet[, variables]))
        if(ml == "glm"){testSet$pred_glm = eval(sym(prediction))}        
        if(ml == "nb"){testSet$pred_nb = eval(sym(prediction))}        
        if(ml == "rf"){testSet$pred_rf = eval(sym(prediction))}
        
        # output
        assign(bquote(.(result)), broom::tidy(caret::confusionMatrix(testSet[, prediction], testSet[, binary_var])) %>%
                       dplyr::mutate(trainMethod = ml) %>%
                       dplyr::select(trainMethod, everything()))
        
        # store output into a list
        tempModelList <- list(eval(sym(result)))
        modelSummaryList <<- c(modelSummaryList, tempModelList)        
        
}

# train top layer
for(topLayer in topTrainMethods){
        
        # set parameters
        ml = topLayer
        model = paste0("model_top_", ml)        
        OOF_predictors_top = c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")
        OOF_prediction_top = paste0("OOF_pred_top_", ml)
        prediction_top = paste0("pred_top_", ml)
        result = paste0("result_top_", ml)
        
        # model
        assign(bquote(.(model)), caret::train(trainSet[, OOF_predictors_top], trainSet[, binary_var], 
                                              method = ml, 
                                              trControl = trainSet.control))
        
        # Out-Of-Fold probability predictions - testSet 
        assign(bquote(.(OOF_prediction_top)), predict(eval(sym(model)), testSet[, OOF_predictors_top], type = "prob")$Y)
        if(ml == "glm"){testSet$OOF_pred_top_glm = eval(sym(OOF_prediction_top))}
        if(ml == "nb"){testSet$OOF_pred_top_nb = eval(sym(OOF_prediction_top))}
        if(ml == "gbm"){testSet$OOF_pred_top_gbm = eval(sym(OOF_prediction_top))}
        
        # Y/N predictions for Confusion Matrix - testSet    
        assign(bquote(.(prediction_top)), predict(eval(sym(model)), testSet[, OOF_predictors_top]))
        if(ml == "glm"){testSet$pred_top_glm = eval(sym(prediction_top))}
        if(ml == "nb"){testSet$pred_top_nb = eval(sym(prediction_top))}
        if(ml == "gbm"){testSet$pred_top_gbm = eval(sym(prediction_top))}
        
        # output
        assign(bquote(.(result)), broom::tidy(caret::confusionMatrix(testSet[, prediction_top], testSet[, binary_var])) %>%
                       dplyr::mutate(trainMethod = paste0(ml, " - top layer")) %>%
                       dplyr::select(trainMethod, everything()))
        
        # store output into a list
        tempModelList <- list(eval(sym(result)))
        modelSummaryList <<- c(modelSummaryList, tempModelList)
        
}

# put together - averaging the OOF prediction probability
testSet <- testSet %>%
        dplyr::mutate(pred_final_prob_avg = (OOF_pred_top_glm + OOF_pred_top_nb + OOF_pred_top_gbm) / length(topTrainMethods),
                      pred_final = ifelse(pred_final_prob_avg > 0.5, "Y", "N") %>%
                              factor(., levels = c("Y", "N"), labels = c("Yes", "No")))

finalResult <- broom::tidy(caret::confusionMatrix(testSet$pred_final, testSet$target_flag)) %>%
        dplyr::mutate(trainMethod = "final - averaging") %>%
        dplyr::select(trainMethod, everything())

# store finalResult output into a list
tempModelList <- list(finalResult)
modelSummaryList <<- c(modelSummaryList, tempModelList)

# count time - finish
finish <- Sys.time()

MODEL SELECTION - CLASSIFICATION PROBLEM (TARGET_FLAG)

First, we build a base layer (using glm, nb and rf) to predict the “target_flag” using all the variables (minus the index and target_amt) from the train set. Second, we build a top layer (using glm, nb and gbm) to predict the “target_flag” based on the outcomes (OOF or Out-Of-Fold prediction) of our base layer. In other words, we make prediction (of the target variable) based on the predictions of our base models. Finally, we simply average the outcome probabilities of our top layer to get our final probability and decision using 0.5 as cut-off.

Take a look of our results, i.e. modelSummaryDf, we decide to apply the final model (averaging) for our evaluation set because of the highest f1 score displayed for the final result from the test set. Looking at the results, “nb” from the base layer did the worst in terms of sensitivity, but did the best among all models in precision. All three models from the top layer can significantly enhance accuracy, sensitivy and f1 score. The final result is far from perfect but better than relying on just a single classification algorithm. Lastly, when we compare the distribution between train and evaluation set, we see that that the proportion is very similar, i.e. Yes from evaluation (21.4%) vs Yes from train set (26.4%). Although it improved accuracy, the downside of ensembling is that the result is hard to interpret and communicate with stakeholder (such as what contributed more significantly to causing an accident).

# let's look at the time it finished running the models
print(paste0("The models ensembling exercise took roughly ", round(finish - start, 1), " mins to run"))
## [1] "The models ensembling exercise took roughly 5.8 mins to run"

# let's look at the result
modelSummaryDf <- modelSummaryList %>% 
        dplyr::bind_rows() %>% 
        dplyr::select(trainMethod, term, estimate) %>%
        tidyr::spread(term, estimate) %>%
        arrange(desc(f1))

modelSummaryDf %>% kable()
trainMethod accuracy balanced_accuracy detection_prevalence detection_rate f1 kappa neg_pred_value pos_pred_value precision prevalence recall sensitivity specificity
final - averaging 0.7870862 0.6921848 0.2084185 0.1295464 0.5489177 0.4120569 0.8306660 0.6215686 0.6215686 0.2635881 0.4914729 0.4914729 0.8928968
gbm - top layer 0.7854516 0.6915727 0.2108705 0.1299550 0.5478036 0.4094336 0.8306577 0.6162791 0.6162791 0.2635881 0.4930233 0.4930233 0.8901221
nb - top layer 0.7825909 0.6916213 0.2170004 0.1315897 0.5476190 0.4062960 0.8314196 0.6064030 0.6064030 0.2635881 0.4992248 0.4992248 0.8840178
glm - top layer 0.7850429 0.6708885 0.1777687 0.1131998 0.5129630 0.3816700 0.8170974 0.6367816 0.6367816 0.2635881 0.4294574 0.4294574 0.9123196
glm 0.7793216 0.6620267 0.1753167 0.1091132 0.4972067 0.3630885 0.8126858 0.6223776 0.6223776 0.2635881 0.4139535 0.4139535 0.9100999
rf 0.7834083 0.6593264 0.1622395 0.1046179 0.4913628 0.3635247 0.8102439 0.6448363 0.6448363 0.2635881 0.3968992 0.3968992 0.9217536
nb 0.7621577 0.5697416 0.0600736 0.0429097 0.2651515 0.1854502 0.7652174 0.7142857 0.7142857 0.2635881 0.1627907 0.1627907 0.9766926
options(warn = -1)

# let's fit the dfEval using the final averaging method

# base layer
dfEval <- dfEval %>%
        dplyr::mutate(OOF_pred_glm = predict(model_base_glm, dfEval[, variables], type = "prob")$Y,
                      OOF_pred_nb = predict(model_base_nb, dfEval[, variables], type = "prob")$Y,
                      OOF_pred_rf = predict(model_base_rf, dfEval[, variables], type = "prob")$Y)

# top layer
dfEval <- dfEval %>%
        dplyr::mutate(OOF_pred_top_glm = predict(model_top_glm, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y,
                      OOF_pred_top_nb = predict(model_top_nb, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y,
                      OOF_pred_top_gbm = predict(model_top_gbm, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y)

# final predicion
dfEval <- dfEval %>%
        dplyr::mutate(target_flag_prob = round((OOF_pred_top_glm + OOF_pred_top_nb + OOF_pred_top_gbm) / 3, 3),
                      target_flag = ifelse(target_flag_prob > 0.5, "Y", "N") %>%
                              factor(., levels = c("Y", "N"), labels = c("Yes", "No")))
# see prediction
evalTable <- ftable(dfEval$target_flag)
evalTable
##   Yes   No
##           
##   459 1682

# comparison between evaluation and train set
with(dfEval, prop.table(evalTable, 1))  # evaluation
##        Yes        No
##                     
##  0.2143858 0.7856142
with(dfTrain, prop.table(ftable(dfTrain$target_flag)))  # train
##        Yes        No
##                     
##  0.2638157 0.7361843

# dfEval
dfEval %>% dplyr::select(index, target_flag, target_flag_prob, everything()) %>% head() %>% kable()
index target_flag target_flag_prob target_amt kidsdriv age homekids yoj income parent1 home_val mstatus sex education job travtime car_use bluebook tif car_type red_car oldclaim clm_freq revoked mvr_pts car_age urbanicity OOF_pred_glm OOF_pred_nb OOF_pred_rf OOF_pred_top_glm OOF_pred_top_nb OOF_pred_top_gbm
3 No 0.063 NA -0.3337943 0.3499326 -0.6425179 0.1488987 0.6619537 No -0.9915932 z_No M Bachelors Manager -0.4549095 Private 0.0016269 -1.0689293 Van yes -0.6571645 -0.7111922 No 0.1062001 0.3150395 Highly Urban/ Urban 0.1220784 0.0013785 0.092 0.0915576 0.0098567 0.0886053
9 No 0.232 NA 1.7198080 -0.5884831 0.2530751 0.1488987 0.6011222 Yes -0.9915932 z_No M z_High School Manager -0.7729272 Private -0.3709252 0.1901913 Minivan no 0.4022653 0.1679432 No 0.1062001 -1.2457634 Highly Urban/ Urban 0.2336235 0.2439135 0.328 0.2164859 0.2120411 0.2684035
10 No 0.053 NA -0.3337943 -0.1192753 1.1486681 0.3887064 0.3491062 Yes -0.9915932 z_No z_F z_High School z_Blue Collar -0.2004953 Commercial 1.1124265 1.1974878 z_SUV no -0.6571645 -0.7111922 No -0.8014825 0.3150395 z_Highly Rural/ Rural 0.0908932 0.0091209 0.012 0.0667108 0.0056237 0.0872490
18 No 0.109 NA -0.3337943 -1.1749930 1.1486681 0.1488987 -0.4521312 Yes -0.9915932 z_No M z_High School Clerical 2.5980610 Private 1.5329762 0.1901913 Pickup no -0.6571645 -0.7111922 Yes -0.8014825 -0.7254958 z_Highly Rural/ Rural 0.2403932 0.0106405 0.112 0.1351010 0.0187269 0.1722218
21 No 0.151 NA -0.3337943 1.6402542 -0.6425179 0.3887064 1.5865921 No -0.9915932 z_No M z_High School Manager 0.7535580 Private -0.8166164 -1.0689293 Minivan yes 1.2701376 1.0470786 No 1.0138828 -1.2457634 Highly Urban/ Urban 0.2392829 0.0536335 0.256 0.1920650 0.0462885 0.2155320
30 No 0.117 NA -0.3337943 0.1153287 -0.6425179 0.8683218 -1.3420087 No 0.3708270 Yes M Bachelors Professional -1.6633769 Commercial 0.3696079 -1.0689293 Panel Truck no -0.1254949 0.1679432 No 0.1062001 0.6618846 Highly Urban/ Urban 0.1864743 0.0259940 0.226 0.1558029 0.0287481 0.1669544

MODEL - MULTILINEAR REGRESSION (TARGET_AMT)

Let’s turn our focus to build two multilinear regression models using the same data. Interestingly, the variables do not seem to correlate with the target variable in this exercise. Nothing seems to indicate anything relevant to “target_amt”.

# corrplot - quantitative variables
trainSet %>%
        dplyr::filter(target_flag == "Yes") %>%
        dplyr::select(fields %>% dplyr::filter(class != 'factor') %>% .$fields) %>% 
        dplyr::select(-index) %>%
        cor %>%
        corrplot(method = "number", type = "upper", order = "hclust")


# density - categorical variables
dfGather <- trainSet %>%
        dplyr::filter(target_flag == "Yes") %>%
        dplyr::select(fields %>% dplyr::filter(class == 'factor') %>% .$fields, target_amt) %>% 
        tidyr::gather(key, value, -target_amt) 

dfGather %>%
        ggplot(aes(target_amt, color = value)) +
        geom_density() +
        geom_vline(data = aggregate(target_amt ~ key + value, dfGather, median), 
                   aes(xintercept = target_amt,
                       color = value),
                   linetype = "dashed") +
        facet_wrap(~ key, nrow = 5, scales = "free") + 
        theme(legend.position = "none") + 
        ggtitle("Distribution of Target Amount by Various Categorical Variables")


# build full model
fullModel <- lm(target_amt ~., data = trainSet %>% 
                        dplyr::filter(target_flag == "Yes") %>% 
                        dplyr::select(variables, quant_var))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(variables)` instead of `variables` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(quant_var)` instead of `quant_var` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
summary(fullModel)
## 
## Call:
## lm(formula = target_amt ~ ., data = trainSet %>% dplyr::filter(target_flag == 
##     "Yes") %>% dplyr::select(variables, quant_var))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7646  -2980  -1428    479  78846 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6420.76    1622.17   3.958 7.91e-05 ***
## kidsdriv                          -41.87     186.91  -0.224   0.8228    
## age                                63.94     209.70   0.305   0.7605    
## homekids                          241.18     275.06   0.877   0.3807    
## yoj                                50.77     237.11   0.214   0.8305    
## income                             26.37     243.05   0.109   0.9136    
## parent1Yes                       -135.57     677.69  -0.200   0.8415    
## home_val                          181.00     232.56   0.778   0.4365    
## mstatusz_No                       855.11     534.28   1.600   0.1097    
## sexz_F                          -1352.67     708.49  -1.909   0.0564 .  
## educationBachelors               -176.93     740.56  -0.239   0.8112    
## educationMasters                 1170.32    1245.90   0.939   0.3477    
## educationPhD                     1820.80    1428.87   1.274   0.2028    
## educationz_High School           -390.17     590.99  -0.660   0.5092    
## jobClerical                      -465.61    1380.79  -0.337   0.7360    
## jobDoctor                       -3129.01    2092.81  -1.495   0.1351    
## jobHome Maker                    -851.86    1411.45  -0.604   0.5462    
## jobLawyer                       -1153.76    1182.09  -0.976   0.3292    
## jobManager                      -1606.24    1235.83  -1.300   0.1939    
## jobProfessional                   286.88    1297.24   0.221   0.8250    
## jobStudent                       -307.51    1454.56  -0.211   0.8326    
## jobz_Blue Collar                   29.85    1320.61   0.023   0.9820    
## travtime                           31.93     202.57   0.158   0.8748    
## car_usePrivate                   -396.40     595.54  -0.666   0.5058    
## bluebook                         -130.25     194.72  -0.669   0.5037    
## tif                               -49.40     206.61  -0.239   0.8111    
## car_typePanel Truck               806.16     962.96   0.837   0.4026    
## car_typePickup                    -80.94     695.63  -0.116   0.9074    
## car_typeSports Car                740.04     814.52   0.909   0.3637    
## car_typeVan                       370.21     838.62   0.441   0.6589    
## car_typez_SUV                     327.53     715.49   0.458   0.6472    
## red_caryes                       -725.27     571.83  -1.268   0.2049    
## oldclaim                          -56.25     220.52  -0.255   0.7987    
## clm_freq                         -118.32     228.37  -0.518   0.6045    
## revokedYes                       -756.60     485.77  -1.558   0.1196    
## mvr_pts                           103.60     171.38   0.604   0.5456    
## car_age                          -527.25     289.71  -1.820   0.0690 .  
## urbanicityz_Highly Rural/ Rural   -52.34     877.12  -0.060   0.9524    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7391 on 1470 degrees of freedom
## Multiple R-squared:  0.02302,    Adjusted R-squared:  -0.001574 
## F-statistic: 0.936 on 37 and 1470 DF,  p-value: 0.5804

# stepwise regression - direction default to both
step <- MASS::stepAIC(fullModel, trace = FALSE)
step$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 + 
##     home_val + mstatus + sex + education + job + travtime + car_use + 
##     bluebook + tif + car_type + red_car + oldclaim + clm_freq + 
##     revoked + mvr_pts + car_age + urbanicity
## 
## Final Model:
## target_amt ~ mstatus + sex + car_use + revoked
## 
## 
##            Step Df  Deviance Resid. Df  Resid. Dev      AIC
## 1                                 1470 80306517739 26904.19
## 2         - job  8 350075913      1478 80656593652 26894.75
## 3    - car_type  5 156824305      1483 80813417957 26887.68
## 4   - education  4 209630349      1487 81023048307 26883.59
## 5    - kidsdriv  1    345385      1488 81023393692 26881.59
## 6     - parent1  1   1850124      1489 81025243816 26879.63
## 7  - urbanicity  1   2060081      1490 81027303897 26877.66
## 8    - travtime  1   2040288      1491 81029344185 26875.70
## 9         - tif  1   2786998      1492 81032131183 26873.75
## 10   - oldclaim  1   3433572      1493 81035564754 26871.82
## 11     - income  1   4926100      1494 81040490854 26869.91
## 12        - age  1  17593993      1495 81058084847 26868.24
## 13   - homekids  1  20065899      1496 81078150746 26866.61
## 14    - mvr_pts  1  24168604      1497 81102319350 26865.06
## 15   - clm_freq  1  17348896      1498 81119668246 26863.38
## 16   - bluebook  1  27790046      1499 81147458292 26861.90
## 17        - yoj  1  36399906      1500 81183858198 26860.58
## 18    - car_age  1  73266728      1501 81257124926 26859.94
## 19   - home_val  1  54967527      1502 81312092454 26858.96
## 20    - red_car  1  85939499      1503 81398031953 26858.55

# 2 degree of interactions - full model
step2 <- MASS::stepAIC(fullModel, ~ .^2, trace = FALSE)
step2$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 + 
##     home_val + mstatus + sex + education + job + travtime + car_use + 
##     bluebook + tif + car_type + red_car + oldclaim + clm_freq + 
##     revoked + mvr_pts + car_age + urbanicity
## 
## Final Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + home_val + 
##     mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq + 
##     mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type + 
##     car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age + 
##     income:mstatus + age:home_val + homekids:car_age + yoj:tif + 
##     kidsdriv:oldclaim + income:car_type + homekids:red_car + 
##     home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity + 
##     home_val:oldclaim + age:red_car + age:clm_freq + kidsdriv:clm_freq
## 
## 
##                     Step Df     Deviance Resid. Df  Resid. Dev      AIC
## 1                                             1470 80306517739 26904.19
## 2    + homekids:clm_freq  1 6.753112e+08      1469 79631206509 26893.46
## 3    + homekids:car_type  5 1.011300e+09      1464 78619906978 26884.18
## 4                  - job  8 3.895629e+08      1472 79009469844 26875.64
## 5  + car_type:urbanicity  4 7.850784e+08      1468 78224391452 26868.58
## 6     + home_val:mstatus  1 4.600644e+08      1467 77764327040 26861.68
## 7         + age:car_type  5 8.447631e+08      1462 76919563928 26855.21
## 8            - education  4 1.752619e+08      1466 77094825815 26850.64
## 9         + kidsdriv:age  1 2.948190e+08      1465 76800006825 26846.86
## 10      + income:mstatus  1 2.600865e+08      1464 76539920341 26843.75
## 11        + age:home_val  1 2.833268e+08      1463 76256593508 26840.16
## 12    + homekids:car_age  1 2.494893e+08      1462 76007104200 26837.21
## 13             + yoj:tif  1 2.338405e+08      1461 75773263655 26834.57
## 14   + kidsdriv:oldclaim  1 2.327973e+08      1460 75540466362 26831.93
## 15            - travtime  1 5.909411e+03      1461 75540472271 26829.93
## 16     + income:car_type  5 5.975858e+08      1456 74942886461 26827.95
## 17             - parent1  1 6.248921e+06      1457 74949135382 26826.08
## 18             - car_use  1 2.510220e+07      1458 74974237583 26824.58
## 19            - bluebook  1 2.596424e+07      1459 75000201824 26823.10
## 20             - revoked  1 3.376960e+07      1460 75033971420 26821.78
## 21    + homekids:red_car  1 1.592836e+08      1459 74874687832 26820.58
## 22        + home_val:sex  1 1.455318e+08      1458 74729155997 26819.64
## 23   + homekids:home_val  1 1.492159e+08      1457 74579940090 26818.63
## 24    + kidsdriv:car_age  1 1.541971e+08      1456 74425743008 26817.51
## 25      + age:urbanicity  1 1.541162e+08      1455 74271626808 26816.38
## 26   + home_val:oldclaim  1 1.458966e+08      1454 74125730252 26815.42
## 27         + age:red_car  1 9.948659e+07      1453 74026243664 26815.39
## 28        + age:clm_freq  1 1.051529e+08      1452 73921090735 26815.25
## 29   + kidsdriv:clm_freq  1 1.467009e+08      1451 73774389817 26814.25

# 3 degree of interactions - full model
step3 <- MASS::stepAIC(fullModel, ~ .^3, trace = FALSE)
step3$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 + 
##     home_val + mstatus + sex + education + job + travtime + car_use + 
##     bluebook + tif + car_type + red_car + oldclaim + clm_freq + 
##     revoked + mvr_pts + car_age + urbanicity
## 
## Final Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + home_val + 
##     mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq + 
##     mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type + 
##     car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age + 
##     income:mstatus + age:home_val + homekids:car_age + yoj:tif + 
##     kidsdriv:oldclaim + income:car_type + homekids:red_car + 
##     home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity + 
##     home_val:oldclaim + kidsdriv:clm_freq + age:clm_freq + age:car_type:urbanicity + 
##     kidsdriv:age:clm_freq
## 
## 
##                         Step Df     Deviance Resid. Df  Resid. Dev      AIC
## 1                                                 1470 80306517739 26904.19
## 2        + homekids:clm_freq  1 6.753112e+08      1469 79631206509 26893.46
## 3        + homekids:car_type  5 1.011300e+09      1464 78619906978 26884.18
## 4                      - job  8 3.895629e+08      1472 79009469844 26875.64
## 5      + car_type:urbanicity  4 7.850784e+08      1468 78224391452 26868.58
## 6         + home_val:mstatus  1 4.600644e+08      1467 77764327040 26861.68
## 7             + age:car_type  5 8.447631e+08      1462 76919563928 26855.21
## 8                - education  4 1.752619e+08      1466 77094825815 26850.64
## 9             + kidsdriv:age  1 2.948190e+08      1465 76800006825 26846.86
## 10          + income:mstatus  1 2.600865e+08      1464 76539920341 26843.75
## 11            + age:home_val  1 2.833268e+08      1463 76256593508 26840.16
## 12        + homekids:car_age  1 2.494893e+08      1462 76007104200 26837.21
## 13                 + yoj:tif  1 2.338405e+08      1461 75773263655 26834.57
## 14       + kidsdriv:oldclaim  1 2.327973e+08      1460 75540466362 26831.93
## 15                - travtime  1 5.909411e+03      1461 75540472271 26829.93
## 16         + income:car_type  5 5.975858e+08      1456 74942886461 26827.95
## 17                 - parent1  1 6.248921e+06      1457 74949135382 26826.08
## 18                 - car_use  1 2.510220e+07      1458 74974237583 26824.58
## 19                - bluebook  1 2.596424e+07      1459 75000201824 26823.10
## 20                 - revoked  1 3.376960e+07      1460 75033971420 26821.78
## 21        + homekids:red_car  1 1.592836e+08      1459 74874687832 26820.58
## 22            + home_val:sex  1 1.455318e+08      1458 74729155997 26819.64
## 23       + homekids:home_val  1 1.492159e+08      1457 74579940090 26818.63
## 24        + kidsdriv:car_age  1 1.541971e+08      1456 74425743008 26817.51
## 25          + age:urbanicity  1 1.541162e+08      1455 74271626808 26816.38
## 26 + age:car_type:urbanicity  4 1.848856e+09      1451 72422770876 26786.37
## 27       + home_val:oldclaim  1 1.482064e+08      1450 72274564496 26785.28
## 28       + kidsdriv:clm_freq  1 1.109008e+08      1449 72163663686 26784.96
## 29            + age:clm_freq  1 1.759370e+08      1448 71987726674 26783.28
## 30   + kidsdriv:age:clm_freq  1 2.379337e+08      1447 71749793011 26780.29

# final model (based on the 3-degree of interactions)
finalModel <- lm(
        target_amt ~ kidsdriv + age + homekids + yoj + income + home_val + 
            mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq + 
            mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type + 
            car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age + 
            income:mstatus + age:home_val + homekids:car_age + yoj:tif + 
            kidsdriv:oldclaim + income:car_type + homekids:red_car + 
            home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity + 
            home_val:oldclaim + kidsdriv:clm_freq + age:clm_freq + age:car_type:urbanicity + 
            kidsdriv:age:clm_freq,
        data = trainSet %>% 
                        dplyr::filter(target_flag == "Yes") %>% 
                        dplyr::select(variables, quant_var)
)

summary(finalModel)
## 
## Call:
## lm(formula = target_amt ~ kidsdriv + age + homekids + yoj + income + 
##     home_val + mstatus + sex + tif + car_type + red_car + oldclaim + 
##     clm_freq + mvr_pts + car_age + urbanicity + homekids:clm_freq + 
##     homekids:car_type + car_type:urbanicity + home_val:mstatus + 
##     age:car_type + kidsdriv:age + income:mstatus + age:home_val + 
##     homekids:car_age + yoj:tif + kidsdriv:oldclaim + income:car_type + 
##     homekids:red_car + home_val:sex + homekids:home_val + kidsdriv:car_age + 
##     age:urbanicity + home_val:oldclaim + kidsdriv:clm_freq + 
##     age:clm_freq + age:car_type:urbanicity + kidsdriv:age:clm_freq, 
##     data = trainSet %>% dplyr::filter(target_flag == "Yes") %>% 
##         dplyr::select(variables, quant_var))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18504  -3101  -1091    912  75691 
## 
## Coefficients: (2 not defined because of singularities)
##                                                         Estimate Std. Error
## (Intercept)                                              6157.26     664.87
## kidsdriv                                                 -220.46     203.47
## age                                                       421.19     480.96
## homekids                                                  442.93     541.58
## yoj                                                      -104.28     201.34
## income                                                  -1780.86     525.57
## home_val                                                  -77.16     385.25
## mstatusz_No                                              1194.44     432.93
## sexz_F                                                  -1264.66     689.06
## tif                                                       -47.34     199.78
## car_typePanel Truck                                      1227.58     825.18
## car_typePickup                                           -371.90     661.09
## car_typeSports Car                                        431.36     813.56
## car_typeVan                                              -176.79     798.68
## car_typez_SUV                                            -285.40     710.90
## red_caryes                                               -368.62     548.91
## oldclaim                                                   21.94     214.98
## clm_freq                                                 -166.52     219.32
## mvr_pts                                                   259.13     166.20
## car_age                                                  -287.20     208.52
## urbanicityz_Highly Rural/ Rural                         -3096.97    1774.29
## homekids:clm_freq                                         735.41     228.39
## homekids:car_typePanel Truck                             3451.02     835.13
## homekids:car_typePickup                                   657.81     640.92
## homekids:car_typeSports Car                              -842.79     724.14
## homekids:car_typeVan                                     -678.26     816.93
## homekids:car_typez_SUV                                   -128.95     611.79
## car_typePanel Truck:urbanicityz_Highly Rural/ Rural           NA         NA
## car_typePickup:urbanicityz_Highly Rural/ Rural           5557.25    2881.32
## car_typeSports Car:urbanicityz_Highly Rural/ Rural       2089.92    2894.18
## car_typeVan:urbanicityz_Highly Rural/ Rural             20566.09    3644.58
## car_typez_SUV:urbanicityz_Highly Rural/ Rural            2609.11    2294.87
## home_val:mstatusz_No                                     1567.46     438.06
## age:car_typePanel Truck                                  1924.33     824.48
## age:car_typePickup                                        -38.02     680.70
## age:car_typeSports Car                                   -391.10     638.73
## age:car_typeVan                                         -2235.39     824.49
## age:car_typez_SUV                                        -375.29     575.19
## kidsdriv:age                                              266.52     216.77
## income:mstatusz_No                                       1066.00     388.41
## age:home_val                                              677.16     192.65
## homekids:car_age                                          695.12     234.39
## yoj:tif                                                  -418.88     180.08
## kidsdriv:oldclaim                                         476.50     172.21
## income:car_typePanel Truck                               1169.47     775.82
## income:car_typePickup                                    1929.04     673.58
## income:car_typeSports Car                                2066.00     689.29
## income:car_typeVan                                       1701.52     737.94
## income:car_typez_SUV                                     1564.40     584.45
## homekids:red_caryes                                     -1015.11     470.41
## home_val:sexz_F                                          -682.46     401.28
## homekids:home_val                                         468.87     210.25
## kidsdriv:car_age                                         -369.25     180.45
## age:urbanicityz_Highly Rural/ Rural                      -702.84    1300.16
## home_val:oldclaim                                         272.93     177.81
## kidsdriv:clm_freq                                        -336.48     197.71
## age:clm_freq                                              328.15     175.17
## age:car_typePanel Truck:urbanicityz_Highly Rural/ Rural       NA         NA
## age:car_typePickup:urbanicityz_Highly Rural/ Rural       5226.66    3060.49
## age:car_typeSports Car:urbanicityz_Highly Rural/ Rural    882.66    2245.88
## age:car_typeVan:urbanicityz_Highly Rural/ Rural         17335.75    2991.61
## age:car_typez_SUV:urbanicityz_Highly Rural/ Rural         428.86    1815.43
## kidsdriv:age:clm_freq                                     354.29     161.74
##                                                         t value Pr(>|t|)    
## (Intercept)                                               9.261  < 2e-16 ***
## kidsdriv                                                 -1.084 0.278761    
## age                                                       0.876 0.381325    
## homekids                                                  0.818 0.413582    
## yoj                                                      -0.518 0.604584    
## income                                                   -3.388 0.000722 ***
## home_val                                                 -0.200 0.841287    
## mstatusz_No                                               2.759 0.005871 ** 
## sexz_F                                                   -1.835 0.066659 .  
## tif                                                      -0.237 0.812718    
## car_typePanel Truck                                       1.488 0.137062    
## car_typePickup                                           -0.563 0.573819    
## car_typeSports Car                                        0.530 0.596051    
## car_typeVan                                              -0.221 0.824848    
## car_typez_SUV                                            -0.401 0.688143    
## red_caryes                                               -0.672 0.501981    
## oldclaim                                                  0.102 0.918722    
## clm_freq                                                 -0.759 0.447823    
## mvr_pts                                                   1.559 0.119184    
## car_age                                                  -1.377 0.168620    
## urbanicityz_Highly Rural/ Rural                          -1.745 0.081114 .  
## homekids:clm_freq                                         3.220 0.001311 ** 
## homekids:car_typePanel Truck                              4.132 3.80e-05 ***
## homekids:car_typePickup                                   1.026 0.304897    
## homekids:car_typeSports Car                              -1.164 0.244676    
## homekids:car_typeVan                                     -0.830 0.406534    
## homekids:car_typez_SUV                                   -0.211 0.833090    
## car_typePanel Truck:urbanicityz_Highly Rural/ Rural          NA       NA    
## car_typePickup:urbanicityz_Highly Rural/ Rural            1.929 0.053962 .  
## car_typeSports Car:urbanicityz_Highly Rural/ Rural        0.722 0.470343    
## car_typeVan:urbanicityz_Highly Rural/ Rural               5.643 2.01e-08 ***
## car_typez_SUV:urbanicityz_Highly Rural/ Rural             1.137 0.255754    
## home_val:mstatusz_No                                      3.578 0.000357 ***
## age:car_typePanel Truck                                   2.334 0.019732 *  
## age:car_typePickup                                       -0.056 0.955469    
## age:car_typeSports Car                                   -0.612 0.540428    
## age:car_typeVan                                          -2.711 0.006783 ** 
## age:car_typez_SUV                                        -0.652 0.514203    
## kidsdriv:age                                              1.229 0.219097    
## income:mstatusz_No                                        2.745 0.006135 ** 
## age:home_val                                              3.515 0.000453 ***
## homekids:car_age                                          2.966 0.003070 ** 
## yoj:tif                                                  -2.326 0.020155 *  
## kidsdriv:oldclaim                                         2.767 0.005731 ** 
## income:car_typePanel Truck                                1.507 0.131927    
## income:car_typePickup                                     2.864 0.004246 ** 
## income:car_typeSports Car                                 2.997 0.002770 ** 
## income:car_typeVan                                        2.306 0.021265 *  
## income:car_typez_SUV                                      2.677 0.007519 ** 
## homekids:red_caryes                                      -2.158 0.031097 *  
## home_val:sexz_F                                          -1.701 0.089212 .  
## homekids:home_val                                         2.230 0.025895 *  
## kidsdriv:car_age                                         -2.046 0.040914 *  
## age:urbanicityz_Highly Rural/ Rural                      -0.541 0.588878    
## home_val:oldclaim                                         1.535 0.125010    
## kidsdriv:clm_freq                                        -1.702 0.088983 .  
## age:clm_freq                                              1.873 0.061221 .  
## age:car_typePanel Truck:urbanicityz_Highly Rural/ Rural      NA       NA    
## age:car_typePickup:urbanicityz_Highly Rural/ Rural        1.708 0.087891 .  
## age:car_typeSports Car:urbanicityz_Highly Rural/ Rural    0.393 0.694368    
## age:car_typeVan:urbanicityz_Highly Rural/ Rural           5.795 8.38e-09 ***
## age:car_typez_SUV:urbanicityz_Highly Rural/ Rural         0.236 0.813289    
## kidsdriv:age:clm_freq                                     2.191 0.028644 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7042 on 1447 degrees of freedom
## Multiple R-squared:  0.1271, Adjusted R-squared:  0.09092 
## F-statistic: 3.512 on 60 and 1447 DF,  p-value: < 2.2e-16
broom::tidy(finalModel) %>% arrange(p.value) %>% kable()
term estimate std.error statistic p.value
(Intercept) 6157.26102 664.8728 9.2608107 0.0000000
age:car_typeVan:urbanicityz_Highly Rural/ Rural 17335.74875 2991.6108 5.7947874 0.0000000
car_typeVan:urbanicityz_Highly Rural/ Rural 20566.08610 3644.5828 5.6429191 0.0000000
homekids:car_typePanel Truck 3451.01882 835.1349 4.1322889 0.0000380
home_val:mstatusz_No 1567.46217 438.0585 3.5782032 0.0003574
age:home_val 677.15735 192.6520 3.5149249 0.0004534
income -1780.85560 525.5706 -3.3884234 0.0007218
homekids:clm_freq 735.40808 228.3909 3.2199540 0.0013106
income:car_typeSports Car 2065.99967 689.2879 2.9972959 0.0027703
homekids:car_age 695.12298 234.3885 2.9656877 0.0030696
income:car_typePickup 1929.04279 673.5836 2.8638507 0.0042456
kidsdriv:oldclaim 476.49886 172.2145 2.7668918 0.0057314
mstatusz_No 1194.43930 432.9270 2.7589854 0.0058712
income:mstatusz_No 1066.00114 388.4118 2.7445130 0.0061350
age:car_typeVan -2235.39108 824.4937 -2.7112287 0.0067827
income:car_typez_SUV 1564.40159 584.4488 2.6767129 0.0075188
age:car_typePanel Truck 1924.33174 824.4782 2.3339996 0.0197320
yoj:tif -418.87507 180.0818 -2.3260261 0.0201547
income:car_typeVan 1701.51669 737.9440 2.3057532 0.0212653
homekids:home_val 468.86746 210.2472 2.2300770 0.0258952
kidsdriv:age:clm_freq 354.29002 161.7361 2.1905435 0.0286439
homekids:red_caryes -1015.10786 470.4060 -2.1579398 0.0310966
kidsdriv:car_age -369.24951 180.4533 -2.0462326 0.0409139
car_typePickup:urbanicityz_Highly Rural/ Rural 5557.24992 2881.3229 1.9287147 0.0539617
age:clm_freq 328.15357 175.1687 1.8733572 0.0612205
sexz_F -1264.65757 689.0559 -1.8353484 0.0666592
urbanicityz_Highly Rural/ Rural -3096.97458 1774.2871 -1.7454755 0.0811144
age:car_typePickup:urbanicityz_Highly Rural/ Rural 5226.65734 3060.4936 1.7077825 0.0878912
kidsdriv:clm_freq -336.48308 197.7066 -1.7019313 0.0889831
home_val:sexz_F -682.46038 401.2797 -1.7007098 0.0892124
mvr_pts 259.13257 166.2033 1.5591304 0.1191842
home_val:oldclaim 272.92888 177.8076 1.5349677 0.1250104
income:car_typePanel Truck 1169.47047 775.8212 1.5073968 0.1319273
car_typePanel Truck 1227.58097 825.1824 1.4876480 0.1370616
car_age -287.20185 208.5190 -1.3773414 0.1686197
kidsdriv:age 266.51728 216.7747 1.2294669 0.2190967
homekids:car_typeSports Car -842.79430 724.1426 -1.1638513 0.2446761
car_typez_SUV:urbanicityz_Highly Rural/ Rural 2609.11289 2294.8677 1.1369339 0.2557542
kidsdriv -220.46485 203.4723 -1.0835129 0.2787613
homekids:car_typePickup 657.80690 640.9177 1.0263516 0.3048974
age 421.19239 480.9643 0.8757249 0.3813249
homekids:car_typeVan -678.25917 816.9334 -0.8302502 0.4065343
homekids 442.92558 541.5782 0.8178423 0.4135819
clm_freq -166.51752 219.3169 -0.7592554 0.4478235
car_typeSports Car:urbanicityz_Highly Rural/ Rural 2089.92132 2894.1821 0.7221112 0.4703428
red_caryes -368.62000 548.9139 -0.6715443 0.5019810
age:car_typez_SUV -375.29216 575.1881 -0.6524686 0.5142026
age:car_typeSports Car -391.09925 638.7256 -0.6123118 0.5404277
car_typePickup -371.90288 661.0853 -0.5625641 0.5738189
age:urbanicityz_Highly Rural/ Rural -702.84499 1300.1613 -0.5405829 0.5888783
car_typeSports Car 431.35601 813.5639 0.5302055 0.5960508
yoj -104.27931 201.3376 -0.5179326 0.6045845
car_typez_SUV -285.39708 710.9025 -0.4014574 0.6881427
age:car_typeSports Car:urbanicityz_Highly Rural/ Rural 882.65837 2245.8772 0.3930128 0.6943680
tif -47.34196 199.7847 -0.2369649 0.8127176
age:car_typez_SUV:urbanicityz_Highly Rural/ Rural 428.85546 1815.4302 0.2362280 0.8132892
car_typeVan -176.79136 798.6800 -0.2213544 0.8248477
homekids:car_typez_SUV -128.95172 611.7876 -0.2107786 0.8330897
home_val -77.16001 385.2538 -0.2002836 0.8412869
oldclaim 21.94131 214.9811 0.1020616 0.9187219
age:car_typePickup -38.01724 680.7042 -0.0558499 0.9554691

# comparison
testSet$target_amt_predicted <- predict(finalModel, testSet[, variables])

# root mean squared error (rmse)
Metrics::rmse(testSet$target_amt, testSet$target_amt_predicted)
## [1] 7343.597

MODEL APPLIED - MULTILINEAR REGRESSION (TARGET_AMT)

From above corrplot and density curve, we see that there’s actually nothing strongly associated with the target variable. We used stepwise regression and used up to three degree of interaction to come up with the best model possible (which will cause overfitting and make this model useless in the future), but still that only gave us 9% of adjusted R-squared. That means, over 90% of the variance in the actual amount cannot be predicted from this model. The regression model is less encouraging than the classification model.

dfEval$target_amt <- predict(finalModel, dfEval[, variables])

dfEval %>%
        dplyr::filter(target_flag == "Yes") %>%
        ggplot(aes(target_amt)) +
        geom_density() +
        geom_vline(xintercept = median(dfEval$target_amt[dfEval$target_flag == "Yes"]),
                   linetype = "dashed") +
        ggtitle("Distribution of Predicted Target Amount")


dfEval %>% head() %>% kable()
index target_flag target_amt kidsdriv age homekids yoj income parent1 home_val mstatus sex education job travtime car_use bluebook tif car_type red_car oldclaim clm_freq revoked mvr_pts car_age urbanicity OOF_pred_glm OOF_pred_nb OOF_pred_rf OOF_pred_top_glm OOF_pred_top_nb OOF_pred_top_gbm target_flag_prob
3 No 6783.665 -0.3337943 0.3499326 -0.6425179 0.1488987 0.6619537 No -0.9915932 z_No M Bachelors Manager -0.4549095 Private 0.0016269 -1.0689293 Van yes -0.6571645 -0.7111922 No 0.1062001 0.3150395 Highly Urban/ Urban 0.1220784 0.0013785 0.092 0.0915576 0.0098567 0.0886053 0.063
9 No 5900.963 1.7198080 -0.5884831 0.2530751 0.1488987 0.6011222 Yes -0.9915932 z_No M z_High School Manager -0.7729272 Private -0.3709252 0.1901913 Minivan no 0.4022653 0.1679432 No 0.1062001 -1.2457634 Highly Urban/ Urban 0.2336235 0.2439135 0.328 0.2164859 0.2120411 0.2684035 0.232
10 No 4250.961 -0.3337943 -0.1192753 1.1486681 0.3887064 0.3491062 Yes -0.9915932 z_No z_F z_High School z_Blue Collar -0.2004953 Commercial 1.1124265 1.1974878 z_SUV no -0.6571645 -0.7111922 No -0.8014825 0.3150395 z_Highly Rural/ Rural 0.0908932 0.0091209 0.012 0.0667108 0.0056237 0.0872490 0.053
18 No 2521.773 -0.3337943 -1.1749930 1.1486681 0.1488987 -0.4521312 Yes -0.9915932 z_No M z_High School Clerical 2.5980610 Private 1.5329762 0.1901913 Pickup no -0.6571645 -0.7111922 Yes -0.8014825 -0.7254958 z_Highly Rural/ Rural 0.2403932 0.0106405 0.112 0.1351010 0.0187269 0.1722218 0.109
21 No 5053.206 -0.3337943 1.6402542 -0.6425179 0.3887064 1.5865921 No -0.9915932 z_No M z_High School Manager 0.7535580 Private -0.8166164 -1.0689293 Minivan yes 1.2701376 1.0470786 No 1.0138828 -1.2457634 Highly Urban/ Urban 0.2392829 0.0536335 0.256 0.1920650 0.0462885 0.2155320 0.151
30 No 5818.214 -0.3337943 0.1153287 -0.6425179 0.8683218 -1.3420087 No 0.3708270 Yes M Bachelors Professional -1.6633769 Commercial 0.3696079 -1.0689293 Panel Truck no -0.1254949 0.1679432 No 0.1062001 0.6618846 Highly Urban/ Urban 0.1864743 0.0259940 0.226 0.1558029 0.0287481 0.1669544 0.117