data_621_hw

Introduction

This homework exercise is to build a logistic regression model and a multiple regression model that will estimate the likelihood of car accident, and if so, we try to predict the cost when such accidents happen. We have two response variables, i.e. TARGET_FLAG and TARGET_AMT. TARGET_FLAG is a binary field where 1 is equal to crash, and 0 is equal to no crash. TARGET_AMT, on the other hand, is the amount of time spent on repairs given there is a car crash accident. TARGET_FLAG is the response variable for our logistic regression model, whereas TARGET_AMT is the response variable for multiple regression.

# load packages
if(!require(pacman)){install.packages("pacman"); require(pacman)}
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.6.2
packages <- c('tidyverse', 'sqldf', 'broom', 'caret', 'kableExtra', 'janitor', 'Hmisc', 'MASS', 'corrplot', 'Metrics')
pacman::p_load(char = packages)

EDA

# read data
dfTrain <- read.csv("insurance_training_data.csv", header = TRUE)
dfEval <- read.csv("insurance-evaluation-data.csv", header = TRUE)

# check dim
dim(dfTrain); dim(dfEval)
## [1] 8161   26
## [1] 2141   26

# are they compatible?
if(!any(names(dfTrain) == names(dfEval))){print("the two data sets are different, please check for consistency")}

# clean names
dfTrain <- dfTrain %>% janitor::clean_names()
dfEval <- dfEval %>% janitor::clean_names()

# head
head(dfTrain) %>% kable()

index	target_flag	target_amt	age	homekids	yoj	income	parent1	home_val	mstatus	sex	education	job	travtime	car_use	bluebook	tif	car_type	red_car	oldclaim	clm_freq	revoked	mvr_pts	car_age	urbanicity
1	0	0	60	0	11	$67,349	No	$0	z_No	M	PhD	Professional	14	Private	$14,230	11	Minivan	yes	$4,461	2	No	3	18	Highly Urban/ Urban
2	0	0	43	0	11	$91,449	No	$257,252	z_No	M	z_High School	z_Blue Collar	22	Commercial	$14,940	1	Minivan	yes	$0	0	No	0	1	Highly Urban/ Urban
4	0	0	35	1	10	$16,039	No	$124,191	Yes	z_F	z_High School	Clerical	5	Private	$4,010	4	z_SUV	no	$38,690	2	No	3	10	Highly Urban/ Urban
5	0	0	51	0	14		No	$306,251	Yes	M	<High School	z_Blue Collar	32	Private	$15,440	7	Minivan	yes	$0	0	No	0	6	Highly Urban/ Urban
6	0	0	50	0	NA	$114,986	No	$243,925	Yes	z_F	PhD	Doctor	36	Private	$18,000	1	z_SUV	no	$19,217	2	Yes	3	17	Highly Urban/ Urban
7	1	2946	34	1	12	$125,301	Yes	$0	z_No	z_F	Bachelors	z_Blue Collar	46	Commercial	$17,430	1	Sports Car	no	$0	0	No	0	7	Highly Urban/ Urban


# target flag - proportion
with(dfTrain, prop.table(ftable(target_flag), 1))
## target_flag         0         1
##                                
##             0.7361843 0.2638157

# target amt - distribution
hist(dfTrain$target_amt)


# target amt - log transformed
boxplot(log(dfTrain$target_amt) ~ dfTrain$target_flag)
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z$group
## == : Outlier (-Inf) in boxplot 1 is not drawn

DATA PREP

# length of unique values of each variable from dfTrain
sapply(dfTrain, function(x) length(unique(x)))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##        8161           2        1949           5          61           6 
##         yoj      income     parent1    home_val     mstatus         sex 
##          22        6613           2        5107           2           2 
##   education         job    travtime     car_use    bluebook         tif 
##           5           9          97           2        2789          23 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           6           2        2857           6           2          13 
##     car_age  urbanicity 
##          31           2

# quickly glance at the class of each variable
fields <- data.frame(fields = names(dfTrain),
                     class = sapply(dfTrain, class) %>% unlist %>% as.vector)
fields
##         fields   class
## 1        index integer
## 2  target_flag integer
## 3   target_amt numeric
## 4     kidsdriv integer
## 5          age integer
## 6     homekids integer
## 7          yoj integer
## 8       income  factor
## 9      parent1  factor
## 10    home_val  factor
## 11     mstatus  factor
## 12         sex  factor
## 13   education  factor
## 14         job  factor
## 15    travtime integer
## 16     car_use  factor
## 17    bluebook  factor
## 18         tif integer
## 19    car_type  factor
## 20     red_car  factor
## 21    oldclaim  factor
## 22    clm_freq integer
## 23     revoked  factor
## 24     mvr_pts integer
## 25     car_age integer
## 26  urbanicity  factor

# mutate target_flag
# mutate these 4 factors - "income", "home_val", "bluebook", "oldclaim"
# these should be numeric variables, not factor
dfTrain <- dfTrain %>%
        dplyr::mutate(target_flag = dplyr::case_when(target_flag == 1 ~ "Y", 
                                              TRUE ~ "N") %>%
                              factor(., levels = c("Y", "N"), labels = c("Yes", "No")),
                      income = as.numeric(income),
                      home_val = as.numeric(home_val),
                      bluebook = as.numeric(bluebook),
                      oldclaim = as.numeric(oldclaim))

dfEval <- dfEval %>%
        dplyr::mutate(target_flag = dplyr::case_when(target_flag == 1 ~ "Y", 
                                              TRUE ~ "N") %>%
                              factor(., levels = c("Y", "N"), labels = c("Yes", "No")),
                      income = as.numeric(income),
                      home_val = as.numeric(home_val),
                      bluebook = as.numeric(bluebook),
                      oldclaim = as.numeric(oldclaim))

# rerun
fields <- data.frame(fields = names(dfTrain),
                     class = sapply(dfTrain, class) %>% unlist %>% as.vector)
fields
##         fields   class
## 1        index integer
## 2  target_flag  factor
## 3   target_amt numeric
## 4     kidsdriv integer
## 5          age integer
## 6     homekids integer
## 7          yoj integer
## 8       income numeric
## 9      parent1  factor
## 10    home_val numeric
## 11     mstatus  factor
## 12         sex  factor
## 13   education  factor
## 14         job  factor
## 15    travtime integer
## 16     car_use  factor
## 17    bluebook numeric
## 18         tif integer
## 19    car_type  factor
## 20     red_car  factor
## 21    oldclaim numeric
## 22    clm_freq integer
## 23     revoked  factor
## 24     mvr_pts integer
## 25     car_age integer
## 26  urbanicity  factor

# set variables
binary_var <- "target_flag"
quant_var <- "target_amt"
variables <- names(dfTrain)[names(dfTrain) %nin% c(binary_var, quant_var, "index")]

# check missing
colSums(is.na(dfTrain))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##           0           0           0           0           6           0 
##         yoj      income     parent1    home_val     mstatus         sex 
##         454           0           0           0           0           0 
##   education         job    travtime     car_use    bluebook         tif 
##           0           0           0           0           0           0 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           0           0           0           0           0           0 
##     car_age  urbanicity 
##         510           0
colSums(is.na(dfEval))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##           0           0        2141           0           1           0 
##         yoj      income     parent1    home_val     mstatus         sex 
##          94           0           0           0           0           0 
##   education         job    travtime     car_use    bluebook         tif 
##           0           0           0           0           0           0 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           0           0           0           0           0           0 
##     car_age  urbanicity 
##         129           0

# fix missing - train
preProcValuesTrain <- caret::preProcess(dfTrain[, variables], method = c("medianImpute", "center", "scale"))
dfTrain <- predict(preProcValuesTrain, dfTrain)
colSums(is.na(dfTrain))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##           0           0           0           0           0           0 
##         yoj      income     parent1    home_val     mstatus         sex 
##           0           0           0           0           0           0 
##   education         job    travtime     car_use    bluebook         tif 
##           0           0           0           0           0           0 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           0           0           0           0           0           0 
##     car_age  urbanicity 
##           0           0

# fix missing - eval
preProcValuesEval <- caret::preProcess(dfEval[, variables], method = c("medianImpute", "center", "scale"))
dfEval <- predict(preProcValuesEval, dfEval)
colSums(is.na(dfEval))
##       index target_flag  target_amt    kidsdriv         age    homekids 
##           0           0        2141           0           0           0 
##         yoj      income     parent1    home_val     mstatus         sex 
##           0           0           0           0           0           0 
##   education         job    travtime     car_use    bluebook         tif 
##           0           0           0           0           0           0 
##    car_type     red_car    oldclaim    clm_freq     revoked     mvr_pts 
##           0           0           0           0           0           0 
##     car_age  urbanicity 
##           0           0

# split data into train (70%), test (30%) sets
set.seed(1234)
index <- caret::createDataPartition(dfTrain$target_flag, p = 0.7, list = FALSE)
trainSet <- dfTrain[index, ]
testSet <- dfTrain[-index, ]

# trainSetControl - let's do 10-fold cross-validation 
trainSet.control <- caret::trainControl(method = "cv", number = 10, savePredictions = 'final', classProbs = TRUE)

MODEL - CLASSIFICATION PROBLEM (TARGET_FLAG)

We will first build three classification models and then two multi-linear regression models. Besides logistic regression, let’s try two different classification algorithms, i.e. naivey bayes and random forest. In addition, let’s try to improve our accuracy by 1) doing 10-fold cross-validation, and 2) ensembling our models, i.e. using glm (generalized linear model), nb (naive bayes) and rf (random forest) as our base layer and stacking on top by building a top layer using glm, nb, in addition gbm (gradient boosting machine).

# turn off warning
options(warn = -1)

# count time - start
start <- Sys.time()

# set seed
set.seed(1234)

# methods
baseTrainMethods <- c("glm", "nb", "rf")
topTrainMethods <- c("glm", "nb", "gbm")

# output
modelSummaryList <- vector(mode = "list")

# train base layer
for(baseLayer in baseTrainMethods){
        
        # set parameters
        ml = baseLayer
        model = paste0("model_base_", ml)
        OOF_prediction = paste0("OOF_pred_", ml)
        prediction = paste0("pred_", ml)
        result = paste0("result_", ml)
        
        # model
        assign(bquote(.(model)), caret::train(trainSet[, variables], trainSet[, binary_var], 
                                              method = ml, 
                                              trControl = trainSet.control,
                                              trace = FALSE))
        
        # Out-Of-Fold probability predictions - trainSet    
        if(ml == "glm"){trainSet$OOF_pred_glm = eval(sym(model))$pred$Y[order(eval(sym(model))$pred$rowIndex)]}        
        if(ml == "nb"){trainSet$OOF_pred_nb = eval(sym(model))$pred$Y[order(eval(sym(model))$pred$rowIndex)]}        
        if(ml == "rf"){trainSet$OOF_pred_rf = eval(sym(model))$pred$Y[order(eval(sym(model))$pred$rowIndex)]}
        
        # Out-Of-Fold probability predictions - testSet 
        assign(bquote(.(OOF_prediction)), predict(eval(sym(model)), testSet[, variables], type = "prob")$Y)
        if(ml == "glm"){testSet$OOF_pred_glm = eval(sym(OOF_prediction))}        
        if(ml == "nb"){testSet$OOF_pred_nb = eval(sym(OOF_prediction))}        
        if(ml == "rf"){testSet$OOF_pred_rf = eval(sym(OOF_prediction))}
        
        # Y/N predictions for Confusion Matrix - testSet    
        assign(bquote(.(prediction)), predict(eval(sym(model)), testSet[, variables]))
        if(ml == "glm"){testSet$pred_glm = eval(sym(prediction))}        
        if(ml == "nb"){testSet$pred_nb = eval(sym(prediction))}        
        if(ml == "rf"){testSet$pred_rf = eval(sym(prediction))}
        
        # output
        assign(bquote(.(result)), broom::tidy(caret::confusionMatrix(testSet[, prediction], testSet[, binary_var])) %>%
                       dplyr::mutate(trainMethod = ml) %>%
                       dplyr::select(trainMethod, everything()))
        
        # store output into a list
        tempModelList <- list(eval(sym(result)))
        modelSummaryList <<- c(modelSummaryList, tempModelList)        
        
}

# train top layer
for(topLayer in topTrainMethods){
        
        # set parameters
        ml = topLayer
        model = paste0("model_top_", ml)        
        OOF_predictors_top = c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")
        OOF_prediction_top = paste0("OOF_pred_top_", ml)
        prediction_top = paste0("pred_top_", ml)
        result = paste0("result_top_", ml)
        
        # model
        assign(bquote(.(model)), caret::train(trainSet[, OOF_predictors_top], trainSet[, binary_var], 
                                              method = ml, 
                                              trControl = trainSet.control))
        
        # Out-Of-Fold probability predictions - testSet 
        assign(bquote(.(OOF_prediction_top)), predict(eval(sym(model)), testSet[, OOF_predictors_top], type = "prob")$Y)
        if(ml == "glm"){testSet$OOF_pred_top_glm = eval(sym(OOF_prediction_top))}
        if(ml == "nb"){testSet$OOF_pred_top_nb = eval(sym(OOF_prediction_top))}
        if(ml == "gbm"){testSet$OOF_pred_top_gbm = eval(sym(OOF_prediction_top))}
        
        # Y/N predictions for Confusion Matrix - testSet    
        assign(bquote(.(prediction_top)), predict(eval(sym(model)), testSet[, OOF_predictors_top]))
        if(ml == "glm"){testSet$pred_top_glm = eval(sym(prediction_top))}
        if(ml == "nb"){testSet$pred_top_nb = eval(sym(prediction_top))}
        if(ml == "gbm"){testSet$pred_top_gbm = eval(sym(prediction_top))}
        
        # output
        assign(bquote(.(result)), broom::tidy(caret::confusionMatrix(testSet[, prediction_top], testSet[, binary_var])) %>%
                       dplyr::mutate(trainMethod = paste0(ml, " - top layer")) %>%
                       dplyr::select(trainMethod, everything()))
        
        # store output into a list
        tempModelList <- list(eval(sym(result)))
        modelSummaryList <<- c(modelSummaryList, tempModelList)
        
}

# put together - averaging the OOF prediction probability
testSet <- testSet %>%
        dplyr::mutate(pred_final_prob_avg = (OOF_pred_top_glm + OOF_pred_top_nb + OOF_pred_top_gbm) / length(topTrainMethods),
                      pred_final = ifelse(pred_final_prob_avg > 0.5, "Y", "N") %>%
                              factor(., levels = c("Y", "N"), labels = c("Yes", "No")))

finalResult <- broom::tidy(caret::confusionMatrix(testSet$pred_final, testSet$target_flag)) %>%
        dplyr::mutate(trainMethod = "final - averaging") %>%
        dplyr::select(trainMethod, everything())

# store finalResult output into a list
tempModelList <- list(finalResult)
modelSummaryList <<- c(modelSummaryList, tempModelList)

# count time - finish
finish <- Sys.time()

MODEL SELECTION - CLASSIFICATION PROBLEM (TARGET_FLAG)

First, we build a base layer (using glm, nb and rf) to predict the “target_flag” using all the variables (minus the index and target_amt) from the train set. Second, we build a top layer (using glm, nb and gbm) to predict the “target_flag” based on the outcomes (OOF or Out-Of-Fold prediction) of our base layer. In other words, we make prediction (of the target variable) based on the predictions of our base models. Finally, we simply average the outcome probabilities of our top layer to get our final probability and decision using 0.5 as cut-off.

Take a look of our results, i.e. modelSummaryDf, we decide to apply the final model (averaging) for our evaluation set because of the highest f1 score displayed for the final result from the test set. Looking at the results, “nb” from the base layer did the worst in terms of sensitivity, but did the best among all models in precision. All three models from the top layer can significantly enhance accuracy, sensitivy and f1 score. The final result is far from perfect but better than relying on just a single classification algorithm. Lastly, when we compare the distribution between train and evaluation set, we see that that the proportion is very similar, i.e. Yes from evaluation (21.4%) vs Yes from train set (26.4%). Although it improved accuracy, the downside of ensembling is that the result is hard to interpret and communicate with stakeholder (such as what contributed more significantly to causing an accident).

# let's look at the time it finished running the models
print(paste0("The models ensembling exercise took roughly ", round(finish - start, 1), " mins to run"))
## [1] "The models ensembling exercise took roughly 5.8 mins to run"

# let's look at the result
modelSummaryDf <- modelSummaryList %>% 
        dplyr::bind_rows() %>% 
        dplyr::select(trainMethod, term, estimate) %>%
        tidyr::spread(term, estimate) %>%
        arrange(desc(f1))

modelSummaryDf %>% kable()

trainMethod	accuracy	balanced_accuracy	detection_prevalence	detection_rate	f1	kappa	neg_pred_value	pos_pred_value	precision	prevalence	recall	sensitivity	specificity
final - averaging	0.7870862	0.6921848	0.2084185	0.1295464	0.5489177	0.4120569	0.8306660	0.6215686	0.6215686	0.2635881	0.4914729	0.4914729	0.8928968
gbm - top layer	0.7854516	0.6915727	0.2108705	0.1299550	0.5478036	0.4094336	0.8306577	0.6162791	0.6162791	0.2635881	0.4930233	0.4930233	0.8901221
nb - top layer	0.7825909	0.6916213	0.2170004	0.1315897	0.5476190	0.4062960	0.8314196	0.6064030	0.6064030	0.2635881	0.4992248	0.4992248	0.8840178
glm - top layer	0.7850429	0.6708885	0.1777687	0.1131998	0.5129630	0.3816700	0.8170974	0.6367816	0.6367816	0.2635881	0.4294574	0.4294574	0.9123196
glm	0.7793216	0.6620267	0.1753167	0.1091132	0.4972067	0.3630885	0.8126858	0.6223776	0.6223776	0.2635881	0.4139535	0.4139535	0.9100999
rf	0.7834083	0.6593264	0.1622395	0.1046179	0.4913628	0.3635247	0.8102439	0.6448363	0.6448363	0.2635881	0.3968992	0.3968992	0.9217536
nb	0.7621577	0.5697416	0.0600736	0.0429097	0.2651515	0.1854502	0.7652174	0.7142857	0.7142857	0.2635881	0.1627907	0.1627907	0.9766926

options(warn = -1)

# let's fit the dfEval using the final averaging method

# base layer
dfEval <- dfEval %>%
        dplyr::mutate(OOF_pred_glm = predict(model_base_glm, dfEval[, variables], type = "prob")$Y,
                      OOF_pred_nb = predict(model_base_nb, dfEval[, variables], type = "prob")$Y,
                      OOF_pred_rf = predict(model_base_rf, dfEval[, variables], type = "prob")$Y)

# top layer
dfEval <- dfEval %>%
        dplyr::mutate(OOF_pred_top_glm = predict(model_top_glm, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y,
                      OOF_pred_top_nb = predict(model_top_nb, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y,
                      OOF_pred_top_gbm = predict(model_top_gbm, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y)

# final predicion
dfEval <- dfEval %>%
        dplyr::mutate(target_flag_prob = round((OOF_pred_top_glm + OOF_pred_top_nb + OOF_pred_top_gbm) / 3, 3),
                      target_flag = ifelse(target_flag_prob > 0.5, "Y", "N") %>%
                              factor(., levels = c("Y", "N"), labels = c("Yes", "No")))

# see prediction
evalTable <- ftable(dfEval$target_flag)
evalTable
##   Yes   No
##           
##   459 1682

# comparison between evaluation and train set
with(dfEval, prop.table(evalTable, 1))  # evaluation
##        Yes        No
##                     
##  0.2143858 0.7856142
with(dfTrain, prop.table(ftable(dfTrain$target_flag)))  # train
##        Yes        No
##                     
##  0.2638157 0.7361843

# dfEval
dfEval %>% dplyr::select(index, target_flag, target_flag_prob, everything()) %>% head() %>% kable()

index	target_flag	target_flag_prob	target_amt	kidsdriv	age	homekids	yoj	income	parent1	home_val	mstatus	sex	education	job	travtime	car_use	bluebook	tif	car_type	red_car	oldclaim	clm_freq	revoked	mvr_pts	car_age	urbanicity	OOF_pred_glm	OOF_pred_nb	OOF_pred_rf	OOF_pred_top_glm	OOF_pred_top_nb	OOF_pred_top_gbm
3	No	0.063	NA	-0.3337943	0.3499326	-0.6425179	0.1488987	0.6619537	No	-0.9915932	z_No	M	Bachelors	Manager	-0.4549095	Private	0.0016269	-1.0689293	Van	yes	-0.6571645	-0.7111922	No	0.1062001	0.3150395	Highly Urban/ Urban	0.1220784	0.0013785	0.092	0.0915576	0.0098567	0.0886053
9	No	0.232	NA	1.7198080	-0.5884831	0.2530751	0.1488987	0.6011222	Yes	-0.9915932	z_No	M	z_High School	Manager	-0.7729272	Private	-0.3709252	0.1901913	Minivan	no	0.4022653	0.1679432	No	0.1062001	-1.2457634	Highly Urban/ Urban	0.2336235	0.2439135	0.328	0.2164859	0.2120411	0.2684035
10	No	0.053	NA	-0.3337943	-0.1192753	1.1486681	0.3887064	0.3491062	Yes	-0.9915932	z_No	z_F	z_High School	z_Blue Collar	-0.2004953	Commercial	1.1124265	1.1974878	z_SUV	no	-0.6571645	-0.7111922	No	-0.8014825	0.3150395	z_Highly Rural/ Rural	0.0908932	0.0091209	0.012	0.0667108	0.0056237	0.0872490
18	No	0.109	NA	-0.3337943	-1.1749930	1.1486681	0.1488987	-0.4521312	Yes	-0.9915932	z_No	M	z_High School	Clerical	2.5980610	Private	1.5329762	0.1901913	Pickup	no	-0.6571645	-0.7111922	Yes	-0.8014825	-0.7254958	z_Highly Rural/ Rural	0.2403932	0.0106405	0.112	0.1351010	0.0187269	0.1722218
21	No	0.151	NA	-0.3337943	1.6402542	-0.6425179	0.3887064	1.5865921	No	-0.9915932	z_No	M	z_High School	Manager	0.7535580	Private	-0.8166164	-1.0689293	Minivan	yes	1.2701376	1.0470786	No	1.0138828	-1.2457634	Highly Urban/ Urban	0.2392829	0.0536335	0.256	0.1920650	0.0462885	0.2155320
30	No	0.117	NA	-0.3337943	0.1153287	-0.6425179	0.8683218	-1.3420087	No	0.3708270	Yes	M	Bachelors	Professional	-1.6633769	Commercial	0.3696079	-1.0689293	Panel Truck	no	-0.1254949	0.1679432	No	0.1062001	0.6618846	Highly Urban/ Urban	0.1864743	0.0259940	0.226	0.1558029	0.0287481	0.1669544

MODEL - MULTILINEAR REGRESSION (TARGET_AMT)

Let’s turn our focus to build two multilinear regression models using the same data. Interestingly, the variables do not seem to correlate with the target variable in this exercise. Nothing seems to indicate anything relevant to “target_amt”.

# corrplot - quantitative variables
trainSet %>%
        dplyr::filter(target_flag == "Yes") %>%
        dplyr::select(fields %>% dplyr::filter(class != 'factor') %>% .$fields) %>% 
        dplyr::select(-index) %>%
        cor %>%
        corrplot(method = "number", type = "upper", order = "hclust")


# density - categorical variables
dfGather <- trainSet %>%
        dplyr::filter(target_flag == "Yes") %>%
        dplyr::select(fields %>% dplyr::filter(class == 'factor') %>% .$fields, target_amt) %>% 
        tidyr::gather(key, value, -target_amt) 

dfGather %>%
        ggplot(aes(target_amt, color = value)) +
        geom_density() +
        geom_vline(data = aggregate(target_amt ~ key + value, dfGather, median), 
                   aes(xintercept = target_amt,
                       color = value),
                   linetype = "dashed") +
        facet_wrap(~ key, nrow = 5, scales = "free") + 
        theme(legend.position = "none") + 
        ggtitle("Distribution of Target Amount by Various Categorical Variables")


# build full model
fullModel <- lm(target_amt ~., data = trainSet %>% 
                        dplyr::filter(target_flag == "Yes") %>% 
                        dplyr::select(variables, quant_var))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(variables)` instead of `variables` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(quant_var)` instead of `quant_var` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
summary(fullModel)
## 
## Call:
## lm(formula = target_amt ~ ., data = trainSet %>% dplyr::filter(target_flag == 
##     "Yes") %>% dplyr::select(variables, quant_var))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7646  -2980  -1428    479  78846 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6420.76    1622.17   3.958 7.91e-05 ***
## kidsdriv                          -41.87     186.91  -0.224   0.8228    
## age                                63.94     209.70   0.305   0.7605    
## homekids                          241.18     275.06   0.877   0.3807    
## yoj                                50.77     237.11   0.214   0.8305    
## income                             26.37     243.05   0.109   0.9136    
## parent1Yes                       -135.57     677.69  -0.200   0.8415    
## home_val                          181.00     232.56   0.778   0.4365    
## mstatusz_No                       855.11     534.28   1.600   0.1097    
## sexz_F                          -1352.67     708.49  -1.909   0.0564 .  
## educationBachelors               -176.93     740.56  -0.239   0.8112    
## educationMasters                 1170.32    1245.90   0.939   0.3477    
## educationPhD                     1820.80    1428.87   1.274   0.2028    
## educationz_High School           -390.17     590.99  -0.660   0.5092    
## jobClerical                      -465.61    1380.79  -0.337   0.7360    
## jobDoctor                       -3129.01    2092.81  -1.495   0.1351    
## jobHome Maker                    -851.86    1411.45  -0.604   0.5462    
## jobLawyer                       -1153.76    1182.09  -0.976   0.3292    
## jobManager                      -1606.24    1235.83  -1.300   0.1939    
## jobProfessional                   286.88    1297.24   0.221   0.8250    
## jobStudent                       -307.51    1454.56  -0.211   0.8326    
## jobz_Blue Collar                   29.85    1320.61   0.023   0.9820    
## travtime                           31.93     202.57   0.158   0.8748    
## car_usePrivate                   -396.40     595.54  -0.666   0.5058    
## bluebook                         -130.25     194.72  -0.669   0.5037    
## tif                               -49.40     206.61  -0.239   0.8111    
## car_typePanel Truck               806.16     962.96   0.837   0.4026    
## car_typePickup                    -80.94     695.63  -0.116   0.9074    
## car_typeSports Car                740.04     814.52   0.909   0.3637    
## car_typeVan                       370.21     838.62   0.441   0.6589    
## car_typez_SUV                     327.53     715.49   0.458   0.6472    
## red_caryes                       -725.27     571.83  -1.268   0.2049    
## oldclaim                          -56.25     220.52  -0.255   0.7987    
## clm_freq                         -118.32     228.37  -0.518   0.6045    
## revokedYes                       -756.60     485.77  -1.558   0.1196    
## mvr_pts                           103.60     171.38   0.604   0.5456    
## car_age                          -527.25     289.71  -1.820   0.0690 .  
## urbanicityz_Highly Rural/ Rural   -52.34     877.12  -0.060   0.9524    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7391 on 1470 degrees of freedom
## Multiple R-squared:  0.02302,    Adjusted R-squared:  -0.001574 
## F-statistic: 0.936 on 37 and 1470 DF,  p-value: 0.5804

# stepwise regression - direction default to both
step <- MASS::stepAIC(fullModel, trace = FALSE)
step$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 + 
##     home_val + mstatus + sex + education + job + travtime + car_use + 
##     bluebook + tif + car_type + red_car + oldclaim + clm_freq + 
##     revoked + mvr_pts + car_age + urbanicity
## 
## Final Model:
## target_amt ~ mstatus + sex + car_use + revoked
## 
## 
##            Step Df  Deviance Resid. Df  Resid. Dev      AIC
## 1                                 1470 80306517739 26904.19
## 2         - job  8 350075913      1478 80656593652 26894.75
## 3    - car_type  5 156824305      1483 80813417957 26887.68
## 4   - education  4 209630349      1487 81023048307 26883.59
## 5    - kidsdriv  1    345385      1488 81023393692 26881.59
## 6     - parent1  1   1850124      1489 81025243816 26879.63
## 7  - urbanicity  1   2060081      1490 81027303897 26877.66
## 8    - travtime  1   2040288      1491 81029344185 26875.70
## 9         - tif  1   2786998      1492 81032131183 26873.75
## 10   - oldclaim  1   3433572      1493 81035564754 26871.82
## 11     - income  1   4926100      1494 81040490854 26869.91
## 12        - age  1  17593993      1495 81058084847 26868.24
## 13   - homekids  1  20065899      1496 81078150746 26866.61
## 14    - mvr_pts  1  24168604      1497 81102319350 26865.06
## 15   - clm_freq  1  17348896      1498 81119668246 26863.38
## 16   - bluebook  1  27790046      1499 81147458292 26861.90
## 17        - yoj  1  36399906      1500 81183858198 26860.58
## 18    - car_age  1  73266728      1501 81257124926 26859.94
## 19   - home_val  1  54967527      1502 81312092454 26858.96
## 20    - red_car  1  85939499      1503 81398031953 26858.55

# 2 degree of interactions - full model
step2 <- MASS::stepAIC(fullModel, ~ .^2, trace = FALSE)
step2$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 + 
##     home_val + mstatus + sex + education + job + travtime + car_use + 
##     bluebook + tif + car_type + red_car + oldclaim + clm_freq + 
##     revoked + mvr_pts + car_age + urbanicity
## 
## Final Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + home_val + 
##     mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq + 
##     mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type + 
##     car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age + 
##     income:mstatus + age:home_val + homekids:car_age + yoj:tif + 
##     kidsdriv:oldclaim + income:car_type + homekids:red_car + 
##     home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity + 
##     home_val:oldclaim + age:red_car + age:clm_freq + kidsdriv:clm_freq
## 
## 
##                     Step Df     Deviance Resid. Df  Resid. Dev      AIC
## 1                                             1470 80306517739 26904.19
## 2    + homekids:clm_freq  1 6.753112e+08      1469 79631206509 26893.46
## 3    + homekids:car_type  5 1.011300e+09      1464 78619906978 26884.18
## 4                  - job  8 3.895629e+08      1472 79009469844 26875.64
## 5  + car_type:urbanicity  4 7.850784e+08      1468 78224391452 26868.58
## 6     + home_val:mstatus  1 4.600644e+08      1467 77764327040 26861.68
## 7         + age:car_type  5 8.447631e+08      1462 76919563928 26855.21
## 8            - education  4 1.752619e+08      1466 77094825815 26850.64
## 9         + kidsdriv:age  1 2.948190e+08      1465 76800006825 26846.86
## 10      + income:mstatus  1 2.600865e+08      1464 76539920341 26843.75
## 11        + age:home_val  1 2.833268e+08      1463 76256593508 26840.16
## 12    + homekids:car_age  1 2.494893e+08      1462 76007104200 26837.21
## 13             + yoj:tif  1 2.338405e+08      1461 75773263655 26834.57
## 14   + kidsdriv:oldclaim  1 2.327973e+08      1460 75540466362 26831.93
## 15            - travtime  1 5.909411e+03      1461 75540472271 26829.93
## 16     + income:car_type  5 5.975858e+08      1456 74942886461 26827.95
## 17             - parent1  1 6.248921e+06      1457 74949135382 26826.08
## 18             - car_use  1 2.510220e+07      1458 74974237583 26824.58
## 19            - bluebook  1 2.596424e+07      1459 75000201824 26823.10
## 20             - revoked  1 3.376960e+07      1460 75033971420 26821.78
## 21    + homekids:red_car  1 1.592836e+08      1459 74874687832 26820.58
## 22        + home_val:sex  1 1.455318e+08      1458 74729155997 26819.64
## 23   + homekids:home_val  1 1.492159e+08      1457 74579940090 26818.63
## 24    + kidsdriv:car_age  1 1.541971e+08      1456 74425743008 26817.51
## 25      + age:urbanicity  1 1.541162e+08      1455 74271626808 26816.38
## 26   + home_val:oldclaim  1 1.458966e+08      1454 74125730252 26815.42
## 27         + age:red_car  1 9.948659e+07      1453 74026243664 26815.39
## 28        + age:clm_freq  1 1.051529e+08      1452 73921090735 26815.25
## 29   + kidsdriv:clm_freq  1 1.467009e+08      1451 73774389817 26814.25

# 3 degree of interactions - full model
step3 <- MASS::stepAIC(fullModel, ~ .^3, trace = FALSE)
step3$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 + 
##     home_val + mstatus + sex + education + job + travtime + car_use + 
##     bluebook + tif + car_type + red_car + oldclaim + clm_freq + 
##     revoked + mvr_pts + car_age + urbanicity
## 
## Final Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + home_val + 
##     mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq + 
##     mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type + 
##     car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age + 
##     income:mstatus + age:home_val + homekids:car_age + yoj:tif + 
##     kidsdriv:oldclaim + income:car_type + homekids:red_car + 
##     home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity + 
##     home_val:oldclaim + kidsdriv:clm_freq + age:clm_freq + age:car_type:urbanicity + 
##     kidsdriv:age:clm_freq
## 
## 
##                         Step Df     Deviance Resid. Df  Resid. Dev      AIC
## 1                                                 1470 80306517739 26904.19
## 2        + homekids:clm_freq  1 6.753112e+08      1469 79631206509 26893.46
## 3        + homekids:car_type  5 1.011300e+09      1464 78619906978 26884.18
## 4                      - job  8 3.895629e+08      1472 79009469844 26875.64
## 5      + car_type:urbanicity  4 7.850784e+08      1468 78224391452 26868.58
## 6         + home_val:mstatus  1 4.600644e+08      1467 77764327040 26861.68
## 7             + age:car_type  5 8.447631e+08      1462 76919563928 26855.21
## 8                - education  4 1.752619e+08      1466 77094825815 26850.64
## 9             + kidsdriv:age  1 2.948190e+08      1465 76800006825 26846.86
## 10          + income:mstatus  1 2.600865e+08      1464 76539920341 26843.75
## 11            + age:home_val  1 2.833268e+08      1463 76256593508 26840.16
## 12        + homekids:car_age  1 2.494893e+08      1462 76007104200 26837.21
## 13                 + yoj:tif  1 2.338405e+08      1461 75773263655 26834.57
## 14       + kidsdriv:oldclaim  1 2.327973e+08      1460 75540466362 26831.93
## 15                - travtime  1 5.909411e+03      1461 75540472271 26829.93
## 16         + income:car_type  5 5.975858e+08      1456 74942886461 26827.95
## 17                 - parent1  1 6.248921e+06      1457 74949135382 26826.08
## 18                 - car_use  1 2.510220e+07      1458 74974237583 26824.58
## 19                - bluebook  1 2.596424e+07      1459 75000201824 26823.10
## 20                 - revoked  1 3.376960e+07      1460 75033971420 26821.78
## 21        + homekids:red_car  1 1.592836e+08      1459 74874687832 26820.58
## 22            + home_val:sex  1 1.455318e+08      1458 74729155997 26819.64
## 23       + homekids:home_val  1 1.492159e+08      1457 74579940090 26818.63
## 24        + kidsdriv:car_age  1 1.541971e+08      1456 74425743008 26817.51
## 25          + age:urbanicity  1 1.541162e+08      1455 74271626808 26816.38
## 26 + age:car_type:urbanicity  4 1.848856e+09      1451 72422770876 26786.37
## 27       + home_val:oldclaim  1 1.482064e+08      1450 72274564496 26785.28
## 28       + kidsdriv:clm_freq  1 1.109008e+08      1449 72163663686 26784.96
## 29            + age:clm_freq  1 1.759370e+08      1448 71987726674 26783.28
## 30   + kidsdriv:age:clm_freq  1 2.379337e+08      1447 71749793011 26780.29

# final model (based on the 3-degree of interactions)
finalModel <- lm(
        target_amt ~ kidsdriv + age + homekids + yoj + income + home_val + 
            mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq + 
            mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type + 
            car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age + 
            income:mstatus + age:home_val + homekids:car_age + yoj:tif + 
            kidsdriv:oldclaim + income:car_type + homekids:red_car + 
            home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity + 
            home_val:oldclaim + kidsdriv:clm_freq + age:clm_freq + age:car_type:urbanicity + 
            kidsdriv:age:clm_freq,
        data = trainSet %>% 
                        dplyr::filter(target_flag == "Yes") %>% 
                        dplyr::select(variables, quant_var)
)

summary(finalModel)
## 
## Call:
## lm(formula = target_amt ~ kidsdriv + age + homekids + yoj + income + 
##     home_val + mstatus + sex + tif + car_type + red_car + oldclaim + 
##     clm_freq + mvr_pts + car_age + urbanicity + homekids:clm_freq + 
##     homekids:car_type + car_type:urbanicity + home_val:mstatus + 
##     age:car_type + kidsdriv:age + income:mstatus + age:home_val + 
##     homekids:car_age + yoj:tif + kidsdriv:oldclaim + income:car_type + 
##     homekids:red_car + home_val:sex + homekids:home_val + kidsdriv:car_age + 
##     age:urbanicity + home_val:oldclaim + kidsdriv:clm_freq + 
##     age:clm_freq + age:car_type:urbanicity + kidsdriv:age:clm_freq, 
##     data = trainSet %>% dplyr::filter(target_flag == "Yes") %>% 
##         dplyr::select(variables, quant_var))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18504  -3101  -1091    912  75691 
## 
## Coefficients: (2 not defined because of singularities)
##                                                         Estimate Std. Error
## (Intercept)                                              6157.26     664.87
## kidsdriv                                                 -220.46     203.47
## age                                                       421.19     480.96
## homekids                                                  442.93     541.58
## yoj                                                      -104.28     201.34
## income                                                  -1780.86     525.57
## home_val                                                  -77.16     385.25
## mstatusz_No                                              1194.44     432.93
## sexz_F                                                  -1264.66     689.06
## tif                                                       -47.34     199.78
## car_typePanel Truck                                      1227.58     825.18
## car_typePickup                                           -371.90     661.09
## car_typeSports Car                                        431.36     813.56
## car_typeVan                                              -176.79     798.68
## car_typez_SUV                                            -285.40     710.90
## red_caryes                                               -368.62     548.91
## oldclaim                                                   21.94     214.98
## clm_freq                                                 -166.52     219.32
## mvr_pts                                                   259.13     166.20
## car_age                                                  -287.20     208.52
## urbanicityz_Highly Rural/ Rural                         -3096.97    1774.29
## homekids:clm_freq                                         735.41     228.39
## homekids:car_typePanel Truck                             3451.02     835.13
## homekids:car_typePickup                                   657.81     640.92
## homekids:car_typeSports Car                              -842.79     724.14
## homekids:car_typeVan                                     -678.26     816.93
## homekids:car_typez_SUV                                   -128.95     611.79
## car_typePanel Truck:urbanicityz_Highly Rural/ Rural           NA         NA
## car_typePickup:urbanicityz_Highly Rural/ Rural           5557.25    2881.32
## car_typeSports Car:urbanicityz_Highly Rural/ Rural       2089.92    2894.18
## car_typeVan:urbanicityz_Highly Rural/ Rural             20566.09    3644.58
## car_typez_SUV:urbanicityz_Highly Rural/ Rural            2609.11    2294.87
## home_val:mstatusz_No                                     1567.46     438.06
## age:car_typePanel Truck                                  1924.33     824.48
## age:car_typePickup                                        -38.02     680.70
## age:car_typeSports Car                                   -391.10     638.73
## age:car_typeVan                                         -2235.39     824.49
## age:car_typez_SUV                                        -375.29     575.19
## kidsdriv:age                                              266.52     216.77
## income:mstatusz_No                                       1066.00     388.41
## age:home_val                                              677.16     192.65
## homekids:car_age                                          695.12     234.39
## yoj:tif                                                  -418.88     180.08
## kidsdriv:oldclaim                                         476.50     172.21
## income:car_typePanel Truck                               1169.47     775.82
## income:car_typePickup                                    1929.04     673.58
## income:car_typeSports Car                                2066.00     689.29
## income:car_typeVan                                       1701.52     737.94
## income:car_typez_SUV                                     1564.40     584.45
## homekids:red_caryes                                     -1015.11     470.41
## home_val:sexz_F                                          -682.46     401.28
## homekids:home_val                                         468.87     210.25
## kidsdriv:car_age                                         -369.25     180.45
## age:urbanicityz_Highly Rural/ Rural                      -702.84    1300.16
## home_val:oldclaim                                         272.93     177.81
## kidsdriv:clm_freq                                        -336.48     197.71
## age:clm_freq                                              328.15     175.17
## age:car_typePanel Truck:urbanicityz_Highly Rural/ Rural       NA         NA
## age:car_typePickup:urbanicityz_Highly Rural/ Rural       5226.66    3060.49
## age:car_typeSports Car:urbanicityz_Highly Rural/ Rural    882.66    2245.88
## age:car_typeVan:urbanicityz_Highly Rural/ Rural         17335.75    2991.61
## age:car_typez_SUV:urbanicityz_Highly Rural/ Rural         428.86    1815.43
## kidsdriv:age:clm_freq                                     354.29     161.74
##                                                         t value Pr(>|t|)    
## (Intercept)                                               9.261  < 2e-16 ***
## kidsdriv                                                 -1.084 0.278761    
## age                                                       0.876 0.381325    
## homekids                                                  0.818 0.413582    
## yoj                                                      -0.518 0.604584    
## income                                                   -3.388 0.000722 ***
## home_val                                                 -0.200 0.841287    
## mstatusz_No                                               2.759 0.005871 ** 
## sexz_F                                                   -1.835 0.066659 .  
## tif                                                      -0.237 0.812718    
## car_typePanel Truck                                       1.488 0.137062    
## car_typePickup                                           -0.563 0.573819    
## car_typeSports Car                                        0.530 0.596051    
## car_typeVan                                              -0.221 0.824848    
## car_typez_SUV                                            -0.401 0.688143    
## red_caryes                                               -0.672 0.501981    
## oldclaim                                                  0.102 0.918722    
## clm_freq                                                 -0.759 0.447823    
## mvr_pts                                                   1.559 0.119184    
## car_age                                                  -1.377 0.168620    
## urbanicityz_Highly Rural/ Rural                          -1.745 0.081114 .  
## homekids:clm_freq                                         3.220 0.001311 ** 
## homekids:car_typePanel Truck                              4.132 3.80e-05 ***
## homekids:car_typePickup                                   1.026 0.304897    
## homekids:car_typeSports Car                              -1.164 0.244676    
## homekids:car_typeVan                                     -0.830 0.406534    
## homekids:car_typez_SUV                                   -0.211 0.833090    
## car_typePanel Truck:urbanicityz_Highly Rural/ Rural          NA       NA    
## car_typePickup:urbanicityz_Highly Rural/ Rural            1.929 0.053962 .  
## car_typeSports Car:urbanicityz_Highly Rural/ Rural        0.722 0.470343    
## car_typeVan:urbanicityz_Highly Rural/ Rural               5.643 2.01e-08 ***
## car_typez_SUV:urbanicityz_Highly Rural/ Rural             1.137 0.255754    
## home_val:mstatusz_No                                      3.578 0.000357 ***
## age:car_typePanel Truck                                   2.334 0.019732 *  
## age:car_typePickup                                       -0.056 0.955469    
## age:car_typeSports Car                                   -0.612 0.540428    
## age:car_typeVan                                          -2.711 0.006783 ** 
## age:car_typez_SUV                                        -0.652 0.514203    
## kidsdriv:age                                              1.229 0.219097    
## income:mstatusz_No                                        2.745 0.006135 ** 
## age:home_val                                              3.515 0.000453 ***
## homekids:car_age                                          2.966 0.003070 ** 
## yoj:tif                                                  -2.326 0.020155 *  
## kidsdriv:oldclaim                                         2.767 0.005731 ** 
## income:car_typePanel Truck                                1.507 0.131927    
## income:car_typePickup                                     2.864 0.004246 ** 
## income:car_typeSports Car                                 2.997 0.002770 ** 
## income:car_typeVan                                        2.306 0.021265 *  
## income:car_typez_SUV                                      2.677 0.007519 ** 
## homekids:red_caryes                                      -2.158 0.031097 *  
## home_val:sexz_F                                          -1.701 0.089212 .  
## homekids:home_val                                         2.230 0.025895 *  
## kidsdriv:car_age                                         -2.046 0.040914 *  
## age:urbanicityz_Highly Rural/ Rural                      -0.541 0.588878    
## home_val:oldclaim                                         1.535 0.125010    
## kidsdriv:clm_freq                                        -1.702 0.088983 .  
## age:clm_freq                                              1.873 0.061221 .  
## age:car_typePanel Truck:urbanicityz_Highly Rural/ Rural      NA       NA    
## age:car_typePickup:urbanicityz_Highly Rural/ Rural        1.708 0.087891 .  
## age:car_typeSports Car:urbanicityz_Highly Rural/ Rural    0.393 0.694368    
## age:car_typeVan:urbanicityz_Highly Rural/ Rural           5.795 8.38e-09 ***
## age:car_typez_SUV:urbanicityz_Highly Rural/ Rural         0.236 0.813289    
## kidsdriv:age:clm_freq                                     2.191 0.028644 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7042 on 1447 degrees of freedom
## Multiple R-squared:  0.1271, Adjusted R-squared:  0.09092 
## F-statistic: 3.512 on 60 and 1447 DF,  p-value: < 2.2e-16
broom::tidy(finalModel) %>% arrange(p.value) %>% kable()

term	estimate	std.error	statistic	p.value
(Intercept)	6157.26102	664.8728	9.2608107	0.0000000
age:car_typeVan:urbanicityz_Highly Rural/ Rural	17335.74875	2991.6108	5.7947874	0.0000000
car_typeVan:urbanicityz_Highly Rural/ Rural	20566.08610	3644.5828	5.6429191	0.0000000
homekids:car_typePanel Truck	3451.01882	835.1349	4.1322889	0.0000380
home_val:mstatusz_No	1567.46217	438.0585	3.5782032	0.0003574
age:home_val	677.15735	192.6520	3.5149249	0.0004534
income	-1780.85560	525.5706	-3.3884234	0.0007218
homekids:clm_freq	735.40808	228.3909	3.2199540	0.0013106
income:car_typeSports Car	2065.99967	689.2879	2.9972959	0.0027703
homekids:car_age	695.12298	234.3885	2.9656877	0.0030696
income:car_typePickup	1929.04279	673.5836	2.8638507	0.0042456
kidsdriv:oldclaim	476.49886	172.2145	2.7668918	0.0057314
mstatusz_No	1194.43930	432.9270	2.7589854	0.0058712
income:mstatusz_No	1066.00114	388.4118	2.7445130	0.0061350
age:car_typeVan	-2235.39108	824.4937	-2.7112287	0.0067827
income:car_typez_SUV	1564.40159	584.4488	2.6767129	0.0075188
age:car_typePanel Truck	1924.33174	824.4782	2.3339996	0.0197320
yoj:tif	-418.87507	180.0818	-2.3260261	0.0201547
income:car_typeVan	1701.51669	737.9440	2.3057532	0.0212653
homekids:home_val	468.86746	210.2472	2.2300770	0.0258952
kidsdriv:age:clm_freq	354.29002	161.7361	2.1905435	0.0286439
homekids:red_caryes	-1015.10786	470.4060	-2.1579398	0.0310966
kidsdriv:car_age	-369.24951	180.4533	-2.0462326	0.0409139
car_typePickup:urbanicityz_Highly Rural/ Rural	5557.24992	2881.3229	1.9287147	0.0539617
age:clm_freq	328.15357	175.1687	1.8733572	0.0612205
sexz_F	-1264.65757	689.0559	-1.8353484	0.0666592
urbanicityz_Highly Rural/ Rural	-3096.97458	1774.2871	-1.7454755	0.0811144
age:car_typePickup:urbanicityz_Highly Rural/ Rural	5226.65734	3060.4936	1.7077825	0.0878912
kidsdriv:clm_freq	-336.48308	197.7066	-1.7019313	0.0889831
home_val:sexz_F	-682.46038	401.2797	-1.7007098	0.0892124
mvr_pts	259.13257	166.2033	1.5591304	0.1191842
home_val:oldclaim	272.92888	177.8076	1.5349677	0.1250104
income:car_typePanel Truck	1169.47047	775.8212	1.5073968	0.1319273
car_typePanel Truck	1227.58097	825.1824	1.4876480	0.1370616
car_age	-287.20185	208.5190	-1.3773414	0.1686197
kidsdriv:age	266.51728	216.7747	1.2294669	0.2190967
homekids:car_typeSports Car	-842.79430	724.1426	-1.1638513	0.2446761
car_typez_SUV:urbanicityz_Highly Rural/ Rural	2609.11289	2294.8677	1.1369339	0.2557542
kidsdriv	-220.46485	203.4723	-1.0835129	0.2787613
homekids:car_typePickup	657.80690	640.9177	1.0263516	0.3048974
age	421.19239	480.9643	0.8757249	0.3813249
homekids:car_typeVan	-678.25917	816.9334	-0.8302502	0.4065343
homekids	442.92558	541.5782	0.8178423	0.4135819
clm_freq	-166.51752	219.3169	-0.7592554	0.4478235
car_typeSports Car:urbanicityz_Highly Rural/ Rural	2089.92132	2894.1821	0.7221112	0.4703428
red_caryes	-368.62000	548.9139	-0.6715443	0.5019810
age:car_typez_SUV	-375.29216	575.1881	-0.6524686	0.5142026
age:car_typeSports Car	-391.09925	638.7256	-0.6123118	0.5404277
car_typePickup	-371.90288	661.0853	-0.5625641	0.5738189
age:urbanicityz_Highly Rural/ Rural	-702.84499	1300.1613	-0.5405829	0.5888783
car_typeSports Car	431.35601	813.5639	0.5302055	0.5960508
yoj	-104.27931	201.3376	-0.5179326	0.6045845
car_typez_SUV	-285.39708	710.9025	-0.4014574	0.6881427
age:car_typeSports Car:urbanicityz_Highly Rural/ Rural	882.65837	2245.8772	0.3930128	0.6943680
tif	-47.34196	199.7847	-0.2369649	0.8127176
age:car_typez_SUV:urbanicityz_Highly Rural/ Rural	428.85546	1815.4302	0.2362280	0.8132892
car_typeVan	-176.79136	798.6800	-0.2213544	0.8248477
homekids:car_typez_SUV	-128.95172	611.7876	-0.2107786	0.8330897
home_val	-77.16001	385.2538	-0.2002836	0.8412869
oldclaim	21.94131	214.9811	0.1020616	0.9187219
age:car_typePickup	-38.01724	680.7042	-0.0558499	0.9554691


# comparison
testSet$target_amt_predicted <- predict(finalModel, testSet[, variables])

# root mean squared error (rmse)
Metrics::rmse(testSet$target_amt, testSet$target_amt_predicted)
## [1] 7343.597

MODEL APPLIED - MULTILINEAR REGRESSION (TARGET_AMT)

From above corrplot and density curve, we see that there’s actually nothing strongly associated with the target variable. We used stepwise regression and used up to three degree of interaction to come up with the best model possible (which will cause overfitting and make this model useless in the future), but still that only gave us 9% of adjusted R-squared. That means, over 90% of the variance in the actual amount cannot be predicted from this model. The regression model is less encouraging than the classification model.

dfEval$target_amt <- predict(finalModel, dfEval[, variables])

dfEval %>%
        dplyr::filter(target_flag == "Yes") %>%
        ggplot(aes(target_amt)) +
        geom_density() +
        geom_vline(xintercept = median(dfEval$target_amt[dfEval$target_flag == "Yes"]),
                   linetype = "dashed") +
        ggtitle("Distribution of Predicted Target Amount")


dfEval %>% head() %>% kable()

index	target_flag	target_amt	kidsdriv	age	homekids	yoj	income	parent1	home_val	mstatus	sex	education	job	travtime	car_use	bluebook	tif	car_type	red_car	oldclaim	clm_freq	revoked	mvr_pts	car_age	urbanicity	OOF_pred_glm	OOF_pred_nb	OOF_pred_rf	OOF_pred_top_glm	OOF_pred_top_nb	OOF_pred_top_gbm	target_flag_prob
3	No	6783.665	-0.3337943	0.3499326	-0.6425179	0.1488987	0.6619537	No	-0.9915932	z_No	M	Bachelors	Manager	-0.4549095	Private	0.0016269	-1.0689293	Van	yes	-0.6571645	-0.7111922	No	0.1062001	0.3150395	Highly Urban/ Urban	0.1220784	0.0013785	0.092	0.0915576	0.0098567	0.0886053	0.063
9	No	5900.963	1.7198080	-0.5884831	0.2530751	0.1488987	0.6011222	Yes	-0.9915932	z_No	M	z_High School	Manager	-0.7729272	Private	-0.3709252	0.1901913	Minivan	no	0.4022653	0.1679432	No	0.1062001	-1.2457634	Highly Urban/ Urban	0.2336235	0.2439135	0.328	0.2164859	0.2120411	0.2684035	0.232
10	No	4250.961	-0.3337943	-0.1192753	1.1486681	0.3887064	0.3491062	Yes	-0.9915932	z_No	z_F	z_High School	z_Blue Collar	-0.2004953	Commercial	1.1124265	1.1974878	z_SUV	no	-0.6571645	-0.7111922	No	-0.8014825	0.3150395	z_Highly Rural/ Rural	0.0908932	0.0091209	0.012	0.0667108	0.0056237	0.0872490	0.053
18	No	2521.773	-0.3337943	-1.1749930	1.1486681	0.1488987	-0.4521312	Yes	-0.9915932	z_No	M	z_High School	Clerical	2.5980610	Private	1.5329762	0.1901913	Pickup	no	-0.6571645	-0.7111922	Yes	-0.8014825	-0.7254958	z_Highly Rural/ Rural	0.2403932	0.0106405	0.112	0.1351010	0.0187269	0.1722218	0.109
21	No	5053.206	-0.3337943	1.6402542	-0.6425179	0.3887064	1.5865921	No	-0.9915932	z_No	M	z_High School	Manager	0.7535580	Private	-0.8166164	-1.0689293	Minivan	yes	1.2701376	1.0470786	No	1.0138828	-1.2457634	Highly Urban/ Urban	0.2392829	0.0536335	0.256	0.1920650	0.0462885	0.2155320	0.151
30	No	5818.214	-0.3337943	0.1153287	-0.6425179	0.8683218	-1.3420087	No	0.3708270	Yes	M	Bachelors	Professional	-1.6633769	Commercial	0.3696079	-1.0689293	Panel Truck	no	-0.1254949	0.1679432	No	0.1062001	0.6618846	Highly Urban/ Urban	0.1864743	0.0259940	0.226	0.1558029	0.0287481	0.1669544	0.117

data_621_hw_4

Jimmy Ng