This homework exercise builds a logistic regression model and a multiple regression model: the first estimates the likelihood of a car accident, and the second predicts the cost when such an accident happens. We have two response variables, TARGET_FLAG and TARGET_AMT. TARGET_FLAG is a binary field where 1 means the car was in a crash and 0 means it was not. TARGET_AMT, on the other hand, is the cost of the crash, given that one occurred. TARGET_FLAG is the response variable for our logistic regression model, whereas TARGET_AMT is the response variable for the multiple regression model.
# load packages
if(!require(pacman)){install.packages("pacman"); require(pacman)}
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.6.2
packages <- c('tidyverse', 'sqldf', 'broom', 'caret', 'kableExtra', 'janitor', 'Hmisc', 'MASS', 'corrplot', 'Metrics')
pacman::p_load(char = packages)
# read data
dfTrain <- read.csv("insurance_training_data.csv", header = TRUE)
dfEval <- read.csv("insurance-evaluation-data.csv", header = TRUE)
# check dim
dim(dfTrain); dim(dfEval)
## [1] 8161 26
## [1] 2141 26
# are they compatible?
if(any(names(dfTrain) != names(dfEval))){print("the two data sets are different, please check for consistency")}
# clean names
dfTrain <- dfTrain %>% janitor::clean_names()
dfEval <- dfEval %>% janitor::clean_names()
# head
head(dfTrain) %>% kable()
| index | target_flag | target_amt | kidsdriv | age | homekids | yoj | income | parent1 | home_val | mstatus | sex | education | job | travtime | car_use | bluebook | tif | car_type | red_car | oldclaim | clm_freq | revoked | mvr_pts | car_age | urbanicity |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0 | 0 | 60 | 0 | 11 | $67,349 | No | $0 | z_No | M | PhD | Professional | 14 | Private | $14,230 | 11 | Minivan | yes | $4,461 | 2 | No | 3 | 18 | Highly Urban/ Urban |
| 2 | 0 | 0 | 0 | 43 | 0 | 11 | $91,449 | No | $257,252 | z_No | M | z_High School | z_Blue Collar | 22 | Commercial | $14,940 | 1 | Minivan | yes | $0 | 0 | No | 0 | 1 | Highly Urban/ Urban |
| 4 | 0 | 0 | 0 | 35 | 1 | 10 | $16,039 | No | $124,191 | Yes | z_F | z_High School | Clerical | 5 | Private | $4,010 | 4 | z_SUV | no | $38,690 | 2 | No | 3 | 10 | Highly Urban/ Urban |
| 5 | 0 | 0 | 0 | 51 | 0 | 14 | | No | $306,251 | Yes | M | <High School | z_Blue Collar | 32 | Private | $15,440 | 7 | Minivan | yes | $0 | 0 | No | 0 | 6 | Highly Urban/ Urban |
| 6 | 0 | 0 | 0 | 50 | 0 | NA | $114,986 | No | $243,925 | Yes | z_F | PhD | Doctor | 36 | Private | $18,000 | 1 | z_SUV | no | $19,217 | 2 | Yes | 3 | 17 | Highly Urban/ Urban |
| 7 | 1 | 2946 | 0 | 34 | 1 | 12 | $125,301 | Yes | $0 | z_No | z_F | Bachelors | z_Blue Collar | 46 | Commercial | $17,430 | 1 | Sports Car | no | $0 | 0 | No | 0 | 7 | Highly Urban/ Urban |
# target flag - proportion
with(dfTrain, prop.table(ftable(target_flag), 1))
## target_flag 0 1
##
## 0.7361843 0.2638157
# target amt - distribution
hist(dfTrain$target_amt)
# target amt - log transformed
boxplot(log(dfTrain$target_amt) ~ dfTrain$target_flag)
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z$group
## == : Outlier (-Inf) in boxplot 1 is not drawn
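The warnings above come from the no-crash rows, where target_amt is 0 and log(0) is -Inf, so boxplot has nothing to draw for group 0. A minimal sketch of two ways to avoid the warnings, assuming we either only care about the crash rows or are content with a log1p transform:
# option 1: restrict to rows with a positive amount so log() is defined
boxplot(log(target_amt) ~ target_flag, data = subset(dfTrain, target_amt > 0))
# option 2: keep every row but use log1p(), which maps 0 to 0
boxplot(log1p(dfTrain$target_amt) ~ dfTrain$target_flag)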
# number of unique values of each variable in dfTrain
sapply(dfTrain, function(x) length(unique(x)))
## index target_flag target_amt kidsdriv age homekids
## 8161 2 1949 5 61 6
## yoj income parent1 home_val mstatus sex
## 22 6613 2 5107 2 2
## education job travtime car_use bluebook tif
## 5 9 97 2 2789 23
## car_type red_car oldclaim clm_freq revoked mvr_pts
## 6 2 2857 6 2 13
## car_age urbanicity
## 31 2
# quickly glance at the class of each variable
fields <- data.frame(fields = names(dfTrain),
class = sapply(dfTrain, class) %>% unlist %>% as.vector)
fields
## fields class
## 1 index integer
## 2 target_flag integer
## 3 target_amt numeric
## 4 kidsdriv integer
## 5 age integer
## 6 homekids integer
## 7 yoj integer
## 8 income factor
## 9 parent1 factor
## 10 home_val factor
## 11 mstatus factor
## 12 sex factor
## 13 education factor
## 14 job factor
## 15 travtime integer
## 16 car_use factor
## 17 bluebook factor
## 18 tif integer
## 19 car_type factor
## 20 red_car factor
## 21 oldclaim factor
## 22 clm_freq integer
## 23 revoked factor
## 24 mvr_pts integer
## 25 car_age integer
## 26 urbanicity factor
# mutate target_flag
# convert these 4 currency fields to numeric - "income", "home_val", "bluebook", "oldclaim"
# note: as.numeric() on a factor returns the level codes, not the dollar values,
# so we strip the "$" and "," first
dfTrain <- dfTrain %>%
dplyr::mutate(target_flag = dplyr::case_when(target_flag == 1 ~ "Y",
TRUE ~ "N") %>%
factor(., levels = c("Y", "N"), labels = c("Yes", "No")),
income = as.numeric(gsub("[$,]", "", income)),
home_val = as.numeric(gsub("[$,]", "", home_val)),
bluebook = as.numeric(gsub("[$,]", "", bluebook)),
oldclaim = as.numeric(gsub("[$,]", "", oldclaim)))
dfEval <- dfEval %>%
dplyr::mutate(target_flag = dplyr::case_when(target_flag == 1 ~ "Y",
TRUE ~ "N") %>%
factor(., levels = c("Y", "N"), labels = c("Yes", "No")),
income = as.numeric(gsub("[$,]", "", income)),
home_val = as.numeric(gsub("[$,]", "", home_val)),
bluebook = as.numeric(gsub("[$,]", "", bluebook)),
oldclaim = as.numeric(gsub("[$,]", "", oldclaim)))
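Since the same cleanup applies to all four currency columns, it can also be written once as a helper and applied in bulk; a sketch (the helper name parse_currency is our own):
# strip "$" and "," then coerce; blank cells become NA, which the median
# imputation below takes care of
parse_currency <- function(x) as.numeric(gsub("[$,]", "", as.character(x)))
dfTrain <- dfTrain %>% dplyr::mutate_at(dplyr::vars(income, home_val, bluebook, oldclaim), parse_currency)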
# rerun
fields <- data.frame(fields = names(dfTrain),
class = sapply(dfTrain, class) %>% unlist %>% as.vector)
fields
## fields class
## 1 index integer
## 2 target_flag factor
## 3 target_amt numeric
## 4 kidsdriv integer
## 5 age integer
## 6 homekids integer
## 7 yoj integer
## 8 income numeric
## 9 parent1 factor
## 10 home_val numeric
## 11 mstatus factor
## 12 sex factor
## 13 education factor
## 14 job factor
## 15 travtime integer
## 16 car_use factor
## 17 bluebook numeric
## 18 tif integer
## 19 car_type factor
## 20 red_car factor
## 21 oldclaim numeric
## 22 clm_freq integer
## 23 revoked factor
## 24 mvr_pts integer
## 25 car_age integer
## 26 urbanicity factor
# set variables (%nin% is Hmisc's negated %in%)
binary_var <- "target_flag"
quant_var <- "target_amt"
variables <- names(dfTrain)[names(dfTrain) %nin% c(binary_var, quant_var, "index")]
# check missing
colSums(is.na(dfTrain))
## index target_flag target_amt kidsdriv age homekids
## 0 0 0 0 6 0
## yoj income parent1 home_val mstatus sex
## 454 0 0 0 0 0
## education job travtime car_use bluebook tif
## 0 0 0 0 0 0
## car_type red_car oldclaim clm_freq revoked mvr_pts
## 0 0 0 0 0 0
## car_age urbanicity
## 510 0
colSums(is.na(dfEval))
## index target_flag target_amt kidsdriv age homekids
## 0 0 2141 0 1 0
## yoj income parent1 home_val mstatus sex
## 94 0 0 0 0 0
## education job travtime car_use bluebook tif
## 0 0 0 0 0 0
## car_type red_car oldclaim clm_freq revoked mvr_pts
## 0 0 0 0 0 0
## car_age urbanicity
## 129 0
# fix missing - train
preProcValuesTrain <- caret::preProcess(dfTrain[, variables], method = c("medianImpute", "center", "scale"))
dfTrain <- predict(preProcValuesTrain, dfTrain)
colSums(is.na(dfTrain))
## index target_flag target_amt kidsdriv age homekids
## 0 0 0 0 0 0
## yoj income parent1 home_val mstatus sex
## 0 0 0 0 0 0
## education job travtime car_use bluebook tif
## 0 0 0 0 0 0
## car_type red_car oldclaim clm_freq revoked mvr_pts
## 0 0 0 0 0 0
## car_age urbanicity
## 0 0
# fix missing - eval
preProcValuesEval <- caret::preProcess(dfEval[, variables], method = c("medianImpute", "center", "scale"))
dfEval <- predict(preProcValuesEval, dfEval)
colSums(is.na(dfEval))
## index target_flag target_amt kidsdriv age homekids
## 0 0 2141 0 0 0
## yoj income parent1 home_val mstatus sex
## 0 0 0 0 0 0
## education job travtime car_use bluebook tif
## 0 0 0 0 0 0
## car_type red_car oldclaim clm_freq revoked mvr_pts
## 0 0 0 0 0 0
## car_age urbanicity
## 0 0
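A side note on this step: fitting a second preProcess object on the evaluation set gives it its own medians, centers and scales. If we wanted the evaluation features on exactly the scale the models were trained on, a sketch of the alternative is to reuse the training object:
# reuse the medians/centers/scales learned on the training set
dfEval <- predict(preProcValuesTrain, dfEval)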
# split data into train (70%), test (30%) sets
set.seed(1234)
index <- caret::createDataPartition(dfTrain$target_flag, p = 0.7, list = FALSE)
trainSet <- dfTrain[index, ]
testSet <- dfTrain[-index, ]
# trainSetControl - 10-fold cross-validation; classProbs = TRUE enables type = "prob"
# predictions, and savePredictions = 'final' keeps the out-of-fold predictions
trainSet.control <- caret::trainControl(method = "cv", number = 10, savePredictions = 'final', classProbs = TRUE)
We will first build three classification models and then two multiple linear regression models. Besides logistic regression, let’s try two other classification algorithms, naive Bayes and random forest. In addition, let’s try to improve our accuracy by 1) doing 10-fold cross-validation, and 2) ensembling our models, i.e. using glm (generalized linear model), nb (naive Bayes) and rf (random forest) as our base layer, and stacking a top layer on it built with glm, nb and gbm (gradient boosting machine).
# turn off warning
options(warn = -1)
# count time - start
start <- Sys.time()
# set seed
set.seed(1234)
# methods
baseTrainMethods <- c("glm", "nb", "rf")
topTrainMethods <- c("glm", "nb", "gbm")
# output
modelSummaryList <- vector(mode = "list")
# train base layer
for(baseLayer in baseTrainMethods){
# set parameters
ml <- baseLayer
model <- paste0("model_base_", ml)
OOF_prediction <- paste0("OOF_pred_", ml)
prediction <- paste0("pred_", ml)
result <- paste0("result_", ml)
# model
assign(model, caret::train(trainSet[, variables], trainSet[, binary_var],
method = ml,
trControl = trainSet.control,
trace = FALSE))
# Out-Of-Fold probability predictions - trainSet (reordered by rowIndex to match row order)
trainSet[[OOF_prediction]] <- get(model)$pred$Yes[order(get(model)$pred$rowIndex)]
# Out-Of-Fold probability predictions - testSet
testSet[[OOF_prediction]] <- predict(get(model), testSet[, variables], type = "prob")$Yes
# Yes/No predictions for Confusion Matrix - testSet
testSet[[prediction]] <- predict(get(model), testSet[, variables])
# output
assign(result, broom::tidy(caret::confusionMatrix(testSet[[prediction]], testSet[, binary_var])) %>%
dplyr::mutate(trainMethod = ml) %>%
dplyr::select(trainMethod, everything()))
# store output into a list
modelSummaryList <- c(modelSummaryList, list(get(result)))
}
# train top layer
for(topLayer in topTrainMethods){
# set parameters
ml <- topLayer
model <- paste0("model_top_", ml)
OOF_predictors_top <- c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")
OOF_prediction_top <- paste0("OOF_pred_top_", ml)
prediction_top <- paste0("pred_top_", ml)
result <- paste0("result_top_", ml)
# model
assign(model, caret::train(trainSet[, OOF_predictors_top], trainSet[, binary_var],
method = ml,
trControl = trainSet.control))
# Out-Of-Fold probability predictions - testSet
testSet[[OOF_prediction_top]] <- predict(get(model), testSet[, OOF_predictors_top], type = "prob")$Yes
# Yes/No predictions for Confusion Matrix - testSet
testSet[[prediction_top]] <- predict(get(model), testSet[, OOF_predictors_top])
# output
assign(result, broom::tidy(caret::confusionMatrix(testSet[[prediction_top]], testSet[, binary_var])) %>%
dplyr::mutate(trainMethod = paste0(ml, " - top layer")) %>%
dplyr::select(trainMethod, everything()))
# store output into a list
modelSummaryList <- c(modelSummaryList, list(get(result)))
}
# put together - averaging the OOF prediction probability
testSet <- testSet %>%
dplyr::mutate(pred_final_prob_avg = (OOF_pred_top_glm + OOF_pred_top_nb + OOF_pred_top_gbm) / length(topTrainMethods),
pred_final = ifelse(pred_final_prob_avg > 0.5, "Y", "N") %>%
factor(., levels = c("Y", "N"), labels = c("Yes", "No")))
finalResult <- broom::tidy(caret::confusionMatrix(testSet$pred_final, testSet$target_flag)) %>%
dplyr::mutate(trainMethod = "final - averaging") %>%
dplyr::select(trainMethod, everything())
# store finalResult output into a list
modelSummaryList <- c(modelSummaryList, list(finalResult))
# count time - finish
finish <- Sys.time()
First, we build a base layer (using glm, nb and rf) to predict target_flag using all the variables (minus index and target_amt) from the train set. Second, we build a top layer (using glm, nb and gbm) to predict target_flag from the Out-Of-Fold (OOF) predictions of our base layer. In other words, we predict the target variable from the predictions of our base models. Finally, we simply average the outcome probabilities of our top layer to get our final probability and decision, using 0.5 as the cut-off.
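As a toy check of the averaging rule, take three hypothetical top-layer probabilities for a single policyholder:
p_top <- c(glm = 0.62, nb = 0.48, gbm = 0.55) # hypothetical values
mean(p_top) # 0.55
ifelse(mean(p_top) > 0.5, "Y", "N") # classified as a crash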
Taking a look at our results (modelSummaryDf below), we decide to apply the final model (averaging) to our evaluation set because it has the highest F1 score on the test set. Looking at the results, nb from the base layer did the worst in sensitivity but the best among all models in precision. All three models from the top layer significantly improve accuracy, sensitivity and F1 score. The final result is far from perfect but better than relying on a single classification algorithm. Lastly, when we compare the predicted Yes proportion in the evaluation set (21.4%) with the actual proportion in the train set (26.4%), they are very similar. Although ensembling improved accuracy, its downside is that the result is hard to interpret and communicate to stakeholders (for example, which variables contributed most to causing an accident).
# let's look at how long it took to run the models
print(paste0("The models ensembling exercise took roughly ", round(as.numeric(difftime(finish, start, units = "mins")), 1), " mins to run"))
## [1] "The models ensembling exercise took roughly 5.8 mins to run"
# let's look at the result
modelSummaryDf <- modelSummaryList %>%
dplyr::bind_rows() %>%
dplyr::select(trainMethod, term, estimate) %>%
tidyr::spread(term, estimate) %>%
arrange(desc(f1))
modelSummaryDf %>% kable()
| trainMethod | accuracy | balanced_accuracy | detection_prevalence | detection_rate | f1 | kappa | neg_pred_value | pos_pred_value | precision | prevalence | recall | sensitivity | specificity |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| final - averaging | 0.7870862 | 0.6921848 | 0.2084185 | 0.1295464 | 0.5489177 | 0.4120569 | 0.8306660 | 0.6215686 | 0.6215686 | 0.2635881 | 0.4914729 | 0.4914729 | 0.8928968 |
| gbm - top layer | 0.7854516 | 0.6915727 | 0.2108705 | 0.1299550 | 0.5478036 | 0.4094336 | 0.8306577 | 0.6162791 | 0.6162791 | 0.2635881 | 0.4930233 | 0.4930233 | 0.8901221 |
| nb - top layer | 0.7825909 | 0.6916213 | 0.2170004 | 0.1315897 | 0.5476190 | 0.4062960 | 0.8314196 | 0.6064030 | 0.6064030 | 0.2635881 | 0.4992248 | 0.4992248 | 0.8840178 |
| glm - top layer | 0.7850429 | 0.6708885 | 0.1777687 | 0.1131998 | 0.5129630 | 0.3816700 | 0.8170974 | 0.6367816 | 0.6367816 | 0.2635881 | 0.4294574 | 0.4294574 | 0.9123196 |
| glm | 0.7793216 | 0.6620267 | 0.1753167 | 0.1091132 | 0.4972067 | 0.3630885 | 0.8126858 | 0.6223776 | 0.6223776 | 0.2635881 | 0.4139535 | 0.4139535 | 0.9100999 |
| rf | 0.7834083 | 0.6593264 | 0.1622395 | 0.1046179 | 0.4913628 | 0.3635247 | 0.8102439 | 0.6448363 | 0.6448363 | 0.2635881 | 0.3968992 | 0.3968992 | 0.9217536 |
| nb | 0.7621577 | 0.5697416 | 0.0600736 | 0.0429097 | 0.2651515 | 0.1854502 | 0.7652174 | 0.7142857 | 0.7142857 | 0.2635881 | 0.1627907 | 0.1627907 | 0.9766926 |
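As a sanity check, the f1 column can be reproduced from the precision and recall columns; for the final (averaging) row:
precision <- 0.6215686; recall <- 0.4914729
2 * precision * recall / (precision + recall) # ~0.5489, matching the f1 column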
options(warn = -1)
# let's fit the dfEval using the final averaging method
# base layer
dfEval <- dfEval %>%
dplyr::mutate(OOF_pred_glm = predict(model_base_glm, dfEval[, variables], type = "prob")$Y,
OOF_pred_nb = predict(model_base_nb, dfEval[, variables], type = "prob")$Y,
OOF_pred_rf = predict(model_base_rf, dfEval[, variables], type = "prob")$Y)
# top layer
dfEval <- dfEval %>%
dplyr::mutate(OOF_pred_top_glm = predict(model_top_glm, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y,
OOF_pred_top_nb = predict(model_top_nb, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y,
OOF_pred_top_gbm = predict(model_top_gbm, dfEval[, c("OOF_pred_glm", "OOF_pred_nb", "OOF_pred_rf")], type = "prob")$Y)
# final prediction
dfEval <- dfEval %>%
dplyr::mutate(target_flag_prob = round((OOF_pred_top_glm + OOF_pred_top_nb + OOF_pred_top_gbm) / 3, 3),
target_flag = ifelse(target_flag_prob > 0.5, "Y", "N") %>%
factor(., levels = c("Y", "N"), labels = c("Yes", "No")))
# see prediction
evalTable <- ftable(dfEval$target_flag)
evalTable
## Yes No
##
## 459 1682
# comparison between evaluation and train set
prop.table(evalTable, 1) # evaluation
## Yes No
##
## 0.2143858 0.7856142
prop.table(ftable(dfTrain$target_flag)) # train
## Yes No
##
## 0.2638157 0.7361843
# dfEval
dfEval %>% dplyr::select(index, target_flag, target_flag_prob, everything()) %>% head() %>% kable()
| index | target_flag | target_flag_prob | target_amt | kidsdriv | age | homekids | yoj | income | parent1 | home_val | mstatus | sex | education | job | travtime | car_use | bluebook | tif | car_type | red_car | oldclaim | clm_freq | revoked | mvr_pts | car_age | urbanicity | OOF_pred_glm | OOF_pred_nb | OOF_pred_rf | OOF_pred_top_glm | OOF_pred_top_nb | OOF_pred_top_gbm |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | No | 0.063 | NA | -0.3337943 | 0.3499326 | -0.6425179 | 0.1488987 | 0.6619537 | No | -0.9915932 | z_No | M | Bachelors | Manager | -0.4549095 | Private | 0.0016269 | -1.0689293 | Van | yes | -0.6571645 | -0.7111922 | No | 0.1062001 | 0.3150395 | Highly Urban/ Urban | 0.1220784 | 0.0013785 | 0.092 | 0.0915576 | 0.0098567 | 0.0886053 |
| 9 | No | 0.232 | NA | 1.7198080 | -0.5884831 | 0.2530751 | 0.1488987 | 0.6011222 | Yes | -0.9915932 | z_No | M | z_High School | Manager | -0.7729272 | Private | -0.3709252 | 0.1901913 | Minivan | no | 0.4022653 | 0.1679432 | No | 0.1062001 | -1.2457634 | Highly Urban/ Urban | 0.2336235 | 0.2439135 | 0.328 | 0.2164859 | 0.2120411 | 0.2684035 |
| 10 | No | 0.053 | NA | -0.3337943 | -0.1192753 | 1.1486681 | 0.3887064 | 0.3491062 | Yes | -0.9915932 | z_No | z_F | z_High School | z_Blue Collar | -0.2004953 | Commercial | 1.1124265 | 1.1974878 | z_SUV | no | -0.6571645 | -0.7111922 | No | -0.8014825 | 0.3150395 | z_Highly Rural/ Rural | 0.0908932 | 0.0091209 | 0.012 | 0.0667108 | 0.0056237 | 0.0872490 |
| 18 | No | 0.109 | NA | -0.3337943 | -1.1749930 | 1.1486681 | 0.1488987 | -0.4521312 | Yes | -0.9915932 | z_No | M | z_High School | Clerical | 2.5980610 | Private | 1.5329762 | 0.1901913 | Pickup | no | -0.6571645 | -0.7111922 | Yes | -0.8014825 | -0.7254958 | z_Highly Rural/ Rural | 0.2403932 | 0.0106405 | 0.112 | 0.1351010 | 0.0187269 | 0.1722218 |
| 21 | No | 0.151 | NA | -0.3337943 | 1.6402542 | -0.6425179 | 0.3887064 | 1.5865921 | No | -0.9915932 | z_No | M | z_High School | Manager | 0.7535580 | Private | -0.8166164 | -1.0689293 | Minivan | yes | 1.2701376 | 1.0470786 | No | 1.0138828 | -1.2457634 | Highly Urban/ Urban | 0.2392829 | 0.0536335 | 0.256 | 0.1920650 | 0.0462885 | 0.2155320 |
| 30 | No | 0.117 | NA | -0.3337943 | 0.1153287 | -0.6425179 | 0.8683218 | -1.3420087 | No | 0.3708270 | Yes | M | Bachelors | Professional | -1.6633769 | Commercial | 0.3696079 | -1.0689293 | Panel Truck | no | -0.1254949 | 0.1679432 | No | 0.1062001 | 0.6618846 | Highly Urban/ Urban | 0.1864743 | 0.0259940 | 0.226 | 0.1558029 | 0.0287481 | 0.1669544 |
Let’s turn our focus to building two multiple linear regression models using the same data. Interestingly, among the crash records, the predictors show little correlation with the target variable; nothing stands out as strongly related to target_amt.
# corrplot - quantitative variables
trainSet %>%
dplyr::filter(target_flag == "Yes") %>%
dplyr::select(fields %>% dplyr::filter(class != 'factor') %>% .$fields) %>%
dplyr::select(-index) %>%
cor %>%
corrplot(method = "number", type = "upper", order = "hclust")
# density - categorical variables
dfGather <- trainSet %>%
dplyr::filter(target_flag == "Yes") %>%
dplyr::select(fields %>% dplyr::filter(class == 'factor') %>% .$fields, target_amt) %>%
tidyr::gather(key, value, -target_amt)
dfGather %>%
ggplot(aes(target_amt, color = value)) +
geom_density() +
geom_vline(data = aggregate(target_amt ~ key + value, dfGather, median),
aes(xintercept = target_amt,
color = value),
linetype = "dashed") +
facet_wrap(~ key, nrow = 5, scales = "free") +
theme(legend.position = "none") +
ggtitle("Distribution of Target Amount by Various Categorical Variables")
# build full model
fullModel <- lm(target_amt ~ ., data = trainSet %>%
dplyr::filter(target_flag == "Yes") %>%
dplyr::select(all_of(variables), all_of(quant_var)))
summary(fullModel)
##
## Call:
## lm(formula = target_amt ~ ., data = trainSet %>% dplyr::filter(target_flag ==
##     "Yes") %>% dplyr::select(all_of(variables), all_of(quant_var)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7646 -2980 -1428 479 78846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6420.76 1622.17 3.958 7.91e-05 ***
## kidsdriv -41.87 186.91 -0.224 0.8228
## age 63.94 209.70 0.305 0.7605
## homekids 241.18 275.06 0.877 0.3807
## yoj 50.77 237.11 0.214 0.8305
## income 26.37 243.05 0.109 0.9136
## parent1Yes -135.57 677.69 -0.200 0.8415
## home_val 181.00 232.56 0.778 0.4365
## mstatusz_No 855.11 534.28 1.600 0.1097
## sexz_F -1352.67 708.49 -1.909 0.0564 .
## educationBachelors -176.93 740.56 -0.239 0.8112
## educationMasters 1170.32 1245.90 0.939 0.3477
## educationPhD 1820.80 1428.87 1.274 0.2028
## educationz_High School -390.17 590.99 -0.660 0.5092
## jobClerical -465.61 1380.79 -0.337 0.7360
## jobDoctor -3129.01 2092.81 -1.495 0.1351
## jobHome Maker -851.86 1411.45 -0.604 0.5462
## jobLawyer -1153.76 1182.09 -0.976 0.3292
## jobManager -1606.24 1235.83 -1.300 0.1939
## jobProfessional 286.88 1297.24 0.221 0.8250
## jobStudent -307.51 1454.56 -0.211 0.8326
## jobz_Blue Collar 29.85 1320.61 0.023 0.9820
## travtime 31.93 202.57 0.158 0.8748
## car_usePrivate -396.40 595.54 -0.666 0.5058
## bluebook -130.25 194.72 -0.669 0.5037
## tif -49.40 206.61 -0.239 0.8111
## car_typePanel Truck 806.16 962.96 0.837 0.4026
## car_typePickup -80.94 695.63 -0.116 0.9074
## car_typeSports Car 740.04 814.52 0.909 0.3637
## car_typeVan 370.21 838.62 0.441 0.6589
## car_typez_SUV 327.53 715.49 0.458 0.6472
## red_caryes -725.27 571.83 -1.268 0.2049
## oldclaim -56.25 220.52 -0.255 0.7987
## clm_freq -118.32 228.37 -0.518 0.6045
## revokedYes -756.60 485.77 -1.558 0.1196
## mvr_pts 103.60 171.38 0.604 0.5456
## car_age -527.25 289.71 -1.820 0.0690 .
## urbanicityz_Highly Rural/ Rural -52.34 877.12 -0.060 0.9524
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7391 on 1470 degrees of freedom
## Multiple R-squared: 0.02302, Adjusted R-squared: -0.001574
## F-statistic: 0.936 on 37 and 1470 DF, p-value: 0.5804
# stepwise regression - direction defaults to "both"
step <- MASS::stepAIC(fullModel, trace = FALSE)
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 +
## home_val + mstatus + sex + education + job + travtime + car_use +
## bluebook + tif + car_type + red_car + oldclaim + clm_freq +
## revoked + mvr_pts + car_age + urbanicity
##
## Final Model:
## target_amt ~ mstatus + sex + car_use + revoked
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1470 80306517739 26904.19
## 2 - job 8 350075913 1478 80656593652 26894.75
## 3 - car_type 5 156824305 1483 80813417957 26887.68
## 4 - education 4 209630349 1487 81023048307 26883.59
## 5 - kidsdriv 1 345385 1488 81023393692 26881.59
## 6 - parent1 1 1850124 1489 81025243816 26879.63
## 7 - urbanicity 1 2060081 1490 81027303897 26877.66
## 8 - travtime 1 2040288 1491 81029344185 26875.70
## 9 - tif 1 2786998 1492 81032131183 26873.75
## 10 - oldclaim 1 3433572 1493 81035564754 26871.82
## 11 - income 1 4926100 1494 81040490854 26869.91
## 12 - age 1 17593993 1495 81058084847 26868.24
## 13 - homekids 1 20065899 1496 81078150746 26866.61
## 14 - mvr_pts 1 24168604 1497 81102319350 26865.06
## 15 - clm_freq 1 17348896 1498 81119668246 26863.38
## 16 - bluebook 1 27790046 1499 81147458292 26861.90
## 17 - yoj 1 36399906 1500 81183858198 26860.58
## 18 - car_age 1 73266728 1501 81257124926 26859.94
## 19 - home_val 1 54967527 1502 81312092454 26858.96
## 20 - red_car 1 85939499 1503 81398031953 26858.55
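For reference, the call above relies on stepAIC's defaults; spelled out explicitly it would be (a sketch, same behavior):
# direction = "both" and the AIC penalty k = 2 are stepAIC's defaults
step <- MASS::stepAIC(fullModel, direction = "both", k = 2, trace = FALSE)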
# two-way interactions - full model
step2 <- MASS::stepAIC(fullModel, ~ .^2, trace = FALSE)
step2$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 +
## home_val + mstatus + sex + education + job + travtime + car_use +
## bluebook + tif + car_type + red_car + oldclaim + clm_freq +
## revoked + mvr_pts + car_age + urbanicity
##
## Final Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + home_val +
## mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq +
## mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type +
## car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age +
## income:mstatus + age:home_val + homekids:car_age + yoj:tif +
## kidsdriv:oldclaim + income:car_type + homekids:red_car +
## home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity +
## home_val:oldclaim + age:red_car + age:clm_freq + kidsdriv:clm_freq
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1470 80306517739 26904.19
## 2 + homekids:clm_freq 1 6.753112e+08 1469 79631206509 26893.46
## 3 + homekids:car_type 5 1.011300e+09 1464 78619906978 26884.18
## 4 - job 8 3.895629e+08 1472 79009469844 26875.64
## 5 + car_type:urbanicity 4 7.850784e+08 1468 78224391452 26868.58
## 6 + home_val:mstatus 1 4.600644e+08 1467 77764327040 26861.68
## 7 + age:car_type 5 8.447631e+08 1462 76919563928 26855.21
## 8 - education 4 1.752619e+08 1466 77094825815 26850.64
## 9 + kidsdriv:age 1 2.948190e+08 1465 76800006825 26846.86
## 10 + income:mstatus 1 2.600865e+08 1464 76539920341 26843.75
## 11 + age:home_val 1 2.833268e+08 1463 76256593508 26840.16
## 12 + homekids:car_age 1 2.494893e+08 1462 76007104200 26837.21
## 13 + yoj:tif 1 2.338405e+08 1461 75773263655 26834.57
## 14 + kidsdriv:oldclaim 1 2.327973e+08 1460 75540466362 26831.93
## 15 - travtime 1 5.909411e+03 1461 75540472271 26829.93
## 16 + income:car_type 5 5.975858e+08 1456 74942886461 26827.95
## 17 - parent1 1 6.248921e+06 1457 74949135382 26826.08
## 18 - car_use 1 2.510220e+07 1458 74974237583 26824.58
## 19 - bluebook 1 2.596424e+07 1459 75000201824 26823.10
## 20 - revoked 1 3.376960e+07 1460 75033971420 26821.78
## 21 + homekids:red_car 1 1.592836e+08 1459 74874687832 26820.58
## 22 + home_val:sex 1 1.455318e+08 1458 74729155997 26819.64
## 23 + homekids:home_val 1 1.492159e+08 1457 74579940090 26818.63
## 24 + kidsdriv:car_age 1 1.541971e+08 1456 74425743008 26817.51
## 25 + age:urbanicity 1 1.541162e+08 1455 74271626808 26816.38
## 26 + home_val:oldclaim 1 1.458966e+08 1454 74125730252 26815.42
## 27 + age:red_car 1 9.948659e+07 1453 74026243664 26815.39
## 28 + age:clm_freq 1 1.051529e+08 1452 73921090735 26815.25
## 29 + kidsdriv:clm_freq 1 1.467009e+08 1451 73774389817 26814.25
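In stepAIC, a single one-sided scope formula such as ~ .^2 is interpreted as the upper scope, with the empty model as the lower scope, so the search may add any two-way interaction as well as drop main effects. An equivalent, more explicit call would be (sketch):
step2 <- MASS::stepAIC(fullModel, scope = list(upper = ~ .^2, lower = ~ 1), trace = FALSE)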
# three-way interactions - full model
step3 <- MASS::stepAIC(fullModel, ~ .^3, trace = FALSE)
step3$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + parent1 +
## home_val + mstatus + sex + education + job + travtime + car_use +
## bluebook + tif + car_type + red_car + oldclaim + clm_freq +
## revoked + mvr_pts + car_age + urbanicity
##
## Final Model:
## target_amt ~ kidsdriv + age + homekids + yoj + income + home_val +
## mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq +
## mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type +
## car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age +
## income:mstatus + age:home_val + homekids:car_age + yoj:tif +
## kidsdriv:oldclaim + income:car_type + homekids:red_car +
## home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity +
## home_val:oldclaim + kidsdriv:clm_freq + age:clm_freq + age:car_type:urbanicity +
## kidsdriv:age:clm_freq
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1470 80306517739 26904.19
## 2 + homekids:clm_freq 1 6.753112e+08 1469 79631206509 26893.46
## 3 + homekids:car_type 5 1.011300e+09 1464 78619906978 26884.18
## 4 - job 8 3.895629e+08 1472 79009469844 26875.64
## 5 + car_type:urbanicity 4 7.850784e+08 1468 78224391452 26868.58
## 6 + home_val:mstatus 1 4.600644e+08 1467 77764327040 26861.68
## 7 + age:car_type 5 8.447631e+08 1462 76919563928 26855.21
## 8 - education 4 1.752619e+08 1466 77094825815 26850.64
## 9 + kidsdriv:age 1 2.948190e+08 1465 76800006825 26846.86
## 10 + income:mstatus 1 2.600865e+08 1464 76539920341 26843.75
## 11 + age:home_val 1 2.833268e+08 1463 76256593508 26840.16
## 12 + homekids:car_age 1 2.494893e+08 1462 76007104200 26837.21
## 13 + yoj:tif 1 2.338405e+08 1461 75773263655 26834.57
## 14 + kidsdriv:oldclaim 1 2.327973e+08 1460 75540466362 26831.93
## 15 - travtime 1 5.909411e+03 1461 75540472271 26829.93
## 16 + income:car_type 5 5.975858e+08 1456 74942886461 26827.95
## 17 - parent1 1 6.248921e+06 1457 74949135382 26826.08
## 18 - car_use 1 2.510220e+07 1458 74974237583 26824.58
## 19 - bluebook 1 2.596424e+07 1459 75000201824 26823.10
## 20 - revoked 1 3.376960e+07 1460 75033971420 26821.78
## 21 + homekids:red_car 1 1.592836e+08 1459 74874687832 26820.58
## 22 + home_val:sex 1 1.455318e+08 1458 74729155997 26819.64
## 23 + homekids:home_val 1 1.492159e+08 1457 74579940090 26818.63
## 24 + kidsdriv:car_age 1 1.541971e+08 1456 74425743008 26817.51
## 25 + age:urbanicity 1 1.541162e+08 1455 74271626808 26816.38
## 26 + age:car_type:urbanicity 4 1.848856e+09 1451 72422770876 26786.37
## 27 + home_val:oldclaim 1 1.482064e+08 1450 72274564496 26785.28
## 28 + kidsdriv:clm_freq 1 1.109008e+08 1449 72163663686 26784.96
## 29 + age:clm_freq 1 1.759370e+08 1448 71987726674 26783.28
## 30 + kidsdriv:age:clm_freq 1 2.379337e+08 1447 71749793011 26780.29
# final model (based on the three-way interaction search)
finalModel <- lm(
target_amt ~ kidsdriv + age + homekids + yoj + income + home_val +
mstatus + sex + tif + car_type + red_car + oldclaim + clm_freq +
mvr_pts + car_age + urbanicity + homekids:clm_freq + homekids:car_type +
car_type:urbanicity + home_val:mstatus + age:car_type + kidsdriv:age +
income:mstatus + age:home_val + homekids:car_age + yoj:tif +
kidsdriv:oldclaim + income:car_type + homekids:red_car +
home_val:sex + homekids:home_val + kidsdriv:car_age + age:urbanicity +
home_val:oldclaim + kidsdriv:clm_freq + age:clm_freq + age:car_type:urbanicity +
kidsdriv:age:clm_freq,
data = trainSet %>%
dplyr::filter(target_flag == "Yes") %>%
dplyr::select(all_of(variables), all_of(quant_var))
)
summary(finalModel)
##
## Call:
## lm(formula = target_amt ~ kidsdriv + age + homekids + yoj + income +
## home_val + mstatus + sex + tif + car_type + red_car + oldclaim +
## clm_freq + mvr_pts + car_age + urbanicity + homekids:clm_freq +
## homekids:car_type + car_type:urbanicity + home_val:mstatus +
## age:car_type + kidsdriv:age + income:mstatus + age:home_val +
## homekids:car_age + yoj:tif + kidsdriv:oldclaim + income:car_type +
## homekids:red_car + home_val:sex + homekids:home_val + kidsdriv:car_age +
## age:urbanicity + home_val:oldclaim + kidsdriv:clm_freq +
## age:clm_freq + age:car_type:urbanicity + kidsdriv:age:clm_freq,
##     data = trainSet %>% dplyr::filter(target_flag == "Yes") %>%
##         dplyr::select(all_of(variables), all_of(quant_var)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -18504 -3101 -1091 912 75691
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error
## (Intercept) 6157.26 664.87
## kidsdriv -220.46 203.47
## age 421.19 480.96
## homekids 442.93 541.58
## yoj -104.28 201.34
## income -1780.86 525.57
## home_val -77.16 385.25
## mstatusz_No 1194.44 432.93
## sexz_F -1264.66 689.06
## tif -47.34 199.78
## car_typePanel Truck 1227.58 825.18
## car_typePickup -371.90 661.09
## car_typeSports Car 431.36 813.56
## car_typeVan -176.79 798.68
## car_typez_SUV -285.40 710.90
## red_caryes -368.62 548.91
## oldclaim 21.94 214.98
## clm_freq -166.52 219.32
## mvr_pts 259.13 166.20
## car_age -287.20 208.52
## urbanicityz_Highly Rural/ Rural -3096.97 1774.29
## homekids:clm_freq 735.41 228.39
## homekids:car_typePanel Truck 3451.02 835.13
## homekids:car_typePickup 657.81 640.92
## homekids:car_typeSports Car -842.79 724.14
## homekids:car_typeVan -678.26 816.93
## homekids:car_typez_SUV -128.95 611.79
## car_typePanel Truck:urbanicityz_Highly Rural/ Rural NA NA
## car_typePickup:urbanicityz_Highly Rural/ Rural 5557.25 2881.32
## car_typeSports Car:urbanicityz_Highly Rural/ Rural 2089.92 2894.18
## car_typeVan:urbanicityz_Highly Rural/ Rural 20566.09 3644.58
## car_typez_SUV:urbanicityz_Highly Rural/ Rural 2609.11 2294.87
## home_val:mstatusz_No 1567.46 438.06
## age:car_typePanel Truck 1924.33 824.48
## age:car_typePickup -38.02 680.70
## age:car_typeSports Car -391.10 638.73
## age:car_typeVan -2235.39 824.49
## age:car_typez_SUV -375.29 575.19
## kidsdriv:age 266.52 216.77
## income:mstatusz_No 1066.00 388.41
## age:home_val 677.16 192.65
## homekids:car_age 695.12 234.39
## yoj:tif -418.88 180.08
## kidsdriv:oldclaim 476.50 172.21
## income:car_typePanel Truck 1169.47 775.82
## income:car_typePickup 1929.04 673.58
## income:car_typeSports Car 2066.00 689.29
## income:car_typeVan 1701.52 737.94
## income:car_typez_SUV 1564.40 584.45
## homekids:red_caryes -1015.11 470.41
## home_val:sexz_F -682.46 401.28
## homekids:home_val 468.87 210.25
## kidsdriv:car_age -369.25 180.45
## age:urbanicityz_Highly Rural/ Rural -702.84 1300.16
## home_val:oldclaim 272.93 177.81
## kidsdriv:clm_freq -336.48 197.71
## age:clm_freq 328.15 175.17
## age:car_typePanel Truck:urbanicityz_Highly Rural/ Rural NA NA
## age:car_typePickup:urbanicityz_Highly Rural/ Rural 5226.66 3060.49
## age:car_typeSports Car:urbanicityz_Highly Rural/ Rural 882.66 2245.88
## age:car_typeVan:urbanicityz_Highly Rural/ Rural 17335.75 2991.61
## age:car_typez_SUV:urbanicityz_Highly Rural/ Rural 428.86 1815.43
## kidsdriv:age:clm_freq 354.29 161.74
## t value Pr(>|t|)
## (Intercept) 9.261 < 2e-16 ***
## kidsdriv -1.084 0.278761
## age 0.876 0.381325
## homekids 0.818 0.413582
## yoj -0.518 0.604584
## income -3.388 0.000722 ***
## home_val -0.200 0.841287
## mstatusz_No 2.759 0.005871 **
## sexz_F -1.835 0.066659 .
## tif -0.237 0.812718
## car_typePanel Truck 1.488 0.137062
## car_typePickup -0.563 0.573819
## car_typeSports Car 0.530 0.596051
## car_typeVan -0.221 0.824848
## car_typez_SUV -0.401 0.688143
## red_caryes -0.672 0.501981
## oldclaim 0.102 0.918722
## clm_freq -0.759 0.447823
## mvr_pts 1.559 0.119184
## car_age -1.377 0.168620
## urbanicityz_Highly Rural/ Rural -1.745 0.081114 .
## homekids:clm_freq 3.220 0.001311 **
## homekids:car_typePanel Truck 4.132 3.80e-05 ***
## homekids:car_typePickup 1.026 0.304897
## homekids:car_typeSports Car -1.164 0.244676
## homekids:car_typeVan -0.830 0.406534
## homekids:car_typez_SUV -0.211 0.833090
## car_typePanel Truck:urbanicityz_Highly Rural/ Rural NA NA
## car_typePickup:urbanicityz_Highly Rural/ Rural 1.929 0.053962 .
## car_typeSports Car:urbanicityz_Highly Rural/ Rural 0.722 0.470343
## car_typeVan:urbanicityz_Highly Rural/ Rural 5.643 2.01e-08 ***
## car_typez_SUV:urbanicityz_Highly Rural/ Rural 1.137 0.255754
## home_val:mstatusz_No 3.578 0.000357 ***
## age:car_typePanel Truck 2.334 0.019732 *
## age:car_typePickup -0.056 0.955469
## age:car_typeSports Car -0.612 0.540428
## age:car_typeVan -2.711 0.006783 **
## age:car_typez_SUV -0.652 0.514203
## kidsdriv:age 1.229 0.219097
## income:mstatusz_No 2.745 0.006135 **
## age:home_val 3.515 0.000453 ***
## homekids:car_age 2.966 0.003070 **
## yoj:tif -2.326 0.020155 *
## kidsdriv:oldclaim 2.767 0.005731 **
## income:car_typePanel Truck 1.507 0.131927
## income:car_typePickup 2.864 0.004246 **
## income:car_typeSports Car 2.997 0.002770 **
## income:car_typeVan 2.306 0.021265 *
## income:car_typez_SUV 2.677 0.007519 **
## homekids:red_caryes -2.158 0.031097 *
## home_val:sexz_F -1.701 0.089212 .
## homekids:home_val 2.230 0.025895 *
## kidsdriv:car_age -2.046 0.040914 *
## age:urbanicityz_Highly Rural/ Rural -0.541 0.588878
## home_val:oldclaim 1.535 0.125010
## kidsdriv:clm_freq -1.702 0.088983 .
## age:clm_freq 1.873 0.061221 .
## age:car_typePanel Truck:urbanicityz_Highly Rural/ Rural NA NA
## age:car_typePickup:urbanicityz_Highly Rural/ Rural 1.708 0.087891 .
## age:car_typeSports Car:urbanicityz_Highly Rural/ Rural 0.393 0.694368
## age:car_typeVan:urbanicityz_Highly Rural/ Rural 5.795 8.38e-09 ***
## age:car_typez_SUV:urbanicityz_Highly Rural/ Rural 0.236 0.813289
## kidsdriv:age:clm_freq 2.191 0.028644 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7042 on 1447 degrees of freedom
## Multiple R-squared: 0.1271, Adjusted R-squared: 0.09092
## F-statistic: 3.512 on 60 and 1447 DF, p-value: < 2.2e-16
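The adjusted R-squared in the summary can be reproduced from the multiple R-squared and the degrees of freedom (n = 1508 crash rows in the train set, p = 60 estimated slopes, which leaves the 1447 residual degrees of freedom reported above):
r2 <- 0.1271; n <- 1508; p <- 60
1 - (1 - r2) * (n - 1) / (n - p - 1) # ~0.0909, matching the summary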
broom::tidy(finalModel) %>% arrange(p.value) %>% kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 6157.26102 | 664.8728 | 9.2608107 | 0.0000000 |
| age:car_typeVan:urbanicityz_Highly Rural/ Rural | 17335.74875 | 2991.6108 | 5.7947874 | 0.0000000 |
| car_typeVan:urbanicityz_Highly Rural/ Rural | 20566.08610 | 3644.5828 | 5.6429191 | 0.0000000 |
| homekids:car_typePanel Truck | 3451.01882 | 835.1349 | 4.1322889 | 0.0000380 |
| home_val:mstatusz_No | 1567.46217 | 438.0585 | 3.5782032 | 0.0003574 |
| age:home_val | 677.15735 | 192.6520 | 3.5149249 | 0.0004534 |
| income | -1780.85560 | 525.5706 | -3.3884234 | 0.0007218 |
| homekids:clm_freq | 735.40808 | 228.3909 | 3.2199540 | 0.0013106 |
| income:car_typeSports Car | 2065.99967 | 689.2879 | 2.9972959 | 0.0027703 |
| homekids:car_age | 695.12298 | 234.3885 | 2.9656877 | 0.0030696 |
| income:car_typePickup | 1929.04279 | 673.5836 | 2.8638507 | 0.0042456 |
| kidsdriv:oldclaim | 476.49886 | 172.2145 | 2.7668918 | 0.0057314 |
| mstatusz_No | 1194.43930 | 432.9270 | 2.7589854 | 0.0058712 |
| income:mstatusz_No | 1066.00114 | 388.4118 | 2.7445130 | 0.0061350 |
| age:car_typeVan | -2235.39108 | 824.4937 | -2.7112287 | 0.0067827 |
| income:car_typez_SUV | 1564.40159 | 584.4488 | 2.6767129 | 0.0075188 |
| age:car_typePanel Truck | 1924.33174 | 824.4782 | 2.3339996 | 0.0197320 |
| yoj:tif | -418.87507 | 180.0818 | -2.3260261 | 0.0201547 |
| income:car_typeVan | 1701.51669 | 737.9440 | 2.3057532 | 0.0212653 |
| homekids:home_val | 468.86746 | 210.2472 | 2.2300770 | 0.0258952 |
| kidsdriv:age:clm_freq | 354.29002 | 161.7361 | 2.1905435 | 0.0286439 |
| homekids:red_caryes | -1015.10786 | 470.4060 | -2.1579398 | 0.0310966 |
| kidsdriv:car_age | -369.24951 | 180.4533 | -2.0462326 | 0.0409139 |
| car_typePickup:urbanicityz_Highly Rural/ Rural | 5557.24992 | 2881.3229 | 1.9287147 | 0.0539617 |
| age:clm_freq | 328.15357 | 175.1687 | 1.8733572 | 0.0612205 |
| sexz_F | -1264.65757 | 689.0559 | -1.8353484 | 0.0666592 |
| urbanicityz_Highly Rural/ Rural | -3096.97458 | 1774.2871 | -1.7454755 | 0.0811144 |
| age:car_typePickup:urbanicityz_Highly Rural/ Rural | 5226.65734 | 3060.4936 | 1.7077825 | 0.0878912 |
| kidsdriv:clm_freq | -336.48308 | 197.7066 | -1.7019313 | 0.0889831 |
| home_val:sexz_F | -682.46038 | 401.2797 | -1.7007098 | 0.0892124 |
| mvr_pts | 259.13257 | 166.2033 | 1.5591304 | 0.1191842 |
| home_val:oldclaim | 272.92888 | 177.8076 | 1.5349677 | 0.1250104 |
| income:car_typePanel Truck | 1169.47047 | 775.8212 | 1.5073968 | 0.1319273 |
| car_typePanel Truck | 1227.58097 | 825.1824 | 1.4876480 | 0.1370616 |
| car_age | -287.20185 | 208.5190 | -1.3773414 | 0.1686197 |
| kidsdriv:age | 266.51728 | 216.7747 | 1.2294669 | 0.2190967 |
| homekids:car_typeSports Car | -842.79430 | 724.1426 | -1.1638513 | 0.2446761 |
| car_typez_SUV:urbanicityz_Highly Rural/ Rural | 2609.11289 | 2294.8677 | 1.1369339 | 0.2557542 |
| kidsdriv | -220.46485 | 203.4723 | -1.0835129 | 0.2787613 |
| homekids:car_typePickup | 657.80690 | 640.9177 | 1.0263516 | 0.3048974 |
| age | 421.19239 | 480.9643 | 0.8757249 | 0.3813249 |
| homekids:car_typeVan | -678.25917 | 816.9334 | -0.8302502 | 0.4065343 |
| homekids | 442.92558 | 541.5782 | 0.8178423 | 0.4135819 |
| clm_freq | -166.51752 | 219.3169 | -0.7592554 | 0.4478235 |
| car_typeSports Car:urbanicityz_Highly Rural/ Rural | 2089.92132 | 2894.1821 | 0.7221112 | 0.4703428 |
| red_caryes | -368.62000 | 548.9139 | -0.6715443 | 0.5019810 |
| age:car_typez_SUV | -375.29216 | 575.1881 | -0.6524686 | 0.5142026 |
| age:car_typeSports Car | -391.09925 | 638.7256 | -0.6123118 | 0.5404277 |
| car_typePickup | -371.90288 | 661.0853 | -0.5625641 | 0.5738189 |
| age:urbanicityz_Highly Rural/ Rural | -702.84499 | 1300.1613 | -0.5405829 | 0.5888783 |
| car_typeSports Car | 431.35601 | 813.5639 | 0.5302055 | 0.5960508 |
| yoj | -104.27931 | 201.3376 | -0.5179326 | 0.6045845 |
| car_typez_SUV | -285.39708 | 710.9025 | -0.4014574 | 0.6881427 |
| age:car_typeSports Car:urbanicityz_Highly Rural/ Rural | 882.65837 | 2245.8772 | 0.3930128 | 0.6943680 |
| tif | -47.34196 | 199.7847 | -0.2369649 | 0.8127176 |
| age:car_typez_SUV:urbanicityz_Highly Rural/ Rural | 428.85546 | 1815.4302 | 0.2362280 | 0.8132892 |
| car_typeVan | -176.79136 | 798.6800 | -0.2213544 | 0.8248477 |
| homekids:car_typez_SUV | -128.95172 | 611.7876 | -0.2107786 | 0.8330897 |
| home_val | -77.16001 | 385.2538 | -0.2002836 | 0.8412869 |
| oldclaim | 21.94131 | 214.9811 | 0.1020616 | 0.9187219 |
| age:car_typePickup | -38.01724 | 680.7042 | -0.0558499 | 0.9554691 |
# comparison - predicted vs actual target_amt on the test set
testSet$target_amt_predicted <- predict(finalModel, testSet[, variables])
# root mean squared error (rmse)
Metrics::rmse(testSet$target_amt, testSet$target_amt_predicted)
## [1] 7343.597
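Note that this RMSE is computed over all test rows, including the zero-amount no-crash rows. Metrics::rmse is simply the root of the mean squared error, so the same number can be reproduced by hand:
sqrt(mean((testSet$target_amt - testSet$target_amt_predicted)^2))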
From the corrplot and density curves above, we see that nothing is strongly associated with the target variable. We used stepwise regression with up to three-way interactions to find the best model we could (at the risk of overfitting, which would make the model less useful on new data), yet that still only gave us an adjusted R-squared of about 9%. In other words, over 90% of the variance in the actual claim amount is left unexplained by this model. The regression results are less encouraging than the classification results.
dfEval$target_amt <- predict(finalModel, dfEval[, variables])
dfEval %>%
dplyr::filter(target_flag == "Yes") %>%
ggplot(aes(target_amt)) +
geom_density() +
geom_vline(xintercept = median(dfEval$target_amt[dfEval$target_flag == "Yes"]),
linetype = "dashed") +
ggtitle("Distribution of Predicted Target Amount")
dfEval %>% head() %>% kable()
| index | target_flag | target_amt | kidsdriv | age | homekids | yoj | income | parent1 | home_val | mstatus | sex | education | job | travtime | car_use | bluebook | tif | car_type | red_car | oldclaim | clm_freq | revoked | mvr_pts | car_age | urbanicity | OOF_pred_glm | OOF_pred_nb | OOF_pred_rf | OOF_pred_top_glm | OOF_pred_top_nb | OOF_pred_top_gbm | target_flag_prob |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | No | 6783.665 | -0.3337943 | 0.3499326 | -0.6425179 | 0.1488987 | 0.6619537 | No | -0.9915932 | z_No | M | Bachelors | Manager | -0.4549095 | Private | 0.0016269 | -1.0689293 | Van | yes | -0.6571645 | -0.7111922 | No | 0.1062001 | 0.3150395 | Highly Urban/ Urban | 0.1220784 | 0.0013785 | 0.092 | 0.0915576 | 0.0098567 | 0.0886053 | 0.063 |
| 9 | No | 5900.963 | 1.7198080 | -0.5884831 | 0.2530751 | 0.1488987 | 0.6011222 | Yes | -0.9915932 | z_No | M | z_High School | Manager | -0.7729272 | Private | -0.3709252 | 0.1901913 | Minivan | no | 0.4022653 | 0.1679432 | No | 0.1062001 | -1.2457634 | Highly Urban/ Urban | 0.2336235 | 0.2439135 | 0.328 | 0.2164859 | 0.2120411 | 0.2684035 | 0.232 |
| 10 | No | 4250.961 | -0.3337943 | -0.1192753 | 1.1486681 | 0.3887064 | 0.3491062 | Yes | -0.9915932 | z_No | z_F | z_High School | z_Blue Collar | -0.2004953 | Commercial | 1.1124265 | 1.1974878 | z_SUV | no | -0.6571645 | -0.7111922 | No | -0.8014825 | 0.3150395 | z_Highly Rural/ Rural | 0.0908932 | 0.0091209 | 0.012 | 0.0667108 | 0.0056237 | 0.0872490 | 0.053 |
| 18 | No | 2521.773 | -0.3337943 | -1.1749930 | 1.1486681 | 0.1488987 | -0.4521312 | Yes | -0.9915932 | z_No | M | z_High School | Clerical | 2.5980610 | Private | 1.5329762 | 0.1901913 | Pickup | no | -0.6571645 | -0.7111922 | Yes | -0.8014825 | -0.7254958 | z_Highly Rural/ Rural | 0.2403932 | 0.0106405 | 0.112 | 0.1351010 | 0.0187269 | 0.1722218 | 0.109 |
| 21 | No | 5053.206 | -0.3337943 | 1.6402542 | -0.6425179 | 0.3887064 | 1.5865921 | No | -0.9915932 | z_No | M | z_High School | Manager | 0.7535580 | Private | -0.8166164 | -1.0689293 | Minivan | yes | 1.2701376 | 1.0470786 | No | 1.0138828 | -1.2457634 | Highly Urban/ Urban | 0.2392829 | 0.0536335 | 0.256 | 0.1920650 | 0.0462885 | 0.2155320 | 0.151 |
| 30 | No | 5818.214 | -0.3337943 | 0.1153287 | -0.6425179 | 0.8683218 | -1.3420087 | No | 0.3708270 | Yes | M | Bachelors | Professional | -1.6633769 | Commercial | 0.3696079 | -1.0689293 | Panel Truck | no | -0.1254949 | 0.1679432 | No | 0.1062001 | 0.6618846 | Highly Urban/ Urban | 0.1864743 | 0.0259940 | 0.226 | 0.1558029 | 0.0287481 | 0.1669544 | 0.117 |