Data621 HW4

Load Libraries

library(tidyverse)
library(tidymodels)
library(ggplot2)
library(Ecfun)
library(vip)
library(themis)
rm(list=ls())
#setwd('Code/HW_4')

Load data

We will load the data from the Cloud Servers

df_train_full <- read_csv('http://www.my-cunymsds.com/data621/insurance_training_data.csv')

df_validation <- read_csv('http://www.my-cunymsds.com/data621/insurance-evaluation-data.csv')

Exploratory Data Analysis (EDA)

Let’s take a look at the general structure of the dataset

str(df_train_full)
## spc_tbl_ [8,161 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ INDEX      : num [1:8161] 1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET_FLAG: num [1:8161] 0 0 0 0 0 1 0 1 1 0 ...
##  $ TARGET_AMT : num [1:8161] 0 0 0 0 0 ...
##  $ KIDSDRIV   : num [1:8161] 0 0 0 0 0 0 0 1 0 0 ...
##  $ AGE        : num [1:8161] 60 43 35 51 50 34 54 37 34 50 ...
##  $ HOMEKIDS   : num [1:8161] 0 0 1 0 0 1 0 2 0 0 ...
##  $ YOJ        : num [1:8161] 11 11 10 14 NA 12 NA NA 10 7 ...
##  $ INCOME     : chr [1:8161] "$67,349" "$91,449" "$16,039" NA ...
##  $ PARENT1    : chr [1:8161] "No" "No" "No" "No" ...
##  $ HOME_VAL   : chr [1:8161] "$0" "$257,252" "$124,191" "$306,251" ...
##  $ MSTATUS    : chr [1:8161] "z_No" "z_No" "Yes" "Yes" ...
##  $ SEX        : chr [1:8161] "M" "M" "z_F" "M" ...
##  $ EDUCATION  : chr [1:8161] "PhD" "z_High School" "z_High School" "<High School" ...
##  $ JOB        : chr [1:8161] "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
##  $ TRAVTIME   : num [1:8161] 14 22 5 32 36 46 33 44 34 48 ...
##  $ CAR_USE    : chr [1:8161] "Private" "Commercial" "Private" "Private" ...
##  $ BLUEBOOK   : chr [1:8161] "$14,230" "$14,940" "$4,010" "$15,440" ...
##  $ TIF        : num [1:8161] 11 1 4 7 1 1 1 1 1 7 ...
##  $ CAR_TYPE   : chr [1:8161] "Minivan" "Minivan" "z_SUV" "Minivan" ...
##  $ RED_CAR    : chr [1:8161] "yes" "yes" "no" "yes" ...
##  $ OLDCLAIM   : chr [1:8161] "$4,461" "$0" "$38,690" "$0" ...
##  $ CLM_FREQ   : num [1:8161] 2 0 2 0 2 0 0 1 0 0 ...
##  $ REVOKED    : chr [1:8161] "No" "No" "No" "No" ...
##  $ MVR_PTS    : num [1:8161] 3 0 3 0 3 0 0 10 0 1 ...
##  $ CAR_AGE    : num [1:8161] 18 1 10 6 17 7 1 7 1 17 ...
##  $ URBANICITY : chr [1:8161] "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   INDEX = col_double(),
##   ..   TARGET_FLAG = col_double(),
##   ..   TARGET_AMT = col_double(),
##   ..   KIDSDRIV = col_double(),
##   ..   AGE = col_double(),
##   ..   HOMEKIDS = col_double(),
##   ..   YOJ = col_double(),
##   ..   INCOME = col_character(),
##   ..   PARENT1 = col_character(),
##   ..   HOME_VAL = col_character(),
##   ..   MSTATUS = col_character(),
##   ..   SEX = col_character(),
##   ..   EDUCATION = col_character(),
##   ..   JOB = col_character(),
##   ..   TRAVTIME = col_double(),
##   ..   CAR_USE = col_character(),
##   ..   BLUEBOOK = col_character(),
##   ..   TIF = col_double(),
##   ..   CAR_TYPE = col_character(),
##   ..   RED_CAR = col_character(),
##   ..   OLDCLAIM = col_character(),
##   ..   CLM_FREQ = col_double(),
##   ..   REVOKED = col_character(),
##   ..   MVR_PTS = col_double(),
##   ..   CAR_AGE = col_double(),
##   ..   URBANICITY = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

We see several character columns. This we would need to change to FACTORS for later perform a DUMMY VARIABLE transformation to do the classification and regression.

Let’s see how imbalanced the data set is

df_train_full %>%
  group_by(TARGET_FLAG) %>%
  summarise(n = n())
## # A tibble: 2 × 2
##   TARGET_FLAG     n
##         <dbl> <int>
## 1           0  6008
## 2           1  2153

Roughly 2/3 are ZEROS. This is a fairly imbalanced, as one class dominates the other. We will need to do something like SMOTE (Synthetic Minority Oversampling Technique) to take care of this.

Let’s looks at the distribution of the TARGET AMOUNTS variable

# plot
p <- df_train_full %>%
  # filter( price<300 ) %>%
  ggplot( aes(x=TARGET_AMT)) +
  geom_histogram(bins = 20) +
  scale_x_continuous(trans='log10') +
    ggtitle("Distribution of Target Amounts")

p

We can see that Target Amounts is heavily skewed to the right with most values around 2,000-5,000 but with some values going above 100,000

Outliers are many times an issue when we are trying to forecast. Let’s look at a correlation plot but only on values larger than 20,000 to see if we have some strong correlations.

library(corrplot)

M <- df_train_full %>%
  filter(TARGET_AMT>20000)

M2 <- cor(M[,sapply(M,is.numeric)],use="complete.obs",method="pearson")

M2 %>% 
  corrplot(type = "upper",
         addCoef.col = TRUE,
         method = "shade",
         diag = TRUE, number.cex = 0.5, tl.cex = 0.7)

Since we have so many character columns, very few correlations show up, but we can see TARGET amount doesn’t have strong correlations with the numerical data as is. Let’s fix the character fields and re-run

df_train_full$INCOME <- parseDollars(df_train_full$INCOME)
df_train_full$HOME_VAL <- parseDollars(df_train_full$HOME_VAL)
df_train_full$BLUEBOOK <- parseDollars(df_train_full$BLUEBOOK)
df_train_full$OLDCLAIM <- parseDollars(df_train_full$OLDCLAIM)
df_train_full$TARGET_FLAG <- as.factor(df_train_full$TARGET_FLAG)

df_train_full$PARENT1 <- as.factor(df_train_full$PARENT1)
df_train_full$MSTATUS <- as.factor(df_train_full$MSTATUS)
df_train_full$SEX <- as.factor(df_train_full$SEX)
df_train_full$EDUCATION <- as.factor(df_train_full$EDUCATION)
df_train_full$JOB <- as.factor(df_train_full$JOB)
df_train_full$CAR_USE <- as.factor(df_train_full$CAR_USE)
df_train_full$CAR_TYPE <- as.factor(df_train_full$CAR_TYPE)
df_train_full$RED_CAR <- as.factor(df_train_full$RED_CAR)
df_train_full$CLM_FREQ <- as.factor(df_train_full$CLM_FREQ)
df_train_full$REVOKED <- as.factor(df_train_full$REVOKED)
df_train_full$URBANICITY <- as.factor(df_train_full$URBANICITY)

str(df_train_full)
## spc_tbl_ [8,161 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ INDEX      : num [1:8161] 1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET_FLAG: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 2 2 1 ...
##  $ TARGET_AMT : num [1:8161] 0 0 0 0 0 ...
##  $ KIDSDRIV   : num [1:8161] 0 0 0 0 0 0 0 1 0 0 ...
##  $ AGE        : num [1:8161] 60 43 35 51 50 34 54 37 34 50 ...
##  $ HOMEKIDS   : num [1:8161] 0 0 1 0 0 1 0 2 0 0 ...
##  $ YOJ        : num [1:8161] 11 11 10 14 NA 12 NA NA 10 7 ...
##  $ INCOME     : num [1:8161] 67349 91449 16039 NA 114986 ...
##  $ PARENT1    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
##  $ HOME_VAL   : num [1:8161] 0 257252 124191 306251 243925 ...
##  $ MSTATUS    : Factor w/ 2 levels "Yes","z_No": 2 2 1 1 1 2 1 1 2 2 ...
##  $ SEX        : Factor w/ 2 levels "M","z_F": 1 1 2 1 2 2 2 1 2 1 ...
##  $ EDUCATION  : Factor w/ 5 levels "<High School",..: 4 5 5 1 4 2 1 2 2 2 ...
##  $ JOB        : Factor w/ 8 levels "Clerical","Doctor",..: 6 8 1 8 2 8 8 8 1 6 ...
##  $ TRAVTIME   : num [1:8161] 14 22 5 32 36 46 33 44 34 48 ...
##  $ CAR_USE    : Factor w/ 2 levels "Commercial","Private": 2 1 2 2 2 1 2 1 2 1 ...
##  $ BLUEBOOK   : num [1:8161] 14230 14940 4010 15440 18000 ...
##  $ TIF        : num [1:8161] 11 1 4 7 1 1 1 1 1 7 ...
##  $ CAR_TYPE   : Factor w/ 6 levels "Minivan","Panel Truck",..: 1 1 6 1 6 4 6 5 6 5 ...
##  $ RED_CAR    : Factor w/ 2 levels "no","yes": 2 2 1 2 1 1 1 2 1 1 ...
##  $ OLDCLAIM   : num [1:8161] 4461 0 38690 0 19217 ...
##  $ CLM_FREQ   : Factor w/ 6 levels "0","1","2","3",..: 3 1 3 1 3 1 1 2 1 1 ...
##  $ REVOKED    : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
##  $ MVR_PTS    : num [1:8161] 3 0 3 0 3 0 0 10 0 1 ...
##  $ CAR_AGE    : num [1:8161] 18 1 10 6 17 7 1 7 1 17 ...
##  $ URBANICITY : Factor w/ 2 levels "Highly Urban/ Urban",..: 1 1 1 1 1 1 1 1 1 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   INDEX = col_double(),
##   ..   TARGET_FLAG = col_double(),
##   ..   TARGET_AMT = col_double(),
##   ..   KIDSDRIV = col_double(),
##   ..   AGE = col_double(),
##   ..   HOMEKIDS = col_double(),
##   ..   YOJ = col_double(),
##   ..   INCOME = col_character(),
##   ..   PARENT1 = col_character(),
##   ..   HOME_VAL = col_character(),
##   ..   MSTATUS = col_character(),
##   ..   SEX = col_character(),
##   ..   EDUCATION = col_character(),
##   ..   JOB = col_character(),
##   ..   TRAVTIME = col_double(),
##   ..   CAR_USE = col_character(),
##   ..   BLUEBOOK = col_character(),
##   ..   TIF = col_double(),
##   ..   CAR_TYPE = col_character(),
##   ..   RED_CAR = col_character(),
##   ..   OLDCLAIM = col_character(),
##   ..   CLM_FREQ = col_double(),
##   ..   REVOKED = col_character(),
##   ..   MVR_PTS = col_double(),
##   ..   CAR_AGE = col_double(),
##   ..   URBANICITY = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
#library(corrplot)

M <- df_train_full %>%
  filter(TARGET_AMT>20000)

M2 <- cor(M[,sapply(M,is.numeric)],use="complete.obs",method="pearson")

M2 %>% 
  corrplot(type = "upper",
         addCoef.col = TRUE,
         method = "shade",
         diag = TRUE, number.cex = 0.5, tl.cex = 0.7)

Now we see some correlations with TARGET_AMT and BLUEBOOK and HOME_VAL

df_train_full %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_density()                         # as density

With so many fields expected do en up as dummy variables, we still could benefit from doing some transformations to the numerical data. We will address them through TIDYMODEL RECIPES.

Data Transformations and Feature Engineering

Let’s apply same transformations to the validation dataset.

df_validation$INCOME <- parseDollars(df_validation$INCOME)
df_validation$HOME_VAL <- parseDollars(df_validation$HOME_VAL)
df_validation$BLUEBOOK <- parseDollars(df_validation$BLUEBOOK)
df_validation$OLDCLAIM <- parseDollars(df_validation$OLDCLAIM)
df_validation$TARGET_FLAG <- as.factor(df_validation$TARGET_FLAG)

df_validation$PARENT1 <- as.factor(df_validation$PARENT1)
df_validation$MSTATUS <- as.factor(df_validation$MSTATUS)
df_validation$SEX <- as.factor(df_validation$SEX)
df_validation$EDUCATION <- as.factor(df_validation$EDUCATION)
df_validation$JOB <- as.factor(df_validation$JOB)
df_validation$CAR_USE <- as.factor(df_validation$CAR_USE)
df_validation$CAR_TYPE <- as.factor(df_validation$CAR_TYPE)
df_validation$RED_CAR <- as.factor(df_validation$RED_CAR)
df_validation$CLM_FREQ <- as.factor(df_validation$CLM_FREQ)
df_validation$REVOKED <- as.factor(df_validation$REVOKED)
df_validation$URBANICITY <- as.factor(df_validation$URBANICITY)

Let’s further split the training dataset into training and validation

set.seed(456)
data_split <- initial_split(df_train_full)
data_train <- training(data_split)
data_test <- testing(data_split)

# Not needed here since there is no tuning required
data_folds <- bootstraps(data_train, times = 10,)
data_folds2 <- vfold_cv(data_train, v=10)

Let’s transform data using Tidymodels Recipes. We will do a few recipes. This would give 5 different data transformations we will test. In some cases we will impute NA’s values using, KNN, Median, Mode and Mean. We will also apply normalization. Finally we will use SMOTE to help the issues we have of class imbalances.

data_recipe1 <-
  recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
  update_role(INDEX, new_role = "id variable") %>%  #ignores this var
  update_role(TARGET_AMT, new_role = "id variable") %>%  #ignores this var
  step_impute_knn(all_predictors(), neighbors =5) %>% #impute if necessary
  step_other(all_factor_predictors(), threshold = 0.2) %>%
  step_dummy(all_factor_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(TARGET_FLAG)
data_recipe2 <-
  recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
  update_role(INDEX, new_role = "id variable") %>%  #ignores this var
  update_role(TARGET_AMT, new_role = "id variable") %>%  #ignores this var
  step_impute_knn(all_predictors(), neighbors = 50) %>% #impute if necessary
  step_other(all_factor_predictors(), threshold = 0.1) %>%
  step_dummy(all_factor_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(TARGET_FLAG)
data_recipe3 <-
  recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
  update_role(INDEX, new_role = "id variable") %>%  #ignores this var
  update_role(TARGET_AMT, new_role = "id variable") %>%  #ignores this var
  step_impute_mean(all_numeric_predictors()) %>% #impute if necessary
  step_impute_mode(all_factor_predictors()) %>% #impute if necessary
  step_other(all_factor_predictors(), threshold = 0.2) %>%
  step_dummy(all_factor_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(TARGET_FLAG)
data_recipe4 <-
  recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
  update_role(INDEX, new_role = "id variable") %>%  #ignores this var
  update_role(TARGET_AMT, new_role = "id variable") %>%  #ignores this var
  step_impute_median(all_numeric_predictors()) %>% #impute if necessary
  step_impute_mode(all_factor_predictors()) %>% #impute if necessary
  step_dummy(all_factor_predictors())
data_recipe5 <-
  recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
  update_role(INDEX, new_role = "id variable") %>%  #ignores this var
  update_role(TARGET_AMT, new_role = "id variable") %>%  #ignores this var
  step_impute_median(all_numeric_predictors()) %>% #impute if necessary
  step_impute_mode(all_factor_predictors()) %>% #impute if necessary
  step_other(all_factor_predictors(), threshold = 0.2) %>%
  step_dummy(all_factor_predictors())

Here we can see how the transformed data looks like

baked_df <- data_recipe1 %>% prep() %>% bake(new_data=NULL)
head(baked_df)
## # A tibble: 6 × 29
##   INDEX TARGET_AMT KIDSDRIV    AGE HOMEK…¹    YOJ INCOME HOME_…² TRAVT…³ BLUEB…⁴
##   <dbl>      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
## 1  7416          0   -0.336 -0.448  -0.646 -0.121  3.78    3.02   -0.288   3.30 
## 2  7083          0    3.55  -0.914   2.04  -0.221 -1.17   -0.803  -0.850  -0.842
## 3  8940          0   -0.336  1.77   -0.646  0.876  0.545   1.58   -1.54    0.996
## 4  8464          0   -0.336  0.252  -0.646 -0.171 -0.952  -0.307   1.40   -0.372
## 5  3182          0   -0.336  0.136  -0.646  0.128  0.904  -1.22   -0.413   0.606
## 6  6783          0   -0.336  1.07   -0.646  0.876  0.393   0.724  -0.662   1.88 
## # … with 19 more variables: TIF <dbl>, OLDCLAIM <dbl>, MVR_PTS <dbl>,
## #   CAR_AGE <dbl>, TARGET_FLAG <fct>, PARENT1_other <dbl>, MSTATUS_z_No <dbl>,
## #   SEX_z_F <dbl>, EDUCATION_Masters <dbl>, EDUCATION_z_High.School <dbl>,
## #   EDUCATION_other <dbl>, JOB_other <dbl>, CAR_USE_Private <dbl>,
## #   CAR_TYPE_z_SUV <dbl>, CAR_TYPE_other <dbl>, RED_CAR_yes <dbl>,
## #   CLM_FREQ_other <dbl>, REVOKED_other <dbl>,
## #   URBANICITY_z_Highly.Rural..Rural <dbl>, and abbreviated variable names …

Machine Learning - CLASSIFICATION part

Define Models to use

Let’s define the Classification model to use.

#Logistic regression
logistic_spec <-
  logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

#XGBoost
xgb_spec <- boost_tree(
  trees = 1000,
  tree_depth = tune(), min_n = tune(),
  sample_size = tune(), mtry = tune(),  
  learn_rate = tune()                          
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

Define the workflow

workflow_set <-workflow_set(
  preproc = list(data_recipe1,data_recipe2,data_recipe3,data_recipe4, data_recipe5),
  models = list(logistic_spec, xgb_spec),
  cross = TRUE
  )

workflow_set
## # A workflow set/tibble: 10 × 4
##    wflow_id              info             option    result    
##    <chr>                 <list>           <list>    <list>    
##  1 recipe_1_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
##  2 recipe_1_boost_tree   <tibble [1 × 4]> <opts[0]> <list [0]>
##  3 recipe_2_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
##  4 recipe_2_boost_tree   <tibble [1 × 4]> <opts[0]> <list [0]>
##  5 recipe_3_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
##  6 recipe_3_boost_tree   <tibble [1 × 4]> <opts[0]> <list [0]>
##  7 recipe_4_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
##  8 recipe_4_boost_tree   <tibble [1 × 4]> <opts[0]> <list [0]>
##  9 recipe_5_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
## 10 recipe_5_boost_tree   <tibble [1 × 4]> <opts[0]> <list [0]>

Fit the stack of models

Here we fits the models and use hyperparameter tuning using 20 levels.

# 
RUN = FALSE
if (RUN) {
doParallel::registerDoParallel()
start.time <- Sys.time()
start.time
fit_workflows <- workflow_set %>%  
  workflow_map(
    seed = 888,  
    fn = "tune_grid",
    grid = 20,
    resamples = data_folds2,
    verbose = TRUE
  )

end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
doParallel::stopImplicitCluster()
}

Save resulting model (backup)

Let’s save/load model.

if (RUN) {
saved_insurance_modelset <- fit_workflows
saveRDS(saved_insurance_modelset, "saved_insurance_fit3.rds")
}
########
if (!RUN) {
fit_workflows <- readRDS("saved_insurance_fit3.rds")
}

Review Results of all models

autoplot(fit_workflows)

collect_metrics(fit_workflows)
## # A tibble: 210 × 9
##    wflow_id            .config preproc model .metric .esti…¹  mean     n std_err
##    <chr>               <chr>   <chr>   <chr> <chr>   <chr>   <dbl> <int>   <dbl>
##  1 recipe_1_logistic_… Prepro… recipe  logi… accura… binary  0.724    10 0.00704
##  2 recipe_1_logistic_… Prepro… recipe  logi… roc_auc binary  0.807    10 0.00705
##  3 recipe_1_boost_tree Prepro… recipe  boos… accura… binary  0.731    10 0.00434
##  4 recipe_1_boost_tree Prepro… recipe  boos… roc_auc binary  0.793    10 0.00647
##  5 recipe_1_boost_tree Prepro… recipe  boos… accura… binary  0.761    10 0.00584
##  6 recipe_1_boost_tree Prepro… recipe  boos… roc_auc binary  0.808    10 0.00613
##  7 recipe_1_boost_tree Prepro… recipe  boos… accura… binary  0.783    10 0.00535
##  8 recipe_1_boost_tree Prepro… recipe  boos… roc_auc binary  0.811    10 0.00655
##  9 recipe_1_boost_tree Prepro… recipe  boos… accura… binary  0.768    10 0.00412
## 10 recipe_1_boost_tree Prepro… recipe  boos… roc_auc binary  0.813    10 0.00637
## # … with 200 more rows, and abbreviated variable name ¹​.estimator
#rank_results(fit_workflows, rank_metric = "accuracy", select_best = TRUE)
rank_results(fit_workflows, rank_metric = "roc_auc", select_best = TRUE)
## # A tibble: 20 × 9
##    wflow_id              .config .metric  mean std_err     n prepr…¹ model  rank
##    <chr>                 <chr>   <chr>   <dbl>   <dbl> <int> <chr>   <chr> <int>
##  1 recipe_4_boost_tree   Prepro… accura… 0.793 0.00500    10 recipe  boos…     1
##  2 recipe_4_boost_tree   Prepro… roc_auc 0.822 0.00683    10 recipe  boos…     1
##  3 recipe_2_boost_tree   Prepro… accura… 0.784 0.00441    10 recipe  boos…     2
##  4 recipe_2_boost_tree   Prepro… roc_auc 0.818 0.00636    10 recipe  boos…     2
##  5 recipe_5_boost_tree   Prepro… accura… 0.797 0.00465    10 recipe  boos…     3
##  6 recipe_5_boost_tree   Prepro… roc_auc 0.817 0.00672    10 recipe  boos…     3
##  7 recipe_3_boost_tree   Prepro… accura… 0.779 0.00555    10 recipe  boos…     4
##  8 recipe_3_boost_tree   Prepro… roc_auc 0.814 0.00649    10 recipe  boos…     4
##  9 recipe_1_boost_tree   Prepro… accura… 0.768 0.00412    10 recipe  boos…     5
## 10 recipe_1_boost_tree   Prepro… roc_auc 0.813 0.00637    10 recipe  boos…     5
## 11 recipe_4_logistic_reg Prepro… accura… 0.788 0.00468    10 recipe  logi…     6
## 12 recipe_4_logistic_reg Prepro… roc_auc 0.810 0.00707    10 recipe  logi…     6
## 13 recipe_2_logistic_reg Prepro… accura… 0.728 0.00735    10 recipe  logi…     7
## 14 recipe_2_logistic_reg Prepro… roc_auc 0.809 0.00742    10 recipe  logi…     7
## 15 recipe_5_logistic_reg Prepro… accura… 0.792 0.00469    10 recipe  logi…     8
## 16 recipe_5_logistic_reg Prepro… roc_auc 0.808 0.00691    10 recipe  logi…     8
## 17 recipe_1_logistic_reg Prepro… accura… 0.724 0.00704    10 recipe  logi…     9
## 18 recipe_1_logistic_reg Prepro… roc_auc 0.807 0.00705    10 recipe  logi…     9
## 19 recipe_3_logistic_reg Prepro… accura… 0.725 0.00662    10 recipe  logi…    10
## 20 recipe_3_logistic_reg Prepro… roc_auc 0.807 0.00675    10 recipe  logi…    10
## # … with abbreviated variable name ¹​preprocessor

Based on ROC_AUC and ACCURACY XGBoost model had the best performance.

fit_workflows %>%
  collect_metrics()
## # A tibble: 210 × 9
##    wflow_id            .config preproc model .metric .esti…¹  mean     n std_err
##    <chr>               <chr>   <chr>   <chr> <chr>   <chr>   <dbl> <int>   <dbl>
##  1 recipe_1_logistic_… Prepro… recipe  logi… accura… binary  0.724    10 0.00704
##  2 recipe_1_logistic_… Prepro… recipe  logi… roc_auc binary  0.807    10 0.00705
##  3 recipe_1_boost_tree Prepro… recipe  boos… accura… binary  0.731    10 0.00434
##  4 recipe_1_boost_tree Prepro… recipe  boos… roc_auc binary  0.793    10 0.00647
##  5 recipe_1_boost_tree Prepro… recipe  boos… accura… binary  0.761    10 0.00584
##  6 recipe_1_boost_tree Prepro… recipe  boos… roc_auc binary  0.808    10 0.00613
##  7 recipe_1_boost_tree Prepro… recipe  boos… accura… binary  0.783    10 0.00535
##  8 recipe_1_boost_tree Prepro… recipe  boos… roc_auc binary  0.811    10 0.00655
##  9 recipe_1_boost_tree Prepro… recipe  boos… accura… binary  0.768    10 0.00412
## 10 recipe_1_boost_tree Prepro… recipe  boos… roc_auc binary  0.813    10 0.00637
## # … with 200 more rows, and abbreviated variable name ¹​.estimator

Pick and fit best model

We will select best model based on ROC_AUC

metric <- "roc_auc"

best_workflow_id <- fit_workflows %>% 
  rank_results(
    rank_metric = metric,
    select_best = TRUE
  ) %>% 
  dplyr::slice(1) %>% 
  pull(wflow_id)

workflow_best <- extract_workflow(fit_workflows, id = best_workflow_id)

best_workflow_id
## [1] "recipe_4_boost_tree"

The best model based on ROC_AUC is XGBoost with Recipe 4

workflow_best_tuned <- fit_workflows[fit_workflows$wflow_id == best_workflow_id,"result"][[1]][[1]]

workflow_best_tuned
## # Tuning results
## # 10-fold cross-validation 
## # A tibble: 10 × 4
##    splits             id     .metrics          .notes          
##    <list>             <chr>  <list>            <list>          
##  1 <split [5508/612]> Fold01 <tibble [40 × 9]> <tibble [0 × 3]>
##  2 <split [5508/612]> Fold02 <tibble [40 × 9]> <tibble [0 × 3]>
##  3 <split [5508/612]> Fold03 <tibble [40 × 9]> <tibble [0 × 3]>
##  4 <split [5508/612]> Fold04 <tibble [40 × 9]> <tibble [0 × 3]>
##  5 <split [5508/612]> Fold05 <tibble [40 × 9]> <tibble [0 × 3]>
##  6 <split [5508/612]> Fold06 <tibble [40 × 9]> <tibble [0 × 3]>
##  7 <split [5508/612]> Fold07 <tibble [40 × 9]> <tibble [0 × 3]>
##  8 <split [5508/612]> Fold08 <tibble [40 × 9]> <tibble [0 × 3]>
##  9 <split [5508/612]> Fold09 <tibble [40 × 9]> <tibble [0 × 3]>
## 10 <split [5508/612]> Fold10 <tibble [40 × 9]> <tibble [0 × 3]>
collect_metrics(workflow_best_tuned)
## # A tibble: 40 × 11
##     mtry min_n tree_depth learn_rate sampl…¹ .metric .esti…²  mean     n std_err
##    <int> <int>      <int>      <dbl>   <dbl> <chr>   <chr>   <dbl> <int>   <dbl>
##  1    38     5          6    0.00554   0.571 accura… binary  0.793    10 0.00500
##  2    38     5          6    0.00554   0.571 roc_auc binary  0.822    10 0.00683
##  3    33    17          2    0.0311    0.800 accura… binary  0.795    10 0.00512
##  4    33    17          2    0.0311    0.800 roc_auc binary  0.819    10 0.00711
##  5    17    38          8    0.279     0.238 accura… binary  0.754    10 0.00539
##  6    17    38          8    0.279     0.238 roc_auc binary  0.753    10 0.00598
##  7    32     9         12    0.0203    0.744 accura… binary  0.787    10 0.00447
##  8    32     9         12    0.0203    0.744 roc_auc binary  0.814    10 0.00678
##  9    29    27         14    0.00911   0.842 accura… binary  0.795    10 0.00436
## 10    29    27         14    0.00911   0.842 roc_auc binary  0.820    10 0.00672
## # … with 30 more rows, 1 more variable: .config <chr>, and abbreviated variable
## #   names ¹​sample_size, ²​.estimator
autoplot(workflow_best_tuned)

select_best(workflow_best_tuned, "roc_auc")
## # A tibble: 1 × 6
##    mtry min_n tree_depth learn_rate sample_size .config              
##   <int> <int>      <int>      <dbl>       <dbl> <chr>                
## 1    38     5          6    0.00554       0.571 Preprocessor1_Model01

Evaluate Model

Let’s test it with the unseen test data

workflow_best_final <- finalize_workflow(workflow_best, select_best(workflow_best_tuned, "roc_auc"))

doParallel::registerDoParallel()

workflow_best_final_fit <- workflow_best_final %>% 
  last_fit(
    split = data_split
  )

doParallel::stopImplicitCluster()

workflow_best_final_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [6120/2041]> train/test split <tibble> <tibble> <tibble>     <workflow>

Peformance results of selected model and hyperparameters

workflow_best_final_fit %>% 
  collect_metrics()
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.791 Preprocessor1_Model1
## 2 roc_auc  binary         0.818 Preprocessor1_Model1

Results with TEST DATA were 79% accuracy.

Let’s see a confusion matrix.

workflow_best_final_fit %>%
  collect_predictions() %>%
  conf_mat(TARGET_FLAG, .pred_class)
##           Truth
## Prediction    0    1
##          0 1385  330
##          1   96  230

Let’s see which ones were the most important variables

imp_spec <- xgb_spec %>%
  finalize_model(select_best(workflow_best_tuned,"roc_auc")) %>%
  set_engine("xgboost", importance = "permutation")

#imp_spec <- logistic_spec %>%
#  finalize_model(select_best(house_tune)) %>%
#  set_engine("glm", importance = "permutation")

workflow() %>%
  add_recipe(data_recipe4) %>%
# add_model(imp_spec) %>%
  add_model(imp_spec) %>%
  fit(data_train) %>%
  #pull_workflow_fit() %>%
  extract_fit_parsnip() %>%
  vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))
## [12:22:09] WARNING: amalgamation/../src/learner.cc:627: 
## Parameters: { "importance" } might not be used.
## 
##   This could be a false alarm, with some parameters getting used by language bindings but
##   then being mistakenly passed down to XGBoost core, or some parameter actually being used
##   but getting flagged wrongly here. Please open an issue if you find any such cases.

Let’s predict on the validation set

my_predictions1 <- workflow_best_final_fit$.workflow[[1]] %>%
  predict(df_validation)

Save the data in .CSV

my_predictions2 <- as_tibble(df_validation$INDEX)
my_predictions2$pred_class <- my_predictions1$.pred_class
write.csv(my_predictions2,"JF_predictions.csv")

Machine Learning - REGRESSION part

Data Transformation

We would need to define different data transformation for the the regression. Let’s transform data using Tidymodels Recipes. We will do a few recipes

data_recipe1 <-
  recipe(formula = TARGET_AMT ~ ., data = data_train) %>%
  update_role(INDEX, new_role = "id variable") %>%  #ignores this var
  update_role(TARGET_FLAG, new_role = "id variable") %>%  #ignores this var
  step_interact(terms = ~ HOME_VAL:BLUEBOOK) %>%
  step_impute_knn(all_predictors(), neighbors =5) %>% #impute if necessary
  step_other(all_factor_predictors(), threshold = 0.2) %>%
  step_dummy(all_factor_predictors()) %>%
  #step_smote(TARGET_FLAG) %>%
  step_normalize(all_numeric_predictors())
data_recipe2 <-
  recipe(formula = TARGET_AMT ~ ., data = data_train) %>%
  update_role(INDEX, new_role = "id variable") %>%  #ignores this var
  update_role(TARGET_FLAG, new_role = "id variable") %>%  #ignores this var
  step_impute_median(all_numeric_predictors()) %>% #impute if necessary
  step_impute_mode(all_factor_predictors()) %>% #impute if necessary
  step_dummy(all_factor_predictors())
data_recipe3 <-
  recipe(formula = TARGET_AMT ~ BLUEBOOK+CAR_TYPE+CAR_AGE, data = data_train) %>%
  step_impute_mean(all_numeric_predictors()) %>% #impute if necessary
  step_impute_mode(all_factor_predictors()) %>%
  step_dummy(all_factor_predictors()) %>%
  step_normalize(all_numeric_predictors())
data_recipe4 <-
  recipe(formula = TARGET_AMT ~ BLUEBOOK+CAR_TYPE+CAR_AGE, data = data_train) %>%
  step_impute_median(all_numeric_predictors()) %>% #impute if necessary
  step_impute_knn(all_factor_predictors(), neighbors = 10) %>%
  step_dummy(all_factor_predictors())
data_recipe5 <-
  recipe(formula = TARGET_AMT ~ BLUEBOOK+CAR_TYPE+CAR_AGE, data = data_train) %>%
  step_impute_linear(all_numeric_predictors()) %>% #impute if necessary
  step_impute_knn(all_factor_predictors(), neighbors = 10) %>%
  step_dummy(all_factor_predictors())

Here we can see how the transformed data looks like

baked_df <- data_recipe1 %>% prep() %>% bake(new_data=NULL)
head(baked_df)
## # A tibble: 6 × 30
##   INDEX TARGET_FLAG KIDSD…¹    AGE HOMEK…²    YOJ INCOME HOME_…³ TRAVT…⁴ BLUEB…⁵
##   <dbl> <fct>         <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
## 1  7416 0            -0.336 -0.448  -0.646 -0.121  3.78    3.02   -0.288   3.30 
## 2  7083 0             3.55  -0.914   2.04  -0.221 -1.16   -0.803  -0.850  -0.842
## 3  8940 0            -0.336  1.77   -0.646  0.876  0.545   1.58   -1.54    0.996
## 4  8464 0            -0.336  0.252  -0.646 -0.171 -0.952  -0.307   1.40   -0.372
## 5  3182 0            -0.336  0.136  -0.646  0.128  0.904  -1.22   -0.413   0.606
## 6  6783 0            -0.336  1.07   -0.646  0.876  0.393   0.724  -0.662   1.88 
## # … with 20 more variables: TIF <dbl>, OLDCLAIM <dbl>, MVR_PTS <dbl>,
## #   CAR_AGE <dbl>, TARGET_AMT <dbl>, HOME_VAL_x_BLUEBOOK <dbl>,
## #   PARENT1_other <dbl>, MSTATUS_z_No <dbl>, SEX_z_F <dbl>,
## #   EDUCATION_Masters <dbl>, EDUCATION_z_High.School <dbl>,
## #   EDUCATION_other <dbl>, JOB_other <dbl>, CAR_USE_Private <dbl>,
## #   CAR_TYPE_z_SUV <dbl>, CAR_TYPE_other <dbl>, RED_CAR_yes <dbl>,
## #   CLM_FREQ_other <dbl>, REVOKED_other <dbl>, …

Define Regression Models

Let’s define the regression model to use.

#Linear Regression
lm_spec <- linear_reg() %>% set_engine("lm")

#Lasso Regression
lasso_spec <- linear_reg(mode = "regression",
                        penalty = tune(),
                        mixture = 1) %>%
  set_engine("glmnet")

Define the workflow

workflow_set <-workflow_set(
  preproc = list(data_recipe1,data_recipe2,data_recipe3,data_recipe4, data_recipe5),
  models = list(lm_spec, lasso_spec),
  cross = TRUE
  )

workflow_set
## # A workflow set/tibble: 10 × 4
##    wflow_id              info             option    result    
##    <chr>                 <list>           <list>    <list>    
##  1 recipe_1_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
##  2 recipe_1_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
##  3 recipe_2_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
##  4 recipe_2_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
##  5 recipe_3_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
##  6 recipe_3_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
##  7 recipe_4_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
##  8 recipe_4_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
##  9 recipe_5_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
## 10 recipe_5_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>

Fit the stack of models

Here we fits the models and use hyperparameter tuning using 20 levels.

# 
RUN = FALSE
if (RUN) {
doParallel::registerDoParallel()
start.time <- Sys.time()
start.time
fit_workflows <- workflow_set %>%  
  workflow_map(
    seed = 888,  
    fn = "tune_grid",
    grid = 10,
    resamples = data_folds,
    verbose = TRUE
  )

end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
doParallel::stopImplicitCluster()
}

Save resulting model (backup)

Let’s save/load model.

if (RUN) {
saved_insurance_regression <- fit_workflows
saveRDS(saved_insurance_regression, "saved_insurance_regression.rds")
}
########
if (!RUN) {
fit_workflows <- readRDS("saved_insurance_regression.rds")
}

Review Results of all Regression models

autoplot(fit_workflows)

collect_metrics(fit_workflows)
## # A tibble: 110 × 9
##    wflow_id          .config preproc model .metric .esti…¹    mean     n std_err
##    <chr>             <chr>   <chr>   <chr> <chr>   <chr>     <dbl> <int>   <dbl>
##  1 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
##  2 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.29e-2    10 2.27e-3
##  3 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
##  4 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.30e-2    10 2.28e-3
##  5 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
##  6 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.30e-2    10 2.28e-3
##  7 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
##  8 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.30e-2    10 2.28e-3
##  9 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
## 10 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.30e-2    10 2.28e-3
## # … with 100 more rows, and abbreviated variable name ¹​.estimator
#rank_results(fit_workflows, rank_metric = "mse", select_best = TRUE)
rank_results(fit_workflows, rank_metric = "rmse", select_best = TRUE)
## # A tibble: 20 × 9
##    wflow_id            .config .metric    mean std_err     n prepr…¹ model  rank
##    <chr>               <chr>   <chr>     <dbl>   <dbl> <int> <chr>   <chr> <int>
##  1 recipe_1_linear_re… Prepro… rmse    4.57e+3 1.43e+2    10 recipe  line…     1
##  2 recipe_1_linear_re… Prepro… rsq     6.30e-2 2.28e-3    10 recipe  line…     1
##  3 recipe_1_linear_re… Prepro… rmse    4.57e+3 1.43e+2    10 recipe  line…     2
##  4 recipe_1_linear_re… Prepro… rsq     6.29e-2 2.27e-3    10 recipe  line…     2
##  5 recipe_2_linear_re… Prepro… rmse    4.58e+3 1.43e+2    10 recipe  line…     3
##  6 recipe_2_linear_re… Prepro… rsq     6.17e-2 2.74e-3    10 recipe  line…     3
##  7 recipe_2_linear_re… Prepro… rmse    4.58e+3 1.43e+2    10 recipe  line…     4
##  8 recipe_2_linear_re… Prepro… rsq     6.16e-2 2.74e-3    10 recipe  line…     4
##  9 recipe_5_linear_re… Prepro… rmse    4.70e+3 1.45e+2    10 recipe  line…     5
## 10 recipe_5_linear_re… Prepro… rsq     8.16e-3 8.66e-4    10 recipe  line…     5
## 11 recipe_5_linear_re… Prepro… rmse    4.70e+3 1.45e+2    10 recipe  line…     6
## 12 recipe_5_linear_re… Prepro… rsq     8.17e-3 8.65e-4    10 recipe  line…     6
## 13 recipe_3_linear_re… Prepro… rmse    4.70e+3 1.45e+2    10 recipe  line…     7
## 14 recipe_3_linear_re… Prepro… rsq     8.10e-3 8.63e-4    10 recipe  line…     7
## 15 recipe_4_linear_re… Prepro… rmse    4.70e+3 1.45e+2    10 recipe  line…     8
## 16 recipe_4_linear_re… Prepro… rsq     8.10e-3 8.60e-4    10 recipe  line…     8
## 17 recipe_3_linear_re… Prepro… rmse    4.70e+3 1.45e+2    10 recipe  line…     9
## 18 recipe_3_linear_re… Prepro… rsq     8.11e-3 8.61e-4    10 recipe  line…     9
## 19 recipe_4_linear_re… Prepro… rmse    4.70e+3 1.45e+2    10 recipe  line…    10
## 20 recipe_4_linear_re… Prepro… rsq     8.11e-3 8.58e-4    10 recipe  line…    10
## # … with abbreviated variable name ¹​preprocessor
fit_workflows %>%
  collect_metrics()
## # A tibble: 110 × 9
##    wflow_id          .config preproc model .metric .esti…¹    mean     n std_err
##    <chr>             <chr>   <chr>   <chr> <chr>   <chr>     <dbl> <int>   <dbl>
##  1 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
##  2 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.29e-2    10 2.27e-3
##  3 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
##  4 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.30e-2    10 2.28e-3
##  5 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
##  6 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.30e-2    10 2.28e-3
##  7 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
##  8 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.30e-2    10 2.28e-3
##  9 recipe_1_linear_… Prepro… recipe  line… rmse    standa… 4.57e+3    10 1.43e+2
## 10 recipe_1_linear_… Prepro… recipe  line… rsq     standa… 6.30e-2    10 2.28e-3
## # … with 100 more rows, and abbreviated variable name ¹​.estimator

Results are not great. Very low R-Squared. This is driven by the large amount cases which don’t seem to corelate to much in the dataset.

Pick and fit best model

We will select best model based on rmse

metric <- "rmse"

best_workflow_id <- fit_workflows %>% 
  rank_results(
    rank_metric = metric,
    select_best = TRUE
  ) %>% 
  dplyr::slice(1) %>% 
  pull(wflow_id)

workflow_best <- extract_workflow(fit_workflows, id = best_workflow_id)

best_workflow_id
## [1] "recipe_1_linear_reg_2"

The best model based on rmse is Lasso Regression with Recipe 1

workflow_best_tuned <- fit_workflows[fit_workflows$wflow_id == best_workflow_id,"result"][[1]][[1]]

workflow_best_tuned
## # Tuning results
## # Bootstrap sampling 
## # A tibble: 10 × 4
##    splits              id          .metrics          .notes          
##    <list>              <chr>       <list>            <list>          
##  1 <split [6120/2259]> Bootstrap01 <tibble [20 × 5]> <tibble [0 × 3]>
##  2 <split [6120/2265]> Bootstrap02 <tibble [20 × 5]> <tibble [0 × 3]>
##  3 <split [6120/2250]> Bootstrap03 <tibble [20 × 5]> <tibble [0 × 3]>
##  4 <split [6120/2282]> Bootstrap04 <tibble [20 × 5]> <tibble [0 × 3]>
##  5 <split [6120/2211]> Bootstrap05 <tibble [20 × 5]> <tibble [0 × 3]>
##  6 <split [6120/2260]> Bootstrap06 <tibble [20 × 5]> <tibble [0 × 3]>
##  7 <split [6120/2233]> Bootstrap07 <tibble [20 × 5]> <tibble [0 × 3]>
##  8 <split [6120/2260]> Bootstrap08 <tibble [20 × 5]> <tibble [0 × 3]>
##  9 <split [6120/2279]> Bootstrap09 <tibble [20 × 5]> <tibble [0 × 3]>
## 10 <split [6120/2219]> Bootstrap10 <tibble [20 × 5]> <tibble [0 × 3]>
collect_metrics(workflow_best_tuned)
## # A tibble: 20 × 7
##     penalty .metric .estimator      mean     n   std_err .config              
##       <dbl> <chr>   <chr>          <dbl> <int>     <dbl> <chr>                
##  1 1.62e-10 rmse    standard   4570.        10 143.      Preprocessor1_Model01
##  2 1.62e-10 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model01
##  3 2.41e- 9 rmse    standard   4570.        10 143.      Preprocessor1_Model02
##  4 2.41e- 9 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model02
##  5 2.89e- 8 rmse    standard   4570.        10 143.      Preprocessor1_Model03
##  6 2.89e- 8 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model03
##  7 1.94e- 7 rmse    standard   4570.        10 143.      Preprocessor1_Model04
##  8 1.94e- 7 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model04
##  9 2.10e- 6 rmse    standard   4570.        10 143.      Preprocessor1_Model05
## 10 2.10e- 6 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model05
## 11 3.55e- 5 rmse    standard   4570.        10 143.      Preprocessor1_Model06
## 12 3.55e- 5 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model06
## 13 1.44e- 4 rmse    standard   4570.        10 143.      Preprocessor1_Model07
## 14 1.44e- 4 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model07
## 15 4.71e- 3 rmse    standard   4570.        10 143.      Preprocessor1_Model08
## 16 4.71e- 3 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model08
## 17 1.94e- 2 rmse    standard   4570.        10 143.      Preprocessor1_Model09
## 18 1.94e- 2 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model09
## 19 4.95e- 1 rmse    standard   4570.        10 143.      Preprocessor1_Model10
## 20 4.95e- 1 rsq     standard      0.0630    10   0.00228 Preprocessor1_Model10
autoplot(workflow_best_tuned)

select_best(workflow_best_tuned, "rmse")
## # A tibble: 1 × 2
##    penalty .config              
##      <dbl> <chr>                
## 1 1.62e-10 Preprocessor1_Model01

Review Results

Let’s test it with the unseen test data

workflow_best_final <- finalize_workflow(workflow_best, select_best(workflow_best_tuned, "rmse"))

doParallel::registerDoParallel()

workflow_best_final_fit <- workflow_best_final %>% 
  last_fit(
    split = data_split
  )

doParallel::stopImplicitCluster()

workflow_best_final_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [6120/2041]> train/test split <tibble> <tibble> <tibble>     <workflow>

Performance results of seleted model and hyperparameters

workflow_best_final_fit %>% 
  collect_metrics()
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard   4710.     Preprocessor1_Model1
## 2 rsq     standard      0.0534 Preprocessor1_Model1

Plot Predictions vs Actual

collect_predictions(workflow_best_final_fit) %>%
  ggplot(aes(TARGET_AMT, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()

Plot doesn’t look god due to the many zeros in the dataset and the skewness of the data.

We will do a couple data manipulations steps to see better our results

# Replace all PREDS with 0 when TARGET_AMT =0

workflow_best_final_fit$.predictions[[1]] <- workflow_best_final_fit$.predictions[[1]] %>%
  mutate(.pred = ifelse(TARGET_AMT==0, 0, .pred))
collect_predictions(workflow_best_final_fit) %>%
  filter(.pred > 0) %>%
  ggplot(aes(TARGET_AMT, .pred)) +
  geom_abline(lty = 5, color = "red") +
  geom_point(alpha = 0.3, color = "midnightblue") 

  #scale_x_continuous(trans='log10')
  #coord_fixed()

We can see a little the results of our predictions vs actual. Yet the model doesn’t see to be extremely accurate, specially with the cases where the TARGET AMOUNT is larger than 10,000

Let’s see which ones were the most important variables

imp_spec <- lasso_spec %>%
  finalize_model(select_best(workflow_best_tuned,"rmse")) %>%
  set_engine("glmnet", importance = "permutation")


workflow() %>%
  add_recipe(data_recipe1) %>%
  add_model(imp_spec) %>%
  fit(data_train) %>%
  #pull_workflow_fit() %>%
  extract_fit_parsnip() %>%
  vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))

Some analysis of the fit

test_fit <- workflow() %>%
  add_recipe(data_recipe1) %>%
  add_model(lm_spec) %>%
  fit(data_train) %>%
  extract_fit_parsnip()
summary(test_fit$fit)
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5670  -1668   -734    347 104034 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      1476.559     57.496  25.681  < 2e-16 ***
## KIDSDRIV                          157.571     66.232   2.379 0.017387 *  
## AGE                                96.249     68.044   1.415 0.157264    
## HOMEKIDS                           88.770     83.090   1.068 0.285396    
## YOJ                                 2.323     63.707   0.036 0.970916    
## INCOME                           -243.797     88.218  -2.764 0.005735 ** 
## HOME_VAL                         -165.700    141.597  -1.170 0.241956    
## TRAVTIME                          245.044     58.549   4.185 2.89e-05 ***
## BLUEBOOK                           22.379     92.369   0.242 0.808571    
## TIF                              -218.812     57.674  -3.794 0.000150 ***
## OLDCLAIM                         -149.099     80.311  -1.857 0.063426 .  
## MVR_PTS                           354.683     65.470   5.417 6.28e-08 ***
## CAR_AGE                          -189.532     75.412  -2.513 0.011987 *  
## HOME_VAL_x_BLUEBOOK               119.554    148.532   0.805 0.420903    
## PARENT1_other                     217.357     78.810   2.758 0.005833 ** 
## MSTATUS_z_No                      293.065     82.897   3.535 0.000410 ***
## SEX_z_F                           -13.281     87.889  -0.151 0.879896    
## EDUCATION_Masters                 101.220     74.527   1.358 0.174461    
## EDUCATION_z_High.School            75.666     75.046   1.008 0.313364    
## EDUCATION_other                   -16.230     70.907  -0.229 0.818963    
## JOB_other                        -114.808     70.433  -1.630 0.103144    
## CAR_USE_Private                  -294.780     78.270  -3.766 0.000167 ***
## CAR_TYPE_z_SUV                    235.526     82.404   2.858 0.004275 ** 
## CAR_TYPE_other                    339.335     76.598   4.430 9.59e-06 ***
## RED_CAR_yes                        -4.908     77.913  -0.063 0.949776    
## CLM_FREQ_other                    258.799     81.921   3.159 0.001590 ** 
## REVOKED_other                     171.200     65.963   2.595 0.009471 ** 
## URBANICITY_z_Highly.Rural..Rural -610.152     63.556  -9.600  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4498 on 6092 degrees of freedom
## Multiple R-squared:  0.07172,    Adjusted R-squared:  0.06761 
## F-statistic: 17.43 on 27 and 6092 DF,  p-value: < 2.2e-16
knitr::kable(tidy(test_fit))
term estimate std.error statistic p.value
(Intercept) 1476.559444 57.49569 25.6812206 0.0000000
KIDSDRIV 157.571409 66.23206 2.3790805 0.0173865
AGE 96.248573 68.04388 1.4145073 0.1572641
HOMEKIDS 88.770452 83.08960 1.0683701 0.2853959
YOJ 2.322788 63.70694 0.0364605 0.9709164
INCOME -243.796781 88.21831 -2.7635622 0.0057346
HOME_VAL -165.700348 141.59697 -1.1702252 0.2419561
TRAVTIME 245.043565 58.54922 4.1852576 0.0000289
BLUEBOOK 22.379172 92.36909 0.2422799 0.8085715
TIF -218.811735 57.67431 -3.7939205 0.0001497
OLDCLAIM -149.099292 80.31074 -1.8565299 0.0634263
MVR_PTS 354.683083 65.47032 5.4174640 0.0000001
CAR_AGE -189.532082 75.41210 -2.5132847 0.0119869
HOME_VAL_x_BLUEBOOK 119.554486 148.53158 0.8049095 0.4209033
PARENT1_other 217.357408 78.80951 2.7580097 0.0058328
MSTATUS_z_No 293.064865 82.89714 3.5352833 0.0004104
SEX_z_F -13.280585 87.88861 -0.1511070 0.8798963
EDUCATION_Masters 101.220167 74.52705 1.3581668 0.1744611
EDUCATION_z_High.School 75.666427 75.04558 1.0082729 0.3133635
EDUCATION_other -16.229751 70.90674 -0.2288887 0.8189631
JOB_other -114.808391 70.43279 -1.6300418 0.1031444
CAR_USE_Private -294.780450 78.27027 -3.7661869 0.0001673
CAR_TYPE_z_SUV 235.526008 82.40438 2.8581734 0.0042753
CAR_TYPE_other 339.334834 76.59836 4.4300536 0.0000096
RED_CAR_yes -4.907846 77.91309 -0.0629913 0.9497755
CLM_FREQ_other 258.798790 81.92051 3.1591451 0.0015901
REVOKED_other 171.199806 65.96291 2.5953949 0.0094709
URBANICITY_z_Highly.Rural..Rural -610.151575 63.55595 -9.6002280 0.0000000
par(mfrow=c(2,2)) # plot all 4 plots in one

plot(test_fit$fit, 
     pch = 16,    # optional parameters to make points blue
     col = '#006EA1')

#knitr::kable(tidy(workflow_best_final_fit$.workflow[[1]]))

FINAL Predictions on VALIDATION Dataset

Let’s predict on the validation set

my_predictions1 <- workflow_best_final_fit$.workflow[[1]] %>%
  predict(df_validation)

replace all cases TARGET_FLAG=0 with 0 TARGET_AMT

For this we will need to load our predictions for ACCIDENT (TARGET_FLAG)

df_class <- read_csv("JF_predictions.csv")
head(df_class)
## # A tibble: 6 × 3
##    ...1 value pred_class
##   <dbl> <dbl>      <dbl>
## 1     1     3          0
## 2     2     9          0
## 3     3    10          0
## 4     4    18          0
## 5     5    21          0
## 6     6    30          0
my_predictions1$class <- df_class$pred_class
my_predictions1$INDEX <- df_class$value

my_predictions1 <- my_predictions1 %>%
  mutate(.pred=ifelse(class==0,0, .pred))

my_predictions1 <- my_predictions1[,c(3,2,1)]
head(my_predictions1)
## # A tibble: 6 × 3
##   INDEX class .pred
##   <dbl> <dbl> <dbl>
## 1     3     0     0
## 2     9     0     0
## 3    10     0     0
## 4    18     0     0
## 5    21     0     0
## 6    30     0     0

Save the data in .CSV

write.csv(my_predictions1,"JF_predictions_all.csv")

Our prections of accident/no accident AND amount are in a single file calle JF_predictions_all.csv

Done. Thank you!