Data621 HW4
Load Libraries
library(tidyverse)
library(tidymodels)
library(ggplot2)
library(Ecfun)
library(vip)
library(themis)
rm(list=ls())#setwd('Code/HW_4')Load data
We will load the data from the Cloud Servers
df_train_full <- read_csv('http://www.my-cunymsds.com/data621/insurance_training_data.csv')
df_validation <- read_csv('http://www.my-cunymsds.com/data621/insurance-evaluation-data.csv')Exploratory Data Analysis (EDA)
Let’s take a look at the general structure of the dataset
str(df_train_full)## spc_tbl_ [8,161 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ INDEX : num [1:8161] 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET_FLAG: num [1:8161] 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num [1:8161] 0 0 0 0 0 ...
## $ KIDSDRIV : num [1:8161] 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : num [1:8161] 60 43 35 51 50 34 54 37 34 50 ...
## $ HOMEKIDS : num [1:8161] 0 0 1 0 0 1 0 2 0 0 ...
## $ YOJ : num [1:8161] 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : chr [1:8161] "$67,349" "$91,449" "$16,039" NA ...
## $ PARENT1 : chr [1:8161] "No" "No" "No" "No" ...
## $ HOME_VAL : chr [1:8161] "$0" "$257,252" "$124,191" "$306,251" ...
## $ MSTATUS : chr [1:8161] "z_No" "z_No" "Yes" "Yes" ...
## $ SEX : chr [1:8161] "M" "M" "z_F" "M" ...
## $ EDUCATION : chr [1:8161] "PhD" "z_High School" "z_High School" "<High School" ...
## $ JOB : chr [1:8161] "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
## $ TRAVTIME : num [1:8161] 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : chr [1:8161] "Private" "Commercial" "Private" "Private" ...
## $ BLUEBOOK : chr [1:8161] "$14,230" "$14,940" "$4,010" "$15,440" ...
## $ TIF : num [1:8161] 11 1 4 7 1 1 1 1 1 7 ...
## $ CAR_TYPE : chr [1:8161] "Minivan" "Minivan" "z_SUV" "Minivan" ...
## $ RED_CAR : chr [1:8161] "yes" "yes" "no" "yes" ...
## $ OLDCLAIM : chr [1:8161] "$4,461" "$0" "$38,690" "$0" ...
## $ CLM_FREQ : num [1:8161] 2 0 2 0 2 0 0 1 0 0 ...
## $ REVOKED : chr [1:8161] "No" "No" "No" "No" ...
## $ MVR_PTS : num [1:8161] 3 0 3 0 3 0 0 10 0 1 ...
## $ CAR_AGE : num [1:8161] 18 1 10 6 17 7 1 7 1 17 ...
## $ URBANICITY : chr [1:8161] "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...
## - attr(*, "spec")=
## .. cols(
## .. INDEX = col_double(),
## .. TARGET_FLAG = col_double(),
## .. TARGET_AMT = col_double(),
## .. KIDSDRIV = col_double(),
## .. AGE = col_double(),
## .. HOMEKIDS = col_double(),
## .. YOJ = col_double(),
## .. INCOME = col_character(),
## .. PARENT1 = col_character(),
## .. HOME_VAL = col_character(),
## .. MSTATUS = col_character(),
## .. SEX = col_character(),
## .. EDUCATION = col_character(),
## .. JOB = col_character(),
## .. TRAVTIME = col_double(),
## .. CAR_USE = col_character(),
## .. BLUEBOOK = col_character(),
## .. TIF = col_double(),
## .. CAR_TYPE = col_character(),
## .. RED_CAR = col_character(),
## .. OLDCLAIM = col_character(),
## .. CLM_FREQ = col_double(),
## .. REVOKED = col_character(),
## .. MVR_PTS = col_double(),
## .. CAR_AGE = col_double(),
## .. URBANICITY = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
We see several character columns. This we would need to change to FACTORS for later perform a DUMMY VARIABLE transformation to do the classification and regression.
Let’s see how imbalanced the data set is
df_train_full %>%
group_by(TARGET_FLAG) %>%
summarise(n = n())## # A tibble: 2 × 2
## TARGET_FLAG n
## <dbl> <int>
## 1 0 6008
## 2 1 2153
Roughly 2/3 are ZEROS. This is a fairly imbalanced, as one class dominates the other. We will need to do something like SMOTE (Synthetic Minority Oversampling Technique) to take care of this.
Let’s looks at the distribution of the TARGET AMOUNTS variable
# plot
p <- df_train_full %>%
# filter( price<300 ) %>%
ggplot( aes(x=TARGET_AMT)) +
geom_histogram(bins = 20) +
scale_x_continuous(trans='log10') +
ggtitle("Distribution of Target Amounts")
pWe can see that Target Amounts is heavily skewed to the right with most values around 2,000-5,000 but with some values going above 100,000
Outliers are many times an issue when we are trying to forecast. Let’s look at a correlation plot but only on values larger than 20,000 to see if we have some strong correlations.
library(corrplot)
M <- df_train_full %>%
filter(TARGET_AMT>20000)
M2 <- cor(M[,sapply(M,is.numeric)],use="complete.obs",method="pearson")
M2 %>%
corrplot(type = "upper",
addCoef.col = TRUE,
method = "shade",
diag = TRUE, number.cex = 0.5, tl.cex = 0.7)Since we have so many character columns, very few correlations show up, but we can see TARGET amount doesn’t have strong correlations with the numerical data as is. Let’s fix the character fields and re-run
df_train_full$INCOME <- parseDollars(df_train_full$INCOME)
df_train_full$HOME_VAL <- parseDollars(df_train_full$HOME_VAL)
df_train_full$BLUEBOOK <- parseDollars(df_train_full$BLUEBOOK)
df_train_full$OLDCLAIM <- parseDollars(df_train_full$OLDCLAIM)
df_train_full$TARGET_FLAG <- as.factor(df_train_full$TARGET_FLAG)
df_train_full$PARENT1 <- as.factor(df_train_full$PARENT1)
df_train_full$MSTATUS <- as.factor(df_train_full$MSTATUS)
df_train_full$SEX <- as.factor(df_train_full$SEX)
df_train_full$EDUCATION <- as.factor(df_train_full$EDUCATION)
df_train_full$JOB <- as.factor(df_train_full$JOB)
df_train_full$CAR_USE <- as.factor(df_train_full$CAR_USE)
df_train_full$CAR_TYPE <- as.factor(df_train_full$CAR_TYPE)
df_train_full$RED_CAR <- as.factor(df_train_full$RED_CAR)
df_train_full$CLM_FREQ <- as.factor(df_train_full$CLM_FREQ)
df_train_full$REVOKED <- as.factor(df_train_full$REVOKED)
df_train_full$URBANICITY <- as.factor(df_train_full$URBANICITY)
str(df_train_full)## spc_tbl_ [8,161 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ INDEX : num [1:8161] 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET_FLAG: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 2 2 1 ...
## $ TARGET_AMT : num [1:8161] 0 0 0 0 0 ...
## $ KIDSDRIV : num [1:8161] 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : num [1:8161] 60 43 35 51 50 34 54 37 34 50 ...
## $ HOMEKIDS : num [1:8161] 0 0 1 0 0 1 0 2 0 0 ...
## $ YOJ : num [1:8161] 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : num [1:8161] 67349 91449 16039 NA 114986 ...
## $ PARENT1 : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
## $ HOME_VAL : num [1:8161] 0 257252 124191 306251 243925 ...
## $ MSTATUS : Factor w/ 2 levels "Yes","z_No": 2 2 1 1 1 2 1 1 2 2 ...
## $ SEX : Factor w/ 2 levels "M","z_F": 1 1 2 1 2 2 2 1 2 1 ...
## $ EDUCATION : Factor w/ 5 levels "<High School",..: 4 5 5 1 4 2 1 2 2 2 ...
## $ JOB : Factor w/ 8 levels "Clerical","Doctor",..: 6 8 1 8 2 8 8 8 1 6 ...
## $ TRAVTIME : num [1:8161] 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : Factor w/ 2 levels "Commercial","Private": 2 1 2 2 2 1 2 1 2 1 ...
## $ BLUEBOOK : num [1:8161] 14230 14940 4010 15440 18000 ...
## $ TIF : num [1:8161] 11 1 4 7 1 1 1 1 1 7 ...
## $ CAR_TYPE : Factor w/ 6 levels "Minivan","Panel Truck",..: 1 1 6 1 6 4 6 5 6 5 ...
## $ RED_CAR : Factor w/ 2 levels "no","yes": 2 2 1 2 1 1 1 2 1 1 ...
## $ OLDCLAIM : num [1:8161] 4461 0 38690 0 19217 ...
## $ CLM_FREQ : Factor w/ 6 levels "0","1","2","3",..: 3 1 3 1 3 1 1 2 1 1 ...
## $ REVOKED : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
## $ MVR_PTS : num [1:8161] 3 0 3 0 3 0 0 10 0 1 ...
## $ CAR_AGE : num [1:8161] 18 1 10 6 17 7 1 7 1 17 ...
## $ URBANICITY : Factor w/ 2 levels "Highly Urban/ Urban",..: 1 1 1 1 1 1 1 1 1 2 ...
## - attr(*, "spec")=
## .. cols(
## .. INDEX = col_double(),
## .. TARGET_FLAG = col_double(),
## .. TARGET_AMT = col_double(),
## .. KIDSDRIV = col_double(),
## .. AGE = col_double(),
## .. HOMEKIDS = col_double(),
## .. YOJ = col_double(),
## .. INCOME = col_character(),
## .. PARENT1 = col_character(),
## .. HOME_VAL = col_character(),
## .. MSTATUS = col_character(),
## .. SEX = col_character(),
## .. EDUCATION = col_character(),
## .. JOB = col_character(),
## .. TRAVTIME = col_double(),
## .. CAR_USE = col_character(),
## .. BLUEBOOK = col_character(),
## .. TIF = col_double(),
## .. CAR_TYPE = col_character(),
## .. RED_CAR = col_character(),
## .. OLDCLAIM = col_character(),
## .. CLM_FREQ = col_double(),
## .. REVOKED = col_character(),
## .. MVR_PTS = col_double(),
## .. CAR_AGE = col_double(),
## .. URBANICITY = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
#library(corrplot)
M <- df_train_full %>%
filter(TARGET_AMT>20000)
M2 <- cor(M[,sapply(M,is.numeric)],use="complete.obs",method="pearson")
M2 %>%
corrplot(type = "upper",
addCoef.col = TRUE,
method = "shade",
diag = TRUE, number.cex = 0.5, tl.cex = 0.7)Now we see some correlations with TARGET_AMT and BLUEBOOK and HOME_VAL
df_train_full %>%
keep(is.numeric) %>% # Keep only numeric columns
gather() %>% # Convert to key-value pairs
ggplot(aes(value)) + # Plot the values
facet_wrap(~ key, scales = "free") + # In separate panels
geom_density() # as densityWith so many fields expected do en up as dummy variables, we still could benefit from doing some transformations to the numerical data. We will address them through TIDYMODEL RECIPES.
Data Transformations and Feature Engineering
Let’s apply same transformations to the validation dataset.
df_validation$INCOME <- parseDollars(df_validation$INCOME)
df_validation$HOME_VAL <- parseDollars(df_validation$HOME_VAL)
df_validation$BLUEBOOK <- parseDollars(df_validation$BLUEBOOK)
df_validation$OLDCLAIM <- parseDollars(df_validation$OLDCLAIM)
df_validation$TARGET_FLAG <- as.factor(df_validation$TARGET_FLAG)
df_validation$PARENT1 <- as.factor(df_validation$PARENT1)
df_validation$MSTATUS <- as.factor(df_validation$MSTATUS)
df_validation$SEX <- as.factor(df_validation$SEX)
df_validation$EDUCATION <- as.factor(df_validation$EDUCATION)
df_validation$JOB <- as.factor(df_validation$JOB)
df_validation$CAR_USE <- as.factor(df_validation$CAR_USE)
df_validation$CAR_TYPE <- as.factor(df_validation$CAR_TYPE)
df_validation$RED_CAR <- as.factor(df_validation$RED_CAR)
df_validation$CLM_FREQ <- as.factor(df_validation$CLM_FREQ)
df_validation$REVOKED <- as.factor(df_validation$REVOKED)
df_validation$URBANICITY <- as.factor(df_validation$URBANICITY)Let’s further split the training dataset into training and validation
set.seed(456)
data_split <- initial_split(df_train_full)
data_train <- training(data_split)
data_test <- testing(data_split)
# Not needed here since there is no tuning required
data_folds <- bootstraps(data_train, times = 10,)
data_folds2 <- vfold_cv(data_train, v=10)Let’s transform data using Tidymodels Recipes. We will do a few recipes. This would give 5 different data transformations we will test. In some cases we will impute NA’s values using, KNN, Median, Mode and Mean. We will also apply normalization. Finally we will use SMOTE to help the issues we have of class imbalances.
data_recipe1 <-
recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
update_role(INDEX, new_role = "id variable") %>% #ignores this var
update_role(TARGET_AMT, new_role = "id variable") %>% #ignores this var
step_impute_knn(all_predictors(), neighbors =5) %>% #impute if necessary
step_other(all_factor_predictors(), threshold = 0.2) %>%
step_dummy(all_factor_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_smote(TARGET_FLAG)data_recipe2 <-
recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
update_role(INDEX, new_role = "id variable") %>% #ignores this var
update_role(TARGET_AMT, new_role = "id variable") %>% #ignores this var
step_impute_knn(all_predictors(), neighbors = 50) %>% #impute if necessary
step_other(all_factor_predictors(), threshold = 0.1) %>%
step_dummy(all_factor_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_smote(TARGET_FLAG)data_recipe3 <-
recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
update_role(INDEX, new_role = "id variable") %>% #ignores this var
update_role(TARGET_AMT, new_role = "id variable") %>% #ignores this var
step_impute_mean(all_numeric_predictors()) %>% #impute if necessary
step_impute_mode(all_factor_predictors()) %>% #impute if necessary
step_other(all_factor_predictors(), threshold = 0.2) %>%
step_dummy(all_factor_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_smote(TARGET_FLAG)data_recipe4 <-
recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
update_role(INDEX, new_role = "id variable") %>% #ignores this var
update_role(TARGET_AMT, new_role = "id variable") %>% #ignores this var
step_impute_median(all_numeric_predictors()) %>% #impute if necessary
step_impute_mode(all_factor_predictors()) %>% #impute if necessary
step_dummy(all_factor_predictors())data_recipe5 <-
recipe(formula = TARGET_FLAG ~ ., data = data_train) %>%
update_role(INDEX, new_role = "id variable") %>% #ignores this var
update_role(TARGET_AMT, new_role = "id variable") %>% #ignores this var
step_impute_median(all_numeric_predictors()) %>% #impute if necessary
step_impute_mode(all_factor_predictors()) %>% #impute if necessary
step_other(all_factor_predictors(), threshold = 0.2) %>%
step_dummy(all_factor_predictors())Here we can see how the transformed data looks like
baked_df <- data_recipe1 %>% prep() %>% bake(new_data=NULL)
head(baked_df)## # A tibble: 6 × 29
## INDEX TARGET_AMT KIDSDRIV AGE HOMEK…¹ YOJ INCOME HOME_…² TRAVT…³ BLUEB…⁴
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7416 0 -0.336 -0.448 -0.646 -0.121 3.78 3.02 -0.288 3.30
## 2 7083 0 3.55 -0.914 2.04 -0.221 -1.17 -0.803 -0.850 -0.842
## 3 8940 0 -0.336 1.77 -0.646 0.876 0.545 1.58 -1.54 0.996
## 4 8464 0 -0.336 0.252 -0.646 -0.171 -0.952 -0.307 1.40 -0.372
## 5 3182 0 -0.336 0.136 -0.646 0.128 0.904 -1.22 -0.413 0.606
## 6 6783 0 -0.336 1.07 -0.646 0.876 0.393 0.724 -0.662 1.88
## # … with 19 more variables: TIF <dbl>, OLDCLAIM <dbl>, MVR_PTS <dbl>,
## # CAR_AGE <dbl>, TARGET_FLAG <fct>, PARENT1_other <dbl>, MSTATUS_z_No <dbl>,
## # SEX_z_F <dbl>, EDUCATION_Masters <dbl>, EDUCATION_z_High.School <dbl>,
## # EDUCATION_other <dbl>, JOB_other <dbl>, CAR_USE_Private <dbl>,
## # CAR_TYPE_z_SUV <dbl>, CAR_TYPE_other <dbl>, RED_CAR_yes <dbl>,
## # CLM_FREQ_other <dbl>, REVOKED_other <dbl>,
## # URBANICITY_z_Highly.Rural..Rural <dbl>, and abbreviated variable names …
Machine Learning - CLASSIFICATION part
Define Models to use
Let’s define the Classification model to use.
#Logistic regression
logistic_spec <-
logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
#XGBoost
xgb_spec <- boost_tree(
trees = 1000,
tree_depth = tune(), min_n = tune(),
sample_size = tune(), mtry = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost") %>%
set_mode("classification")Define the workflow
workflow_set <-workflow_set(
preproc = list(data_recipe1,data_recipe2,data_recipe3,data_recipe4, data_recipe5),
models = list(logistic_spec, xgb_spec),
cross = TRUE
)
workflow_set## # A workflow set/tibble: 10 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_1_boost_tree <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_2_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 recipe_2_boost_tree <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 recipe_3_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 recipe_3_boost_tree <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 recipe_4_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
## 8 recipe_4_boost_tree <tibble [1 × 4]> <opts[0]> <list [0]>
## 9 recipe_5_logistic_reg <tibble [1 × 4]> <opts[0]> <list [0]>
## 10 recipe_5_boost_tree <tibble [1 × 4]> <opts[0]> <list [0]>
Fit the stack of models
Here we fits the models and use hyperparameter tuning using 20 levels.
#
RUN = FALSE
if (RUN) {
doParallel::registerDoParallel()
start.time <- Sys.time()
start.time
fit_workflows <- workflow_set %>%
workflow_map(
seed = 888,
fn = "tune_grid",
grid = 20,
resamples = data_folds2,
verbose = TRUE
)
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
doParallel::stopImplicitCluster()
}Save resulting model (backup)
Let’s save/load model.
if (RUN) {
saved_insurance_modelset <- fit_workflows
saveRDS(saved_insurance_modelset, "saved_insurance_fit3.rds")
}
########
if (!RUN) {
fit_workflows <- readRDS("saved_insurance_fit3.rds")
}Review Results of all models
autoplot(fit_workflows)collect_metrics(fit_workflows)## # A tibble: 210 × 9
## wflow_id .config preproc model .metric .esti…¹ mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 recipe_1_logistic_… Prepro… recipe logi… accura… binary 0.724 10 0.00704
## 2 recipe_1_logistic_… Prepro… recipe logi… roc_auc binary 0.807 10 0.00705
## 3 recipe_1_boost_tree Prepro… recipe boos… accura… binary 0.731 10 0.00434
## 4 recipe_1_boost_tree Prepro… recipe boos… roc_auc binary 0.793 10 0.00647
## 5 recipe_1_boost_tree Prepro… recipe boos… accura… binary 0.761 10 0.00584
## 6 recipe_1_boost_tree Prepro… recipe boos… roc_auc binary 0.808 10 0.00613
## 7 recipe_1_boost_tree Prepro… recipe boos… accura… binary 0.783 10 0.00535
## 8 recipe_1_boost_tree Prepro… recipe boos… roc_auc binary 0.811 10 0.00655
## 9 recipe_1_boost_tree Prepro… recipe boos… accura… binary 0.768 10 0.00412
## 10 recipe_1_boost_tree Prepro… recipe boos… roc_auc binary 0.813 10 0.00637
## # … with 200 more rows, and abbreviated variable name ¹.estimator
#rank_results(fit_workflows, rank_metric = "accuracy", select_best = TRUE)
rank_results(fit_workflows, rank_metric = "roc_auc", select_best = TRUE)## # A tibble: 20 × 9
## wflow_id .config .metric mean std_err n prepr…¹ model rank
## <chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
## 1 recipe_4_boost_tree Prepro… accura… 0.793 0.00500 10 recipe boos… 1
## 2 recipe_4_boost_tree Prepro… roc_auc 0.822 0.00683 10 recipe boos… 1
## 3 recipe_2_boost_tree Prepro… accura… 0.784 0.00441 10 recipe boos… 2
## 4 recipe_2_boost_tree Prepro… roc_auc 0.818 0.00636 10 recipe boos… 2
## 5 recipe_5_boost_tree Prepro… accura… 0.797 0.00465 10 recipe boos… 3
## 6 recipe_5_boost_tree Prepro… roc_auc 0.817 0.00672 10 recipe boos… 3
## 7 recipe_3_boost_tree Prepro… accura… 0.779 0.00555 10 recipe boos… 4
## 8 recipe_3_boost_tree Prepro… roc_auc 0.814 0.00649 10 recipe boos… 4
## 9 recipe_1_boost_tree Prepro… accura… 0.768 0.00412 10 recipe boos… 5
## 10 recipe_1_boost_tree Prepro… roc_auc 0.813 0.00637 10 recipe boos… 5
## 11 recipe_4_logistic_reg Prepro… accura… 0.788 0.00468 10 recipe logi… 6
## 12 recipe_4_logistic_reg Prepro… roc_auc 0.810 0.00707 10 recipe logi… 6
## 13 recipe_2_logistic_reg Prepro… accura… 0.728 0.00735 10 recipe logi… 7
## 14 recipe_2_logistic_reg Prepro… roc_auc 0.809 0.00742 10 recipe logi… 7
## 15 recipe_5_logistic_reg Prepro… accura… 0.792 0.00469 10 recipe logi… 8
## 16 recipe_5_logistic_reg Prepro… roc_auc 0.808 0.00691 10 recipe logi… 8
## 17 recipe_1_logistic_reg Prepro… accura… 0.724 0.00704 10 recipe logi… 9
## 18 recipe_1_logistic_reg Prepro… roc_auc 0.807 0.00705 10 recipe logi… 9
## 19 recipe_3_logistic_reg Prepro… accura… 0.725 0.00662 10 recipe logi… 10
## 20 recipe_3_logistic_reg Prepro… roc_auc 0.807 0.00675 10 recipe logi… 10
## # … with abbreviated variable name ¹preprocessor
Based on ROC_AUC and ACCURACY XGBoost model had the best performance.
fit_workflows %>%
collect_metrics()## # A tibble: 210 × 9
## wflow_id .config preproc model .metric .esti…¹ mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 recipe_1_logistic_… Prepro… recipe logi… accura… binary 0.724 10 0.00704
## 2 recipe_1_logistic_… Prepro… recipe logi… roc_auc binary 0.807 10 0.00705
## 3 recipe_1_boost_tree Prepro… recipe boos… accura… binary 0.731 10 0.00434
## 4 recipe_1_boost_tree Prepro… recipe boos… roc_auc binary 0.793 10 0.00647
## 5 recipe_1_boost_tree Prepro… recipe boos… accura… binary 0.761 10 0.00584
## 6 recipe_1_boost_tree Prepro… recipe boos… roc_auc binary 0.808 10 0.00613
## 7 recipe_1_boost_tree Prepro… recipe boos… accura… binary 0.783 10 0.00535
## 8 recipe_1_boost_tree Prepro… recipe boos… roc_auc binary 0.811 10 0.00655
## 9 recipe_1_boost_tree Prepro… recipe boos… accura… binary 0.768 10 0.00412
## 10 recipe_1_boost_tree Prepro… recipe boos… roc_auc binary 0.813 10 0.00637
## # … with 200 more rows, and abbreviated variable name ¹.estimator
Pick and fit best model
We will select best model based on ROC_AUC
metric <- "roc_auc"
best_workflow_id <- fit_workflows %>%
rank_results(
rank_metric = metric,
select_best = TRUE
) %>%
dplyr::slice(1) %>%
pull(wflow_id)
workflow_best <- extract_workflow(fit_workflows, id = best_workflow_id)
best_workflow_id## [1] "recipe_4_boost_tree"
The best model based on ROC_AUC is XGBoost with Recipe 4
workflow_best_tuned <- fit_workflows[fit_workflows$wflow_id == best_workflow_id,"result"][[1]][[1]]
workflow_best_tuned## # Tuning results
## # 10-fold cross-validation
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [5508/612]> Fold01 <tibble [40 × 9]> <tibble [0 × 3]>
## 2 <split [5508/612]> Fold02 <tibble [40 × 9]> <tibble [0 × 3]>
## 3 <split [5508/612]> Fold03 <tibble [40 × 9]> <tibble [0 × 3]>
## 4 <split [5508/612]> Fold04 <tibble [40 × 9]> <tibble [0 × 3]>
## 5 <split [5508/612]> Fold05 <tibble [40 × 9]> <tibble [0 × 3]>
## 6 <split [5508/612]> Fold06 <tibble [40 × 9]> <tibble [0 × 3]>
## 7 <split [5508/612]> Fold07 <tibble [40 × 9]> <tibble [0 × 3]>
## 8 <split [5508/612]> Fold08 <tibble [40 × 9]> <tibble [0 × 3]>
## 9 <split [5508/612]> Fold09 <tibble [40 × 9]> <tibble [0 × 3]>
## 10 <split [5508/612]> Fold10 <tibble [40 × 9]> <tibble [0 × 3]>
collect_metrics(workflow_best_tuned)## # A tibble: 40 × 11
## mtry min_n tree_depth learn_rate sampl…¹ .metric .esti…² mean n std_err
## <int> <int> <int> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 38 5 6 0.00554 0.571 accura… binary 0.793 10 0.00500
## 2 38 5 6 0.00554 0.571 roc_auc binary 0.822 10 0.00683
## 3 33 17 2 0.0311 0.800 accura… binary 0.795 10 0.00512
## 4 33 17 2 0.0311 0.800 roc_auc binary 0.819 10 0.00711
## 5 17 38 8 0.279 0.238 accura… binary 0.754 10 0.00539
## 6 17 38 8 0.279 0.238 roc_auc binary 0.753 10 0.00598
## 7 32 9 12 0.0203 0.744 accura… binary 0.787 10 0.00447
## 8 32 9 12 0.0203 0.744 roc_auc binary 0.814 10 0.00678
## 9 29 27 14 0.00911 0.842 accura… binary 0.795 10 0.00436
## 10 29 27 14 0.00911 0.842 roc_auc binary 0.820 10 0.00672
## # … with 30 more rows, 1 more variable: .config <chr>, and abbreviated variable
## # names ¹sample_size, ².estimator
autoplot(workflow_best_tuned)select_best(workflow_best_tuned, "roc_auc")## # A tibble: 1 × 6
## mtry min_n tree_depth learn_rate sample_size .config
## <int> <int> <int> <dbl> <dbl> <chr>
## 1 38 5 6 0.00554 0.571 Preprocessor1_Model01
Evaluate Model
Let’s test it with the unseen test data
workflow_best_final <- finalize_workflow(workflow_best, select_best(workflow_best_tuned, "roc_auc"))
doParallel::registerDoParallel()
workflow_best_final_fit <- workflow_best_final %>%
last_fit(
split = data_split
)
doParallel::stopImplicitCluster()
workflow_best_final_fit## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [6120/2041]> train/test split <tibble> <tibble> <tibble> <workflow>
Peformance results of selected model and hyperparameters
workflow_best_final_fit %>%
collect_metrics()## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.791 Preprocessor1_Model1
## 2 roc_auc binary 0.818 Preprocessor1_Model1
Results with TEST DATA were 79% accuracy.
Let’s see a confusion matrix.
workflow_best_final_fit %>%
collect_predictions() %>%
conf_mat(TARGET_FLAG, .pred_class)## Truth
## Prediction 0 1
## 0 1385 330
## 1 96 230
Let’s see which ones were the most important variables
imp_spec <- xgb_spec %>%
finalize_model(select_best(workflow_best_tuned,"roc_auc")) %>%
set_engine("xgboost", importance = "permutation")
#imp_spec <- logistic_spec %>%
# finalize_model(select_best(house_tune)) %>%
# set_engine("glm", importance = "permutation")
workflow() %>%
add_recipe(data_recipe4) %>%
# add_model(imp_spec) %>%
add_model(imp_spec) %>%
fit(data_train) %>%
#pull_workflow_fit() %>%
extract_fit_parsnip() %>%
vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))## [12:22:09] WARNING: amalgamation/../src/learner.cc:627:
## Parameters: { "importance" } might not be used.
##
## This could be a false alarm, with some parameters getting used by language bindings but
## then being mistakenly passed down to XGBoost core, or some parameter actually being used
## but getting flagged wrongly here. Please open an issue if you find any such cases.
Let’s predict on the validation set
my_predictions1 <- workflow_best_final_fit$.workflow[[1]] %>%
predict(df_validation)Save the data in .CSV
my_predictions2 <- as_tibble(df_validation$INDEX)
my_predictions2$pred_class <- my_predictions1$.pred_class
write.csv(my_predictions2,"JF_predictions.csv")Machine Learning - REGRESSION part
Data Transformation
We would need to define different data transformation for the the regression. Let’s transform data using Tidymodels Recipes. We will do a few recipes
data_recipe1 <-
recipe(formula = TARGET_AMT ~ ., data = data_train) %>%
update_role(INDEX, new_role = "id variable") %>% #ignores this var
update_role(TARGET_FLAG, new_role = "id variable") %>% #ignores this var
step_interact(terms = ~ HOME_VAL:BLUEBOOK) %>%
step_impute_knn(all_predictors(), neighbors =5) %>% #impute if necessary
step_other(all_factor_predictors(), threshold = 0.2) %>%
step_dummy(all_factor_predictors()) %>%
#step_smote(TARGET_FLAG) %>%
step_normalize(all_numeric_predictors())data_recipe2 <-
recipe(formula = TARGET_AMT ~ ., data = data_train) %>%
update_role(INDEX, new_role = "id variable") %>% #ignores this var
update_role(TARGET_FLAG, new_role = "id variable") %>% #ignores this var
step_impute_median(all_numeric_predictors()) %>% #impute if necessary
step_impute_mode(all_factor_predictors()) %>% #impute if necessary
step_dummy(all_factor_predictors())data_recipe3 <-
recipe(formula = TARGET_AMT ~ BLUEBOOK+CAR_TYPE+CAR_AGE, data = data_train) %>%
step_impute_mean(all_numeric_predictors()) %>% #impute if necessary
step_impute_mode(all_factor_predictors()) %>%
step_dummy(all_factor_predictors()) %>%
step_normalize(all_numeric_predictors())data_recipe4 <-
recipe(formula = TARGET_AMT ~ BLUEBOOK+CAR_TYPE+CAR_AGE, data = data_train) %>%
step_impute_median(all_numeric_predictors()) %>% #impute if necessary
step_impute_knn(all_factor_predictors(), neighbors = 10) %>%
step_dummy(all_factor_predictors())data_recipe5 <-
recipe(formula = TARGET_AMT ~ BLUEBOOK+CAR_TYPE+CAR_AGE, data = data_train) %>%
step_impute_linear(all_numeric_predictors()) %>% #impute if necessary
step_impute_knn(all_factor_predictors(), neighbors = 10) %>%
step_dummy(all_factor_predictors())Here we can see how the transformed data looks like
baked_df <- data_recipe1 %>% prep() %>% bake(new_data=NULL)
head(baked_df)## # A tibble: 6 × 30
## INDEX TARGET_FLAG KIDSD…¹ AGE HOMEK…² YOJ INCOME HOME_…³ TRAVT…⁴ BLUEB…⁵
## <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7416 0 -0.336 -0.448 -0.646 -0.121 3.78 3.02 -0.288 3.30
## 2 7083 0 3.55 -0.914 2.04 -0.221 -1.16 -0.803 -0.850 -0.842
## 3 8940 0 -0.336 1.77 -0.646 0.876 0.545 1.58 -1.54 0.996
## 4 8464 0 -0.336 0.252 -0.646 -0.171 -0.952 -0.307 1.40 -0.372
## 5 3182 0 -0.336 0.136 -0.646 0.128 0.904 -1.22 -0.413 0.606
## 6 6783 0 -0.336 1.07 -0.646 0.876 0.393 0.724 -0.662 1.88
## # … with 20 more variables: TIF <dbl>, OLDCLAIM <dbl>, MVR_PTS <dbl>,
## # CAR_AGE <dbl>, TARGET_AMT <dbl>, HOME_VAL_x_BLUEBOOK <dbl>,
## # PARENT1_other <dbl>, MSTATUS_z_No <dbl>, SEX_z_F <dbl>,
## # EDUCATION_Masters <dbl>, EDUCATION_z_High.School <dbl>,
## # EDUCATION_other <dbl>, JOB_other <dbl>, CAR_USE_Private <dbl>,
## # CAR_TYPE_z_SUV <dbl>, CAR_TYPE_other <dbl>, RED_CAR_yes <dbl>,
## # CLM_FREQ_other <dbl>, REVOKED_other <dbl>, …
Define Regression Models
Let’s define the regression model to use.
#Linear Regression
lm_spec <- linear_reg() %>% set_engine("lm")
#Lasso Regression
lasso_spec <- linear_reg(mode = "regression",
penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")Define the workflow
workflow_set <-workflow_set(
preproc = list(data_recipe1,data_recipe2,data_recipe3,data_recipe4, data_recipe5),
models = list(lm_spec, lasso_spec),
cross = TRUE
)
workflow_set## # A workflow set/tibble: 10 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_1_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_2_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 recipe_2_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 recipe_3_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 recipe_3_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 recipe_4_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
## 8 recipe_4_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
## 9 recipe_5_linear_reg_1 <tibble [1 × 4]> <opts[0]> <list [0]>
## 10 recipe_5_linear_reg_2 <tibble [1 × 4]> <opts[0]> <list [0]>
Fit the stack of models
Here we fits the models and use hyperparameter tuning using 20 levels.
#
RUN = FALSE
if (RUN) {
doParallel::registerDoParallel()
start.time <- Sys.time()
start.time
fit_workflows <- workflow_set %>%
workflow_map(
seed = 888,
fn = "tune_grid",
grid = 10,
resamples = data_folds,
verbose = TRUE
)
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
doParallel::stopImplicitCluster()
}Save resulting model (backup)
Let’s save/load model.
if (RUN) {
saved_insurance_regression <- fit_workflows
saveRDS(saved_insurance_regression, "saved_insurance_regression.rds")
}
########
if (!RUN) {
fit_workflows <- readRDS("saved_insurance_regression.rds")
}Review Results of all Regression models
autoplot(fit_workflows)collect_metrics(fit_workflows)## # A tibble: 110 × 9
## wflow_id .config preproc model .metric .esti…¹ mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 2 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.29e-2 10 2.27e-3
## 3 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 4 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.30e-2 10 2.28e-3
## 5 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 6 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.30e-2 10 2.28e-3
## 7 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 8 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.30e-2 10 2.28e-3
## 9 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 10 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.30e-2 10 2.28e-3
## # … with 100 more rows, and abbreviated variable name ¹.estimator
#rank_results(fit_workflows, rank_metric = "mse", select_best = TRUE)
rank_results(fit_workflows, rank_metric = "rmse", select_best = TRUE)## # A tibble: 20 × 9
## wflow_id .config .metric mean std_err n prepr…¹ model rank
## <chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
## 1 recipe_1_linear_re… Prepro… rmse 4.57e+3 1.43e+2 10 recipe line… 1
## 2 recipe_1_linear_re… Prepro… rsq 6.30e-2 2.28e-3 10 recipe line… 1
## 3 recipe_1_linear_re… Prepro… rmse 4.57e+3 1.43e+2 10 recipe line… 2
## 4 recipe_1_linear_re… Prepro… rsq 6.29e-2 2.27e-3 10 recipe line… 2
## 5 recipe_2_linear_re… Prepro… rmse 4.58e+3 1.43e+2 10 recipe line… 3
## 6 recipe_2_linear_re… Prepro… rsq 6.17e-2 2.74e-3 10 recipe line… 3
## 7 recipe_2_linear_re… Prepro… rmse 4.58e+3 1.43e+2 10 recipe line… 4
## 8 recipe_2_linear_re… Prepro… rsq 6.16e-2 2.74e-3 10 recipe line… 4
## 9 recipe_5_linear_re… Prepro… rmse 4.70e+3 1.45e+2 10 recipe line… 5
## 10 recipe_5_linear_re… Prepro… rsq 8.16e-3 8.66e-4 10 recipe line… 5
## 11 recipe_5_linear_re… Prepro… rmse 4.70e+3 1.45e+2 10 recipe line… 6
## 12 recipe_5_linear_re… Prepro… rsq 8.17e-3 8.65e-4 10 recipe line… 6
## 13 recipe_3_linear_re… Prepro… rmse 4.70e+3 1.45e+2 10 recipe line… 7
## 14 recipe_3_linear_re… Prepro… rsq 8.10e-3 8.63e-4 10 recipe line… 7
## 15 recipe_4_linear_re… Prepro… rmse 4.70e+3 1.45e+2 10 recipe line… 8
## 16 recipe_4_linear_re… Prepro… rsq 8.10e-3 8.60e-4 10 recipe line… 8
## 17 recipe_3_linear_re… Prepro… rmse 4.70e+3 1.45e+2 10 recipe line… 9
## 18 recipe_3_linear_re… Prepro… rsq 8.11e-3 8.61e-4 10 recipe line… 9
## 19 recipe_4_linear_re… Prepro… rmse 4.70e+3 1.45e+2 10 recipe line… 10
## 20 recipe_4_linear_re… Prepro… rsq 8.11e-3 8.58e-4 10 recipe line… 10
## # … with abbreviated variable name ¹preprocessor
fit_workflows %>%
collect_metrics()## # A tibble: 110 × 9
## wflow_id .config preproc model .metric .esti…¹ mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 2 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.29e-2 10 2.27e-3
## 3 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 4 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.30e-2 10 2.28e-3
## 5 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 6 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.30e-2 10 2.28e-3
## 7 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 8 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.30e-2 10 2.28e-3
## 9 recipe_1_linear_… Prepro… recipe line… rmse standa… 4.57e+3 10 1.43e+2
## 10 recipe_1_linear_… Prepro… recipe line… rsq standa… 6.30e-2 10 2.28e-3
## # … with 100 more rows, and abbreviated variable name ¹.estimator
Results are not great. Very low R-Squared. This is driven by the large amount cases which don’t seem to corelate to much in the dataset.
Pick and fit best model
We will select best model based on rmse
metric <- "rmse"
best_workflow_id <- fit_workflows %>%
rank_results(
rank_metric = metric,
select_best = TRUE
) %>%
dplyr::slice(1) %>%
pull(wflow_id)
workflow_best <- extract_workflow(fit_workflows, id = best_workflow_id)
best_workflow_id## [1] "recipe_1_linear_reg_2"
The best model based on rmse is Lasso Regression with Recipe 1
workflow_best_tuned <- fit_workflows[fit_workflows$wflow_id == best_workflow_id,"result"][[1]][[1]]
workflow_best_tuned## # Tuning results
## # Bootstrap sampling
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [6120/2259]> Bootstrap01 <tibble [20 × 5]> <tibble [0 × 3]>
## 2 <split [6120/2265]> Bootstrap02 <tibble [20 × 5]> <tibble [0 × 3]>
## 3 <split [6120/2250]> Bootstrap03 <tibble [20 × 5]> <tibble [0 × 3]>
## 4 <split [6120/2282]> Bootstrap04 <tibble [20 × 5]> <tibble [0 × 3]>
## 5 <split [6120/2211]> Bootstrap05 <tibble [20 × 5]> <tibble [0 × 3]>
## 6 <split [6120/2260]> Bootstrap06 <tibble [20 × 5]> <tibble [0 × 3]>
## 7 <split [6120/2233]> Bootstrap07 <tibble [20 × 5]> <tibble [0 × 3]>
## 8 <split [6120/2260]> Bootstrap08 <tibble [20 × 5]> <tibble [0 × 3]>
## 9 <split [6120/2279]> Bootstrap09 <tibble [20 × 5]> <tibble [0 × 3]>
## 10 <split [6120/2219]> Bootstrap10 <tibble [20 × 5]> <tibble [0 × 3]>
collect_metrics(workflow_best_tuned)## # A tibble: 20 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1.62e-10 rmse standard 4570. 10 143. Preprocessor1_Model01
## 2 1.62e-10 rsq standard 0.0630 10 0.00228 Preprocessor1_Model01
## 3 2.41e- 9 rmse standard 4570. 10 143. Preprocessor1_Model02
## 4 2.41e- 9 rsq standard 0.0630 10 0.00228 Preprocessor1_Model02
## 5 2.89e- 8 rmse standard 4570. 10 143. Preprocessor1_Model03
## 6 2.89e- 8 rsq standard 0.0630 10 0.00228 Preprocessor1_Model03
## 7 1.94e- 7 rmse standard 4570. 10 143. Preprocessor1_Model04
## 8 1.94e- 7 rsq standard 0.0630 10 0.00228 Preprocessor1_Model04
## 9 2.10e- 6 rmse standard 4570. 10 143. Preprocessor1_Model05
## 10 2.10e- 6 rsq standard 0.0630 10 0.00228 Preprocessor1_Model05
## 11 3.55e- 5 rmse standard 4570. 10 143. Preprocessor1_Model06
## 12 3.55e- 5 rsq standard 0.0630 10 0.00228 Preprocessor1_Model06
## 13 1.44e- 4 rmse standard 4570. 10 143. Preprocessor1_Model07
## 14 1.44e- 4 rsq standard 0.0630 10 0.00228 Preprocessor1_Model07
## 15 4.71e- 3 rmse standard 4570. 10 143. Preprocessor1_Model08
## 16 4.71e- 3 rsq standard 0.0630 10 0.00228 Preprocessor1_Model08
## 17 1.94e- 2 rmse standard 4570. 10 143. Preprocessor1_Model09
## 18 1.94e- 2 rsq standard 0.0630 10 0.00228 Preprocessor1_Model09
## 19 4.95e- 1 rmse standard 4570. 10 143. Preprocessor1_Model10
## 20 4.95e- 1 rsq standard 0.0630 10 0.00228 Preprocessor1_Model10
autoplot(workflow_best_tuned)select_best(workflow_best_tuned, "rmse")## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 1.62e-10 Preprocessor1_Model01
Review Results
Let’s test it with the unseen test data
workflow_best_final <- finalize_workflow(workflow_best, select_best(workflow_best_tuned, "rmse"))
doParallel::registerDoParallel()
workflow_best_final_fit <- workflow_best_final %>%
last_fit(
split = data_split
)
doParallel::stopImplicitCluster()
workflow_best_final_fit## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [6120/2041]> train/test split <tibble> <tibble> <tibble> <workflow>
Performance results of seleted model and hyperparameters
workflow_best_final_fit %>%
collect_metrics()## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 4710. Preprocessor1_Model1
## 2 rsq standard 0.0534 Preprocessor1_Model1
Plot Predictions vs Actual
collect_predictions(workflow_best_final_fit) %>%
ggplot(aes(TARGET_AMT, .pred)) +
geom_abline(lty = 2, color = "gray50") +
geom_point(alpha = 0.5, color = "midnightblue") +
coord_fixed()Plot doesn’t look god due to the many zeros in the dataset and the skewness of the data.
We will do a couple data manipulations steps to see better our results
# Replace all PREDS with 0 when TARGET_AMT =0
workflow_best_final_fit$.predictions[[1]] <- workflow_best_final_fit$.predictions[[1]] %>%
mutate(.pred = ifelse(TARGET_AMT==0, 0, .pred))collect_predictions(workflow_best_final_fit) %>%
filter(.pred > 0) %>%
ggplot(aes(TARGET_AMT, .pred)) +
geom_abline(lty = 5, color = "red") +
geom_point(alpha = 0.3, color = "midnightblue") #scale_x_continuous(trans='log10')
#coord_fixed()We can see a little the results of our predictions vs actual. Yet the model doesn’t see to be extremely accurate, specially with the cases where the TARGET AMOUNT is larger than 10,000
Let’s see which ones were the most important variables
imp_spec <- lasso_spec %>%
finalize_model(select_best(workflow_best_tuned,"rmse")) %>%
set_engine("glmnet", importance = "permutation")
workflow() %>%
add_recipe(data_recipe1) %>%
add_model(imp_spec) %>%
fit(data_train) %>%
#pull_workflow_fit() %>%
extract_fit_parsnip() %>%
vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))Some analysis of the fit
test_fit <- workflow() %>%
add_recipe(data_recipe1) %>%
add_model(lm_spec) %>%
fit(data_train) %>%
extract_fit_parsnip()summary(test_fit$fit)##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5670 -1668 -734 347 104034
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1476.559 57.496 25.681 < 2e-16 ***
## KIDSDRIV 157.571 66.232 2.379 0.017387 *
## AGE 96.249 68.044 1.415 0.157264
## HOMEKIDS 88.770 83.090 1.068 0.285396
## YOJ 2.323 63.707 0.036 0.970916
## INCOME -243.797 88.218 -2.764 0.005735 **
## HOME_VAL -165.700 141.597 -1.170 0.241956
## TRAVTIME 245.044 58.549 4.185 2.89e-05 ***
## BLUEBOOK 22.379 92.369 0.242 0.808571
## TIF -218.812 57.674 -3.794 0.000150 ***
## OLDCLAIM -149.099 80.311 -1.857 0.063426 .
## MVR_PTS 354.683 65.470 5.417 6.28e-08 ***
## CAR_AGE -189.532 75.412 -2.513 0.011987 *
## HOME_VAL_x_BLUEBOOK 119.554 148.532 0.805 0.420903
## PARENT1_other 217.357 78.810 2.758 0.005833 **
## MSTATUS_z_No 293.065 82.897 3.535 0.000410 ***
## SEX_z_F -13.281 87.889 -0.151 0.879896
## EDUCATION_Masters 101.220 74.527 1.358 0.174461
## EDUCATION_z_High.School 75.666 75.046 1.008 0.313364
## EDUCATION_other -16.230 70.907 -0.229 0.818963
## JOB_other -114.808 70.433 -1.630 0.103144
## CAR_USE_Private -294.780 78.270 -3.766 0.000167 ***
## CAR_TYPE_z_SUV 235.526 82.404 2.858 0.004275 **
## CAR_TYPE_other 339.335 76.598 4.430 9.59e-06 ***
## RED_CAR_yes -4.908 77.913 -0.063 0.949776
## CLM_FREQ_other 258.799 81.921 3.159 0.001590 **
## REVOKED_other 171.200 65.963 2.595 0.009471 **
## URBANICITY_z_Highly.Rural..Rural -610.152 63.556 -9.600 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4498 on 6092 degrees of freedom
## Multiple R-squared: 0.07172, Adjusted R-squared: 0.06761
## F-statistic: 17.43 on 27 and 6092 DF, p-value: < 2.2e-16
knitr::kable(tidy(test_fit))| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1476.559444 | 57.49569 | 25.6812206 | 0.0000000 |
| KIDSDRIV | 157.571409 | 66.23206 | 2.3790805 | 0.0173865 |
| AGE | 96.248573 | 68.04388 | 1.4145073 | 0.1572641 |
| HOMEKIDS | 88.770452 | 83.08960 | 1.0683701 | 0.2853959 |
| YOJ | 2.322788 | 63.70694 | 0.0364605 | 0.9709164 |
| INCOME | -243.796781 | 88.21831 | -2.7635622 | 0.0057346 |
| HOME_VAL | -165.700348 | 141.59697 | -1.1702252 | 0.2419561 |
| TRAVTIME | 245.043565 | 58.54922 | 4.1852576 | 0.0000289 |
| BLUEBOOK | 22.379172 | 92.36909 | 0.2422799 | 0.8085715 |
| TIF | -218.811735 | 57.67431 | -3.7939205 | 0.0001497 |
| OLDCLAIM | -149.099292 | 80.31074 | -1.8565299 | 0.0634263 |
| MVR_PTS | 354.683083 | 65.47032 | 5.4174640 | 0.0000001 |
| CAR_AGE | -189.532082 | 75.41210 | -2.5132847 | 0.0119869 |
| HOME_VAL_x_BLUEBOOK | 119.554486 | 148.53158 | 0.8049095 | 0.4209033 |
| PARENT1_other | 217.357408 | 78.80951 | 2.7580097 | 0.0058328 |
| MSTATUS_z_No | 293.064865 | 82.89714 | 3.5352833 | 0.0004104 |
| SEX_z_F | -13.280585 | 87.88861 | -0.1511070 | 0.8798963 |
| EDUCATION_Masters | 101.220167 | 74.52705 | 1.3581668 | 0.1744611 |
| EDUCATION_z_High.School | 75.666427 | 75.04558 | 1.0082729 | 0.3133635 |
| EDUCATION_other | -16.229751 | 70.90674 | -0.2288887 | 0.8189631 |
| JOB_other | -114.808391 | 70.43279 | -1.6300418 | 0.1031444 |
| CAR_USE_Private | -294.780450 | 78.27027 | -3.7661869 | 0.0001673 |
| CAR_TYPE_z_SUV | 235.526008 | 82.40438 | 2.8581734 | 0.0042753 |
| CAR_TYPE_other | 339.334834 | 76.59836 | 4.4300536 | 0.0000096 |
| RED_CAR_yes | -4.907846 | 77.91309 | -0.0629913 | 0.9497755 |
| CLM_FREQ_other | 258.798790 | 81.92051 | 3.1591451 | 0.0015901 |
| REVOKED_other | 171.199806 | 65.96291 | 2.5953949 | 0.0094709 |
| URBANICITY_z_Highly.Rural..Rural | -610.151575 | 63.55595 | -9.6002280 | 0.0000000 |
par(mfrow=c(2,2)) # plot all 4 plots in one
plot(test_fit$fit,
pch = 16, # optional parameters to make points blue
col = '#006EA1')#knitr::kable(tidy(workflow_best_final_fit$.workflow[[1]]))FINAL Predictions on VALIDATION Dataset
Let’s predict on the validation set
my_predictions1 <- workflow_best_final_fit$.workflow[[1]] %>%
predict(df_validation)replace all cases TARGET_FLAG=0 with 0 TARGET_AMT
For this we will need to load our predictions for ACCIDENT (TARGET_FLAG)
df_class <- read_csv("JF_predictions.csv")
head(df_class)## # A tibble: 6 × 3
## ...1 value pred_class
## <dbl> <dbl> <dbl>
## 1 1 3 0
## 2 2 9 0
## 3 3 10 0
## 4 4 18 0
## 5 5 21 0
## 6 6 30 0
my_predictions1$class <- df_class$pred_class
my_predictions1$INDEX <- df_class$value
my_predictions1 <- my_predictions1 %>%
mutate(.pred=ifelse(class==0,0, .pred))
my_predictions1 <- my_predictions1[,c(3,2,1)]
head(my_predictions1)## # A tibble: 6 × 3
## INDEX class .pred
## <dbl> <dbl> <dbl>
## 1 3 0 0
## 2 9 0 0
## 3 10 0 0
## 4 18 0 0
## 5 21 0 0
## 6 30 0 0
Save the data in .CSV
write.csv(my_predictions1,"JF_predictions_all.csv")Our prections of accident/no accident AND amount are in a single file calle JF_predictions_all.csv