DATA 607 Final Project: Random Forest

David Simbandumwe

Overview

Applying a random forest model to the survey data using the same variables from the linear regression models.

Setup

set.seed(1234)
rm(list=ls())

Consumer Financial Protection Bureau Survey

Download and Tidy dataset from CFPB

# get cfpb file
cfpb_df <- getCFPBFile()
## The following `from` values were not present in `x`: -1, 1, 2, 3, 4, 5, 6, 7, 98, 99
cfpb_df$cfpb_score_4cat <- cut(cfpb_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
                           labels = c("< 40","40-60","60-80","80-100"),
                           right = FALSE,
                           include.lowest=TRUE) 
cfpb_df <- cfpb_df %>% filter(cfpb_score >= 0)

# reduce cfpb data set
cfpb_df <- slice_sample(cfpb_df, weight_by=cfpb_score_4cat ,n=4000) 
#cfpb_df <- cfpb_df %>% select(cfpb_score, cfpb_score_4cat, econ_save_rate, house_mortgage, age_8cat, econ_hh_income)
cfpb_df <- cfpb_df %>% select(cfpb_score_4cat, econ_save_rate, house_mortgage, age_8cat, econ_hh_income)



vis_miss(cfpb_df, sort_miss = TRUE)

prepare data

# Put 3/4 of the data into the training set 
cfpb_split <- initial_split(cfpb_df, prop = 0.8, strata = cfpb_score_4cat)

# Create dataframes for the two sets:
cfpb_train_data <- training(cfpb_split) 
cfpb_test_data <- testing(cfpb_split)


# define recipts 
cfpb_rec <-
  recipe(cfpb_score_4cat ~ econ_save_rate + house_mortgage + age_8cat + econ_hh_income, 
         data = cfpb_train_data) %>%
  step_naomit(everything(), skip = TRUE) %>% 
  step_upsample(cfpb_score_4cat, over_ratio = .5) %>%
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_corr(all_predictors(), threshold = 0.7, method = "spearman") 


summary(cfpb_rec)
## # A tibble: 5 × 4
##   variable        type    role      source  
##   <chr>           <chr>   <chr>     <chr>   
## 1 econ_save_rate  nominal predictor original
## 2 house_mortgage  nominal predictor original
## 3 age_8cat        nominal predictor original
## 4 econ_hh_income  nominal predictor original
## 5 cfpb_score_4cat nominal outcome   original
# folds and spec
cv_folds <-
 vfold_cv(cfpb_train_data, 
          v = 5, 
          strata = cfpb_score_4cat) 


rf_spec <- 
  rand_forest() %>% 
  set_engine("ranger", importance = "impurity") %>% 
  set_mode("classification")

tune workflow

# workflow
rf_wflow <-
 workflow() %>%
 add_recipe(cfpb_rec) %>% 
 add_model(rf_spec) 


# resample
rf_res <-
  rf_wflow %>% 
  fit_resamples(
    resamples = cv_folds, 
    metrics = metric_set(recall, precision, f_meas,accuracy, kap,roc_auc, sens, spec),
    control = control_resamples(save_pred = TRUE)
    ) 
rf_res %>%  collect_metrics(summarize = TRUE)
## # A tibble: 8 × 6
##   .metric   .estimator  mean     n std_err .config             
##   <chr>     <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy  multiclass 0.526     5 0.0121  Preprocessor1_Model1
## 2 f_meas    macro      0.448     5 0.0110  Preprocessor1_Model1
## 3 kap       multiclass 0.264     5 0.0167  Preprocessor1_Model1
## 4 precision macro      0.436     5 0.00999 Preprocessor1_Model1
## 5 recall    macro      0.475     5 0.0132  Preprocessor1_Model1
## 6 roc_auc   hand_till  0.778     5 0.00698 Preprocessor1_Model1
## 7 sens      macro      0.475     5 0.0132  Preprocessor1_Model1
## 8 spec      macro      0.817     5 0.00428 Preprocessor1_Model1
rf_metrics <- 
  rf_res %>% 
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "Random Forest")

Final Fit

last_fit_rf <- last_fit(rf_wflow, 
                        split = cfpb_split,
                        metrics = metric_set(recall, precision, f_meas,accuracy, kap, roc_auc, sens, spec)
                        )


last_fit_rf %>% 
  collect_metrics()
## # A tibble: 8 × 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 recall    macro          0.456 Preprocessor1_Model1
## 2 precision macro          0.400 Preprocessor1_Model1
## 3 f_meas    macro          0.411 Preprocessor1_Model1
## 4 accuracy  multiclass     0.519 Preprocessor1_Model1
## 5 kap       multiclass     0.241 Preprocessor1_Model1
## 6 sens      macro          0.456 Preprocessor1_Model1
## 7 spec      macro          0.814 Preprocessor1_Model1
## 8 roc_auc   hand_till      0.788 Preprocessor1_Model1
last_fit_rf %>% 
  pluck(".workflow", 1) %>%   
  extract_fit_parsnip() %>% 
  vip(num_features = 15) +
  labs(title = "Variable Importance (cfpb data set")  

last_fit_rf %>%
  collect_predictions() %>% 
  conf_mat(cfpb_score_4cat, .pred_class) %>% 
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix (cfpb data set)")

Federal Reserve System Survey

Download and Tidy dataset from FED

# get fed file
fed_df <- getFedFile()

#filter data
fed_df <- fed_df %>% drop_na()
fed_df <- fed_df %>% filter(!(econ_saving == "" |  credit_guess =="" | health ==""))
fed_df$credit_guess <- str_replace_all(fed_df$credit_guess, "[^[:alnum:]]", "")


fed_df$cfpb_score_4cat <- cut(fed_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
                           labels = c("< 40","40-60","60-80","80-100"),
                           right = FALSE,
                           include.lowest=TRUE) 

# reduce cfpb dataset
fed_df <- slice_sample(fed_df, weight_by=cfpb_score_4cat ,n=4000) 

fed_df <- fed_df %>% select(cfpb_score, cfpb_score_4cat,age_7cat, econ_saving, econ_inc_4cat, econ_fin_ok, 
                            econ_pay_exp400, econ_skip_med)



vis_miss(fed_df, sort_miss = TRUE)

prepare data

# Put 3/4 of the data into the training set 
fed_split <- initial_split(fed_df, 
                           prop = .8, 
                           strata = cfpb_score_4cat)

# Create dataframes for the two sets:
fed_train_data <- training(fed_split) 
fed_test_data <- testing(fed_split)



# define recipts 
fed_rec <-
  recipe(cfpb_score_4cat ~ age_7cat, econ_saving, econ_inc_4cat, econ_fin_ok, econ_pay_exp400, econ_skip_med, 
         data = fed_train_data) %>%
  step_naomit(everything(), skip = TRUE) %>% 
  step_upsample(cfpb_score_4cat, over_ratio = .5) %>%    
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_corr(all_predictors(), threshold = 0.7, method = "spearman", skip = TRUE) 


summary(fed_rec)
## # A tibble: 2 × 4
##   variable        type    role      source  
##   <chr>           <chr>   <chr>     <chr>   
## 1 age_7cat        nominal predictor original
## 2 cfpb_score_4cat nominal outcome   original
# folds and spec
cv_fed_folds <-
 vfold_cv(fed_train_data, 
          v = 5, 
          strata = cfpb_score_4cat) 


rf_fed_spec <- 
  rand_forest() %>% 
  set_engine("ranger", importance = "impurity") %>% 
  set_mode("classification")

tune workflow

# workflow
rf_fed_wflow <-
 workflow() %>%
 add_recipe(fed_rec) %>% 
 add_model(rf_fed_spec) 


# resample
rf_fed_res <-
  rf_fed_wflow %>% 
  fit_resamples(
    resamples = cv_fed_folds, 
    metrics = metric_set(recall, precision, f_meas,accuracy, kap,roc_auc, sens, spec),
    control = control_resamples(save_pred = TRUE)
    ) 
## ! Fold1: internal: While computing multiclass `precision()`, some levels had no p...
## ! Fold2: internal: While computing multiclass `precision()`, some levels had no p...
## ! Fold3: internal: While computing multiclass `precision()`, some levels had no p...
## ! Fold4: internal: While computing multiclass `precision()`, some levels had no p...
## ! Fold5: internal: While computing multiclass `precision()`, some levels had no p...
rf_fed_res %>%  collect_metrics(summarize = TRUE)
## # A tibble: 8 × 6
##   .metric   .estimator   mean     n std_err .config             
##   <chr>     <chr>       <dbl> <int>   <dbl> <chr>               
## 1 accuracy  multiclass 0.447      5 0.0104  Preprocessor1_Model1
## 2 f_meas    macro      0.409      5 0.0267  Preprocessor1_Model1
## 3 kap       multiclass 0.0903     5 0.00802 Preprocessor1_Model1
## 4 precision macro      0.407      5 0.0182  Preprocessor1_Model1
## 5 recall    macro      0.302      5 0.00402 Preprocessor1_Model1
## 6 roc_auc   hand_till  0.643      5 0.00841 Preprocessor1_Model1
## 7 sens      macro      0.302      5 0.00402 Preprocessor1_Model1
## 8 spec      macro      0.772      5 0.00225 Preprocessor1_Model1
rf_fed_metrics <- 
  rf_fed_res %>% 
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "Random Forest")

Final Fit

last_fit_fed_rf <- last_fit(rf_fed_wflow, 
                        split = fed_split,
                        metrics = metric_set(recall, precision, f_meas,accuracy, kap, roc_auc, sens, spec)
                        )
## ! train/test split: internal: While computing multiclass `precision()`, some levels had no p...
last_fit_fed_rf %>% 
  collect_metrics()
## # A tibble: 8 × 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 recall    macro          0.313 Preprocessor1_Model1
## 2 precision macro          0.406 Preprocessor1_Model1
## 3 f_meas    macro          0.391 Preprocessor1_Model1
## 4 accuracy  multiclass     0.466 Preprocessor1_Model1
## 5 kap       multiclass     0.115 Preprocessor1_Model1
## 6 sens      macro          0.313 Preprocessor1_Model1
## 7 spec      macro          0.778 Preprocessor1_Model1
## 8 roc_auc   hand_till      0.616 Preprocessor1_Model1
last_fit_fed_rf %>% 
  pluck(".workflow", 1) %>%   
  extract_fit_parsnip() %>% 
  vip(num_features = 10) +
  labs(title = "Variable Importance (Fed data set)")    

last_fit_fed_rf %>%
  collect_predictions() %>% 
  conf_mat(cfpb_score_4cat, .pred_class) %>% 
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix (Fed data set)")

Conclusion

The structure of the random forest model dictates using a categorical predictions. To support this model a factor representation of the cfpb score was create with 4 categories. The resulting model has a 0.4720 precision and fails to predict any values score less than 40 or a score over 80.