Overview

Applying a random forest model to the survey data using the same variables from the linear regression models.

Setup

set.seed(1234)
rm(list=ls())

Consumer Financial Protection Bureau Survey

Download and Tidy dataset from CFPB

# get cfpb file
cfpb_df <- getCFPBFile()

## The following `from` values were not present in `x`: -1, 1, 2, 3, 4, 5, 6, 7, 98, 99

cfpb_df$cfpb_score_4cat <- cut(cfpb_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
                           labels = c("< 40","40-60","60-80","80-100"),
                           right = FALSE,
                           include.lowest=TRUE) 
cfpb_df <- cfpb_df %>% filter(cfpb_score >= 0)

# reduce cfpb data set
cfpb_df <- slice_sample(cfpb_df, weight_by=cfpb_score_4cat ,n=4000) 
#cfpb_df <- cfpb_df %>% select(cfpb_score, cfpb_score_4cat, econ_save_rate, house_mortgage, age_8cat, econ_hh_income)
cfpb_df <- cfpb_df %>% select(cfpb_score_4cat, econ_save_rate, house_mortgage, age_8cat, econ_hh_income)



vis_miss(cfpb_df, sort_miss = TRUE)

prepare data

# Put 3/4 of the data into the training set 
cfpb_split <- initial_split(cfpb_df, prop = 0.8, strata = cfpb_score_4cat)

# Create dataframes for the two sets:
cfpb_train_data <- training(cfpb_split) 
cfpb_test_data <- testing(cfpb_split)


# define recipts 
cfpb_rec <-
  recipe(cfpb_score_4cat ~ econ_save_rate + house_mortgage + age_8cat + econ_hh_income, 
         data = cfpb_train_data) %>%
  step_naomit(everything(), skip = TRUE) %>% 
  step_upsample(cfpb_score_4cat, over_ratio = .5) %>%
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_corr(all_predictors(), threshold = 0.7, method = "spearman") 


summary(cfpb_rec)

## # A tibble: 5 × 4
##   variable        type    role      source  
##   <chr>           <chr>   <chr>     <chr>   
## 1 econ_save_rate  nominal predictor original
## 2 house_mortgage  nominal predictor original
## 3 age_8cat        nominal predictor original
## 4 econ_hh_income  nominal predictor original
## 5 cfpb_score_4cat nominal outcome   original

# folds and spec
cv_folds <-
 vfold_cv(cfpb_train_data, 
          v = 5, 
          strata = cfpb_score_4cat) 


rf_spec <- 
  rand_forest() %>% 
  set_engine("ranger", importance = "impurity") %>% 
  set_mode("classification")

tune workflow

# workflow
rf_wflow <-
 workflow() %>%
 add_recipe(cfpb_rec) %>% 
 add_model(rf_spec) 


# resample
rf_res <-
  rf_wflow %>% 
  fit_resamples(
    resamples = cv_folds, 
    metrics = metric_set(recall, precision, f_meas,accuracy, kap,roc_auc, sens, spec),
    control = control_resamples(save_pred = TRUE)
    ) 
rf_res %>%  collect_metrics(summarize = TRUE)

## # A tibble: 8 × 6
##   .metric   .estimator  mean     n std_err .config             
##   <chr>     <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy  multiclass 0.526     5 0.0121  Preprocessor1_Model1
## 2 f_meas    macro      0.448     5 0.0110  Preprocessor1_Model1
## 3 kap       multiclass 0.264     5 0.0167  Preprocessor1_Model1
## 4 precision macro      0.436     5 0.00999 Preprocessor1_Model1
## 5 recall    macro      0.475     5 0.0132  Preprocessor1_Model1
## 6 roc_auc   hand_till  0.778     5 0.00698 Preprocessor1_Model1
## 7 sens      macro      0.475     5 0.0132  Preprocessor1_Model1
## 8 spec      macro      0.817     5 0.00428 Preprocessor1_Model1

rf_metrics <- 
  rf_res %>% 
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "Random Forest")

Final Fit

last_fit_rf <- last_fit(rf_wflow, 
                        split = cfpb_split,
                        metrics = metric_set(recall, precision, f_meas,accuracy, kap, roc_auc, sens, spec)
                        )


last_fit_rf %>% 
  collect_metrics()

## # A tibble: 8 × 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 recall    macro          0.456 Preprocessor1_Model1
## 2 precision macro          0.400 Preprocessor1_Model1
## 3 f_meas    macro          0.411 Preprocessor1_Model1
## 4 accuracy  multiclass     0.519 Preprocessor1_Model1
## 5 kap       multiclass     0.241 Preprocessor1_Model1
## 6 sens      macro          0.456 Preprocessor1_Model1
## 7 spec      macro          0.814 Preprocessor1_Model1
## 8 roc_auc   hand_till      0.788 Preprocessor1_Model1

last_fit_rf %>% 
  pluck(".workflow", 1) %>%   
  extract_fit_parsnip() %>% 
  vip(num_features = 15) +
  labs(title = "Variable Importance (cfpb data set")

last_fit_rf %>%
  collect_predictions() %>% 
  conf_mat(cfpb_score_4cat, .pred_class) %>% 
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix (cfpb data set)")

Federal Reserve System Survey

Download and Tidy dataset from FED

# get fed file
fed_df <- getFedFile()

#filter data
fed_df <- fed_df %>% drop_na()
fed_df <- fed_df %>% filter(!(econ_saving == "" |  credit_guess =="" | health ==""))
fed_df$credit_guess <- str_replace_all(fed_df$credit_guess, "[^[:alnum:]]", "")


fed_df$cfpb_score_4cat <- cut(fed_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
                           labels = c("< 40","40-60","60-80","80-100"),
                           right = FALSE,
                           include.lowest=TRUE) 

# reduce cfpb dataset
fed_df <- slice_sample(fed_df, weight_by=cfpb_score_4cat ,n=4000) 

fed_df <- fed_df %>% select(cfpb_score, cfpb_score_4cat,age_7cat, econ_saving, econ_inc_4cat, econ_fin_ok, 
                            econ_pay_exp400, econ_skip_med)



vis_miss(fed_df, sort_miss = TRUE)

prepare data

# Put 3/4 of the data into the training set 
fed_split <- initial_split(fed_df, 
                           prop = .8, 
                           strata = cfpb_score_4cat)

# Create dataframes for the two sets:
fed_train_data <- training(fed_split) 
fed_test_data <- testing(fed_split)



# define recipts 
fed_rec <-
  recipe(cfpb_score_4cat ~ age_7cat, econ_saving, econ_inc_4cat, econ_fin_ok, econ_pay_exp400, econ_skip_med, 
         data = fed_train_data) %>%
  step_naomit(everything(), skip = TRUE) %>% 
  step_upsample(cfpb_score_4cat, over_ratio = .5) %>%    
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_corr(all_predictors(), threshold = 0.7, method = "spearman", skip = TRUE) 


summary(fed_rec)

## # A tibble: 2 × 4
##   variable        type    role      source  
##   <chr>           <chr>   <chr>     <chr>   
## 1 age_7cat        nominal predictor original
## 2 cfpb_score_4cat nominal outcome   original

# folds and spec
cv_fed_folds <-
 vfold_cv(fed_train_data, 
          v = 5, 
          strata = cfpb_score_4cat) 


rf_fed_spec <- 
  rand_forest() %>% 
  set_engine("ranger", importance = "impurity") %>% 
  set_mode("classification")

tune workflow

# workflow
rf_fed_wflow <-
 workflow() %>%
 add_recipe(fed_rec) %>% 
 add_model(rf_fed_spec) 


# resample
rf_fed_res <-
  rf_fed_wflow %>% 
  fit_resamples(
    resamples = cv_fed_folds, 
    metrics = metric_set(recall, precision, f_meas,accuracy, kap,roc_auc, sens, spec),
    control = control_resamples(save_pred = TRUE)
    )

## ! Fold1: internal: While computing multiclass `precision()`, some levels had no p...

## ! Fold2: internal: While computing multiclass `precision()`, some levels had no p...

## ! Fold3: internal: While computing multiclass `precision()`, some levels had no p...

## ! Fold4: internal: While computing multiclass `precision()`, some levels had no p...

## ! Fold5: internal: While computing multiclass `precision()`, some levels had no p...

rf_fed_res %>%  collect_metrics(summarize = TRUE)

## # A tibble: 8 × 6
##   .metric   .estimator   mean     n std_err .config             
##   <chr>     <chr>       <dbl> <int>   <dbl> <chr>               
## 1 accuracy  multiclass 0.447      5 0.0104  Preprocessor1_Model1
## 2 f_meas    macro      0.409      5 0.0267  Preprocessor1_Model1
## 3 kap       multiclass 0.0903     5 0.00802 Preprocessor1_Model1
## 4 precision macro      0.407      5 0.0182  Preprocessor1_Model1
## 5 recall    macro      0.302      5 0.00402 Preprocessor1_Model1
## 6 roc_auc   hand_till  0.643      5 0.00841 Preprocessor1_Model1
## 7 sens      macro      0.302      5 0.00402 Preprocessor1_Model1
## 8 spec      macro      0.772      5 0.00225 Preprocessor1_Model1

rf_fed_metrics <- 
  rf_fed_res %>% 
  collect_metrics(summarise = TRUE) %>%
  mutate(model = "Random Forest")

Final Fit

last_fit_fed_rf <- last_fit(rf_fed_wflow, 
                        split = fed_split,
                        metrics = metric_set(recall, precision, f_meas,accuracy, kap, roc_auc, sens, spec)
                        )

## ! train/test split: internal: While computing multiclass `precision()`, some levels had no p...

last_fit_fed_rf %>% 
  collect_metrics()

## # A tibble: 8 × 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 recall    macro          0.313 Preprocessor1_Model1
## 2 precision macro          0.406 Preprocessor1_Model1
## 3 f_meas    macro          0.391 Preprocessor1_Model1
## 4 accuracy  multiclass     0.466 Preprocessor1_Model1
## 5 kap       multiclass     0.115 Preprocessor1_Model1
## 6 sens      macro          0.313 Preprocessor1_Model1
## 7 spec      macro          0.778 Preprocessor1_Model1
## 8 roc_auc   hand_till      0.616 Preprocessor1_Model1

last_fit_fed_rf %>% 
  pluck(".workflow", 1) %>%   
  extract_fit_parsnip() %>% 
  vip(num_features = 10) +
  labs(title = "Variable Importance (Fed data set)")

last_fit_fed_rf %>%
  collect_predictions() %>% 
  conf_mat(cfpb_score_4cat, .pred_class) %>% 
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix (Fed data set)")

Conclusion

The structure of the random forest model dictates using a categorical predictions. To support this model a factor representation of the cfpb score was create with 4 categories. The resulting model has a 0.4720 precision and fails to predict any values score less than 40 or a score over 80.

DATA 607 Final Project: Random Forest

David Simbandumwe

Overview

Setup

Consumer Financial Protection Bureau Survey

Download and Tidy dataset from CFPB

prepare data

tune workflow

Final Fit

Federal Reserve System Survey

Download and Tidy dataset from FED

prepare data

tune workflow

Final Fit

Conclusion