DATA 607 Final Project: Neural Network

David Simbandumwe

Overview

Applying a neural network to the survey data using the same variables from the linear regression models.

Setup

set.seed(1234)
rm(list=ls())

Consumer Financial Protection Bureau Survey

Download and Tidy dataset from CFPB

# get cfpb file
cfpb_df <- getCFPBFile()
## The following `from` values were not present in `x`: -1, 1, 2, 3, 4, 5, 6, 7, 98, 99
cfpb_df$cfpb_score_4cat <- cut(cfpb_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
                           labels = c("< 40","40-60","60-80","80-100"),
                           right = FALSE,
                           include.lowest=TRUE) 
cfpb_df <- cfpb_df %>% filter(cfpb_score >= 0)

# reduce cfpb data set
cfpb_df <- slice_sample(cfpb_df, weight_by=cfpb_score_4cat ,n=5000) 
cfpb_df <- cfpb_df %>% select(cfpb_score, econ_save_rate, house_mortgage, age_8cat, econ_hh_income)

summary(cfpb_df$cfpb_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   49.00   58.00   57.81   66.00   95.00
# prepare data for analysis
concrete_norm <- cfpb_df
concrete_norm$cfpb_score <- normalize(concrete_norm$cfpb_score)

dmy <- dummyVars(" ~ .", data = concrete_norm, fullRank = T)
dat_transformed <- data.frame(predict(dmy, newdata = concrete_norm))
glimpse(dat_transformed)
## Rows: 5,000
## Columns: 31
## $ cfpb_score                          <dbl> 0.6296296, 0.5555556, 0.3827160, 0…
## $ econ_save_rate.0                    <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..1.99                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..100.999             <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ econ_save_rate..1.000.4.999         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1…
## $ econ_save_rate..5.000.19.999        <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ econ_save_rate..20.000.74.999       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..75.000.or.more      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0…
## $ econ_save_rate.I.don.t.know         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate.Prefer.not.to.say    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ house_mortgage.Refused              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ house_mortgage.Less.than..50.000    <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0…
## $ house_mortgage..50.000.199.999      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage..200.000.or.more     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage.I.don.t.know         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage.Prefer.not.to.say    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ age_8cat.25.34                      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ age_8cat.35.44                      <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0…
## $ age_8cat.45.54                      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ age_8cat.55.61                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ age_8cat.62.69                      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ age_8cat.70.74                      <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ age_8cat.75.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ econ_hh_income..20.000.to..29.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ econ_hh_income..30.000.to..39.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..40.000.to..49.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1…
## $ econ_hh_income..50.000.to..59.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..60.000.to..74.999   <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..75.000.to..99.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..100.000.to..149.999 <dbl> 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0…
## $ econ_hh_income..150.000.or.more     <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
summary(concrete_norm$cfpb_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.4321  0.5432  0.5408  0.6420  1.0000
# split data
split <- initial_split(dat_transformed, prop = .8, strata = cfpb_score)

train_data <- training(split) 
test_data <- testing(split)

Train Model

# create model
nn <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, err.fct="sse")
nn$result.matrix
##                                                          [,1]
## error                                            3.213177e+01
## reached.threshold                                9.841321e-03
## steps                                            6.790000e+03
## Intercept.to.1layhid1                           -1.743017e+00
## econ_save_rate.0.to.1layhid1                    -1.178150e+00
## econ_save_rate..1.99.to.1layhid1                -1.183444e+00
## econ_save_rate..100.999.to.1layhid1             -5.575820e-01
## econ_save_rate..1.000.4.999.to.1layhid1         -1.158569e-01
## econ_save_rate..5.000.19.999.to.1layhid1         2.212034e-01
## econ_save_rate..20.000.74.999.to.1layhid1        3.900005e-01
## econ_save_rate..75.000.or.more.to.1layhid1       5.297376e-01
## econ_save_rate.I.don.t.know.to.1layhid1         -6.499695e-02
## econ_save_rate.Prefer.not.to.say.to.1layhid1     2.383573e-01
## house_mortgage.Refused.to.1layhid1               3.388674e-01
## house_mortgage.Less.than..50.000.to.1layhid1     2.727791e-01
## house_mortgage..50.000.199.999.to.1layhid1       1.180487e-01
## house_mortgage..200.000.or.more.to.1layhid1      8.166761e-02
## house_mortgage.I.don.t.know.to.1layhid1         -8.557511e-02
## house_mortgage.Prefer.not.to.say.to.1layhid1     2.025904e-01
## age_8cat.25.34.to.1layhid1                       2.385232e-02
## age_8cat.35.44.to.1layhid1                      -2.894418e-02
## age_8cat.45.54.to.1layhid1                      -7.498905e-02
## age_8cat.55.61.to.1layhid1                      -4.188472e-02
## age_8cat.62.69.to.1layhid1                       2.316219e-01
## age_8cat.70.74.to.1layhid1                       2.918574e-01
## age_8cat.75..to.1layhid1                         3.577549e-01
## econ_hh_income..20.000.to..29.999.to.1layhid1   -7.205608e-02
## econ_hh_income..30.000.to..39.999.to.1layhid1    6.910612e-02
## econ_hh_income..40.000.to..49.999.to.1layhid1    2.190871e-01
## econ_hh_income..50.000.to..59.999.to.1layhid1    2.998847e-01
## econ_hh_income..60.000.to..74.999.to.1layhid1    2.397054e-01
## econ_hh_income..75.000.to..99.999.to.1layhid1    3.503936e-01
## econ_hh_income..100.000.to..149.999.to.1layhid1  3.992277e-01
## econ_hh_income..150.000.or.more.to.1layhid1      5.402518e-01
## Intercept.to.cfpb_score                          2.909699e-01
## 1layhid1.to.cfpb_score                           1.032146e+00
plot(nn)

Test Model

len <- length(names(test_data))

results <- compute(nn, test_data[2:len])

predicted_score <- results$net.resul
t1 <- tibble(
    predict = results$net.result,
    actual = test_data$cfpb_score
)
t1 <- t1 %>% mutate (diff = predict - actual)

summary(t1)
##      predict.V1          actual             diff.V1       
##  Min.   :0.3368361   Min.   :0.0000   Min.   :-0.4453194  
##  1st Qu.:0.4624328   1st Qu.:0.4321   1st Qu.:-0.0738248  
##  Median :0.5489829   Median :0.5432   Median : 0.0111306  
##  Mean   :0.5420495   Mean   :0.5362   Mean   : 0.0058749  
##  3rd Qu.:0.6200608   3rd Qu.:0.6420   3rd Qu.: 0.0897470  
##  Max.   :0.7960797   Max.   :1.0000   Max.   : 0.3953917
ggplot(data = t1,
       mapping = aes(x = predict, y = actual)) +
  geom_point(color = '#006EA1') +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +    
  labs(title = 'Neural Net (cfpb data set) ',
       x = 'Predicted',
       y = 'Actual')

ggplot(data = t1) +
  geom_boxplot(mapping = aes(x=diff)) +
  labs (title = "Std Error - Neural Net (cfpb data set)")

ggplot(t1,
    aes(diff, y = stat(density))) +
    geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
    geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
    scale_fill_brewer(palette="Spectral") +
    labs(title = "Std Error - Neural Net (cfpb data set)")

Adding 5 Hidden layers to improve model performance.

nn2 <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, hidden = 5)
nn$result.matrix
##                                                          [,1]
## error                                            3.213177e+01
## reached.threshold                                9.841321e-03
## steps                                            6.790000e+03
## Intercept.to.1layhid1                           -1.743017e+00
## econ_save_rate.0.to.1layhid1                    -1.178150e+00
## econ_save_rate..1.99.to.1layhid1                -1.183444e+00
## econ_save_rate..100.999.to.1layhid1             -5.575820e-01
## econ_save_rate..1.000.4.999.to.1layhid1         -1.158569e-01
## econ_save_rate..5.000.19.999.to.1layhid1         2.212034e-01
## econ_save_rate..20.000.74.999.to.1layhid1        3.900005e-01
## econ_save_rate..75.000.or.more.to.1layhid1       5.297376e-01
## econ_save_rate.I.don.t.know.to.1layhid1         -6.499695e-02
## econ_save_rate.Prefer.not.to.say.to.1layhid1     2.383573e-01
## house_mortgage.Refused.to.1layhid1               3.388674e-01
## house_mortgage.Less.than..50.000.to.1layhid1     2.727791e-01
## house_mortgage..50.000.199.999.to.1layhid1       1.180487e-01
## house_mortgage..200.000.or.more.to.1layhid1      8.166761e-02
## house_mortgage.I.don.t.know.to.1layhid1         -8.557511e-02
## house_mortgage.Prefer.not.to.say.to.1layhid1     2.025904e-01
## age_8cat.25.34.to.1layhid1                       2.385232e-02
## age_8cat.35.44.to.1layhid1                      -2.894418e-02
## age_8cat.45.54.to.1layhid1                      -7.498905e-02
## age_8cat.55.61.to.1layhid1                      -4.188472e-02
## age_8cat.62.69.to.1layhid1                       2.316219e-01
## age_8cat.70.74.to.1layhid1                       2.918574e-01
## age_8cat.75..to.1layhid1                         3.577549e-01
## econ_hh_income..20.000.to..29.999.to.1layhid1   -7.205608e-02
## econ_hh_income..30.000.to..39.999.to.1layhid1    6.910612e-02
## econ_hh_income..40.000.to..49.999.to.1layhid1    2.190871e-01
## econ_hh_income..50.000.to..59.999.to.1layhid1    2.998847e-01
## econ_hh_income..60.000.to..74.999.to.1layhid1    2.397054e-01
## econ_hh_income..75.000.to..99.999.to.1layhid1    3.503936e-01
## econ_hh_income..100.000.to..149.999.to.1layhid1  3.992277e-01
## econ_hh_income..150.000.or.more.to.1layhid1      5.402518e-01
## Intercept.to.cfpb_score                          2.909699e-01
## 1layhid1.to.cfpb_score                           1.032146e+00
plot(nn2)
len <- length(names(test_data))

results2 <- compute(nn2, test_data[2:len])

predicted_score2 <- results2$net.result
t2 <- tibble(
    predict = results2$net.result,
    actual = test_data$cfpb_score
)
t2 <- t2 %>% mutate (diff = predict - actual)

summary(t2)
##      predict.V1          actual             diff.V1       
##  Min.   :0.2644693   Min.   :0.0000   Min.   :-0.5058503  
##  1st Qu.:0.4619308   1st Qu.:0.4321   1st Qu.:-0.0772526  
##  Median :0.5565810   Median :0.5432   Median : 0.0106831  
##  Mean   :0.5417271   Mean   :0.5362   Mean   : 0.0055526  
##  3rd Qu.:0.6157857   3rd Qu.:0.6420   3rd Qu.: 0.0913219  
##  Max.   :0.7879452   Max.   :1.0000   Max.   : 0.4542833
ggplot(data = t2,
       mapping = aes(x = predict, y = actual)) +
  geom_point(color = '#006EA1') +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +    
  labs(title = 'Neural Net - 5 hidden layers (cfpb data set)',
       x = 'Predicted',
       y = 'Actual')

ggplot(data = t2) +
  geom_boxplot(mapping = aes(x=diff)) +
  labs (title = "Std Error - Neural Net (cfpb data set)")

ggplot(t2,
    aes(diff, y = stat(density))) +
    geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
    geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
    scale_fill_brewer(palette="Spectral") +
    labs(title = "Std Error - Neural Net (cfpb data set)")

Federal Reserve System Survey

Download and Tidy dataset from FED

# get fed file
fed_df <- getFedFile()

#filter data
fed_df <- fed_df %>% drop_na()
fed_df <- fed_df %>% filter(!(econ_saving == "" |  credit_guess =="" | health ==""))
fed_df$credit_guess <- str_replace_all(fed_df$credit_guess, "[^[:alnum:]]", "")


fed_df$cfpb_score_4cat <- cut(fed_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
                           labels = c("< 40","40-60","60-80","80-100"),
                           right = FALSE,
                           include.lowest=TRUE) 

# reduce cfpb dataset
fed_df <- slice_sample(fed_df, weight_by=cfpb_score_4cat ,n=5000) 

fed_df <- fed_df %>% select(cfpb_score, age_7cat, econ_saving, econ_inc_4cat, econ_fin_ok, 
                            econ_pay_exp400, econ_skip_med)

summary(fed_df$cfpb_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.09   70.00   90.00
# prepare data for analysis
concrete_norm <- fed_df
concrete_norm$age <- normalize(as.numeric(concrete_norm$age))
concrete_norm$cfpb_score <- normalize(as.numeric(concrete_norm$cfpb_score)) 


dmy <- dummyVars(" ~ .", data = concrete_norm, fullRank = T)
dat_transformed <- data.frame(predict(dmy, newdata = concrete_norm))
glimpse(dat_transformed)
## Rows: 5,000
## Columns: 26
## $ cfpb_score                      <dbl> 0.3802817, 0.7464789, 0.4788732, 0.577…
## $ age_7cat.25.34                  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ age_7cat.35.44                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ age_7cat.45.54                  <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ age_7cat.55.64                  <dbl> 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,…
## $ age_7cat.65.74                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ age_7cat.75.                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,…
## $ econ_saving..1.000.000.or.more  <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,…
## $ econ_saving..100.000....249.999 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving..250.000....499.999 <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ econ_saving..50.000....99.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,…
## $ econ_saving..500.000....999.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Not.sure            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Refused             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Under..50.000       <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,…
## $ econ_inc_4cat..25.000..49.999   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_inc_4cat..50.000..99.999   <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,…
## $ econ_inc_4cat.Less.than..25.000 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ econ_inc_4cat.Refused           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_fin_ok.Refused             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_fin_ok.Yes                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ econ_pay_exp400.Refused         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_pay_exp400.Yes             <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ econ_skip_med.Refused           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_skip_med.Yes               <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ age                             <dbl> 0.5000000, 0.5000000, 0.1666667, 0.666…
summary(dat_transformed$cfpb_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.4366  0.5634  0.5788  0.7183  1.0000
# split data
split <- initial_split(dat_transformed, prop = .8, strata = cfpb_score)

train_data <- training(split) 
test_data <- testing(split)

Train Model

# create model
nn <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, err.fct="sse")
nn$result.matrix
##                                                      [,1]
## error                                        4.227016e+01
## reached.threshold                            8.030992e-03
## steps                                        1.341700e+04
## Intercept.to.1layhid1                       -1.289955e+00
## age_7cat.25.34.to.1layhid1                   5.354369e-02
## age_7cat.35.44.to.1layhid1                   3.039263e-02
## age_7cat.45.54.to.1layhid1                   6.163244e-02
## age_7cat.55.64.to.1layhid1                   2.259856e-01
## age_7cat.65.74.to.1layhid1                   4.370609e-01
## age_7cat.75..to.1layhid1                     5.417223e-01
## econ_saving..1.000.000.or.more.to.1layhid1  -6.097535e-01
## econ_saving..100.000....249.999.to.1layhid1 -8.942505e-01
## econ_saving..250.000....499.999.to.1layhid1 -8.558654e-01
## econ_saving..50.000....99.999.to.1layhid1   -8.995392e-01
## econ_saving..500.000....999.999.to.1layhid1 -7.616772e-01
## econ_saving.Not.sure.to.1layhid1            -8.764564e-01
## econ_saving.Refused.to.1layhid1             -7.783528e-01
## econ_saving.Under..50.000.to.1layhid1       -1.020066e+00
## econ_inc_4cat..25.000..49.999.to.1layhid1   -1.648763e-01
## econ_inc_4cat..50.000..99.999.to.1layhid1   -8.974232e-02
## econ_inc_4cat.Less.than..25.000.to.1layhid1 -1.647020e-01
## econ_inc_4cat.Refused.to.1layhid1           -2.452059e-02
## econ_fin_ok.Refused.to.1layhid1              9.213874e-01
## econ_fin_ok.Yes.to.1layhid1                  8.089211e-01
## econ_pay_exp400.Refused.to.1layhid1          2.035839e-01
## econ_pay_exp400.Yes.to.1layhid1              3.546085e-01
## econ_skip_med.Refused.to.1layhid1            8.317161e-02
## econ_skip_med.Yes.to.1layhid1               -4.738707e-01
## age.to.1layhid1                             -1.818677e-01
## Intercept.to.cfpb_score                      2.193678e-01
## 1layhid1.to.cfpb_score                       1.535307e+00
plot(nn)

Test Model

len <- length(names(train_data))

results <- compute(nn, test_data[2:len])

predicted_score <- results$net.result
t1 <- tibble(
    predict = results$net.result,
    actual = test_data$cfpb_score
)
t1 <- t1 %>% mutate (diff = predict - actual)

summary(t1)
##      predict.V1          actual             diff.V1       
##  Min.   :0.2936517   Min.   :0.0000   Min.   :-0.4806259  
##  1st Qu.:0.4833388   1st Qu.:0.4366   1st Qu.:-0.0813013  
##  Median :0.5961433   Median :0.5634   Median : 0.0087107  
##  Mean   :0.5780807   Mean   :0.5743   Mean   : 0.0038208  
##  3rd Qu.:0.6753420   3rd Qu.:0.7183   3rd Qu.: 0.0939826  
##  Max.   :0.8442590   Max.   :1.0000   Max.   : 0.4143745
ggplot(data = t1,
       mapping = aes(x = predict, y = actual)) +
  geom_point(color = '#006EA1') +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +    
  labs(title = 'Std Error - Neural Net (FED data set)',
       x = 'Predicted',
       y = 'Actual')

ggplot(data = t1) +
  geom_boxplot(mapping = aes(x=diff)) +
  labs (title = "Std Error - Neural Net (FED data set)")

ggplot(t1,
    aes(diff, y = stat(density))) +
    geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
    geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
    scale_fill_brewer(palette="Spectral") +
    labs(title = "Std Error - Neural Net (FED data set)")

Adding 5 Hidden layers to improve model performance.

nn2 <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, hidden = 5)
nn$result.matrix
##                                                      [,1]
## error                                        4.227016e+01
## reached.threshold                            8.030992e-03
## steps                                        1.341700e+04
## Intercept.to.1layhid1                       -1.289955e+00
## age_7cat.25.34.to.1layhid1                   5.354369e-02
## age_7cat.35.44.to.1layhid1                   3.039263e-02
## age_7cat.45.54.to.1layhid1                   6.163244e-02
## age_7cat.55.64.to.1layhid1                   2.259856e-01
## age_7cat.65.74.to.1layhid1                   4.370609e-01
## age_7cat.75..to.1layhid1                     5.417223e-01
## econ_saving..1.000.000.or.more.to.1layhid1  -6.097535e-01
## econ_saving..100.000....249.999.to.1layhid1 -8.942505e-01
## econ_saving..250.000....499.999.to.1layhid1 -8.558654e-01
## econ_saving..50.000....99.999.to.1layhid1   -8.995392e-01
## econ_saving..500.000....999.999.to.1layhid1 -7.616772e-01
## econ_saving.Not.sure.to.1layhid1            -8.764564e-01
## econ_saving.Refused.to.1layhid1             -7.783528e-01
## econ_saving.Under..50.000.to.1layhid1       -1.020066e+00
## econ_inc_4cat..25.000..49.999.to.1layhid1   -1.648763e-01
## econ_inc_4cat..50.000..99.999.to.1layhid1   -8.974232e-02
## econ_inc_4cat.Less.than..25.000.to.1layhid1 -1.647020e-01
## econ_inc_4cat.Refused.to.1layhid1           -2.452059e-02
## econ_fin_ok.Refused.to.1layhid1              9.213874e-01
## econ_fin_ok.Yes.to.1layhid1                  8.089211e-01
## econ_pay_exp400.Refused.to.1layhid1          2.035839e-01
## econ_pay_exp400.Yes.to.1layhid1              3.546085e-01
## econ_skip_med.Refused.to.1layhid1            8.317161e-02
## econ_skip_med.Yes.to.1layhid1               -4.738707e-01
## age.to.1layhid1                             -1.818677e-01
## Intercept.to.cfpb_score                      2.193678e-01
## 1layhid1.to.cfpb_score                       1.535307e+00
plot(nn2)
len <- length(names(train_data))

results2 <- compute(nn2, test_data[2:len])

predicted_score2 <- results2$net.result
t2 <- tibble(
    predict = results2$net.result,
    actual = test_data$cfpb_score
)
t2 <- t2 %>% mutate (  
    diff = actual - predict,
    dev = (actual - predict) / actual
    )
#accuracy_res = 1-abs(mean(t2$dev))
summary(t2)
##      predict.V1          actual             diff.V1               dev.V1       
##  Min.   :0.1600280   Min.   :0.0000   Min.   :-0.4152538   Min.   :      -Inf  
##  1st Qu.:0.4765999   1st Qu.:0.4366   1st Qu.:-0.0992048   1st Qu.:-0.2133509  
##  Median :0.5954335   Median :0.5634   Median :-0.0111548   Median :-0.0188526  
##  Mean   :0.5754922   Mean   :0.5743   Mean   :-0.0012323   Mean   :      -Inf  
##  3rd Qu.:0.6802764   3rd Qu.:0.7183   3rd Qu.: 0.1026609   3rd Qu.: 0.1525124  
##  Max.   :0.8865640   Max.   :1.0000   Max.   : 0.5121628   Max.   : 0.6861048
ggplot(data = t2,
       mapping = aes(x = predict, y = actual)) +
  geom_point(color = '#006EA1') +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +    
  labs(title = 'Neural Net 5 layers (FED data set) ',
       x = 'Predicted',
       y = 'Actual')

ggplot(data = t2) +
  geom_boxplot(mapping = aes(x=diff)) +
  labs (title = "Std Error - Neural Net (FED data set")

ggplot(t2,
    aes(diff, y = stat(density))) +
    geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
    geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
    scale_fill_brewer(palette="Spectral") +
    labs(title = "Std Error - Neural Net (FED data set)")

Conclusion

Developing the neural net was an interesting exercise however it might not be the most appropriate modeling approach for the project.