Overview

Applying a neural network to the survey data using the same variables from the linear regression models.

Setup

set.seed(1234)
rm(list=ls())

Consumer Financial Protection Bureau Survey

Download and Tidy dataset from CFPB

# get cfpb file
cfpb_df <- getCFPBFile()

## The following `from` values were not present in `x`: -1, 1, 2, 3, 4, 5, 6, 7, 98, 99

cfpb_df$cfpb_score_4cat <- cut(cfpb_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
                           labels = c("< 40","40-60","60-80","80-100"),
                           right = FALSE,
                           include.lowest=TRUE) 
cfpb_df <- cfpb_df %>% filter(cfpb_score >= 0)

# reduce cfpb data set
cfpb_df <- slice_sample(cfpb_df, weight_by=cfpb_score_4cat ,n=5000) 
cfpb_df <- cfpb_df %>% select(cfpb_score, econ_save_rate, house_mortgage, age_8cat, econ_hh_income)

summary(cfpb_df$cfpb_score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   49.00   58.00   57.81   66.00   95.00

# prepare data for analysis
concrete_norm <- cfpb_df
concrete_norm$cfpb_score <- normalize(concrete_norm$cfpb_score)

dmy <- dummyVars(" ~ .", data = concrete_norm, fullRank = T)
dat_transformed <- data.frame(predict(dmy, newdata = concrete_norm))
glimpse(dat_transformed)

## Rows: 5,000
## Columns: 31
## $ cfpb_score                          <dbl> 0.6296296, 0.5555556, 0.3827160, 0…
## $ econ_save_rate.0                    <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..1.99                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..100.999             <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ econ_save_rate..1.000.4.999         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1…
## $ econ_save_rate..5.000.19.999        <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ econ_save_rate..20.000.74.999       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..75.000.or.more      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0…
## $ econ_save_rate.I.don.t.know         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate.Prefer.not.to.say    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ house_mortgage.Refused              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ house_mortgage.Less.than..50.000    <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0…
## $ house_mortgage..50.000.199.999      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage..200.000.or.more     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage.I.don.t.know         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage.Prefer.not.to.say    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ age_8cat.25.34                      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ age_8cat.35.44                      <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0…
## $ age_8cat.45.54                      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ age_8cat.55.61                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ age_8cat.62.69                      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ age_8cat.70.74                      <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ age_8cat.75.                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ econ_hh_income..20.000.to..29.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ econ_hh_income..30.000.to..39.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..40.000.to..49.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1…
## $ econ_hh_income..50.000.to..59.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..60.000.to..74.999   <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..75.000.to..99.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..100.000.to..149.999 <dbl> 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0…
## $ econ_hh_income..150.000.or.more     <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

summary(concrete_norm$cfpb_score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.4321  0.5432  0.5408  0.6420  1.0000

# split data
split <- initial_split(dat_transformed, prop = .8, strata = cfpb_score)

train_data <- training(split) 
test_data <- testing(split)

Train Model

# create model
nn <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, err.fct="sse")
nn$result.matrix

##                                                          [,1]
## error                                            3.213177e+01
## reached.threshold                                9.841321e-03
## steps                                            6.790000e+03
## Intercept.to.1layhid1                           -1.743017e+00
## econ_save_rate.0.to.1layhid1                    -1.178150e+00
## econ_save_rate..1.99.to.1layhid1                -1.183444e+00
## econ_save_rate..100.999.to.1layhid1             -5.575820e-01
## econ_save_rate..1.000.4.999.to.1layhid1         -1.158569e-01
## econ_save_rate..5.000.19.999.to.1layhid1         2.212034e-01
## econ_save_rate..20.000.74.999.to.1layhid1        3.900005e-01
## econ_save_rate..75.000.or.more.to.1layhid1       5.297376e-01
## econ_save_rate.I.don.t.know.to.1layhid1         -6.499695e-02
## econ_save_rate.Prefer.not.to.say.to.1layhid1     2.383573e-01
## house_mortgage.Refused.to.1layhid1               3.388674e-01
## house_mortgage.Less.than..50.000.to.1layhid1     2.727791e-01
## house_mortgage..50.000.199.999.to.1layhid1       1.180487e-01
## house_mortgage..200.000.or.more.to.1layhid1      8.166761e-02
## house_mortgage.I.don.t.know.to.1layhid1         -8.557511e-02
## house_mortgage.Prefer.not.to.say.to.1layhid1     2.025904e-01
## age_8cat.25.34.to.1layhid1                       2.385232e-02
## age_8cat.35.44.to.1layhid1                      -2.894418e-02
## age_8cat.45.54.to.1layhid1                      -7.498905e-02
## age_8cat.55.61.to.1layhid1                      -4.188472e-02
## age_8cat.62.69.to.1layhid1                       2.316219e-01
## age_8cat.70.74.to.1layhid1                       2.918574e-01
## age_8cat.75..to.1layhid1                         3.577549e-01
## econ_hh_income..20.000.to..29.999.to.1layhid1   -7.205608e-02
## econ_hh_income..30.000.to..39.999.to.1layhid1    6.910612e-02
## econ_hh_income..40.000.to..49.999.to.1layhid1    2.190871e-01
## econ_hh_income..50.000.to..59.999.to.1layhid1    2.998847e-01
## econ_hh_income..60.000.to..74.999.to.1layhid1    2.397054e-01
## econ_hh_income..75.000.to..99.999.to.1layhid1    3.503936e-01
## econ_hh_income..100.000.to..149.999.to.1layhid1  3.992277e-01
## econ_hh_income..150.000.or.more.to.1layhid1      5.402518e-01
## Intercept.to.cfpb_score                          2.909699e-01
## 1layhid1.to.cfpb_score                           1.032146e+00

plot(nn)

Test Model

len <- length(names(test_data))

results <- compute(nn, test_data[2:len])

predicted_score <- results$net.resul

t1 <- tibble(
    predict = results$net.result,
    actual = test_data$cfpb_score
)
t1 <- t1 %>% mutate (diff = predict - actual)

summary(t1)

##      predict.V1          actual             diff.V1       
##  Min.   :0.3368361   Min.   :0.0000   Min.   :-0.4453194  
##  1st Qu.:0.4624328   1st Qu.:0.4321   1st Qu.:-0.0738248  
##  Median :0.5489829   Median :0.5432   Median : 0.0111306  
##  Mean   :0.5420495   Mean   :0.5362   Mean   : 0.0058749  
##  3rd Qu.:0.6200608   3rd Qu.:0.6420   3rd Qu.: 0.0897470  
##  Max.   :0.7960797   Max.   :1.0000   Max.   : 0.3953917

ggplot(data = t1,
       mapping = aes(x = predict, y = actual)) +
  geom_point(color = '#006EA1') +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +    
  labs(title = 'Neural Net (cfpb data set) ',
       x = 'Predicted',
       y = 'Actual')

ggplot(data = t1) +
  geom_boxplot(mapping = aes(x=diff)) +
  labs (title = "Std Error - Neural Net (cfpb data set)")

ggplot(t1,
    aes(diff, y = stat(density))) +
    geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
    geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
    scale_fill_brewer(palette="Spectral") +
    labs(title = "Std Error - Neural Net (cfpb data set)")

Adding 5 Hidden layers to improve model performance.

nn2 <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, hidden = 5)
nn$result.matrix

##                                                          [,1]
## error                                            3.213177e+01
## reached.threshold                                9.841321e-03
## steps                                            6.790000e+03
## Intercept.to.1layhid1                           -1.743017e+00
## econ_save_rate.0.to.1layhid1                    -1.178150e+00
## econ_save_rate..1.99.to.1layhid1                -1.183444e+00
## econ_save_rate..100.999.to.1layhid1             -5.575820e-01
## econ_save_rate..1.000.4.999.to.1layhid1         -1.158569e-01
## econ_save_rate..5.000.19.999.to.1layhid1         2.212034e-01
## econ_save_rate..20.000.74.999.to.1layhid1        3.900005e-01
## econ_save_rate..75.000.or.more.to.1layhid1       5.297376e-01
## econ_save_rate.I.don.t.know.to.1layhid1         -6.499695e-02
## econ_save_rate.Prefer.not.to.say.to.1layhid1     2.383573e-01
## house_mortgage.Refused.to.1layhid1               3.388674e-01
## house_mortgage.Less.than..50.000.to.1layhid1     2.727791e-01
## house_mortgage..50.000.199.999.to.1layhid1       1.180487e-01
## house_mortgage..200.000.or.more.to.1layhid1      8.166761e-02
## house_mortgage.I.don.t.know.to.1layhid1         -8.557511e-02
## house_mortgage.Prefer.not.to.say.to.1layhid1     2.025904e-01
## age_8cat.25.34.to.1layhid1                       2.385232e-02
## age_8cat.35.44.to.1layhid1                      -2.894418e-02
## age_8cat.45.54.to.1layhid1                      -7.498905e-02
## age_8cat.55.61.to.1layhid1                      -4.188472e-02
## age_8cat.62.69.to.1layhid1                       2.316219e-01
## age_8cat.70.74.to.1layhid1                       2.918574e-01
## age_8cat.75..to.1layhid1                         3.577549e-01
## econ_hh_income..20.000.to..29.999.to.1layhid1   -7.205608e-02
## econ_hh_income..30.000.to..39.999.to.1layhid1    6.910612e-02
## econ_hh_income..40.000.to..49.999.to.1layhid1    2.190871e-01
## econ_hh_income..50.000.to..59.999.to.1layhid1    2.998847e-01
## econ_hh_income..60.000.to..74.999.to.1layhid1    2.397054e-01
## econ_hh_income..75.000.to..99.999.to.1layhid1    3.503936e-01
## econ_hh_income..100.000.to..149.999.to.1layhid1  3.992277e-01
## econ_hh_income..150.000.or.more.to.1layhid1      5.402518e-01
## Intercept.to.cfpb_score                          2.909699e-01
## 1layhid1.to.cfpb_score                           1.032146e+00

plot(nn2)

len <- length(names(test_data))

results2 <- compute(nn2, test_data[2:len])

predicted_score2 <- results2$net.result

t2 <- tibble(
    predict = results2$net.result,
    actual = test_data$cfpb_score
)
t2 <- t2 %>% mutate (diff = predict - actual)

summary(t2)

##      predict.V1          actual             diff.V1       
##  Min.   :0.2644693   Min.   :0.0000   Min.   :-0.5058503  
##  1st Qu.:0.4619308   1st Qu.:0.4321   1st Qu.:-0.0772526  
##  Median :0.5565810   Median :0.5432   Median : 0.0106831  
##  Mean   :0.5417271   Mean   :0.5362   Mean   : 0.0055526  
##  3rd Qu.:0.6157857   3rd Qu.:0.6420   3rd Qu.: 0.0913219  
##  Max.   :0.7879452   Max.   :1.0000   Max.   : 0.4542833

ggplot(data = t2,
       mapping = aes(x = predict, y = actual)) +
  geom_point(color = '#006EA1') +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +    
  labs(title = 'Neural Net - 5 hidden layers (cfpb data set)',
       x = 'Predicted',
       y = 'Actual')

ggplot(data = t2) +
  geom_boxplot(mapping = aes(x=diff)) +
  labs (title = "Std Error - Neural Net (cfpb data set)")

ggplot(t2,
    aes(diff, y = stat(density))) +
    geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
    geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
    scale_fill_brewer(palette="Spectral") +
    labs(title = "Std Error - Neural Net (cfpb data set)")

Federal Reserve System Survey

Download and Tidy dataset from FED

# get fed file
fed_df <- getFedFile()

#filter data
fed_df <- fed_df %>% drop_na()
fed_df <- fed_df %>% filter(!(econ_saving == "" |  credit_guess =="" | health ==""))
fed_df$credit_guess <- str_replace_all(fed_df$credit_guess, "[^[:alnum:]]", "")


fed_df$cfpb_score_4cat <- cut(fed_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
                           labels = c("< 40","40-60","60-80","80-100"),
                           right = FALSE,
                           include.lowest=TRUE) 

# reduce cfpb dataset
fed_df <- slice_sample(fed_df, weight_by=cfpb_score_4cat ,n=5000) 

fed_df <- fed_df %>% select(cfpb_score, age_7cat, econ_saving, econ_inc_4cat, econ_fin_ok, 
                            econ_pay_exp400, econ_skip_med)

summary(fed_df$cfpb_score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.09   70.00   90.00

# prepare data for analysis
concrete_norm <- fed_df
concrete_norm$age <- normalize(as.numeric(concrete_norm$age))
concrete_norm$cfpb_score <- normalize(as.numeric(concrete_norm$cfpb_score)) 


dmy <- dummyVars(" ~ .", data = concrete_norm, fullRank = T)
dat_transformed <- data.frame(predict(dmy, newdata = concrete_norm))
glimpse(dat_transformed)

## Rows: 5,000
## Columns: 26
## $ cfpb_score                      <dbl> 0.3802817, 0.7464789, 0.4788732, 0.577…
## $ age_7cat.25.34                  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ age_7cat.35.44                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ age_7cat.45.54                  <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ age_7cat.55.64                  <dbl> 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,…
## $ age_7cat.65.74                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ age_7cat.75.                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,…
## $ econ_saving..1.000.000.or.more  <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,…
## $ econ_saving..100.000....249.999 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving..250.000....499.999 <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ econ_saving..50.000....99.999   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,…
## $ econ_saving..500.000....999.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Not.sure            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Refused             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Under..50.000       <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,…
## $ econ_inc_4cat..25.000..49.999   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_inc_4cat..50.000..99.999   <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,…
## $ econ_inc_4cat.Less.than..25.000 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ econ_inc_4cat.Refused           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_fin_ok.Refused             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_fin_ok.Yes                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ econ_pay_exp400.Refused         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_pay_exp400.Yes             <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ econ_skip_med.Refused           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_skip_med.Yes               <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ age                             <dbl> 0.5000000, 0.5000000, 0.1666667, 0.666…

summary(dat_transformed$cfpb_score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.4366  0.5634  0.5788  0.7183  1.0000

# split data
split <- initial_split(dat_transformed, prop = .8, strata = cfpb_score)

train_data <- training(split) 
test_data <- testing(split)

Train Model

# create model
nn <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, err.fct="sse")
nn$result.matrix

##                                                      [,1]
## error                                        4.227016e+01
## reached.threshold                            8.030992e-03
## steps                                        1.341700e+04
## Intercept.to.1layhid1                       -1.289955e+00
## age_7cat.25.34.to.1layhid1                   5.354369e-02
## age_7cat.35.44.to.1layhid1                   3.039263e-02
## age_7cat.45.54.to.1layhid1                   6.163244e-02
## age_7cat.55.64.to.1layhid1                   2.259856e-01
## age_7cat.65.74.to.1layhid1                   4.370609e-01
## age_7cat.75..to.1layhid1                     5.417223e-01
## econ_saving..1.000.000.or.more.to.1layhid1  -6.097535e-01
## econ_saving..100.000....249.999.to.1layhid1 -8.942505e-01
## econ_saving..250.000....499.999.to.1layhid1 -8.558654e-01
## econ_saving..50.000....99.999.to.1layhid1   -8.995392e-01
## econ_saving..500.000....999.999.to.1layhid1 -7.616772e-01
## econ_saving.Not.sure.to.1layhid1            -8.764564e-01
## econ_saving.Refused.to.1layhid1             -7.783528e-01
## econ_saving.Under..50.000.to.1layhid1       -1.020066e+00
## econ_inc_4cat..25.000..49.999.to.1layhid1   -1.648763e-01
## econ_inc_4cat..50.000..99.999.to.1layhid1   -8.974232e-02
## econ_inc_4cat.Less.than..25.000.to.1layhid1 -1.647020e-01
## econ_inc_4cat.Refused.to.1layhid1           -2.452059e-02
## econ_fin_ok.Refused.to.1layhid1              9.213874e-01
## econ_fin_ok.Yes.to.1layhid1                  8.089211e-01
## econ_pay_exp400.Refused.to.1layhid1          2.035839e-01
## econ_pay_exp400.Yes.to.1layhid1              3.546085e-01
## econ_skip_med.Refused.to.1layhid1            8.317161e-02
## econ_skip_med.Yes.to.1layhid1               -4.738707e-01
## age.to.1layhid1                             -1.818677e-01
## Intercept.to.cfpb_score                      2.193678e-01
## 1layhid1.to.cfpb_score                       1.535307e+00

plot(nn)

Test Model

len <- length(names(train_data))

results <- compute(nn, test_data[2:len])

predicted_score <- results$net.result

t1 <- tibble(
    predict = results$net.result,
    actual = test_data$cfpb_score
)
t1 <- t1 %>% mutate (diff = predict - actual)

summary(t1)

##      predict.V1          actual             diff.V1       
##  Min.   :0.2936517   Min.   :0.0000   Min.   :-0.4806259  
##  1st Qu.:0.4833388   1st Qu.:0.4366   1st Qu.:-0.0813013  
##  Median :0.5961433   Median :0.5634   Median : 0.0087107  
##  Mean   :0.5780807   Mean   :0.5743   Mean   : 0.0038208  
##  3rd Qu.:0.6753420   3rd Qu.:0.7183   3rd Qu.: 0.0939826  
##  Max.   :0.8442590   Max.   :1.0000   Max.   : 0.4143745

ggplot(data = t1,
       mapping = aes(x = predict, y = actual)) +
  geom_point(color = '#006EA1') +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +    
  labs(title = 'Std Error - Neural Net (FED data set)',
       x = 'Predicted',
       y = 'Actual')

ggplot(data = t1) +
  geom_boxplot(mapping = aes(x=diff)) +
  labs (title = "Std Error - Neural Net (FED data set)")

ggplot(t1,
    aes(diff, y = stat(density))) +
    geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
    geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
    scale_fill_brewer(palette="Spectral") +
    labs(title = "Std Error - Neural Net (FED data set)")

Adding 5 Hidden layers to improve model performance.

nn2 <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, hidden = 5)
nn$result.matrix

##                                                      [,1]
## error                                        4.227016e+01
## reached.threshold                            8.030992e-03
## steps                                        1.341700e+04
## Intercept.to.1layhid1                       -1.289955e+00
## age_7cat.25.34.to.1layhid1                   5.354369e-02
## age_7cat.35.44.to.1layhid1                   3.039263e-02
## age_7cat.45.54.to.1layhid1                   6.163244e-02
## age_7cat.55.64.to.1layhid1                   2.259856e-01
## age_7cat.65.74.to.1layhid1                   4.370609e-01
## age_7cat.75..to.1layhid1                     5.417223e-01
## econ_saving..1.000.000.or.more.to.1layhid1  -6.097535e-01
## econ_saving..100.000....249.999.to.1layhid1 -8.942505e-01
## econ_saving..250.000....499.999.to.1layhid1 -8.558654e-01
## econ_saving..50.000....99.999.to.1layhid1   -8.995392e-01
## econ_saving..500.000....999.999.to.1layhid1 -7.616772e-01
## econ_saving.Not.sure.to.1layhid1            -8.764564e-01
## econ_saving.Refused.to.1layhid1             -7.783528e-01
## econ_saving.Under..50.000.to.1layhid1       -1.020066e+00
## econ_inc_4cat..25.000..49.999.to.1layhid1   -1.648763e-01
## econ_inc_4cat..50.000..99.999.to.1layhid1   -8.974232e-02
## econ_inc_4cat.Less.than..25.000.to.1layhid1 -1.647020e-01
## econ_inc_4cat.Refused.to.1layhid1           -2.452059e-02
## econ_fin_ok.Refused.to.1layhid1              9.213874e-01
## econ_fin_ok.Yes.to.1layhid1                  8.089211e-01
## econ_pay_exp400.Refused.to.1layhid1          2.035839e-01
## econ_pay_exp400.Yes.to.1layhid1              3.546085e-01
## econ_skip_med.Refused.to.1layhid1            8.317161e-02
## econ_skip_med.Yes.to.1layhid1               -4.738707e-01
## age.to.1layhid1                             -1.818677e-01
## Intercept.to.cfpb_score                      2.193678e-01
## 1layhid1.to.cfpb_score                       1.535307e+00

plot(nn2)

len <- length(names(train_data))

results2 <- compute(nn2, test_data[2:len])

predicted_score2 <- results2$net.result

t2 <- tibble(
    predict = results2$net.result,
    actual = test_data$cfpb_score
)
t2 <- t2 %>% mutate (  
    diff = actual - predict,
    dev = (actual - predict) / actual
    )
#accuracy_res = 1-abs(mean(t2$dev))
summary(t2)

##      predict.V1          actual             diff.V1               dev.V1       
##  Min.   :0.1600280   Min.   :0.0000   Min.   :-0.4152538   Min.   :      -Inf  
##  1st Qu.:0.4765999   1st Qu.:0.4366   1st Qu.:-0.0992048   1st Qu.:-0.2133509  
##  Median :0.5954335   Median :0.5634   Median :-0.0111548   Median :-0.0188526  
##  Mean   :0.5754922   Mean   :0.5743   Mean   :-0.0012323   Mean   :      -Inf  
##  3rd Qu.:0.6802764   3rd Qu.:0.7183   3rd Qu.: 0.1026609   3rd Qu.: 0.1525124  
##  Max.   :0.8865640   Max.   :1.0000   Max.   : 0.5121628   Max.   : 0.6861048

ggplot(data = t2,
       mapping = aes(x = predict, y = actual)) +
  geom_point(color = '#006EA1') +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +    
  labs(title = 'Neural Net 5 layers (FED data set) ',
       x = 'Predicted',
       y = 'Actual')

ggplot(data = t2) +
  geom_boxplot(mapping = aes(x=diff)) +
  labs (title = "Std Error - Neural Net (FED data set")

ggplot(t2,
    aes(diff, y = stat(density))) +
    geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
    geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
    scale_fill_brewer(palette="Spectral") +
    labs(title = "Std Error - Neural Net (FED data set)")

Conclusion

Developing the neural net was an interesting exercise however it might not be the most appropriate modeling approach for the project.

DATA 607 Final Project: Neural Network

David Simbandumwe

Overview

Setup

Consumer Financial Protection Bureau Survey

Download and Tidy dataset from CFPB

Train Model

Test Model

Adding 5 Hidden layers to improve model performance.

Federal Reserve System Survey

Download and Tidy dataset from FED

Train Model

Test Model

Adding 5 Hidden layers to improve model performance.

Conclusion