Overview
Applying a neural network to the survey data using the same variables from the linear regression models.
Setup
set.seed(1234)
rm(list=ls())Consumer Financial Protection Bureau Survey
Download and Tidy dataset from CFPB
# get cfpb file
cfpb_df <- getCFPBFile()## The following `from` values were not present in `x`: -1, 1, 2, 3, 4, 5, 6, 7, 98, 99
cfpb_df$cfpb_score_4cat <- cut(cfpb_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
labels = c("< 40","40-60","60-80","80-100"),
right = FALSE,
include.lowest=TRUE)
cfpb_df <- cfpb_df %>% filter(cfpb_score >= 0)
# reduce cfpb data set
cfpb_df <- slice_sample(cfpb_df, weight_by=cfpb_score_4cat ,n=5000)
cfpb_df <- cfpb_df %>% select(cfpb_score, econ_save_rate, house_mortgage, age_8cat, econ_hh_income)
summary(cfpb_df$cfpb_score)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 49.00 58.00 57.81 66.00 95.00
# prepare data for analysis
concrete_norm <- cfpb_df
concrete_norm$cfpb_score <- normalize(concrete_norm$cfpb_score)
dmy <- dummyVars(" ~ .", data = concrete_norm, fullRank = T)
dat_transformed <- data.frame(predict(dmy, newdata = concrete_norm))
glimpse(dat_transformed)## Rows: 5,000
## Columns: 31
## $ cfpb_score <dbl> 0.6296296, 0.5555556, 0.3827160, 0…
## $ econ_save_rate.0 <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..1.99 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..100.999 <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ econ_save_rate..1.000.4.999 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1…
## $ econ_save_rate..5.000.19.999 <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ econ_save_rate..20.000.74.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate..75.000.or.more <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0…
## $ econ_save_rate.I.don.t.know <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_save_rate.Prefer.not.to.say <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ house_mortgage.Refused <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ house_mortgage.Less.than..50.000 <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0…
## $ house_mortgage..50.000.199.999 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage..200.000.or.more <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage.I.don.t.know <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ house_mortgage.Prefer.not.to.say <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ age_8cat.25.34 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ age_8cat.35.44 <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0…
## $ age_8cat.45.54 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ age_8cat.55.61 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ age_8cat.62.69 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ age_8cat.70.74 <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ age_8cat.75. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ econ_hh_income..20.000.to..29.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ econ_hh_income..30.000.to..39.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..40.000.to..49.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1…
## $ econ_hh_income..50.000.to..59.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..60.000.to..74.999 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..75.000.to..99.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ econ_hh_income..100.000.to..149.999 <dbl> 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0…
## $ econ_hh_income..150.000.or.more <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
summary(concrete_norm$cfpb_score)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.4321 0.5432 0.5408 0.6420 1.0000
# split data
split <- initial_split(dat_transformed, prop = .8, strata = cfpb_score)
train_data <- training(split)
test_data <- testing(split)Train Model
# create model
nn <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, err.fct="sse")
nn$result.matrix## [,1]
## error 3.213177e+01
## reached.threshold 9.841321e-03
## steps 6.790000e+03
## Intercept.to.1layhid1 -1.743017e+00
## econ_save_rate.0.to.1layhid1 -1.178150e+00
## econ_save_rate..1.99.to.1layhid1 -1.183444e+00
## econ_save_rate..100.999.to.1layhid1 -5.575820e-01
## econ_save_rate..1.000.4.999.to.1layhid1 -1.158569e-01
## econ_save_rate..5.000.19.999.to.1layhid1 2.212034e-01
## econ_save_rate..20.000.74.999.to.1layhid1 3.900005e-01
## econ_save_rate..75.000.or.more.to.1layhid1 5.297376e-01
## econ_save_rate.I.don.t.know.to.1layhid1 -6.499695e-02
## econ_save_rate.Prefer.not.to.say.to.1layhid1 2.383573e-01
## house_mortgage.Refused.to.1layhid1 3.388674e-01
## house_mortgage.Less.than..50.000.to.1layhid1 2.727791e-01
## house_mortgage..50.000.199.999.to.1layhid1 1.180487e-01
## house_mortgage..200.000.or.more.to.1layhid1 8.166761e-02
## house_mortgage.I.don.t.know.to.1layhid1 -8.557511e-02
## house_mortgage.Prefer.not.to.say.to.1layhid1 2.025904e-01
## age_8cat.25.34.to.1layhid1 2.385232e-02
## age_8cat.35.44.to.1layhid1 -2.894418e-02
## age_8cat.45.54.to.1layhid1 -7.498905e-02
## age_8cat.55.61.to.1layhid1 -4.188472e-02
## age_8cat.62.69.to.1layhid1 2.316219e-01
## age_8cat.70.74.to.1layhid1 2.918574e-01
## age_8cat.75..to.1layhid1 3.577549e-01
## econ_hh_income..20.000.to..29.999.to.1layhid1 -7.205608e-02
## econ_hh_income..30.000.to..39.999.to.1layhid1 6.910612e-02
## econ_hh_income..40.000.to..49.999.to.1layhid1 2.190871e-01
## econ_hh_income..50.000.to..59.999.to.1layhid1 2.998847e-01
## econ_hh_income..60.000.to..74.999.to.1layhid1 2.397054e-01
## econ_hh_income..75.000.to..99.999.to.1layhid1 3.503936e-01
## econ_hh_income..100.000.to..149.999.to.1layhid1 3.992277e-01
## econ_hh_income..150.000.or.more.to.1layhid1 5.402518e-01
## Intercept.to.cfpb_score 2.909699e-01
## 1layhid1.to.cfpb_score 1.032146e+00
plot(nn)Test Model
len <- length(names(test_data))
results <- compute(nn, test_data[2:len])
predicted_score <- results$net.result1 <- tibble(
predict = results$net.result,
actual = test_data$cfpb_score
)
t1 <- t1 %>% mutate (diff = predict - actual)
summary(t1)## predict.V1 actual diff.V1
## Min. :0.3368361 Min. :0.0000 Min. :-0.4453194
## 1st Qu.:0.4624328 1st Qu.:0.4321 1st Qu.:-0.0738248
## Median :0.5489829 Median :0.5432 Median : 0.0111306
## Mean :0.5420495 Mean :0.5362 Mean : 0.0058749
## 3rd Qu.:0.6200608 3rd Qu.:0.6420 3rd Qu.: 0.0897470
## Max. :0.7960797 Max. :1.0000 Max. : 0.3953917
ggplot(data = t1,
mapping = aes(x = predict, y = actual)) +
geom_point(color = '#006EA1') +
geom_abline(intercept = 0, slope = 1, color = 'orange') +
labs(title = 'Neural Net (cfpb data set) ',
x = 'Predicted',
y = 'Actual')ggplot(data = t1) +
geom_boxplot(mapping = aes(x=diff)) +
labs (title = "Std Error - Neural Net (cfpb data set)")ggplot(t1,
aes(diff, y = stat(density))) +
geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
scale_fill_brewer(palette="Spectral") +
labs(title = "Std Error - Neural Net (cfpb data set)")Federal Reserve System Survey
Download and Tidy dataset from FED
# get fed file
fed_df <- getFedFile()
#filter data
fed_df <- fed_df %>% drop_na()
fed_df <- fed_df %>% filter(!(econ_saving == "" | credit_guess =="" | health ==""))
fed_df$credit_guess <- str_replace_all(fed_df$credit_guess, "[^[:alnum:]]", "")
fed_df$cfpb_score_4cat <- cut(fed_df$cfpb_score, breaks = c(-10, 40, 60, 80, 100),
labels = c("< 40","40-60","60-80","80-100"),
right = FALSE,
include.lowest=TRUE)
# reduce cfpb dataset
fed_df <- slice_sample(fed_df, weight_by=cfpb_score_4cat ,n=5000)
fed_df <- fed_df %>% select(cfpb_score, age_7cat, econ_saving, econ_inc_4cat, econ_fin_ok,
econ_pay_exp400, econ_skip_med)
summary(fed_df$cfpb_score)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 50.00 59.00 60.09 70.00 90.00
# prepare data for analysis
concrete_norm <- fed_df
concrete_norm$age <- normalize(as.numeric(concrete_norm$age))
concrete_norm$cfpb_score <- normalize(as.numeric(concrete_norm$cfpb_score))
dmy <- dummyVars(" ~ .", data = concrete_norm, fullRank = T)
dat_transformed <- data.frame(predict(dmy, newdata = concrete_norm))
glimpse(dat_transformed)## Rows: 5,000
## Columns: 26
## $ cfpb_score <dbl> 0.3802817, 0.7464789, 0.4788732, 0.577…
## $ age_7cat.25.34 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ age_7cat.35.44 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ age_7cat.45.54 <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ age_7cat.55.64 <dbl> 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,…
## $ age_7cat.65.74 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ age_7cat.75. <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,…
## $ econ_saving..1.000.000.or.more <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,…
## $ econ_saving..100.000....249.999 <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving..250.000....499.999 <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ econ_saving..50.000....99.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,…
## $ econ_saving..500.000....999.999 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Not.sure <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Refused <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_saving.Under..50.000 <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,…
## $ econ_inc_4cat..25.000..49.999 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_inc_4cat..50.000..99.999 <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,…
## $ econ_inc_4cat.Less.than..25.000 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ econ_inc_4cat.Refused <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_fin_ok.Refused <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_fin_ok.Yes <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ econ_pay_exp400.Refused <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_pay_exp400.Yes <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ econ_skip_med.Refused <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ econ_skip_med.Yes <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ age <dbl> 0.5000000, 0.5000000, 0.1666667, 0.666…
summary(dat_transformed$cfpb_score)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.4366 0.5634 0.5788 0.7183 1.0000
# split data
split <- initial_split(dat_transformed, prop = .8, strata = cfpb_score)
train_data <- training(split)
test_data <- testing(split)Train Model
# create model
nn <- neuralnet(cfpb_score ~ ., data = train_data, rep=1 , stepmax=1e5, err.fct="sse")
nn$result.matrix## [,1]
## error 4.227016e+01
## reached.threshold 8.030992e-03
## steps 1.341700e+04
## Intercept.to.1layhid1 -1.289955e+00
## age_7cat.25.34.to.1layhid1 5.354369e-02
## age_7cat.35.44.to.1layhid1 3.039263e-02
## age_7cat.45.54.to.1layhid1 6.163244e-02
## age_7cat.55.64.to.1layhid1 2.259856e-01
## age_7cat.65.74.to.1layhid1 4.370609e-01
## age_7cat.75..to.1layhid1 5.417223e-01
## econ_saving..1.000.000.or.more.to.1layhid1 -6.097535e-01
## econ_saving..100.000....249.999.to.1layhid1 -8.942505e-01
## econ_saving..250.000....499.999.to.1layhid1 -8.558654e-01
## econ_saving..50.000....99.999.to.1layhid1 -8.995392e-01
## econ_saving..500.000....999.999.to.1layhid1 -7.616772e-01
## econ_saving.Not.sure.to.1layhid1 -8.764564e-01
## econ_saving.Refused.to.1layhid1 -7.783528e-01
## econ_saving.Under..50.000.to.1layhid1 -1.020066e+00
## econ_inc_4cat..25.000..49.999.to.1layhid1 -1.648763e-01
## econ_inc_4cat..50.000..99.999.to.1layhid1 -8.974232e-02
## econ_inc_4cat.Less.than..25.000.to.1layhid1 -1.647020e-01
## econ_inc_4cat.Refused.to.1layhid1 -2.452059e-02
## econ_fin_ok.Refused.to.1layhid1 9.213874e-01
## econ_fin_ok.Yes.to.1layhid1 8.089211e-01
## econ_pay_exp400.Refused.to.1layhid1 2.035839e-01
## econ_pay_exp400.Yes.to.1layhid1 3.546085e-01
## econ_skip_med.Refused.to.1layhid1 8.317161e-02
## econ_skip_med.Yes.to.1layhid1 -4.738707e-01
## age.to.1layhid1 -1.818677e-01
## Intercept.to.cfpb_score 2.193678e-01
## 1layhid1.to.cfpb_score 1.535307e+00
plot(nn)Test Model
len <- length(names(train_data))
results <- compute(nn, test_data[2:len])
predicted_score <- results$net.resultt1 <- tibble(
predict = results$net.result,
actual = test_data$cfpb_score
)
t1 <- t1 %>% mutate (diff = predict - actual)
summary(t1)## predict.V1 actual diff.V1
## Min. :0.2936517 Min. :0.0000 Min. :-0.4806259
## 1st Qu.:0.4833388 1st Qu.:0.4366 1st Qu.:-0.0813013
## Median :0.5961433 Median :0.5634 Median : 0.0087107
## Mean :0.5780807 Mean :0.5743 Mean : 0.0038208
## 3rd Qu.:0.6753420 3rd Qu.:0.7183 3rd Qu.: 0.0939826
## Max. :0.8442590 Max. :1.0000 Max. : 0.4143745
ggplot(data = t1,
mapping = aes(x = predict, y = actual)) +
geom_point(color = '#006EA1') +
geom_abline(intercept = 0, slope = 1, color = 'orange') +
labs(title = 'Std Error - Neural Net (FED data set)',
x = 'Predicted',
y = 'Actual')ggplot(data = t1) +
geom_boxplot(mapping = aes(x=diff)) +
labs (title = "Std Error - Neural Net (FED data set)")ggplot(t1,
aes(diff, y = stat(density))) +
geom_histogram(binwidth = .02, alpha = 0.7, bins = 100, color="white",size = 0.1) +
geom_vline(aes(xintercept = mean(diff)), linetype = "dashed", size = 0.8, color="red") +
scale_fill_brewer(palette="Spectral") +
labs(title = "Std Error - Neural Net (FED data set)")Conclusion
Developing the neural net was an interesting exercise however it might not be the most appropriate modeling approach for the project.