library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read_csv("nfcs.csv")
## Rows: 25539 Columns: 133
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (133): NFCSID, STATEQ, CENSUSDIV, CENSUSREG, A50A, A3Ar_w, A50B, A4A_new...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
dim(df)
## [1] 25539 133
Getting only the important columns
finance_df <- df %>%
select(
# demographics
A50A, # gender
A3Ar_w, # age group
A4A_new_w, # ethnicity
A5_2015, # education
A7A, # maritial status
A8_2021, # household income
# financial behavior / outcomes
J3, # spending habits
J4, # monthly expenses
B14A_1, # investments?
EA_1, # own home?
F2_2, # credit card debt
F2_3, # paying only monthly minimums
F2_4, # cc late fees
G23, # "i have too much debt"
# financial literacy questions
M6,
M7,
M8,
M31,
M50,
M9,
M10,
)
Renaming columns
finance_df <- finance_df %>%
rename(
gender = A50A,
age_group = A3Ar_w,
ethnicity = A4A_new_w,
education = A5_2015,
marital_status = A7A,
income = A8_2021,
spending_behavior = J3,
financial_difficulty = J4,
investments = B14A_1,
homeowner = EA_1,
carry_balance = F2_2,
minimum_payment = F2_3,
late_fee = F2_4,
too_much_debt = G23
)
Removing “Don’t know” and “Prefer not to say” values
finance_df[finance_df == 98 | finance_df == 99] <- NA
finance_df <- na.omit(finance_df)
CREATE A LITERACY SCORE! (sum of correct answer = score)
finance_df <- finance_df %>%
mutate(
literacy_score =
(M6 == 1) +
(M7 == 3) +
(M8 == 2) +
(M31 == 2) +
(M9 == 1) +
(M10 == 2) +
(M50 == 3)
)
PART 1: USING “financial_difficulty” as the target variable
1.1: create target variable, financial fragile
Making financial_difficulty the target variable. there are 3 possible values. In a typical month, how difficult is it for you to cover your expenses and pay all your bills? 1: very difficult 2: somewhat difficult 3: not at all difficult
Turning this into a binary score so answers 1 & 2 are both considered as financial hardship. Now 0 = no financial difficulty 1 = financial difficulty
fragility_df <- finance_df %>%
mutate(
fragile = ifelse(financial_difficulty %in% c(1,2), 1, 0),
fragile = as.factor(fragile)
)
1.2 LOGISTIC REGRESSION
Model:
fragility_model <- glm(
fragile ~ literacy_score +
gender +
age_group +
ethnicity +
education +
income +
marital_status +
spending_behavior,
data = fragility_df,
family = binomial
)
summary(fragility_model)
##
## Call:
## glm(formula = fragile ~ literacy_score + gender + age_group +
## ethnicity + education + income + marital_status + spending_behavior,
## family = binomial, data = fragility_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.60850 0.25738 6.249 4.12e-10 ***
## literacy_score -0.10276 0.02534 -4.056 4.99e-05 ***
## gender 0.31617 0.06314 5.008 5.51e-07 ***
## age_group -0.28173 0.02109 -13.357 < 2e-16 ***
## ethnicity 0.14719 0.07334 2.007 0.0448 *
## education -0.08328 0.02129 -3.913 9.13e-05 ***
## income -0.25255 0.01844 -13.697 < 2e-16 ***
## marital_status -0.06973 0.03633 -1.919 0.0549 .
## spending_behavior 0.53917 0.03494 15.433 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7685.7 on 5733 degrees of freedom
## Residual deviance: 6546.5 on 5725 degrees of freedom
## AIC: 6564.5
##
## Number of Fisher Scoring iterations: 3
Log odds and CI
exp(cbind(
Odds_Ratio = coef(fragility_model),
confint(fragility_model)
))
## Waiting for profiling to be done...
## Odds_Ratio 2.5 % 97.5 %
## (Intercept) 4.9953309 3.0207264 8.2865069
## literacy_score 0.9023401 0.8586017 0.9482721
## gender 1.3718697 1.2121510 1.5526051
## age_group 0.7544747 0.7238149 0.7862164
## ethnicity 1.1585762 1.0031725 1.3373822
## education 0.9200940 0.8825046 0.9593133
## income 0.7768195 0.7491047 0.8052630
## marital_status 0.9326415 0.8683515 1.0012806
## spending_behavior 1.7145778 1.6013545 1.8364222
1.3 RANDOM FOREST
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
Test / Train Split
set.seed(1234)
trainIndex <- createDataPartition(
fragility_df$fragile,
p = 0.8,
list = FALSE
)
train_fragile <- fragility_df[trainIndex, ]
test_fragile <- fragility_df[-trainIndex, ]
Model
rf_fragile <- randomForest(
fragile ~ literacy_score +
gender +
age_group +
ethnicity +
education +
income +
marital_status +
spending_behavior,
data = train_fragile,
importance = TRUE,
ntree = 500
)
rf_fragile
##
## Call:
## randomForest(formula = fragile ~ literacy_score + gender + age_group + ethnicity + education + income + marital_status + spending_behavior, data = train_fragile, importance = TRUE, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 26.37%
## Confusion matrix:
## 0 1 class.error
## 0 2320 464 0.1666667
## 1 746 1058 0.4135255
varImpPlot(rf_fragile)
Predictions
rf_pred_fragile <- predict(
rf_fragile,
test_fragile
)
Confusion Matrix
conf_rf_fragile <- confusionMatrix(
rf_pred_fragile,
test_fragile$fragile
)
conf_rf_fragile
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 579 159
## 1 116 292
##
## Accuracy : 0.76
## 95% CI : (0.7342, 0.7845)
## No Information Rate : 0.6065
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.4887
##
## Mcnemar's Test P-Value : 0.01132
##
## Sensitivity : 0.8331
## Specificity : 0.6475
## Pos Pred Value : 0.7846
## Neg Pred Value : 0.7157
## Prevalence : 0.6065
## Detection Rate : 0.5052
## Detection Prevalence : 0.6440
## Balanced Accuracy : 0.7403
##
## 'Positive' Class : 0
##
1.4 COMPARING MODEL ACCURACY
Logistic Regression
log_prob_fragile <- predict(fragility_model, test_fragile, type = "response")
log_pred_fragile <- ifelse(log_prob_fragile > 0.5, 1, 0)
log_pred_fragile <- as.factor(log_pred_fragile)
# confusion matrix
conf_log <- confusionMatrix(log_pred_fragile, test_fragile$fragile)
conf_log
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 575 219
## 1 120 232
##
## Accuracy : 0.7042
## 95% CI : (0.6768, 0.7305)
## No Information Rate : 0.6065
## P-Value [Acc > NIR] : 3.316e-12
##
## Kappa : 0.3554
##
## Mcnemar's Test P-Value : 1.023e-07
##
## Sensitivity : 0.8273
## Specificity : 0.5144
## Pos Pred Value : 0.7242
## Neg Pred Value : 0.6591
## Prevalence : 0.6065
## Detection Rate : 0.5017
## Detection Prevalence : 0.6928
## Balanced Accuracy : 0.6709
##
## 'Positive' Class : 0
##
Random forest predictions
rf_pred <- predict(rf_fragile, test_fragile)
# confusion matrix
conf_rf <- confusionMatrix(rf_pred_fragile, test_fragile$fragile)
conf_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 579 159
## 1 116 292
##
## Accuracy : 0.76
## 95% CI : (0.7342, 0.7845)
## No Information Rate : 0.6065
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.4887
##
## Mcnemar's Test P-Value : 0.01132
##
## Sensitivity : 0.8331
## Specificity : 0.6475
## Pos Pred Value : 0.7846
## Neg Pred Value : 0.7157
## Prevalence : 0.6065
## Detection Rate : 0.5052
## Detection Prevalence : 0.6440
## Balanced Accuracy : 0.7403
##
## 'Positive' Class : 0
##
Comparing models
log_acc <- conf_log$overall["Accuracy"]
rf_acc <- conf_rf$overall["Accuracy"]
log_acc
## Accuracy
## 0.7041885
rf_acc
## Accuracy
## 0.7600349
PART 2: DEBT STRESS MODEL
2.1: Creating a debt stress score using my own target variable “debt_stress_score”. Uses the scores from carry_balance, minimum_payment, late_fee, too_much_debt, financial_difficulty
debt_df <- finance_df %>%
mutate(
debt_stress_score =
(carry_balance == 1) +
(minimum_payment == 1) +
(late_fee == 1) +
(too_much_debt >= 5) +
(financial_difficulty %in% c(1,2))
)
Converting into binary target
debt_df <- debt_df %>%
mutate(
debt_stress = ifelse(debt_stress_score >= 2, 1, 0),
debt_stress = as.factor(debt_stress)
)
2.2: LOGISTIC REGRESSION FOR DEBT STRESS
Clean the model to remove leakage variables
debt_model <- glm(
debt_stress ~ literacy_score +
gender +
age_group +
ethnicity +
education +
income +
marital_status +
spending_behavior +
investments +
homeowner,
data = debt_df,
family = binomial
)
summary(debt_model)
##
## Call:
## glm(formula = debt_stress ~ literacy_score + gender + age_group +
## ethnicity + education + income + marital_status + spending_behavior +
## investments + homeowner, family = binomial, data = debt_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.05577 0.30472 -0.183 0.854778
## literacy_score -0.20525 0.02587 -7.934 2.12e-15 ***
## gender 0.16189 0.06418 2.522 0.011653 *
## age_group -0.28610 0.02177 -13.143 < 2e-16 ***
## ethnicity 0.17074 0.07453 2.291 0.021974 *
## education -0.08228 0.02176 -3.782 0.000156 ***
## income -0.09524 0.01878 -5.072 3.94e-07 ***
## marital_status -0.08343 0.03781 -2.206 0.027368 *
## spending_behavior 0.55597 0.03535 15.729 < 2e-16 ***
## investments 0.70734 0.06440 10.984 < 2e-16 ***
## homeowner 0.48614 0.07884 6.166 7.01e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7826.7 on 5733 degrees of freedom
## Residual deviance: 6468.6 on 5723 degrees of freedom
## AIC: 6490.6
##
## Number of Fisher Scoring iterations: 3
Log odds and CI
exp(cbind(
Odds_Ratio = coef(debt_model),
confint(debt_model)
))
## Waiting for profiling to be done...
## Odds_Ratio 2.5 % 97.5 %
## (Intercept) 0.9457559 0.5206532 1.7195404
## literacy_score 0.8144398 0.7740737 0.8567092
## gender 1.1757251 1.0366755 1.3332597
## age_group 0.7511841 0.7197031 0.7838265
## ethnicity 1.1861867 1.0247673 1.3725785
## education 0.9210096 0.8825507 0.9611456
## income 0.9091591 0.8762384 0.9431824
## marital_status 0.9199593 0.8540554 0.9905404
## spending_behavior 1.7436301 1.6272055 1.8690707
## investments 2.0285981 1.7881862 2.3017315
## homeowner 1.6260236 1.3933035 1.8980036
2.3: RANDOM FOREST FOR DEBT STRESS
library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
set.seed(1234)
trainIndex2 <- createDataPartition(
debt_df$debt_stress,
p = 0.8,
list = FALSE
)
train_debt <- debt_df[trainIndex2, ]
test_debt <- debt_df[-trainIndex2, ]
rf_debt <- randomForest(
debt_stress ~ literacy_score +
gender +
age_group +
ethnicity +
education +
income +
marital_status +
spending_behavior +
investments +
homeowner,
data = train_debt,
importance = TRUE,
ntree = 500
)
rf_debt
##
## Call:
## randomForest(formula = debt_stress ~ literacy_score + gender + age_group + ethnicity + education + income + marital_status + spending_behavior + investments + homeowner, data = train_debt, importance = TRUE, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 27.33%
## Confusion matrix:
## 0 1 class.error
## 0 2049 579 0.2203196
## 1 675 1285 0.3443878
varImpPlot(rf_debt)
2.4: COMPARING MODEL ACCURACY
log_prob_debt <- predict(debt_model, test_debt, type = "response")
log_pred_debt <- ifelse(log_prob_debt > 0.5, 1, 0)
log_pred_debt <- as.factor(log_pred_debt)
# confusion matrix
conf_log_debt <- confusionMatrix(log_pred_debt,test_debt$debt_stress)
conf_log_debt
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 526 198
## 1 131 291
##
## Accuracy : 0.7129
## 95% CI : (0.6858, 0.739)
## No Information Rate : 0.5733
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4028
##
## Mcnemar's Test P-Value : 0.000274
##
## Sensitivity : 0.8006
## Specificity : 0.5951
## Pos Pred Value : 0.7265
## Neg Pred Value : 0.6896
## Prevalence : 0.5733
## Detection Rate : 0.4590
## Detection Prevalence : 0.6318
## Balanced Accuracy : 0.6979
##
## 'Positive' Class : 0
##
Random Forest predictions
rf_pred_debt <- predict(rf_debt,test_debt)
# confusion matrix
conf_rf_debt <- confusionMatrix(rf_pred_debt,test_debt$debt_stress)
conf_rf_debt
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 507 167
## 1 150 322
##
## Accuracy : 0.7234
## 95% CI : (0.6965, 0.7491)
## No Information Rate : 0.5733
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4321
##
## Mcnemar's Test P-Value : 0.3688
##
## Sensitivity : 0.7717
## Specificity : 0.6585
## Pos Pred Value : 0.7522
## Neg Pred Value : 0.6822
## Prevalence : 0.5733
## Detection Rate : 0.4424
## Detection Prevalence : 0.5881
## Balanced Accuracy : 0.7151
##
## 'Positive' Class : 0
##
Compare accuracies
log_acc_debt <- conf_log_debt$overall["Accuracy"]
rf_acc_debt <- conf_rf_debt$overall["Accuracy"]
log_acc_debt
## Accuracy
## 0.7129145
rf_acc_debt
## Accuracy
## 0.7233857