library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read_csv("nfcs.csv")
## Rows: 25539 Columns: 133
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (133): NFCSID, STATEQ, CENSUSDIV, CENSUSREG, A50A, A3Ar_w, A50B, A4A_new...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
dim(df)
## [1] 25539   133

Getting only the important columns

finance_df <- df %>%
  select(
    # demographics
    A50A, # gender
    A3Ar_w, # age group
    A4A_new_w, # ethnicity
    A5_2015, # education
    A7A, # maritial status
    A8_2021, # household income

    # financial behavior / outcomes
    J3, # spending habits
    J4, # monthly expenses 
    B14A_1, # investments?
    EA_1, # own home?
    F2_2, # credit card debt
    F2_3, # paying only monthly minimums
    F2_4, # cc late fees
    G23, # "i have too much debt"

    # financial literacy questions
    M6,
    M7,
    M8,
    M31,
    M50,
    M9,
    M10,
  )

Renaming columns

finance_df <- finance_df %>%
  rename(
    gender = A50A,
    age_group = A3Ar_w,
    ethnicity = A4A_new_w,
    education = A5_2015,
    marital_status = A7A,
    income = A8_2021,

    spending_behavior = J3,
    financial_difficulty = J4,
    investments = B14A_1,
    homeowner = EA_1,

    carry_balance = F2_2,
    minimum_payment = F2_3,
    late_fee = F2_4,
    too_much_debt = G23
  )

Removing “Don’t know” and “Prefer not to say” values

finance_df[finance_df == 98 | finance_df == 99] <- NA
finance_df <- na.omit(finance_df)

CREATE A LITERACY SCORE! (sum of correct answer = score)

finance_df <- finance_df %>%
  mutate(
    literacy_score =
      (M6 == 1) +
      (M7 == 3) +
      (M8 == 2) +
      (M31 == 2) +
      (M9 == 1) +
      (M10 == 2) +
      (M50 == 3)
  )

PART 1: USING “financial_difficulty” as the target variable

1.1: create target variable, financial fragile

Making financial_difficulty the target variable. there are 3 possible values. In a typical month, how difficult is it for you to cover your expenses and pay all your bills? 1: very difficult 2: somewhat difficult 3: not at all difficult

Turning this into a binary score so answers 1 & 2 are both considered as financial hardship. Now 0 = no financial difficulty 1 = financial difficulty

fragility_df <- finance_df %>%
  mutate(
    fragile = ifelse(financial_difficulty %in% c(1,2), 1, 0),
    fragile = as.factor(fragile)
  )

1.2 LOGISTIC REGRESSION

Model:

fragility_model <- glm(
  fragile ~ literacy_score +
             gender +
             age_group +
             ethnicity +
             education +
             income +
             marital_status +
             spending_behavior,
  data = fragility_df,
  family = binomial
)
summary(fragility_model)
## 
## Call:
## glm(formula = fragile ~ literacy_score + gender + age_group + 
##     ethnicity + education + income + marital_status + spending_behavior, 
##     family = binomial, data = fragility_df)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.60850    0.25738   6.249 4.12e-10 ***
## literacy_score    -0.10276    0.02534  -4.056 4.99e-05 ***
## gender             0.31617    0.06314   5.008 5.51e-07 ***
## age_group         -0.28173    0.02109 -13.357  < 2e-16 ***
## ethnicity          0.14719    0.07334   2.007   0.0448 *  
## education         -0.08328    0.02129  -3.913 9.13e-05 ***
## income            -0.25255    0.01844 -13.697  < 2e-16 ***
## marital_status    -0.06973    0.03633  -1.919   0.0549 .  
## spending_behavior  0.53917    0.03494  15.433  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7685.7  on 5733  degrees of freedom
## Residual deviance: 6546.5  on 5725  degrees of freedom
## AIC: 6564.5
## 
## Number of Fisher Scoring iterations: 3

Log odds and CI

exp(cbind(
  Odds_Ratio = coef(fragility_model),
  confint(fragility_model)
))
## Waiting for profiling to be done...
##                   Odds_Ratio     2.5 %    97.5 %
## (Intercept)        4.9953309 3.0207264 8.2865069
## literacy_score     0.9023401 0.8586017 0.9482721
## gender             1.3718697 1.2121510 1.5526051
## age_group          0.7544747 0.7238149 0.7862164
## ethnicity          1.1585762 1.0031725 1.3373822
## education          0.9200940 0.8825046 0.9593133
## income             0.7768195 0.7491047 0.8052630
## marital_status     0.9326415 0.8683515 1.0012806
## spending_behavior  1.7145778 1.6013545 1.8364222

1.3 RANDOM FOREST

library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Test / Train Split

set.seed(1234)

trainIndex <- createDataPartition(
  fragility_df$fragile,
  p = 0.8,
  list = FALSE
)

train_fragile <- fragility_df[trainIndex, ]
test_fragile  <- fragility_df[-trainIndex, ]

Model

rf_fragile <- randomForest(
  fragile ~ literacy_score +
             gender +
             age_group +
             ethnicity +
             education +
             income +
             marital_status +
             spending_behavior,
  data = train_fragile,
  importance = TRUE,
  ntree = 500
)
rf_fragile
## 
## Call:
##  randomForest(formula = fragile ~ literacy_score + gender + age_group +      ethnicity + education + income + marital_status + spending_behavior,      data = train_fragile, importance = TRUE, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 26.37%
## Confusion matrix:
##      0    1 class.error
## 0 2320  464   0.1666667
## 1  746 1058   0.4135255
varImpPlot(rf_fragile)

Predictions

rf_pred_fragile <- predict(
  rf_fragile,
  test_fragile
)

Confusion Matrix

conf_rf_fragile <- confusionMatrix(
  rf_pred_fragile,
  test_fragile$fragile
)

conf_rf_fragile
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 579 159
##          1 116 292
##                                           
##                Accuracy : 0.76            
##                  95% CI : (0.7342, 0.7845)
##     No Information Rate : 0.6065          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.4887          
##                                           
##  Mcnemar's Test P-Value : 0.01132         
##                                           
##             Sensitivity : 0.8331          
##             Specificity : 0.6475          
##          Pos Pred Value : 0.7846          
##          Neg Pred Value : 0.7157          
##              Prevalence : 0.6065          
##          Detection Rate : 0.5052          
##    Detection Prevalence : 0.6440          
##       Balanced Accuracy : 0.7403          
##                                           
##        'Positive' Class : 0               
## 

1.4 COMPARING MODEL ACCURACY

Logistic Regression

log_prob_fragile <- predict(fragility_model, test_fragile, type = "response")

log_pred_fragile <- ifelse(log_prob_fragile > 0.5, 1, 0)
log_pred_fragile <- as.factor(log_pred_fragile)

# confusion matrix
conf_log <- confusionMatrix(log_pred_fragile, test_fragile$fragile)
conf_log
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 575 219
##          1 120 232
##                                           
##                Accuracy : 0.7042          
##                  95% CI : (0.6768, 0.7305)
##     No Information Rate : 0.6065          
##     P-Value [Acc > NIR] : 3.316e-12       
##                                           
##                   Kappa : 0.3554          
##                                           
##  Mcnemar's Test P-Value : 1.023e-07       
##                                           
##             Sensitivity : 0.8273          
##             Specificity : 0.5144          
##          Pos Pred Value : 0.7242          
##          Neg Pred Value : 0.6591          
##              Prevalence : 0.6065          
##          Detection Rate : 0.5017          
##    Detection Prevalence : 0.6928          
##       Balanced Accuracy : 0.6709          
##                                           
##        'Positive' Class : 0               
## 

Random forest predictions

rf_pred <- predict(rf_fragile, test_fragile)

# confusion matrix
conf_rf <- confusionMatrix(rf_pred_fragile, test_fragile$fragile)
conf_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 579 159
##          1 116 292
##                                           
##                Accuracy : 0.76            
##                  95% CI : (0.7342, 0.7845)
##     No Information Rate : 0.6065          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.4887          
##                                           
##  Mcnemar's Test P-Value : 0.01132         
##                                           
##             Sensitivity : 0.8331          
##             Specificity : 0.6475          
##          Pos Pred Value : 0.7846          
##          Neg Pred Value : 0.7157          
##              Prevalence : 0.6065          
##          Detection Rate : 0.5052          
##    Detection Prevalence : 0.6440          
##       Balanced Accuracy : 0.7403          
##                                           
##        'Positive' Class : 0               
## 

Comparing models

log_acc <- conf_log$overall["Accuracy"]
rf_acc  <- conf_rf$overall["Accuracy"]

log_acc
##  Accuracy 
## 0.7041885
rf_acc
##  Accuracy 
## 0.7600349

PART 2: DEBT STRESS MODEL

2.1: Creating a debt stress score using my own target variable “debt_stress_score”. Uses the scores from carry_balance, minimum_payment, late_fee, too_much_debt, financial_difficulty

debt_df <- finance_df %>%
  mutate(
    debt_stress_score =
      (carry_balance == 1) +
      (minimum_payment == 1) +
      (late_fee == 1) +
      (too_much_debt >= 5) +
      (financial_difficulty %in% c(1,2))
  )

Converting into binary target

debt_df <- debt_df %>%
  mutate(
    debt_stress = ifelse(debt_stress_score >= 2, 1, 0),
    debt_stress = as.factor(debt_stress)
  )

2.2: LOGISTIC REGRESSION FOR DEBT STRESS

Clean the model to remove leakage variables

debt_model <- glm(
  debt_stress ~ literacy_score +
                 gender +
                 age_group +
                 ethnicity +
                 education +
                 income +
                 marital_status +
                 spending_behavior +
                 investments +
                 homeowner,
  data = debt_df,
  family = binomial
)
summary(debt_model)
## 
## Call:
## glm(formula = debt_stress ~ literacy_score + gender + age_group + 
##     ethnicity + education + income + marital_status + spending_behavior + 
##     investments + homeowner, family = binomial, data = debt_df)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.05577    0.30472  -0.183 0.854778    
## literacy_score    -0.20525    0.02587  -7.934 2.12e-15 ***
## gender             0.16189    0.06418   2.522 0.011653 *  
## age_group         -0.28610    0.02177 -13.143  < 2e-16 ***
## ethnicity          0.17074    0.07453   2.291 0.021974 *  
## education         -0.08228    0.02176  -3.782 0.000156 ***
## income            -0.09524    0.01878  -5.072 3.94e-07 ***
## marital_status    -0.08343    0.03781  -2.206 0.027368 *  
## spending_behavior  0.55597    0.03535  15.729  < 2e-16 ***
## investments        0.70734    0.06440  10.984  < 2e-16 ***
## homeowner          0.48614    0.07884   6.166 7.01e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7826.7  on 5733  degrees of freedom
## Residual deviance: 6468.6  on 5723  degrees of freedom
## AIC: 6490.6
## 
## Number of Fisher Scoring iterations: 3

Log odds and CI

exp(cbind(
  Odds_Ratio = coef(debt_model),
  confint(debt_model)
))
## Waiting for profiling to be done...
##                   Odds_Ratio     2.5 %    97.5 %
## (Intercept)        0.9457559 0.5206532 1.7195404
## literacy_score     0.8144398 0.7740737 0.8567092
## gender             1.1757251 1.0366755 1.3332597
## age_group          0.7511841 0.7197031 0.7838265
## ethnicity          1.1861867 1.0247673 1.3725785
## education          0.9210096 0.8825507 0.9611456
## income             0.9091591 0.8762384 0.9431824
## marital_status     0.9199593 0.8540554 0.9905404
## spending_behavior  1.7436301 1.6272055 1.8690707
## investments        2.0285981 1.7881862 2.3017315
## homeowner          1.6260236 1.3933035 1.8980036

2.3: RANDOM FOREST FOR DEBT STRESS

library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
set.seed(1234)

trainIndex2 <- createDataPartition(
  debt_df$debt_stress,
  p = 0.8,
  list = FALSE
)

train_debt <- debt_df[trainIndex2, ]
test_debt  <- debt_df[-trainIndex2, ]
rf_debt <- randomForest(
  debt_stress ~ literacy_score +
                 gender +
                 age_group +
                 ethnicity +
                 education +
                 income +
                 marital_status +
                 spending_behavior +
                 investments +
                 homeowner,
  data = train_debt,
  importance = TRUE,
  ntree = 500
)
rf_debt
## 
## Call:
##  randomForest(formula = debt_stress ~ literacy_score + gender +      age_group + ethnicity + education + income + marital_status +      spending_behavior + investments + homeowner, data = train_debt,      importance = TRUE, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 27.33%
## Confusion matrix:
##      0    1 class.error
## 0 2049  579   0.2203196
## 1  675 1285   0.3443878
varImpPlot(rf_debt)

2.4: COMPARING MODEL ACCURACY

log_prob_debt <- predict(debt_model, test_debt, type = "response")

log_pred_debt <- ifelse(log_prob_debt > 0.5, 1, 0)
log_pred_debt <- as.factor(log_pred_debt)

# confusion matrix
conf_log_debt <- confusionMatrix(log_pred_debt,test_debt$debt_stress)
conf_log_debt
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 526 198
##          1 131 291
##                                          
##                Accuracy : 0.7129         
##                  95% CI : (0.6858, 0.739)
##     No Information Rate : 0.5733         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.4028         
##                                          
##  Mcnemar's Test P-Value : 0.000274       
##                                          
##             Sensitivity : 0.8006         
##             Specificity : 0.5951         
##          Pos Pred Value : 0.7265         
##          Neg Pred Value : 0.6896         
##              Prevalence : 0.5733         
##          Detection Rate : 0.4590         
##    Detection Prevalence : 0.6318         
##       Balanced Accuracy : 0.6979         
##                                          
##        'Positive' Class : 0              
## 

Random Forest predictions

rf_pred_debt <- predict(rf_debt,test_debt)

# confusion matrix
conf_rf_debt <- confusionMatrix(rf_pred_debt,test_debt$debt_stress)
conf_rf_debt
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 507 167
##          1 150 322
##                                           
##                Accuracy : 0.7234          
##                  95% CI : (0.6965, 0.7491)
##     No Information Rate : 0.5733          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4321          
##                                           
##  Mcnemar's Test P-Value : 0.3688          
##                                           
##             Sensitivity : 0.7717          
##             Specificity : 0.6585          
##          Pos Pred Value : 0.7522          
##          Neg Pred Value : 0.6822          
##              Prevalence : 0.5733          
##          Detection Rate : 0.4424          
##    Detection Prevalence : 0.5881          
##       Balanced Accuracy : 0.7151          
##                                           
##        'Positive' Class : 0               
## 

Compare accuracies

log_acc_debt <- conf_log_debt$overall["Accuracy"]
rf_acc_debt <- conf_rf_debt$overall["Accuracy"]

log_acc_debt
##  Accuracy 
## 0.7129145
rf_acc_debt
##  Accuracy 
## 0.7233857