library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read_csv("nfcs.csv")
## Rows: 25539 Columns: 133
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (133): NFCSID, STATEQ, CENSUSDIV, CENSUSREG, A50A, A3Ar_w, A50B, A4A_new...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
dim(df)
## [1] 25539   133

Getting only the important columns

df2 <- df %>%
  select(
    # demographics
    A50A, # gender
    A3Ar_w, # age group
    A4A_new_w, # ethnicity
    A5_2015, # education
    A7A, # maritial status
    A8_2021, # household income

    # financial behavior / outcomes
    J3, # spending habits
    J4, # monthly expenses
    B14A_1, # investments?
    EA_1, # own home?
    F2_2, # credit card debt
    F2_3, # paying only monthly minimums
    F2_4, # cc late fees
    G23, # "i have too much debt"

    # financial literacy questions
    M6,
    M7,
    M8,
    M31,
    M50,
    M9,
    M10,
  )

Renaming columns

df2 <- df2 %>%
  rename(
    gender = A50A,
    age_group = A3Ar_w,
    ethnicity = A4A_new_w,
    education = A5_2015,
    marital_status = A7A,
    income = A8_2021,

    spending_behavior = J3,
    financial_difficulty = J4,
    investments = B14A_1,
    homeowner = EA_1,

    carry_balance = F2_2,
    minimum_payment = F2_3,
    late_fee = F2_4,
    too_much_debt = G23
  )

CREATE A LITERACY SCORE! (sum of correct answer = score)

df2 <- df2 %>%
  mutate(
    literacy_score =
      (M6 == 1) +
      (M7 == 3) +
      (M8 == 2) +
      (M31 == 2) +
      (M9 == 1) +
      (M10 == 2) +
      (M50 == 3)
  )

Making financial_difficulty the target variable. there are 3 possible values. In a typical month, how difficult is it for you to cover your expenses and pay all your bills? 1: very difficult 2: somewhat difficult 3: not at all difficult

Turning this into a binary score so answers 1 & 2 are both considered as financial hardship. Now 0 = no financial difficulty 1 = financial difficulty

df2 <- df2 %>%
  mutate(
    fragile = ifelse(financial_difficulty %in% c(1,2), 1, 0)
  )

table(df2$fragile)
## 
##     0     1 
## 11912 13627

Removing “Don’t know” and “Prefer not to say” values

df2[df2 == 98 | df2 == 99] <- NA
df2 <- na.omit(df2)
head(df2)

……….. LOGISTIC REGRESSION MODEL!

model <- glm(
  fragile ~ literacy_score +
            gender +
            age_group +
            ethnicity +
            education +
            income +
            marital_status +
            spending_behavior,
  data = df2,
  family = binomial
)

summary(model)
## 
## Call:
## glm(formula = fragile ~ literacy_score + gender + age_group + 
##     ethnicity + education + income + marital_status + spending_behavior, 
##     family = binomial, data = df2)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.60850    0.25738   6.249 4.12e-10 ***
## literacy_score    -0.10276    0.02534  -4.056 4.99e-05 ***
## gender             0.31617    0.06314   5.008 5.51e-07 ***
## age_group         -0.28173    0.02109 -13.357  < 2e-16 ***
## ethnicity          0.14719    0.07334   2.007   0.0448 *  
## education         -0.08328    0.02129  -3.913 9.13e-05 ***
## income            -0.25255    0.01844 -13.697  < 2e-16 ***
## marital_status    -0.06973    0.03633  -1.919   0.0549 .  
## spending_behavior  0.53917    0.03494  15.433  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7685.7  on 5733  degrees of freedom
## Residual deviance: 6546.5  on 5725  degrees of freedom
## AIC: 6564.5
## 
## Number of Fisher Scoring iterations: 3

Log odds and CI

exp(cbind(
  Odds_Ratio = coef(model),
  confint(model)
))
## Waiting for profiling to be done...
##                   Odds_Ratio     2.5 %    97.5 %
## (Intercept)        4.9953309 3.0207264 8.2865069
## literacy_score     0.9023401 0.8586017 0.9482721
## gender             1.3718697 1.2121510 1.5526051
## age_group          0.7544747 0.7238149 0.7862164
## ethnicity          1.1585762 1.0031725 1.3373822
## education          0.9200940 0.8825046 0.9593133
## income             0.7768195 0.7491047 0.8052630
## marital_status     0.9326415 0.8683515 1.0012806
## spending_behavior  1.7145778 1.6013545 1.8364222

…… RANDOM FOREST!

library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
df2$fragile <- as.factor(df2$fragile)

test / train split

set.seed(1234)

trainIndex <- createDataPartition(df2$fragile, p = 0.8, list = FALSE)

train <- df2[trainIndex, ]
test  <- df2[-trainIndex, ]

Random Forest Model:

rf_model <- randomForest(
  fragile ~ literacy_score +
            gender +
            age_group +
            ethnicity +
            education +
            income +
            marital_status +
            spending_behavior,
  data = train,
  importance = TRUE,
  ntree = 500
)
rf_model
## 
## Call:
##  randomForest(formula = fragile ~ literacy_score + gender + age_group +      ethnicity + education + income + marital_status + spending_behavior,      data = train, importance = TRUE, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 26.37%
## Confusion matrix:
##      0    1 class.error
## 0 2320  464   0.1666667
## 1  746 1058   0.4135255
varImpPlot(rf_model)

…. COMPARING MODELS

  1. Logistic regression predictions
log_prob <- predict(model, test, type = "response")

log_pred <- ifelse(log_prob > 0.5, 1, 0)
log_pred <- as.factor(log_pred)

# confusion matrix
conf_log <- confusionMatrix(log_pred, test$fragile)
conf_log
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 575 219
##          1 120 232
##                                           
##                Accuracy : 0.7042          
##                  95% CI : (0.6768, 0.7305)
##     No Information Rate : 0.6065          
##     P-Value [Acc > NIR] : 3.316e-12       
##                                           
##                   Kappa : 0.3554          
##                                           
##  Mcnemar's Test P-Value : 1.023e-07       
##                                           
##             Sensitivity : 0.8273          
##             Specificity : 0.5144          
##          Pos Pred Value : 0.7242          
##          Neg Pred Value : 0.6591          
##              Prevalence : 0.6065          
##          Detection Rate : 0.5017          
##    Detection Prevalence : 0.6928          
##       Balanced Accuracy : 0.6709          
##                                           
##        'Positive' Class : 0               
## 
  1. random forest predictions
rf_pred <- predict(rf_model, test)

# confusion matrix
conf_rf <- confusionMatrix(rf_pred, test$fragile)
conf_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 579 159
##          1 116 292
##                                           
##                Accuracy : 0.76            
##                  95% CI : (0.7342, 0.7845)
##     No Information Rate : 0.6065          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.4887          
##                                           
##  Mcnemar's Test P-Value : 0.01132         
##                                           
##             Sensitivity : 0.8331          
##             Specificity : 0.6475          
##          Pos Pred Value : 0.7846          
##          Neg Pred Value : 0.7157          
##              Prevalence : 0.6065          
##          Detection Rate : 0.5052          
##    Detection Prevalence : 0.6440          
##       Balanced Accuracy : 0.7403          
##                                           
##        'Positive' Class : 0               
## 

comparing

log_acc <- conf_log$overall["Accuracy"]
rf_acc  <- conf_rf$overall["Accuracy"]

log_acc
##  Accuracy 
## 0.7041885
rf_acc
##  Accuracy 
## 0.7600349