library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read_csv("nfcs.csv")
## Rows: 25539 Columns: 133
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (133): NFCSID, STATEQ, CENSUSDIV, CENSUSREG, A50A, A3Ar_w, A50B, A4A_new...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
dim(df)
## [1] 25539 133
Getting only the important columns
df2 <- df %>%
select(
# demographics
A50A, # gender
A3Ar_w, # age group
A4A_new_w, # ethnicity
A5_2015, # education
A7A, # maritial status
A8_2021, # household income
# financial behavior / outcomes
J3, # spending habits
J4, # monthly expenses
B14A_1, # investments?
EA_1, # own home?
F2_2, # credit card debt
F2_3, # paying only monthly minimums
F2_4, # cc late fees
G23, # "i have too much debt"
# financial literacy questions
M6,
M7,
M8,
M31,
M50,
M9,
M10,
)
Renaming columns
df2 <- df2 %>%
rename(
gender = A50A,
age_group = A3Ar_w,
ethnicity = A4A_new_w,
education = A5_2015,
marital_status = A7A,
income = A8_2021,
spending_behavior = J3,
financial_difficulty = J4,
investments = B14A_1,
homeowner = EA_1,
carry_balance = F2_2,
minimum_payment = F2_3,
late_fee = F2_4,
too_much_debt = G23
)
CREATE A LITERACY SCORE! (sum of correct answer = score)
df2 <- df2 %>%
mutate(
literacy_score =
(M6 == 1) +
(M7 == 3) +
(M8 == 2) +
(M31 == 2) +
(M9 == 1) +
(M10 == 2) +
(M50 == 3)
)
Making financial_difficulty the target variable. there are 3 possible values. In a typical month, how difficult is it for you to cover your expenses and pay all your bills? 1: very difficult 2: somewhat difficult 3: not at all difficult
Turning this into a binary score so answers 1 & 2 are both considered as financial hardship. Now 0 = no financial difficulty 1 = financial difficulty
df2 <- df2 %>%
mutate(
fragile = ifelse(financial_difficulty %in% c(1,2), 1, 0)
)
table(df2$fragile)
##
## 0 1
## 11912 13627
Removing “Don’t know” and “Prefer not to say” values
df2[df2 == 98 | df2 == 99] <- NA
df2 <- na.omit(df2)
head(df2)
……….. LOGISTIC REGRESSION MODEL!
model <- glm(
fragile ~ literacy_score +
gender +
age_group +
ethnicity +
education +
income +
marital_status +
spending_behavior,
data = df2,
family = binomial
)
summary(model)
##
## Call:
## glm(formula = fragile ~ literacy_score + gender + age_group +
## ethnicity + education + income + marital_status + spending_behavior,
## family = binomial, data = df2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.60850 0.25738 6.249 4.12e-10 ***
## literacy_score -0.10276 0.02534 -4.056 4.99e-05 ***
## gender 0.31617 0.06314 5.008 5.51e-07 ***
## age_group -0.28173 0.02109 -13.357 < 2e-16 ***
## ethnicity 0.14719 0.07334 2.007 0.0448 *
## education -0.08328 0.02129 -3.913 9.13e-05 ***
## income -0.25255 0.01844 -13.697 < 2e-16 ***
## marital_status -0.06973 0.03633 -1.919 0.0549 .
## spending_behavior 0.53917 0.03494 15.433 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7685.7 on 5733 degrees of freedom
## Residual deviance: 6546.5 on 5725 degrees of freedom
## AIC: 6564.5
##
## Number of Fisher Scoring iterations: 3
Log odds and CI
exp(cbind(
Odds_Ratio = coef(model),
confint(model)
))
## Waiting for profiling to be done...
## Odds_Ratio 2.5 % 97.5 %
## (Intercept) 4.9953309 3.0207264 8.2865069
## literacy_score 0.9023401 0.8586017 0.9482721
## gender 1.3718697 1.2121510 1.5526051
## age_group 0.7544747 0.7238149 0.7862164
## ethnicity 1.1585762 1.0031725 1.3373822
## education 0.9200940 0.8825046 0.9593133
## income 0.7768195 0.7491047 0.8052630
## marital_status 0.9326415 0.8683515 1.0012806
## spending_behavior 1.7145778 1.6013545 1.8364222
…… RANDOM FOREST!
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
df2$fragile <- as.factor(df2$fragile)
test / train split
set.seed(1234)
trainIndex <- createDataPartition(df2$fragile, p = 0.8, list = FALSE)
train <- df2[trainIndex, ]
test <- df2[-trainIndex, ]
Random Forest Model:
rf_model <- randomForest(
fragile ~ literacy_score +
gender +
age_group +
ethnicity +
education +
income +
marital_status +
spending_behavior,
data = train,
importance = TRUE,
ntree = 500
)
rf_model
##
## Call:
## randomForest(formula = fragile ~ literacy_score + gender + age_group + ethnicity + education + income + marital_status + spending_behavior, data = train, importance = TRUE, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 26.37%
## Confusion matrix:
## 0 1 class.error
## 0 2320 464 0.1666667
## 1 746 1058 0.4135255
varImpPlot(rf_model)
…. COMPARING MODELS
log_prob <- predict(model, test, type = "response")
log_pred <- ifelse(log_prob > 0.5, 1, 0)
log_pred <- as.factor(log_pred)
# confusion matrix
conf_log <- confusionMatrix(log_pred, test$fragile)
conf_log
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 575 219
## 1 120 232
##
## Accuracy : 0.7042
## 95% CI : (0.6768, 0.7305)
## No Information Rate : 0.6065
## P-Value [Acc > NIR] : 3.316e-12
##
## Kappa : 0.3554
##
## Mcnemar's Test P-Value : 1.023e-07
##
## Sensitivity : 0.8273
## Specificity : 0.5144
## Pos Pred Value : 0.7242
## Neg Pred Value : 0.6591
## Prevalence : 0.6065
## Detection Rate : 0.5017
## Detection Prevalence : 0.6928
## Balanced Accuracy : 0.6709
##
## 'Positive' Class : 0
##
rf_pred <- predict(rf_model, test)
# confusion matrix
conf_rf <- confusionMatrix(rf_pred, test$fragile)
conf_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 579 159
## 1 116 292
##
## Accuracy : 0.76
## 95% CI : (0.7342, 0.7845)
## No Information Rate : 0.6065
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.4887
##
## Mcnemar's Test P-Value : 0.01132
##
## Sensitivity : 0.8331
## Specificity : 0.6475
## Pos Pred Value : 0.7846
## Neg Pred Value : 0.7157
## Prevalence : 0.6065
## Detection Rate : 0.5052
## Detection Prevalence : 0.6440
## Balanced Accuracy : 0.7403
##
## 'Positive' Class : 0
##
comparing
log_acc <- conf_log$overall["Accuracy"]
rf_acc <- conf_rf$overall["Accuracy"]
log_acc
## Accuracy
## 0.7041885
rf_acc
## Accuracy
## 0.7600349