Implement a logistic regression model to predict the target variable. Provide the model summary, including coefficient estimates and their interpretations.
# Load required libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart)
library(rpart.plot)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(gains)
# Load dataset
bank.df <- read.csv("BANL 6625_FA_24_Assignment_3_dataset.csv")
# Convert categorical variables to factors
bank.df$Education <- factor(bank.df$Education, levels = unique(bank.df$Education))
bank.df$Marital_Status <- factor(bank.df$Marital_Status)
bank.df$Defaulted <- factor(bank.df$Defaulted, levels = c("0", "1"))
# Partition data (60% training, 40% validation)
set.seed(2)
train.index <- sample(1:nrow(bank.df), 0.6 * nrow(bank.df))
train.df <- bank.df[train.index, ]
valid.df <- bank.df[-train.index, ]
# Logistic Regression
logit.reg <- glm(Defaulted ~ Age + Income + Education + Marital_Status,
data = train.df, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Model Summary
summary(logit.reg)
##
## Call:
## glm(formula = Defaulted ~ Age + Income + Education + Marital_Status,
## family = "binomial", data = train.df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.425e+01 6.025e+04 0.000 1.000
## Age 5.796e-03 8.766e+02 0.000 1.000
## Income -7.841e-06 6.559e-01 0.000 1.000
## EducationBachelor's 4.928e+01 4.410e+04 0.001 0.999
## EducationHigh School 4.888e+01 4.337e+04 0.001 0.999
## EducationPhD -4.792e+01 4.533e+04 -0.001 0.999
## Marital_StatusMarried -9.813e+01 6.032e+04 -0.002 0.999
## Marital_StatusSingle -4.796e+01 4.130e+04 -0.001 0.999
## Marital_StatusWidowed -4.761e+01 4.456e+04 -0.001 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.4951e+02 on 179 degrees of freedom
## Residual deviance: 4.1421e-09 on 171 degrees of freedom
## AIC: 18
##
## Number of Fisher Scoring iterations: 25
# Predictions for validation data
logit.pred <- predict(logit.reg, valid.df, type = "response")
# Convert predictions to binary (threshold = 0.5)
logit.pred.class <- ifelse(logit.pred > 0.5, "1", "0")
# Confusion Matrix for Logistic Regression
confusion.logit <- confusionMatrix(factor(logit.pred.class, levels = c("0", "1")),
valid.df$Defaulted)
print(confusion.logit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 60 0
## 1 1 59
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.5083
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9833
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9836
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9833
## Prevalence : 0.5083
## Detection Rate : 0.5000
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9918
##
## 'Positive' Class : 0
##
# AUC-ROC for Logistic Regression
roc.logit <- roc(valid.df$Defaulted, logit.pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc.logit <- auc(roc.logit)
plot(roc.logit, main = paste("Logistic Regression AUC-ROC (AUC =", round(auc.logit, 2), ")"))
# Gains Chart for Logistic Regression
gain.logit <- gains(as.numeric(as.character(valid.df$Defaulted)), logit.pred, groups = 10)
plot(c(0, gain.logit$cume.pct.of.total * sum(as.numeric(as.character(valid.df$Defaulted)))) ~
c(0, gain.logit$cume.obs), type = "l", main = "Cumulative Gains Chart", xlab = "Cases", ylab = "Cumulative")
# Classification Tree
tree.model <- rpart(Defaulted ~ Age + Income + Education + Marital_Status,
data = train.df, method = "class", cp = 0.01)
# Plot Classification Tree
rpart.plot(tree.model, main = "Classification Tree")
# Best Complexity Parameter (cp)
printcp(tree.model)
##
## Classification tree:
## rpart(formula = Defaulted ~ Age + Income + Education + Marital_Status,
## data = train.df, method = "class", cp = 0.01)
##
## Variables actually used in tree construction:
## [1] Education Marital_Status
##
## Root node error: 89/180 = 0.49444
##
## n= 180
##
## CP nsplit rel error xerror xstd
## 1 0.640449 0 1.00000 1.269663 0.072870
## 2 0.213483 1 0.35955 0.359551 0.057634
## 3 0.073034 2 0.14607 0.146067 0.039021
## 4 0.010000 4 0.00000 0.078652 0.029144
best.cp <- tree.model$cptable[which.min(tree.model$cptable[,"xerror"]), "CP"]
cat("Best cp:", best.cp)
## Best cp: 0.01
# Prune Tree Using Best cp
pruned.tree <- prune(tree.model, cp = best.cp)
# Visualize Pruned Tree
rpart.plot(pruned.tree, main = "Pruned Classification Tree")
# Predictions for Training and Validation Data
tree.pred.train <- predict(pruned.tree, train.df, type = "class")
tree.pred.valid <- predict(pruned.tree, valid.df, type = "class")
# Confusion Matrix for Tree Model
confusion.tree.train <- confusionMatrix(tree.pred.train, train.df$Defaulted)
confusion.tree.valid <- confusionMatrix(tree.pred.valid, valid.df$Defaulted)
print(confusion.tree.valid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 60 0
## 1 1 59
##
## Accuracy : 0.9917
## 95% CI : (0.9544, 0.9998)
## No Information Rate : 0.5083
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9833
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9836
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9833
## Prevalence : 0.5083
## Detection Rate : 0.5000
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9918
##
## 'Positive' Class : 0
##
# AUC-ROC for Classification Tree
tree.prob <- predict(pruned.tree, valid.df, type = "prob")[,2]
roc.tree <- roc(valid.df$Defaulted, tree.prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc.tree <- auc(roc.tree)
plot(roc.tree, main = paste("Classification Tree AUC-ROC (AUC =", round(auc.tree, 2), ")"))
# Compare Logistic Regression and Classification Tree
cat("Logistic Regression AUC:", auc.logit, "\n")
## Logistic Regression AUC: 1
cat("Classification Tree AUC:", auc.tree, "\n")
## Classification Tree AUC: 0.9918033
# Summary of Comparison
if (auc.logit > auc.tree) {
cat("Logistic Regression performs better.\n")
} else {
cat("Classification Tree performs better.\n")
}
## Logistic Regression performs better.
Kappa (0.983): Indicates an almost perfect agreement between predictions and actual values for the logistic regression model.
Sensitivity (0.984): The model correctly
identifies 98.4% of the positive cases
(Defaulted = 0).
Specificity (1.000): The model perfectly
identifies all the negative cases (Defaulted = 1).
Balanced Accuracy (0.992): This metric, the average of sensitivity and specificity, shows that the model performs well on both classes.
AUC-ROC for Logistic Regression (1.0): This perfect score means the logistic regression model is exceptionally good at distinguishing between classes.
AUC-ROC for Classification Tree (0.992): The tree also performs well but is slightly less accurate compared to logistic regression.
Logistic regression achieves near-perfect performance, as evidenced by its AUC of 1 and perfect specificity.
The classification tree, though slightly less accurate with an AUC of 0.992, is still an excellent model.
From the logistic regression
summary, Income, Age, Education,
and Marital_Status are analyzed for their contribution
to predicting Defaulted. Features with statistically
significant coefficients (p-values < 0.05) play the most critical
role in determining the outcome.
Income and Age: Likely to have high predictive importance as they influence financial behavior.
Education: Higher education levels might correlate with lower default rates due to better financial literacy.
Marital_Status: Life stability (e.g., married vs. divorced) could impact financial obligations and decisions.
Logistic Regression performs better overall, especially in cases requiring a continuous prediction probability for finer threshold adjustments (e.g., financial risk modeling).
Classification Tree provides an interpretable model structure, making it suitable when model transparency is important, such as presenting results to non-technical stakeholders.
Logistic Regression is preferred for:
High-dimensional data.
Scenarios demanding accurate probability estimates.
Classification Tree is preferred for:
Interpretable decision-making processes.
Situations with potential non-linear relationships in features