Step 1: Load libraries
library(ISLR)
library(ggplot2)
library(dplyr)
library(pROC)
Step 2: Data Preprocessing
# Load the Default credit data set (provided by the ISLR package)
data(list = "Default")
# Show the first six rows as a quick sanity check
head(Default, n = 6L)
# Per-column summary statistics
summary(object = Default)
 default    student       balance           income
 No :9667   No :7056   Min.   :   0.0   Min.   :  772
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340
                       Median : 823.6   Median :34553
                       Mean   : 835.4   Mean   :33517
                       3rd Qu.:1166.3   3rd Qu.:43808
                       Max.   :2654.3   Max.   :73554
# Compact overview of the column types and the first few values
str(object = Default)
'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
# Recode the target 'default' as a numeric 0/1 indicator:
# "Yes" becomes 1, anything else becomes 0
Default <- mutate(Default, default = as.numeric(default == "Yes"))
# Count how many 0s and 1s the recoded column holds
table(Default$default)
Step 3: Train-Test Split
# Set seed for reproducibility
set.seed(123)
# Split the dataset: 80% of the rows for training, the rest for testing.
# seq_len() is used instead of 1:nrow() because 1:0 would yield c(1, 0)
# on an empty data frame; floor() makes the integer sample size explicit.
n_obs <- nrow(Default)
train_idx <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))
train_data <- Default[train_idx, ]
# Negative indexing keeps every row that was not drawn for training
test_data <- Default[-train_idx, ]
Step 4: Logistic Regression Model
# Fit a logistic regression of default on balance, income and student status
logit_model <- glm(
  formula = default ~ balance + income + student,
  family = binomial(link = "logit"),
  data = train_data
)
# Print coefficient estimates and fit statistics
summary(object = logit_model)
Step 5: Model Evaluation
# Predicted default probabilities for the held-out test rows
test_data$predicted_prob <- predict(
  logit_model,
  newdata = test_data,
  type = "response"
)
# Convert probabilities to binary predictions using a 0.5 cut-off
test_data$predicted_class <- ifelse(test_data$predicted_prob > 0.5, 1, 0)
# Confusion matrix. Both margins are fixed to the levels 0 and 1 so the table
# is always 2x2, even when a class is never predicted or never observed;
# without this, the diagonal would no longer line up with matching labels.
conf_matrix <- table(
  Predicted = factor(test_data$predicted_class, levels = c(0, 1)),
  Actual = factor(test_data$default, levels = c(0, 1))
)
print(conf_matrix)
         Actual
Predicted    0    1
        0 1928   46
        1    6   20
# Accuracy = correctly classified cases (the diagonal of the confusion
# matrix) divided by all classified cases
correct_count <- sum(diag(conf_matrix))
accuracy <- correct_count / sum(conf_matrix)
accuracy_label <- paste("Accuracy:", round(accuracy, digits = 4))
print(accuracy_label)
[1] "Accuracy: 0.974"
# Build the ROC curve from the observed labels and predicted probabilities.
# Levels and direction are not specified, so roc() picks them itself and
# reports its choice in the messages it prints.
roc_curve <- roc(
  response = test_data$default,
  predictor = test_data$predicted_prob
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Area under the ROC curve, reported to four decimal places
auc_value <- auc(roc_curve)
auc_label <- paste("AUC:", round(auc_value, digits = 4))
print(auc_label)
[1] "AUC: 0.9458"
# Draw the ROC curve as a thick blue line
plot(
  roc_curve,
  col = "blue",
  lwd = 2,
  main = "ROC Curve"
)
Step 6: Visualizing Results
# Histogram of balance, bars placed side by side per default status
Default %>%
  ggplot(mapping = aes(x = balance, fill = factor(default))) +
  geom_histogram(position = "dodge", binwidth = 500) +
  labs(
    title = "Balance vs Default",
    x = "Balance",
    y = "Count",
    fill = "Default"
  ) +
  theme_minimal()
# Histogram of income, bars placed side by side per default status
Default %>%
  ggplot(mapping = aes(x = income, fill = factor(default))) +
  geom_histogram(position = "dodge", binwidth = 5000) +
  labs(
    title = "Income vs Default",
    x = "Income",
    y = "Count",
    fill = "Default"
  ) +
  theme_minimal()
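Observations: The model performs well, with high accuracy and AUC scores. Note, however, that only about 3% of customers default, so accuracy is inflated by the majority class: predicting "no default" for everyone would already be about 96.7% accurate on the test set, and at the 0.5 threshold the model identifies only 20 of the 66 actual defaulters (about 30% sensitivity). The AUC is therefore the more informative metric here. Balance is the strongest predictor of default, with a clear separation between the two classes. Income shows a weaker relationship with default.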
Conclusion:
This analysis demonstrates the application of logistic regression for credit risk prediction. The results indicate that the customer’s balance is the most significant factor in predicting default.