# 1 Load Libraries and Data
# Load required libraries
library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071) # needed for confusionMatrix
library(ggplot2)
# Load the dataset
data(Default)
df <- Default
# View first few rows
head(df)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
# 2 Preprocess Data
# Encode 'default' as binary
df$default <- ifelse(df$default == "Yes", 1, 0)
# Split into training and testing
set.seed(123)
train_index <- createDataPartition(df$default, p = 0.7, list = FALSE)
train <- df[train_index, ]
test <- df[-train_index, ]
# 3: Logistic Regression
# Fit logistic regression model
logit_model <- glm(default ~ balance + income + student,
data = train,
family = "binomial")
# Predict on test set
logit_probs <- predict(logit_model, newdata = test, type = "response")
logit_pred <- ifelse(logit_probs > 0.5, 1, 0)
# Confusion matrix
confusionMatrix(factor(logit_pred), factor(test$default))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2889 70
## 1 11 30
##
## Accuracy : 0.973
## 95% CI : (0.9666, 0.9785)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 0.02705
##
## Kappa : 0.4142
##
## Mcnemar's Test P-Value : 1.16e-10
##
## Sensitivity : 0.9962
## Specificity : 0.3000
## Pos Pred Value : 0.9763
## Neg Pred Value : 0.7317
## Prevalence : 0.9667
## Detection Rate : 0.9630
## Detection Prevalence : 0.9863
## Balanced Accuracy : 0.6481
##
## 'Positive' Class : 0
##
# 4 : PCA + Logistic Regression
# Apply PCA (scale = TRUE is important)
pca_train <- prcomp(train[, c("balance", "income")], scale. = TRUE)
pca_test <- predict(pca_train, newdata = test[, c("balance", "income")])
# Use first 2 principal components
pca_train_df <- data.frame(PC1 = pca_train$x[,1],
PC2 = pca_train$x[,2],
default = train$default)
pca_test_df <- data.frame(PC1 = pca_test[,1],
PC2 = pca_test[,2],
default = test$default)
# Logistic regression using PCs
pca_logit <- glm(default ~ PC1 + PC2, data = pca_train_df, family = "binomial")
# Predict on test set
pca_probs <- predict(pca_logit, newdata = pca_test_df, type = "response")
pca_pred <- ifelse(pca_probs > 0.5, 1, 0)
# Confusion matrix
confusionMatrix(factor(pca_pred), factor(pca_test_df$default))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2890 69
## 1 10 31
##
## Accuracy : 0.9737
## 95% CI : (0.9673, 0.9791)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 0.01601
##
## Kappa : 0.4286
##
## Mcnemar's Test P-Value : 6.777e-11
##
## Sensitivity : 0.9966
## Specificity : 0.3100
## Pos Pred Value : 0.9767
## Neg Pred Value : 0.7561
## Prevalence : 0.9667
## Detection Rate : 0.9633
## Detection Prevalence : 0.9863
## Balanced Accuracy : 0.6533
##
## 'Positive' Class : 0
##
# 5 : Visualization
ggplot(pca_train_df, aes(x = PC1, y = PC2, color = factor(default))) +
geom_point(alpha = 0.6) +
labs(title = "PCA Projection of Credit Default Data", color = "Default") +
theme_minimal()

# third model: K-Nearest Neighbors (KNN)
# Load required package
library(class)
# Normalize numeric columns
normalize <- function(x) {
return((x - min(x)) / (max(x) - min(x)))
}
# Create normalized dataset
df_knn <- df
df_knn$balance <- normalize(df$balance)
df_knn$income <- normalize(df$income)
# Split train/test again using same indices for fair comparison
train_knn <- df_knn[train_index, ]
test_knn <- df_knn[-train_index, ]
# Use balance, income, student (encoded)
train_knn$student <- ifelse(train_knn$student == "Yes", 1, 0)
test_knn$student <- ifelse(test_knn$student == "Yes", 1, 0)
# Prepare features and labels
X_train <- train_knn[, c("balance", "income", "student")]
X_test <- test_knn[, c("balance", "income", "student")]
y_train <- train_knn$default
y_test <- test_knn$default
# Apply KNN (k = 5)
set.seed(123)
knn_pred <- knn(train = X_train, test = X_test, cl = y_train, k = 5)
# Confusion matrix
confusionMatrix(knn_pred, factor(y_test))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2878 67
## 1 22 33
##
## Accuracy : 0.9703
## 95% CI : (0.9636, 0.9761)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 0.1422
##
## Kappa : 0.4119
##
## Mcnemar's Test P-Value : 3.101e-06
##
## Sensitivity : 0.9924
## Specificity : 0.3300
## Pos Pred Value : 0.9772
## Neg Pred Value : 0.6000
## Prevalence : 0.9667
## Detection Rate : 0.9593
## Detection Prevalence : 0.9817
## Balanced Accuracy : 0.6612
##
## 'Positive' Class : 0
##