# 1 Load Libraries and Data
# Load required libraries
library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)  # suggested by caret; avoids missing-dependency errors in some caret functions
library(ggplot2)  # already attached via tidyverse; loaded again here for explicitness

# Load the dataset
data(Default)
df <- Default

# View first few rows
head(df)
##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559
# 2 Preprocess Data
# Encode 'default' as binary
df$default <- ifelse(df$default == "Yes", 1, 0)
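
# (Optional check) The response is heavily imbalanced: only about 3% of
# customers default, matching the ~0.9667 prevalence reported in the
# confusion matrices below, which is worth keeping in mind when reading accuracy.
table(df$default)
prop.table(table(df$default))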

# Split into training and testing
set.seed(123)
train_index <- createDataPartition(df$default, p = 0.7, list = FALSE)
train <- df[train_index, ]
test <- df[-train_index, ]
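
# (Optional check, output not shown) Confirm the split preserved a similar
# default rate in both partitions before fitting any models.
mean(train$default)
mean(test$default)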
# 3 Logistic Regression
# Fit logistic regression model
logit_model <- glm(default ~ balance + income + student, 
                   data = train, 
                   family = "binomial")
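
# (Optional, output not shown) Inspect the fitted coefficients; in the Default
# data, balance is typically by far the strongest predictor.
summary(logit_model)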

# Predict on test set
logit_probs <- predict(logit_model, newdata = test, type = "response")
logit_pred <- ifelse(logit_probs > 0.5, 1, 0)

# Confusion matrix
confusionMatrix(factor(logit_pred), factor(test$default))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2889   70
##          1   11   30
##                                           
##                Accuracy : 0.973           
##                  95% CI : (0.9666, 0.9785)
##     No Information Rate : 0.9667          
##     P-Value [Acc > NIR] : 0.02705         
##                                           
##                   Kappa : 0.4142          
##                                           
##  Mcnemar's Test P-Value : 1.16e-10        
##                                           
##             Sensitivity : 0.9962          
##             Specificity : 0.3000          
##          Pos Pred Value : 0.9763          
##          Neg Pred Value : 0.7317          
##              Prevalence : 0.9667          
##          Detection Rate : 0.9630          
##    Detection Prevalence : 0.9863          
##       Balanced Accuracy : 0.6481          
##                                           
##        'Positive' Class : 0               
## 
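# Note: caret treated "0" (no default) as the positive class above, so
# Sensitivity/Specificity describe the non-defaulters. To report the same
# metrics for the default class instead, set the positive level explicitly
# (output not shown):
confusionMatrix(factor(logit_pred), factor(test$default), positive = "1")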
# 4 PCA + Logistic Regression
# Apply PCA to the numeric predictors (scale. = TRUE standardizes balance and income)
pca_train <- prcomp(train[, c("balance", "income")], scale. = TRUE)
pca_test <- predict(pca_train, newdata = test[, c("balance", "income")])
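
# (Optional, output not shown) Check how much of the variance in balance and
# income each component captures before modeling on the scores.
summary(pca_train)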

# Use both principal components (two numeric predictors yield exactly two PCs)
pca_train_df <- data.frame(PC1 = pca_train$x[,1], 
                           PC2 = pca_train$x[,2], 
                           default = train$default)

pca_test_df <- data.frame(PC1 = pca_test[,1], 
                          PC2 = pca_test[,2], 
                          default = test$default)

# Logistic regression using PCs
pca_logit <- glm(default ~ PC1 + PC2, data = pca_train_df, family = "binomial")

# Predict on test set
pca_probs <- predict(pca_logit, newdata = pca_test_df, type = "response")
pca_pred <- ifelse(pca_probs > 0.5, 1, 0)

# Confusion matrix
confusionMatrix(factor(pca_pred), factor(pca_test_df$default))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2890   69
##          1   10   31
##                                           
##                Accuracy : 0.9737          
##                  95% CI : (0.9673, 0.9791)
##     No Information Rate : 0.9667          
##     P-Value [Acc > NIR] : 0.01601         
##                                           
##                   Kappa : 0.4286          
##                                           
##  Mcnemar's Test P-Value : 6.777e-11       
##                                           
##             Sensitivity : 0.9966          
##             Specificity : 0.3100          
##          Pos Pred Value : 0.9767          
##          Neg Pred Value : 0.7561          
##              Prevalence : 0.9667          
##          Detection Rate : 0.9633          
##    Detection Prevalence : 0.9863          
##       Balanced Accuracy : 0.6533          
##                                           
##        'Positive' Class : 0               
## 
# 5 Visualization
ggplot(pca_train_df, aes(x = PC1, y = PC2, color = factor(default))) +
  geom_point(alpha = 0.6) +
  labs(title = "PCA Projection of Credit Default Data", color = "Default") +
  theme_minimal()

# 6 Third Model: K-Nearest Neighbors (KNN)
# Load required package
library(class)

# Normalize numeric columns
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
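
# Quick sanity check of the helper: values are mapped onto [0, 1]
normalize(c(0, 5, 10))  # 0.0 0.5 1.0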

# Create normalized dataset
# (note: min/max come from the full data here; a stricter workflow would
#  compute them on the training split only to avoid leakage into the test set)
df_knn <- df
df_knn$balance <- normalize(df$balance)
df_knn$income <- normalize(df$income)

# Split train/test again using same indices for fair comparison
train_knn <- df_knn[train_index, ]
test_knn <- df_knn[-train_index, ]

# Use balance, income, student (encoded)
train_knn$student <- ifelse(train_knn$student == "Yes", 1, 0)
test_knn$student <- ifelse(test_knn$student == "Yes", 1, 0)

# Prepare features and labels
X_train <- train_knn[, c("balance", "income", "student")]
X_test <- test_knn[, c("balance", "income", "student")]
y_train <- train_knn$default
y_test <- test_knn$default

# Apply KNN (k = 5); the seed only matters for random tie-breaking in knn()
set.seed(123)
knn_pred <- knn(train = X_train, test = X_test, cl = y_train, k = 5)

# Confusion matrix
confusionMatrix(knn_pred, factor(y_test))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2878   67
##          1   22   33
##                                           
##                Accuracy : 0.9703          
##                  95% CI : (0.9636, 0.9761)
##     No Information Rate : 0.9667          
##     P-Value [Acc > NIR] : 0.1422          
##                                           
##                   Kappa : 0.4119          
##                                           
##  Mcnemar's Test P-Value : 3.101e-06       
##                                           
##             Sensitivity : 0.9924          
##             Specificity : 0.3300          
##          Pos Pred Value : 0.9772          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.9667          
##          Detection Rate : 0.9593          
##    Detection Prevalence : 0.9817          
##       Balanced Accuracy : 0.6612          
##                                           
##        'Positive' Class : 0               
##
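# (Optional extension, not part of the run above) k = 5 was chosen ad hoc; a
# simple sketch for picking k compares accuracy over a small grid. A more
# careful version would use cross-validation on the training data rather than
# scoring the test set repeatedly.
set.seed(123)
for (k in c(1, 3, 5, 7, 9, 15)) {
  pred_k <- knn(train = X_train, test = X_test, cl = y_train, k = k)
  cat("k =", k, "accuracy =", round(mean(pred_k == y_test), 4), "\n")
}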