library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
# Load the Boston dataset (the MASS copy, which masks ISLR2's version after the attach above)
data("Boston")
# Create a binary response variable: 1 if crime rate > median, else 0
Boston <- Boston %>%
mutate(crim_binary = ifelse(crim > median(crim), 1, 0)) %>%
select(-crim) # Remove original crim to avoid leakage
set.seed(1)
train_index <- createDataPartition(Boston$crim_binary, p = 0.7, list = FALSE)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
log_model <- glm(crim_binary ~ ., data = train, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
log_probs <- predict(log_model, test, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
# Accuracy
log_acc <- mean(log_preds == test$crim_binary)
log_acc
## [1] 0.88
# Logistic Regression using a subset of predictors (rm, lstat, dis)
log_model_subset <- glm(crim_binary ~ rm + lstat + dis, data = train, family = binomial)
log_probs_subset <- predict(log_model_subset, test, type = "response")
log_preds_subset <- ifelse(log_probs_subset > 0.5, 1, 0)
# Accuracy for subset model
log_subset_acc <- mean(log_preds_subset == test$crim_binary)
log_subset_acc
## [1] 0.7666667
lda_model <- lda(crim_binary ~ ., data = train)
lda_preds <- predict(lda_model, test)$class
# Accuracy
lda_acc <- mean(lda_preds == test$crim_binary)
lda_acc
## [1] 0.84
lda_model_subset <- lda(crim_binary ~ rm + lstat + dis, data = train)
lda_preds_subset <- predict(lda_model_subset, test)$class
lda_subset_acc <- mean(lda_preds_subset == test$crim_binary)
lda_subset_acc
## [1] 0.76
nb_model <- naiveBayes(as.factor(crim_binary) ~ ., data = train)
nb_preds <- predict(nb_model, test)
# Accuracy
nb_acc <- mean(nb_preds == test$crim_binary)
nb_acc
## [1] 0.8333333
nb_model_subset <- naiveBayes(as.factor(crim_binary) ~ rm + lstat + dis, data = train)
nb_preds_subset <- predict(nb_model_subset, test)
nb_subset_acc <- mean(nb_preds_subset == test$crim_binary)
nb_subset_acc
## [1] 0.78
# Standardize features; scale the test set with the training means/SDs to avoid leakage
train_scaled <- scale(train[ , -which(names(train) == "crim_binary")])
test_scaled <- scale(test[ , -which(names(test) == "crim_binary")],
                     center = attr(train_scaled, "scaled:center"),
                     scale = attr(train_scaled, "scaled:scale"))
train_labels <- train$crim_binary
test_labels <- test$crim_binary
# Try k = 1, 5, 10
knn1 <- knn(train_scaled, test_scaled, train_labels, k = 1)
knn5 <- knn(train_scaled, test_scaled, train_labels, k = 5)
knn10 <- knn(train_scaled, test_scaled, train_labels, k = 10)
knn1_acc <- mean(knn1 == test_labels)
knn5_acc <- mean(knn5 == test_labels)
knn10_acc <- mean(knn10 == test_labels)
knn1_acc
## [1] 0.9
knn5_acc
## [1] 0.8666667
knn10_acc
## [1] 0.8466667
# Scale the subset features with the training parameters as well
train_knn_subset <- scale(train[, c("rm", "lstat", "dis")])
test_knn_subset <- scale(test[, c("rm", "lstat", "dis")],
                         center = attr(train_knn_subset, "scaled:center"),
                         scale = attr(train_knn_subset, "scaled:scale"))
knn_subset1 <- knn(train_knn_subset, test_knn_subset, train_labels, k = 1)
knn_subset5 <- knn(train_knn_subset, test_knn_subset, train_labels, k = 5)
knn_subset10 <- knn(train_knn_subset, test_knn_subset, train_labels, k = 10)
knn_subset1_acc <- mean(knn_subset1 == test_labels)
knn_subset5_acc <- mean(knn_subset5 == test_labels)
knn_subset10_acc <- mean(knn_subset10 == test_labels)
knn_subset1_acc
## [1] 0.7266667
knn_subset5_acc
## [1] 0.7866667
knn_subset10_acc
## [1] 0.8066667
cat("Accuracy Results:\n")
## Accuracy Results:
cat("Logistic Regression: ", round(log_acc, 3), "\n")
## Logistic Regression: 0.88
cat("LDA: ", round(lda_acc, 3), "\n")
## LDA: 0.84
cat("Naive Bayes: ", round(nb_acc, 3), "\n")
## Naive Bayes: 0.833
cat("KNN (k=1): ", round(knn1_acc, 3), "\n")
## KNN (k=1): 0.9
cat("KNN (k=5): ", round(knn5_acc, 3), "\n")
## KNN (k=5): 0.867
cat("KNN (k=10): ", round(knn10_acc, 3), "\n")
## KNN (k=10): 0.847
cat("Subset Accuracy Results:\n")
## Subset Accuracy Results:
cat("Logistic Regression (subset): ", round(log_subset_acc, 3), "\n")
## Logistic Regression (subset): 0.767
cat("LDA (subset): ", round(lda_subset_acc, 3), "\n")
## LDA (subset): 0.76
cat("Naive Bayes (subset): ", round(nb_subset_acc, 3), "\n")
## Naive Bayes (subset): 0.78
cat("KNN (subset, k=1): ", round(knn_subset1_acc, 3), "\n")
## KNN (subset, k=1): 0.727
cat("KNN (subset, k=5): ", round(knn_subset5_acc, 3), "\n")
## KNN (subset, k=5): 0.787
cat("KNN (subset, k=10): ", round(knn_subset10_acc, 3), "\n")
## KNN (subset, k=10): 0.807
# Confusion matrix for logistic regression (full model)
confusionMatrix(as.factor(log_preds), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 68 11
## 1 7 64
##
## Accuracy : 0.88
## 95% CI : (0.817, 0.9273)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.76
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9067
## Specificity : 0.8533
## Pos Pred Value : 0.8608
## Neg Pred Value : 0.9014
## Prevalence : 0.5000
## Detection Rate : 0.4533
## Detection Prevalence : 0.5267
## Balanced Accuracy : 0.8800
##
## 'Positive' Class : 0
##
# Confusion matrix for LDA
confusionMatrix(as.factor(lda_preds), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 68 17
## 1 7 58
##
## Accuracy : 0.84
## 95% CI : (0.7714, 0.8947)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.68
##
## Mcnemar's Test P-Value : 0.06619
##
## Sensitivity : 0.9067
## Specificity : 0.7733
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.8923
## Prevalence : 0.5000
## Detection Rate : 0.4533
## Detection Prevalence : 0.5667
## Balanced Accuracy : 0.8400
##
## 'Positive' Class : 0
##
# Confusion matrix for Naive Bayes
confusionMatrix(as.factor(nb_preds), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 66 16
## 1 9 59
##
## Accuracy : 0.8333
## 95% CI : (0.7639, 0.8891)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6667
##
## Mcnemar's Test P-Value : 0.2301
##
## Sensitivity : 0.8800
## Specificity : 0.7867
## Pos Pred Value : 0.8049
## Neg Pred Value : 0.8676
## Prevalence : 0.5000
## Detection Rate : 0.4400
## Detection Prevalence : 0.5467
## Balanced Accuracy : 0.8333
##
## 'Positive' Class : 0
##
# Confusion matrix for KNN (k=5)
confusionMatrix(as.factor(knn5), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 15
## 1 5 60
##
## Accuracy : 0.8667
## 95% CI : (0.8016, 0.9166)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7333
##
## Mcnemar's Test P-Value : 0.04417
##
## Sensitivity : 0.9333
## Specificity : 0.8000
## Pos Pred Value : 0.8235
## Neg Pred Value : 0.9231
## Prevalence : 0.5000
## Detection Rate : 0.4667
## Detection Prevalence : 0.5667
## Balanced Accuracy : 0.8667
##
## 'Positive' Class : 0
##
# Store the confusionMatrix object
log_cm <- confusionMatrix(as.factor(log_preds), as.factor(test$crim_binary))
# Extract the table
log_cm_table <- as.data.frame(log_cm$table)
# Create the heatmap
ggplot(log_cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "steelblue") +
geom_text(aes(label = Freq), vjust = 1, size = 6, color = "black") + # Adjust size and color as needed
labs(title = "Confusion Matrix: Logistic Regression (Full Model)",
x = "Actual Class",
y = "Predicted Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), # Center and bold title
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
lda_cm <- confusionMatrix(as.factor(lda_preds), as.factor(test$crim_binary))
lda_cm_table <- as.data.frame(lda_cm$table)
ggplot(lda_cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "darkgreen") + # Different color for variety
geom_text(aes(label = Freq), vjust = 1, size = 6, color = "black") +
labs(title = "Confusion Matrix: LDA (Full Model)",
x = "Actual Class",
y = "Predicted Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
nb_cm <- confusionMatrix(as.factor(nb_preds), as.factor(test$crim_binary))
nb_cm_table <- as.data.frame(nb_cm$table)
ggplot(nb_cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "purple") +
geom_text(aes(label = Freq), vjust = 1, size = 6, color = "black") +
labs(title = "Confusion Matrix: Naive Bayes (Full Model)",
x = "Actual Class",
y = "Predicted Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
knn5_cm <- confusionMatrix(as.factor(knn5), as.factor(test$crim_binary))
knn5_cm_table <- as.data.frame(knn5_cm$table)
ggplot(knn5_cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "orange") +
geom_text(aes(label = Freq), vjust = 1, size = 6, color = "black") +
labs(title = "Confusion Matrix: KNN (k=5, Full Model)",
x = "Actual Class",
y = "Predicted Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10))
confusionMatrix(as.factor(lda_preds_subset), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 53 14
## 1 22 61
##
## Accuracy : 0.76
## 95% CI : (0.6835, 0.8259)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 6.115e-11
##
## Kappa : 0.52
##
## Mcnemar's Test P-Value : 0.2433
##
## Sensitivity : 0.7067
## Specificity : 0.8133
## Pos Pred Value : 0.7910
## Neg Pred Value : 0.7349
## Prevalence : 0.5000
## Detection Rate : 0.3533
## Detection Prevalence : 0.4467
## Balanced Accuracy : 0.7600
##
## 'Positive' Class : 0
##
confusionMatrix(as.factor(nb_preds_subset), as.factor(test$crim_binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 53 11
## 1 22 64
##
## Accuracy : 0.78
## 95% CI : (0.7051, 0.8435)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.603e-12
##
## Kappa : 0.56
##
## Mcnemar's Test P-Value : 0.08172
##
## Sensitivity : 0.7067
## Specificity : 0.8533
## Pos Pred Value : 0.8281
## Neg Pred Value : 0.7442
## Prevalence : 0.5000
## Detection Rate : 0.3533
## Detection Prevalence : 0.4267
## Balanced Accuracy : 0.7800
##
## 'Positive' Class : 0
##
confusionMatrix(as.factor(knn_subset5), as.factor(test$crim_binary)) # example for KNN k=5
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 63 20
## 1 12 55
##
## Accuracy : 0.7867
## 95% CI : (0.7124, 0.8493)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 4.42e-13
##
## Kappa : 0.5733
##
## Mcnemar's Test P-Value : 0.2159
##
## Sensitivity : 0.8400
## Specificity : 0.7333
## Pos Pred Value : 0.7590
## Neg Pred Value : 0.8209
## Prevalence : 0.5000
## Detection Rate : 0.4200
## Detection Prevalence : 0.5533
## Balanced Accuracy : 0.7867
##
## 'Positive' Class : 0
##
In this analysis, we evaluated several classification methods for predicting whether a Boston neighborhood's crime rate lies above the median, using both the full feature set and a reduced set (rm, lstat, dis). On the full feature set, K-Nearest Neighbors with K = 1 achieved the highest test accuracy (90%), reflecting its ability to capture local structure in the data. Logistic regression, LDA, and naive Bayes also performed well, but every method lost accuracy when restricted to the three selected predictors, suggesting that the additional features carry real predictive signal even though the simpler models remain reasonably strong. The K = 1 result should be interpreted with caution, however, since a single-neighbor classifier is prone to overfitting; its advantage should be confirmed by tuning K with cross-validation.
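A minimal sketch of such tuning with caret's train (the grid of odd K values up to 25 is an illustrative assumption, not part of the analysis above):

# 10-fold CV over a grid of k; caret expects a factor response for classification
set.seed(1)
knn_cv <- train(
  x = train_scaled,
  y = as.factor(train_labels),
  method = "knn",
  tuneGrid = data.frame(k = seq(1, 25, by = 2)),
  trControl = trainControl(method = "cv", number = 10)
)
knn_cv$bestTune # cross-validated choice of k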
# Set a seed for reproducibility
set.seed(1)
# Check the structure
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
# Fit logistic regression model
logit_model <- glm(default ~ income + balance, data = Default, family = "binomial")
# Summary of model
summary(logit_model)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(1) # for reproducibility
# Create train and validation set
train_indices <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_indices, ]
valid_data <- Default[-train_indices, ]
# Fit model on training set
logit_model_val <- glm(default ~ income + balance, data = train_data, family = "binomial")
# Predict probabilities on validation set
pred_probs <- predict(logit_model_val, newdata = valid_data, type = "response")
# Convert to binary prediction using 0.5 cutoff
pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")
# Validation error (fraction misclassified)
val_error <- mean(pred_class != valid_data$default)
val_error
## [1] 0.0254
set.seed(1)
split_error <- function(seed_val) {
set.seed(seed_val)
train_indices <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_indices, ]
valid_data <- Default[-train_indices, ]
model <- glm(default ~ income + balance, data = train_data, family = "binomial")
pred_probs <- predict(model, newdata = valid_data, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")
error_rate <- mean(pred_class != valid_data$default)
return(error_rate)
}
# Three different seeds/splits
errors <- c(
split_error(1),
split_error(2),
split_error(3)
)
errors
## [1] 0.0254 0.0238 0.0264
mean(errors) # average test error across the 3 splits
## [1] 0.0252
split_error_with_student <- function(seed_val) {
set.seed(seed_val)
train_indices <- sample(1:nrow(Default), nrow(Default) / 2)
train_data <- Default[train_indices, ]
valid_data <- Default[-train_indices, ]
model <- glm(default ~ income + balance + student, data = train_data, family = "binomial")
pred_probs <- predict(model, newdata = valid_data, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "Yes", "No")
error_rate <- mean(pred_class != valid_data$default)
return(error_rate)
}
# Test error with student variable
errors_with_student <- c(
split_error_with_student(1),
split_error_with_student(2),
split_error_with_student(3)
)
errors_with_student
## [1] 0.0260 0.0246 0.0272
mean(errors_with_student)
## [1] 0.02593333
Including the student variable in the logistic regression model does not reduce the test error. In fact, the average validation-set error increased slightly (0.0259 vs. 0.0252), suggesting that student adds little predictive value beyond what balance and income already explain.
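A k-fold cross-validation of both models gives a more stable comparison than three random splits. A minimal sketch assuming the boot package, with an illustrative misclassification cost at the 0.5 cutoff:

library(boot)
# cv.glm passes the observed 0/1 response and the fitted probabilities to the cost function
cost_fn <- function(y, prob) mean((prob > 0.5) != y)
set.seed(1)
fit_base    <- glm(default ~ income + balance, data = Default, family = "binomial")
fit_student <- glm(default ~ income + balance + student, data = Default, family = "binomial")
cv_base    <- cv.glm(Default, fit_base, cost = cost_fn, K = 10)$delta[1]
cv_student <- cv.glm(Default, fit_student, cost = cost_fn, K = 10)$delta[1]
c(without_student = cv_base, with_student = cv_student)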
library(ISLR2)
library(tidyverse)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-9
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# Set seed for reproducibility
set.seed(1)
# View structure
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
# Split into train/test sets
train_indices <- sample(1:nrow(College), nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]
lm_fit <- lm(Apps ~ ., data = train_data)
lm_preds <- predict(lm_fit, newdata = test_data)
lm_mse <- mean((lm_preds - test_data$Apps)^2)
lm_mse
## [1] 1135758
x_train <- model.matrix(Apps ~ ., data = train_data)[, -1]
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
y_train <- train_data$Apps
y_test <- test_data$Apps
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- cv_ridge$lambda.min
ridge_preds <- predict(cv_ridge, s = best_lambda_ridge, newx = x_test)
ridge_mse <- mean((ridge_preds - y_test)^2)
ridge_mse
## [1] 976261.5
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- cv_lasso$lambda.min
lasso_preds <- predict(cv_lasso, s = best_lambda_lasso, newx = x_test)
lasso_mse <- mean((lasso_preds - y_test)^2)
# Number of non-zero coefficients
lasso_coef <- predict(cv_lasso, s = best_lambda_lasso, type = "coefficients")
non_zero_lasso <- sum(lasso_coef != 0) - 1 # exclude intercept
lasso_mse
## [1] 1115901
non_zero_lasso
## [1] 17
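At lambda.min the lasso keeps all 17 predictors, so it performs no variable selection here. A minimal sketch of the sparser fit at glmnet's lambda.1se (the largest lambda whose CV error is within one standard error of the minimum):

lasso_coef_1se <- predict(cv_lasso, s = "lambda.1se", type = "coefficients")
sum(lasso_coef_1se != 0) - 1 # non-zero coefficients, excluding the intercept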
set.seed(1)
pcr_fit <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")
# Choose the M that minimizes cross-validated PRESS
best_M_pcr <- which.min(pcr_fit$validation$PRESS)
pcr_pred <- predict(pcr_fit, newdata = test_data, ncomp = best_M_pcr)
pcr_mse <- mean((pcr_pred - test_data$Apps)^2)
pcr_mse
## [1] 1135758
best_M_pcr
## [1] 17
set.seed(1)
pls_fit <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pls_fit, val.type = "MSEP")
best_M_pls <- which.min(pls_fit$validation$PRESS)
pls_pred <- predict(pls_fit, newdata = test_data, ncomp = best_M_pls)
pls_mse <- mean((pls_pred - test_data$Apps)^2)
pls_mse
## [1] 1135758
best_M_pls
## [1] 17
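Both PCR and PLS select M = 17, i.e. all components, so neither achieves any real dimension reduction here and both reproduce the least-squares test MSE. A minimal sketch, assuming pls::selectNcomp, of the more parsimonious one-standard-error rule for choosing M:

# smallest M whose CV error is within one standard error of the minimum
selectNcomp(pcr_fit, method = "onesigma", plot = FALSE)
selectNcomp(pls_fit, method = "onesigma", plot = FALSE)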
library(ISLR2)
library(tidyverse)
library(caret)
library(MASS)
library(class)
library(glmnet)
library(pls)
library(e1071)
data(Default)
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
# Ensure student and default are stored as factors
Default$student <- as.factor(Default$student)
Default$default <- as.factor(Default$default)
# Train-test split
set.seed(123)
train_index <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_index, ]
test_data <- Default[-train_index, ]
log_model <- glm(default ~ balance + income + student, data = train_data, family = "binomial")
summary(log_model)
##
## Call:
## glm(formula = default ~ balance + income + student, family = "binomial",
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.048e+01 5.782e-01 -18.122 < 2e-16 ***
## balance 5.638e-03 2.733e-04 20.628 < 2e-16 ***
## income -5.519e-07 9.660e-06 -0.057 0.95444
## studentYes -8.937e-01 2.774e-01 -3.221 0.00128 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2050.6 on 7000 degrees of freedom
## Residual deviance: 1128.8 on 6997 degrees of freedom
## AIC: 1136.8
##
## Number of Fisher Scoring iterations: 8
log_probs <- predict(log_model, test_data, type = "response")
log_pred <- ifelse(log_probs > 0.5, "Yes", "No")
log_pred <- factor(log_pred, levels = c("No", "Yes"))
actual <- factor(test_data$default, levels = c("No", "Yes"))
confusionMatrix(log_pred, actual)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2884 63
## Yes 16 36
##
## Accuracy : 0.9737
## 95% CI : (0.9673, 0.9791)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.0204
##
## Kappa : 0.4646
##
## Mcnemar's Test P-Value : 2.274e-07
##
## Sensitivity : 0.9945
## Specificity : 0.3636
## Pos Pred Value : 0.9786
## Neg Pred Value : 0.6923
## Prevalence : 0.9670
## Detection Rate : 0.9617
## Detection Prevalence : 0.9827
## Balanced Accuracy : 0.6791
##
## 'Positive' Class : No
##
lda_model <- lda(default ~ balance + income + student, data = train_data)
lda_pred <- predict(lda_model, test_data)$class
lda_pred <- factor(lda_pred, levels = c("No", "Yes"))
actual <- factor(test_data$default, levels = c("No", "Yes"))
confusionMatrix(lda_pred, actual)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2891 71
## Yes 9 28
##
## Accuracy : 0.9733
## 95% CI : (0.9669, 0.9788)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.02642
##
## Kappa : 0.401
##
## Mcnemar's Test P-Value : 9.104e-12
##
## Sensitivity : 0.9969
## Specificity : 0.2828
## Pos Pred Value : 0.9760
## Neg Pred Value : 0.7568
## Prevalence : 0.9670
## Detection Rate : 0.9640
## Detection Prevalence : 0.9877
## Balanced Accuracy : 0.6399
##
## 'Positive' Class : No
##
# Standardize predictors (the test set is scaled with the training parameters)
train_X <- scale(train_data[, c("balance", "income")])
test_X <- scale(test_data[, c("balance", "income")], center = attr(train_X, "scaled:center"), scale = attr(train_X, "scaled:scale"))
train_Y <- train_data$default
test_Y <- test_data$default
knn_pred <- knn(train_X, test_X, train_Y, k = 5)
knn_pred <- factor(knn_pred, levels = c("No", "Yes"))
actual <- factor(test_Y, levels = c("No", "Yes"))
confusionMatrix(knn_pred, actual)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2870 63
## Yes 30 36
##
## Accuracy : 0.969
## 95% CI : (0.9621, 0.9749)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.2907542
##
## Kappa : 0.4211
##
## Mcnemar's Test P-Value : 0.0009058
##
## Sensitivity : 0.9897
## Specificity : 0.3636
## Pos Pred Value : 0.9785
## Neg Pred Value : 0.5455
## Prevalence : 0.9670
## Detection Rate : 0.9570
## Detection Prevalence : 0.9780
## Balanced Accuracy : 0.6766
##
## 'Positive' Class : No
##
# Prepare design matrix
x <- model.matrix(default ~ balance + income + student, data = Default)[, -1]
y <- Default$default
x_train <- x[train_index, ]
y_train <- y[train_index]
x_test <- x[-train_index, ]
y_test <- y[-train_index]
# Ridge-penalized logistic regression (alpha = 0)
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0, family = "binomial")
ridge_pred <- predict(cv_ridge, s = "lambda.min", newx = x_test, type = "class")
ridge_pred <- factor(ridge_pred, levels = c("No", "Yes"))
y_test <- factor(y_test, levels = c("No", "Yes"))
confusionMatrix(ridge_pred, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2898 91
## Yes 2 8
##
## Accuracy : 0.969
## 95% CI : (0.9621, 0.9749)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.2908
##
## Kappa : 0.1416
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99931
## Specificity : 0.08081
## Pos Pred Value : 0.96956
## Neg Pred Value : 0.80000
## Prevalence : 0.96699
## Detection Rate : 0.96632
## Detection Prevalence : 0.99667
## Balanced Accuracy : 0.54006
##
## 'Positive' Class : No
##
# Lasso-penalized logistic regression (alpha = 1)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1, family = "binomial")
lasso_pred <- predict(cv_lasso, s = "lambda.min", newx = x_test, type = "class")
lasso_pred <- factor(lasso_pred, levels = c("No", "Yes"))
confusionMatrix(lasso_pred, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2887 63
## Yes 13 36
##
## Accuracy : 0.9747
## 95% CI : (0.9684, 0.98)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.008743
##
## Kappa : 0.475
##
## Mcnemar's Test P-Value : 1.902e-08
##
## Sensitivity : 0.9955
## Specificity : 0.3636
## Pos Pred Value : 0.9786
## Neg Pred Value : 0.7347
## Prevalence : 0.9670
## Detection Rate : 0.9627
## Detection Prevalence : 0.9837
## Balanced Accuracy : 0.6796
##
## 'Positive' Class : No
##
pca_data <- Default[, c("balance", "income")]
pca_result <- prcomp(pca_data, scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2
## Standard deviation 1.0734 0.9207
## Proportion of Variance 0.5761 0.4239
## Cumulative Proportion 0.5761 1.0000
biplot(pca_result)
library(smotefamily)
train_data$default_num <- ifelse(train_data$default == "Yes", 1, 0)
smote_output <- SMOTE(
X = train_data[, c("balance", "income")],
target = train_data$default_num,
K = 5
)
# Combine result into a new data frame
smote_data <- smote_output$data
smote_data$default <- ifelse(smote_data$class == 1, "Yes", "No")
smote_data$default <- as.factor(smote_data$default)
# SMOTE was applied to the numeric predictors only, so student is re-attached by
# sampling at random from the training data (ignoring any dependence on balance/income)
smote_data$student <- sample(train_data$student, size = nrow(smote_data), replace = TRUE)
# Fit logistic regression
log_model_smote <- glm(default ~ balance + income + student, data = smote_data, family = "binomial")
# Predict on original test data
log_probs_smote <- predict(log_model_smote, test_data, type = "response")
log_pred_smote <- ifelse(log_probs_smote > 0.5, "Yes", "No")
log_pred_smote <- factor(log_pred_smote, levels = c("No", "Yes"))
# Evaluate
conf_matrix_smote <- confusionMatrix(log_pred_smote, actual)
conf_matrix_smote
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2533 10
## Yes 367 89
##
## Accuracy : 0.8743
## 95% CI : (0.8619, 0.886)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2818
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8734
## Specificity : 0.8990
## Pos Pred Value : 0.9961
## Neg Pred Value : 0.1952
## Prevalence : 0.9670
## Detection Rate : 0.8446
## Detection Prevalence : 0.8479
## Balanced Accuracy : 0.8862
##
## 'Positive' Class : No
##
# Save all confusion matrices
cm_logistic <- confusionMatrix(log_pred, actual)
cm_lda <- confusionMatrix(lda_pred, actual)
cm_knn <- confusionMatrix(knn_pred, actual)
cm_ridge <- confusionMatrix(ridge_pred, y_test)
cm_lasso <- confusionMatrix(lasso_pred, y_test)
cm_smote <- conf_matrix_smote
# Convert to tidy data frame
cm_to_df <- function(cm, model_name) {
df <- as.data.frame(as.table(cm$table))
colnames(df) <- c("Predicted", "Actual", "Freq")
df$Model <- model_name
return(df)
}
cm_all <- bind_rows(
cm_to_df(cm_logistic, "Logistic"),
cm_to_df(cm_lda, "LDA"),
cm_to_df(cm_knn, "KNN"),
cm_to_df(cm_ridge, "Ridge"),
cm_to_df(cm_lasso, "Lasso"),
cm_to_df(cm_smote, "Logistic (SMOTE)")
)
# Plot confusion matrices
ggplot(cm_all, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 4) +
scale_fill_gradient(low = "white", high = "steelblue") +
theme_minimal() +
labs(title = "Confusion Matrices for All Models") +
facet_wrap(~ Model, ncol = 3)
results <- tibble::tibble(
Model = c("Logistic", "LDA", "KNN", "Ridge", "Lasso", "Logistic (SMOTE)"),
Accuracy = c(0.9737, 0.9733, 0.969, 0.969, 0.9747, conf_matrix_smote$overall["Accuracy"]),
Specificity = c(0.3636, 0.2828, 0.3636, 0.0808, 0.3636, conf_matrix_smote$byClass["Specificity"]),
Sensitivity = c(0.9945, 0.9969, 0.9897, 0.9993, 0.9955, conf_matrix_smote$byClass["Sensitivity"])
)
knitr::kable(results, digits = 4)
| Model            | Accuracy | Specificity | Sensitivity |
|------------------|----------|-------------|-------------|
| Logistic         | 0.9737   | 0.3636      | 0.9945      |
| LDA              | 0.9733   | 0.2828      | 0.9969      |
| KNN              | 0.9690   | 0.3636      | 0.9897      |
| Ridge            | 0.9690   | 0.0808      | 0.9993      |
| Lasso            | 0.9747   | 0.3636      | 0.9955      |
| Logistic (SMOTE) | 0.8743   | 0.8990      | 0.8734      |
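The table shows that the unresampled models all pay for their high accuracy with low specificity on the rare default class, which SMOTE reverses at some cost in overall accuracy. Besides resampling, specificity can also be traded for sensitivity by lowering the decision threshold of the original logistic model; a minimal sketch (the 0.2 cutoff is an illustrative choice, not a tuned value):

# Re-threshold the logistic probabilities from the unresampled model
log_pred_lowcut <- factor(ifelse(log_probs > 0.2, "Yes", "No"), levels = c("No", "Yes"))
confusionMatrix(log_pred_lowcut, actual)$byClass[c("Sensitivity", "Specificity")]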
library(ggplot2)
# Plot SMOTE result
ggplot(smote_data, aes(x = balance, y = income, color = default)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "firebrick")) +
theme_minimal() +
labs(
title = "SMOTE Oversampling: Class Distribution After Resampling",
x = "Balance",
y = "Income",
color = "Default"
)