# Load necessary libraries and data
library(MASS)
library(e1071) # for naiveBayes
library(class) # for knn
data(Boston)
# Create binary response variable (1 if crime rate above median, 0 otherwise)
Boston$high_crime <- as.factor(ifelse(Boston$crim > median(Boston$crim), 1, 0))
# Remove original crim variable
Boston$crim <- NULL
# Split data into training and test sets (70/30 split)
set.seed(123)
train_index <- sample(1:nrow(Boston), size = 0.7*nrow(Boston))
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
# Full model
logit_full <- glm(high_crime ~ ., data = train, family = binomial)
summary(logit_full)
##
## Call:
## glm(formula = high_crime ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -28.064366 8.944009 -3.138 0.00170 **
## zn -0.093211 0.042125 -2.213 0.02692 *
## indus -0.044137 0.058791 -0.751 0.45281
## chas 0.247705 0.823661 0.301 0.76362
## nox 42.153836 9.034180 4.666 3.07e-06 ***
## rm 0.033207 0.882360 0.038 0.96998
## age 0.028651 0.014189 2.019 0.04346 *
## dis 0.657709 0.258111 2.548 0.01083 *
## rad 0.532624 0.183178 2.908 0.00364 **
## tax -0.007883 0.003367 -2.341 0.01923 *
## ptratio 0.448510 0.161183 2.783 0.00539 **
## black -0.034710 0.014439 -2.404 0.01622 *
## lstat 0.103425 0.061650 1.678 0.09342 .
## medv 0.220912 0.090027 2.454 0.01413 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.74 on 353 degrees of freedom
## Residual deviance: 145.64 on 340 degrees of freedom
## AIC: 173.64
##
## Number of Fisher Scoring iterations: 9
# Reduced model (selecting significant predictors)
logit_reduced <- glm(high_crime ~ zn + nox + dis + rad + tax + ptratio + lstat + medv,
                     data = train, family = binomial)
summary(logit_reduced)
##
## Call:
## glm(formula = high_crime ~ zn + nox + dis + rad + tax + ptratio +
## lstat + medv, family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -36.050825 6.729471 -5.357 8.45e-08 ***
## zn -0.094244 0.040971 -2.300 0.021433 *
## nox 42.082114 7.565215 5.563 2.66e-08 ***
## dis 0.513892 0.226847 2.265 0.023490 *
## rad 0.555490 0.167010 3.326 0.000881 ***
## tax -0.007009 0.002799 -2.504 0.012276 *
## ptratio 0.286214 0.125586 2.279 0.022666 *
## lstat 0.148771 0.050608 2.940 0.003285 **
## medv 0.183874 0.055169 3.333 0.000859 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.74 on 353 degrees of freedom
## Residual deviance: 161.25 on 345 degrees of freedom
## AIC: 179.25
##
## Number of Fisher Scoring iterations: 9
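# (Sketch, not in the original:) since the reduced model is nested in the full model,
# the two fits can also be compared formally with a likelihood-ratio test
anova(logit_reduced, logit_full, test = "Chisq")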
# Predictions and accuracy
pred_prob <- predict(logit_reduced, newdata = test, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
mean(pred_class == test$high_crime) # Accuracy
## [1] 0.9144737
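# (Sketch, not in the original:) a confusion matrix shows where the reduced logistic
# model's test-set errors occur, beyond the single accuracy figure
table(Predicted = pred_class, Actual = test$high_crime)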
# LDA with the same reduced set of predictors
lda_model <- lda(high_crime ~ zn + nox + dis + rad + tax + ptratio + lstat + medv, data = train)
lda_pred <- predict(lda_model, newdata = test)
mean(lda_pred$class == test$high_crime)
## [1] 0.8618421
# Naive Bayes with the same predictors
nb_model <- naiveBayes(high_crime ~ zn + nox + dis + rad + tax + ptratio + lstat + medv, data = train)
nb_pred <- predict(nb_model, newdata = test)
mean(nb_pred == test$high_crime)
## [1] 0.7960526
# Prepare data: standardize predictors, scaling the test set with the
# training-set means and standard deviations so both sets are on the same scale
train_scale <- scale(train[, -which(names(train) == "high_crime")])
test_scale <- scale(test[, -which(names(test) == "high_crime")],
                    center = attr(train_scale, "scaled:center"),
                    scale = attr(train_scale, "scaled:scale"))
# Find optimal k
k_values <- 1:20
accuracies <- sapply(k_values, function(k) {
  knn_pred <- knn(train_scale, test_scale, train$high_crime, k = k)
  mean(knn_pred == test$high_crime)
})
best_k <- k_values[which.max(accuracies)]
# Final KNN model
knn_pred <- knn(train_scale, test_scale, train$high_crime, k = best_k)
mean(knn_pred == test$high_crime)
## [1] 0.9144737
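best_k # k with the highest test accuracy
# (Sketch, not in the original:) collect the test accuracies of the four classifiers
# into one named vector so they can be compared side by side
c(logistic = mean(pred_class == test$high_crime),
  lda = mean(lda_pred$class == test$high_crime),
  naive_bayes = mean(nb_pred == test$high_crime),
  knn = mean(knn_pred == test$high_crime))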
# Chapter 5 Question 5: Default Prediction with Validation Sets
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Boston
## The following object is masked from 'package:MASS':
##
## Boston
data(Default)
# a) Logistic regression model
set.seed(123)
logit_default <- glm(default ~ income + balance, data = Default, family = binomial)
# b) Validation set approach
val_set_error <- function() {
  train_index <- sample(1:nrow(Default), 0.7*nrow(Default))
  train <- Default[train_index, ]
  valid <- Default[-train_index, ]
  model <- glm(default ~ income + balance, data = train, family = binomial)
  pred_prob <- predict(model, newdata = valid, type = "response")
  pred_class <- ifelse(pred_prob > 0.5, "Yes", "No")
  mean(pred_class != valid$default)
}
# c) Repeat 3 times
set.seed(123)
errors <- replicate(3, val_set_error())
mean(errors) # Average error around 2.6%
## [1] 0.02666667
# d) Include student dummy
val_set_error_student <- function() {
  train_index <- sample(1:nrow(Default), 0.7*nrow(Default))
  train <- Default[train_index, ]
  valid <- Default[-train_index, ]
  model <- glm(default ~ income + balance + student, data = train, family = binomial)
  pred_prob <- predict(model, newdata = valid, type = "response")
  pred_class <- ifelse(pred_prob > 0.5, "Yes", "No")
  mean(pred_class != valid$default)
}
set.seed(123)
errors_student <- replicate(3, val_set_error_student())
mean(errors_student) # Slightly higher error (~2.7%)
## [1] 0.02711111
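# (Sketch, not in the original:) placing the three error rates from parts (c) and (d)
# side by side shows that adding the student dummy does not noticeably reduce the
# validation-set error
rbind(without_student = errors, with_student = errors_student)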
# Chapter 6 Question 9: College Applications Prediction
data(College)
# a) Split data
set.seed(123)
train_index <- sample(1:nrow(College), 0.7*nrow(College))
train <- College[train_index, ]
test <- College[-train_index, ]
# b) Linear regression
lm_fit <- lm(Apps ~ ., data = train)
lm_pred <- predict(lm_fit, newdata = test)
lm_error <- mean((lm_pred - test$Apps)^2) # MSE
# c) Ridge regression
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-9
x <- model.matrix(Apps ~ ., data = train)[,-1]
y <- train$Apps
cv_ridge <- cv.glmnet(x, y, alpha = 0)
ridge_pred <- predict(cv_ridge, s = "lambda.1se",
                      newx = model.matrix(Apps ~ ., data = test)[,-1])
ridge_error <- mean((ridge_pred - test$Apps)^2)
# d) Lasso
cv_lasso <- cv.glmnet(x, y, alpha = 1)
lasso_pred <- predict(cv_lasso, s = "lambda.1se",
                      newx = model.matrix(Apps ~ ., data = test)[,-1])
lasso_error <- mean((lasso_pred - test$Apps)^2)
coef(cv_lasso, s = "lambda.1se") # Non-zero coefficients at the 1-SE lambda
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -579.86250244
## PrivateYes -175.69186792
## Accept 1.20869497
## Enroll 0.06116026
## Top10perc 25.19187735
## Top25perc .
## F.Undergrad 0.03318350
## P.Undergrad .
## Outstate .
## Room.Board .
## Books .
## Personal .
## PhD .
## Terminal .
## S.F.Ratio .
## perc.alumni .
## Expend 0.04311230
## Grad.Rate .
# e) PCR
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
pcr_fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")
## (figure: cross-validated MSEP against the number of components)
pcr_pred <- predict(pcr_fit, newdata = test, ncomp = 10) # ncomp chosen from the validation plot
pcr_error <- mean((pcr_pred - test$Apps)^2)
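# (Sketch, not in the original:) rather than reading ncomp off the plot, the number of
# components minimizing the cross-validated RMSEP can be extracted directly
# (the same idea applies to the PLS fit below)
cv_rmsep <- RMSEP(pcr_fit, estimate = "CV")
which.min(cv_rmsep$val[1, 1, -1]) # drop the 0-component (intercept-only) entry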
# f) PLS
pls_fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
pls_pred <- predict(pls_fit, newdata = test, ncomp = 5) # ncomp chosen from the CV results
pls_error <- mean((pls_pred - test$Apps)^2)
# g) Compare results
errors <- c(LM = lm_error, Ridge = ridge_error, Lasso = lasso_error,
            PCR = pcr_error, PLS = pls_error)
round(errors)
## LM Ridge Lasso PCR PLS
## 1734841 3522433 1874269 3962512 2469701
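# (Sketch, not in the original:) test R^2 puts the five test MSEs on a common, more
# interpretable scale: 1 - MSE / Var(Apps) on the test set
round(1 - errors / mean((test$Apps - mean(test$Apps))^2), 3)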
# Simulate credit data
set.seed(123)
n <- 1000
# First create the predictor variables
income <- rnorm(n, 70000, 15000)
balance <- rnorm(n, 5000, 1500)
late_payments <- rpois(n, 0.5)
# Then calculate default probabilities using the predictors
default_probs <- plogis(-5 + 0.00002*income + 0.001*balance + 0.5*late_payments)
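# (Sketch, not in the original:) the simulated probabilities can be turned into binary
# default outcomes with rbinom(), after which the generating coefficients can be
# approximately recovered by logistic regression (defaults and credit_fit are new names)
defaults <- rbinom(n, 1, default_probs)
credit_fit <- glm(defaults ~ income + balance + late_payments, family = binomial)
coef(credit_fit)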
# Simulate loan data
set.seed(100)
n <- 500
loan_data <- data.frame(
  income = rnorm(n, 50000, 15000),
  loan_amount = rnorm(n, 15000, 5000),
  credit_score = rnorm(n, 650, 50),
  default = rbinom(n, 1, 0.2)
)
# Logistic model (default was simulated independently of the predictors,
# so none of the coefficients are expected to be significant)
glm_fit <- glm(default ~ income + loan_amount + credit_score, data = loan_data, family = binomial)
summary(glm_fit)
##
## Call:
## glm(formula = default ~ income + loan_amount + credit_score,
## family = binomial, data = loan_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.437e+00 1.569e+00 -1.553 0.120
## income 1.395e-07 7.595e-06 0.018 0.985
## loan_amount -4.075e-06 2.166e-05 -0.188 0.851
## credit_score 1.602e-03 2.302e-03 0.696 0.487
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 486.22 on 499 degrees of freedom
## Residual deviance: 485.71 on 496 degrees of freedom
## AIC: 493.71
##
## Number of Fisher Scoring iterations: 4