# CHAPTER 4

# Install the packages used in this chapter. Only packages that are not yet
# installed are fetched, so re-running the script does not reinstall them.
chapter4_pkgs <- c("MASS", "class", "e1071", "caret")
chapter4_missing <- setdiff(chapter4_pkgs, rownames(installed.packages()))
if (length(chapter4_missing) > 0) {
  install.packages(chapter4_missing)
}

# Load necessary libraries
library(MASS)
library(class)
library(e1071)
library(caret)

# Load Boston data (taken from MASS explicitly so the source is unambiguous)
data("Boston", package = "MASS")

# Create binary target: 1 if crime rate > median, 0 otherwise
Boston$HighCrime <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston$HighCrime <- as.factor(Boston$HighCrime)

# Remove original crime rate column so it cannot leak into the predictors
Boston_data <- Boston[, !names(Boston) %in% c("crim")]

# Split into training/testing sets: 70% of the rows go to training and the
# remaining rows form the test set.
set.seed(123)
train_index <- createDataPartition(Boston_data$HighCrime, p = 0.7, list = FALSE)
train_data <- Boston_data[train_index, ]
test_data <- Boston_data[-train_index, ]

# Logistic Regression
log_model <- glm(HighCrime ~ ., data = train_data, family = binomial)

# Predicted probabilities on the test set, thresholded at 0.5 into 0/1 labels
log_pred <- predict(log_model, newdata = test_data, type = "response")
log_class <- ifelse(log_pred > 0.5, 1, 0)

# Share of test observations classified correctly
log_accuracy <- mean(log_class == test_data$HighCrime)

# LDA
lda_model <- lda(HighCrime ~ ., data = train_data)

# predict() returns a list; `class` holds the predicted labels
lda_pred <- predict(lda_model, test_data)$class
lda_accuracy <- mean(lda_pred == test_data$HighCrime)

# Naive Bayes
nb_model <- naiveBayes(HighCrime ~ ., data = train_data)
nb_pred <- predict(nb_model, test_data)
nb_accuracy <- mean(nb_pred == test_data$HighCrime)

# KNN

# Normalize predictors. The test set is scaled with the training set's
# centers and scales so both sets live on the same scale.
# setdiff() is used instead of -which(...): a negative index built from an
# empty which() result would silently drop every column.
knn_predictors <- setdiff(names(train_data), "HighCrime")
norm_train <- scale(train_data[, knn_predictors])
norm_test <- scale(
  test_data[, knn_predictors],
  center = attr(norm_train, "scaled:center"),
  scale = attr(norm_train, "scaled:scale")
)

knn_pred <- knn(
  train = norm_train,
  test = norm_test,
  cl = train_data$HighCrime,
  k = 5
)
knn_accuracy <- mean(knn_pred == test_data$HighCrime)

# Set seed for reproducibility
set.seed(123)

# Load the dataset. `Default` lives in the ISLR package, which is not attached
# at this point in the script, so it is loaded from that package explicitly
# (installing ISLR first if it is missing).
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
data("Default", package = "ISLR")

# View structure
str(Default)

# Logistic model: probability of default from income and balance,
# fitted on the full data set
model_ab <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(model_ab)

# Split the data: 70% train, 30% test
set.seed(123)
train_idx <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_set <- Default[train_idx, ]
test_set <- Default[-train_idx, ]

# Fit logistic regression on training set
log_model <- glm(default ~ income + balance, data = train_set, family = "binomial")

# Predict on test set: probabilities thresholded at 0.5 into "Yes"/"No"
probs <- predict(log_model, newdata = test_set, type = "response")
pred_class <- ifelse(probs > 0.5, "Yes", "No")

# Validation error: share of test observations that are misclassified
error_rate <- mean(pred_class != test_set$default)
# Trailing "\n" ends the output line (the newline literal had been lost)
cat("Validation Error Rate (1 split):", error_rate, "\n")

# Repeat the validation-set approach on three different random splits and
# collect the test error of each one.
n_splits <- 3
error_rates <- numeric(n_splits)  # preallocated instead of grown from c()

for (i in seq_len(n_splits)) {
  # A different seed per iteration gives a different, reproducible split
  set.seed(100 + i)
  idx <- createDataPartition(Default$default, p = 0.7, list = FALSE)
  train <- Default[idx, ]
  test <- Default[-idx, ]

  model <- glm(default ~ income + balance, data = train, family = "binomial")
  probs <- predict(model, newdata = test, type = "response")
  pred <- ifelse(probs > 0.5, "Yes", "No")

  error_rates[i] <- mean(pred != test$default)
}

cat("Validation Errors for 3 splits:", round(error_rates, 4), "\n")
cat("Average Error Rate:", round(mean(error_rates), 4), "\n")

# Using same split as earlier (or create a new one if you want)
set.seed(321)
idx <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train <- Default[idx, ]
test <- Default[-idx, ]

# New model with student variable (the model is fitted later in this script)

# CHAPTER 6

# Install the packages used in this chapter. Only packages that are not yet
# installed are fetched, so re-running the script does not reinstall them.
chapter6_pkgs <- c("ISLR", "glmnet", "pls", "caret")
chapter6_missing <- setdiff(chapter6_pkgs, rownames(installed.packages()))
if (length(chapter6_missing) > 0) {
  install.packages(chapter6_missing)
}

library(ISLR)
library(glmnet)
library(pls)
library(caret)

# Load College data
data(College)

# Set seed
set.seed(123)

# Split into training (70%) and testing (30%)
train_idx <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train <- College[train_idx, ]
test <- College[-train_idx, ]

# Least-squares baseline: regress Apps on all other columns and report the
# mean squared prediction error on the test set.
lm_fit <- lm(Apps ~ ., data = train)
lm_pred <- predict(lm_fit, newdata = test)
lm_mse <- mean((lm_pred - test$Apps)^2)
cat("Linear Model Test MSE:", lm_mse, "\n")

# Prepare matrices for glmnet. model.matrix() expands the formula into a
# numeric design matrix; [, -1] drops its first (intercept) column.
x_train <- model.matrix(Apps ~ ., train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., test)[, -1]

# Ridge (alpha = 0): lambda chosen by cross-validation on the training set
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0)
ridge_best_lambda <- ridge_cv$lambda.min
ridge_pred <- predict(ridge_cv, s = ridge_best_lambda, newx = x_test)
ridge_mse <- mean((ridge_pred - test$Apps)^2)
cat("Ridge Test MSE:", ridge_mse, "\n")

# Lasso (alpha = 1): lambda chosen by cross-validation on the training set
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_best_lambda <- lasso_cv$lambda.min
lasso_pred <- predict(lasso_cv, s = lasso_best_lambda, newx = x_test)
lasso_mse <- mean((lasso_pred - test$Apps)^2)

# Coefficients at the selected lambda
# NOTE(review): the non-zero count below presumably includes the intercept
# entry returned by coef() - confirm that this is the intended figure.
lasso_coef <- coef(lasso_cv, s = lasso_best_lambda)

cat("Lasso Test MSE:", lasso_mse, "\n")
cat("Non-zero coefficients in Lasso:", sum(lasso_coef != 0), "\n")

# Principal components regression with standardized predictors and
# cross-validation
pcr_fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pcr_fit)

# Best number of components: the one with the smallest cross-validated PRESS
best_M_pcr <- which.min(pcr_fit$validation$PRESS)
pcr_pred <- predict(pcr_fit, newdata = test, ncomp = best_M_pcr)
pcr_mse <- mean((pcr_pred - test$Apps)^2)

cat("PCR Test MSE:", pcr_mse, "\n")
cat("Optimal number of components (PCR):", best_M_pcr, "\n")

# Partial least squares with standardized predictors and cross-validation
pls_fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
summary(pls_fit)

# Best number of components: the one with the smallest cross-validated PRESS
best_M_pls <- which.min(pls_fit$validation$PRESS)
pls_pred <- predict(pls_fit, newdata = test, ncomp = best_M_pls)
pls_mse <- mean((pls_pred - test$Apps)^2)

cat("PLS Test MSE:", pls_mse, "\n")
cat("Optimal number of components (PLS):", best_M_pls, "\n")

# Logistic model for `default` with the student variable added.
# The response is `default` (a column of Default), not `College`, which is a
# data frame. By this point `train` and `test` hold the College split, so the
# Default split (seed 321) is rebuilt here under its own names.
set.seed(321)
default_idx <- createDataPartition(Default$default, p = 0.7, list = FALSE)
default_train <- Default[default_idx, ]
default_test <- Default[-default_idx, ]

model_student <- glm(
  default ~ income + balance + student,
  data = default_train,
  family = "binomial"
)

# Predict and compute error
student_probs <- predict(model_student, newdata = default_test, type = "response")
student_pred <- ifelse(student_probs > 0.5, "Yes", "No")
student_error <- mean(student_pred != default_test$default)

cat("Validation Error with student variable:", round(student_error, 4), "\n")

# CAPM

# Install the packages used in this section. Only packages that are not yet
# installed are fetched, so re-running the script does not reinstall them.
capm_pkgs <- c("quantmod", "PerformanceAnalytics")
capm_missing <- setdiff(capm_pkgs, rownames(installed.packages()))
if (length(capm_missing) > 0) {
  install.packages(capm_missing)
}

library(quantmod)
library(PerformanceAnalytics)

# Get daily prices for AAPL and SP500 (^GSPC).
# The index series is referenced as `GSPC` (without the caret) below.
getSymbols(c("AAPL", "^GSPC"), from = "2023-01-01", to = "2023-12-31")

# Calculate daily returns from the adjusted closing prices
aapl_ret <- dailyReturn(Ad(AAPL))
sp500_ret <- dailyReturn(Ad(GSPC))

# Merge and clean: combine both return series and drop rows containing NA
data <- na.omit(merge(aapl_ret, sp500_ret))
colnames(data) <- c("AAPL", "Market")

# CAPM regression: excess return not used here (risk-free assumed zero)
capm_model <- lm(AAPL ~ Market, data = data)
summary(capm_model)

# Scatter plot of AAPL returns against market returns with the fitted line.
# The columns are converted to plain numeric vectors first: plotting the
# time-series columns directly dispatches to the time-series plot method,
# which draws a line chart over dates instead of an x-y scatter.
plot(
  as.numeric(data$Market),
  as.numeric(data$AAPL),
  main = "CAPM Regression: AAPL vs Market",
  xlab = "Market Return (S&P500)",
  ylab = "AAPL Return",
  pch = 20
)
abline(capm_model, col = "red", lwd = 2)