"CHAPTER 4"
## [1] "CHAPTER 4"
library(MASS)
data("Boston")
Boston$crim_above_median <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
set.seed(123)
library(caTools)
split <- sample.split(Boston$crim_above_median, SplitRatio = 0.7)
train <- subset(Boston, split == TRUE)
test <- subset(Boston, split == FALSE)
logistic_model <- glm(crim_above_median ~ ., data = train, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_model)
##
## Call:
## glm(formula = crim_above_median ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.315e+03 1.581e+05 -0.008 0.993
## crim 6.665e+02 1.726e+04 0.039 0.969
## zn 1.249e+00 1.170e+02 0.011 0.991
## indus 8.851e-01 5.064e+02 0.002 0.999
## chas -3.317e+00 1.364e+04 0.000 1.000
## nox 3.191e+01 7.608e+04 0.000 1.000
## rm -1.782e+01 3.197e+03 -0.006 0.996
## age 3.317e-01 1.055e+02 0.003 0.997
## dis -1.472e+01 2.631e+03 -0.006 0.996
## rad 1.160e+01 1.402e+03 0.008 0.993
## tax -3.735e-01 2.555e+01 -0.015 0.988
## ptratio 1.262e+01 1.530e+03 0.008 0.993
## black 2.615e+00 3.093e+02 0.008 0.993
## lstat 1.458e-01 2.383e+02 0.001 1.000
## medv 2.255e+00 5.640e+02 0.004 0.997
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4.9075e+02 on 353 degrees of freedom
## Residual deviance: 8.6635e-06 on 339 degrees of freedom
## AIC: 30
##
## Number of Fisher Scoring iterations: 25
# Predict on the test set
logistic_preds <- predict(logistic_model, newdata = test, type = "response")
logistic_pred_class <- ifelse(logistic_preds > 0.5, 1, 0)
library(MASS)
lda_model <- lda(crim_above_median ~ ., data = train)
lda_preds <- predict(lda_model, newdata = test)
lda_pred_class <- lda_preds$class
library(e1071)
nb_model <- naiveBayes(crim_above_median ~ ., data = train)
nb_preds <- predict(nb_model, newdata = test)
library(class)
train_X <- train[, -which(names(train) == "crim_above_median")]
test_X <- test[, -which(names(test) == "crim_above_median")]
train_y <- train$crim_above_median
# Scale the predictors
train_X_scaled <- scale(train_X)
test_X_scaled <- scale(test_X)
knn_preds <- knn(train = train_X_scaled, test = test_X_scaled, cl = train_y, k = 5)
# Confusion matrix and accuracy for Logistic Regression
logistic_cm <- table(test$crim_above_median, logistic_pred_class)
logistic_accuracy <- sum(diag(logistic_cm)) / sum(logistic_cm)
print(paste("Logistic Regression Accuracy:", logistic_accuracy))
## [1] "Logistic Regression Accuracy: 0.980263157894737"
# Confusion matrix and accuracy for LDA
lda_cm <- table(test$crim_above_median, lda_pred_class)
lda_accuracy <- sum(diag(lda_cm)) / sum(lda_cm)
print(paste("LDA Accuracy:", lda_accuracy))
## [1] "LDA Accuracy: 0.881578947368421"
# Confusion matrix and accuracy for Naive Bayes
nb_cm <- table(test$crim_above_median, nb_preds)
nb_accuracy <- sum(diag(nb_cm)) / sum(nb_cm)
print(paste("Naive Bayes Accuracy:", nb_accuracy))
## [1] "Naive Bayes Accuracy: 0.947368421052632"
# Confusion matrix and accuracy for KNN
knn_cm <- table(test$crim_above_median, knn_preds)
knn_accuracy <- sum(diag(knn_cm)) / sum(knn_cm)
print(paste("KNN Accuracy:", knn_accuracy))
## [1] "KNN Accuracy: 0.901315789473684"
"CHAPTER 5"
## [1] "CHAPTER 5"
library(ISLR)
data("Default")
set.seed(123)
logistic_model <- glm(default ~ income + balance, data = Default, family = binomial)
summary(logistic_model)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(123)
library(caTools)
split <- sample.split(Default$default, SplitRatio = 0.7)
train <- subset(Default, split == TRUE)
test <- subset(Default, split == FALSE)
# Fit the model on the training set
logistic_model_train <- glm(default ~ income + balance, data = train, family = binomial)
# Predict on the validation set
test_probs <- predict(logistic_model_train, newdata = test, type = "response")
test_preds <- ifelse(test_probs > 0.5, "Yes", "No")
# Calculate the validation set error
test_actual <- test$default
conf_matrix <- table(test_actual, test_preds)
validation_error <- 1 - sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Validation Error:", validation_error))
## [1] "Validation Error: 0.0246666666666666"
# Function to perform the validation set approach
validation_set_approach <- function(seed) {
set.seed(seed)
split <- sample.split(Default$default, SplitRatio = 0.7)
train <- subset(Default, split == TRUE)
test <- subset(Default, split == FALSE)
logistic_model_train <- glm(default ~ income + balance, data = train, family = binomial)
test_probs <- predict(logistic_model_train, newdata = test, type = "response")
test_preds <- ifelse(test_probs > 0.5, "Yes", "No")
test_actual <- test$default
conf_matrix <- table(test_actual, test_preds)
validation_error <- 1 - sum(diag(conf_matrix)) / sum(conf_matrix)
return(validation_error)
}
# Perform the process three times with different seeds
set.seed(123)
seeds <- sample(1:1000, 3)
errors <- sapply(seeds, validation_set_approach)
print(errors)
## [1] 0.02600000 0.02566667 0.02866667
mean_error <- mean(errors)
print(paste("Mean Validation Error:", mean_error))
## [1] "Mean Validation Error: 0.0267777777777778"
# Add the dummy variable for student
set.seed(123)
split <- sample.split(Default$default, SplitRatio = 0.7)
train <- subset(Default, split == TRUE)
test <- subset(Default, split == FALSE)
logistic_model_student <- glm(default ~ income + balance + student, data = train, family = binomial)
test_probs_student <- predict(logistic_model_student, newdata = test, type = "response")
test_preds_student <- ifelse(test_probs_student > 0.5, "Yes", "No")
conf_matrix_student <- table(test$default, test_preds_student)
validation_error_student <- 1 - sum(diag(conf_matrix_student)) / sum(conf_matrix_student)
print(paste("Validation Error with Student Variable:", validation_error_student))
## [1] "Validation Error with Student Variable: 0.0243333333333333"
"CHAPTER 6"
## [1] "CHAPTER 6"
library(ISLR)
data("College")
set.seed(123)
library(caTools)
split <- sample.split(College$Apps, SplitRatio = 0.7)
train <- subset(College, split == TRUE)
test <- subset(College, split == FALSE)
linear_model <- lm(Apps ~ ., data = train)
test_preds_linear <- predict(linear_model, newdata = test)
test_error_linear <- mean((test$Apps - test_preds_linear)^2)
print(paste("Test Error for Linear Model:", test_error_linear))
## [1] "Test Error for Linear Model: 1304671.1253516"
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
x_train <- model.matrix(Apps ~ ., train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., test)[, -1]
y_test <- test$Apps
# Perform cross-validation to choose the best lambda
set.seed(123)
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- cv_ridge$lambda.min
# Fit the ridge regression model
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = best_lambda_ridge)
test_preds_ridge <- predict(ridge_model, s = best_lambda_ridge, newx = x_test)
test_error_ridge <- mean((y_test - test_preds_ridge)^2)
print(paste("Test Error for Ridge Regression:", test_error_ridge))
## [1] "Test Error for Ridge Regression: 1313120.09915936"
# Perform cross-validation to choose the best lambda
set.seed(123)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- cv_lasso$lambda.min
# Fit the lasso model
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda_lasso)
test_preds_lasso <- predict(lasso_model, s = best_lambda_lasso, newx = x_test)
test_error_lasso <- mean((y_test - test_preds_lasso)^2)
# Number of non-zero coefficients
num_nonzero_coef <- sum(coef(lasso_model, s = best_lambda_lasso) != 0)
print(paste("Test Error for Lasso:", test_error_lasso))
## [1] "Test Error for Lasso: 1311182.68289331"
print(paste("Number of Non-zero Coefficients in Lasso:", num_nonzero_coef))
## [1] "Number of Non-zero Coefficients in Lasso: 13"
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(123)
pcr_model <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")

# Select the number of components
optimal_M <- which.min(pcr_model$validation$PRESS)
test_preds_pcr <- predict(pcr_model, newdata = test, ncomp = optimal_M)
test_error_pcr <- mean((test$Apps - test_preds_pcr)^2)
print(paste("Test Error for PCR:", test_error_pcr))
## [1] "Test Error for PCR: 1304671.1253516"
print(paste("Optimal Number of Components for PCR:", optimal_M))
## [1] "Optimal Number of Components for PCR: 17"
set.seed(123)
pls_model <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls_model, val.type = "MSEP")

# Select the number of components
optimal_M_pls <- which.min(pls_model$validation$PRESS)
test_preds_pls <- predict(pls_model, newdata = test, ncomp = optimal_M_pls)
test_error_pls <- mean((test$Apps - test_preds_pls)^2)
print(paste("Test Error for PLS:", test_error_pls))
## [1] "Test Error for PLS: 1304653.05383128"
print(paste("Optimal Number of Components for PLS:", optimal_M_pls))
## [1] "Optimal Number of Components for PLS: 16"
# Summary of results
results <- data.frame(
Method = c("Linear Model", "Ridge Regression", "Lasso", "PCR", "PLS"),
TestError = c(test_error_linear, test_error_ridge, test_error_lasso, test_error_pcr, test_error_pls)
)
print(results)
## Method TestError
## 1 Linear Model 1304671
## 2 Ridge Regression 1313120
## 3 Lasso 1311183
## 4 PCR 1304671
## 5 PLS 1304653
# Load necessary libraries
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ recipes 1.0.10
## ✔ dials 1.2.1 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.3.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ rsample::permutations() masks e1071::permutations()
## ✖ dplyr::select() masks MASS::select()
## ✖ recipes::step() masks stats::step()
## ✖ tune::tune() masks parsnip::tune(), e1071::tune()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ✖ recipes::update() masks Matrix::update(), stats::update()
## • Use tidymodels_prefer() to resolve common conflicts.
library(ggplot2)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Boston
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
## The following object is masked from 'package:MASS':
##
## Boston
library(MASS)
# Load the Auto dataset
data(auto)
## Warning in data(auto): data set 'auto' not found
# (a) Perform simple linear regression
model <- lm(mpg ~ horsepower, data = Auto)
# Summary of the model
summary(model)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# (b) Plot the response and predictor
ggplot(Auto, aes(x = horsepower, y = mpg)) +
geom_point() +
geom_abline(intercept = coef(model)[1], slope = coef(model)[2], color = "red") +
labs(x = "Horsepower", y = "MPG") +
theme_minimal()
