# Chapter 4 Q16: Boston Data Set - Predicting Whether the Crime Rate Is Above the Median
# Load the dataset and required packages
# Install necessary packages if not already installed
if (!requireNamespace("MASS", quietly = TRUE)) {
install.packages("MASS")
}
if (!requireNamespace("caret", quietly = TRUE)) {
install.packages("caret")
}
if (!requireNamespace("e1071", quietly = TRUE)) {
install.packages("e1071")
}
if (!requireNamespace("class", quietly = TRUE)) {
install.packages("class")
}
# Load the necessary libraries
library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(class)
# Load the Boston dataset
data(Boston)
# Create a binary response variable 'high_crime' based on whether 'crim' is above the median
median_crim <- median(Boston$crim)
Boston$high_crime <- as.factor(ifelse(Boston$crim > median_crim, 1, 0))
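# Optional check (minimal sketch): splitting at the median should give a roughly balanced
# response, consistent with the No Information Rate of 0.5 reported in the confusion matrices below.
table(Boston$high_crime)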
# Split the data into training and test sets
set.seed(123)
trainIndex <- createDataPartition(Boston$high_crime, p = .7,
                                  list = FALSE,
                                  times = 1)
BostonTrain <- Boston[trainIndex,]
BostonTest <- Boston[-trainIndex,]
# Logistic Regression
log_model <- glm(high_crime ~ . - crim, data = BostonTrain, family = binomial)
log_pred <- predict(log_model, newdata = BostonTest, type = "response")
log_pred_class <- as.factor(ifelse(log_pred > 0.5, 1, 0))
log_cm <- confusionMatrix(log_pred_class, BostonTest$high_crime)
print("Logistic Regression:")
## [1] "Logistic Regression:"
print(log_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 67 2
## 1 8 73
##
## Accuracy : 0.9333
## 95% CI : (0.8808, 0.9676)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8667
##
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.8933
## Specificity : 0.9733
## Pos Pred Value : 0.9710
## Neg Pred Value : 0.9012
## Prevalence : 0.5000
## Detection Rate : 0.4467
## Detection Prevalence : 0.4600
## Balanced Accuracy : 0.9333
##
## 'Positive' Class : 0
##
# Linear Discriminant Analysis (LDA)
lda_model <- lda(high_crime ~ . - crim, data = BostonTrain)
lda_pred <- predict(lda_model, newdata = BostonTest)
lda_cm <- confusionMatrix(lda_pred$class, BostonTest$high_crime)
print("LDA:")
## [1] "LDA:"
print(lda_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 14
## 1 5 61
##
## Accuracy : 0.8733
## 95% CI : (0.8093, 0.922)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7467
##
## Mcnemar's Test P-Value : 0.06646
##
## Sensitivity : 0.9333
## Specificity : 0.8133
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.9242
## Prevalence : 0.5000
## Detection Rate : 0.4667
## Detection Prevalence : 0.5600
## Balanced Accuracy : 0.8733
##
## 'Positive' Class : 0
##
# Naive Bayes
nb_model <- naiveBayes(high_crime ~ . - crim, data = BostonTrain)
nb_pred <- predict(nb_model, newdata = BostonTest)
nb_cm <- confusionMatrix(nb_pred, BostonTest$high_crime)
print("Naive Bayes:")
## [1] "Naive Bayes:"
print(nb_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 69 14
## 1 6 61
##
## Accuracy : 0.8667
## 95% CI : (0.8016, 0.9166)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7333
##
## Mcnemar's Test P-Value : 0.1175
##
## Sensitivity : 0.9200
## Specificity : 0.8133
## Pos Pred Value : 0.8313
## Neg Pred Value : 0.9104
## Prevalence : 0.5000
## Detection Rate : 0.4600
## Detection Prevalence : 0.5533
## Balanced Accuracy : 0.8667
##
## 'Positive' Class : 0
##
# k-Nearest Neighbors (KNN)
# Center and scale the predictors using statistics computed on the training set only,
# then apply the same transformation to the test set (this avoids test-set leakage)
preProcValues <- preProcess(BostonTrain[, -which(names(BostonTrain) %in% c("crim", "high_crime"))], method = c("center", "scale"))
trainTransformed <- predict(preProcValues, BostonTrain[, -which(names(BostonTrain) %in% c("crim", "high_crime"))])
testTransformed <- predict(preProcValues, BostonTest[, -which(names(BostonTest) %in% c("crim", "high_crime"))])
# KNN model
knn_pred <- knn(train = trainTransformed, test = testTransformed, cl = BostonTrain$high_crime, k = 5)
knn_cm <- confusionMatrix(knn_pred, BostonTest$high_crime)
print("KNN:")
## [1] "KNN:"
print(knn_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 8
## 1 5 67
##
## Accuracy : 0.9133
## 95% CI : (0.8564, 0.953)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8267
##
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.9333
## Specificity : 0.8933
## Pos Pred Value : 0.8974
## Neg Pred Value : 0.9306
## Prevalence : 0.5000
## Detection Rate : 0.4667
## Detection Prevalence : 0.5200
## Balanced Accuracy : 0.9133
##
## 'Positive' Class : 0
##
# Summary of findings
print("Summary of findings:")
## [1] "Summary of findings:"
print("Logistic Regression accuracy:")
## [1] "Logistic Regression accuracy:"
print(log_cm$overall['Accuracy'])
## Accuracy
## 0.9333333
print("LDA accuracy:")
## [1] "LDA accuracy:"
print(lda_cm$overall['Accuracy'])
## Accuracy
## 0.8733333
print("Naive Bayes accuracy:")
## [1] "Naive Bayes accuracy:"
print(nb_cm$overall['Accuracy'])
## Accuracy
## 0.8666667
print("KNN accuracy:")
## [1] "KNN accuracy:"
print(knn_cm$overall['Accuracy'])
## Accuracy
## 0.9133333
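# The comparison above uses k = 5 only. A minimal sketch for experimenting with other values
# of k (reusing the trainTransformed / testTransformed objects created above); the best k may
# well differ on other train/test splits:
k_values <- c(1, 3, 5, 10, 20)
knn_accuracies <- sapply(k_values, function(k) {
  pred_k <- knn(train = trainTransformed, test = testTransformed,
                cl = BostonTrain$high_crime, k = k)
  mean(pred_k == BostonTest$high_crime)
})
print(data.frame(k = k_values, accuracy = knn_accuracies))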
# Chapter 5 Q5: Default Data Set - Validation Set Approach
# Install necessary packages if not already installed
if (!requireNamespace("ISLR", quietly = TRUE)) {
install.packages("ISLR")
}
if (!requireNamespace("caret", quietly = TRUE)) {
install.packages("caret")
}
# Load the necessary libraries
library(ISLR)
library(caret)
# Load the Default dataset
data("Default")
# Set a random seed for reproducibility
set.seed(123)
# Split the sample set into a training set and a validation set (50-50 split)
trainIndex <- createDataPartition(Default$default, p = .5, list = FALSE)
DefaultTrain <- Default[trainIndex, ]
DefaultValidation <- Default[-trainIndex, ]
# Fit a logistic regression model that uses income and balance to predict default
log_model <- glm(default ~ income + balance, data = DefaultTrain, family = binomial)
# Obtain a prediction of default status for each individual in the validation set
log_pred <- predict(log_model, newdata = DefaultValidation, type = "response")
log_pred_class <- as.factor(ifelse(log_pred > 0.5, "Yes", "No"))
# Compute the confusion matrix to evaluate the performance
log_cm <- confusionMatrix(log_pred_class, DefaultValidation$default)
# Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified
validation_set_error <- 1 - log_cm$overall['Accuracy']
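# Equivalently (minimal sketch, no caret needed): the validation set error is just the
# fraction of misclassified observations in the validation set.
mean(log_pred_class != DefaultValidation$default)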
# Print results
print("Logistic Regression Model:")
## [1] "Logistic Regression Model:"
print(summary(log_model))
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = DefaultTrain)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.158e+01 6.209e-01 -18.656 < 2e-16 ***
## income 2.296e-05 7.206e-06 3.187 0.00144 **
## balance 5.617e-03 3.200e-04 17.555 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1463.76 on 5000 degrees of freedom
## Residual deviance: 791.05 on 4998 degrees of freedom
## AIC: 797.05
##
## Number of Fisher Scoring iterations: 8
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(log_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 4813 117
## Yes 20 49
##
## Accuracy : 0.9726
## 95% CI : (0.9677, 0.9769)
## No Information Rate : 0.9668
## P-Value [Acc > NIR] : 0.01061
##
## Kappa : 0.4054
##
## Mcnemar's Test P-Value : 2.367e-16
##
## Sensitivity : 0.9959
## Specificity : 0.2952
## Pos Pred Value : 0.9763
## Neg Pred Value : 0.7101
## Prevalence : 0.9668
## Detection Rate : 0.9628
## Detection Prevalence : 0.9862
## Balanced Accuracy : 0.6455
##
## 'Positive' Class : No
##
print("Validation Set Error:")
## [1] "Validation Set Error:"
print(validation_set_error)
## Accuracy
## 0.02740548
# Part (c): Repeat the validation set approach three times
# Function to perform the validation set approach
perform_validation <- function(seed) {
  set.seed(seed)
  # Split the sample set into a training set and a validation set (50-50 split)
  trainIndex <- createDataPartition(Default$default, p = .5, list = FALSE)
  DefaultTrain <- Default[trainIndex, ]
  DefaultValidation <- Default[-trainIndex, ]
  # Fit a logistic regression model that uses income and balance to predict default
  log_model <- glm(default ~ income + balance, data = DefaultTrain, family = binomial)
  # Obtain a prediction of default status for each individual in the validation set
  log_pred <- predict(log_model, newdata = DefaultValidation, type = "response")
  log_pred_class <- as.factor(ifelse(log_pred > 0.5, "Yes", "No"))
  # Compute the confusion matrix to evaluate the performance
  log_cm <- confusionMatrix(log_pred_class, DefaultValidation$default)
  # Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified
  validation_set_error <- 1 - log_cm$overall['Accuracy']
  return(validation_set_error)
}
# Perform the validation set approach three times with different random seeds
validation_errors <- sapply(c(123, 456, 789), perform_validation)
print("Validation Set Errors (three different splits):")
## [1] "Validation Set Errors (three different splits):"
print(validation_errors)
## Accuracy Accuracy Accuracy
## 0.02740548 0.02420484 0.02660532
print("Mean Validation Set Error:")
## [1] "Mean Validation Set Error:"
print(mean(validation_errors))
## [1] 0.02607188
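# A minimal sketch for gauging the variability of the validation set error over more random
# splits than the three above (reuses the perform_validation() helper defined earlier):
more_errors <- sapply(1:20, perform_validation)
c(mean = mean(more_errors), sd = sd(more_errors))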
# Part (d): Logistic regression model including 'student'
# Function to perform the validation set approach with an additional variable
perform_validation_with_student <- function(seed) {
  set.seed(seed)
  # Split the sample set into a training set and a validation set (50-50 split)
  trainIndex <- createDataPartition(Default$default, p = .5, list = FALSE)
  DefaultTrain <- Default[trainIndex, ]
  DefaultValidation <- Default[-trainIndex, ]
  # Fit a logistic regression model that uses income, balance, and student to predict default
  log_model <- glm(default ~ income + balance + student, data = DefaultTrain, family = binomial)
  # Obtain a prediction of default status for each individual in the validation set
  log_pred <- predict(log_model, newdata = DefaultValidation, type = "response")
  log_pred_class <- as.factor(ifelse(log_pred > 0.5, "Yes", "No"))
  # Compute the confusion matrix to evaluate the performance
  log_cm <- confusionMatrix(log_pred_class, DefaultValidation$default)
  # Compute the validation set error, which is the fraction of the observations in the validation set that are misclassified
  validation_set_error <- 1 - log_cm$overall['Accuracy']
  return(validation_set_error)
}
# Perform the validation set approach three times with the additional variable
validation_errors_with_student <- sapply(c(123, 456, 789), perform_validation_with_student)
print("Validation Set Errors with Student (three different splits):")
## [1] "Validation Set Errors with Student (three different splits):"
print(validation_errors_with_student)
## Accuracy Accuracy Accuracy
## 0.02780556 0.02460492 0.02620524
print("Mean Validation Set Error with Student:")
## [1] "Mean Validation Set Error with Student:"
print(mean(validation_errors_with_student))
## [1] 0.02620524
# Compare the results
print("Comparison of Mean Validation Set Errors:")
## [1] "Comparison of Mean Validation Set Errors:"
print(paste("Without student:", mean(validation_errors)))
## [1] "Without student: 0.0260718810428752"
print(paste("With student:", mean(validation_errors_with_student)))
## [1] "With student: 0.0262052410482096"
# Chapter 6 Q9: College Data Set - Predicting the Number of Applications (Apps)
# Load necessary libraries
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(MASS)
library(caret)
# Load the College dataset
data(College)
# Set a seed for reproducibility
set.seed(123)
# Split the data into training and test sets
train_index <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]
# (b) Fit a linear model using least squares
lm_model <- lm(Apps ~ ., data = train_data)
lm_pred <- predict(lm_model, test_data)
lm_test_error <- mean((test_data$Apps - lm_pred)^2)
cat("Linear Model Test Error:", lm_test_error, "\n")
## Linear Model Test Error: 1882074
# (c) Fit a ridge regression model with cross-validation
x_train <- model.matrix(Apps ~ ., train_data)[, -1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., test_data)[, -1]
y_test <- test_data$Apps
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
ridge_pred <- predict(cv_ridge, s = cv_ridge$lambda.min, newx = x_test)
ridge_test_error <- mean((y_test - ridge_pred)^2)
cat("Ridge Regression Test Error:", ridge_test_error, "\n")
## Ridge Regression Test Error: 3265646
# (d) Fit a lasso model with cross-validation
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_pred <- predict(cv_lasso, s = cv_lasso$lambda.min, newx = x_test)
lasso_test_error <- mean((y_test - lasso_pred)^2)
cat("Lasso Test Error:", lasso_test_error, "\n")
## Lasso Test Error: 1942428
lasso_coef <- predict(cv_lasso, s = cv_lasso$lambda.min, type = "coefficients")
cat("Number of non-zero lasso coefficients:", sum(lasso_coef != 0), "\n")
## Number of non-zero lasso coefficients: 17
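# Note (minimal sketch): the count above includes the intercept row of the coefficient matrix.
# To count only the non-zero predictor coefficients, drop the first entry:
sum(as.vector(lasso_coef)[-1] != 0)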
# (e) Fit a PCR model with cross-validation
pcr_model <- pcr(Apps ~ ., data = train_data, validation = "CV")
pcr_pred <- predict(pcr_model, test_data, ncomp = which.min(pcr_model$validation$PRESS))
pcr_test_error <- mean((test_data$Apps - pcr_pred)^2)
cat("PCR Test Error:", pcr_test_error, "\n")
## PCR Test Error: 1882074
cat("Number of PCR components used:", which.min(pcr_model$validation$PRESS), "\n")
## Number of PCR components used: 17
# (f) Fit a PLS model with cross-validation
pls_model <- plsr(Apps ~ ., data = train_data, validation = "CV")
pls_pred <- predict(pls_model, test_data, ncomp = which.min(pls_model$validation$PRESS))
pls_test_error <- mean((test_data$Apps - pls_pred)^2)
cat("PLS Test Error:", pls_test_error, "\n")
## PLS Test Error: 1882074
cat("Number of PLS components used:", which.min(pls_model$validation$PRESS), "\n")
## Number of PLS components used: 17
# (g) Comment on the results
cat("\nTest Errors Summary:\n")
##
## Test Errors Summary:
cat("Linear Model:", lm_test_error, "\n")
## Linear Model: 1882074
cat("Ridge Regression:", ridge_test_error, "\n")
## Ridge Regression: 3265646
cat("Lasso:", lasso_test_error, "\n")
## Lasso: 1942428
cat("PCR:", pcr_test_error, "\n")
## PCR: 1882074
cat("PLS:", pls_test_error, "\n")
## PLS: 1882074
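# A minimal sketch for putting the five test errors side by side, together with the implied
# RMSE and test R^2 (relative to the test-set variance of Apps); this makes the comparison
# in the conclusions below easier to read. Assumes the objects created in parts (b)-(f) above.
test_errors <- c(Linear = lm_test_error, Ridge = ridge_test_error,
                 Lasso = lasso_test_error, PCR = pcr_test_error, PLS = pls_test_error)
comparison <- data.frame(TestMSE = test_errors,
                         RMSE = sqrt(test_errors),
                         TestR2 = 1 - test_errors / mean((y_test - mean(y_test))^2))
print(comparison[order(comparison$TestMSE), ])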
# Conclusions
# Based on the output:
# The least squares linear model, PCR, and PLS all produced essentially the same test error (about 1.88 million). This is expected: cross-validation selected all 17 components for both PCR and PLS, so both reduce to the full least squares fit.
# The lasso performed slightly worse (about 1.94 million), and at lambda.min it retained nearly all predictors; its advantage is that it can set some coefficients exactly to zero, which simplifies interpretation.
# Ridge regression had the highest test error (about 3.27 million) at the cross-validated lambda.min, so shrinkage did not improve prediction accuracy on this split.
# Prediction accuracy
# The best test MSEs correspond to a root-mean-squared prediction error of roughly 1,370 applications, so the number of applications can be predicted reasonably well from the remaining variables.
# On this particular train/test split there is little to choose between the linear model, the lasso, PCR, and PLS; ridge regression is the clear underperformer.