library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ISLR)
library(e1071)
library(class)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings

Chapter 4-16

# Load the Boston dataset
data("Boston")

# Create response variable: crime rate above/below median
median_crime <- median(Boston$crim)
Boston$HighCrime <- ifelse(Boston$crim > median_crime, 1, 0)

# Remove the original crime rate variable
Boston$crim <- NULL

# Split the data into training and testing sets
set.seed(123)
trainIndex <- createDataPartition(Boston$HighCrime, p = .7, list = FALSE, times = 1)
trainData <- Boston[trainIndex,]
testData <- Boston[-trainIndex,]

# Logistic Regression
logitModel <- glm(HighCrime ~ ., data = trainData, family = binomial)
logitPred <- predict(logitModel, testData, type = "response")
logitClass <- ifelse(logitPred > 0.5, 1, 0)
logitConfMatrix <- confusionMatrix(as.factor(logitClass), as.factor(testData$HighCrime))
print("Logistic Regression Confusion Matrix:")
## [1] "Logistic Regression Confusion Matrix:"
print(logitConfMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 67  2
##          1  8 73
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.8808, 0.9676)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8667          
##                                           
##  Mcnemar's Test P-Value : 0.1138          
##                                           
##             Sensitivity : 0.8933          
##             Specificity : 0.9733          
##          Pos Pred Value : 0.9710          
##          Neg Pred Value : 0.9012          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4467          
##    Detection Prevalence : 0.4600          
##       Balanced Accuracy : 0.9333          
##                                           
##        'Positive' Class : 0               
## 
# Linear Discriminant Analysis (LDA)
ldaModel <- lda(HighCrime ~ ., data = trainData)
ldaPred <- predict(ldaModel, testData)
ldaClass <- ldaPred$class
ldaConfMatrix <- confusionMatrix(as.factor(ldaClass), as.factor(testData$HighCrime))
print("LDA Confusion Matrix:")
## [1] "LDA Confusion Matrix:"
print(ldaConfMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 70 14
##          1  5 61
##                                          
##                Accuracy : 0.8733         
##                  95% CI : (0.8093, 0.922)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.7467         
##                                          
##  Mcnemar's Test P-Value : 0.06646        
##                                          
##             Sensitivity : 0.9333         
##             Specificity : 0.8133         
##          Pos Pred Value : 0.8333         
##          Neg Pred Value : 0.9242         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4667         
##    Detection Prevalence : 0.5600         
##       Balanced Accuracy : 0.8733         
##                                          
##        'Positive' Class : 0              
## 
# Naive Bayes
nbModel <- naiveBayes(HighCrime ~ ., data = trainData)
nbPred <- predict(nbModel, testData)
nbConfMatrix <- confusionMatrix(as.factor(nbPred), as.factor(testData$HighCrime))
print("Naive Bayes Confusion Matrix:")
## [1] "Naive Bayes Confusion Matrix:"
print(nbConfMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 69 14
##          1  6 61
##                                           
##                Accuracy : 0.8667          
##                  95% CI : (0.8016, 0.9166)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7333          
##                                           
##  Mcnemar's Test P-Value : 0.1175          
##                                           
##             Sensitivity : 0.9200          
##             Specificity : 0.8133          
##          Pos Pred Value : 0.8313          
##          Neg Pred Value : 0.9104          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4600          
##    Detection Prevalence : 0.5533          
##       Balanced Accuracy : 0.8667          
##                                           
##        'Positive' Class : 0               
## 
# k-Nearest Neighbors (KNN)
preProcValues <- preProcess(trainData, method = c("center", "scale"))
trainDataNorm <- predict(preProcValues, trainData)
testDataNorm <- predict(preProcValues, testData)
knnModel <- knn(trainDataNorm[,-14], testDataNorm[,-14], cl = trainDataNorm$HighCrime, k = 5)

# Ensure factor levels match between predictions and actuals
knnModel <- factor(knnModel, levels = levels(as.factor(testData$HighCrime)))
knnConfMatrix <- confusionMatrix(knnModel, as.factor(testData$HighCrime))
print("KNN Confusion Matrix:")
## [1] "KNN Confusion Matrix:"
print(knnConfMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 0 1
##          0 0 0
##          1 0 0
##                                   
##                Accuracy : NaN     
##                  95% CI : (NA, NA)
##     No Information Rate : NA      
##     P-Value [Acc > NIR] : NA      
##                                   
##                   Kappa : NaN     
##                                   
##  Mcnemar's Test P-Value : NA      
##                                   
##             Sensitivity :  NA     
##             Specificity :  NA     
##          Pos Pred Value :  NA     
##          Neg Pred Value :  NA     
##              Prevalence : NaN     
##          Detection Rate : NaN     
##    Detection Prevalence : NaN     
##       Balanced Accuracy :  NA     
##                                   
##        'Positive' Class : 0       
## 

Chapter 5-5

# Load the Default dataset
data("Default")

# Set a random seed
set.seed(123)

# Split the data into training and validation sets
trainIndex <- createDataPartition(Default$default, p = .5, list = FALSE)
trainData <- Default[trainIndex,]
validationData <- Default[-trainIndex,]
# (a) Fit logistic regression model
logitModel <- glm(default ~ income + balance, data = trainData, family = binomial)
# (b) Estimate the test error using validation set approach
logitPred <- predict(logitModel, validationData, type = "response")
logitClass <- ifelse(logitPred > 0.5, "Yes", "No")
validationError <- mean(logitClass != validationData$default)
print("Validation Error (Logistic Regression):")
## [1] "Validation Error (Logistic Regression):"
print(validationError)
## [1] 0.02740548
# (c) Repeat the process three times
errors <- rep(0, 3)
for (i in 1:3) {
  trainIndex <- createDataPartition(Default$default, p = .5, list = FALSE)
  trainData <- Default[trainIndex,]
  validationData <- Default[-trainIndex,]
  
  logitModel <- glm(default ~ income + balance, data = trainData, family = binomial)
  logitPred <- predict(logitModel, validationData, type = "response")
  logitClass <- ifelse(logitPred > 0.5, "Yes", "No")
  
  errors[i] <- mean(logitClass != validationData$default)
}
print("Validation Errors (Three Repeats):")
## [1] "Validation Errors (Three Repeats):"
print(errors)
## [1] 0.02580516 0.02740548 0.02680536
# (d) Logistic regression model with student variable
logitModelStudent <- glm(default ~ income + balance + student, data = trainData, family = binomial)
logitPredStudent <- predict(logitModelStudent, validationData, type = "response")
logitClassStudent <- ifelse(logitPredStudent > 0.5, "Yes", "No")
validationErrorStudent <- mean(logitClassStudent != validationData$default)
print("Validation Error (Logistic Regression with Student):")
## [1] "Validation Error (Logistic Regression with Student):"
print(validationErrorStudent)
## [1] 0.02620524

Chapter 6-9

# Load the College dataset
data("College")

# Set a random seed
set.seed(123)

# Split the data into training and test sets
trainIndex <- createDataPartition(College$Apps, p = .7, list = FALSE)
trainData <- College[trainIndex,]
testData <- College[-trainIndex,]

# Separate predictors and response
trainPredictors <- trainData[ , -which(names(trainData) == "Apps")]
trainResponse <- trainData$Apps
testPredictors <- testData[ , -which(names(testData) == "Apps")]
testResponse <- testData$Apps
preProc <- preProcess(trainPredictors, method = c("center", "scale"))
trainPredictorsScaled <- predict(preProc, trainPredictors)
testPredictorsScaled <- predict(preProc, testPredictors)
lmModel <- lm(Apps ~ ., data = trainData)
lmPred <- predict(lmModel, testData)
lmError <- mean((lmPred - testResponse)^2)
print("Test Error (Linear Model):")
## [1] "Test Error (Linear Model):"
print(lmError)
## [1] 1882074
x <- model.matrix(Apps ~ ., trainData)[,-1]
y <- trainData$Apps
cvRidge <- cv.glmnet(x, y, alpha = 0)
bestLambda <- cvRidge$lambda.min
ridgeModel <- glmnet(x, y, alpha = 0, lambda = bestLambda)
xTest <- model.matrix(Apps ~ ., testData)[,-1]
ridgePred <- predict(ridgeModel, s = bestLambda, newx = xTest)
ridgeError <- mean((ridgePred - testResponse)^2)
print("Test Error (Ridge Regression):")
## [1] "Test Error (Ridge Regression):"
print(ridgeError)
## [1] 3270111
cvLasso <- cv.glmnet(x, y, alpha = 1)
bestLambdaLasso <- cvLasso$lambda.min
lassoModel <- glmnet(x, y, alpha = 1, lambda = bestLambdaLasso)
lassoPred <- predict(lassoModel, s = bestLambdaLasso, newx = xTest)
lassoError <- mean((lassoPred - testResponse)^2)
print("Test Error (Lasso):")
## [1] "Test Error (Lasso):"
print(lassoError)
## [1] 1946216
numNonZeroCoef <- sum(coef(lassoModel, s = bestLambdaLasso) != 0)
print("Number of Non-Zero Coefficients (Lasso):")
## [1] "Number of Non-Zero Coefficients (Lasso):"
print(numNonZeroCoef)
## [1] 17
  1. Fit a PCR model on the training set, with M chosen by cross validation. Report the test error obtained, along with the value of M selected by cross-validation.
pcrModel <- pcr(Apps ~ ., data = trainData, scale = TRUE, validation = "CV")
validationplot(pcrModel, val.type = "MSEP")

bestComponents <- which.min(pcrModel$validation$MSEP)
pcrPred <- predict(pcrModel, testData, ncomp = bestComponents)
pcrError <- mean((pcrPred - testResponse)^2)
print("Test Error (PCR):")
## [1] "Test Error (PCR):"
print(pcrError)
## [1] NaN
plsModel <- plsr(Apps ~ ., data = trainData, scale = TRUE, validation = "CV")
validationplot(plsModel, val.type = "MSEP")

bestComponentsPls <- which.min(plsModel$validation$MSEP)
plsPred <- predict(plsModel, testData, ncomp = bestComponentsPls)
plsError <- mean((plsPred - testResponse)^2)
print("Test Error (PLS):")
## [1] "Test Error (PLS):"
print(plsError)
## [1] NaN

{# (g) Comment on the results} print("Comments on the Results:") print("Linear Model Test Error: 1882074") print("Ridge Regression Test Error: 3270111") print("Lasso Regression Test Error: 1946216") print("PCR Test Error: Check the printed value") print("PLS Test Error: Check the printed value")

20%

# Load necessary libraries
library(MASS)
library(caret)
library(e1071)
library(class)
library(glmnet)

# Set seed for reproducibility
set.seed(123)

# Generate synthetic data
n <- 1000
income <- rnorm(n, mean = 50000, sd = 15000)
balance <- rnorm(n, mean = 1500, sd = 500)
student <- sample(c("Yes", "No"), n, replace = TRUE)
default <- ifelse(balance / income + rnorm(n, mean = 0.2, sd = 0.1) > 0.5, "Yes", "No")

# Create a data frame
data <- data.frame(income, balance, student, default)

# Convert default to a binary variable
data$default <- ifelse(data$default == "Yes", 1, 0)

# Convert student to a binary variable
data$student <- ifelse(data$student == "Yes", 1, 0)

# Split the data into training and testing sets
trainIndex <- createDataPartition(data$default, p = .7, list = FALSE, times = 1)
trainData <- data[trainIndex,]
testData <- data[-trainIndex,]

# Logistic Regression
logitModel <- glm(default ~ income + balance + student, data = trainData, family = binomial)
logitPred <- predict(logitModel, testData, type = "response")
logitClass <- ifelse(logitPred > 0.5, 1, 0)
logitConfMatrix <- confusionMatrix(as.factor(logitClass), as.factor(testData$default))
## Warning in confusionMatrix.default(as.factor(logitClass),
## as.factor(testData$default)): Levels are not in the same order for reference
## and data. Refactoring data to match.
print("Logistic Regression Confusion Matrix:")
## [1] "Logistic Regression Confusion Matrix:"
print(logitConfMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 299   1
##          1   0   0
##                                           
##                Accuracy : 0.9967          
##                  95% CI : (0.9816, 0.9999)
##     No Information Rate : 0.9967          
##     P-Value [Acc > NIR] : 0.7358          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9967          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9967          
##          Detection Rate : 0.9967          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
# Linear Discriminant Analysis (LDA)
ldaModel <- lda(default ~ income + balance + student, data = trainData)
ldaPred <- predict(ldaModel, testData)
ldaClass <- ldaPred$class
ldaConfMatrix <- confusionMatrix(as.factor(ldaClass), as.factor(testData$default))
print("LDA Confusion Matrix:")
## [1] "LDA Confusion Matrix:"
print(ldaConfMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 299   1
##          1   0   0
##                                           
##                Accuracy : 0.9967          
##                  95% CI : (0.9816, 0.9999)
##     No Information Rate : 0.9967          
##     P-Value [Acc > NIR] : 0.7358          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9967          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9967          
##          Detection Rate : 0.9967          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
# Naive Bayes
nbModel <- naiveBayes(default ~ income + balance + student, data = trainData)
nbPred <- predict(nbModel, testData)
nbConfMatrix <- confusionMatrix(as.factor(nbPred), as.factor(testData$default))
print("Naive Bayes Confusion Matrix:")
## [1] "Naive Bayes Confusion Matrix:"
print(nbConfMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 299   1
##          1   0   0
##                                           
##                Accuracy : 0.9967          
##                  95% CI : (0.9816, 0.9999)
##     No Information Rate : 0.9967          
##     P-Value [Acc > NIR] : 0.7358          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9967          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9967          
##          Detection Rate : 0.9967          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
# k-Nearest Neighbors (KNN)
preProcValues <- preProcess(trainData, method = c("center", "scale"))
trainDataNorm <- predict(preProcValues, trainData)
testDataNorm <- predict(preProcValues, testData)
knnModel <- knn(trainDataNorm[,-4], testDataNorm[,-4], cl = trainDataNorm$default, k = 5)

# Ensure factor levels match between predictions and actuals
knnModel <- factor(knnModel, levels = levels(as.factor(testData$default)))
knnConfMatrix <- confusionMatrix(knnModel, as.factor(testData$default))
print("KNN Confusion Matrix:")
## [1] "KNN Confusion Matrix:"
print(knnConfMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 0 1
##          0 0 0
##          1 0 0
##                                   
##                Accuracy : NaN     
##                  95% CI : (NA, NA)
##     No Information Rate : NA      
##     P-Value [Acc > NIR] : NA      
##                                   
##                   Kappa : NaN     
##                                   
##  Mcnemar's Test P-Value : NA      
##                                   
##             Sensitivity :  NA     
##             Specificity :  NA     
##          Pos Pred Value :  NA     
##          Neg Pred Value :  NA     
##              Prevalence : NaN     
##          Detection Rate : NaN     
##    Detection Prevalence : NaN     
##       Balanced Accuracy :  NA     
##                                   
##        'Positive' Class : 0       
##