library(MASS)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ISLR)
library(e1071)
library(class)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
# Load the Boston dataset
data("Boston")
# Create response variable: crime rate above/below median
median_crime <- median(Boston$crim)
Boston$HighCrime <- ifelse(Boston$crim > median_crime, 1, 0)
# Remove the original crime rate variable
Boston$crim <- NULL
# Split the data into training and testing sets
set.seed(123)
trainIndex <- createDataPartition(Boston$HighCrime, p = .7, list = FALSE, times = 1)
trainData <- Boston[trainIndex,]
testData <- Boston[-trainIndex,]
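# Because the response is defined by a median split, both partitions should be
# close to 50/50; a quick check (optional) confirms this.
table(trainData$HighCrime)
table(testData$HighCrime)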
# Logistic Regression
logitModel <- glm(HighCrime ~ ., data = trainData, family = binomial)
logitPred <- predict(logitModel, testData, type = "response")
logitClass <- ifelse(logitPred > 0.5, 1, 0)
logitConfMatrix <- confusionMatrix(as.factor(logitClass), as.factor(testData$HighCrime))
print("Logistic Regression Confusion Matrix:")
## [1] "Logistic Regression Confusion Matrix:"
print(logitConfMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 67 2
## 1 8 73
##
## Accuracy : 0.9333
## 95% CI : (0.8808, 0.9676)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8667
##
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.8933
## Specificity : 0.9733
## Pos Pred Value : 0.9710
## Neg Pred Value : 0.9012
## Prevalence : 0.5000
## Detection Rate : 0.4467
## Detection Prevalence : 0.4600
## Balanced Accuracy : 0.9333
##
## 'Positive' Class : 0
##
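# As a rough robustness check (a sketch, not part of the exercise), a reduced
# model using three predictors often highlighted for this dataset (nox, rad,
# dis) can be scored on the same test split.
logitSmall <- glm(HighCrime ~ nox + rad + dis, data = trainData, family = binomial)
smallClass <- ifelse(predict(logitSmall, testData, type = "response") > 0.5, 1, 0)
mean(smallClass == testData$HighCrime)  # test accuracy of the reduced model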
# Linear Discriminant Analysis (LDA)
ldaModel <- lda(HighCrime ~ ., data = trainData)
ldaPred <- predict(ldaModel, testData)
ldaClass <- ldaPred$class
ldaConfMatrix <- confusionMatrix(as.factor(ldaClass), as.factor(testData$HighCrime))
print("LDA Confusion Matrix:")
## [1] "LDA Confusion Matrix:"
print(ldaConfMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 14
## 1 5 61
##
## Accuracy : 0.8733
## 95% CI : (0.8093, 0.922)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7467
##
## Mcnemar's Test P-Value : 0.06646
##
## Sensitivity : 0.9333
## Specificity : 0.8133
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.9242
## Prevalence : 0.5000
## Detection Rate : 0.4667
## Detection Prevalence : 0.5600
## Balanced Accuracy : 0.8733
##
## 'Positive' Class : 0
##
# Naive Bayes
nbModel <- naiveBayes(HighCrime ~ ., data = trainData)
nbPred <- predict(nbModel, testData)
nbConfMatrix <- confusionMatrix(as.factor(nbPred), as.factor(testData$HighCrime))
print("Naive Bayes Confusion Matrix:")
## [1] "Naive Bayes Confusion Matrix:"
print(nbConfMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 69 14
## 1 6 61
##
## Accuracy : 0.8667
## 95% CI : (0.8016, 0.9166)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7333
##
## Mcnemar's Test P-Value : 0.1175
##
## Sensitivity : 0.9200
## Specificity : 0.8133
## Pos Pred Value : 0.8313
## Neg Pred Value : 0.9104
## Prevalence : 0.5000
## Detection Rate : 0.4600
## Detection Prevalence : 0.5533
## Balanced Accuracy : 0.8667
##
## 'Positive' Class : 0
##
# k-Nearest Neighbors (KNN)
# Scale only the predictors: if the 0/1 response is centered and scaled along
# with them, knn() returns scaled numeric labels that no longer match the test
# labels and the confusion matrix degenerates to all zeros.
preProcValues <- preProcess(trainData[, -14], method = c("center", "scale"))
trainDataNorm <- predict(preProcValues, trainData[, -14])
testDataNorm <- predict(preProcValues, testData[, -14])
knnModel <- knn(trainDataNorm, testDataNorm, cl = factor(trainData$HighCrime), k = 5)
knnConfMatrix <- confusionMatrix(knnModel, as.factor(testData$HighCrime))
print("KNN Confusion Matrix:")
## [1] "KNN Confusion Matrix:"
print(knnConfMatrix)
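# k = 5 is arbitrary; a quick sweep over a few values (a sketch reusing the
# scaled predictors above) shows how sensitive the test accuracy is to k.
for (k in c(1, 3, 5, 10)) {
  pred <- knn(trainDataNorm, testDataNorm, cl = factor(trainData$HighCrime), k = k)
  cat("k =", k, "test accuracy =", round(mean(pred == testData$HighCrime), 3), "\n")
}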
# Load the Default dataset
data("Default")
# Set a random seed
set.seed(123)
# Split the data into training and validation sets
trainIndex <- createDataPartition(Default$default, p = .5, list = FALSE)
trainData <- Default[trainIndex,]
validationData <- Default[-trainIndex,]
# (a) Fit logistic regression model
logitModel <- glm(default ~ income + balance, data = trainData, family = binomial)
# (b) Estimate the test error using validation set approach
logitPred <- predict(logitModel, validationData, type = "response")
logitClass <- ifelse(logitPred > 0.5, "Yes", "No")
validationError <- mean(logitClass != validationData$default)
print("Validation Error (Logistic Regression):")
## [1] "Validation Error (Logistic Regression):"
print(validationError)
## [1] 0.02740548
# (c) Repeat the process three times
errors <- rep(0, 3)
for (i in 1:3) {
trainIndex <- createDataPartition(Default$default, p = .5, list = FALSE)
trainData <- Default[trainIndex,]
validationData <- Default[-trainIndex,]
logitModel <- glm(default ~ income + balance, data = trainData, family = binomial)
logitPred <- predict(logitModel, validationData, type = "response")
logitClass <- ifelse(logitPred > 0.5, "Yes", "No")
errors[i] <- mean(logitClass != validationData$default)
}
print("Validation Errors (Three Repeats):")
## [1] "Validation Errors (Three Repeats):"
print(errors)
## [1] 0.02580516 0.02740548 0.02680536
# (d) Logistic regression model with student variable
logitModelStudent <- glm(default ~ income + balance + student, data = trainData, family = binomial)
logitPredStudent <- predict(logitModelStudent, validationData, type = "response")
logitClassStudent <- ifelse(logitPredStudent > 0.5, "Yes", "No")
validationErrorStudent <- mean(logitClassStudent != validationData$default)
print("Validation Error (Logistic Regression with Student):")
## [1] "Validation Error (Logistic Regression with Student):"
print(validationErrorStudent)
## [1] 0.02620524
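# The estimates above vary with the split; averaging over more random splits
# (10 here, a sketch) gives a steadier picture of the test error.
manyErrors <- replicate(10, {
  idx <- createDataPartition(Default$default, p = .5, list = FALSE)
  fit <- glm(default ~ income + balance, data = Default[idx, ], family = binomial)
  pred <- ifelse(predict(fit, Default[-idx, ], type = "response") > 0.5, "Yes", "No")
  mean(pred != Default[-idx, ]$default)
})
c(mean = mean(manyErrors), sd = sd(manyErrors))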
# Load the College dataset
data("College")
# Set a random seed
set.seed(123)
# Split the data into training and test sets
trainIndex <- createDataPartition(College$Apps, p = .7, list = FALSE)
trainData <- College[trainIndex,]
testData <- College[-trainIndex,]
# Separate predictors and response
trainPredictors <- trainData[ , -which(names(trainData) == "Apps")]
trainResponse <- trainData$Apps
testPredictors <- testData[ , -which(names(testData) == "Apps")]
testResponse <- testData$Apps
# Center and scale the predictors (kept for reference; lm below uses the raw
# data, and glmnet standardizes internally).
preProc <- preProcess(trainPredictors, method = c("center", "scale"))
trainPredictorsScaled <- predict(preProc, trainPredictors)
testPredictorsScaled <- predict(preProc, testPredictors)
lmModel <- lm(Apps ~ ., data = trainData)
lmPred <- predict(lmModel, testData)
lmError <- mean((lmPred - testResponse)^2)
print("Test Error (Linear Model):")
## [1] "Test Error (Linear Model):"
print(lmError)
## [1] 1882074
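# The MSE is hard to read at this scale; the RMSE expresses the typical error
# in number of applications.
sqrt(lmError)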
x <- model.matrix(Apps ~ ., trainData)[,-1]
y <- trainData$Apps
cvRidge <- cv.glmnet(x, y, alpha = 0)
bestLambda <- cvRidge$lambda.min
ridgeModel <- glmnet(x, y, alpha = 0, lambda = bestLambda)
xTest <- model.matrix(Apps ~ ., testData)[,-1]
ridgePred <- predict(ridgeModel, s = bestLambda, newx = xTest)
ridgeError <- mean((ridgePred - testResponse)^2)
print("Test Error (Ridge Regression):")
## [1] "Test Error (Ridge Regression):"
print(ridgeError)
## [1] 3270111
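# Optionally, glmnet's built-in CV plot shows the error curve across the
# lambda path and where lambda.min falls.
plot(cvRidge)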
cvLasso <- cv.glmnet(x, y, alpha = 1)
bestLambdaLasso <- cvLasso$lambda.min
lassoModel <- glmnet(x, y, alpha = 1, lambda = bestLambdaLasso)
lassoPred <- predict(lassoModel, s = bestLambdaLasso, newx = xTest)
lassoError <- mean((lassoPred - testResponse)^2)
print("Test Error (Lasso):")
## [1] "Test Error (Lasso):"
print(lassoError)
## [1] 1946216
numNonZeroCoef <- sum(coef(lassoModel, s = bestLambdaLasso) != 0)  # includes the intercept
print("Number of Non-Zero Coefficients (Lasso):")
## [1] "Number of Non-Zero Coefficients (Lasso):"
print(numNonZeroCoef)
## [1] 17
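# To see which predictors the lasso actually dropped (the count above includes
# the intercept), inspect the coefficient vector directly.
coef(lassoModel, s = bestLambdaLasso)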
pcrModel <- pcr(Apps ~ ., data = trainData, scale = TRUE, validation = "CV")
validationplot(pcrModel, val.type = "MSEP")
# pcrModel$validation has no MSEP element; use MSEP() instead, skipping the
# 0-component (intercept-only) entry so the index equals the component count.
bestComponents <- which.min(MSEP(pcrModel)$val["adjCV", 1, -1])
pcrPred <- predict(pcrModel, testData, ncomp = bestComponents)
pcrError <- mean((pcrPred - testResponse)^2)
print("Test Error (PCR):")
## [1] "Test Error (PCR):"
print(pcrError)
plsModel <- plsr(Apps ~ ., data = trainData, scale = TRUE, validation = "CV")
validationplot(plsModel, val.type = "MSEP")
# Same fix as for PCR: select components via MSEP(), skipping the 0-component entry.
bestComponentsPls <- which.min(MSEP(plsModel)$val["adjCV", 1, -1])
plsPred <- predict(plsModel, testData, ncomp = bestComponentsPls)
plsError <- mean((plsPred - testResponse)^2)
print("Test Error (PLS):")
## [1] "Test Error (PLS):"
print(plsError)
# (g) Comment on the results
# Test MSEs: linear model 1882074, ridge regression 3270111, lasso 1946216.
# The linear model and the lasso perform similarly, while ridge does markedly
# worse at its cross-validated lambda. The PCR and PLS errors depend on the
# number of components selected above; when all components are retained, both
# methods reduce to the least-squares fit.
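# A compact side-by-side view of the five test errors (a sketch; pcrError and
# plsError come from the corrected component selection above).
data.frame(Model = c("OLS", "Ridge", "Lasso", "PCR", "PLS"),
           TestMSE = c(lmError, ridgeError, lassoError, pcrError, plsError))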
# Load necessary libraries
library(MASS)
library(caret)
library(e1071)
library(class)
library(glmnet)
# Set seed for reproducibility
set.seed(123)
# Generate synthetic data
n <- 1000
income <- rnorm(n, mean = 50000, sd = 15000)
balance <- rnorm(n, mean = 1500, sd = 500)
student <- sample(c("Yes", "No"), n, replace = TRUE)
default <- ifelse(balance / income + rnorm(n, mean = 0.2, sd = 0.1) > 0.5, "Yes", "No")
# Create a data frame
data <- data.frame(income, balance, student, default)
# Convert default to a binary variable
data$default <- ifelse(data$default == "Yes", 1, 0)
# Convert student to a binary variable
data$student <- ifelse(data$student == "Yes", 1, 0)
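# With these parameters balance/income averages roughly 0.03, so simulated
# defaults are rare; tabulating the response makes the imbalance explicit
# before modeling.
table(data$default)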
# Split the data into training and testing sets
trainIndex <- createDataPartition(data$default, p = .7, list = FALSE, times = 1)
trainData <- data[trainIndex,]
testData <- data[-trainIndex,]
# Logistic Regression
logitModel <- glm(default ~ income + balance + student, data = trainData, family = binomial)
logitPred <- predict(logitModel, testData, type = "response")
logitClass <- ifelse(logitPred > 0.5, 1, 0)
logitConfMatrix <- confusionMatrix(as.factor(logitClass), as.factor(testData$default))
## Warning in confusionMatrix.default(as.factor(logitClass),
## as.factor(testData$default)): Levels are not in the same order for reference
## and data. Refactoring data to match.
print("Logistic Regression Confusion Matrix:")
## [1] "Logistic Regression Confusion Matrix:"
print(logitConfMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 299 1
## 1 0 0
##
## Accuracy : 0.9967
## 95% CI : (0.9816, 0.9999)
## No Information Rate : 0.9967
## P-Value [Acc > NIR] : 0.7358
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9967
## Neg Pred Value : NaN
## Prevalence : 0.9967
## Detection Rate : 0.9967
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
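# Given the imbalance, 99.67% accuracy simply mirrors the majority-class rate;
# a lower cutoff (0.1 here, purely illustrative) shows the sensitivity/
# specificity trade-off.
logitClassLow <- ifelse(logitPred > 0.1, 1, 0)
table(Predicted = logitClassLow, Actual = testData$default)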
# Linear Discriminant Analysis (LDA)
ldaModel <- lda(default ~ income + balance + student, data = trainData)
ldaPred <- predict(ldaModel, testData)
ldaClass <- ldaPred$class
ldaConfMatrix <- confusionMatrix(as.factor(ldaClass), as.factor(testData$default))
print("LDA Confusion Matrix:")
## [1] "LDA Confusion Matrix:"
print(ldaConfMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 299 1
## 1 0 0
##
## Accuracy : 0.9967
## 95% CI : (0.9816, 0.9999)
## No Information Rate : 0.9967
## P-Value [Acc > NIR] : 0.7358
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9967
## Neg Pred Value : NaN
## Prevalence : 0.9967
## Detection Rate : 0.9967
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
# Naive Bayes
nbModel <- naiveBayes(default ~ income + balance + student, data = trainData)
nbPred <- predict(nbModel, testData)
nbConfMatrix <- confusionMatrix(as.factor(nbPred), as.factor(testData$default))
print("Naive Bayes Confusion Matrix:")
## [1] "Naive Bayes Confusion Matrix:"
print(nbConfMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 299 1
## 1 0 0
##
## Accuracy : 0.9967
## 95% CI : (0.9816, 0.9999)
## No Information Rate : 0.9967
## P-Value [Acc > NIR] : 0.7358
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9967
## Neg Pred Value : NaN
## Prevalence : 0.9967
## Detection Rate : 0.9967
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
# k-Nearest Neighbors (KNN)
# As with the Boston example, scale only the predictors so the 0/1 labels
# passed to knn() keep their original levels.
preProcValues <- preProcess(trainData[, -4], method = c("center", "scale"))
trainDataNorm <- predict(preProcValues, trainData[, -4])
testDataNorm <- predict(preProcValues, testData[, -4])
knnModel <- knn(trainDataNorm, testDataNorm, cl = factor(trainData$default), k = 5)
knnConfMatrix <- confusionMatrix(knnModel, as.factor(testData$default))
print("KNN Confusion Matrix:")
## [1] "KNN Confusion Matrix:"
print(knnConfMatrix)
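# Alternatively, caret can tune k by cross-validation (a sketch using caret's
# built-in "knn" method and its preProcess option).
knnTuned <- train(x = trainData[, c("income", "balance", "student")],
                  y = factor(trainData$default),
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneGrid = data.frame(k = c(1, 3, 5, 10)))
knnTuned$bestTune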