# Load necessary libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
## Warning: package 'class' was built under R version 4.4.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
library(ggplot2)
# Make the Weekly data frame available in the workspace
data("Weekly")
# One summary per column: quantiles and mean for numeric columns,
# level counts for the Direction factor
summary(object = Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Scatterplot matrix across every column of Weekly
pairs(x = Weekly, main = "Scatterplot Matrix of Weekly Data")
# Histogram of Volume in 0.1-wide bins, one bar per Direction placed side by side
ggplot(data = Weekly, mapping = aes(x = Volume, fill = Direction)) +
  geom_histogram(position = "dodge", binwidth = 0.1) +
  labs(x = "Volume", y = "Count", title = "Volume vs Direction")
Explanation:
summary(Weekly): Provides a numerical summary of the data, including the mean, median, and quartiles for numeric variables and frequency counts for categorical variables.
pairs(Weekly): Creates a scatterplot matrix to visualize relationships between all pairs of variables.
ggplot: Plots histograms of Volume by Direction to visualize how volume varies with market direction.
# Logistic regression of Direction on the five lagged returns plus Volume,
# fitted to every row of Weekly
model_logistic <- glm(
  formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = "binomial",
  data = Weekly
)
# Coefficient table, deviances and AIC for the fitted model
summary(model_logistic)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
glm: Fits a logistic regression model with Direction as the response variable and the lag variables plus Volume as predictors.
family="binomial": Specifies that this is a logistic regression model.
summary(model_logistic): Displays the coefficients, standard errors, z-values, and p-values for each predictor, indicating which are statistically significant.
(c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
# Predict on the full data set; type = "response" returns fitted
# probabilities rather than values on the link (log-odds) scale
predictions_logistic <- predict(model_logistic, type = "response")
# Convert predictions to class labels. The labels are stored as a factor
# carrying both Direction levels so the confusion matrix stays 2 x 2 even if
# one class is never predicted; with a plain character vector table() would
# drop the missing column and diag() would then read the wrong cells.
class_labels_logistic <- factor(
  ifelse(predictions_logistic > 0.5, "Up", "Down"),
  levels = levels(Weekly$Direction)
)
# Compute the confusion matrix (rows = actual, columns = predicted)
confusion_matrix_logistic <- table(Weekly$Direction, class_labels_logistic)
# Print the confusion matrix
print(confusion_matrix_logistic)
## class_labels_logistic
## Down Up
## Down 54 430
## Up 48 557
# Overall fraction of correct predictions: diagonal cells over all cells
accuracy_logistic <- sum(diag(confusion_matrix_logistic)) / sum(confusion_matrix_logistic)
print(paste("Accuracy:", accuracy_logistic))
## [1] "Accuracy: 0.561065197428834"
predict(model_logistic): Generates predicted probabilities for the full data set.
ifelse: Converts these probabilities into class labels (“Up” or “Down”).
table: Creates a confusion matrix comparing actual and predicted classes.
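sum(diag(confusion_matrix_logistic)) / sum(confusion_matrix_logistic): Calculates the overall accuracy.
Interpretation: the model predicts "Up" for most weeks. It is correct on 557 of the 605 "Up" weeks but on only 54 of the 484 "Down" weeks, so nearly all of its mistakes are "Down" weeks misclassified as "Up". Its 56.1% accuracy is barely better than always predicting "Up" (605/1089, about 55.6%).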
(d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
# Split data into training (Year <= 2008) and testing (Year > 2008) sets
train_data <- Weekly[Weekly$Year <= 2008, ]
test_data <- Weekly[Weekly$Year > 2008, ]
# Perform logistic regression using Lag2 as the only predictor
model_logistic_train <- glm(Direction ~ Lag2, data = train_data, family = "binomial")
# Predict on the test data; type = "response" returns probabilities
predictions_logistic_test <- predict(model_logistic_train, newdata = test_data, type = "response")
# Convert predictions to class labels. A factor with both Direction levels
# guarantees a square confusion matrix even when one class is never
# predicted, so the diag()-based accuracy below cannot read the wrong cells.
class_labels_logistic_test <- factor(
  ifelse(predictions_logistic_test > 0.5, "Up", "Down"),
  levels = levels(test_data$Direction)
)
# Compute the confusion matrix for the test data (rows = actual)
confusion_matrix_logistic_test <- table(test_data$Direction, class_labels_logistic_test)
# Print the confusion matrix
print(confusion_matrix_logistic_test)
## class_labels_logistic_test
## Down Up
## Down 9 34
## Up 5 56
# Overall fraction of correct predictions for the test data
accuracy_logistic_test <- sum(diag(confusion_matrix_logistic_test)) / sum(confusion_matrix_logistic_test)
print(paste("Test Accuracy:", accuracy_logistic_test))
## [1] "Test Accuracy: 0.625"
train_data and test_data: Split the data into training (1990-2008) and testing (2009-2010) sets.
glm(Direction ~ Lag2): Fits a logistic regression model using only Lag2 as a predictor on the training data.
The rest of the code predicts on the test data, computes the confusion matrix, and calculates the test accuracy.
(e) Repeat (d) using LDA.
# Linear discriminant analysis of Direction on Lag2, training rows only
model_lda <- lda(Direction ~ Lag2, data = train_data)
# Predicted class for every held-out row
predictions_lda_test <- predict(model_lda, newdata = test_data)[["class"]]
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_lda_test <- table(test_data[["Direction"]], predictions_lda_test)
# Show the confusion matrix
print(confusion_matrix_lda_test)
## predictions_lda_test
## Down Up
## Down 9 34
## Up 5 56
# Share of held-out rows on the diagonal, i.e. classified correctly
accuracy_lda_test <- sum(diag(confusion_matrix_lda_test)) /
  sum(confusion_matrix_lda_test)
print(paste0("Test Accuracy (LDA): ", accuracy_lda_test))
## [1] "Test Accuracy (LDA): 0.625"
lda: Fits a linear discriminant analysis model using Lag2 as a predictor on the training data.
The rest of the code predicts on the test data, computes the confusion matrix, and calculates the test accuracy.
(f) Repeat (d) using QDA.
# Quadratic discriminant analysis of Direction on Lag2, training rows only
model_qda <- qda(Direction ~ Lag2, data = train_data)
# Predicted class for every held-out row
predictions_qda_test <- predict(model_qda, newdata = test_data)[["class"]]
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_qda_test <- table(test_data[["Direction"]], predictions_qda_test)
# Show the confusion matrix
print(confusion_matrix_qda_test)
## predictions_qda_test
## Down Up
## Down 0 43
## Up 0 61
# Share of held-out rows on the diagonal, i.e. classified correctly
accuracy_qda_test <- sum(diag(confusion_matrix_qda_test)) /
  sum(confusion_matrix_qda_test)
print(paste0("Test Accuracy (QDA): ", accuracy_qda_test))
## [1] "Test Accuracy (QDA): 0.586538461538462"
qda: Fits a quadratic discriminant analysis model using Lag2 as a predictor on the training data.
The rest of the code predicts on the test data, computes the confusion matrix, and calculates the test accuracy.
(g) Repeat (d) using KNN with K = 1.
# Build one-column feature matrices for knn(). drop = FALSE keeps the
# data-frame shape explicitly instead of relying on as.matrix() to rebuild a
# column from a dropped vector.
train_features <- as.matrix(train_data[, "Lag2", drop = FALSE])
test_features <- as.matrix(test_data[, "Lag2", drop = FALSE])
# Check dimensions explicitly
if (ncol(train_features) != ncol(test_features)) {
  stop("Dimensions of train and test features do not match.")
}
# Check for missing values (scalar condition, so use short-circuit ||)
if (anyNA(train_features) || anyNA(test_features)) {
  stop("There are missing values in the data.")
}
# knn() breaks ties at random, so fix the seed to make the result repeatable
set.seed(1)
# Classify each test row by its single nearest training row on Lag2
predictions_knn_test <- knn(
  train = train_features,
  test = test_features,
  cl = train_data$Direction,
  k = 1
)
# Compute the confusion matrix for the test data (rows = actual)
confusion_matrix_knn_test <- table(test_data$Direction, predictions_knn_test)
# Print the confusion matrix
print(confusion_matrix_knn_test)
## predictions_knn_test
## Down Up
## Down 21 22
## Up 30 31
# Overall fraction of correct predictions for the test data
accuracy_knn_test <- sum(diag(confusion_matrix_knn_test)) / sum(confusion_matrix_knn_test)
print(paste("Test Accuracy (KNN):", accuracy_knn_test))
## [1] "Test Accuracy (KNN): 0.5"
knn: Fits a K-nearest neighbors model with K=1 using Lag2 as a predictor on the training data.
The rest of the code predicts on the test data, computes the confusion matrix, and calculates the test accuracy.
(h) Repeat (d) using naive Bayes.
# Naive Bayes classifier of Direction on Lag2, training rows only
library(e1071)
model_nb <- naiveBayes(Direction ~ Lag2, data = train_data)
# Predicted class label for every held-out row
predictions_nb_test <- predict(model_nb, newdata = test_data, type = "class")
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_nb_test <- table(test_data[["Direction"]], predictions_nb_test)
# Show the confusion matrix
print(confusion_matrix_nb_test)
## predictions_nb_test
## Down Up
## Down 0 43
## Up 0 61
# Share of held-out rows on the diagonal, i.e. classified correctly
accuracy_nb_test <- sum(diag(confusion_matrix_nb_test)) /
  sum(confusion_matrix_nb_test)
print(paste0("Test Accuracy (Naive Bayes): ", accuracy_nb_test))
## [1] "Test Accuracy (Naive Bayes): 0.586538461538462"
naiveBayes: Fits a naive Bayes model using Lag2 as a predictor on the training data.
The rest of the code predicts on the test data, computes the confusion matrix, and calculates the test accuracy.
(i) Which of these methods appears to provide the best results on this data?
# Held-out accuracy of each method, recomputed from its test confusion matrix.
# Note: accuracy_logistic is reassigned here, replacing the full-data
# accuracy stored under the same name earlier in the script.
accuracy_logistic <- sum(diag(confusion_matrix_logistic_test)) / sum(confusion_matrix_logistic_test)
accuracy_lda <- sum(diag(confusion_matrix_lda_test)) / sum(confusion_matrix_lda_test)
accuracy_qda <- sum(diag(confusion_matrix_qda_test)) / sum(confusion_matrix_qda_test)
accuracy_knn <- sum(diag(confusion_matrix_knn_test)) / sum(confusion_matrix_knn_test)
accuracy_nb <- sum(diag(confusion_matrix_nb_test)) / sum(confusion_matrix_nb_test)
# Comparing the accuracies
accuracies <- c(
  Logistic = accuracy_logistic,
  LDA = accuracy_lda,
  QDA = accuracy_qda,
  KNN = accuracy_knn,
  NaiveBayes = accuracy_nb
)
# Report every method that attains the top accuracy. which.max() would return
# only the first of several tied methods and silently hide the others.
best_method <- names(accuracies)[accuracies == max(accuracies)]
print(paste("The best method is:", paste(best_method, collapse = ", ")))
## [1] "The best method is: Logistic, LDA"
(j) Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
# Example: logistic regression on Lag1, Lag2 and log-transformed Volume
model_logistic_subset <- glm(Direction ~ Lag1 + Lag2 + log(Volume), data = train_data, family = "binomial")
# Predict on the test data; type = "response" returns probabilities
predictions_logistic_subset_test <- predict(model_logistic_subset, newdata = test_data, type = "response")
# Convert predictions to class labels. A factor with both Direction levels
# keeps the confusion matrix square even if one class is never predicted, so
# the diag()-based accuracy below always reads the correct cells.
class_labels_logistic_subset_test <- factor(
  ifelse(predictions_logistic_subset_test > 0.5, "Up", "Down"),
  levels = levels(test_data$Direction)
)
# Compute the confusion matrix for the test data (rows = actual)
confusion_matrix_logistic_subset_test <- table(test_data$Direction, class_labels_logistic_subset_test)
# Print the confusion matrix
print(confusion_matrix_logistic_subset_test)
## class_labels_logistic_subset_test
## Down Up
## Down 20 23
## Up 18 43
# Overall fraction of correct predictions for the test data
accuracy_logistic_subset_test <- sum(diag(confusion_matrix_logistic_subset_test)) / sum(confusion_matrix_logistic_subset_test)
print(paste("Test Accuracy (Subset):", accuracy_logistic_subset_test))
## [1] "Test Accuracy (Subset): 0.605769230769231"
# Example subsets of features, written as formula terms.
# NOTE(review): these vectors are not referenced by the models shown below,
# which spell their formulas out by hand.
subset1 <- c("Lag1", "Lag2", "Volume")
subset2 <- c("Lag1", "Lag2", "log(Volume)")
subset3 <- c("Lag1", "Lag2", "Lag1:Lag2")
Step 2: Implement Models with Different Feature Sets
Now, we will implement each model using these different subsets of features.
Logistic Regression
# Logistic Regression with subset1 (Lag1, Lag2, Volume)
model_logistic_subset1 <- glm(Direction ~ Lag1 + Lag2 + Volume, data = train_data, family = "binomial")
# Predict on the test data; type = "response" returns probabilities
predictions_logistic_subset1_test <- predict(model_logistic_subset1, newdata = test_data, type = "response")
# Convert predictions to class labels. A factor with both Direction levels
# keeps the confusion matrix square even if one class is never predicted, so
# the diag()-based accuracy below always reads the correct cells.
class_labels_logistic_subset1_test <- factor(
  ifelse(predictions_logistic_subset1_test > 0.5, "Up", "Down"),
  levels = levels(test_data$Direction)
)
# Compute the confusion matrix for the test data (rows = actual)
confusion_matrix_logistic_subset1_test <- table(test_data$Direction, class_labels_logistic_subset1_test)
# Print the confusion matrix
print(confusion_matrix_logistic_subset1_test)
## class_labels_logistic_subset1_test
## Down Up
## Down 27 16
## Up 33 28
# Overall fraction of correct predictions for the test data
accuracy_logistic_subset1_test <- sum(diag(confusion_matrix_logistic_subset1_test)) / sum(confusion_matrix_logistic_subset1_test)
print(paste("Test Accuracy (Logistic Subset1):", accuracy_logistic_subset1_test))
## [1] "Test Accuracy (Logistic Subset1): 0.528846153846154"
LDA
# Linear discriminant analysis on Lag1, Lag2 and Volume (training rows)
model_lda_subset1 <- lda(Direction ~ Lag1 + Lag2 + Volume, data = train_data)
# Predicted class for every held-out row
predictions_lda_subset1_test <-
  predict(model_lda_subset1, newdata = test_data)[["class"]]
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_lda_subset1_test <-
  table(test_data[["Direction"]], predictions_lda_subset1_test)
# Show the confusion matrix
print(confusion_matrix_lda_subset1_test)
## predictions_lda_subset1_test
## Down Up
## Down 27 16
## Up 33 28
# Share of held-out rows on the diagonal, i.e. classified correctly
accuracy_lda_subset1_test <- sum(diag(confusion_matrix_lda_subset1_test)) /
  sum(confusion_matrix_lda_subset1_test)
print(paste0("Test Accuracy (LDA Subset1): ", accuracy_lda_subset1_test))
## [1] "Test Accuracy (LDA Subset1): 0.528846153846154"
QDA
# Quadratic discriminant analysis on Lag1, Lag2 and Volume (training rows)
model_qda_subset1 <- qda(Direction ~ Lag1 + Lag2 + Volume, data = train_data)
# Predicted class for every held-out row
predictions_qda_subset1_test <-
  predict(model_qda_subset1, newdata = test_data)[["class"]]
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_qda_subset1_test <-
  table(test_data[["Direction"]], predictions_qda_subset1_test)
# Show the confusion matrix
print(confusion_matrix_qda_subset1_test)
## predictions_qda_subset1_test
## Down Up
## Down 31 12
## Up 44 17
# Share of held-out rows on the diagonal, i.e. classified correctly
accuracy_qda_subset1_test <- sum(diag(confusion_matrix_qda_subset1_test)) /
  sum(confusion_matrix_qda_subset1_test)
print(paste0("Test Accuracy (QDA Subset1): ", accuracy_qda_subset1_test))
## [1] "Test Accuracy (QDA Subset1): 0.461538461538462"
KNN
# Feature tables for KNN: the Lag1, Lag2 and Volume columns of each split
train_features_subset1 <- train_data[c("Lag1", "Lag2", "Volume")]
test_features_subset1 <- test_data[c("Lag1", "Lag2", "Volume")]
# Label each test row with the class of its single nearest training row
predictions_knn_subset1_test <- knn(
  train = train_features_subset1,
  test = test_features_subset1,
  cl = train_data$Direction,
  k = 1
)
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_knn_subset1_test <-
  table(test_data[["Direction"]], predictions_knn_subset1_test)
# Show the confusion matrix
print(confusion_matrix_knn_subset1_test)
## predictions_knn_subset1_test
## Down Up
## Down 26 17
## Up 35 26
# Share of held-out rows on the diagonal, i.e. classified correctly
accuracy_knn_subset1_test <- sum(diag(confusion_matrix_knn_subset1_test)) /
  sum(confusion_matrix_knn_subset1_test)
print(paste0("Test Accuracy (KNN Subset1): ", accuracy_knn_subset1_test))
## [1] "Test Accuracy (KNN Subset1): 0.5"
Naive Bayes
# Naive Bayes classifier on Lag1, Lag2 and Volume (training rows)
model_nb_subset1 <- naiveBayes(Direction ~ Lag1 + Lag2 + Volume, data = train_data)
# Predicted class label for every held-out row
predictions_nb_subset1_test <-
  predict(model_nb_subset1, newdata = test_data, type = "class")
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_nb_subset1_test <-
  table(test_data[["Direction"]], predictions_nb_subset1_test)
# Show the confusion matrix
print(confusion_matrix_nb_subset1_test)
## predictions_nb_subset1_test
## Down Up
## Down 41 2
## Up 58 3
# Share of held-out rows on the diagonal, i.e. classified correctly
accuracy_nb_subset1_test <- sum(diag(confusion_matrix_nb_subset1_test)) /
  sum(confusion_matrix_nb_subset1_test)
print(paste0("Test Accuracy (Naive Bayes Subset1): ", accuracy_nb_subset1_test))
## [1] "Test Accuracy (Naive Bayes Subset1): 0.423076923076923"
Comparing Results
# Example comparison: held-out accuracy of every method on subset1
accuracies <- c(
  Logistic_Subset1 = accuracy_logistic_subset1_test,
  LDA_Subset1 = accuracy_lda_subset1_test,
  QDA_Subset1 = accuracy_qda_subset1_test,
  KNN_Subset1 = accuracy_knn_subset1_test,
  NaiveBayes_Subset1 = accuracy_nb_subset1_test
)
# Print the best method(s). Every method that attains the top accuracy is
# reported; which.max() would return only the first of several tied methods.
best_method <- names(accuracies)[accuracies == max(accuracies)]
print(paste("The best method is:", paste(best_method, collapse = ", ")))
## [1] "The best method is: Logistic_Subset1, LDA_Subset1"
EXERCISE 14
(a) Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.
# Pull the Auto data frame out of the ISLR2 package
data("Auto", package = "ISLR2")
# mpg01 is 1 where mpg lies strictly above the median of mpg, otherwise 0
Auto[["mpg01"]] <- as.numeric(Auto[["mpg"]] > median(Auto[["mpg"]]))
(b) Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and box plots may be useful tools to answer this question. Describe your findings.
# Plot the 0/1 indicator against three candidate predictors
library(ggplot2)
ggplot(data = Auto, mapping = aes(x = cylinders, y = mpg01)) +
  geom_point()
ggplot(data = Auto, mapping = aes(x = displacement, y = mpg01)) +
  geom_point()
ggplot(data = Auto, mapping = aes(x = weight, y = mpg01)) +
  geom_point()
# Fix the random seed so the split below is repeatable
set.seed(123)
# Randomly assign 70% of the rows to training; the remaining rows form the test set
train_index <- sample(nrow(Auto), 0.7 * nrow(Auto))
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]
sample: Randomly selects indices for the training set.
train_data and test_data: Split the data into training and testing sets.
# Linear discriminant analysis of mpg01 on cylinders, displacement and weight
model_lda <- lda(mpg01 ~ cylinders + displacement + weight, data = train_data)
# Predicted class for every test row
predictions_lda_test <- predict(model_lda, newdata = test_data)[["class"]]
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_lda_test <- table(test_data[["mpg01"]], predictions_lda_test)
# Show the confusion matrix
print(confusion_matrix_lda_test)
## predictions_lda_test
## 0 1
## 0 50 10
## 1 4 54
# Share of test rows on the diagonal, i.e. classified correctly
accuracy_lda_test <- sum(diag(confusion_matrix_lda_test)) /
  sum(confusion_matrix_lda_test)
print(paste0("Test Accuracy (LDA): ", accuracy_lda_test))
## [1] "Test Accuracy (LDA): 0.88135593220339"
lda: Fits a linear discriminant analysis model using selected predictors on the training data.
The rest of the code predicts on the test data, computes the confusion matrix, and calculates the test accuracy.
# Test error for LDA: one minus the share of test rows on the diagonal
test_error_lda <- 1 - (
  sum(diag(confusion_matrix_lda_test)) / sum(confusion_matrix_lda_test)
)
print(paste0("Test Error (LDA): ", test_error_lda))
## [1] "Test Error (LDA): 0.11864406779661"
(e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b).
# Quadratic discriminant analysis of mpg01 on cylinders, displacement and weight
model_qda <- qda(mpg01 ~ cylinders + displacement + weight, data = train_data)
# Predicted class for every test row
predictions_qda_test <- predict(model_qda, newdata = test_data)[["class"]]
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_qda_test <- table(test_data[["mpg01"]], predictions_qda_test)
# Show the confusion matrix
print(confusion_matrix_qda_test)
## predictions_qda_test
## 0 1
## 0 54 6
## 1 4 54
# Share of test rows on the diagonal, i.e. classified correctly
accuracy_qda_test <- sum(diag(confusion_matrix_qda_test)) /
  sum(confusion_matrix_qda_test)
print(paste0("Test Accuracy (QDA): ", accuracy_qda_test))
## [1] "Test Accuracy (QDA): 0.915254237288136"
qda: Fits a quadratic discriminant analysis model using selected predictors on the training data.
The rest of the code predicts on the test data, computes the
confusion matrix, and calculates the test accuracy.
What is the test error of the model obtained?
# Test error for QDA: one minus the share of test rows on the diagonal
test_error_qda <- 1 - (
  sum(diag(confusion_matrix_qda_test)) / sum(confusion_matrix_qda_test)
)
print(paste0("Test Error (QDA): ", test_error_qda))
## [1] "Test Error (QDA): 0.0847457627118644"
(f) Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b).
# Perform logistic regression on the training data
model_logistic <- glm(mpg01 ~ cylinders + displacement + weight, data = train_data, family = "binomial")
# Predict on the test data; type = "response" returns probabilities
predictions_logistic_test <- predict(model_logistic, newdata = test_data, type = "response")
# Convert predictions to class labels. A factor with both 0 and 1 as levels
# keeps the confusion matrix square even if one class is never predicted, so
# the diag()-based accuracy below always reads the correct cells.
class_labels_logistic_test <- factor(
  ifelse(predictions_logistic_test > 0.5, 1, 0),
  levels = c(0, 1)
)
# Compute the confusion matrix for the test data (rows = actual)
confusion_matrix_logistic_test <- table(test_data$mpg01, class_labels_logistic_test)
# Print the confusion matrix
print(confusion_matrix_logistic_test)
## class_labels_logistic_test
## 0 1
## 0 52 8
## 1 5 53
# Overall fraction of correct predictions for the test data
accuracy_logistic_test <- sum(diag(confusion_matrix_logistic_test)) / sum(confusion_matrix_logistic_test)
print(paste("Test Accuracy (Logistic):", accuracy_logistic_test))
## [1] "Test Accuracy (Logistic): 0.889830508474576"
glm: Fits a logistic regression model using selected predictors on the training data.
The rest of the code predicts on the test data, computes the confusion matrix, and calculates the test accuracy.
What is the test error of the model obtained?
# Test error for logistic regression: one minus the share of test rows on
# the diagonal
test_error_logistic <- 1 - (
  sum(diag(confusion_matrix_logistic_test)) / sum(confusion_matrix_logistic_test)
)
print(paste0("Test Error (Logistic): ", test_error_logistic))
## [1] "Test Error (Logistic): 0.110169491525424"
(g) Perform naive Bayes on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b).
# Naive Bayes classifier of mpg01 on cylinders, displacement and weight
library(e1071)
model_nb <- naiveBayes(mpg01 ~ cylinders + displacement + weight, data = train_data)
# Predicted class label for every test row
predictions_nb_test <- predict(model_nb, newdata = test_data, type = "class")
# Cross-tabulate actual (rows) against predicted (columns)
confusion_matrix_nb_test <- table(test_data[["mpg01"]], predictions_nb_test)
# Show the confusion matrix
print(confusion_matrix_nb_test)
## predictions_nb_test
## 0 1
## 0 52 8
## 1 4 54
# Share of test rows on the diagonal, i.e. classified correctly
accuracy_nb_test <- sum(diag(confusion_matrix_nb_test)) /
  sum(confusion_matrix_nb_test)
print(paste0("Test Accuracy (Naive Bayes): ", accuracy_nb_test))
## [1] "Test Accuracy (Naive Bayes): 0.898305084745763"
naiveBayes: Fits a naive Bayes model using selected predictors on the training data.
The rest of the code predicts on the test data, computes the confusion matrix, and calculates the test accuracy.
What is the test error of the model obtained?
# Test error for naive Bayes: one minus the share of test rows on the diagonal
test_error_nb <- 1 - (
  sum(diag(confusion_matrix_nb_test)) / sum(confusion_matrix_nb_test)
)
print(paste0("Test Error (Naive Bayes): ", test_error_nb))
## [1] "Test Error (Naive Bayes): 0.101694915254237"
(h) Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?
# Load necessary libraries
library(class)
library(ISLR2)
# Assuming cylinders and displacement are most associated with mpg01.
# NOTE(review): the LDA, QDA, logistic and naive Bayes fits above also use
# weight; confirm whether leaving it out here is intentional. The features are
# also passed to knn() unscaled, so displacement dominates the distance.
train_features <- train_data[, c("cylinders", "displacement")]
test_features <- test_data[, c("cylinders", "displacement")]
# Try several values of K
K_values <- c(1, 3, 5, 7, 9)
# Initialize a vector to store test errors
test_errors <- rep(0, length(K_values))
# Loop through each K value. knn() breaks ties at random, so the results
# depend on the RNG state left by the set.seed(123) call made before the
# train/test split.
for (i in seq_along(K_values)) {
  # Perform KNN with the current K value
  predictions_knn_test <- knn(train_features, test_features, train_data$mpg01, k = K_values[i])
  # Compute the confusion matrix for the test data (rows = actual)
  confusion_matrix_knn_test <- table(test_data$mpg01, predictions_knn_test)
  # Calculate the test error: one minus the share of rows on the diagonal
  test_error_knn <- 1 - sum(diag(confusion_matrix_knn_test)) / sum(confusion_matrix_knn_test)
  # Store the test error
  test_errors[i] <- test_error_knn
}
# Print the test errors, each labelled with the K that produced it
print(setNames(test_errors, paste0("K=", K_values)))
## K=1 K=3 K=5 K=7 K=9
## 0.1016949 0.1101695 0.1016949 0.1016949 0.1101695
# Determine which K value(s) perform best. Every K that attains the lowest
# test error is reported; which.min() would return only the first of several
# tied values.
best_K <- K_values[test_errors == min(test_errors)]
print(paste("The best K value is:", paste(best_K, collapse = ", ")))
## [1] "The best K value is: 1, 5, 7"
knn: Classifies each test observation by a vote among its K nearest training observations; here it is run for K = 1, 3, 5, 7 and 9 using cylinders and displacement as predictors.
The rest of the code computes the confusion matrix and the test error for each K, then reports the K with the lowest test error.