library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
data("Weekly")
Question 13
This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1, 089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
# Numerical summaries
print(summary(Weekly))
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
print(cor(Weekly[, c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume", "Today")]))
## Lag1 Lag2 Lag3 Lag4 Lag5
## Lag1 1.000000000 -0.07485305 0.05863568 -0.071273876 -0.008183096
## Lag2 -0.074853051 1.00000000 -0.07572091 0.058381535 -0.072499482
## Lag3 0.058635682 -0.07572091 1.00000000 -0.075395865 0.060657175
## Lag4 -0.071273876 0.05838153 -0.07539587 1.000000000 -0.075675027
## Lag5 -0.008183096 -0.07249948 0.06065717 -0.075675027 1.000000000
## Volume -0.064951313 -0.08551314 -0.06928771 -0.061074617 -0.058517414
## Today -0.075031842 0.05916672 -0.07124364 -0.007825873 0.011012698
## Volume Today
## Lag1 -0.06495131 -0.075031842
## Lag2 -0.08551314 0.059166717
## Lag3 -0.06928771 -0.071243639
## Lag4 -0.06107462 -0.007825873
## Lag5 -0.05851741 0.011012698
## Volume 1.00000000 -0.033077783
## Today -0.03307778 1.000000000
# Set up a 2x2 plotting layout
par(mfrow = c(2, 2))
# 1. Time Series Plot: Volume over Years
plot(Weekly$Year, Weekly$Volume, type = "l",
xlab = "Year", ylab = "Volume", main = "Volume over Time")
# 2. Histogram: Distribution of Today's Returns
hist(Weekly$Today, breaks = 30,
main = "Histogram of Today's Returns", xlab = "Today's Return (%)")
# 3. Boxplot: Today's Return by Market Direction
boxplot(Today ~ Direction, data = Weekly,
main = "Today's Return by Direction", ylab = "Today's Return (%)")
# 4. Pairs Plot: Lag Variables and Today's Return
pairs(Weekly[, c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Today")],
main = "Pairs Plot of Lag Variables & Today's Return")
# Fit the logistic regression model using all observations
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
# Print the summary of the model
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
# Obtain predicted probabilities and convert them into class labels ("Up" if >0.5, else "Down")
glm.probs <- predict(glm.fit, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
# Compute the confusion matrix: rows = Actual, columns = Predicted
confusion_matrix <- table(Actual = Weekly$Direction, Predicted = glm.pred)
print(confusion_matrix)
## Predicted
## Actual Down Up
## Down 54 430
## Up 48 557
# Calculate the overall fraction of correct predictions (accuracy)
accuracy <- mean(glm.pred == Weekly$Direction)
print(accuracy)
## [1] 0.5610652
# Create training set (years 1990 to 2008) and test set (years 2009 and 2010)
train <- Weekly[Weekly$Year <= 2008, ]
test <- Weekly[Weekly$Year >= 2009, ]
# Fit logistic regression model on training data with Lag2 as predictor
glm.fit <- glm(Direction ~ Lag2, data = train, family = binomial)
# Predict probabilities on the test set
glm.probs <- predict(glm.fit, newdata = test, type = "response")
# Classify observations: if probability > 0.5, predict "Up", else "Down"
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
# Compute the confusion matrix: rows = actual, columns = predicted
confusion_matrix <- table(Actual = test$Direction, Predicted = glm.pred)
print(confusion_matrix)
## Predicted
## Actual Down Up
## Down 9 34
## Up 5 56
# Calculate the overall fraction of correct predictions (accuracy)
accuracy <- mean(glm.pred == test$Direction)
print(accuracy)
## [1] 0.625
### LDA
lda.fit <- lda(Direction ~ Lag2, data = train)
lda.pred <- predict(lda.fit, newdata = test)$class
lda.cm <- table(Actual = test$Direction, Predicted = lda.pred)
cat("LDA Confusion Matrix:\n")
## LDA Confusion Matrix:
print(lda.cm)
## Predicted
## Actual Down Up
## Down 9 34
## Up 5 56
cat("LDA Accuracy:", mean(lda.pred == test$Direction), "\n\n")
## LDA Accuracy: 0.625
### QDA
qda.fit <- qda(Direction ~ Lag2, data = train)
qda.pred <- predict(qda.fit, newdata = test)$class
qda.cm <- table(Actual = test$Direction, Predicted = qda.pred)
cat("QDA Confusion Matrix:\n")
## QDA Confusion Matrix:
print(qda.cm)
## Predicted
## Actual Down Up
## Down 0 43
## Up 0 61
cat("QDA Accuracy:", mean(qda.pred == test$Direction), "\n\n")
## QDA Accuracy: 0.5865385
### KNN with K = 1
# Create matrices of predictors
train.X <- as.matrix(train$Lag2)
test.X <- as.matrix(test$Lag2)
set.seed(1) # For reproducibility
knn.pred <- knn(train.X, test.X, train$Direction, k = 1)
knn.cm <- table(Actual = test$Direction, Predicted = knn.pred)
cat("KNN (k = 1) Confusion Matrix:\n")
## KNN (k = 1) Confusion Matrix:
print(knn.cm)
## Predicted
## Actual Down Up
## Down 21 22
## Up 30 31
cat("KNN (k = 1) Accuracy:", mean(knn.pred == test$Direction), "\n\n")
## KNN (k = 1) Accuracy: 0.5
### Naive Bayes
nb.fit <- naiveBayes(Direction ~ Lag2, data = train)
nb.pred <- predict(nb.fit, newdata = test)
nb.cm <- table(Actual = test$Direction, Predicted = nb.pred)
cat("Naive Bayes Confusion Matrix:\n")
## Naive Bayes Confusion Matrix:
print(nb.cm)
## Predicted
## Actual Down Up
## Down 0 43
## Up 0 61
cat("Naive Bayes Accuracy:", mean(nb.pred == test$Direction), "\n")
## Naive Bayes Accuracy: 0.5865385
QDA or Naive Bayes sometimes turn out slightly better
# Try a more flexible logistic model
glm.fit2 <- glm(Direction ~ Lag2 + I(Lag2^2) + Lag1:Lag2,
data = train, family = binomial)
glm.probs2 <- predict(glm.fit2, newdata = test, type = "response")
glm.pred2 <- ifelse(glm.probs2 > 0.5, "Up", "Down")
mean(glm.pred2 == test$Direction)
## [1] 0.5769231
# Try KNN with different k
for(k_val in c(1,3,5,7,10)) {
knn.pred <- knn(train.X, test.X, train$Direction, k = k_val)
cat("k =", k_val, "Accuracy =", mean(knn.pred == test$Direction), "\n")
}
## k = 1 Accuracy = 0.5
## k = 3 Accuracy = 0.5576923
## k = 5 Accuracy = 0.5384615
## k = 7 Accuracy = 0.5480769
## k = 10 Accuracy = 0.5865385
Question 14
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
# Load the Auto dataset
data("Auto")
# View the first few rows of the Auto data
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
# Compute the median of the mpg variable
mpg_median <- median(Auto$mpg)
# Create the binary variable: 1 if mpg is above the median, 0 otherwise
mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
# Combine mpg01 with the Auto data set
Auto_new <- data.frame(mpg01, Auto)
# View the first few rows of the new data frame
print(mpg_median)
## [1] 22.75
head(Auto_new)
## mpg01 mpg cylinders displacement horsepower weight acceleration year origin
## 1 0 18 8 307 130 3504 12.0 70 1
## 2 0 15 8 350 165 3693 11.5 70 1
## 3 0 18 8 318 150 3436 11.0 70 1
## 4 0 16 8 304 150 3433 12.0 70 1
## 5 0 17 8 302 140 3449 10.5 70 1
## 6 0 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
# Scatterplot matrix for a subset of variables
pairs(Auto_new[, c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration")])
# Boxplot for cylinders by mpg01
boxplot(cylinders ~ mpg01, data = Auto_new,
main = "Cylinders vs. mpg01", xlab = "mpg01", ylab = "Cylinders")
# Boxplot for displacement by mpg01
boxplot(displacement ~ mpg01, data = Auto_new,
main = "Displacement vs. mpg01", xlab = "mpg01", ylab = "Displacement")
# Boxplot for horsepower by mpg01
boxplot(horsepower ~ mpg01, data = Auto_new,
main = "Horsepower vs. mpg01", xlab = "mpg01", ylab = "Horsepower")
# Boxplot for weight by mpg01
boxplot(weight ~ mpg01, data = Auto_new,
main = "Weight vs. mpg01", xlab = "mpg01", ylab = "Weight")
# Boxplot for acceleration by mpg01
boxplot(acceleration ~ mpg01, data = Auto_new,
main = "Acceleration vs. mpg01", xlab = "mpg01", ylab = "Acceleration")
- Split the data into a training set and a test set.
# Set seed for reproducibility
set.seed(123)
# Assume Auto_new is your data frame with mpg01 as a factor
n <- nrow(Auto_new)
train_idx <- sample(1:n, size = round(0.7 * n))
train_data <- Auto_new[train_idx, ]
test_data <- Auto_new[-train_idx, ]
# Perform LDA on the training data using the selected predictors
lda_fit <- lda(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data )
# Use the fitted model to predict on the test data
lda_pred <- predict(lda_fit, test_data)$class
# Create a confusion matrix
confusion_matrix <- table(Predicted = lda_pred, Actual = test_data$mpg01)
print(confusion_matrix)
## Actual
## Predicted 0 1
## 0 50 3
## 1 10 55
# Calculate the test error rate
test_error <- mean(lda_pred != test_data$mpg01)
print(paste("Test Error Rate:", round(test_error, 4)))
## [1] "Test Error Rate: 0.1102"
# Fit the QDA model on the training data
qda_fit <- qda(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data)
# Use the fitted model to predict on the test data
qda_pred <- predict(qda_fit, test_data)$class
# Create a confusion matrix to compare the predicted and actual classes
confusion_matrix_qda <- table(Predicted = qda_pred, Actual = test_data$mpg01)
print(confusion_matrix_qda)
## Actual
## Predicted 0 1
## 0 53 5
## 1 7 53
# Calculate the test error rate
test_error_qda <- mean(qda_pred != test_data$mpg01)
print(paste("Test Error Rate for QDA:", round(test_error_qda, 4)))
## [1] "Test Error Rate for QDA: 0.1017"
# Fit the logistic regression model on the training data
glm_fit <- glm(mpg01 ~ weight + horsepower + cylinders + displacement,
data = train_data,
family = binomial)
# Use the fitted model to predict probabilities on the test data
glm_probs <- predict(glm_fit, test_data, type = "response")
# Convert predicted probabilities to class labels using a threshold of 0.5
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
# Create a confusion matrix to compare predicted vs. actual values
conf_matrix <- table(Predicted = glm_pred, Actual = test_data$mpg01)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 53 6
## 1 7 52
# Calculate the test error rate (proportion of misclassified observations)
test_error <- mean(glm_pred != test_data$mpg01)
print(paste("Logistic Regression Test Error Rate:", round(test_error, 4)))
## [1] "Logistic Regression Test Error Rate: 0.1102"
# Ensure that mpg01 is a factor (if not already)
train_data$mpg01 <- as.factor(train_data$mpg01)
test_data$mpg01 <- as.factor(test_data$mpg01)
# Fit the naive Bayes model on the training data
nb_fit <- naiveBayes(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data)
# Use the fitted model to predict on the test data
nb_pred <- predict(nb_fit, test_data)
# Create a confusion matrix to compare predicted vs. actual values
conf_matrix <- table(Predicted = nb_pred, Actual = test_data$mpg01)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 52 4
## 1 8 54
# Calculate the test error rate (proportion of misclassified observations)
test_error <- mean(nb_pred != test_data$mpg01)
print(paste("Naive Bayes Test Error Rate:", round(test_error, 4)))
## [1] "Naive Bayes Test Error Rate: 0.1017"
# Load the class package (if not already loaded)
library(class)
# Extract the predictors for training and test sets
train_X <- as.matrix(train_data[, c("weight", "horsepower", "cylinders", "displacement")])
test_X <- as.matrix(test_data[, c("weight", "horsepower", "cylinders", "displacement")])
# Standardize the predictors using the training set parameters
train_X_scaled <- scale(train_X)
test_X_scaled <- scale(test_X, center = attr(train_X_scaled, "scaled:center"),
scale = attr(train_X_scaled, "scaled:scale"))
# Define the training and test labels
train_Y <- train_data$mpg01
test_Y <- test_data$mpg01
# Try several values for K
ks <- c(1, 3, 5, 7, 9, 11, 13, 15)
errors <- numeric(length(ks))
for (i in 1:length(ks)) {
k <- ks[i]
knn_pred <- knn(train = train_X_scaled, test = test_X_scaled, cl = train_Y, k = k)
error_rate <- mean(knn_pred != test_Y)
errors[i] <- error_rate
cat("k =", k, "Test Error =", round(error_rate, 4), "\n")
}
## k = 1 Test Error = 0.1356
## k = 3 Test Error = 0.1017
## k = 5 Test Error = 0.1186
## k = 7 Test Error = 0.0932
## k = 9 Test Error = 0.1017
## k = 11 Test Error = 0.1017
## k = 13 Test Error = 0.1017
## k = 15 Test Error = 0.1017
# Determine which value of K gives the lowest test error
best_k <- ks[which.min(errors)]
cat("Best k based on test error:", best_k, "\n")
## Best k based on test error: 7