library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(e1071)
data("Weekly")

Question 13

This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data set is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
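
A quick check of the dimensions, time span, and class balance confirms this description (a minimal sketch):

dim(Weekly)               # 1089 observations of 9 variables
range(Weekly$Year)        # 1990 to 2010
table(Weekly$Direction)   # counts of Down vs. Up weeks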

# Numerical summaries
print(summary(Weekly))
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
print(cor(Weekly[, c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume", "Today")]))
##                Lag1        Lag2        Lag3         Lag4         Lag5
## Lag1    1.000000000 -0.07485305  0.05863568 -0.071273876 -0.008183096
## Lag2   -0.074853051  1.00000000 -0.07572091  0.058381535 -0.072499482
## Lag3    0.058635682 -0.07572091  1.00000000 -0.075395865  0.060657175
## Lag4   -0.071273876  0.05838153 -0.07539587  1.000000000 -0.075675027
## Lag5   -0.008183096 -0.07249948  0.06065717 -0.075675027  1.000000000
## Volume -0.064951313 -0.08551314 -0.06928771 -0.061074617 -0.058517414
## Today  -0.075031842  0.05916672 -0.07124364 -0.007825873  0.011012698
##             Volume        Today
## Lag1   -0.06495131 -0.075031842
## Lag2   -0.08551314  0.059166717
## Lag3   -0.06928771 -0.071243639
## Lag4   -0.06107462 -0.007825873
## Lag5   -0.05851741  0.011012698
## Volume  1.00000000 -0.033077783
## Today  -0.03307778  1.000000000
# Set up a 2x2 plotting layout
par(mfrow = c(2, 2))

# 1. Time Series Plot: Volume over Years
plot(Weekly$Year, Weekly$Volume, type = "l",
     xlab = "Year", ylab = "Volume", main = "Volume over Time")

# 2. Histogram: Distribution of Today's Returns
hist(Weekly$Today, breaks = 30,
     main = "Histogram of Today's Returns", xlab = "Today's Return (%)")

# 3. Boxplot: Today's Return by Market Direction
boxplot(Today ~ Direction, data = Weekly,
        main = "Today's Return by Direction", ylab = "Today's Return (%)")

# 4. Pairs Plot: Lag Variables and Today's Return
# Note: pairs() sets up its own layout, so this plot is drawn outside the 2x2 grid
pairs(Weekly[, c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Today")],
      main = "Pairs Plot of Lag Variables & Today's Return")
par(mfrow = c(1, 1))  # restore the default single-panel layout

# Fit the logistic regression model using all observations
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Weekly, family = binomial)

# Print the summary of the model
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
# Obtain predicted probabilities and convert them into class labels ("Up" if >0.5, else "Down")
glm.probs <- predict(glm.fit, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Compute the confusion matrix: rows = Actual, columns = Predicted
confusion_matrix <- table(Actual = Weekly$Direction, Predicted = glm.pred)
print(confusion_matrix)
##       Predicted
## Actual Down  Up
##   Down   54 430
##   Up     48 557
# Calculate the overall fraction of correct predictions (accuracy)
accuracy <- mean(glm.pred == Weekly$Direction)
print(accuracy)
## [1] 0.5610652
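
The confusion matrix shows that the model predicts "Up" for the vast majority of weeks, so most of its errors are Down weeks labelled Up. A minimal sketch of the class-specific rates, computed from the confusion_matrix object above:

# Row-wise rates from the confusion matrix (rows = actual, columns = predicted)
sens <- confusion_matrix["Up", "Up"] / sum(confusion_matrix["Up", ])       # Up weeks correctly called
spec <- confusion_matrix["Down", "Down"] / sum(confusion_matrix["Down", ]) # Down weeks correctly called
cat("Fraction of Up weeks correctly predicted:  ", round(sens, 3), "\n")
cat("Fraction of Down weeks correctly predicted:", round(spec, 3), "\n")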
# Create training set (years 1990 to 2008) and test set (years 2009 and 2010)
train <- Weekly[Weekly$Year <= 2008, ]
test  <- Weekly[Weekly$Year >= 2009, ]

# Fit logistic regression model on training data with Lag2 as predictor
glm.fit <- glm(Direction ~ Lag2, data = train, family = binomial)

# Predict probabilities on the test set
glm.probs <- predict(glm.fit, newdata = test, type = "response")

# Classify observations: if probability > 0.5, predict "Up", else "Down"
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Compute the confusion matrix: rows = actual, columns = predicted
confusion_matrix <- table(Actual = test$Direction, Predicted = glm.pred)
print(confusion_matrix)
##       Predicted
## Actual Down Up
##   Down    9 34
##   Up      5 56
# Calculate the overall fraction of correct predictions (accuracy)
accuracy <- mean(glm.pred == test$Direction)
print(accuracy)
## [1] 0.625
### LDA
lda.fit <- lda(Direction ~ Lag2, data = train)
lda.pred <- predict(lda.fit, newdata = test)$class
lda.cm <- table(Actual = test$Direction, Predicted = lda.pred)
cat("LDA Confusion Matrix:\n")
## LDA Confusion Matrix:
print(lda.cm)
##       Predicted
## Actual Down Up
##   Down    9 34
##   Up      5 56
cat("LDA Accuracy:", mean(lda.pred == test$Direction), "\n\n")
## LDA Accuracy: 0.625
### QDA
qda.fit <- qda(Direction ~ Lag2, data = train)
qda.pred <- predict(qda.fit, newdata = test)$class
qda.cm <- table(Actual = test$Direction, Predicted = qda.pred)
cat("QDA Confusion Matrix:\n")
## QDA Confusion Matrix:
print(qda.cm)
##       Predicted
## Actual Down Up
##   Down    0 43
##   Up      0 61
cat("QDA Accuracy:", mean(qda.pred == test$Direction), "\n\n")
## QDA Accuracy: 0.5865385
### KNN with K = 1
# Create matrices of predictors
train.X <- as.matrix(train$Lag2)
test.X  <- as.matrix(test$Lag2)
set.seed(1)  # For reproducibility
knn.pred <- knn(train.X, test.X, train$Direction, k = 1)
knn.cm <- table(Actual = test$Direction, Predicted = knn.pred)
cat("KNN (k = 1) Confusion Matrix:\n")
## KNN (k = 1) Confusion Matrix:
print(knn.cm)
##       Predicted
## Actual Down Up
##   Down   21 22
##   Up     30 31
cat("KNN (k = 1) Accuracy:", mean(knn.pred == test$Direction), "\n\n")
## KNN (k = 1) Accuracy: 0.5
### Naive Bayes
nb.fit <- naiveBayes(Direction ~ Lag2, data = train)
nb.pred <- predict(nb.fit, newdata = test)
nb.cm <- table(Actual = test$Direction, Predicted = nb.pred)
cat("Naive Bayes Confusion Matrix:\n")
## Naive Bayes Confusion Matrix:
print(nb.cm)
##       Predicted
## Actual Down Up
##   Down    0 43
##   Up      0 61
cat("Naive Bayes Accuracy:", mean(nb.pred == test$Direction), "\n")
## Naive Bayes Accuracy: 0.5865385

Logistic regression and LDA give the best held-out accuracy here (62.5%); QDA and naive Bayes both predict "Up" for every test week and come in slightly lower (58.7%), while KNN with k = 1 is no better than chance (50%).
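
For a side-by-side view, the test accuracies can be collected into a single named vector; a minimal sketch using the prediction objects created above:

# Held-out accuracy of each classifier fitted on the 1990-2008 training data
test_acc <- c(
  Logistic    = mean(glm.pred == test$Direction),
  LDA         = mean(lda.pred == test$Direction),
  QDA         = mean(qda.pred == test$Direction),
  `KNN (k=1)` = mean(knn.pred == test$Direction),
  NaiveBayes  = mean(nb.pred == test$Direction)
)
round(test_acc, 4)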

# Try a more flexible logistic model
glm.fit2 <- glm(Direction ~ Lag2 + I(Lag2^2) + Lag1:Lag2,
                data = train, family = binomial)
glm.probs2 <- predict(glm.fit2, newdata = test, type = "response")
glm.pred2  <- ifelse(glm.probs2 > 0.5, "Up", "Down")
mean(glm.pred2 == test$Direction)
## [1] 0.5769231
# Try KNN with different k
for(k_val in c(1,3,5,7,10)) {
  knn.pred <- knn(train.X, test.X, train$Direction, k = k_val)
  cat("k =", k_val, "Accuracy =", mean(knn.pred == test$Direction), "\n")
}
## k = 1 Accuracy = 0.5 
## k = 3 Accuracy = 0.5576923 
## k = 5 Accuracy = 0.5384615 
## k = 7 Accuracy = 0.5480769 
## k = 10 Accuracy = 0.5865385
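
Other combinations of predictors can be explored in the same way; for example, a sketch refitting LDA with Lag1 added alongside Lag2 (accuracy not reported above):

# Example of further experimentation: LDA with a second lag variable
lda.fit2  <- lda(Direction ~ Lag1 + Lag2, data = train)
lda.pred2 <- predict(lda.fit2, newdata = test)$class
mean(lda.pred2 == test$Direction)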

Question 14

In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.

# Load the Auto dataset
data("Auto")

# View the first few rows of the Auto data
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
# Compute the median of the mpg variable
mpg_median <- median(Auto$mpg)

# Create the binary variable: 1 if mpg is above the median, 0 otherwise
mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)

# Combine mpg01 with the Auto data set
Auto_new <- data.frame(mpg01, Auto)

# View the first few rows of the new data frame
print(mpg_median)
## [1] 22.75
head(Auto_new)
##   mpg01 mpg cylinders displacement horsepower weight acceleration year origin
## 1     0  18         8          307        130   3504         12.0   70      1
## 2     0  15         8          350        165   3693         11.5   70      1
## 3     0  18         8          318        150   3436         11.0   70      1
## 4     0  16         8          304        150   3433         12.0   70      1
## 5     0  17         8          302        140   3449         10.5   70      1
## 6     0  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
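
Because mpg01 is defined by a median split, the two classes should be roughly balanced; a quick check (a sketch):

# Verify that the median split yields nearly equal class sizes
table(Auto_new$mpg01)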
# Scatterplot matrix for a subset of variables
pairs(Auto_new[, c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration")])

# Boxplot for cylinders by mpg01
boxplot(cylinders ~ mpg01, data = Auto_new, 
        main = "Cylinders vs. mpg01", xlab = "mpg01", ylab = "Cylinders")

# Boxplot for displacement by mpg01
boxplot(displacement ~ mpg01, data = Auto_new, 
        main = "Displacement vs. mpg01", xlab = "mpg01", ylab = "Displacement")

# Boxplot for horsepower by mpg01
boxplot(horsepower ~ mpg01, data = Auto_new, 
        main = "Horsepower vs. mpg01", xlab = "mpg01", ylab = "Horsepower")

# Boxplot for weight by mpg01
boxplot(weight ~ mpg01, data = Auto_new, 
        main = "Weight vs. mpg01", xlab = "mpg01", ylab = "Weight")

# Boxplot for acceleration by mpg01
boxplot(acceleration ~ mpg01, data = Auto_new, 
        main = "Acceleration vs. mpg01", xlab = "mpg01", ylab = "Acceleration")

- Split the data into a training set and a test set.

# Set seed for reproducibility
set.seed(123)

# Randomly assign 70% of the observations to the training set and the rest to the test set
n <- nrow(Auto_new)
train_idx <- sample(1:n, size = round(0.7 * n))
train_data <- Auto_new[train_idx, ]
test_data  <- Auto_new[-train_idx, ]
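
A quick sanity check that both classes are well represented in each split (a sketch):

# Class counts in the training and test sets
table(train_data$mpg01)
table(test_data$mpg01)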
# Perform LDA on the training data using the selected predictors
lda_fit <- lda(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data )

# Use the fitted model to predict on the test data
lda_pred <- predict(lda_fit, test_data)$class

# Create a confusion matrix
confusion_matrix <- table(Predicted = lda_pred, Actual = test_data$mpg01)
print(confusion_matrix)
##          Actual
## Predicted  0  1
##         0 50  3
##         1 10 55
# Calculate the test error rate
test_error <- mean(lda_pred != test_data$mpg01)
print(paste("Test Error Rate:", round(test_error, 4)))
## [1] "Test Error Rate: 0.1102"
# Fit the QDA model on the training data
qda_fit <- qda(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data)

# Use the fitted model to predict on the test data
qda_pred <- predict(qda_fit, test_data)$class

# Create a confusion matrix to compare the predicted and actual classes
confusion_matrix_qda <- table(Predicted = qda_pred, Actual = test_data$mpg01)
print(confusion_matrix_qda)
##          Actual
## Predicted  0  1
##         0 53  5
##         1  7 53
# Calculate the test error rate
test_error_qda <- mean(qda_pred != test_data$mpg01)
print(paste("Test Error Rate for QDA:", round(test_error_qda, 4)))
## [1] "Test Error Rate for QDA: 0.1017"
# Fit the logistic regression model on the training data
glm_fit <- glm(mpg01 ~ weight + horsepower + cylinders + displacement, 
               data = train_data, 
               family = binomial)

# Use the fitted model to predict probabilities on the test data
glm_probs <- predict(glm_fit, test_data, type = "response")

# Convert predicted probabilities to class labels using a threshold of 0.5
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)

# Create a confusion matrix to compare predicted vs. actual values
conf_matrix <- table(Predicted = glm_pred, Actual = test_data$mpg01)
print(conf_matrix)
##          Actual
## Predicted  0  1
##         0 53  6
##         1  7 52
# Calculate the test error rate (proportion of misclassified observations)
test_error <- mean(glm_pred != test_data$mpg01)
print(paste("Logistic Regression Test Error Rate:", round(test_error, 4)))
## [1] "Logistic Regression Test Error Rate: 0.1102"
# Ensure that mpg01 is a factor (if not already)
train_data$mpg01 <- as.factor(train_data$mpg01)
test_data$mpg01 <- as.factor(test_data$mpg01)

# Fit the naive Bayes model on the training data
nb_fit <- naiveBayes(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data)

# Use the fitted model to predict on the test data
nb_pred <- predict(nb_fit, test_data)

# Create a confusion matrix to compare predicted vs. actual values
conf_matrix <- table(Predicted = nb_pred, Actual = test_data$mpg01)
print(conf_matrix)
##          Actual
## Predicted  0  1
##         0 52  4
##         1  8 54
# Calculate the test error rate (proportion of misclassified observations)
test_error <- mean(nb_pred != test_data$mpg01)
print(paste("Naive Bayes Test Error Rate:", round(test_error, 4)))
## [1] "Naive Bayes Test Error Rate: 0.1017"
# Load the class package (if not already loaded)
library(class)

# Extract the predictors for training and test sets
train_X <- as.matrix(train_data[, c("weight", "horsepower", "cylinders", "displacement")])
test_X <- as.matrix(test_data[, c("weight", "horsepower", "cylinders", "displacement")])

# Standardize the predictors using the training set parameters
train_X_scaled <- scale(train_X)
test_X_scaled <- scale(test_X, center = attr(train_X_scaled, "scaled:center"),
                       scale = attr(train_X_scaled, "scaled:scale"))

# Define the training and test labels
train_Y <- train_data$mpg01
test_Y <- test_data$mpg01

# Try several values for K
ks <- c(1, 3, 5, 7, 9, 11, 13, 15)
errors <- numeric(length(ks))

for (i in 1:length(ks)) {
  k <- ks[i]
  knn_pred <- knn(train = train_X_scaled, test = test_X_scaled, cl = train_Y, k = k)
  error_rate <- mean(knn_pred != test_Y)
  errors[i] <- error_rate
  cat("k =", k, "Test Error =", round(error_rate, 4), "\n")
}
## k = 1 Test Error = 0.1356 
## k = 3 Test Error = 0.1017 
## k = 5 Test Error = 0.1186 
## k = 7 Test Error = 0.0932 
## k = 9 Test Error = 0.1017 
## k = 11 Test Error = 0.1017 
## k = 13 Test Error = 0.1017 
## k = 15 Test Error = 0.1017
# Determine which value of K gives the lowest test error
best_k <- ks[which.min(errors)]
cat("Best k based on test error:", best_k, "\n")
## Best k based on test error: 7
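
To see how the test error varies with the number of neighbors, the errors vector collected above can be plotted (a sketch):

# Plot test error against k and mark the best value found above
plot(ks, errors, type = "b", xlab = "k (number of neighbors)",
     ylab = "Test error rate", main = "KNN Test Error vs. k")
abline(v = best_k, lty = 2)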