Lab 5

library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.4.2

library(ISLR2)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ISLR2':
## 
##     Boston

library(class)
library(e1071)

## Warning: package 'e1071' was built under R version 4.4.2

data("Weekly")

Question 13

This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1, 089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.

Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?

# Numerical summaries
print(summary(Weekly))

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

print(cor(Weekly[, c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume", "Today")]))

##                Lag1        Lag2        Lag3         Lag4         Lag5
## Lag1    1.000000000 -0.07485305  0.05863568 -0.071273876 -0.008183096
## Lag2   -0.074853051  1.00000000 -0.07572091  0.058381535 -0.072499482
## Lag3    0.058635682 -0.07572091  1.00000000 -0.075395865  0.060657175
## Lag4   -0.071273876  0.05838153 -0.07539587  1.000000000 -0.075675027
## Lag5   -0.008183096 -0.07249948  0.06065717 -0.075675027  1.000000000
## Volume -0.064951313 -0.08551314 -0.06928771 -0.061074617 -0.058517414
## Today  -0.075031842  0.05916672 -0.07124364 -0.007825873  0.011012698
##             Volume        Today
## Lag1   -0.06495131 -0.075031842
## Lag2   -0.08551314  0.059166717
## Lag3   -0.06928771 -0.071243639
## Lag4   -0.06107462 -0.007825873
## Lag5   -0.05851741  0.011012698
## Volume  1.00000000 -0.033077783
## Today  -0.03307778  1.000000000

# Set up a 2x2 plotting layout
par(mfrow = c(2, 2))

# 1. Time Series Plot: Volume over Years
plot(Weekly$Year, Weekly$Volume, type = "l",
     xlab = "Year", ylab = "Volume", main = "Volume over Time")

# 2. Histogram: Distribution of Today's Returns
hist(Weekly$Today, breaks = 30,
     main = "Histogram of Today's Returns", xlab = "Today's Return (%)")

# 3. Boxplot: Today's Return by Market Direction
boxplot(Today ~ Direction, data = Weekly,
        main = "Today's Return by Direction", ylab = "Today's Return (%)")

# 4. Pairs Plot: Lag Variables and Today's Return
pairs(Weekly[, c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Today")],
      main = "Pairs Plot of Lag Variables & Today's Return")

Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically signifcant? If so, which ones?

# Fit the logistic regression model using all observations
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Weekly, family = binomial)

# Print the summary of the model
summary(glm.fit)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.

# Obtain predicted probabilities and convert them into class labels ("Up" if >0.5, else "Down")
glm.probs <- predict(glm.fit, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Compute the confusion matrix: rows = Actual, columns = Predicted
confusion_matrix <- table(Actual = Weekly$Direction, Predicted = glm.pred)
print(confusion_matrix)

##       Predicted
## Actual Down  Up
##   Down   54 430
##   Up     48 557

# Calculate the overall fraction of correct predictions (accuracy)
accuracy <- mean(glm.pred == Weekly$Direction)
print(accuracy)

## [1] 0.5610652

Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).

# Create training set (years 1990 to 2008) and test set (years 2009 and 2010)
train <- Weekly[Weekly$Year <= 2008, ]
test  <- Weekly[Weekly$Year >= 2009, ]

# Fit logistic regression model on training data with Lag2 as predictor
glm.fit <- glm(Direction ~ Lag2, data = train, family = binomial)

# Predict probabilities on the test set
glm.probs <- predict(glm.fit, newdata = test, type = "response")

# Classify observations: if probability > 0.5, predict "Up", else "Down"
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Compute the confusion matrix: rows = actual, columns = predicted
confusion_matrix <- table(Actual = test$Direction, Predicted = glm.pred)
print(confusion_matrix)

##       Predicted
## Actual Down Up
##   Down    9 34
##   Up      5 56

# Calculate the overall fraction of correct predictions (accuracy)
accuracy <- mean(glm.pred == test$Direction)
print(accuracy)

## [1] 0.625

Repeat (d) using LDA.

### LDA
lda.fit <- lda(Direction ~ Lag2, data = train)
lda.pred <- predict(lda.fit, newdata = test)$class
lda.cm <- table(Actual = test$Direction, Predicted = lda.pred)
cat("LDA Confusion Matrix:\n")

## LDA Confusion Matrix:

print(lda.cm)

##       Predicted
## Actual Down Up
##   Down    9 34
##   Up      5 56

cat("LDA Accuracy:", mean(lda.pred == test$Direction), "\n\n")

## LDA Accuracy: 0.625

Repeat (d) using QDA.

### QDA
qda.fit <- qda(Direction ~ Lag2, data = train)
qda.pred <- predict(qda.fit, newdata = test)$class
qda.cm <- table(Actual = test$Direction, Predicted = qda.pred)
cat("QDA Confusion Matrix:\n")

## QDA Confusion Matrix:

print(qda.cm)

##       Predicted
## Actual Down Up
##   Down    0 43
##   Up      0 61

cat("QDA Accuracy:", mean(qda.pred == test$Direction), "\n\n")

## QDA Accuracy: 0.5865385

Repeat (d) using KNN with K = 1.

### KNN with K = 1
# Create matrices of predictors
train.X <- as.matrix(train$Lag2)
test.X  <- as.matrix(test$Lag2)
set.seed(1)  # For reproducibility
knn.pred <- knn(train.X, test.X, train$Direction, k = 1)
knn.cm <- table(Actual = test$Direction, Predicted = knn.pred)
cat("KNN (k = 1) Confusion Matrix:\n")

## KNN (k = 1) Confusion Matrix:

print(knn.cm)

##       Predicted
## Actual Down Up
##   Down   21 22
##   Up     30 31

cat("KNN (k = 1) Accuracy:", mean(knn.pred == test$Direction), "\n\n")

## KNN (k = 1) Accuracy: 0.5

Repeat (d) using naive Bayes.

### Naive Bayes
nb.fit <- naiveBayes(Direction ~ Lag2, data = train)
nb.pred <- predict(nb.fit, newdata = test)
nb.cm <- table(Actual = test$Direction, Predicted = nb.pred)
cat("Naive Bayes Confusion Matrix:\n")

## Naive Bayes Confusion Matrix:

print(nb.cm)

##       Predicted
## Actual Down Up
##   Down    0 43
##   Up      0 61

cat("Naive Bayes Accuracy:", mean(nb.pred == test$Direction), "\n")

## Naive Bayes Accuracy: 0.5865385

Which of these methods appears to provide the best results on this data?

QDA or Naive Bayes sometimes turn out slightly better

Experiment with diferent combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifer.

# Try a more flexible logistic model
glm.fit2 <- glm(Direction ~ Lag2 + I(Lag2^2) + Lag1:Lag2,
                data = train, family = binomial)
glm.probs2 <- predict(glm.fit2, newdata = test, type = "response")
glm.pred2  <- ifelse(glm.probs2 > 0.5, "Up", "Down")
mean(glm.pred2 == test$Direction)

## [1] 0.5769231

# Try KNN with different k
for(k_val in c(1,3,5,7,10)) {
  knn.pred <- knn(train.X, test.X, train$Direction, k = k_val)
  cat("k =", k_val, "Accuracy =", mean(knn.pred == test$Direction), "\n")
}

## k = 1 Accuracy = 0.5 
## k = 3 Accuracy = 0.5576923 
## k = 5 Accuracy = 0.5384615 
## k = 7 Accuracy = 0.5480769 
## k = 10 Accuracy = 0.5865385

Question 14

In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.

# Load the Auto dataset
data("Auto")

# View the first few rows of the Auto data
head(Auto)

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may fnd it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.

# Compute the median of the mpg variable
mpg_median <- median(Auto$mpg)

# Create the binary variable: 1 if mpg is above the median, 0 otherwise
mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)

# Combine mpg01 with the Auto data set
Auto_new <- data.frame(mpg01, Auto)

# View the first few rows of the new data frame
print(mpg_median)

## [1] 22.75

head(Auto_new)

##   mpg01 mpg cylinders displacement horsepower weight acceleration year origin
## 1     0  18         8          307        130   3504         12.0   70      1
## 2     0  15         8          350        165   3693         11.5   70      1
## 3     0  18         8          318        150   3436         11.0   70      1
## 4     0  16         8          304        150   3433         12.0   70      1
## 5     0  17         8          302        140   3449         10.5   70      1
## 6     0  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your fndings.

# Scatterplot matrix for a subset of variables
pairs(Auto_new[, c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration")])

# Boxplot for cylinders by mpg01
boxplot(cylinders ~ mpg01, data = Auto_new, 
        main = "Cylinders vs. mpg01", xlab = "mpg01", ylab = "Cylinders")

# Boxplot for displacement by mpg01
boxplot(displacement ~ mpg01, data = Auto_new, 
        main = "Displacement vs. mpg01", xlab = "mpg01", ylab = "Displacement")

# Boxplot for horsepower by mpg01
boxplot(horsepower ~ mpg01, data = Auto_new, 
        main = "Horsepower vs. mpg01", xlab = "mpg01", ylab = "Horsepower")

# Boxplot for weight by mpg01
boxplot(weight ~ mpg01, data = Auto_new, 
        main = "Weight vs. mpg01", xlab = "mpg01", ylab = "Weight")

# Boxplot for acceleration by mpg01
boxplot(acceleration ~ mpg01, data = Auto_new, 
        main = "Acceleration vs. mpg01", xlab = "mpg01", ylab = "Acceleration")

- Split the data into a training set and a test set.

# Set seed for reproducibility
set.seed(123)

# Assume Auto_new is your data frame with mpg01 as a factor
n <- nrow(Auto_new)
train_idx <- sample(1:n, size = round(0.7 * n))
train_data <- Auto_new[train_idx, ]
test_data  <- Auto_new[-train_idx, ]

Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

# Perform LDA on the training data using the selected predictors
lda_fit <- lda(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data )

# Use the fitted model to predict on the test data
lda_pred <- predict(lda_fit, test_data)$class

# Create a confusion matrix
confusion_matrix <- table(Predicted = lda_pred, Actual = test_data$mpg01)
print(confusion_matrix)

##          Actual
## Predicted  0  1
##         0 50  3
##         1 10 55

# Calculate the test error rate
test_error <- mean(lda_pred != test_data$mpg01)
print(paste("Test Error Rate:", round(test_error, 4)))

## [1] "Test Error Rate: 0.1102"

Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

# Fit the QDA model on the training data
qda_fit <- qda(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data)

# Use the fitted model to predict on the test data
qda_pred <- predict(qda_fit, test_data)$class

# Create a confusion matrix to compare the predicted and actual classes
confusion_matrix_qda <- table(Predicted = qda_pred, Actual = test_data$mpg01)
print(confusion_matrix_qda)

##          Actual
## Predicted  0  1
##         0 53  5
##         1  7 53

# Calculate the test error rate
test_error_qda <- mean(qda_pred != test_data$mpg01)
print(paste("Test Error Rate for QDA:", round(test_error_qda, 4)))

## [1] "Test Error Rate for QDA: 0.1017"

Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

# Fit the logistic regression model on the training data
glm_fit <- glm(mpg01 ~ weight + horsepower + cylinders + displacement, 
               data = train_data, 
               family = binomial)

# Use the fitted model to predict probabilities on the test data
glm_probs <- predict(glm_fit, test_data, type = "response")

# Convert predicted probabilities to class labels using a threshold of 0.5
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)

# Create a confusion matrix to compare predicted vs. actual values
conf_matrix <- table(Predicted = glm_pred, Actual = test_data$mpg01)
print(conf_matrix)

##          Actual
## Predicted  0  1
##         0 53  6
##         1  7 52

# Calculate the test error rate (proportion of misclassified observations)
test_error <- mean(glm_pred != test_data$mpg01)
print(paste("Logistic Regression Test Error Rate:", round(test_error, 4)))

## [1] "Logistic Regression Test Error Rate: 0.1102"

Perform naive Bayes on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

# Ensure that mpg01 is a factor (if not already)
train_data$mpg01 <- as.factor(train_data$mpg01)
test_data$mpg01 <- as.factor(test_data$mpg01)

# Fit the naive Bayes model on the training data
nb_fit <- naiveBayes(mpg01 ~ weight + horsepower + cylinders + displacement, data = train_data)

# Use the fitted model to predict on the test data
nb_pred <- predict(nb_fit, test_data)

# Create a confusion matrix to compare predicted vs. actual values
conf_matrix <- table(Predicted = nb_pred, Actual = test_data$mpg01)
print(conf_matrix)

##          Actual
## Predicted  0  1
##         0 52  4
##         1  8 54

# Calculate the test error rate (proportion of misclassified observations)
test_error <- mean(nb_pred != test_data$mpg01)
print(paste("Naive Bayes Test Error Rate:", round(test_error, 4)))

## [1] "Naive Bayes Test Error Rate: 0.1017"

Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?

# Load the class package (if not already loaded)
library(class)

# Extract the predictors for training and test sets
train_X <- as.matrix(train_data[, c("weight", "horsepower", "cylinders", "displacement")])
test_X <- as.matrix(test_data[, c("weight", "horsepower", "cylinders", "displacement")])

# Standardize the predictors using the training set parameters
train_X_scaled <- scale(train_X)
test_X_scaled <- scale(test_X, center = attr(train_X_scaled, "scaled:center"),
                       scale = attr(train_X_scaled, "scaled:scale"))

# Define the training and test labels
train_Y <- train_data$mpg01
test_Y <- test_data$mpg01

# Try several values for K
ks <- c(1, 3, 5, 7, 9, 11, 13, 15)
errors <- numeric(length(ks))

for (i in 1:length(ks)) {
  k <- ks[i]
  knn_pred <- knn(train = train_X_scaled, test = test_X_scaled, cl = train_Y, k = k)
  error_rate <- mean(knn_pred != test_Y)
  errors[i] <- error_rate
  cat("k =", k, "Test Error =", round(error_rate, 4), "\n")
}

## k = 1 Test Error = 0.1356 
## k = 3 Test Error = 0.1017 
## k = 5 Test Error = 0.1186 
## k = 7 Test Error = 0.0932 
## k = 9 Test Error = 0.1017 
## k = 11 Test Error = 0.1017 
## k = 13 Test Error = 0.1017 
## k = 15 Test Error = 0.1017

# Determine which value of K gives the lowest test error
best_k <- ks[which.min(errors)]
cat("Best k based on test error:", best_k, "\n")

## Best k based on test error: 7

Lab 5

Aiden Tierney

2025-02-27