knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

Chapter 4 Questions

13.

a.The S&P 500 returns seem to be more positive than negative from 1990 - 2010. There is also a significant positive correlation between Volume and Year suggesting that as the years went by the number of shares traded increased.

library(ISLR2)
library(GGally)
library(MASS)
library(class)
library(e1071)
data(Weekly)
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
numeric_vars <- Weekly[sapply(Weekly, is.numeric)]
ggpairs(numeric_vars)

b. Lag2 is just slightly significant.

model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = "binomial")
summary(model)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = "binomial", data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

c. The confusion matrix shows that the logistic regression model is much better at predicting when the stock market will go up than when it will go down. The model gets the direction right about 56% of the time. So, while the model does okay at spotting up weeks, it’s not very reliable for catching down weeks.

probs <- predict(model, type = "response")
actual_class <- ifelse(Weekly$Direction == "Up", 1, 0)
predicted_class <- ifelse(probs > 0.5, 1, 0)
conf_matrix <- table(Predicted = predicted_class, Actual = actual_class)
print(conf_matrix)
##          Actual
## Predicted   0   1
##         0  54  48
##         1 430 557
accuracy <- mean(predicted_class == actual_class)
print(paste("Overall Accuracy:", round(accuracy, 4)))
## [1] "Overall Accuracy: 0.5611"

d.

train_data <- subset(Weekly, Year <= 2008)
test_data  <- subset(Weekly, Year > 2008)
model1 <- glm(Direction ~ Lag2, data = train_data, family = "binomial")
probs1 <- predict(model1, newdata = test_data, type = "response")
predicted_class1 <- ifelse(probs1 > 0.5, 1, 0)
actual_class1 <- ifelse(test_data$Direction == "Up", 1, 0)
conf_matrix1 <- table(Predicted = predicted_class1, Actual = actual_class1)
print(conf_matrix1)
##          Actual
## Predicted  0  1
##         0  9  5
##         1 34 56
accuracy1 <- mean(predicted_class1 == actual_class1)
print(paste("Overall Accuracy:", round(accuracy1, 4)))
## [1] "Overall Accuracy: 0.625"

e.

lda_model <- lda(Direction ~ Lag2, data = train_data)
lda_pred <- predict(lda_model, newdata = test_data)
predicted_classl <- lda_pred$class
actual_classl <- test_data$Direction
conf_matrixl <- table(Predicted = predicted_classl, Actual = actual_classl)
print(conf_matrixl)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
accuracyl <- mean(predicted_classl == actual_classl)
print(paste("Overall Accuracy:", round(accuracyl, 4)))
## [1] "Overall Accuracy: 0.625"

f.

qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_pred <- predict(qda_model, newdata = test_data)
predicted_classq <- qda_pred$class
actual_classq <- test_data$Direction
conf_matrixq <- table(Predicted = predicted_classq, Actual = actual_classq)
print(conf_matrixq)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
accuracyq <- mean(predicted_classq == actual_classq)
print(paste("Overall Accuracy:", round(accuracyq, 4)))
## [1] "Overall Accuracy: 0.5865"

g.

train_X <- data.frame(Lag2 = train_data$Lag2)
test_X  <- data.frame(Lag2 = test_data$Lag2)

train_y <- train_data$Direction
test_y  <- test_data$Direction

set.seed(1)
knn_pred <- knn(train = train_X, test = test_X, cl = train_y, k = 1)
conf_matrix_knn <- table(Predicted = knn_pred, Actual = test_y)
print(conf_matrix_knn)
##          Actual
## Predicted Down Up
##      Down   21 30
##      Up     22 31
accuracy_knn <- mean(knn_pred == test_y)
print(paste("Overall Accuracy:", round(accuracy_knn, 4)))
## [1] "Overall Accuracy: 0.5"

h.

nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)
conf_matrix_nb <- table(Predicted = nb_pred, Actual = test_data$Direction)
print(conf_matrix_nb)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
accuracy_nb <- mean(nb_pred == test_data$Direction)
print(paste("Overall Accuracy:", round(accuracy_nb, 4)))
## [1] "Overall Accuracy: 0.5865"

i. The lda model and the train and test logistic regression tied for the most accurate models at 62.5%

j.

log_model <- glm(Direction ~ Lag2 + Lag1 + Lag1:Lag2, data = train_data, family = "binomial")
log_pred <- predict(log_model, newdata = test_data, type = "response")
log_class <- ifelse(log_pred > 0.5, "Up", "Down")
conf_log <- table(Predicted = log_class, Actual = test_y)
acc_log <- mean(log_class == test_y)
print(paste("Overall Accuracy:", round(acc_log, 4)))
## [1] "Overall Accuracy: 0.5769"
lda_model1 <- lda(Direction ~ Lag2 + Lag5, data = train_data)
lda_pred1 <- predict(lda_model1, newdata = test_data)$class
conf_lda <- table(Predicted = lda_pred1, Actual = test_y)
acc_lda <- mean(lda_pred1 == test_y)
print(paste("Overall Accuracy:", round(acc_lda, 4)))
## [1] "Overall Accuracy: 0.5962"
train_data$Lag2_sq <- train_data$Lag2^2
test_data$Lag2_sq <- test_data$Lag2^2
qda_model1 <- qda(Direction ~ Lag2 + Lag2_sq, data = train_data)
qda_pred1 <- predict(qda_model1, newdata = test_data)$class
conf_qda <- table(Predicted = qda_pred1, Actual = test_y)
acc_qda <- mean(qda_pred1 == test_y)
print(paste("Overall Accuracy:", round(acc_qda, 4)))
## [1] "Overall Accuracy: 0.625"
train_X1 <- data.frame(Lag2 = train_data$Lag2, Lag5 = train_data$Lag5)
test_X1  <- data.frame(Lag2 = test_data$Lag2, Lag5 = test_data$Lag5)

acc_knn <- c()
conf_knn_list <- list()
best_k <- NULL
best_acc <- 0

for (k in c(1, 3, 5, 10)) {
  set.seed(2)
  knn_pred1 <- knn(train_X1, test_X1, train_y, k = k)
  acc <- mean(knn_pred1 == test_y)
  acc_knn <- c(acc_knn, acc)
  conf_knn_list[[as.character(k)]] <- table(Predicted = knn_pred1, Actual = test_y)
  
  if (acc > best_acc) {
    best_acc <- acc
    best_k <- k
  }
}
print(paste("Overall Accuracy:", round(best_acc, 4)))
## [1] "Overall Accuracy: 0.5577"
print(paste("Best K:", best_k))
## [1] "Best K: 10"
nb_model1 <- naiveBayes(Direction ~ Lag2 + Lag1, data = train_data)
nb_pred1 <- predict(nb_model1, newdata = test_data)
conf_nb1 <- table(Predicted = nb_pred1, Actual = test_y)
acc_nb <- mean(nb_pred1 == test_y)
print(paste("Overall Accuracy:", round(acc_nb, 4)))
## [1] "Overall Accuracy: 0.5385"

14.

data(Auto)

a.

mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
head(Auto[c("mpg", "mpg01")])
##   mpg mpg01
## 1  18     0
## 2  15     0
## 3  18     0
## 4  16     0
## 5  17     0
## 6  15     0

b. Weight, horsepower, displacement, and cylinders all have a strong negative correlation with mpg01. As they go up, mpg01 goes down.

numeric_varsA <- Auto[sapply(Auto, is.numeric)]
ggpairs(numeric_varsA)

par(mfrow = c(2, 2))
boxplot(horsepower ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Horsepower")
boxplot(weight ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Weight")
boxplot(acceleration ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Acceleration")
boxplot(displacement ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Displacement")

par(mfrow = c(2, 2))
boxplot(mpg ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Mpg")
boxplot(cylinders ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Cylinders")
boxplot(year ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Year")
boxplot(origin ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Origin")

c.

set.seed(1)
train_index <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto))

train_data1 <- Auto[train_index, ]
test_data1  <- Auto[-train_index, ]

nrow(train_data1)
## [1] 274
nrow(test_data1)
## [1] 118

d.

lda_automodel <- lda(mpg01 ~ weight + horsepower + displacement + cylinders, data = train_data1)
lda_autopred <- predict(lda_automodel, newdata = test_data1)
pred_autoclass <- lda_autopred$class
conf_automatrix <- table(Predicted = pred_autoclass, Actual = test_data1$mpg01)
print(conf_automatrix)
##          Actual
## Predicted  0  1
##         0 50  3
##         1 11 54
test_error <- mean(pred_autoclass != test_data1$mpg01)
print(paste("Test Error:", round(test_error, 4)))
## [1] "Test Error: 0.1186"

e.

qda_automodel <- qda(mpg01 ~ weight + horsepower + displacement + cylinders, data = train_data1)
qda_autopred <- predict(qda_automodel, newdata = test_data1)
pred_autoclassq <- qda_autopred$class
conf_automatrixq <- table(Predicted = pred_autoclassq, Actual = test_data1$mpg01)
print(conf_automatrixq)
##          Actual
## Predicted  0  1
##         0 52  5
##         1  9 52
test_autoerror <- mean(pred_autoclassq != test_data1$mpg01)
print(paste("Test Error:", round(test_autoerror, 4)))
## [1] "Test Error: 0.1186"

f.

log_automodel <- glm(mpg01 ~ weight + horsepower + displacement + cylinders, data = train_data1, family = "binomial")
log_autopred <- predict(log_automodel, newdata = test_data1, type = "response")
log_predautoclass <- ifelse(log_autopred > 0.5, 1, 0)
conf_autolog <- table(Predicted = log_predautoclass, Actual = test_data1$mpg01)

test_logerror <- mean(log_predautoclass != test_data1$mpg01)
print(paste("Test Error:", round(test_logerror, 4)))
## [1] "Test Error: 0.0932"

g.

nb_automodel <- naiveBayes(mpg01 ~ weight + horsepower + displacement + cylinders, data = train_data1)
nb_autopred <- predict(nb_automodel, newdata = test_data1)
conf_autonb <- table(Predicted = nb_autopred, Actual = test_data1$mpg01)
print(conf_autonb)
##          Actual
## Predicted  0  1
##         0 52  4
##         1  9 53
test_nberror <- mean(nb_autopred != test_data1$mpg01)
print(paste("Test Error:", round(test_nberror, 4)))
## [1] "Test Error: 0.1102"

h.

predictors <- c("cylinders", "weight", "horsepower", "displacement")

train_XA <- scale(train_data1[, predictors])
test_XA  <- scale(test_data1[, predictors],
                 center = attr(train_XA, "scaled:center"),
                 scale  = attr(train_XA, "scaled:scale"))

train_yA <- train_data1$mpg01
test_yA  <- test_data1$mpg01

k_values <- 1:20
error_rates <- numeric(length(k_values))

set.seed(1)
for (i in seq_along(k_values)) {
  kauto <- k_values[i]
  pred <- knn(train = train_XA, test = test_XA, cl = train_yA, k = kauto)
  error_rates[i] <- mean(pred != test_yA)
}

best_kauto <- k_values[which.min(error_rates)]
best_error <- min(error_rates)

cat("Best K:", best_k, "\n")
## Best K: 10
cat("Lowest Test Error Rate:", round(best_error, 4), "\n")
## Lowest Test Error Rate: 0.1102

16.

I tested four different methods to predict whether a neighborhood in Boston has a high crime rate or not. Logistic regression has an accuracy of about 82%. Linear Discriminant Analysis (LDA) did slightly better with 84% accuracy. Naive Bayes and K-Nearest Neighbors (KNN) tied for the best performance, each correctly predicting about 86% of the time. This shows that Naive Bayes and KNN were the most effective at spotting high-crime areas in this case.

data(Boston)
Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)

set.seed(1)
train_indexB <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train_dataB <- Boston[train_indexB, ]
test_dataB  <- Boston[-train_indexB, ]
log_modelB <- glm(crim01 ~ lstat + rm + dis + nox + age, data = train_dataB, family = "binomial")
log_probs <- predict(log_modelB, newdata = test_dataB, type = "response")
log_predB <- ifelse(log_probs > 0.5, 1, 0)
log_error <- mean(log_predB != test_dataB$crim01)
lda_modelB <- lda(crim01 ~ lstat + rm + dis + nox + age, data = train_dataB)
lda_predB <- predict(lda_modelB, newdata = test_dataB)$class
lda_error <- mean(lda_predB != test_dataB$crim01)
nb_modelB <- naiveBayes(crim01 ~ lstat + rm + dis + nox + age, data = train_dataB)
nb_predB <- predict(nb_modelB, newdata = test_dataB)
nb_error <- mean(nb_predB != test_dataB$crim01)
predictorsB <- c("lstat", "rm", "dis", "nox", "age")
train_XB <- scale(train_dataB[, predictorsB])
test_XB  <- scale(test_dataB[, predictorsB],
                 center = attr(train_XB, "scaled:center"),
                 scale  = attr(train_XB, "scaled:scale"))

train_yB <- train_dataB$crim01
test_yB  <- test_dataB$crim01

set.seed(1)
knn_predB <- knn(train_XB, test_XB, cl = train_yB, k = 5)
knn_error <- mean(knn_predB != test_yB)
log_accB  <- 1 - log_error
lda_accB  <- 1 - lda_error
nb_accB   <- 1 - nb_error
knn_accB  <- 1 - knn_error
cat("Test Error and Accuracy Rates:\n")
## Test Error and Accuracy Rates:
cat("----------------------------------\n")
## ----------------------------------
cat("Logistic Regression:\n  Error =", round(log_error, 4), 
    "| Accuracy =", round(log_accB, 4), "\n")
## Logistic Regression:
##   Error = 0.1776 | Accuracy = 0.8224
cat("LDA:\n  Error =", round(lda_error, 4), 
    "| Accuracy =", round(lda_accB, 4), "\n")
## LDA:
##   Error = 0.1579 | Accuracy = 0.8421
cat("Naive Bayes:\n  Error =", round(nb_error, 4), 
    "| Accuracy =", round(nb_accB, 4), "\n")
## Naive Bayes:
##   Error = 0.1447 | Accuracy = 0.8553
cat("KNN (k = 5):\n  Error =", round(knn_error, 4), 
    "| Accuracy =", round(knn_accB, 4), "\n")
## KNN (k = 5):
##   Error = 0.1447 | Accuracy = 0.8553