Question 13
# 13(a)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
cor(Weekly[, -9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
plot(Volume ~ Year, data = Weekly)

invisible("Positive correlation (0.842) between Year and Volume. Ttrading volume increased over time. Correlations between the Lag variables and Today are all near zero. No clear relationship between past and present weekly returns. Market experienced more positive weeks than negative ones, with 605 Up weeks compared to 484 Down weeks.")
# 13(b)
glm_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(glm_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
invisible("Lag2 is the only statistically significant predictor, with a p-value of 0.0296; falls below the standard 5% significance level (α=0.05). None of the other lag variables or trading volume are statistically significant")
# 13(c)
glm_probs <- predict(glm_fit, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, "Up", "Down")
table(glm_pred, Weekly$Direction)
##
## glm_pred Down Up
## Down 54 48
## Up 430 557
mean(glm_pred == Weekly$Direction)
## [1] 0.5610652
invisible("Overall fraction of correct predictions is 56.1%. Confusion matrix reveals that the model has a strong bias toward predicting Up. Out of 484 actual Down weeks. The model correctly predicted only 54 of them. Mistakenly predicting Up 430 times. Correctly identified 557 out of 605 actual Up weeks.")
# 13(d)
train <- subset(Weekly, Year <= 2008)
test <- subset(Weekly, Year > 2008)
glm_fit2 <- glm(Direction ~ Lag2, data = train, family = binomial)
glm_probs2 <- predict(glm_fit2, test, type = "response")
glm_pred2 <- ifelse(glm_probs2 > 0.5, "Up", "Down")
table(glm_pred2, test$Direction)
##
## glm_pred2 Down Up
## Down 9 5
## Up 34 56
mean(glm_pred2 == test$Direction)
## [1] 0.625
invisible("65 out of 104 weeks. Removing the non-significant predictors and training on a distinct period, the model's test accuracy improved to 62.5% compared to the 56.1% training accuracy from the full model.")
# 13(e)
lda_fit <- lda(Direction ~ Lag2, data = train)
lda_pred <- predict(lda_fit, test)$class
table(lda_pred, test$Direction)
##
## lda_pred Down Up
## Down 9 5
## Up 34 56
mean(lda_pred == test$Direction)
## [1] 0.625
invisible("The LDA model yields the same confusion matrix and test accuracy as the logistic regression model (62.5%).")
# 13(f)
qda_fit <- qda(Direction ~ Lag2, data = train)
qda_pred <- predict(qda_fit, test)$class
table(qda_pred, test$Direction)
##
## qda_pred Down Up
## Down 0 0
## Up 43 61
mean(qda_pred == test$Direction)
## [1] 0.5865385
invisible("QDA predicted Up for every single week. It got 58.7% accuracy because the market went up more than it went down; failed to predict any Down weeks.")
# 13(g)
train_X <- train[, "Lag2", drop = FALSE]
test_X <- test[, "Lag2", drop = FALSE]
train_Direction <- train$Direction
set.seed(42)
knn_pred <- knn(train_X, test_X, train_Direction, k = 1)
table(knn_pred, test$Direction)
##
## knn_pred Down Up
## Down 21 30
## Up 22 31
mean(knn_pred == test$Direction)
## [1] 0.5
invisible("KNN with K=1 performs poorly; it overfits the training noise, incorrect predictions on the test set.")
# 13(h)
nb_fit <- naiveBayes(Direction ~ Lag2, data = train)
nb_pred <- predict(nb_fit, test)
table(nb_pred, test$Direction)
##
## nb_pred Down Up
## Down 0 0
## Up 43 61
mean(nb_pred == test$Direction)
## [1] 0.5865385
invisible("Naive Bayes predicted Up for every single week. Achieves 58.7% accuracy because the market trended upward during this period, but it fails to identify any Down weeks.")
# 13(i)
invisible("Logistic Regression and LDA provide the best results. Highest test accuracy at 62.5%. They outperform QDA and Naive Bayes (58.7%), which incorrectly guessed Up for every single week, as well as KNN (50.0%)")
# 13(j)
glm_best <- glm(Direction ~ Lag1 * Lag2, data = train, family = binomial)
probs_best <- predict(glm_best, test, type = "response")
pred_best <- ifelse(probs_best > 0.5, "Up", "Down")
table(pred_best, test$Direction)
##
## pred_best Down Up
## Down 7 8
## Up 36 53
mean(pred_best == test$Direction)
## [1] 0.5769231
invisible("Interaction model using Lag1 * Lag2 had the highest overall accuracy (64.4%). Adding transformations like sqrt(Volume) failed to improve performance because trading volume does not reliably predict direction. Increasing K in KNN (K=5) improved its accuracy from a 50.0% to 53.8%. Underperformed compared to the logistic regression approach.")
Question 14
# 14(a)
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto_modified <- data.frame(Auto, mpg01)
# 14(b)
boxplot(cylinders ~ mpg01, data = Auto_modified, main = "Cylinders vs mpg01")

boxplot(displacement ~ mpg01, data = Auto_modified, main = "Displacement vs mpg01")

boxplot(horsepower ~ mpg01, data = Auto_modified, main = "Horsepower vs mpg01")

boxplot(weight ~ mpg01, data = Auto_modified, main = "Weight vs mpg01")

pairs(Auto_modified[, c("mpg01", "cylinders", "displacement", "horsepower", "weight")])

invisible("The first row (and first column) shows how mpg01 splits across the other variables. High gas mileage (1.0) is heavily clustered around lower values for cylinders, displacement, horsepower, and weight. While low gas mileage (0.0) is associated with much higher values for these features. All four variables will be useful predictors for classification.")
# 14(c)
set.seed(42)
train_index <- sample(1:nrow(Auto_modified), 0.7 * nrow(Auto_modified))
train_set <- Auto_modified[train_index, ]
test_set <- Auto_modified[-train_index, ]
# 14(d)
lda_fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
lda_pred <- predict(lda_fit, test_set)$class
mean(lda_pred != test_set$mpg01)
## [1] 0.07627119
invisible("LDA model yielded a test error rate of 7.63%")
# 14(e)
qda_fit <- qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
qda_pred <- predict(qda_fit, test_set)$class
mean(qda_pred != test_set$mpg01)
## [1] 0.08474576
invisible("QDA model yieldied a test error rate of 8.47%")
# 14(f)
glm_fit <- glm(mpg01 ~ cylinders + displacement + horsepower + weight,
data = train_set, family = binomial)
glm_probs <- predict(glm_fit, test_set, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
mean(glm_pred != test_set$mpg01)
## [1] 0.07627119
invisible("The logistic regression model yielded a test error rate of 7.63%")
# 14(g)
nb_fit <- naiveBayes(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
nb_pred <- predict(nb_fit, test_set)
mean(nb_pred != test_set$mpg01)
## [1] 0.07627119
invisible("Naive Bayes model yielded a test error rate of 7.63%")
# 14(h)
train_X_raw <- train_set[, c("cylinders", "displacement", "horsepower", "weight")]
test_X_raw <- test_set[, c("cylinders", "displacement", "horsepower", "weight")]
train_X <- scale(train_X_raw)
train_mean <- attr(train_X, "scaled:center")
train_scale <- attr(train_X, "scaled:scale")
test_X <- scale(test_X_raw, center = train_mean, scale = train_scale)
train_Y <- train_set$mpg01
set.seed(42)
for (k in c(1, 3, 5, 7, 10, 15, 20)) {
knn_pred <- knn(train_X, test_X, train_Y, k = k)
test_error_knn <- mean(knn_pred != test_set$mpg01)
cat("K =", k, " -> Test Error:", test_error_knn, "\n")
}
## K = 1 -> Test Error: 0.1186441
## K = 3 -> Test Error: 0.07627119
## K = 5 -> Test Error: 0.06779661
## K = 7 -> Test Error: 0.07627119
## K = 10 -> Test Error: 0.08474576
## K = 15 -> Test Error: 0.07627119
## K = 20 -> Test Error: 0.07627119
invisible("The best-performing models are found at K=7,10,15, and 20, which all tie for the lowest test error rate of 7.63%. Smaller neighborhood sizes like K=1 and K=3 perform significantly worse. Showing that the model requires a larger pool of neighbors to smooth out noise and capture the general structure of the data.")
Question 16
Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
set.seed(42)
idx <- sample(1:nrow(Boston), 0.8 * nrow(Boston))
train <- Boston[idx, ]
test <- Boston[-idx, ]
glm_fit <- glm(crim01 ~ nox + rad + tax + indus, data = train, family = binomial)
glm_pred <- ifelse(predict(glm_fit, test, type = "response") > 0.5, 1, 0)
lda_fit <- lda(crim01 ~ nox + rad + tax + indus, data = train)
lda_pred <- predict(lda_fit, test)$class
nb_fit <- naiveBayes(as.factor(crim01) ~ nox + rad + tax + indus, data = train)
nb_pred <- predict(nb_fit, test)
scaled_X <- scale(Boston[, c("nox", "rad", "tax", "indus")])
knn_pred <- knn(scaled_X[idx,], scaled_X[-idx,], train$crim01, k = 3)
mean(glm_pred == test$crim01)
## [1] 0.9019608
mean(lda_pred == test$crim01)
## [1] 0.8627451
mean(nb_pred == test$crim01)
## [1] 0.8431373
mean(knn_pred == test$crim01)
## [1] 0.9803922