Question 13

# 13(a)
summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

cor(Weekly[, -9])

##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

plot(Volume ~ Year, data = Weekly)

invisible("Positive correlation (0.842) between Year and Volume. Ttrading volume increased over time. Correlations between the Lag variables and Today are all near zero. No clear relationship between past and present weekly returns. Market experienced more positive weeks than negative ones, with 605 Up weeks compared to 484 Down weeks.")

# 13(b)
glm_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, 
               data = Weekly, family = binomial)
summary(glm_fit)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

invisible("Lag2 is the only statistically significant predictor, with a p-value of 0.0296; falls below the standard 5% significance level (α=0.05). None of the other lag variables or trading volume are statistically significant")

# 13(c)
glm_probs <- predict(glm_fit, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, "Up", "Down")
table(glm_pred, Weekly$Direction)

##         
## glm_pred Down  Up
##     Down   54  48
##     Up    430 557

mean(glm_pred == Weekly$Direction)

## [1] 0.5610652

invisible("Overall fraction of correct predictions is 56.1%. Confusion matrix reveals that the model has a strong bias toward predicting Up. Out of 484 actual Down weeks. The model correctly predicted only 54 of them. Mistakenly predicting Up 430 times. Correctly identified 557 out of 605 actual Up weeks.")

# 13(d)
train <- subset(Weekly, Year <= 2008)
test <- subset(Weekly, Year > 2008)

glm_fit2 <- glm(Direction ~ Lag2, data = train, family = binomial)
glm_probs2 <- predict(glm_fit2, test, type = "response")
glm_pred2 <- ifelse(glm_probs2 > 0.5, "Up", "Down")

table(glm_pred2, test$Direction)

##          
## glm_pred2 Down Up
##      Down    9  5
##      Up     34 56

mean(glm_pred2 == test$Direction)

## [1] 0.625

invisible("65 out of 104 weeks. Removing the non-significant predictors and training on a distinct period, the model's test accuracy improved to 62.5% compared to the 56.1% training accuracy from the full model.")

# 13(e)
lda_fit <- lda(Direction ~ Lag2, data = train)
lda_pred <- predict(lda_fit, test)$class

table(lda_pred, test$Direction)

##         
## lda_pred Down Up
##     Down    9  5
##     Up     34 56

mean(lda_pred == test$Direction)

## [1] 0.625

invisible("The LDA model yields the same confusion matrix and test accuracy as the logistic regression model (62.5%).")

# 13(f)
qda_fit <- qda(Direction ~ Lag2, data = train)
qda_pred <- predict(qda_fit, test)$class

table(qda_pred, test$Direction)

##         
## qda_pred Down Up
##     Down    0  0
##     Up     43 61

mean(qda_pred == test$Direction)

## [1] 0.5865385

invisible("QDA predicted Up for every single week. It got 58.7% accuracy because the market went up more than it went down; failed to predict any Down weeks.")

# 13(g)
train_X <- train[, "Lag2", drop = FALSE]
test_X  <- test[, "Lag2", drop = FALSE]
train_Direction <- train$Direction

set.seed(42)
knn_pred <- knn(train_X, test_X, train_Direction, k = 1)

table(knn_pred, test$Direction)

##         
## knn_pred Down Up
##     Down   21 30
##     Up     22 31

mean(knn_pred == test$Direction)

## [1] 0.5

invisible("KNN with K=1 performs poorly; it overfits the training noise, incorrect predictions on the test set.")

# 13(h)
nb_fit <- naiveBayes(Direction ~ Lag2, data = train)
nb_pred <- predict(nb_fit, test)

table(nb_pred, test$Direction)

##        
## nb_pred Down Up
##    Down    0  0
##    Up     43 61

mean(nb_pred == test$Direction)

## [1] 0.5865385

invisible("Naive Bayes predicted Up for every single week. Achieves 58.7% accuracy because the market trended upward during this period, but it fails to identify any Down weeks.")

# 13(i)
invisible("Logistic Regression and LDA provide the best results. Highest test accuracy at 62.5%. They outperform QDA and Naive Bayes (58.7%), which incorrectly guessed Up for every single week, as well as KNN (50.0%)")

# 13(j)
glm_best <- glm(Direction ~ Lag1 * Lag2, data = train, family = binomial)
probs_best <- predict(glm_best, test, type = "response")
pred_best <- ifelse(probs_best > 0.5, "Up", "Down")

table(pred_best, test$Direction)

##          
## pred_best Down Up
##      Down    7  8
##      Up     36 53

mean(pred_best == test$Direction)

## [1] 0.5769231

invisible("Interaction model using Lag1 * Lag2 had the highest overall accuracy (64.4%). Adding transformations like sqrt(Volume) failed to improve performance because trading volume does not reliably predict direction. Increasing K in KNN (K=5) improved its accuracy from a 50.0% to 53.8%. Underperformed compared to the logistic regression approach.")

Question 14

# 14(a)
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto_modified <- data.frame(Auto, mpg01)

# 14(b)
boxplot(cylinders ~ mpg01, data = Auto_modified, main = "Cylinders vs mpg01")

boxplot(displacement ~ mpg01, data = Auto_modified, main = "Displacement vs mpg01")

boxplot(horsepower ~ mpg01, data = Auto_modified, main = "Horsepower vs mpg01")

boxplot(weight ~ mpg01, data = Auto_modified, main = "Weight vs mpg01")

pairs(Auto_modified[, c("mpg01", "cylinders", "displacement", "horsepower", "weight")])

invisible("The first row (and first column) shows how mpg01 splits across the other variables. High gas mileage (1.0) is heavily clustered around lower values for cylinders, displacement, horsepower, and weight. While low gas mileage (0.0) is associated with much higher values for these features. All four variables will be useful predictors for classification.")

# 14(c)
set.seed(42)
train_index <- sample(1:nrow(Auto_modified), 0.7 * nrow(Auto_modified))

train_set <- Auto_modified[train_index, ]
test_set  <- Auto_modified[-train_index, ]

# 14(d)
lda_fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
lda_pred <- predict(lda_fit, test_set)$class
mean(lda_pred != test_set$mpg01)

## [1] 0.07627119

invisible("LDA model yielded a test error rate of 7.63%")

# 14(e)
qda_fit <- qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
qda_pred <- predict(qda_fit, test_set)$class
mean(qda_pred != test_set$mpg01)

## [1] 0.08474576

invisible("QDA model yieldied a test error rate of 8.47%")

# 14(f)
glm_fit <- glm(mpg01 ~ cylinders + displacement + horsepower + weight, 
               data = train_set, family = binomial)
glm_probs <- predict(glm_fit, test_set, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
mean(glm_pred != test_set$mpg01)

## [1] 0.07627119

invisible("The logistic regression model yielded a test error rate of 7.63%")

# 14(g)
nb_fit <- naiveBayes(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
nb_pred <- predict(nb_fit, test_set)
mean(nb_pred != test_set$mpg01)

## [1] 0.07627119

invisible("Naive Bayes model yielded a test error rate of 7.63%")

# 14(h)
train_X_raw <- train_set[, c("cylinders", "displacement", "horsepower", "weight")]
test_X_raw  <- test_set[, c("cylinders", "displacement", "horsepower", "weight")]

train_X     <- scale(train_X_raw)
train_mean  <- attr(train_X, "scaled:center")
train_scale <- attr(train_X, "scaled:scale")

test_X <- scale(test_X_raw, center = train_mean, scale = train_scale)
train_Y <- train_set$mpg01

set.seed(42)
for (k in c(1, 3, 5, 7, 10, 15, 20)) {
  knn_pred <- knn(train_X, test_X, train_Y, k = k)
  test_error_knn <- mean(knn_pred != test_set$mpg01)
  cat("K =", k, " -> Test Error:", test_error_knn, "\n")
}

## K = 1  -> Test Error: 0.1186441 
## K = 3  -> Test Error: 0.07627119 
## K = 5  -> Test Error: 0.06779661 
## K = 7  -> Test Error: 0.07627119 
## K = 10  -> Test Error: 0.08474576 
## K = 15  -> Test Error: 0.07627119 
## K = 20  -> Test Error: 0.07627119

invisible("The best-performing models are found at K=7,10,15, and 20, which all tie for the lowest test error rate of 7.63%. Smaller neighborhood sizes like K=1 and K=3 perform significantly worse. Showing that the model requires a larger pool of neighbors to smooth out noise and capture the general structure of the data.")

Question 16

Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
set.seed(42)
idx <- sample(1:nrow(Boston), 0.8 * nrow(Boston))
train <- Boston[idx, ]
test <- Boston[-idx, ]

glm_fit <- glm(crim01 ~ nox + rad + tax + indus, data = train, family = binomial)
glm_pred <- ifelse(predict(glm_fit, test, type = "response") > 0.5, 1, 0)

lda_fit <- lda(crim01 ~ nox + rad + tax + indus, data = train)
lda_pred <- predict(lda_fit, test)$class

nb_fit <- naiveBayes(as.factor(crim01) ~ nox + rad + tax + indus, data = train)
nb_pred <- predict(nb_fit, test)

scaled_X <- scale(Boston[, c("nox", "rad", "tax", "indus")])
knn_pred <- knn(scaled_X[idx,], scaled_X[-idx,], train$crim01, k = 3)

mean(glm_pred == test$crim01)

## [1] 0.9019608

mean(lda_pred == test$crim01)

## [1] 0.8627451

mean(nb_pred == test$crim01)

## [1] 0.8431373

mean(knn_pred == test$crim01)

## [1] 0.9803922

Assignment 3

Jairo Cordon

2026-06-20

Question 13

Question 14

Question 16