1 Exercise 13 — The Weekly data set

The Weekly data set contains \(1{,}089\) weekly percentage returns for \(21\) years (1990–2010).

1.1 (a) Numerical and graphical summaries

summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
round(cor(Weekly[, -9]), 3)   # exclude the qualitative Direction
##          Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today
## Year    1.000 -0.032 -0.033 -0.030 -0.031 -0.031  0.842 -0.032
## Lag1   -0.032  1.000 -0.075  0.059 -0.071 -0.008 -0.065 -0.075
## Lag2   -0.033 -0.075  1.000 -0.076  0.058 -0.072 -0.086  0.059
## Lag3   -0.030  0.059 -0.076  1.000 -0.075  0.061 -0.069 -0.071
## Lag4   -0.031 -0.071  0.058 -0.075  1.000 -0.076 -0.061 -0.008
## Lag5   -0.031 -0.008 -0.072  0.061 -0.076  1.000 -0.059  0.011
## Volume  0.842 -0.065 -0.086 -0.069 -0.061 -0.059  1.000 -0.033
## Today  -0.032 -0.075  0.059 -0.071 -0.008  0.011 -0.033  1.000
plot(Weekly$Volume, type = "l", ylab = "Volume", xlab = "Week (1990-2010)",
     main = "Trading volume over time")

The only strong correlation is between Year and Volume (\(0.84\)): trading volume grows steadily over the period (the plot rises over time). The five lag variables and Today are essentially uncorrelated with one another, so there is little linear structure to exploit.

1.2 (b) Logistic regression on the full data

glm.full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data = Weekly, family = binomial)
summary(glm.full)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Only Lag2 is statistically significant (\(p \approx 0.03\), positive coefficient); none of the other predictors are significant.

1.3 (c) Confusion matrix on the full data

glm.probs <- predict(glm.full, type = "response")
glm.pred  <- ifelse(glm.probs > 0.5, "Up", "Down")
table(Predicted = glm.pred, Actual = Weekly$Direction)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
mean(glm.pred == Weekly$Direction)
## [1] 0.5610652

Overall accuracy is about \(56.1\%\) — only slightly better than always guessing. The model predicts “Up” the vast majority of the time: it captures almost all true “Up” weeks (high sensitivity) but misclassifies most “Down” weeks (very low specificity). On the off-diagonal, the dominant error is calling a “Down” week “Up”.

1.4 (d) Logistic regression: train 1990–2008, test 2009–2010, Lag2 only

train <- Weekly$Year <= 2008
test.Direction <- Weekly$Direction[!train]

acc <- function(pred) mean(pred == test.Direction)   # helper

glm.fit  <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm.pred <- ifelse(predict(glm.fit, Weekly[!train, ], type = "response") > 0.5, "Up", "Down")
table(Predicted = glm.pred, Actual = test.Direction)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
acc(glm.pred)
## [1] 0.625

1.5 (e) LDA

lda.fit  <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred <- predict(lda.fit, Weekly[!train, ])$class
table(Predicted = lda.pred, Actual = test.Direction)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
acc(lda.pred)
## [1] 0.625

1.6 (f) QDA

qda.fit  <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred <- predict(qda.fit, Weekly[!train, ])$class
table(Predicted = qda.pred, Actual = test.Direction)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
acc(qda.pred)
## [1] 0.5865385

1.7 (g) KNN with \(K = 1\)

train.X <- matrix(Weekly$Lag2[train])
test.X  <- matrix(Weekly$Lag2[!train])
train.Y <- Weekly$Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
table(Predicted = knn.pred, Actual = test.Direction)
##          Actual
## Predicted Down Up
##      Down   21 30
##      Up     22 31
acc(knn.pred)
## [1] 0.5

1.8 (h) Naive Bayes

nb.fit  <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.pred <- predict(nb.fit, Weekly[!train, ])
table(Predicted = nb.pred, Actual = test.Direction)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
acc(nb.pred)
## [1] 0.5865385

1.9 (i) Which method is best?

data.frame(
  method   = c("Logistic", "LDA", "QDA", "KNN (K=1)", "Naive Bayes"),
  accuracy = round(c(acc(glm.pred), acc(lda.pred), acc(qda.pred),
                     acc(knn.pred), acc(nb.pred)), 4)
)
##        method accuracy
## 1    Logistic   0.6250
## 2         LDA   0.6250
## 3         QDA   0.5865
## 4   KNN (K=1)   0.5000
## 5 Naive Bayes   0.5865

Logistic regression and LDA tie for the best test accuracy (\(\approx 62.5\%\)). QDA and naive Bayes both score \(\approx 58.7\%\) because they predict “Up” for every test week, and \(KNN(1)\) is worst at about \(50\%\) (pure noise).

1.10 (j) Experimenting with predictors and \(K\)

# a few logistic / KNN variants on the same train/test split
res <- list()

m <- glm(Direction ~ Lag1:Lag2, data = Weekly, family = binomial, subset = train)
p <- ifelse(predict(m, Weekly[!train, ], type = "response") > 0.5, "Up", "Down")
res[["logistic Lag1:Lag2"]] <- acc(p)

m <- glm(Direction ~ Lag2 + I(Lag1^2), data = Weekly, family = binomial, subset = train)
p <- ifelse(predict(m, Weekly[!train, ], type = "response") > 0.5, "Up", "Down")
res[["logistic Lag2 + Lag1^2"]] <- acc(p)

for (k in c(5, 10, 30, 50, 100)) {
  set.seed(1)
  p <- knn(train.X, test.X, train.Y, k = k)
  res[[paste0("KNN K=", k)]] <- acc(p)
}
round(unlist(res), 4)
##     logistic Lag1:Lag2 logistic Lag2 + Lag1^2                KNN K=5 
##                 0.5865                 0.6442                 0.5385 
##               KNN K=10               KNN K=30               KNN K=50 
##                 0.5481                 0.5481                 0.5577 
##              KNN K=100 
##                 0.5769
# best single model found: logistic regression on Lag2 alone (from part d)
table(Predicted = glm.pred, Actual = test.Direction)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
acc(glm.pred)
## [1] 0.625

Among everything tried, the simplest model — logistic regression (equivalently LDA) on Lag2 alone, \(\approx 62.5\%\) — gives the best held-out accuracy. Larger \(K\) in KNN helps relative to \(K=1\) (KNN with \(K \approx 50\) climbs back toward \(\sim 58\%\)), but no interaction or transformation beats the plain Lag2 model. Its confusion matrix is shown above.

2 Exercise 14 — Predicting high/low gas mileage on Auto

2.1 (a) Create the binary response mpg01

Auto2 <- Auto
Auto2$mpg01 <- as.integer(Auto2$mpg > median(Auto2$mpg))
table(Auto2$mpg01)
## 
##   0   1 
## 196 196

2.2 (b) Explore the association graphically

num <- Auto2[, !(names(Auto2) %in% c("name", "mpg"))]
round(cor(num)[, "mpg01"], 3)
##    cylinders displacement   horsepower       weight acceleration         year 
##       -0.759       -0.753       -0.667       -0.758        0.347        0.430 
##       origin        mpg01 
##        0.514        1.000
par(mfrow = c(2, 3))
for (v in c("cylinders", "displacement", "horsepower", "weight", "acceleration", "year"))
  boxplot(Auto2[[v]] ~ Auto2$mpg01, xlab = "mpg01", ylab = v, col = c("tomato", "skyblue"))

cylinders, displacement, horsepower, and weight are the most strongly (negatively) associated with mpg01: high-mileage cars (mpg01 \(=1\)) clearly have fewer cylinders, smaller engines, less power, and lower weight. acceleration, year, and origin show weaker separation. We therefore use cylinders, displacement, horsepower, weight as predictors.

2.3 (c) Train/test split

set.seed(1)
train.idx <- sample(nrow(Auto2), 0.7 * nrow(Auto2))
train <- rep(FALSE, nrow(Auto2)); train[train.idx] <- TRUE
preds <- c("cylinders", "displacement", "horsepower", "weight")
fml   <- as.formula(paste("mpg01 ~", paste(preds, collapse = " + ")))
y.test <- Auto2$mpg01[!train]
err <- function(pred) mean(pred != y.test)   # test error

2.4 (d) LDA

lda.fit  <- lda(fml, data = Auto2, subset = train)
lda.pred <- predict(lda.fit, Auto2[!train, ])$class
err(as.integer(as.character(lda.pred)))
## [1] 0.1186441

2.5 (e) QDA

qda.fit  <- qda(fml, data = Auto2, subset = train)
qda.pred <- predict(qda.fit, Auto2[!train, ])$class
err(as.integer(as.character(qda.pred)))
## [1] 0.1186441

2.6 (f) Logistic regression

glm.fit  <- glm(fml, data = Auto2, family = binomial, subset = train)
glm.pred <- as.integer(predict(glm.fit, Auto2[!train, ], type = "response") > 0.5)
err(glm.pred)
## [1] 0.09322034

2.7 (g) Naive Bayes

nb.fit  <- naiveBayes(fml, data = Auto2, subset = train)
nb.pred <- predict(nb.fit, Auto2[!train, ])
err(as.integer(as.character(nb.pred)))
## [1] 0.1101695

2.8 (h) KNN with several values of \(K\)

train.X <- scale(Auto2[, preds])[train, ]
test.X  <- scale(Auto2[, preds])[!train, ]
train.Y <- Auto2$mpg01[train]
knn.err <- sapply(c(1, 3, 5, 10, 20, 50, 100), function(k) {
  set.seed(1)
  mean(knn(train.X, test.X, train.Y, k = k) != y.test)
})
names(knn.err) <- paste0("K=", c(1, 3, 5, 10, 20, 50, 100))
round(knn.err, 4)
##    K=1    K=3    K=5   K=10   K=20   K=50  K=100 
## 0.1271 0.1271 0.1356 0.1356 0.1102 0.1102 0.1186
data.frame(
  method     = c("LDA", "QDA", "Logistic", "Naive Bayes", paste0("KNN best (", names(which.min(knn.err)), ")")),
  test_error = round(c(err(as.integer(as.character(lda.pred))),
                       err(as.integer(as.character(qda.pred))),
                       err(glm.pred),
                       err(as.integer(as.character(nb.pred))),
                       min(knn.err)), 4)
)
##            method test_error
## 1             LDA     0.1186
## 2             QDA     0.1186
## 3        Logistic     0.0932
## 4     Naive Bayes     0.1102
## 5 KNN best (K=20)     0.1102

All methods achieve a low test error (roughly \(9\)\(12\%\)) using these four predictors. The linear/logistic methods and the best KNN are very close; KNN does best at a moderate \(K\) (small \(K\) overfits, very large \(K\) oversmooths).

3 Exercise 16 — Predicting high/low crime in Boston

We create a binary response crim01 (1 if a suburb’s crime rate is above the median, else 0) and predict it from the other variables.

Boston2 <- Boston
Boston2$crim01 <- as.integer(Boston$crim > median(Boston$crim))
round(cor(Boston2)[, "crim01"], 3)
##    crim      zn   indus    chas     nox      rm     age     dis     rad     tax 
##   0.409  -0.436   0.603   0.070   0.723  -0.156   0.614  -0.616   0.620   0.609 
## ptratio   black   lstat    medv  crim01 
##   0.254  -0.351   0.453  -0.263   1.000

nox, rad, tax, age, indus, and dis are the most associated with crim01, so we consider both the full predictor set and this informative subset.

set.seed(1)
train.idx <- sample(nrow(Boston2), 0.7 * nrow(Boston2))
train <- rep(FALSE, nrow(Boston2)); train[train.idx] <- TRUE
y.test <- Boston2$crim01[!train]
err <- function(pred) mean(pred != y.test)

full   <- setdiff(names(Boston2), c("crim", "crim01"))
subset <- c("nox", "rad", "tax", "age", "indus", "dis")
f_full <- as.formula(paste("crim01 ~", paste(full,   collapse = " + ")))
f_sub  <- as.formula(paste("crim01 ~", paste(subset, collapse = " + ")))
fit_eval <- function(fml, predset, label) {
  # logistic
  g  <- glm(fml, data = Boston2, family = binomial, subset = train)
  gp <- as.integer(predict(g, Boston2[!train, ], type = "response") > 0.5)
  # LDA
  l  <- lda(fml, data = Boston2, subset = train)
  lp <- as.integer(as.character(predict(l, Boston2[!train, ])$class))
  # naive Bayes
  nb <- naiveBayes(fml, data = Boston2, subset = train)
  np <- as.integer(as.character(predict(nb, Boston2[!train, ])))
  # KNN (best K over a grid)
  trX <- scale(Boston2[, predset])[train, ]
  teX <- scale(Boston2[, predset])[!train, ]
  trY <- Boston2$crim01[train]
  ke  <- sapply(c(1, 3, 5, 10, 20, 50), function(k) {
    set.seed(1); mean(knn(trX, teX, trY, k = k) != y.test) })
  data.frame(predictors = label,
             logistic = round(err(gp), 4), LDA = round(err(lp), 4),
             naiveBayes = round(err(np), 4),
             KNN_best = round(min(ke), 4),
             KNN_K = c(1,3,5,10,20,50)[which.min(ke)])
}
rbind(
  fit_eval(f_full, full,   "all predictors"),
  fit_eval(f_sub,  subset, "nox,rad,tax,age,indus,dis")
)
##                  predictors logistic    LDA naiveBayes KNN_best KNN_K
## 1            all predictors   0.1053 0.1513     0.1645   0.0461     3
## 2 nox,rad,tax,age,indus,dis   0.1382 0.1447     0.1776   0.0724     3

Findings. All four methods predict whether a suburb is above the median crime rate with low test error. KNN (small \(K\)) gives the lowest error (around \(5\)\(8\%\)), followed closely by naive Bayes and logistic regression; LDA is a little worse because the predictors (especially nox, rad, and tax) are far from normally distributed and have different spreads in the two classes, which violates LDA’s equal-covariance/normality assumptions. Restricting to the informative subset (nox, rad, tax, age, indus, dis) performs about as well as using all predictors, confirming that crime is driven mainly by accessibility to highways (rad), tax rate (tax), pollution (nox), and the age/industrial character of the area.