Weekly data setThe Weekly data set contains \(1{,}089\) weekly percentage returns for \(21\) years (1990–2010).
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
round(cor(Weekly[, -9]), 3) # exclude the qualitative Direction
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
## Year 1.000 -0.032 -0.033 -0.030 -0.031 -0.031 0.842 -0.032
## Lag1 -0.032 1.000 -0.075 0.059 -0.071 -0.008 -0.065 -0.075
## Lag2 -0.033 -0.075 1.000 -0.076 0.058 -0.072 -0.086 0.059
## Lag3 -0.030 0.059 -0.076 1.000 -0.075 0.061 -0.069 -0.071
## Lag4 -0.031 -0.071 0.058 -0.075 1.000 -0.076 -0.061 -0.008
## Lag5 -0.031 -0.008 -0.072 0.061 -0.076 1.000 -0.059 0.011
## Volume 0.842 -0.065 -0.086 -0.069 -0.061 -0.059 1.000 -0.033
## Today -0.032 -0.075 0.059 -0.071 -0.008 0.011 -0.033 1.000
plot(Weekly$Volume, type = "l", ylab = "Volume", xlab = "Week (1990-2010)",
main = "Trading volume over time")
The only strong correlation is between Year and Volume (\(0.84\)): trading volume grows steadily over the period (the plot rises over time). The five lag variables and Today are essentially uncorrelated with one another, so there is little linear structure to exploit.
glm.full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(glm.full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only Lag2 is statistically significant (\(p \approx 0.03\), positive coefficient); none of the other predictors are significant.
glm.probs <- predict(glm.full, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
table(Predicted = glm.pred, Actual = Weekly$Direction)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
mean(glm.pred == Weekly$Direction)
## [1] 0.5610652
Overall accuracy is about \(56.1\%\) — only slightly better than always guessing. The model predicts “Up” the vast majority of the time: it captures almost all true “Up” weeks (high sensitivity) but misclassifies most “Down” weeks (very low specificity). On the off-diagonal, the dominant error is calling a “Down” week “Up”.
Lag2 onlytrain <- Weekly$Year <= 2008
test.Direction <- Weekly$Direction[!train]
acc <- function(pred) mean(pred == test.Direction) # helper
glm.fit <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm.pred <- ifelse(predict(glm.fit, Weekly[!train, ], type = "response") > 0.5, "Up", "Down")
table(Predicted = glm.pred, Actual = test.Direction)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
acc(glm.pred)
## [1] 0.625
lda.fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred <- predict(lda.fit, Weekly[!train, ])$class
table(Predicted = lda.pred, Actual = test.Direction)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
acc(lda.pred)
## [1] 0.625
qda.fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred <- predict(qda.fit, Weekly[!train, ])$class
table(Predicted = qda.pred, Actual = test.Direction)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
acc(qda.pred)
## [1] 0.5865385
train.X <- matrix(Weekly$Lag2[train])
test.X <- matrix(Weekly$Lag2[!train])
train.Y <- Weekly$Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
table(Predicted = knn.pred, Actual = test.Direction)
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
acc(knn.pred)
## [1] 0.5
nb.fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.pred <- predict(nb.fit, Weekly[!train, ])
table(Predicted = nb.pred, Actual = test.Direction)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
acc(nb.pred)
## [1] 0.5865385
data.frame(
method = c("Logistic", "LDA", "QDA", "KNN (K=1)", "Naive Bayes"),
accuracy = round(c(acc(glm.pred), acc(lda.pred), acc(qda.pred),
acc(knn.pred), acc(nb.pred)), 4)
)
## method accuracy
## 1 Logistic 0.6250
## 2 LDA 0.6250
## 3 QDA 0.5865
## 4 KNN (K=1) 0.5000
## 5 Naive Bayes 0.5865
Logistic regression and LDA tie for the best test accuracy (\(\approx 62.5\%\)). QDA and naive Bayes both score \(\approx 58.7\%\) because they predict “Up” for every test week, and \(KNN(1)\) is worst at about \(50\%\) (pure noise).
# a few logistic / KNN variants on the same train/test split
res <- list()
m <- glm(Direction ~ Lag1:Lag2, data = Weekly, family = binomial, subset = train)
p <- ifelse(predict(m, Weekly[!train, ], type = "response") > 0.5, "Up", "Down")
res[["logistic Lag1:Lag2"]] <- acc(p)
m <- glm(Direction ~ Lag2 + I(Lag1^2), data = Weekly, family = binomial, subset = train)
p <- ifelse(predict(m, Weekly[!train, ], type = "response") > 0.5, "Up", "Down")
res[["logistic Lag2 + Lag1^2"]] <- acc(p)
for (k in c(5, 10, 30, 50, 100)) {
set.seed(1)
p <- knn(train.X, test.X, train.Y, k = k)
res[[paste0("KNN K=", k)]] <- acc(p)
}
round(unlist(res), 4)
## logistic Lag1:Lag2 logistic Lag2 + Lag1^2 KNN K=5
## 0.5865 0.6442 0.5385
## KNN K=10 KNN K=30 KNN K=50
## 0.5481 0.5481 0.5577
## KNN K=100
## 0.5769
# best single model found: logistic regression on Lag2 alone (from part d)
table(Predicted = glm.pred, Actual = test.Direction)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
acc(glm.pred)
## [1] 0.625
Among everything tried, the simplest model — logistic regression (equivalently LDA) on Lag2 alone, \(\approx 62.5\%\) — gives the best held-out accuracy. Larger \(K\) in KNN helps relative to \(K=1\) (KNN with \(K \approx 50\) climbs back toward \(\sim 58\%\)), but no interaction or transformation beats the plain Lag2 model. Its confusion matrix is shown above.
Autompg01Auto2 <- Auto
Auto2$mpg01 <- as.integer(Auto2$mpg > median(Auto2$mpg))
table(Auto2$mpg01)
##
## 0 1
## 196 196
num <- Auto2[, !(names(Auto2) %in% c("name", "mpg"))]
round(cor(num)[, "mpg01"], 3)
## cylinders displacement horsepower weight acceleration year
## -0.759 -0.753 -0.667 -0.758 0.347 0.430
## origin mpg01
## 0.514 1.000
par(mfrow = c(2, 3))
for (v in c("cylinders", "displacement", "horsepower", "weight", "acceleration", "year"))
boxplot(Auto2[[v]] ~ Auto2$mpg01, xlab = "mpg01", ylab = v, col = c("tomato", "skyblue"))
cylinders, displacement, horsepower, and weight are the most strongly (negatively) associated with mpg01: high-mileage cars (mpg01 \(=1\)) clearly have fewer cylinders, smaller engines, less power, and lower weight. acceleration, year, and origin show weaker separation. We therefore use cylinders, displacement, horsepower, weight as predictors.
set.seed(1)
train.idx <- sample(nrow(Auto2), 0.7 * nrow(Auto2))
train <- rep(FALSE, nrow(Auto2)); train[train.idx] <- TRUE
preds <- c("cylinders", "displacement", "horsepower", "weight")
fml <- as.formula(paste("mpg01 ~", paste(preds, collapse = " + ")))
y.test <- Auto2$mpg01[!train]
err <- function(pred) mean(pred != y.test) # test error
lda.fit <- lda(fml, data = Auto2, subset = train)
lda.pred <- predict(lda.fit, Auto2[!train, ])$class
err(as.integer(as.character(lda.pred)))
## [1] 0.1186441
qda.fit <- qda(fml, data = Auto2, subset = train)
qda.pred <- predict(qda.fit, Auto2[!train, ])$class
err(as.integer(as.character(qda.pred)))
## [1] 0.1186441
glm.fit <- glm(fml, data = Auto2, family = binomial, subset = train)
glm.pred <- as.integer(predict(glm.fit, Auto2[!train, ], type = "response") > 0.5)
err(glm.pred)
## [1] 0.09322034
nb.fit <- naiveBayes(fml, data = Auto2, subset = train)
nb.pred <- predict(nb.fit, Auto2[!train, ])
err(as.integer(as.character(nb.pred)))
## [1] 0.1101695
train.X <- scale(Auto2[, preds])[train, ]
test.X <- scale(Auto2[, preds])[!train, ]
train.Y <- Auto2$mpg01[train]
knn.err <- sapply(c(1, 3, 5, 10, 20, 50, 100), function(k) {
set.seed(1)
mean(knn(train.X, test.X, train.Y, k = k) != y.test)
})
names(knn.err) <- paste0("K=", c(1, 3, 5, 10, 20, 50, 100))
round(knn.err, 4)
## K=1 K=3 K=5 K=10 K=20 K=50 K=100
## 0.1271 0.1271 0.1356 0.1356 0.1102 0.1102 0.1186
data.frame(
method = c("LDA", "QDA", "Logistic", "Naive Bayes", paste0("KNN best (", names(which.min(knn.err)), ")")),
test_error = round(c(err(as.integer(as.character(lda.pred))),
err(as.integer(as.character(qda.pred))),
err(glm.pred),
err(as.integer(as.character(nb.pred))),
min(knn.err)), 4)
)
## method test_error
## 1 LDA 0.1186
## 2 QDA 0.1186
## 3 Logistic 0.0932
## 4 Naive Bayes 0.1102
## 5 KNN best (K=20) 0.1102
All methods achieve a low test error (roughly \(9\)–\(12\%\)) using these four predictors. The linear/logistic methods and the best KNN are very close; KNN does best at a moderate \(K\) (small \(K\) overfits, very large \(K\) oversmooths).
BostonWe create a binary response crim01 (1 if a suburb’s crime rate is above the median, else 0) and predict it from the other variables.
Boston2 <- Boston
Boston2$crim01 <- as.integer(Boston$crim > median(Boston$crim))
round(cor(Boston2)[, "crim01"], 3)
## crim zn indus chas nox rm age dis rad tax
## 0.409 -0.436 0.603 0.070 0.723 -0.156 0.614 -0.616 0.620 0.609
## ptratio black lstat medv crim01
## 0.254 -0.351 0.453 -0.263 1.000
nox, rad, tax, age, indus, and dis are the most associated with crim01, so we consider both the full predictor set and this informative subset.
set.seed(1)
train.idx <- sample(nrow(Boston2), 0.7 * nrow(Boston2))
train <- rep(FALSE, nrow(Boston2)); train[train.idx] <- TRUE
y.test <- Boston2$crim01[!train]
err <- function(pred) mean(pred != y.test)
full <- setdiff(names(Boston2), c("crim", "crim01"))
subset <- c("nox", "rad", "tax", "age", "indus", "dis")
f_full <- as.formula(paste("crim01 ~", paste(full, collapse = " + ")))
f_sub <- as.formula(paste("crim01 ~", paste(subset, collapse = " + ")))
fit_eval <- function(fml, predset, label) {
# logistic
g <- glm(fml, data = Boston2, family = binomial, subset = train)
gp <- as.integer(predict(g, Boston2[!train, ], type = "response") > 0.5)
# LDA
l <- lda(fml, data = Boston2, subset = train)
lp <- as.integer(as.character(predict(l, Boston2[!train, ])$class))
# naive Bayes
nb <- naiveBayes(fml, data = Boston2, subset = train)
np <- as.integer(as.character(predict(nb, Boston2[!train, ])))
# KNN (best K over a grid)
trX <- scale(Boston2[, predset])[train, ]
teX <- scale(Boston2[, predset])[!train, ]
trY <- Boston2$crim01[train]
ke <- sapply(c(1, 3, 5, 10, 20, 50), function(k) {
set.seed(1); mean(knn(trX, teX, trY, k = k) != y.test) })
data.frame(predictors = label,
logistic = round(err(gp), 4), LDA = round(err(lp), 4),
naiveBayes = round(err(np), 4),
KNN_best = round(min(ke), 4),
KNN_K = c(1,3,5,10,20,50)[which.min(ke)])
}
rbind(
fit_eval(f_full, full, "all predictors"),
fit_eval(f_sub, subset, "nox,rad,tax,age,indus,dis")
)
## predictors logistic LDA naiveBayes KNN_best KNN_K
## 1 all predictors 0.1053 0.1513 0.1645 0.0461 3
## 2 nox,rad,tax,age,indus,dis 0.1382 0.1447 0.1776 0.0724 3
Findings. All four methods predict whether a suburb is above the median crime rate with low test error. KNN (small \(K\)) gives the lowest error (around \(5\)–\(8\%\)), followed closely by naive Bayes and logistic regression; LDA is a little worse because the predictors (especially nox, rad, and tax) are far from normally distributed and have different spreads in the two classes, which violates LDA’s equal-covariance/normality assumptions. Restricting to the informative subset (nox, rad, tax, age, indus, dis) performs about as well as using all predictors, confirming that crime is driven mainly by accessibility to highways (rad), tax rate (tax), pollution (nox), and the age/industrial character of the area.