data(Weekly, package = "ISLR2")
cat("Dimensions:", nrow(Weekly), "rows x", ncol(Weekly), "columns\n")## Dimensions: 1089 rows x 9 columns
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
## Year 1.000 -0.032 -0.033 -0.030 -0.031 -0.031 0.842 -0.032
## Lag1 -0.032 1.000 -0.075 0.059 -0.071 -0.008 -0.065 -0.075
## Lag2 -0.033 -0.075 1.000 -0.076 0.058 -0.072 -0.086 0.059
## Lag3 -0.030 0.059 -0.076 1.000 -0.075 0.061 -0.069 -0.071
## Lag4 -0.031 -0.071 0.058 -0.075 1.000 -0.076 -0.061 -0.008
## Lag5 -0.031 -0.008 -0.072 0.061 -0.076 1.000 -0.059 0.011
## Volume 0.842 -0.065 -0.086 -0.069 -0.061 -0.059 1.000 -0.033
## Today -0.032 -0.075 0.059 -0.071 -0.008 0.011 -0.033 1.000
par(mfrow = c(2, 3))
for (v in c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume")) {
boxplot(Weekly[[v]] ~ Weekly$Direction,
main = v,
xlab = "Direction",
ylab = v,
col = c("salmon", "steelblue"))
}plot(Weekly$Year, Weekly$Volume,
xlab = "Year", ylab = "Volume",
main = "Trading Volume over Time",
pch = 20, col = "steelblue", cex = 0.6)The correlation matrix doesn’t show much between the lag variables
and Direction. The only clear pattern is that
Volume has been climbing steadily over the years. The lag
variables themselves barely separate the Up and Down weeks in the
boxplots.
fit_all <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(fit_all)##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only Lag2 is significant at the 5% level. Everything
else has large p-values.
prob_all <- predict(fit_all, type = "response")
pred_all <- ifelse(prob_all > 0.5, "Up", "Down")
table(Predicted = pred_all, Truth = Weekly$Direction)## Truth
## Predicted Down Up
## Down 54 48
## Up 430 557
## Overall fraction correct: 0.5611
The overall accuracy looks okay at first glance, but the confusion matrix tells the real story. The model is calling almost everything “Up” — it catches most of the actual Up weeks but misses almost all the Down weeks. So it’s not really learning anything useful, just predicting the majority class.
train <- Weekly$Year <= 2008
w_train <- Weekly[train, ]
w_test <- Weekly[!train, ]
fit_d <- glm(Direction ~ Lag2, data = w_train, family = binomial)
prob_d <- predict(fit_d, newdata = w_test, type = "response")
pred_d <- ifelse(prob_d > 0.5, "Up", "Down")
table(Predicted = pred_d, Truth = w_test$Direction)## Truth
## Predicted Down Up
## Down 9 5
## Up 34 56
## Logistic (Lag2) test accuracy: 0.625
fit_lda_d <- lda(Direction ~ Lag2, data = w_train)
pred_lda_d <- predict(fit_lda_d, newdata = w_test)$class
table(Predicted = pred_lda_d, Truth = w_test$Direction)## Truth
## Predicted Down Up
## Down 9 5
## Up 34 56
## LDA test accuracy: 0.625
fit_qda_d <- qda(Direction ~ Lag2, data = w_train)
pred_qda_d <- predict(fit_qda_d, newdata = w_test)$class
table(Predicted = pred_qda_d, Truth = w_test$Direction)## Truth
## Predicted Down Up
## Down 0 0
## Up 43 61
## QDA test accuracy: 0.5865
set.seed(1)
train_X <- as.matrix(w_train$Lag2)
test_X <- as.matrix(w_test$Lag2)
pred_knn1 <- knn(train_X, test_X, w_train$Direction, k = 1)
table(Predicted = pred_knn1, Truth = w_test$Direction)## Truth
## Predicted Down Up
## Down 21 30
## Up 22 31
## KNN (K=1) test accuracy: 0.5
fit_nb_d <- naiveBayes(Direction ~ Lag2, data = w_train)
pred_nb_d <- predict(fit_nb_d, newdata = w_test)
table(Predicted = pred_nb_d, Truth = w_test$Direction)## Truth
## Predicted Down Up
## Down 0 0
## Up 43 61
## Naive Bayes test accuracy: 0.5865
Logistic regression and LDA tie for the best accuracy on the test set. QDA and naive Bayes both default to predicting “Up” almost every time, which gives a passable accuracy number but isn’t actually doing anything useful. KNN with K = 1 is too noisy. For this predictor, logistic regression or LDA are the better options.
# Logistic: Lag1 + Lag2
fit_j1 <- glm(Direction ~ Lag1 + Lag2, data = w_train, family = binomial)
pred_j1 <- ifelse(predict(fit_j1, newdata = w_test, type = "response") > 0.5, "Up", "Down")
cat("Logistic Lag1+Lag2 :", round(mean(pred_j1 == w_test$Direction), 4), "\n")## Logistic Lag1+Lag2 : 0.5769
# Logistic: Lag2 + Volume
fit_j2 <- glm(Direction ~ Lag2 + Volume, data = w_train, family = binomial)
pred_j2 <- ifelse(predict(fit_j2, newdata = w_test, type = "response") > 0.5, "Up", "Down")
cat("Logistic Lag2+Volume :", round(mean(pred_j2 == w_test$Direction), 4), "\n")## Logistic Lag2+Volume : 0.5385
# Logistic: Lag2 quadratic
fit_j3 <- glm(Direction ~ Lag2 + I(Lag2^2), data = w_train, family = binomial)
pred_j3 <- ifelse(predict(fit_j3, newdata = w_test, type = "response") > 0.5, "Up", "Down")
cat("Logistic Lag2 + Lag2^2 :", round(mean(pred_j3 == w_test$Direction), 4), "\n")## Logistic Lag2 + Lag2^2 : 0.625
# LDA: Lag1 + Lag2
fit_lda_j <- lda(Direction ~ Lag1 + Lag2, data = w_train)
pred_lda_j <- predict(fit_lda_j, newdata = w_test)$class
cat("LDA Lag1+Lag2 :", round(mean(pred_lda_j == w_test$Direction), 4), "\n")## LDA Lag1+Lag2 : 0.5769
# KNN with several K values on Lag2
set.seed(1)
for (k in c(3, 5, 10, 20)) {
pk <- knn(train_X, test_X, w_train$Direction, k = k)
cat("KNN K =", k, " :", round(mean(pk == w_test$Direction), 4), "\n")
}## KNN K = 3 : 0.5481
## KNN K = 5 : 0.5385
## KNN K = 10 : 0.5865
## KNN K = 20 : 0.5962
Adding more predictors doesn’t consistently help — the Lag2-only
models stay competitive. KNN improves with larger K compared to K = 1,
but still doesn’t beat logistic regression or LDA. The best overall
approach on this dataset is LDA or logistic regression with just
Lag2.
## Best model: LDA with Lag2
## Truth
## Predicted Down Up
## Down 9 5
## Up 34 56
## Test accuracy: 0.625
data(Auto, package = "ISLR2")
Auto <- na.omit(Auto)
cat("Dimensions:", nrow(Auto), "rows x", ncol(Auto), "columns\n")## Dimensions: 392 rows x 9 columns
Auto$mpg01 <- factor(ifelse(Auto$mpg > median(Auto$mpg), 1, 0))
cat("Median mpg:", median(Auto$mpg), "\n")## Median mpg: 22.75
##
## 0 1
## 196 196
par(mfrow = c(2, 3))
for (v in c("cylinders", "displacement", "horsepower", "weight", "acceleration", "year")) {
boxplot(Auto[[v]] ~ Auto$mpg01,
main = v,
xlab = "mpg01 (0 = low, 1 = high)",
ylab = v,
col = c("salmon", "steelblue"),
names = c("Low mpg", "High mpg"))
}pairs(Auto[, c("cylinders", "displacement", "horsepower",
"weight", "acceleration", "year")],
col = ifelse(Auto$mpg01 == 1, "steelblue", "salmon"),
pch = 20,
cex = 0.5,
main = "Scatterplot Matrix (blue = high mpg, red = low mpg)")round(cor(Auto[, c("cylinders", "displacement", "horsepower",
"weight", "acceleration", "year")])[, 1:6], 3)## cylinders displacement horsepower weight acceleration year
## cylinders 1.000 0.951 0.843 0.898 -0.505 -0.346
## displacement 0.951 1.000 0.897 0.933 -0.544 -0.370
## horsepower 0.843 0.897 1.000 0.865 -0.689 -0.416
## weight 0.898 0.933 0.865 1.000 -0.417 -0.309
## acceleration -0.505 -0.544 -0.689 -0.417 1.000 0.290
## year -0.346 -0.370 -0.416 -0.309 0.290 1.000
The boxplots make the separation pretty clear.
cylinders, displacement,
horsepower, and weight all look strongly
negatively associated with mpg01 — high-mpg cars tend to
have fewer cylinders, smaller engines, less horsepower, and lower
weight. year goes the other way: newer cars are more likely
to fall in the high-mpg group, which makes sense given fuel efficiency
improvements over time. acceleration shows weaker
separation and is less useful as a predictor. Based on this, I’ll use
cylinders, displacement,
horsepower, weight, and year as
predictors going forward.
set.seed(1)
n_auto <- nrow(Auto)
tr_idx <- sample(n_auto, floor(n_auto * 0.7))
auto_tr <- Auto[tr_idx, ]
auto_te <- Auto[-tr_idx, ]
cat("Training:", nrow(auto_tr), " Test:", nrow(auto_te), "\n")## Training: 274 Test: 118
fit_lda14 <- lda(mpg01 ~ cylinders + displacement + horsepower + weight + year,
data = auto_tr)
pred_lda14 <- predict(fit_lda14, newdata = auto_te)$class
table(Predicted = pred_lda14, Truth = auto_te$mpg01)## Truth
## Predicted 0 1
## 0 48 1
## 1 13 56
## LDA test error: 0.1186
fit_qda14 <- qda(mpg01 ~ cylinders + displacement + horsepower + weight + year,
data = auto_tr)
pred_qda14 <- predict(fit_qda14, newdata = auto_te)$class
table(Predicted = pred_qda14, Truth = auto_te$mpg01)## Truth
## Predicted 0 1
## 0 52 4
## 1 9 53
## QDA test error: 0.1102
fit_log14 <- glm(mpg01 ~ cylinders + displacement + horsepower + weight + year,
data = auto_tr,
family = binomial)
summary(fit_log14)##
## Call:
## glm(formula = mpg01 ~ cylinders + displacement + horsepower +
## weight + year, family = binomial, data = auto_tr)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.990747 5.524488 -2.532 0.011325 *
## cylinders 0.155724 0.475377 0.328 0.743230
## displacement -0.009207 0.011764 -0.783 0.433836
## horsepower -0.039348 0.019693 -1.998 0.045708 *
## weight -0.003955 0.001108 -3.570 0.000357 ***
## year 0.390736 0.084575 4.620 3.84e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 379.79 on 273 degrees of freedom
## Residual deviance: 114.05 on 268 degrees of freedom
## AIC: 126.05
##
## Number of Fisher Scoring iterations: 8
prob_log14 <- predict(fit_log14, newdata = auto_te, type = "response")
pred_log14 <- factor(ifelse(prob_log14 > 0.5, 1, 0), levels = c("0", "1"))
table(Predicted = pred_log14, Truth = auto_te$mpg01)## Truth
## Predicted 0 1
## 0 53 4
## 1 8 53
## Logistic regression test error: 0.1017
Logistic regression does well here. The test error is in a similar
range to LDA and QDA. Looking at the model summary, weight
and year tend to carry the most significance — heavier cars
and older model years push the probability of being in the low-mpg group
up. displacement and horsepower are highly
correlated with weight, so their individual coefficients
get shrunk down by collinearity.
fit_nb14 <- naiveBayes(mpg01 ~ cylinders + displacement + horsepower + weight + year,
data = auto_tr)
pred_nb14 <- predict(fit_nb14, newdata = auto_te)
table(Predicted = pred_nb14, Truth = auto_te$mpg01)## Truth
## Predicted 0 1
## 0 52 3
## 1 9 54
## Naive Bayes test error: 0.1017
preds14 <- c("cylinders", "displacement", "horsepower", "weight", "year")
tr_mat14 <- scale(auto_tr[, preds14])
te_mat14 <- scale(auto_te[, preds14],
center = attr(tr_mat14, "scaled:center"),
scale = attr(tr_mat14, "scaled:scale"))
set.seed(1)
k_vals <- c(1, 3, 5, 7, 10, 15, 20)
knn_errs <- sapply(k_vals, function(k) {
pk <- knn(tr_mat14, te_mat14, auto_tr$mpg01, k = k)
mean(pk != auto_te$mpg01)
})
names(knn_errs) <- paste0("K=", k_vals)
round(knn_errs, 4)## K=1 K=3 K=5 K=7 K=10 K=15 K=20
## 0.0763 0.0847 0.0847 0.0678 0.0932 0.1102 0.0932
best_k <- k_vals[which.min(knn_errs)]
best_err <- round(min(knn_errs), 4)
cat("Best K:", best_k, " test error:", best_err, "\n")## Best K: 7 test error: 0.0678
# Confusion matrix for best K
pred_knn_best <- knn(tr_mat14, te_mat14, auto_tr$mpg01, k = best_k)
table(Predicted = pred_knn_best, Truth = auto_te$mpg01)## Truth
## Predicted 0 1
## 0 56 3
## 1 5 54
KNN with very small K tends to overfit and does worse. Performance improves with moderate K values. The best K on this split is 7 with a test error of 0.0678.
data(Boston, package = "ISLR2")
Boston$crim01 <- factor(ifelse(Boston$crim > median(Boston$crim), 1, 0))
cat("crim01 distribution:\n")## crim01 distribution:
##
## 0 1
## 253 253
## Dimensions: 506 rows x 14 columns
bos_num <- Boston[, !names(Boston) %in% c("crim", "crim01")]
cor_vec <- sapply(bos_num, function(x) cor(x, as.numeric(as.character(Boston$crim01))))
sort(round(cor_vec, 3), decreasing = TRUE)## nox rad age tax indus lstat ptratio chas rm medv
## 0.723 0.620 0.614 0.609 0.603 0.453 0.254 0.070 -0.156 -0.263
## zn dis
## -0.436 -0.616
par(mfrow = c(2, 3))
for (v in c("rad", "tax", "nox", "age", "lstat", "dis")) {
boxplot(Boston[[v]] ~ Boston$crim01,
main = v,
xlab = "crim01 (0 = low, 1 = high)",
col = c("steelblue", "salmon"),
names = c("Low crime", "High crime"))
}rad (highway accessibility), tax,
nox, and age all show the clearest separation
between low and high crime suburbs. I’ll use those along with
lstat and dis as predictors.
set.seed(1)
n_bos <- nrow(Boston)
tr_bos <- sample(n_bos, floor(n_bos * 0.7))
bos_tr <- Boston[tr_bos, ]
bos_te <- Boston[-tr_bos, ]
cat("Training:", nrow(bos_tr), " Test:", nrow(bos_te), "\n")## Training: 354 Test: 152
fit_log16 <- glm(crim01 ~ nox + rad + tax + age + dis + lstat,
data = bos_tr,
family = binomial)
prob_log16 <- predict(fit_log16, newdata = bos_te, type = "response")
pred_log16 <- factor(ifelse(prob_log16 > 0.5, 1, 0), levels = c("0", "1"))
table(Predicted = pred_log16, Truth = bos_te$crim01)## Truth
## Predicted 0 1
## 0 57 11
## 1 16 68
## Logistic regression test error: 0.1776
fit_lda16 <- lda(crim01 ~ nox + rad + tax + age + dis + lstat, data = bos_tr)
pred_lda16 <- predict(fit_lda16, newdata = bos_te)$class
table(Predicted = pred_lda16, Truth = bos_te$crim01)## Truth
## Predicted 0 1
## 0 71 19
## 1 2 60
## LDA test error: 0.1382
fit_nb16 <- naiveBayes(crim01 ~ nox + rad + tax + age + dis + lstat, data = bos_tr)
pred_nb16 <- predict(fit_nb16, newdata = bos_te)
table(Predicted = pred_nb16, Truth = bos_te$crim01)## Truth
## Predicted 0 1
## 0 66 18
## 1 7 61
## Naive Bayes test error: 0.1645
preds16 <- c("nox", "rad", "tax", "age", "dis", "lstat")
tr_bos_mat <- scale(bos_tr[, preds16])
te_bos_mat <- scale(bos_te[, preds16],
center = attr(tr_bos_mat, "scaled:center"),
scale = attr(tr_bos_mat, "scaled:scale"))
set.seed(1)
k_bos <- c(1, 3, 5, 10, 20)
knn_b_errs <- sapply(k_bos, function(k) {
pk <- knn(tr_bos_mat, te_bos_mat, bos_tr$crim01, k = k)
mean(pk != bos_te$crim01)
})
names(knn_b_errs) <- paste0("K=", k_bos)
round(knn_b_errs, 4)## K=1 K=3 K=5 K=10 K=20
## 0.0789 0.0789 0.0921 0.0921 0.0921
## Test errors:
## Logistic regression: 0.1776
## LDA : 0.1382
## Naive Bayes : 0.1645
## KNN (best) : 0.0789
rad and nox are the strongest predictors —
suburbs with better highway access and higher pollution levels tend to
have above-median crime rates. Logistic regression and LDA give the
lowest test errors here. KNN does well at moderate K values but is more
sensitive to the choice of K. Naive Bayes lags behind slightly, likely
because the independence assumption doesn’t hold well for these
correlated predictors.