## Warning: package 'ISLR2' was built under R version 4.5.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
## Warning: package 'rmarkdown' was built under R version 4.5.3
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
cor(Weekly[, -9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
pairs(Weekly, col = ifelse(Weekly$Direction == "Up", "blue", "red"),
main = "Pairs plot of Weekly data (Blue = Up, Red = Down)")
plot(Weekly$Volume, type = "l", col = "blue",
xlab = "Week Index", ylab = "Volume",
main = "Trading Volume Over Time")
boxplot(Today ~ Direction, data = Weekly, col = c("red", "blue"),
main = "Today's Return by Direction",
xlab = "Direction", ylab = "Today's Return (%)")
Volume of billions of trades increases over time.
logistic1 <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(logistic1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag 2 is statistically significant.
glm.probs <- predict(logistic1, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
confusion_full <- table(Predicted = glm.pred, Actual = Weekly$Direction)
print(confusion_full)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
overall_correct <- mean(glm.pred == Weekly$Direction)
cat(sprintf("Overall fraction correct: %.4f (%.1f%%)\n",
overall_correct, overall_correct * 100))
## Overall fraction correct: 0.5611 (56.1%)
The model is more likely to predict “Up” correctly than “Down”.It is more likely to misclassify “Down” as “Up”.
train <- Weekly$Year <= 2008
test <- !train
glm.train <- glm(Direction ~ Lag2, data = Weekly, subset = train,
family = binomial)
glm.test.probs <- predict(glm.train, newdata = Weekly[test, ], type = "response")
glm.test.pred <- ifelse(glm.test.probs > 0.5, "Up", "Down")
cm_lr <- table(Predicted = glm.test.pred, Actual = Weekly$Direction[test])
print(cm_lr)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
cat(sprintf("Logistic Regression accuracy: %.4f\n", mean(glm.test.pred == Weekly$Direction[test])))
## Logistic Regression accuracy: 0.6250
train <- Weekly$Year <= 2008
test <- !train
lda.fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred <- predict(lda.fit, newdata = Weekly[test, ])
cm_lda <- table(Predicted = lda.pred$class, Actual = Weekly$Direction[test])
print(cm_lda)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
cat(sprintf("LDA accuracy: %.4f\n", mean(lda.pred$class == Weekly$Direction[test])))
## LDA accuracy: 0.6250
qda.fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred <- predict(qda.fit, newdata = Weekly[test, ])
cm_qda <- table(Predicted = qda.pred$class, Actual = Weekly$Direction[test])
print(cm_qda)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
cat(sprintf("QDA accuracy: %.4f\n", mean(qda.pred$class == Weekly$Direction[test])))
## QDA accuracy: 0.5865
set.seed(1)
train.X <- matrix(Weekly$Lag2[train])
test.X <- matrix(Weekly$Lag2[test])
train.Y <- Weekly$Direction[train]
knn.pred1 <- knn(train.X, test.X, train.Y, k = 1)
cm_knn1 <- table(Predicted = knn.pred1, Actual = Weekly$Direction[test])
print(cm_knn1)
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
cat(sprintf("KNN (K=1) accuracy: %.4f\n", mean(knn.pred1 == Weekly$Direction[test])))
## KNN (K=1) accuracy: 0.5000
nb.fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.pred <- predict(nb.fit, newdata = Weekly[test, ])
cm_nb <- table(Predicted = nb.pred, Actual = Weekly$Direction[test])
print(cm_nb)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
cat(sprintf("Naive Bayes accuracy: %.4f\n", mean(nb.pred == Weekly$Direction[test])))
## Naive Bayes accuracy: 0.5865
methods <- c("Logistic Regression", "LDA", "QDA", "KNN (K=1)", "Naive Bayes")
accs <- c(
mean(glm.test.pred == Weekly$Direction[test]),
mean(lda.pred$class == Weekly$Direction[test]),
mean(qda.pred$class == Weekly$Direction[test]),
mean(knn.pred1 == Weekly$Direction[test]),
mean(nb.pred == Weekly$Direction[test])
)
results <- data.frame(Method = methods, Accuracy = round(accs, 4))
print(results[order(-results$Accuracy), ])
## Method Accuracy
## 1 Logistic Regression 0.6250
## 2 LDA 0.6250
## 3 QDA 0.5865
## 5 Naive Bayes 0.5865
## 4 KNN (K=1) 0.5000
mpg_median <- median(Auto$mpg)
mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto2 <- data.frame(mpg01, Auto)
cat("mpg median:", mpg_median, "\n")
## mpg median: 22.75
cat("mpg01 distribution:\n")
## mpg01 distribution:
print(table(Auto2$mpg01))
##
## 0 1
## 196 196
pairs(Auto2[, c("mpg01","cylinders","displacement","horsepower",
"weight","acceleration","year","origin")],
col = ifelse(Auto2$mpg01 == 1, "blue", "red"),
main = "Pairs Plot — Blue = High MPG, Red = Low MPG")
par(mfrow = c(2, 3))
boxplot(cylinders ~ mpg01, data = Auto2, col = c("red","blue"),
main = "Cylinders vs mpg01", xlab = "mpg01", ylab = "Cylinders")
boxplot(displacement ~ mpg01, data = Auto2, col = c("red","blue"),
main = "Displacement vs mpg01", xlab = "mpg01", ylab = "Displacement")
boxplot(horsepower ~ mpg01, data = Auto2, col = c("red","blue"),
main = "Horsepower vs mpg01", xlab = "mpg01", ylab = "Horsepower")
boxplot(weight ~ mpg01, data = Auto2, col = c("red","blue"),
main = "Weight vs mpg01", xlab = "mpg01", ylab = "Weight")
boxplot(acceleration ~ mpg01, data = Auto2, col = c("red","blue"),
main = "Acceleration vs mpg01", xlab = "mpg01", ylab = "Acceleration")
boxplot(year ~ mpg01, data = Auto2, col = c("red","blue"),
main = "Year vs mpg01", xlab = "mpg01", ylab = "Year")
par(mfrow = c(1, 1))
cat("\nCorrelations with mpg01:\n")
##
## Correlations with mpg01:
num_vars <- Auto2[, c("cylinders","displacement","horsepower",
"weight","acceleration","year","origin")]
print(round(cor(cbind(mpg01 = Auto2$mpg01, num_vars)), 3))
## mpg01 cylinders displacement horsepower weight acceleration
## mpg01 1.000 -0.759 -0.753 -0.667 -0.758 0.347
## cylinders -0.759 1.000 0.951 0.843 0.898 -0.505
## displacement -0.753 0.951 1.000 0.897 0.933 -0.544
## horsepower -0.667 0.843 0.897 1.000 0.865 -0.689
## weight -0.758 0.898 0.933 0.865 1.000 -0.417
## acceleration 0.347 -0.505 -0.544 -0.689 -0.417 1.000
## year 0.430 -0.346 -0.370 -0.416 -0.309 0.290
## origin 0.514 -0.569 -0.615 -0.455 -0.585 0.213
## year origin
## mpg01 0.430 0.514
## cylinders -0.346 -0.569
## displacement -0.370 -0.615
## horsepower -0.416 -0.455
## weight -0.309 -0.585
## acceleration 0.290 0.213
## year 1.000 0.182
## origin 0.182 1.000
Cylinders, displacement, horsepower, and weight have a negative correlation with mpg01. Year, origin and acceleration have positive correlations with mpg01.
predictors <- c("cylinders", "displacement", "horsepower",
"weight", "acceleration", "year", "origin")
fmla <- as.formula(
paste("mpg01 ~", paste(predictors, collapse = " + "))
)
set.seed(42)
n <- nrow(Auto2)
train_idx <- sample(1:n, size = floor(0.8 * n))
train_data <- Auto2[ train_idx, ]
test_data <- Auto2[-train_idx, ]
cat(sprintf("Training rows: %d | Test rows: %d\n",
nrow(train_data), nrow(test_data)))
## Training rows: 313 | Test rows: 79
lda.fit <- lda(fmla, data = train_data)
lda.pred <- predict(lda.fit, newdata = test_data)$class
cm_lda <- table(Predicted = lda.pred, Actual = test_data$mpg01)
lda_err <- mean(lda.pred != test_data$mpg01)
print(cm_lda)
## Actual
## Predicted 0 1
## 0 29 2
## 1 2 46
cat(sprintf("LDA test error: %.4f (%.1f%%)\n", lda_err, lda_err * 100))
## LDA test error: 0.0506 (5.1%)
qda.fit <- qda(fmla, data = train_data)
qda.pred <- predict(qda.fit, newdata = test_data)$class
cm_qda <- table(Predicted = qda.pred, Actual = test_data$mpg01)
qda_err <- mean(qda.pred != test_data$mpg01)
print(cm_qda)
## Actual
## Predicted 0 1
## 0 28 3
## 1 3 45
cat(sprintf("QDA test error: %.4f (%.1f%%)\n", qda_err, qda_err * 100))
## QDA test error: 0.0759 (7.6%)
glm.fit <- glm(fmla, data = train_data, family = binomial)
glm.probs <- predict(glm.fit, newdata = test_data, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, 1, 0)
cm_glm <- table(Predicted = glm.pred, Actual = test_data$mpg01)
glm_err <- mean(glm.pred != test_data$mpg01)
print(cm_glm)
## Actual
## Predicted 0 1
## 0 30 5
## 1 1 43
cat(sprintf("Logistic Regression test error: %.4f (%.1f%%)\n", glm_err, glm_err * 100))
## Logistic Regression test error: 0.0759 (7.6%)
nb.fit <- naiveBayes(fmla, data = train_data)
nb.pred <- predict(nb.fit, newdata = test_data)
cm_nb <- table(Predicted = nb.pred, Actual = test_data$mpg01)
nb_err <- mean(nb.pred != test_data$mpg01)
print(cm_nb)
## Actual
## Predicted 0 1
## 0 28 3
## 1 3 45
cat(sprintf("Naive Bayes test error: %.4f (%.1f%%)\n", nb_err, nb_err * 100))
## Naive Bayes test error: 0.0759 (7.6%)
train.X <- scale(train_data[, predictors])
test.X <- scale(test_data[, predictors],
center = attr(train.X, "scaled:center"),
scale = attr(train.X, "scaled:scale"))
train.Y <- train_data$mpg01
set.seed(42)
k_values <- 1:50
knn_errs <- sapply(k_values, function(k) {
pred <- knn(train.X, test.X, train.Y, k = k)
mean(pred != test_data$mpg01)
})
plot(k_values, knn_errs, type = "b", pch = 19, col = "blue",
xlab = "K", ylab = "Test Error Rate",
main = "KNN Test Error vs K (all predictors)")
best_k <- k_values[which.min(knn_errs)]
abline(v = best_k, col = "red", lty = 2)
legend("topright", legend = paste0("Best K = ", best_k), col = "red", lty = 2)
cat(sprintf("Best K = %d | Test error = %.4f (%.1f%%)\n",
best_k, min(knn_errs), min(knn_errs) * 100))
## Best K = 28 | Test error = 0.0506 (5.1%)
cat("\nErrors for selected K values:\n")
##
## Errors for selected K values:
for (k in unique(c(1, 3, 5, 10, 15, 20, best_k))) {
cat(sprintf(" K = %2d -> error = %.4f\n", k, knn_errs[k]))
}
## K = 1 -> error = 0.1139
## K = 3 -> error = 0.0633
## K = 5 -> error = 0.0759
## K = 10 -> error = 0.0759
## K = 15 -> error = 0.0633
## K = 20 -> error = 0.0633
## K = 28 -> error = 0.0506
K=28 performs best.
crim_median <- median(Boston$crim)
crim01 <- ifelse(Boston$crim > crim_median, 1, 0)
Boston2 <- data.frame(crim01, Boston[, -1]) # drop original crim
cat("Median crime rate:", round(crim_median, 4), "\n")
## Median crime rate: 0.2565
cat("crim01 distribution:\n")
## crim01 distribution:
print(table(crim01))
## crim01
## 0 1
## 253 253
cat("\nCorrelations with crim01:\n")
##
## Correlations with crim01:
print(round(sort(cor(Boston2)[,"crim01"], decreasing = TRUE), 3))
## crim01 nox rad age tax indus lstat ptratio chas rm
## 1.000 0.723 0.620 0.614 0.609 0.603 0.453 0.254 0.070 -0.156
## medv black zn dis
## -0.263 -0.351 -0.436 -0.616
par(mfrow = c(3, 4))
for (var in names(Boston2)[-1]) {
boxplot(Boston2[[var]] ~ Boston2$crim01,
col = c("blue", "red"),
main = var,
xlab = "crim01 (0=Low, 1=High)",
ylab = var)
}
par(mfrow = c(1, 1))
predictors <- names(Boston2)[-1]
fmla <- as.formula(paste("crim01 ~", paste(predictors, collapse = " + ")))
set.seed(1)
n <- nrow(Boston2)
train_idx <- sample(1:n, size = floor(0.7 * n))
train <- Boston2[ train_idx, ]
test <- Boston2[-train_idx, ]
test_y <- test$crim01
cat(sprintf("\nTraining rows: %d | Test rows: %d\n", nrow(train), nrow(test)))
##
## Training rows: 354 | Test rows: 152
eval_pred <- function(pred, actual, label) {
cm <- table(Predicted = pred, Actual = actual)
err <- mean(pred != actual)
cat(sprintf("\n[%s] Test error: %.4f (%.1f%%)\n", label, err, err * 100))
print(cm)
invisible(err)
}
glm.fit <- glm(fmla, data = train, family = binomial)
glm.prob <- predict(glm.fit, newdata = test, type = "response")
glm.pred <- ifelse(glm.prob > 0.5, 1, 0)
lr_err <- eval_pred(glm.pred, test_y, "Logistic Regression")
##
## [Logistic Regression] Test error: 0.1053 (10.5%)
## Actual
## Predicted 0 1
## 0 62 5
## 1 11 74
lda.fit <- lda(fmla, data = train)
lda.pred <- predict(lda.fit, newdata = test)$class
lda_err <- eval_pred(lda.pred, test_y, "LDA")
##
## [LDA] Test error: 0.1513 (15.1%)
## Actual
## Predicted 0 1
## 0 70 20
## 1 3 59
nb.fit <- naiveBayes(fmla, data = train)
nb.pred <- predict(nb.fit, newdata = test)
nb_err <- eval_pred(as.integer(as.character(nb.pred)), test_y, "Naive Bayes")
##
## [Naive Bayes] Test error: 0.1645 (16.4%)
## Actual
## Predicted 0 1
## 0 68 20
## 1 5 59
train.X <- scale(train[, predictors])
test.X <- scale(test[, predictors],
center = attr(train.X, "scaled:center"),
scale = attr(train.X, "scaled:scale"))
set.seed(1)
knn_errs <- sapply(1:50, function(k) {
pred <- knn(train.X, test.X, train$crim01, k = k)
mean(as.integer(as.character(pred)) != test_y)
})
best_k <- which.min(knn_errs)
best_err <- knn_errs[best_k]
plot(1:50, knn_errs, type = "b", pch = 19, col = "blue",
xlab = "K", ylab = "Test Error Rate",
main = "KNN Test Error vs K (all predictors)")
abline(v = best_k, col = "red", lty = 2)
legend("topright", legend = paste0("Best K = ", best_k), col = "red", lty = 2)
set.seed(1)
knn.pred <- knn(train.X, test.X, train$crim01, k = best_k)
knn_err <- eval_pred(as.integer(as.character(knn.pred)), test_y,
sprintf("KNN (K=%d)", best_k))
##
## [KNN (K=3)] Test error: 0.0461 (4.6%)
## Actual
## Predicted 0 1
## 0 71 5
## 1 2 74
cat("\nTest errors for selected K values:\n")
##
## Test errors for selected K values:
for (k in unique(c(1, 3, 5, 10, 15, 20, best_k))) {
cat(sprintf(" K = %2d -> error = %.4f\n", k, knn_errs[k]))
}
## K = 1 -> error = 0.0789
## K = 3 -> error = 0.0461
## K = 5 -> error = 0.0592
## K = 10 -> error = 0.1053
## K = 15 -> error = 0.1184
## K = 20 -> error = 0.1447
summary_df <- data.frame(
Method = c("Logistic Regression", "LDA", "Naive Bayes",
sprintf("KNN (K=%d)", best_k)),
TestError = round(c(lr_err, lda_err, nb_err, best_err), 4)
)
print(summary_df[order(summary_df$TestError), ], row.names = FALSE)
## Method TestError
## KNN (K=3) 0.0461
## Logistic Regression 0.1053
## LDA 0.1513
## Naive Bayes 0.1645