library(ISLR2)
names(Weekly)
[1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5" "Volume"
[8] "Today" "Direction"
dim(Weekly)
[1] 1089 9
summary(Weekly)
Year Lag1 Lag2 Lag3 Lag4
Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580 1st Qu.: -1.1580
Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410 Median : 0.2380
Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472 Mean : 0.1458
3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090 3rd Qu.: 1.4090
Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
Lag5 Volume Today Direction
Min. :-18.1950 Min. :0.08747 Min. :-18.1950 Down:484
1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540 Up :605
Median : 0.2340 Median :1.00268 Median : 0.2410
Mean : 0.1399 Mean :1.57462 Mean : 0.1499
3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
Max. : 12.0260 Max. :9.32821 Max. : 12.0260
cor(Weekly[, -9])
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume
Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923 -0.030519101 0.84194162
Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876 -0.008183096 -0.06495131
Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535 -0.072499482 -0.08551314
Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865 0.060657175 -0.06928771
Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000 -0.075675027 -0.06107462
Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027 1.000000000 -0.05851741
Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617 -0.058517414 1.00000000
Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873 0.011012698 -0.03307778
Today
Year -0.032459894
Lag1 -0.075031842
Lag2 0.059166717
Lag3 -0.071243639
Lag4 -0.007825873
Lag5 0.011012698
Volume -0.033077783
Today 1.000000000
pairs(Weekly)
boxplot(Lag1 ~ Direction, data = Weekly,
main = "Lag 1 Weekly Return by Market Direction",
xlab = "Direction", ylab = "Lag 1 Return")
plot(Weekly$Volume, type = "l",
main = "Trading Volume Over Time",
xlab = "Index (Weeks from 1990 to 2010)", ylab = "Volume (in billions)")
There don’t appear to be any obvious patterns in the data except for the last chart that shows a positive correlation between time and volume.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(glm.fit)
Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
Volume, family = binomial, data = Weekly)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.26686 0.08593 3.106 0.0019 **
Lag1 -0.04127 0.02641 -1.563 0.1181
Lag2 0.05844 0.02686 2.175 0.0296 *
Lag3 -0.01606 0.02666 -0.602 0.5469
Lag4 -0.02779 0.02646 -1.050 0.2937
Lag5 -0.01447 0.02638 -0.549 0.5833
Volume -0.02274 0.03690 -0.616 0.5377
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1496.2 on 1088 degrees of freedom
Residual deviance: 1486.4 on 1082 degrees of freedom
AIC: 1500.4
Number of Fisher Scoring iterations: 4
Only Lag2 appears to be statistically significant.
glm.probs <- predict(glm.fit, type = "response")
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Up"
table(glm.pred, Weekly$Direction)
glm.pred Down Up
Down 54 48
Up 430 557
table(glm.pred, Weekly$Direction)
glm.pred Down Up
Down 54 48
Up 430 557
mean(glm.pred == Weekly$Direction)
[1] 0.5610652
The models accuracy is misleading because is predicts “Up” most of the time and capitalizes on the market’s natural trend up. It is not reliable for detecting downturns.
train <- (Weekly$Year < 2009)
Weekly.2009.2010 <- Weekly[!train, ]
Direction.2009.2010 <- Weekly$Direction[!train]
glm.fit2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm.probs2 <- predict(glm.fit2, Weekly.2009.2010, type = "response")
glm.pred2 <- rep("Down", length(glm.probs2))
glm.pred2[glm.probs2 > 0.5] <- "Up"
table(glm.pred2, Direction.2009.2010)
Direction.2009.2010
glm.pred2 Down Up
Down 9 5
Up 34 56
mean(glm.pred2 == Direction.2009.2010)
[1] 0.625
library(MASS)
lda.fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred <- predict(lda.fit, Weekly.2009.2010)
table(lda.pred$class, Direction.2009.2010)
Direction.2009.2010
Down Up
Down 9 5
Up 34 56
mean(lda.pred$class == Direction.2009.2010)
[1] 0.625
qda.fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred <- predict(qda.fit, Weekly.2009.2010)
table(qda.pred$class, Direction.2009.2010)
Direction.2009.2010
Down Up
Down 0 0
Up 43 61
mean(qda.pred$class == Direction.2009.2010)
[1] 0.5865385
library(class)
train.X <- as.matrix(Weekly$Lag2[train])
test.X <- as.matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.2009.2010)
Direction.2009.2010
knn.pred Down Up
Down 21 30
Up 22 31
mean(knn.pred == Direction.2009.2010)
[1] 0.5
library(e1071)
nb.fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.pred <- predict(nb.fit, Weekly.2009.2010)
table(nb.pred, Direction.2009.2010)
Direction.2009.2010
nb.pred Down Up
Down 0 0
Up 43 61
mean(nb.pred == Direction.2009.2010)
[1] 0.5865385
Logeitic Regression and LDA perform the best.
Multiply Lag1 and Lag2
glm.fit_inter <- glm(Direction ~ Lag1 * Lag2, data = Weekly, family = binomial, subset = train)
glm.probs_inter <- predict(glm.fit_inter, Weekly.2009.2010, type = "response")
glm.pred_inter <- ifelse(glm.probs_inter > 0.5, "Up", "Down")
table(glm.pred_inter, Direction.2009.2010)
Direction.2009.2010
glm.pred_inter Down Up
Down 7 8
Up 36 53
mean(glm.pred_inter == Direction.2009.2010)
[1] 0.5769231
Using different K values
set.seed(1)
for (k_val in c(3, 5, 7, 10)) {
knn.pred_k <- knn(train.X, test.X, train.Direction, k = k_val)
cat(paste("K =", k_val, "Accuracy:", mean(knn.pred_k == Direction.2009.2010), "\n"))
}
K = 3 Accuracy: 0.548076923076923
K = 5 Accuracy: 0.538461538461538
K = 7 Accuracy: 0.548076923076923
K = 10 Accuracy: 0.538461538461538
Between these two experiments, multiplying Lag1 and Lag2 seem to be the most beneficial.
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto <- data.frame(Auto, mpg01)
summary(Auto)
mpg cylinders displacement horsepower weight
Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
acceleration year origin name mpg01
Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5 Min. :0.0
1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5 1st Qu.:0.0
Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5 Median :0.5
Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4 Mean :0.5
3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4 3rd Qu.:1.0
Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4 Max. :1.0
(Other) :365
mpg01.1 mpg01.2
Min. :0.0 Min. :0.0
1st Qu.:0.0 1st Qu.:0.0
Median :0.5 Median :0.5
Mean :0.5 Mean :0.5
3rd Qu.:1.0 3rd Qu.:1.0
Max. :1.0 Max. :1.0
par(mfrow = c(2, 3))
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01", xlab = "mpg01", ylab = "Cylinders")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01", xlab = "mpg01", ylab = "Displacement")
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01", xlab = "mpg01", ylab = "Horsepower")
boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01", xlab = "mpg01", ylab = "Weight")
boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01", xlab = "mpg01", ylab = "Acceleration")
boxplot(year ~ mpg01, data = Auto, main = "Year vs mpg01", xlab = "mpg01", ylab = "Year")
pairs(Auto[, c("mpg01", "displacement", "horsepower", "weight", "acceleration")])
Weight, displacement, and horsepower seem to be the most useful.
set.seed(1)
train_index <- sample(1:nrow(Auto), 0.8 * nrow(Auto))
Auto.train <- Auto[train_index, ]
Auto.test <- Auto[-train_index, ]
mpg01.test <- Auto$mpg01[-train_index]
lda.fit <- lda(mpg01 ~ weight + displacement + horsepower + cylinders, data = Auto.train)
lda.pred <- predict(lda.fit, Auto.test)
test.error <- mean(lda.pred$class != mpg01.test)
print(test.error)
[1] 0.08860759
Error rate is 8.66%
qda.fit <- qda(mpg01 ~ weight + displacement + horsepower + cylinders, data = Auto.train)
qda.pred <- predict(qda.fit, Auto.test)
test.error.qda <- mean(qda.pred$class != mpg01.test)
print(test.error.qda)
[1] 0.08860759
Error rate is 8.86%
glm.fit_auto <- glm(mpg01 ~ weight + displacement + horsepower + cylinders, data = Auto.train, family = binomial)
glm.probs_auto <- predict(glm.fit_auto, Auto.test, type = "response")
glm.pred_auto <- ifelse(glm.probs_auto > 0.5, 1, 0)
test.error.glm <- mean(glm.pred_auto != mpg01.test)
print(test.error.glm)
[1] 0.06329114
Error rate is 6.33%
nb.fit_auto <- naiveBayes(mpg01 ~ weight + displacement + horsepower + cylinders, data = Auto.train)
nb.pred_auto <- predict(nb.fit_auto, Auto.test)
test.error.nb <- mean(nb.pred_auto != mpg01.test)
print(test.error.nb)
[1] 0.07594937
Error rate is 7.6%
set.seed(1)
predictors <- c("weight", "displacement", "horsepower", "cylinders")
train.X <- as.matrix(Auto.train[, predictors])
test.X <- as.matrix(Auto.test[, predictors])
train.mpg01 <- Auto.train$mpg01
for (k_val in c(1, 3, 5, 7, 10, 20, 50)) {
knn.pred_auto <- knn(train.X, test.X, train.mpg01, k = k_val)
test.error.knn <- mean(knn.pred_auto != mpg01.test)
cat(paste("K =", k_val, "-> Test Error:", round(test.error.knn, 4), "\n"))
}
K = 1 -> Test Error: 0.1266
K = 3 -> Test Error: 0.1013
K = 5 -> Test Error: 0.1013
K = 7 -> Test Error: 0.1139
K = 10 -> Test Error: 0.1266
K = 20 -> Test Error: 0.1392
K = 50 -> Test Error: 0.1266
K values 3 and 5 seem to perform better with a 10.13% error rate.
crime_high <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston_df <- data.frame(Boston, crime_high)
Boston_df$crim <- NULL
cor_matrix <- cor(Boston_df)
print(sort(cor_matrix["crime_high", ], decreasing = TRUE))
crime_high nox rad age tax indus lstat
1.00000000 0.72323480 0.61978625 0.61393992 0.60874128 0.60326017 0.45326273
ptratio chas rm medv black zn dis
0.25356836 0.07009677 -0.15637178 -0.26301673 -0.35121093 -0.43615103 -0.61634164
set.seed(1)
train_idx <- sample(1:nrow(Boston_df), 0.8 * nrow(Boston_df))
train_set <- Boston_df[train_idx, ]
test_set <- Boston_df[-train_idx, ]
y_test <- test_set$crime_high
subset_all <- crime_high ~ .
subset_top3 <- crime_high ~ rad + tax + nox
glm.fit <- glm(subset_top3, data = train_set, family = binomial)
glm.probs <- predict(glm.fit, test_set, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, 1, 0)
err_glm <- mean(glm.pred != y_test)
print(err_glm)
[1] 0.1568627
lda.fit <- lda(subset_top3, data = train_set)
lda.pred <- predict(lda.fit, test_set)$class
err_lda <- mean(lda.pred != y_test)
print(err_lda)
[1] 0.127451
nb.fit <- naiveBayes(subset_top3, data = train_set)
nb.pred <- predict(nb.fit, test_set)
err_nb <- mean(nb.pred != y_test)
print(err_nb)
[1] 0.1568627
predictors <- c("rad", "tax", "nox")
X_train <- scale(train_set[, predictors])
X_test <- scale(test_set[, predictors])
for (k_val in c(1, 3, 5, 10)) {
knn.pred <- knn(X_train, X_test, train_set$crime_high, k = k_val)
cat(paste("KNN (K =", k_val, ") Test Error:", round(mean(knn.pred != y_test), 4), "\n"))
}
KNN (K = 1 ) Test Error: 0.0588
KNN (K = 3 ) Test Error: 0.0588
KNN (K = 5 ) Test Error: 0.0588
KNN (K = 10 ) Test Error: 0.098
Out of the models tested with the Boston data set, it appears that KNN performs the best with K values of 1,3 and 5. KNN had an error rate of only 5.9%. Logistic Regression and Naive Bayes performed the worst with error rates above 15%.