This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
library(MASS)
library(class)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
##
## Attaching package: 'ISLR2'
##
## The following object is masked from 'package:MASS':
##
## Boston
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
corrplot(cor(Weekly[, -9]), type = "lower", diag = FALSE, method = "ellipse")
Volume is strongly positively correlated with Year. The other correlations are weak, though Lag1 is slightly negatively correlated with Lag2 and slightly positively correlated with Lag3.
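As a quick numeric check of these claims, the same correlation matrix (again dropping the Direction factor in column 9) can be printed directly, rounded for readability:
round(cor(Weekly[, -9]), 2)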
fit <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Weekly,
  family = binomial
)
summary(fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only Lag2 has a statistically significant coefficient (p ≈ 0.03); the remaining predictors are not significant.
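The p-values can also be extracted programmatically from the fitted model (a small sketch using the summary object):
summary(fit)$coefficients[, "Pr(>|z|)"]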
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
pred <- predict(fit, type = "response") > 0.5
(t <- table(ifelse(pred, "Up (pred)", "Down (pred)"), Weekly$Direction))
##
## Down Up
## Down (pred) 54 48
## Up (pred) 430 557
sum(diag(t)) / sum(t)
## [1] 0.5610652
The overall accuracy of the predictions is 0.56, but this is largely because the model predicts "Up" for almost every week: it captures most upward movements while misclassifying the large majority of downward movements as upward.
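The class-specific rates can be read off the confusion matrix above (rows are predictions, columns are observed classes); a quick sketch:
t["Up (pred)", "Up"] / sum(t[, "Up"])       # fraction of true "Up" weeks predicted correctly
t["Down (pred)", "Down"] / sum(t[, "Down"]) # fraction of true "Down" weeks predicted correctly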
train <- Weekly$Year < 2009
fit <- glm(Direction ~ Lag2, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
(t <- table(ifelse(pred, "Up (pred)", "Down (pred)"), Weekly[!train, ]$Direction))
##
## Down Up
## Down (pred) 9 5
## Up (pred) 34 56
sum(diag(t)) / sum(t)
## [1] 0.625
The overall fraction of correct predictions on the held-out 2009-2010 data is 0.625.
e) Repeat (d) using LDA.
fit <- lda(Direction ~ Lag2, data = Weekly[train, ])
pred <- predict(fit, Weekly[!train, ], type = "response")$class
(t <- table(pred, Weekly[!train, ]$Direction))
##
## pred Down Up
## Down 9 5
## Up 34 56
sum(diag(t)) / sum(t)
## [1] 0.625
f) Repeat (d) using QDA.
fit <- qda(Direction ~ Lag2, data = Weekly[train, ])
pred <- predict(fit, Weekly[!train, ], type = "response")$class
(t <- table(pred, Weekly[!train, ]$Direction))
##
## pred Down Up
## Down 0 0
## Up 43 61
sum(diag(t)) / sum(t)
## [1] 0.5865385
# k is left at its default of 1 here
fit <- knn(
  Weekly[train, "Lag2", drop = FALSE],
  Weekly[!train, "Lag2", drop = FALSE],
  Weekly$Direction[train]
)
(t <- table(fit, Weekly[!train, ]$Direction))
##
## fit Down Up
## Down 21 29
## Up 22 32
sum(diag(t)) / sum(t)
## [1] 0.5096154
fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
pred <- predict(fit, Weekly[!train, ], type = "class")
(t <- table(pred, Weekly[!train, ]$Direction))
##
## pred Down Up
## Down 0 0
## Up 43 61
sum(diag(t)) / sum(t)
## [1] 0.5865385
Logistic regression and LDA perform best, each reaching 62.5% accuracy on the held-out data; QDA and naive Bayes never predict "Down", and KNN with k = 1 is barely better than chance.
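The experiments that follow all repeat the same fit/predict/score pattern; as a sketch, that pattern could be wrapped in a small helper (the name holdout_accuracy is hypothetical, and the code below keeps the original step-by-step form):
# Hypothetical helper: held-out (2009-2010) accuracy for a logistic regression
# fitted on the 1990-2008 training weeks
holdout_accuracy <- function(formula) {
  fit <- glm(formula, data = Weekly[train, ], family = binomial)
  pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
  mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
}
# e.g. holdout_accuracy(Direction ~ Lag2) reproduces the 0.625 above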
fit <- glm(Direction ~ Lag1, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
## [1] 0.5673077
fit <- glm(Direction ~ Lag2, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
## [1] 0.625
fit <- glm(Direction ~ Lag3, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
## [1] 0.5865385
fit <- glm(Direction ~ Lag4, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
## [1] 0.5865385
fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
## [1] 0.5865385
fit <- glm(Direction ~ Lag1 * Lag2 * Lag3 * Lag4, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
## [1] 0.5961538
fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4, data = Weekly[train, ])
pred <- predict(fit, Weekly[!train, ], type = "response")$class
mean(pred == Weekly[!train, ]$Direction)
## [1] 0.5769231
fit <- qda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4, data = Weekly[train, ])
pred <- predict(fit, Weekly[!train, ], type = "response")$class
mean(pred == Weekly[!train, ]$Direction)
## [1] 0.5192308
fit <- naiveBayes(Direction ~ Lag1 + Lag2 + Lag3 + Lag4, data = Weekly[train, ])
pred <- predict(fit, Weekly[!train, ], type = "class")
mean(pred == Weekly[!train, ]$Direction)
## [1] 0.5096154
set.seed(1)
# Columns 2:4 of Weekly are Lag1, Lag2 and Lag3
res <- sapply(1:30, function(k) {
  fit <- knn(
    Weekly[train, 2:4, drop = FALSE],
    Weekly[!train, 2:4, drop = FALSE],
    Weekly$Direction[train],
    k = k
  )
  mean(fit == Weekly[!train, ]$Direction)
})
plot(1:30, res, type = "o", xlab = "k", ylab = "Fraction correct")
(k <- which.max(res))
## [1] 26
fit <- knn(
  Weekly[train, 2:4, drop = FALSE],
  Weekly[!train, 2:4, drop = FALSE],
  Weekly$Direction[train],
  k = k
)
table(fit, Weekly[!train, ]$Direction)
##
## fit Down Up
## Down 23 18
## Up 20 43
mean(fit == Weekly[!train, ]$Direction)
## [1] 0.6346154
KNN using the first three lag variables slightly outperforms logistic regression with Lag2, provided k is tuned (the best value here is k = 26).
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
x <- cbind(Auto[, -1], data.frame("mpg01" = Auto$mpg > median(Auto$mpg)))
par(mfrow = c(2, 4))
for (i in 1:7) hist(x[, i], breaks = 20, main = colnames(x)[i])
par(mfrow = c(2, 4))
for (i in 1:7) boxplot(x[, i] ~ x$mpg01, main = colnames(x)[i])
pairs(x[, 1:7])
Most variables exhibit a relationship with the mpg01 category, and
several of them are highly collinear.
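As a quick numeric check of the collinearity seen in the pairs plot, the correlations among the numeric predictors (columns 1-7 of x) can be printed directly:
round(cor(x[, 1:7]), 2)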
set.seed(1)
train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3)
# Rank predictors by the absolute t-statistic for a difference in means
# between the two mpg01 groups
sort(sapply(1:7, function(i) {
  setNames(abs(t.test(x[, i] ~ x$mpg01)$statistic), colnames(x)[i])
}))
## acceleration year origin horsepower displacement weight
## 7.302430 9.403221 11.824099 17.681939 22.632004 22.932777
## cylinders
## 23.035328
fit <- lda(mpg01 ~ cylinders + weight + displacement, data = x[train, ])
pred <- predict(fit, x[-train, ], type = "response")$class
mean(pred != x[-train, ]$mpg01)
## [1] 0.1068702
fit <- qda(mpg01 ~ cylinders + weight + displacement, data = x[train, ])
pred <- predict(fit, x[-train, ], type = "response")$class
mean(pred != x[-train, ]$mpg01)
## [1] 0.09923664
fit <- glm(mpg01 ~ cylinders + weight + displacement, data = x[train, ], family = binomial)
pred <- predict(fit, x[-train, ], type = "response") > 0.5
mean(pred != x[-train, ]$mpg01)
## [1] 0.1145038
fit <- naiveBayes(mpg01 ~ cylinders + weight + displacement, data = x[train, ])
pred <- predict(fit, x[-train, ], type = "class")
mean(pred != x[-train, ]$mpg01)
## [1] 0.09923664
# Columns 1, 4 and 2 of x are cylinders, weight and displacement
res <- sapply(1:50, function(k) {
  fit <- knn(x[train, c(1, 4, 2)], x[-train, c(1, 4, 2)], x$mpg01[train], k = k)
  mean(fit != x[-train, ]$mpg01)
})
names(res) <- 1:50
plot(res, type = "o")
res[which.min(res)]
## 3
## 0.1068702
For the KNN models tested, k = 3 yields the lowest test error (about 0.107). Overall, QDA and naive Bayes achieve the lowest error rate (about 0.099), slightly better than LDA.