data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
The only notable correlation is between Year and Volume (about 0.84); the lag variables and Today are essentially uncorrelated with one another.
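Plotting Volume against Year makes this relationship visible; a minimal sketch in base R (assuming the Weekly data is already loaded):
# Trading volume grows steadily over the sample period,
# which drives the 0.84 correlation with Year
plot(Weekly$Year, Weekly$Volume, xlab = "Year", ylab = "Volume")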
log_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(log_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Based on the summary, Lag2 is the only predictor that is statistically significant at the 0.05 level (p = 0.0296).
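The same conclusion can be reached programmatically from the coefficient table that summary() returns; a minimal sketch:
# Column "Pr(>|z|)" holds the p-value for each coefficient
p_vals <- coef(summary(log_model))[, "Pr(>|z|)"]
p_vals[p_vals < 0.05]  # only the intercept and Lag2 fall below 0.05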
pred_probs <- predict(log_model, type = "response")
pred_labels <- ifelse(pred_probs > 0.5, "Up", "Down")
table(Predicted = pred_labels, Actual = Weekly$Direction)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
mean(pred_labels == Weekly$Direction)
## [1] 0.5610652
(557+54)/1089
## [1] 0.5610652
The model predicted the trend correctly 56.1% of the time.
557/(557+48)
## [1] 0.9206612
The model correctly predicted the trend going up 92.1% of the time.
(54)/(54+430)
## [1] 0.1115702
The model correctly predicted downward weeks only 11.2% of the time. In other words, the model almost always predicts Up: it does well on upward weeks but misclassifies the vast majority of downward weeks, which is where nearly all of its errors occur.
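These rates can also be computed directly from the confusion matrix instead of by hand; a minimal sketch (row/column layout matches the table above):
conf <- table(Predicted = pred_labels, Actual = Weekly$Direction)
sum(diag(conf)) / sum(conf)                 # overall accuracy
conf["Up", "Up"] / sum(conf[, "Up"])        # fraction of actual Up weeks caught
conf["Down", "Down"] / sum(conf[, "Down"])  # fraction of actual Down weeks caught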
train <- Weekly$Year < 2009
test <- !train
log_model_train <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
test_probs <- predict(log_model_train, Weekly[test, ], type = "response")
test_preds <- ifelse(test_probs > 0.5, "Up", "Down")
table(Predicted = test_preds, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
mean(test_preds == Weekly$Direction[test])
## [1] 0.625
On the held-out 2009-2010 data, the model correctly predicted the direction 62.5% of the time.
56/(56+5)
## [1] 0.9180328
The model predicted the upward trend correctly 91.8% of the time.
9/(9+34)
## [1] 0.2093023
The model correctly predicted downward weeks 20.9% of the time.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_preds <- predict(lda_model, Weekly[test, ])
lda_class <- lda_preds$class
table(Predicted = lda_class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
mean(lda_class == Weekly$Direction[test])
## [1] 0.625
LDA produced exactly the same confusion matrix as the logistic regression model, with 62.5% accuracy.
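The agreement can be checked observation by observation rather than just through the confusion matrices; a quick sketch (test_preds is the logistic prediction vector from above):
# Fraction of test weeks where LDA and logistic regression agree
mean(as.character(lda_class) == test_preds)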
qda_model <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_preds <- predict(qda_model, Weekly[test, ])
qda_class <- qda_preds$class
table(Predicted = qda_class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(qda_class == Weekly$Direction[test])
## [1] 0.5865385
QDA achieved 58.7% accuracy, but it never predicted a downward week: it classified every test observation as Up.
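One way to verify this is to inspect the posterior probabilities QDA assigns to the Down class; if the maximum never reaches 0.5, Down can never be predicted. A minimal sketch:
# predict() on a qda fit returns a posterior matrix with one column per class
summary(qda_preds$posterior[, "Down"])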
library(class)
train_X <- as.matrix(Weekly[train, "Lag2"])
test_X <- as.matrix(Weekly[test, "Lag2"])
train_Y <- Weekly$Direction[train]
knn_preds <- knn(train_X, test_X, train_Y, k = 1)
table(Predicted = knn_preds, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
mean(knn_preds == Weekly$Direction[test])
## [1] 0.5
Using KNN with K = 1, the model got 50% accuracy, no better than random guessing.
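Note that knn() breaks distance ties at random, so single-run results can vary slightly; setting a seed before the call makes them reproducible (a minimal sketch):
set.seed(1)  # fixes the random tie-breaking inside knn()
knn_preds <- knn(train_X, test_X, train_Y, k = 1)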
library(e1071)
nb_model <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb_preds <- predict(nb_model, Weekly[test, ])
table(Predicted = nb_preds, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(nb_preds == Weekly$Direction[test])
## [1] 0.5865385
The Naive Bayes model got an accuracy of 58.7%, matching the QDA model from part (f): it also predicted Up for every test week.
Overall, logistic regression and LDA tied for the best test accuracy at 62.5%.
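For a side-by-side comparison, the test accuracies can be gathered in one place; a minimal sketch reusing the prediction vectors computed above:
actual <- Weekly$Direction[test]
sapply(list(logistic = test_preds,
            lda = as.character(lda_class),
            qda = as.character(qda_class),
            knn1 = as.character(knn_preds),
            nb = as.character(nb_preds)),
       function(p) mean(p == actual))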
train <- Weekly$Year < 2009
test <- !train
train_data <- Weekly[train, ]
test_data <- Weekly[test, ]
Logistic Regression with an Interaction Term
log_model_interact <- glm(Direction ~ Lag1 * Lag2 + Volume, data = train_data, family = binomial)
log_probs_interact <- predict(log_model_interact, test_data, type = "response")
log_preds_interact <- ifelse(log_probs_interact > 0.5, "Up", "Down")
table(Predicted = log_preds_interact, Actual = test_data$Direction)
## Actual
## Predicted Down Up
## Down 27 32
## Up 16 29
mean(log_preds_interact == test_data$Direction)
## [1] 0.5384615
LDA with Multiple Predictors
lda_model_multi <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Volume, data = train_data)
lda_preds_multi <- predict(lda_model_multi, test_data)$class
table(Predicted = lda_preds_multi, Actual = test_data$Direction)
## Actual
## Predicted Down Up
## Down 30 37
## Up 13 24
mean(lda_preds_multi == test_data$Direction)
## [1] 0.5192308
QDA with Polynomial Terms
qda_model_poly <- qda(Direction ~ poly(Lag2, 2) + Volume, data = train_data)
qda_preds_poly <- predict(qda_model_poly, test_data)$class
table(Predicted = qda_preds_poly, Actual = test_data$Direction)
## Actual
## Predicted Down Up
## Down 31 42
## Up 12 19
mean(qda_preds_poly == test_data$Direction)
## [1] 0.4807692
Naive Bayes with Different Predictors
nb_model_multi <- naiveBayes(Direction ~ Lag1 + Lag2 + Volume, data = train_data)
nb_preds_multi <- predict(nb_model_multi, test_data)
table(Predicted = nb_preds_multi, Actual = test_data$Direction)
## Actual
## Predicted Down Up
## Down 41 58
## Up 2 3
mean(nb_preds_multi == test_data$Direction)
## [1] 0.4230769
KNN with Different K Values
train_X <- as.matrix(train_data[, c("Lag1", "Lag2", "Volume")])
test_X <- as.matrix(test_data[, c("Lag1", "Lag2", "Volume")])
train_Y <- train_data$Direction
k_values <- c(1, 3, 5, 7, 10)
for (k in k_values) {
knn_preds <- knn(train_X, test_X, train_Y, k = k)
accuracy <- mean(knn_preds == test_data$Direction)
print(paste("K =", k, "Accuracy =", accuracy))
}
## [1] "K = 1 Accuracy = 0.5"
## [1] "K = 3 Accuracy = 0.480769230769231"
## [1] "K = 5 Accuracy = 0.557692307692308"
## [1] "K = 7 Accuracy = 0.548076923076923"
## [1] "K = 10 Accuracy = 0.509615384615385"
Among these expanded models, KNN with K = 5 actually had the best accuracy (55.8%), so that is the one I would choose here; none of them, however, beat the simple Lag2-only logistic regression and LDA models at 62.5%.
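Reading the best K off printed output is error-prone; a minimal sketch that stores the accuracies and picks the maximum (reusing train_X, test_X, and train_Y from above):
set.seed(1)  # knn() breaks distance ties randomly
knn_acc <- sapply(k_values, function(k) {
  mean(knn(train_X, test_X, train_Y, k = k) == test_data$Direction)
})
names(knn_acc) <- k_values
knn_acc[which.max(knn_acc)]  # best K and its accuracy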
data(Auto)
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
summary(Auto$mpg01)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.5 0.5 1.0 1.0
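Because mpg01 comes from a median split, the mean of 0.5 above already implies the two classes are exactly balanced; a quick tabulation makes the counts explicit:
table(Auto$mpg01)  # cars at/below the median mpg (0) vs. above it (1)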
cor(Auto[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
pairs(Auto)
The features displacement, horsepower, and weight show the strongest (negative) correlations with mpg01 and would be most useful in predicting it.
ggplot(Auto, aes(x = factor(mpg01), y = horsepower)) +
geom_boxplot() +
labs(title = "Horsepower vs MPG Class")
ggplot(Auto, aes(x = factor(mpg01), y = weight)) +
geom_boxplot() +
labs(title = "Weight vs MPG Class")
set.seed(1)
train_indices <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
train_data <- Auto[train_indices, ]
test_data <- Auto[-train_indices, ]
lda_auto <- lda(mpg01 ~ horsepower + weight + displacement, data = train_data)
lda_preds <- predict(lda_auto, test_data)$class
table(Predicted = lda_preds, Actual = test_data$mpg01)
## Actual
## Predicted 0 1
## 0 47 1
## 1 14 56
mean(lda_preds == test_data$mpg01)
## [1] 0.8728814
accuracy <- mean(lda_preds == test_data$mpg01)
print(paste("Test Accuracy:", accuracy))
## [1] "Test Accuracy: 0.872881355932203"
test_error <- 1 - accuracy
print(paste("Test Error:", test_error))
## [1] "Test Error: 0.127118644067797"
The test error is 12.7%.
qda_auto <- qda(mpg01 ~ horsepower + weight + displacement, data = train_data)
qda_preds <- predict(qda_auto, test_data)$class
table(Predicted = qda_preds, Actual = test_data$mpg01)
## Actual
## Predicted 0 1
## 0 51 3
## 1 10 54
mean(qda_preds == test_data$mpg01)
## [1] 0.8898305
The test error is 11%.
log_auto <- glm(mpg01 ~ horsepower + weight + displacement, data = train_data, family = binomial)
log_probs <- predict(log_auto, test_data, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
table(Predicted = log_preds, Actual = test_data$mpg01)
## Actual
## Predicted 0 1
## 0 53 3
## 1 8 54
mean(log_preds == test_data$mpg01)
## [1] 0.9067797
The test error is 9.3%.
nb_auto <- naiveBayes(mpg01 ~ horsepower + weight + displacement, data = train_data)
nb_preds <- predict(nb_auto, test_data)
table(Predicted = nb_preds, Actual = test_data$mpg01)
## Actual
## Predicted 0 1
## 0 50 2
## 1 11 55
mean(nb_preds == test_data$mpg01)
## [1] 0.8898305
The test error is the same as QDA at 11%.
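As with the Weekly data, the Auto test errors can be gathered side by side; a minimal sketch reusing the prediction vectors from above:
errors <- sapply(list(lda = lda_preds, qda = qda_preds,
                      logistic = log_preds, nb = nb_preds),
                 function(p) 1 - mean(p == test_data$mpg01))
round(errors, 3)  # logistic regression has the lowest test error here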
train_X <- as.matrix(train_data[, c("horsepower", "weight", "displacement")])
test_X <- as.matrix(test_data[, c("horsepower", "weight", "displacement")])
train_Y <- train_data$mpg01
k_values <- c(1, 3, 5, 7, 10)
for (k in k_values) {
knn_preds <- knn(train_X, test_X, train_Y, k = k)
accuracy <- mean(knn_preds == test_data$mpg01)
print(paste("K =", k, "Accuracy =", accuracy))
}
## [1] "K = 1 Accuracy = 0.864406779661017"
## [1] "K = 3 Accuracy = 0.889830508474576"
## [1] "K = 5 Accuracy = 0.872881355932203"
## [1] "K = 7 Accuracy = 0.872881355932203"
## [1] "K = 10 Accuracy = 0.855932203389831"
Test errors: K = 1: 13.6%; K = 3: 11.0%; K = 5: 12.7%; K = 7: 12.7%; K = 10: 14.4%.
The best-performing KNN model is K = 3, with a test error of 11.0%.
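One caveat on the KNN results: with Euclidean distance, weight (in the thousands) dominates horsepower and displacement, so standardizing the predictors may change, and often improves, the results. A minimal sketch, applying the training set's scaling to the test set:
train_X_sc <- scale(train_X)
test_X_sc <- scale(test_X,
                   center = attr(train_X_sc, "scaled:center"),
                   scale = attr(train_X_sc, "scaled:scale"))
set.seed(1)
mean(knn(train_X_sc, test_X_sc, train_Y, k = 3) == test_data$mpg01)  # accuracy on scaled data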