library(ISLR2)
# Numerical summary of the Weekly data
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
-The summary shows 605 Up weeks versus 484 Down weeks, so always predicting Up would already be correct about 55.6% of the time.
# Correlation matrix of the quantitative variables
cor(Weekly[, 1:8])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
-The correlation matrix shows that only Year and Volume are substantially correlated (0.84); the lag variables are essentially uncorrelated with one another and with Today.
# Fit a logistic regression using the five lag variables plus Volume
fit1 <- glm(Direction ~ Volume + Lag1 + Lag2 + Lag3 + Lag4 + Lag5,
data = Weekly, family = "binomial")
summary(fit1)
##
## Call:
## glm(formula = Direction ~ Volume + Lag1 + Lag2 + Lag3 + Lag4 +
## Lag5, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Volume -0.02274 0.03690 -0.616 0.5377
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
-In the fitted model, only Lag2 is statistically significant (p = 0.0296); Volume and the remaining lags all have large p-values.
# Compute predicted classes on the full data using a 0.5 threshold
predictions_full <- ifelse(predict(fit1, newdata = Weekly, type = "response") > 0.5, "Up", "Down")
conf_matrix_full <- table(predictions_full, Weekly$Direction)
conf_matrix_full
##
## predictions_full Down Up
## Down 54 48
## Up 430 557
-The confusion matrix shows that many weeks predicted as Up were actually Down, and vice versa: 430 of the 987 predicted-Up weeks went down. The overall prediction accuracy is computed below.
accuracy_full <- (conf_matrix_full["Down", "Down"] + conf_matrix_full["Up", "Up"]) /
sum(conf_matrix_full)
accuracy_full
## [1] 0.5610652
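-The asymmetry is easier to see through class-specific rates, computed here directly from the confusion matrix above (rows are predictions, columns are the truth):
# Fraction of true "Up" weeks caught, and of true "Down" weeks caught
sensitivity_full <- conf_matrix_full["Up", "Up"] / sum(conf_matrix_full[, "Up"])       # 557/605, about 0.92
specificity_full <- conf_matrix_full["Down", "Down"] / sum(conf_matrix_full[, "Down"]) # 54/484, about 0.11
sensitivity_full; specificity_full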
# Split the data into training and test sets
train <- Weekly[Weekly$Year <= 2008, ]
test <- Weekly[Weekly$Year >= 2009, ]
# Fit logistic regression with Lag2
fit2 <- glm(Direction ~ Lag2, data = train, family = "binomial")
# Predict
predictions_test <- ifelse(predict(fit2, newdata = test, type = "response") > 0.5, "Up", "Down")
conf_matrix_test <- table(predictions_test, test$Direction)
conf_matrix_test
##
## predictions_test Down Up
## Down 9 5
## Up 34 56
# Calculate accuracy
accuracy_test <- (conf_matrix_test["Down", "Down"] + conf_matrix_test["Up", "Up"]) /
sum(conf_matrix_test)
accuracy_test
## [1] 0.625
-Using Lag2 alone, the model correctly predicts the direction of the held-out 2009-2010 weeks with 62.5% accuracy.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
lda_model <- lda(Direction ~ Lag2, data = train)
lda_predictions <- predict(lda_model, newdata = test)$class
conf_matrix_lda <- table(lda_predictions, test$Direction)
conf_matrix_lda
##
## lda_predictions Down Up
## Down 9 5
## Up 34 56
accuracy_lda <- mean(lda_predictions == test$Direction)
accuracy_lda
## [1] 0.625
-The LDA model produces the same confusion matrix as the logistic regression in part d, also yielding a 62.5% accuracy.
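-A quick sketch to check whether the two methods also agree week by week (matching confusion matrices strongly suggest, but do not guarantee, identical predictions), using the prediction vectors already computed above:
# Compare logistic regression and LDA predictions observation by observation
table(predictions_test, lda_predictions)
mean(predictions_test == lda_predictions)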
qda_model <- qda(Direction ~ Lag2, data = train)
qda_predictions <- predict(qda_model, newdata = test)$class
conf_matrix_qda <- table(qda_predictions, test$Direction)
conf_matrix_qda
##
## qda_predictions Down Up
## Down 0 0
## Up 43 61
accuracy_qda <- mean(qda_predictions == test$Direction)
accuracy_qda
## [1] 0.5865385
-Interestingly, the QDA model ends up predicting all the test observations as “Up,” resulting in an accuracy of about 58.6%. This is lower than the results from LDA or logistic regression, reflecting a potential mis-specification when using QDA with Lag2 alone.
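-One way to see why QDA never predicts "Down" here is to inspect the posterior probabilities that predict() returns for qda fits; a minimal sketch:
# If every test week has P(Up) > 0.5, the all-Up prediction follows directly
qda_posterior <- predict(qda_model, newdata = test)$posterior
summary(qda_posterior[, "Up"])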
library(class)
# Standardize Lag2 once on the full data so the training and test sets share
# the same centering and scaling
Weekly2 <- Weekly[, c("Direction", "Lag2", "Year")]
Weekly2$Lag2 <- scale(Weekly2$Lag2)
train2 <- Weekly2[Weekly2$Year <= 2008, ]
test2 <- Weekly2[Weekly2$Year > 2008, ]
# KNN prediction with K = 1 (the subsets are not re-scaled here, since scaling
# train and test with different parameters would distort the distances)
knn_pred <- knn(train = data.frame(Lag2 = train2$Lag2),
                test = data.frame(Lag2 = test2$Lag2),
                cl = train2$Direction, k = 1)
conf_matrix_knn <- table(knn_pred, test2$Direction)
conf_matrix_knn
##
## knn_pred Down Up
## Down 15 25
## Up 28 36
accuracy_knn <- mean(knn_pred == test2$Direction)
accuracy_knn
## [1] 0.4903846
-With K = 1, the classification accuracy on the test data is only about 49.0%, worse than a coin flip; for this data set, the 1-nearest-neighbor classifier is outperformed by the linear methods.
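-Before settling on that conclusion, it is worth checking whether a larger K helps; a minimal sketch mirroring the K search used for the Auto data below (a seed is set first because knn() breaks distance ties randomly):
set.seed(1)
# Test accuracy for K = 1, ..., 25 on the same standardized Lag2 split
knn_acc <- sapply(1:25, function(k) {
  pred <- knn(train = data.frame(Lag2 = train2$Lag2),
              test = data.frame(Lag2 = test2$Lag2),
              cl = train2$Direction, k = k)
  mean(pred == test2$Direction)
})
which.max(knn_acc); max(knn_acc)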
library(e1071)
# Fit a Naive Bayes model
nb_model <- naiveBayes(Direction ~ Lag2, data = train)
# Predict the market direction
nb_pred <- predict(nb_model, newdata = test)
# Create a confusion matrix
conf_matrix_nb <- table(nb_pred, test$Direction)
print(conf_matrix_nb)
##
## nb_pred Down Up
## Down 0 0
## Up 43 61
accuracy_nb <- mean(nb_pred == test$Direction)
accuracy_nb
## [1] 0.5865385
-Naive Bayes attains an accuracy of 58.65%, slightly below the 62.5% achieved by logistic regression and LDA, and identical to QDA's.
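-The match with QDA is no coincidence: with a single Gaussian predictor, naive Bayes fits the same class-conditional model as QDA (one mean and one variance per class), so identical predictions are expected. The fitted per-class parameters can be inspected directly:
# Per-class mean (first column) and standard deviation (second column) of Lag2
nb_model$tables$Lag2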
In practice, when balancing interpretability and predictive performance, a logistic regression or LDA model using only Lag2 is advisable for this data set. The small edge, if any, gained by more complex models does not outweigh the benefits of simplicity, especially given that the accuracy differences are marginal. In this case, logistic regression and LDA slightly edge out the other techniques.
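-A compact summary of the Weekly test-set results, collected from the accuracy objects computed above:
# Side-by-side comparison of the classifiers on the 2009-2010 test weeks
data.frame(
  method   = c("Logistic (Lag2)", "LDA", "QDA", "KNN (K = 1)", "Naive Bayes"),
  accuracy = c(accuracy_test, accuracy_lda, accuracy_qda, accuracy_knn, accuracy_nb)
)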
# Define the predictor names of interest
varNames <- c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume")
# Create an inclusion matrix for all combinations (excluding the all-FALSE model)
inclusionMat <- expand.grid(rep(list(c(TRUE, FALSE)), length(varNames)))
inclusionMat <- inclusionMat[-nrow(inclusionMat), ] # remove the model with no predictors
# Generate all model formulas
modelForms <- apply(inclusionMat, 1, function(include) {
predictors <- paste(varNames[include], collapse = " + ")
as.formula(paste("Direction ~", predictors))
})
# Evaluate each model using logistic regression
results <- lapply(modelForms, function(formula) {
model <- glm(formula, data = train, family = "binomial")
pred <- ifelse(predict(model, newdata = test, type = "response") > 0.5, "Up", "Down")
confMat <- table(pred, test$Direction)
accuracy <- mean(pred == test$Direction)
list(formula = formula, confMat = confMat, accuracy = accuracy)
})
# Identify the model with the highest test accuracy
accuracy_vals <- sapply(results, function(x) x$accuracy)
bestModel <- results[[which.max(accuracy_vals)]]
# Print the best model details
bestModel$formula
## Direction ~ Lag2 + Lag3 + Lag4 + Lag5
## <environment: 0x000001443c137230>
bestModel$confMat
##
## pred Down Up
## Down 9 5
## Up 34 56
bestModel$accuracy
## [1] 0.625
-This automated search confirms that simpler models can perform just as well as more complex ones: the best subset, Lag2 + Lag3 + Lag4 + Lag5, only ties the Lag2-only model at 62.5% test accuracy. This underscores the value of simplicity and ease of interpretation.
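-To see how flat the accuracy landscape is, the 63 candidate models can be ranked by test accuracy; a sketch using the results list built above:
# Show the five best-performing formulas
ord <- order(accuracy_vals, decreasing = TRUE)
head(data.frame(
  formula  = sapply(results[ord], function(x) paste(deparse(x$formula), collapse = " ")),
  accuracy = accuracy_vals[ord]
), 5)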
# Create the binary outcome variable mpg01
AutoNew <- data.frame(mpg01 = ifelse(Auto$mpg > median(Auto$mpg), 1, 0), Auto)
# Pairwise scatterplots (excluding the non-numeric car name)
pairs(AutoNew[, names(AutoNew) != "name"])
# Boxplots for variables versus mpg01
boxplot(displacement ~ mpg01, data = AutoNew, main = "Displacement by mpg01")
boxplot(horsepower ~ mpg01, data = AutoNew, main = "Horsepower by mpg01")
boxplot(weight ~ mpg01, data = AutoNew, main = "Weight by mpg01")
boxplot(acceleration ~ mpg01, data = AutoNew, main = "Acceleration by mpg01")
-Interpretation: Visual inspection of these plots shows that displacement, horsepower, and weight differ markedly in distribution between the low- and high-mpg groups. Acceleration also displays a relationship with mpg01, though a less pronounced one. These observations suggest that these four predictors could be valuable when building a classification model; a numeric check follows below.
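-A simple numeric check of that visual impression is the correlation of each candidate predictor with the binary outcome:
# Correlations with mpg01; large magnitudes support the boxplot impression
cor(AutoNew[, c("mpg01", "displacement", "horsepower", "weight", "acceleration")])[, "mpg01"]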
set.seed(123)
# Determine indices for training and test sets
n <- nrow(AutoNew)
testIndices <- sample(1:n, size = floor(n / 2))
trainIndices <- setdiff(1:n, testIndices)
# Create the training and test subsets
train <- AutoNew[trainIndices, ]
test <- AutoNew[testIndices, ]
# Fit the LDA model
ldaAuto <- lda(mpg01 ~ horsepower + displacement + weight + acceleration, data = train)
# Predict the mpg01 classification
predLDA <- predict(ldaAuto, newdata = test)$class
# Create the confusion matrix and calc accuracy
confMatrixLDA <- table(predLDA, test$mpg01)
print(confMatrixLDA)
##
## predLDA 0 1
## 0 80 3
## 1 17 96
accuracyLDA <- mean(predLDA == test$mpg01)
accuracyLDA
## [1] 0.8979592
-Interpretation: The confusion matrix shows the counts of correct and incorrect predictions. The LDA model achieves an accuracy of about 89.8%, a reasonably strong performance. Test error: 10.2%.
# Fit the QDA model
qdaAuto <- qda(mpg01 ~ horsepower + displacement + weight + acceleration, data = train)
# Predict classifications
predQDA <- predict(qdaAuto, newdata = test)$class
# Confusion matrix and accuracy
confMatrixQDA <- table(predQDA, test$mpg01)
print(confMatrixQDA)
##
## predQDA 0 1
## 0 86 7
## 1 11 92
accuracyQDA <- mean(predQDA == test$mpg01)
accuracyQDA
## [1] 0.9081633
-Interpretation: QDA yields a marginal improvement in accuracy over LDA (90.8% vs. 89.8%). Test error: 9.18%.
# Fit logistic regression
lrAuto <- glm(mpg01 ~ horsepower + displacement + weight + acceleration,
data = AutoNew, subset = trainIndices, family = binomial)
# Obtain predicted probabilities
predLR <- ifelse(predict(lrAuto, newdata = test, type = "response") > 0.5, 1, 0)
# Confusion matrix and accuracy
confMatrixLR <- table(predLR, test$mpg01)
print(confMatrixLR)
##
## predLR 0 1
## 0 84 7
## 1 13 92
accuracyLR <- mean(predLR == test$mpg01)
accuracyLR
## [1] 0.8979592
-Logistic regression matches LDA's accuracy and falls marginally short of QDA. Test error: 10.2%.
# Fit a naive Bayes model
nbAuto <- naiveBayes(mpg01 ~ horsepower + displacement + weight + acceleration, data = train)
# Predict the mpg01 classification
predNB <- predict(nbAuto, newdata = test)
# Compute the confusion matrix
confMatrixNB <- table(Predicted = predNB, Actual = test$mpg01)
print(confMatrixNB)
## Actual
## Predicted 0 1
## 0 79 7
## 1 18 92
accuracyNB <- mean(predNB == test$mpg01)
accuracyNB
## [1] 0.872449
-Naive Bayes yields the lowest accuracy of the methods considered so far. Test error: 12.8%.
varsToUse <- c("acceleration", "horsepower", "weight", "displacement")
# Extract and scale the predictor variables
trainPred <- scale(train[, varsToUse])
testPred <- scale(test[, varsToUse], center = attr(trainPred, "scaled:center"),
scale = attr(trainPred, "scaled:scale"))
# Evaluate KNN for K from 1 to 25
knn_results <- sapply(1:25, function(k) {
knn_pred <- knn(train = trainPred, test = testPred, cl = train$mpg01, k = k)
mean(knn_pred == test$mpg01)
})
bestK <- which.max(knn_results)
bestKAccuracy <- knn_results[bestK]
bestK; bestKAccuracy
## [1] 3
## [1] 0.9081633
-The optimal value appears to be K = 3, since it yields the highest test accuracy (90.8%; the accuracy curve is plotted below). The corresponding test error is 9.18%.
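-Plotting the accuracy against K shows how flat the curve is near the optimum, and a final table collects the Auto test-set results computed above:
# Test accuracy as a function of K, with the best K marked
plot(1:25, knn_results, type = "b", xlab = "K", ylab = "Test accuracy")
abline(v = bestK, lty = 2)
# Side-by-side comparison of the classifiers on the Auto test set
data.frame(
  method   = c("LDA", "QDA", "Logistic", "Naive Bayes", sprintf("KNN (K = %d)", bestK)),
  accuracy = c(accuracyLDA, accuracyQDA, accuracyLR, accuracyNB, bestKAccuracy)
)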