library(ISLR2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
# Load the Weekly data set into the workspace and print a summary of
# every column (numeric quantiles plus the Direction level counts).
data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Plot 1: Volume against Year, drawn as a line.
ggplot(data = Weekly, mapping = aes(x = Year, y = Volume)) +
  geom_line() +
  labs(title = "Trading Volume Over Time")

# Plot 2: Lag1 against Lag2, with points coloured by Direction.
ggplot(data = Weekly, mapping = aes(x = Lag1, y = Lag2, color = Direction)) +
  geom_point() +
  labs(title = "Lag1 vs Lag2 by Market Direction")
Market direction appears to show some patterns with the Lag variables, and trading volume varies over the years. In plot 1, volume fluctuates over time and shows an increasing trend in later years. In plot 2, the Lag variables exhibit some association with market direction, but no obvious linear pattern is visible.
# Logistic regression of Direction on the five lag variables and Volume,
# fitted on the complete Weekly data set; summary() prints the coefficient
# table with z-tests.
logit_model <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = binomial,
  data = Weekly
)
summary(logit_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Statistically significant predictors are identified from the p-values. Lag2 is the only predictor that is significant at the 5% level (p = 0.0296); the other lags do not strongly predict market direction. Volume (p = 0.5377) does not contribute significantly to predicting stock movements.
# Fitted probabilities for the same observations the model was trained on
# (no new data is passed to predict()).
pred_probs <- predict(logit_model, type = "response")

# Label an observation "Up" when its fitted probability exceeds 0.5 and
# "Down" otherwise, keeping the level order Down, Up.
pred_class <- factor(
  pred_probs > 0.5,
  levels = c(FALSE, TRUE),
  labels = c("Down", "Up")
)

# Cross-tabulate predictions against the observed Direction.
conf_matrix <- confusionMatrix(data = pred_class, reference = Weekly$Direction)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 48
## Up 430 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : 0.369
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.11157
## Specificity : 0.92066
## Pos Pred Value : 0.52941
## Neg Pred Value : 0.56434
## Prevalence : 0.44444
## Detection Rate : 0.04959
## Detection Prevalence : 0.09366
## Balanced Accuracy : 0.51612
##
## 'Positive' Class : Down
##
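Model accuracy is read from the confusion matrix, which shows how well the predictions match the actual values: 56.1% of weeks are classified correctly, barely above the 55.6% no-information rate. Most errors are Down weeks predicted as Up (430 of 484), while only 48 Up weeks are predicted as Down, so the model predicts "Up" almost all the time. This suggests logistic regression struggles with certain market conditions, namely market downturns.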
# Split by year: rows with Year <= 2008 are used for fitting, rows with
# Year > 2008 are held out for evaluation.
train_data <- dplyr::filter(Weekly, Year <= 2008)
test_data <- dplyr::filter(Weekly, Year > 2008)

# Logistic regression with Lag2 as the only predictor, fitted on the
# training rows.
logit_model_train <- glm(Direction ~ Lag2, family = binomial, data = train_data)

# Predicted probabilities for the held-out rows, thresholded at 0.5.
pred_probs_train <- predict(
  logit_model_train,
  newdata = test_data,
  type = "response"
)
pred_class_train <- factor(
  pred_probs_train > 0.5,
  levels = c(FALSE, TRUE),
  labels = c("Down", "Up")
)

conf_matrix_train <- confusionMatrix(
  data = pred_class_train,
  reference = test_data$Direction
)
conf_matrix_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.20930
## Specificity : 0.91803
## Pos Pred Value : 0.64286
## Neg Pred Value : 0.62222
## Prevalence : 0.41346
## Detection Rate : 0.08654
## Detection Prevalence : 0.13462
## Balanced Accuracy : 0.56367
##
## 'Positive' Class : Down
##
Using only Lag2 as a predictor, the model still shows some predictive power. Performance is evaluated on the held-out 2009–2010 data, which shows how well past patterns generalize to later weeks. Overall accuracy is 62.5%, meaning the model correctly predicts the market direction in 62.5% of the test weeks (the no-information rate is 58.65%).
# Linear discriminant analysis with Lag2 as the only predictor, fitted on
# the training rows.
lda_model <- lda(Direction ~ Lag2, data = train_data)

# Keep only the predicted class labels for the held-out rows.
lda_pred <- predict(lda_model, newdata = test_data)[["class"]]

lda_cm <- confusionMatrix(data = lda_pred, reference = test_data$Direction)
lda_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.20930
## Specificity : 0.91803
## Pos Pred Value : 0.64286
## Neg Pred Value : 0.62222
## Prevalence : 0.41346
## Detection Rate : 0.08654
## Detection Prevalence : 0.13462
## Balanced Accuracy : 0.56367
##
## 'Positive' Class : Down
##
# Quadratic discriminant analysis with Lag2 as the only predictor, fitted
# on the training rows.
qda_model <- qda(Direction ~ Lag2, data = train_data)

# Keep only the predicted class labels for the held-out rows.
qda_pred <- predict(qda_model, newdata = test_data)[["class"]]

qda_cm <- confusionMatrix(data = qda_pred, reference = test_data$Direction)
qda_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5865
## Prevalence : 0.4135
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Down
##
# K-nearest neighbours with Lag2 as the only feature and a single neighbour.
# knn() expects matrix-like inputs, so the Lag2 columns are wrapped in
# one-column matrices.
k_value <- 1

# knn() breaks ties at random; fix the seed so the confusion matrix below
# is reproducible from run to run.
set.seed(1)
knn_pred <- knn(
  train = as.matrix(train_data$Lag2),
  test = as.matrix(test_data$Lag2),
  cl = train_data$Direction,
  k = k_value
)

knn_cm <- confusionMatrix(knn_pred, test_data$Direction)
knn_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 30
## Up 22 31
##
## Accuracy : 0.5
## 95% CI : (0.4003, 0.5997)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.9700
##
## Kappa : -0.0033
##
## Mcnemar's Test P-Value : 0.3317
##
## Sensitivity : 0.4884
## Specificity : 0.5082
## Pos Pred Value : 0.4118
## Neg Pred Value : 0.5849
## Prevalence : 0.4135
## Detection Rate : 0.2019
## Detection Prevalence : 0.4904
## Balanced Accuracy : 0.4983
##
## 'Positive' Class : Down
##
# Naive Bayes classifier with Lag2 as the only predictor, fitted on the
# training rows and evaluated on the held-out rows.
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)

nb_cm <- confusionMatrix(data = nb_pred, reference = test_data$Direction)
nb_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5865
## Prevalence : 0.4135
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Down
##
# Compare the classifiers by held-out accuracy to decide which method gives
# the best results. Each confusion matrix is keyed by its display name.
model_cms <- list(
  "Logistic Regression" = conf_matrix_train,
  "LDA" = lda_cm,
  "QDA" = qda_cm,
  "KNN (K=1)" = knn_cm,
  "Naive Bayes" = nb_cm
)

results <- data.frame(
  Model = names(model_cms),
  Accuracy = vapply(
    model_cms,
    function(cm) cm$overall[["Accuracy"]],
    numeric(1),
    USE.NAMES = FALSE
  )
)
print(results)
## Model Accuracy
## 1 Logistic Regression 0.6250000
## 2 LDA 0.6250000
## 3 QDA 0.5865385
## 4 KNN (K=1) 0.5000000
## 5 Naive Bayes 0.5865385
Logistic Regression and LDA provide the best results (both reach 62.5% test accuracy), so they are preferred.
# Tune the number of neighbours for KNN on Lag2, then re-evaluate on the
# held-out rows.
train_x <- as.matrix(dplyr::select(train_data, Lag2))
test_x <- as.matrix(dplyr::select(test_data, Lag2))
train_y <- train_data$Direction
test_y <- test_data$Direction

# tune.knn() evaluates each k by resampling the training rows, and knn()
# breaks ties at random; seed the generator so the selected k and the
# predictions are reproducible.
set.seed(1)
best_knn_k <- tune.knn(
  x = train_x,
  y = train_y,
  k = 1:20
)
best_knn_k$best.parameters

# Predict using KNN with k=17
# NOTE(review): k = 17 is the value used in the original analysis; confirm it
# still matches best_knn_k$best.parameters now that the tuning run is seeded.
knn_pred <- knn(
  train = train_x,
  test = test_x,
  cl = train_y,
  k = 17
)

# knn() already returns a factor and Direction is a factor, so both can be
# passed to confusionMatrix() without conversion.
knn_cm <- confusionMatrix(knn_pred, test_y)
knn_accuracy <- knn_cm$overall["Accuracy"]
print(knn_accuracy)
## Accuracy
## 0.5961538
# Append the tuned-KNN accuracy to the comparison table. knn_accuracy is a
# named length-1 vector; unname() stops data.frame() from using that name
# ("Accuracy") as the row name, so the new row is numbered like the others.
results <- rbind(
  results,
  data.frame(Model = "KNN (K=17)", Accuracy = unname(knn_accuracy))
)
print(results)
## Model Accuracy
## 1 Logistic Regression 0.6250000
## 2 LDA 0.6250000
## 3 QDA 0.5865385
## 4 KNN (K=1) 0.5000000
## 5 Naive Bayes 0.5865385
## 6 KNN (K=17) 0.5961538
# Load the Auto data set and preview it.
data("Auto")
head(Auto)

# mpg01 is 1 when a car's mpg is above the median mpg and 0 otherwise.
median_mpg <- median(Auto$mpg)
Auto$mpg01 <- as.numeric(Auto$mpg > median_mpg)

# Copy of the augmented data frame, previewed to show the new column.
Auto_new <- data.frame(Auto)
head(Auto_new)
# Candidate predictors to inspect against mpg01.
feature_vars <- c(
  "cylinders", "displacement", "horsepower", "weight",
  "acceleration", "year", "origin"
)

# One boxplot per feature, split by mpg01, arranged on a 3 x 3 grid.
par(mfrow = c(3, 3))
for (var in feature_vars) {
  boxplot(
    Auto[[var]] ~ Auto$mpg01,
    col = c("red", "blue"),
    main = var,
    xlab = "mpg01",
    ylab = var
  )
}
par(mfrow = c(1, 1)) # back to a single-panel layout

# Scatterplots of mpg against horsepower and weight, coloured by mpg01.
ggplot(data = Auto, mapping = aes(x = horsepower, y = mpg, color = factor(mpg01))) +
  geom_point() +
  labs(title = "MPG vs Horsepower", color = "MPG01")

ggplot(data = Auto, mapping = aes(x = weight, y = mpg, color = factor(mpg01))) +
  geom_point() +
  labs(title = "MPG vs Weight", color = "MPG01")
The boxplots compare the distribution of each variable for mpg01 = 0 (low MPG) and mpg01 = 1 (high MPG). weight, horsepower, cylinders, displacement, and year appear to be highly associated with mpg01.
The scatterplots visualize the relationship between mpg and selected numerical predictors (horsepower and weight), color coded by mpg01. MPG vs Horsepower: A clear negative trend—as horsepower increases, MPG decreases. High-MPG cars (mpg01 = 1) have noticeably lower horsepower.
MPG vs Weight: A strong negative correlation—heavier cars tend to have lower MPG. There is a clear separation between high-MPG (mpg01 = 1) and low-MPG (mpg01 = 0) cars, making weight a strong predictor.
# Random 70/30 train/test split of Auto, seeded so the split is reproducible.
set.seed(1)
n_obs <- nrow(Auto)

# seq_len() stays correct for a zero-row data frame (1:0 would yield c(1, 0)),
# and floor() makes explicit the truncation that sample() otherwise applies
# silently to a fractional size.
train_index <- sample(seq_len(n_obs), size = floor(0.7 * n_obs))
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]

# Predictor columns used by the KNN step.
predictors <- c(
  "cylinders", "displacement", "horsepower", "weight", "year", "origin"
)
# LDA for mpg01 on six predictors, fitted on the training rows. The test
# error is the share of held-out rows whose predicted class differs from mpg01.
lda_model <- lda(
  mpg01 ~ cylinders + displacement + horsepower + weight + year + origin,
  data = train_data
)
lda_pred <- predict(lda_model, newdata = test_data)
lda_class <- lda_pred[["class"]]
lda_error <- mean(lda_class != test_data$mpg01)
lda_error
## [1] 0.1271186
The LDA test error is 0.1271186 (about 12.7%).
# QDA for mpg01 on the same six predictors, fitted on the training rows. The
# test error is the share of held-out rows whose predicted class differs
# from mpg01.
qda_model <- qda(
  mpg01 ~ cylinders + displacement + horsepower + weight + year + origin,
  data = train_data
)
qda_pred <- predict(qda_model, newdata = test_data)
qda_class <- qda_pred[["class"]]
qda_error <- mean(qda_class != test_data$mpg01)
qda_error
## [1] 0.1016949
The QDA test error is 0.1016949 (about 10.2%).
# Logistic regression for mpg01 on the same six predictors, fitted on the
# training rows.
logit_model <- glm(
  mpg01 ~ cylinders + displacement + horsepower + weight + year + origin,
  family = binomial,
  data = train_data
)

# Predicted probabilities for the held-out rows; a probability above 0.5 is
# classified as 1, anything else as 0.
logit_probs <- predict(logit_model, newdata = test_data, type = "response")
logit_class <- (logit_probs > 0.5) * 1
logit_error <- mean(logit_class != test_data$mpg01)
logit_error
## [1] 0.1016949
The logistic regression test error is 0.1016949 (about 10.2%).
# Naive Bayes for mpg01 on the same six predictors, fitted on the training
# rows. The test error is the share of held-out rows whose prediction differs
# from mpg01.
nb_model <- naiveBayes(
  mpg01 ~ cylinders + displacement + horsepower + weight + year + origin,
  data = train_data
)
nb_pred <- predict(nb_model, newdata = test_data)
nb_error <- mean(nb_pred != test_data$mpg01)
nb_error
## [1] 0.1186441
The naive Bayes test error is 0.1186441 (about 11.9%).
# KNN for mpg01 on standardised predictors, evaluated over a grid of k values.

# Standardise the training predictors, then apply the SAME centring and
# scaling to the test predictors. Scaling the test set with its own mean and
# standard deviation would put the two sets on different scales and let
# test-set statistics influence the distance computation.
train_X <- scale(train_data[, predictors])
test_X <- scale(
  test_data[, predictors],
  center = attr(train_X, "scaled:center"),
  scale = attr(train_X, "scaled:scale")
)
train_Y <- train_data$mpg01
test_Y <- test_data$mpg01

k_values <- c(1, 3, 5, 7, 10, 15, 20)

# knn() breaks ties at random; seed so the error table is reproducible.
set.seed(1)

# Test misclassification rate for each k, collected without growing a vector
# inside a loop.
knn_errors <- vapply(
  k_values,
  function(k) {
    knn_pred <- knn(train_X, test_X, train_Y, k = k)
    mean(knn_pred != test_Y)
  },
  numeric(1)
)
data.frame(K = k_values, Test_Error = knn_errors)
K = 1 performed best, but it may be prone to overfitting because each prediction is based on a single nearest neighbor. K = 5 could therefore be a better choice for generalization, as it still has a relatively low error.
# Side-by-side test errors for all classifiers; the KNN entry is the smallest
# error observed across the k values that were tried.
test_errors <- c(
  "LDA" = lda_error,
  "QDA" = qda_error,
  "Logistic Regression" = logit_error,
  "Naive Bayes" = nb_error,
  "KNN (Best K)" = min(knn_errors)
)
error_results <- data.frame(
  Model = names(test_errors),
  Test_Error = unname(test_errors)
)
print(error_results)
## Model Test_Error
## 1 LDA 0.12711864
## 2 QDA 0.10169492
## 3 Logistic Regression 0.10169492
## 4 Naive Bayes 0.11864407
## 5 KNN (Best K) 0.07627119
KNN has the lowest test error (about 7.6%), so it is the best-performing model for this dataset on this train/test split.