library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Weekly data set
weekly <- read.csv("/Users/saransh/Downloads/Statistical_Learning_Resources/Weekly.csv", na.strings = "?")
weekly <- na.omit(weekly)
# Display structure and summary of data
str(weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : int 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: chr "Down" "Down" "Up" "Up" ...
summary(weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Length:1089
## Class :character
## Mode :character
##
##
##
# Plot Weekly Returns Over Time
ggplot(weekly, aes(x = Year, y = Today)) +
geom_line() +
labs(title = "Weekly Returns Over Time", x = "Year", y = "Return")
# Histogram of Volume
ggplot(weekly, aes(x = Volume)) +
geom_histogram(binwidth = 0.1, fill = "blue", alpha = 0.5) +
labs(title = "Histogram of Trading Volume", x = "Volume", y = "Count")
Observations:
- The time series plot shows returns fluctuating around zero, with noticeable periods of higher volatility.
- The histogram of trading volume is strongly right-skewed: most weeks have relatively low trading volume, while a few weeks have much higher volumes.
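Because Volume is so heavily right-skewed, a log scale can make its distribution easier to inspect; a minimal sketch (the bin count here is an arbitrary choice):
# Same histogram on a log10 x-axis to spread out the skewed volumes
ggplot(weekly, aes(x = Volume)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.5) +
  scale_x_log10() +
  labs(title = "Histogram of Trading Volume (log scale)",
       x = "Volume (log10)", y = "Count")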
# Convert Direction to a binary variable (0 = "Down", 1 = "Up")
weekly$DirectionBinary <- ifelse(weekly$Direction == "Up", 1, 0)
# Fit logistic regression model
logit_model <- glm(DirectionBinary ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = weekly, family = binomial)
summary(logit_model)
##
## Call:
## glm(formula = DirectionBinary ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Observations:
- The coefficient for Lag2 is statistically significant (p-value < 0.05), suggesting it has some predictive power for market direction.
- The other predictors do not appear to be statistically significant for predicting Direction.
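One way to read the Lag2 estimate is on the odds scale; a quick sketch:
# Exponentiating coefficients converts log-odds to odds ratios; the
# Lag2 entry gives the multiplicative change in the odds of an "Up"
# week per unit increase in Lag2 (about exp(0.058), i.e. roughly 1.06)
exp(coef(logit_model))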
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Predictions
logit_pred <- ifelse(predict(logit_model, type = "response") > 0.5, "Up", "Down")
logit_cm <- confusionMatrix(as.factor(logit_pred), as.factor(weekly$Direction))
logit_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 48
## Up 430 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : 0.369
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.11157
## Specificity : 0.92066
## Pos Pred Value : 0.52941
## Neg Pred Value : 0.56434
## Prevalence : 0.44444
## Detection Rate : 0.04959
## Detection Prevalence : 0.09366
## Balanced Accuracy : 0.51612
##
## 'Positive' Class : Down
##
Observations:
- The confusion matrix shows the number of correctly and incorrectly classified weeks.
- The accuracy of logistic regression is about 56%, only slightly better than the no-information rate (55.6%).
- The model predicts "Up" for most weeks and so classifies Down weeks poorly (sensitivity of just 11.2% for the Down class), indicating a bias toward the majority class.
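The 0.5 cutoff is not fixed; a small sketch (0.55 is an arbitrary illustration) showing how raising the threshold for predicting "Up" trades Up-class accuracy for Down-class sensitivity:
# A higher cutoff yields more "Down" predictions, boosting sensitivity
# on the Down class at the cost of misclassifying more Up weeks
probs <- predict(logit_model, type = "response")
pred_055 <- ifelse(probs > 0.55, "Up", "Down")
table(Predicted = pred_055, Actual = weekly$Direction)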
# Split data into training (1990-2008) and testing (2009-2010)
train <- weekly$Year < 2009
test_data <- weekly[!train, ]
train_data <- weekly[train, ]
# Fit logistic regression with Lag2 as predictor, using the binary response
logit_train <- glm(DirectionBinary ~ Lag2, data = train_data, family = binomial)
# Make predictions
pred_test <- ifelse(predict(logit_train, test_data, type = "response") > 0.5, "Up", "Down")
# Compute confusion matrix
confusionMatrix(as.factor(pred_test), as.factor(test_data$Direction))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.20930
## Specificity : 0.91803
## Pos Pred Value : 0.64286
## Neg Pred Value : 0.62222
## Prevalence : 0.41346
## Detection Rate : 0.08654
## Detection Prevalence : 0.13462
## Balanced Accuracy : 0.56367
##
## 'Positive' Class : Down
##
Observations:
- The logistic regression model with only Lag2 as a predictor achieves a test accuracy of 62.5%, an improvement over the full model.
- However, it still misclassifies a substantial number of weeks, particularly Down weeks (sensitivity of 20.9%).
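The test accuracy can be verified directly from the predictions; a one-line sketch:
# Proportion of 2009-2010 weeks classified correctly (should print 0.625)
mean(pred_test == test_data$Direction)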
# Ensure MASS package is loaded
if (!requireNamespace("MASS", quietly = TRUE)) install.packages("MASS")
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Fit LDA model
lda_model <- lda(Direction ~ Lag2, data = train_data)
# Make predictions
lda_pred <- predict(lda_model, test_data)$class
# Compute confusion matrix
confusionMatrix(as.factor(lda_pred), as.factor(test_data$Direction))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.20930
## Specificity : 0.91803
## Pos Pred Value : 0.64286
## Neg Pred Value : 0.62222
## Prevalence : 0.41346
## Detection Rate : 0.08654
## Detection Prevalence : 0.13462
## Balanced Accuracy : 0.56367
##
## 'Positive' Class : Down
##
Observations:
- LDA yields the same confusion matrix and accuracy (62.5%) as the Lag2-only logistic regression.
- Specificity is relatively high, but sensitivity for the Down class remains low.
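The fitted LDA object can be inspected to see what drives these predictions; a brief sketch:
# Estimated class priors and per-class means of Lag2 on the
# 1990-2008 training data
lda_model$prior
lda_model$means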
qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_pred <- predict(qda_model, test_data)$class
confusionMatrix(as.factor(qda_pred), as.factor(test_data$Direction))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5865
## Prevalence : 0.4135
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Down
##
Observations:
- QDA does not improve accuracy, remaining around 58.7%.
- It predicts "Up" for every test week and therefore never detects a Down week (sensitivity of 0).
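The all-"Up" behavior can be confirmed by examining the posterior probabilities; a short sketch:
# If QDA never predicts "Down", the estimated P(Down | Lag2) should
# stay below 0.5 for every 2009-2010 week
qda_post <- predict(qda_model, test_data)$posterior
summary(qda_post[, "Down"])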
if (!requireNamespace("class", quietly = TRUE)) install.packages("class")
library(class)
train_X <- train_data$Lag2
test_X <- test_data$Lag2
train_Y <- train_data$Direction
test_Y <- test_data$Direction
knn_pred <- knn(as.matrix(train_X), as.matrix(test_X), train_Y, k = 1)
confusionMatrix(as.factor(knn_pred), as.factor(test_Y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 30
## Up 22 31
##
## Accuracy : 0.5
## 95% CI : (0.4003, 0.5997)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.9700
##
## Kappa : -0.0033
##
## Mcnemar's Test P-Value : 0.3317
##
## Sensitivity : 0.4884
## Specificity : 0.5082
## Pos Pred Value : 0.4118
## Neg Pred Value : 0.5849
## Prevalence : 0.4135
## Detection Rate : 0.2019
## Detection Prevalence : 0.4904
## Balanced Accuracy : 0.4983
##
## 'Positive' Class : Down
##
Observations:
- KNN with K=1 produces an accuracy of 50%, which is equivalent to random guessing.
- The model is highly sensitive to noise, indicating high variance.
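Note that knn() breaks tied votes at random, so K = 1 results can vary slightly between runs; a sketch fixing the seed for reproducibility (the seed value is arbitrary):
set.seed(1)  # arbitrary seed, chosen only for reproducibility
knn_pred_seeded <- knn(as.matrix(train_X), as.matrix(test_X), train_Y, k = 1)
mean(knn_pred_seeded == test_Y)  # test accuracy for this run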
if (!requireNamespace("e1071", quietly = TRUE)) install.packages("e1071")
library(e1071)
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_pred <- predict(nb_model, test_data)
confusionMatrix(as.factor(nb_pred), as.factor(test_data$Direction))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5865
## Prevalence : 0.4135
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Down
##
Observations:
- Naive Bayes results in an accuracy of 58.7%, matching QDA.
- Like QDA, the model predicts "Up" for every test week, so it never detects a Down week.
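The fitted naiveBayes object makes this bias easy to inspect; a brief sketch:
# With one numeric predictor, naiveBayes() models Lag2 as Gaussian
# within each class; $tables holds the per-class mean and sd, and
# $apriori holds the training class counts
nb_model$tables$Lag2
nb_model$apriori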
# Summarizing model performances
results <- data.frame(
Method = c("Logistic Regression", "LDA", "QDA", "KNN (K=1)", "Naive Bayes"),
Accuracy = c(
logit_cm$overall["Accuracy"],
confusionMatrix(as.factor(lda_pred), as.factor(test_data$Direction))$overall["Accuracy"],
confusionMatrix(as.factor(qda_pred), as.factor(test_data$Direction))$overall["Accuracy"],
confusionMatrix(as.factor(knn_pred), as.factor(test_Y))$overall["Accuracy"],
confusionMatrix(as.factor(nb_pred), as.factor(test_data$Direction))$overall["Accuracy"]
)
)
print(results)
## Method Accuracy
## 1 Logistic Regression 0.5610652
## 2 LDA 0.6250000
## 3 QDA 0.5865385
## 4 KNN (K=1) 0.5000000
## 5 Naive Bayes 0.5865385
Conclusion:
- LDA performed the best, achieving 62.5% test accuracy; the Lag2-only logistic regression matched it. (Note that the "Logistic Regression" row above reports the full model evaluated on the entire data set, not on the 2009-2010 test set.)
- KNN with K = 1 performed the worst, suggesting it is not well-suited for this data.
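Accuracy alone can be a blunt comparison; a hedged sketch using the pROC package (an extra dependency, not used above) to compute the test-set AUC of the Lag2-only logistic model:
# install.packages("pROC")  # assumed available; install if needed
library(pROC)
test_probs <- predict(logit_train, test_data, type = "response")
roc_lag2 <- roc(response = test_data$Direction, predictor = test_probs)
auc(roc_lag2)  # area under the ROC curve on 2009-2010 data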
# Trying KNN with different values of K
for (k in c(3, 5, 10)) {
knn_pred_k <- knn(as.matrix(train_X), as.matrix(test_X), train_Y, k = k)
print(paste("KNN with K =", k))
print(confusionMatrix(as.factor(knn_pred_k), as.factor(test_Y)))
}
## [1] "KNN with K = 3"
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 16 19
## Up 27 42
##
## Accuracy : 0.5577
## 95% CI : (0.457, 0.655)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.7579
##
## Kappa : 0.0623
##
## Mcnemar's Test P-Value : 0.3020
##
## Sensitivity : 0.3721
## Specificity : 0.6885
## Pos Pred Value : 0.4571
## Neg Pred Value : 0.6087
## Prevalence : 0.4135
## Detection Rate : 0.1538
## Detection Prevalence : 0.3365
## Balanced Accuracy : 0.5303
##
## 'Positive' Class : Down
##
## [1] "KNN with K = 5"
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 15 20
## Up 28 41
##
## Accuracy : 0.5385
## 95% CI : (0.438, 0.6367)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.8631
##
## Kappa : 0.0216
##
## Mcnemar's Test P-Value : 0.3123
##
## Sensitivity : 0.3488
## Specificity : 0.6721
## Pos Pred Value : 0.4286
## Neg Pred Value : 0.5942
## Prevalence : 0.4135
## Detection Rate : 0.1442
## Detection Prevalence : 0.3365
## Balanced Accuracy : 0.5105
##
## 'Positive' Class : Down
##
## [1] "KNN with K = 10"
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 17 20
## Up 26 41
##
## Accuracy : 0.5577
## 95% CI : (0.457, 0.655)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.7579
##
## Kappa : 0.0689
##
## Mcnemar's Test P-Value : 0.4610
##
## Sensitivity : 0.3953
## Specificity : 0.6721
## Pos Pred Value : 0.4595
## Neg Pred Value : 0.6119
## Prevalence : 0.4135
## Detection Rate : 0.1635
## Detection Prevalence : 0.3558
## Balanced Accuracy : 0.5337
##
## 'Positive' Class : Down
##
# Trying logistic regression with interaction terms using binary response
logit_interact <- glm(DirectionBinary ~ Lag2 * Volume, data = train_data, family = binomial)
# Make predictions
pred_interact <- ifelse(predict(logit_interact, test_data, type = "response") > 0.5, "Up", "Down")
# Compute confusion matrix
confusionMatrix(as.factor(pred_interact), as.factor(test_data$Direction))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 20 25
## Up 23 36
##
## Accuracy : 0.5385
## 95% CI : (0.438, 0.6367)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.8631
##
## Kappa : 0.0549
##
## Mcnemar's Test P-Value : 0.8852
##
## Sensitivity : 0.4651
## Specificity : 0.5902
## Pos Pred Value : 0.4444
## Neg Pred Value : 0.6102
## Prevalence : 0.4135
## Detection Rate : 0.1923
## Detection Prevalence : 0.4327
## Balanced Accuracy : 0.5276
##
## 'Positive' Class : Down
##
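Observations:
- Adding a Lag2:Volume interaction lowers test accuracy to 53.85%, worse than the Lag2-only model (62.5%), so the interaction term does not help here.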
# Load the Auto data set
auto <- read.csv("/Users/saransh/Downloads/Statistical_Learning_Resources/Auto.csv", na.strings = "?")
auto <- na.omit(auto) # Remove missing values
# Create binary variable mpg01 (1 if mpg > median, 0 otherwise)
auto$mpg01 <- ifelse(auto$mpg > median(auto$mpg), 1, 0)
# Check distribution of mpg01
table(auto$mpg01)
##
## 0 1
## 196 196
Observations:
- The data set is split evenly by construction, with 196 observations in each category (mpg01 = 0 and mpg01 = 1), since mpg01 is defined by a median split on mpg.
# Boxplots of numerical predictors vs mpg01
feature_vars <- c("displacement", "horsepower", "weight", "acceleration")
par(mfrow = c(2, 2))
for (var in feature_vars) {
boxplot(auto[[var]] ~ auto$mpg01, main = var, xlab = "mpg01", ylab = var)
}
# Scatterplots of features vs mpg
pairs(auto[, c("mpg", feature_vars)])
Findings:
- Features such as weight, displacement, and horsepower show clear separation between mpg01 = 0 and mpg01 = 1.
- Higher values of weight, displacement, and horsepower are associated with lower mpg.
- Acceleration does not show a strong trend. (A numeric check follows below.)
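The visual impression can be checked numerically; a small sketch correlating mpg01 with the plotted predictors:
# Negative correlations confirm that heavier, larger-engined, more
# powerful cars tend to fall in the low-mpg class
round(cor(auto[, c("mpg01", feature_vars)]), 2)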
set.seed(123)
train_index <- createDataPartition(auto$mpg01, p = 0.7, list = FALSE)
train_data <- auto[train_index, ]
test_data <- auto[-train_index, ]
lda_model <- lda(mpg01 ~ weight + displacement + horsepower, data = train_data)
lda_pred <- predict(lda_model, test_data)$class
lda_cm <- confusionMatrix(as.factor(lda_pred), as.factor(test_data$mpg01))
lda_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47 1
## 1 11 57
##
## Accuracy : 0.8966
## 95% CI : (0.8263, 0.9454)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7931
##
## Mcnemar's Test P-Value : 0.009375
##
## Sensitivity : 0.8103
## Specificity : 0.9828
## Pos Pred Value : 0.9792
## Neg Pred Value : 0.8382
## Prevalence : 0.5000
## Detection Rate : 0.4052
## Detection Prevalence : 0.4138
## Balanced Accuracy : 0.8966
##
## 'Positive' Class : 0
##
Observations:
- LDA achieves an accuracy of 89.66%.
- Specificity is 98.28%, so mpg01 = 1 cars are classified almost perfectly (note the 'positive' class in this matrix is mpg01 = 0).
- Sensitivity is lower (81.03%), indicating some misclassification of mpg01 = 0 cars.
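These per-class rates can be pulled directly from the caret object; a one-line sketch:
# Recall the 'positive' class in this matrix is mpg01 = 0
lda_cm$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]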
qda_model <- qda(mpg01 ~ weight + displacement + horsepower, data = train_data)
qda_pred <- predict(qda_model, test_data)$class
qda_cm <- confusionMatrix(as.factor(qda_pred), as.factor(test_data$mpg01))
qda_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 51 4
## 1 7 54
##
## Accuracy : 0.9052
## 95% CI : (0.8367, 0.9517)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8103
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.8793
## Specificity : 0.9310
## Pos Pred Value : 0.9273
## Neg Pred Value : 0.8852
## Prevalence : 0.5000
## Detection Rate : 0.4397
## Detection Prevalence : 0.4741
## Balanced Accuracy : 0.9052
##
## 'Positive' Class : 0
##
Observations:
- QDA achieves an accuracy of 90.52%, slightly better than LDA.
- It has high sensitivity and specificity, meaning it correctly classifies most instances.
logit_model <- glm(mpg01 ~ weight + displacement + horsepower, data = train_data, family = binomial)
logit_pred <- ifelse(predict(logit_model, test_data, type = "response") > 0.5, 1, 0)
logit_cm <- confusionMatrix(as.factor(logit_pred), as.factor(test_data$mpg01))
logit_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 51 6
## 1 7 52
##
## Accuracy : 0.8879
## 95% CI : (0.816, 0.939)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7759
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8793
## Specificity : 0.8966
## Pos Pred Value : 0.8947
## Neg Pred Value : 0.8814
## Prevalence : 0.5000
## Detection Rate : 0.4397
## Detection Prevalence : 0.4914
## Balanced Accuracy : 0.8879
##
## 'Positive' Class : 0
##
Observations:
- Logistic regression achieves an accuracy of 88.79%.
- It performs similarly to LDA but slightly worse than QDA.
nb_model <- naiveBayes(as.factor(mpg01) ~ weight + displacement + horsepower, data = train_data)
nb_pred <- predict(nb_model, test_data)
nb_cm <- confusionMatrix(as.factor(nb_pred), as.factor(test_data$mpg01))
nb_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47 3
## 1 11 55
##
## Accuracy : 0.8793
## 95% CI : (0.8058, 0.9324)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7586
##
## Mcnemar's Test P-Value : 0.06137
##
## Sensitivity : 0.8103
## Specificity : 0.9483
## Pos Pred Value : 0.9400
## Neg Pred Value : 0.8333
## Prevalence : 0.5000
## Detection Rate : 0.4052
## Detection Prevalence : 0.4310
## Balanced Accuracy : 0.8793
##
## 'Positive' Class : 0
##
Observations:
- Naive Bayes achieves an accuracy of 87.93%.
- Performs slightly worse than LDA and QDA but is still a strong model.
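Mirroring the Weekly comparison, the four Auto classifiers can be summarized side by side; a short sketch using the confusion-matrix objects computed above:
auto_results <- data.frame(
  Method = c("LDA", "QDA", "Logistic Regression", "Naive Bayes"),
  Accuracy = c(
    lda_cm$overall["Accuracy"],
    qda_cm$overall["Accuracy"],
    logit_cm$overall["Accuracy"],
    nb_cm$overall["Accuracy"]
  )
)
print(auto_results)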
k_values <- c(1, 3, 5, 10, 15)
k_results <- data.frame(K = integer(), Accuracy = numeric())
train_X <- scale(train_data[, c("weight", "displacement", "horsepower")])
test_X <- scale(test_data[, c("weight", "displacement", "horsepower")])
train_Y <- train_data$mpg01
test_Y <- test_data$mpg01
for (k in k_values) {
knn_pred <- knn(train_X, test_X, train_Y, k = k)
acc <- confusionMatrix(as.factor(knn_pred), as.factor(test_Y))$overall["Accuracy"]
k_results <- rbind(k_results, data.frame(K = k, Accuracy = acc))
}
print(k_results)
## K Accuracy
## Accuracy 1 0.8448276
## Accuracy1 3 0.9051724
## Accuracy2 5 0.8879310
## Accuracy3 10 0.8965517
## Accuracy4 15 0.8879310
Observations:
- KNN with K = 3 achieves the highest accuracy of 90.52% for predicting mpg01.
- KNN with K = 10 and K = 15 also perform well.
print(k_results[which.max(k_results$Accuracy), ])
## K Accuracy
## Accuracy1 3 0.9051724
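One caveat: train_X and test_X above were scaled independently of each other. A common refinement is to standardize the test set with the training means and standard deviations; a minimal sketch:
# Reuse the centering/scaling attributes computed on the training set
train_X2 <- scale(train_data[, c("weight", "displacement", "horsepower")])
test_X2 <- scale(test_data[, c("weight", "displacement", "horsepower")],
                 center = attr(train_X2, "scaled:center"),
                 scale = attr(train_X2, "scaled:scale"))
knn_pred2 <- knn(train_X2, test_X2, train_Y, k = 3)  # best K from above
mean(knn_pred2 == test_Y)  # test accuracy with consistent scaling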