library(ISLR2)
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(e1071)

# (a) Summaries of the Weekly data
data(Weekly)
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
plot(Weekly)

# (b) Logistic regression with all predictors
model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = "binomial")
summary(model)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = "binomial", data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
# (c) Confusion matrix and overall fraction correct for logistic regression
predictions <- ifelse(predict(model, Weekly, type = "response") > 0.5, "Up", "Down")
confusion_matrix <- table(Actual = Weekly$Direction, Predicted = predictions)
fraction_correct <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
confusion_matrix
##       Predicted
## Actual Down  Up
##   Down   54 430
##   Up     48 557
fraction_correct
## [1] 0.5610652
# (d) Logistic regression on training data and confusion matrix for held-out data
train_data <- Weekly[Weekly$Year < 2009, ]
test_data <- Weekly[Weekly$Year >= 2009, ]
train_model <- glm(Direction ~ Lag2, data = train_data, family = "binomial")
test_predictions <- ifelse(predict(train_model, test_data, type = "response") > 0.5, "Up", "Down")
test_confusion_matrix <- table(Actual = test_data$Direction, Predicted = test_predictions)
test_fraction_correct <- sum(diag(test_confusion_matrix)) / sum(test_confusion_matrix)
test_confusion_matrix
##       Predicted
## Actual Down Up
##   Down    9 34
##   Up      5 56
test_fraction_correct
## [1] 0.625
# (e) LDA on training data and confusion matrix for held-out data
lda_model <- lda(Direction ~ Lag2, data = train_data)
lda_predictions <- predict(lda_model, test_data)$class
lda_confusion_matrix <- table(Actual = test_data$Direction, Predicted = lda_predictions)
lda_fraction_correct <- sum(diag(lda_confusion_matrix)) / sum(lda_confusion_matrix)
lda_confusion_matrix
##       Predicted
## Actual Down Up
##   Down    9 34
##   Up      5 56
lda_fraction_correct
## [1] 0.625
# (f) QDA on training data and confusion matrix for held-out data
qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_predictions <- predict(qda_model, test_data)$class
qda_confusion_matrix <- table(Actual = test_data$Direction, Predicted = qda_predictions)
qda_fraction_correct <- sum(diag(qda_confusion_matrix)) / sum(qda_confusion_matrix)
qda_confusion_matrix
##       Predicted
## Actual Down Up
##   Down    0 43
##   Up      0 61
qda_fraction_correct
## [1] 0.5865385
# Convert predictor variables to matrices
train_matrix <- as.matrix(train_data[, "Lag2"])
test_matrix <- as.matrix(test_data[, "Lag2"])

# (g) KNN (K=1) on training data and confusion matrix for held-out data
knn_model <- knn(train_matrix, test_matrix, train_data$Direction, k = 1)
knn_confusion_matrix <- table(Actual = test_data$Direction, Predicted = knn_model)
knn_fraction_correct <- sum(diag(knn_confusion_matrix)) / sum(knn_confusion_matrix)
knn_confusion_matrix
##       Predicted
## Actual Down Up
##   Down   21 22
##   Up     29 32
knn_fraction_correct
## [1] 0.5096154
# (h) Naive Bayes on training data and confusion matrix for held-out data
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_predictions <- predict(nb_model, test_data)
nb_confusion_matrix <- table(Actual = test_data$Direction, Predicted = nb_predictions)
nb_fraction_correct <- sum(diag(nb_confusion_matrix)) / sum(nb_confusion_matrix)
nb_confusion_matrix
##       Predicted
## Actual Down Up
##   Down    0 43
##   Up      0 61
nb_fraction_correct
## [1] 0.5865385
# (i) Compare results of different methods
results <- c(fraction_correct, test_fraction_correct, lda_fraction_correct, qda_fraction_correct, knn_fraction_correct, nb_fraction_correct)
method_names <- c("Logistic Regression", "Logistic Regression (Hold-out)", "LDA", "QDA", "KNN (K=1)", "Naive Bayes")
best_method <- method_names[which.max(results)]
best_results <- data.frame(Method = method_names, Fraction_Correct = results)
best_results
##                           Method Fraction_Correct
## 1            Logistic Regression        0.5610652
## 2 Logistic Regression (Hold-out)        0.6250000
## 3                            LDA        0.6250000
## 4                            QDA        0.5865385
## 5                      KNN (K=1)        0.5096154
## 6                    Naive Bayes        0.5865385
# (j) Experiment with different predictors and interactions
# Example: Logistic regression with Lag2 and Lag3 as predictors
new_model <- glm(Direction ~ Lag2 + Lag3, data = train_data, family = "binomial")
new_predictions <- ifelse(predict(new_model, test_data, type = "response") > 0.5, "Up", "Down")
new_confusion_matrix <- table(Actual = test_data$Direction, Predicted = new_predictions)
new_fraction_correct <- sum(diag(new_confusion_matrix)) / sum(new_confusion_matrix)
new_confusion_matrix
##       Predicted
## Actual Down Up
##   Down    8 35
##   Up      4 57
new_fraction_correct
## [1] 0.625