Load the libraries used to access and analyze the Weekly dataset.

library(tinytex)
## Warning: package 'tinytex' was built under R version 4.4.3
library(ISLR2)
library(latexpdf)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
names(Weekly)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
attach(Weekly)

(a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?

pairs(Weekly)

cor(Weekly[, -9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
boxplot(Weekly[, -9])

From the correlation matrix, only Volume shows a substantial correlation with Year (about 0.84); all other pairwise correlations are close to zero. The scatterplot matrix and boxplots do not reveal any other clear relationships or distributional patterns among the variables.
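
To see that relationship more directly, Volume can be plotted against Year (a quick sketch using base graphics; Year and Volume are accessible because Weekly is attached):

# Volume grows strongly over time, which is what drives its 0.84 correlation with Year;
# a log scale makes the trend easier to judge
plot(Year, Volume)
plot(Year, log(Volume))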

(b) Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?

glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data = Weekly, family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
coef(glm.fit)
## (Intercept)        Lag1        Lag2        Lag3        Lag4        Lag5 
##  0.26686414 -0.04126894  0.05844168 -0.01606114 -0.02779021 -0.01447206 
##      Volume 
## -0.02274153
summary(glm.fit)$coef[, 4]
## (Intercept)        Lag1        Lag2        Lag3        Lag4        Lag5 
## 0.001898848 0.118144368 0.029601361 0.546923890 0.293653342 0.583348244 
##      Volume 
## 0.537674762

Only Lag2 appears to be statistically significant, with a p-value of about 0.0296.
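
As a quick programmatic check, the significant predictors can be pulled straight from the summary (a small sketch that just filters the p-values printed above):

# predictor names (intercept excluded) with p-value below 0.05
pvals = summary(glm.fit)$coef[-1, 4]
names(pvals)[pvals < 0.05]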

(c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.

glm.probs=predict(glm.fit, type = 'response')
contrasts(Direction)
##      Up
## Down  0
## Up    1
glm.pred=rep("Down", 1089)
glm.pred[glm.probs>0.5] = "Up"
table(glm.pred, Direction)
##         Direction
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557
(54+557)/1089
## [1] 0.5610652
mean(glm.pred==Direction)
## [1] 0.5610652

The overall accuracy of the logistic regression is 0.561, meaning the model correctly predicts the market direction about 56.1% of the time on the training data. However, the errors are very one-sided: 430 weeks that were actually "Down" are predicted as "Up" (false positives), while only 54 "Down" weeks are predicted correctly (true negatives), so the model predicts "Up" far too often and rarely detects weeks when the market goes down. Since most predictors are not significant, the non-significant variables likely hurt the model; removing them leaves only Lag2.
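
The error pattern can be quantified from the confusion matrix above (a sketch using the counts already shown):

# sensitivity: fraction of true "Up" weeks predicted as "Up"
557/(557+48)   # about 0.92
# specificity: fraction of true "Down" weeks predicted as "Down"
54/(54+430)    # about 0.11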

(d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).

train=(Year>= 1990 & Year <= 2008)
head(train)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
tail(train)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
weekly.test=Weekly[!train,]
dim(weekly.test)
## [1] 104   9
Direction.test=Direction[!train]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data = Weekly, subset = train, family = binomial)
glm.probs=predict(glm.fit, weekly.test, type = 'response')
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]= 'Up'
table(glm.pred, Direction.test)
##         Direction.test
## glm.pred Down Up
##     Down   31 44
##     Up     12 17
mean(glm.pred==Direction.test)
## [1] 0.4615385
(31+17)/104
## [1] 0.4615385

The full model (all five lags plus Volume) fit on the 1990-2008 training period gives only 46.2% accuracy on the held-out 2009-2010 data. Refitting with Lag2 as the only predictor, as the question asks:

glm.fit=glm(Direction~Lag2, data = Weekly, subset = train, family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly, 
##     subset = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
glm.probs=predict(glm.fit, weekly.test, type = 'response')
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]= 'Up'
table(glm.pred, Direction.test)
##         Direction.test
## glm.pred Down Up
##     Down    9  5
##     Up     34 56
mean(glm.pred==Direction.test)
## [1] 0.625
(9+56)/104
## [1] 0.625
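
The same breakdown on the held-out data, using the counts in the confusion matrix above:

# sensitivity and specificity for the Lag2-only model on the 2009-2010 weeks
56/(56+5)   # about 0.92 of "Up" weeks are caught
9/(9+34)    # only about 0.21 of "Down" weeks are caught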

(e) Repeat (d) using LDA.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
lda.fit=lda(Direction~Lag2, data = Weekly, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162
plot(lda.fit)

lda.pred=predict(lda.fit, weekly.test)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class=lda.pred$class
table(lda.class, Direction.test)
##          Direction.test
## lda.class Down Up
##      Down    9  5
##      Up     34 56
mean(lda.class==Direction.test)
## [1] 0.625
sum(lda.pred$posterior[,1]>0.9)
## [1] 0
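
No week reaches a posterior probability of "Down" above 0.9. As an illustration, the classification cutoff can be varied directly from the posterior matrix (a sketch; with a cutoff of 0.5 this reproduces lda.class):

cutoff = 0.5
lda.pred.thresh = ifelse(lda.pred$posterior[, "Down"] > cutoff, "Down", "Up")
table(lda.pred.thresh, Direction.test)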

(f) Repeat (d) using QDA.

qda.fit=qda(Direction~Lag2, data = Weekly, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
qda.class=predict(qda.fit, weekly.test)$class
qda.class
##   [1] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
##  [26] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
##  [51] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
##  [76] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [101] Up Up Up Up
## Levels: Down Up
table(qda.class, Direction.test)
##          Direction.test
## qda.class Down Up
##      Down    0  0
##      Up     43 61
mean(qda.class==Direction.test)
## [1] 0.5865385

(g) Repeat (d) using KNN with K = 1.

library(class)
train.x=as.matrix(Weekly$Lag2[train])
test.x=as.matrix(Weekly$Lag2[!train])
train.y= Direction[train]
test.y=Direction[!train]
dim(train.x)
## [1] 985   1
dim(test.x)
## [1] 104   1
set.seed(22)
knn.pred=knn(train.x, test.x, train.y, k = 1)
table(knn.pred, Direction.test)
##         Direction.test
## knn.pred Down Up
##     Down   21 29
##     Up     22 32
mean(knn.pred==Direction.test)
## [1] 0.5096154

(h) Repeat (d) using naive Bayes.

library(e1071)
weekly.train=Weekly[train, ]
weekly.test=Weekly[!train, ]
nb.fit=naiveBayes(Direction~Lag2, data = weekly.train)
nb.pred=predict(nb.fit, weekly.test)
table(nb.pred, Direction.test)
##        Direction.test
## nb.pred Down Up
##    Down    0  0
##    Up     43 61
mean(nb.pred==Direction.test)
## [1] 0.5865385
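
The fitted naive Bayes model can also report the underlying class probabilities with type = "raw", which shows why every test week is classified as "Up" (a small sketch):

# posterior class probabilities; the "Up" column is the larger one for every test week
nb.probs = predict(nb.fit, weekly.test, type = "raw")
head(nb.probs)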

(i) Which of these methods appears to provide the best results on this data? Logistic regression (with Lag2 as the only predictor) and LDA performed best, each with 62.5% accuracy on the held-out data.
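
Collecting the held-out accuracies reported above in one place (values copied, rounded, from the outputs shown earlier):

accuracies = c(logistic_Lag2 = 0.625, LDA = 0.625, QDA = 0.587,
               naiveBayes = 0.587, KNN_k1 = 0.510)
sort(accuracies, decreasing = TRUE)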

(j) Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier. As a first experiment, KNN is rerun with K = 2.

library(class)
train.x=as.matrix(Weekly$Lag2[train])
test.x=as.matrix(Weekly$Lag2[!train])
train.y= Direction[train]
test.y=Direction[!train]
dim(train.x)
## [1] 985   1
dim(test.x)
## [1] 104   1
set.seed(22)
knn.pred=knn(train.x, test.x, train.y, k = 2)
table(knn.pred, Direction.test)
##         Direction.test
## knn.pred Down Up
##     Down   22 20
##     Up     21 41
mean(knn.pred==Direction.test)
## [1] 0.6057692

Changing K from 1 to 2 raises the held-out accuracy from 51.0% to 60.6%.
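
Rather than trying single values of K by hand, a small loop over several K values can be run (a sketch; knn breaks ties at random, so the exact accuracies depend on the seed):

set.seed(22)
for (k in 1:10) {
  knn.pred = knn(train.x, test.x, train.y, k = k)
  cat("K =", k, "accuracy =", mean(knn.pred == Direction.test), "\n")
}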

Next, logistic regression is attempted with more predictors, fitting the model with both Lag1 and Lag2:

glm.fit=glm(Direction~Lag1+Lag2, data = Weekly, subset = train, family = binomial)
glm.probs=predict(glm.fit, weekly.test, type = 'response')
glm.pred=rep("Down", 104)
glm.pred[glm.probs>0.5]="Up"
table(glm.pred, Direction.test)
##         Direction.test
## glm.pred Down Up
##       Up   43 61
mean(glm.pred==Direction.test)
## [1] 0.5865385

With Lag1 added, the held-out accuracy falls to 58.7%, below the 62.5% obtained with Lag2 alone, so the Lag2-only logistic regression and LDA remain the best models on the held-out data.
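
Other combinations from the question, such as an interaction or a squared term, can be tried with the same workflow (a sketch; the resulting confusion matrices are not shown here and would need to be inspected in the same way as above):

# logistic regression with an interaction and with a squared Lag2 term, fit on the training period
glm.int = glm(Direction ~ Lag2 + Lag1:Lag2, data = Weekly, subset = train, family = binomial)
glm.sq = glm(Direction ~ Lag2 + I(Lag2^2), data = Weekly, subset = train, family = binomial)
# evaluate one candidate on the held-out weeks
glm.probs = predict(glm.int, weekly.test, type = "response")
glm.pred = ifelse(glm.probs > 0.5, "Up", "Down")
table(glm.pred, Direction.test)
mean(glm.pred == Direction.test)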