Author: Jay Liao (ID: RE6094028)

Exercise 4.10

This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.

library(ISLR)
data('Weekly')
summary(Weekly)
      Year           Lag1               Lag2               Lag3         
 Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
 1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
 Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
 Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
 3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
 Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
      Lag4               Lag5              Volume            Today         
 Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
 1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
 Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
 Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
 3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
 Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
 Direction 
 Down:484  
 Up  :605  

Exercise 4.10 - (a)

Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?

Load the required packages

library(dplyr)
library(GGally)
library(corrplot)

Correlation and scatter plots

ggpairs(Weekly[,-9])

corrplot(cor(Weekly[,-9]))

Summaries stratified by Direction

Weekly %>% split(., Weekly$Direction) %>% lapply(., summary)
$Down
      Year           Lag1              Lag2                Lag3         
 Min.   :1990   Min.   :-9.3990   Min.   :-18.19500   Min.   :-10.5380  
 1st Qu.:1995   1st Qu.:-0.9365   1st Qu.: -1.30725   1st Qu.: -1.1550  
 Median :2000   Median : 0.3820   Median :  0.15400   Median :  0.2505  
 Mean   :2000   Mean   : 0.2823   Mean   : -0.04042   Mean   :  0.2076  
 3rd Qu.:2005   3rd Qu.: 1.5888   3rd Qu.:  1.30225   3rd Qu.:  1.4060  
 Max.   :2010   Max.   :12.0260   Max.   : 10.49100   Max.   : 10.4910  
      Lag4               Lag5              Volume            Today         
 Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
 1st Qu.: -1.1542   1st Qu.: -1.0938   1st Qu.:0.33776   1st Qu.: -2.2927  
 Median :  0.2235   Median :  0.3280   Median :1.07355   Median : -1.3345  
 Mean   :  0.2000   Mean   :  0.1878   Mean   :1.60854   Mean   : -1.7466  
 3rd Qu.:  1.4415   3rd Qu.:  1.5020   3rd Qu.:2.01993   3rd Qu.: -0.5917  
 Max.   : 12.0260   Max.   : 10.4910   Max.   :9.32821   Max.   : -0.0020  
 Direction 
 Down:484  
 Up  :  0  

$Up
      Year           Lag1                Lag2               Lag3          
 Min.   :1990   Min.   :-18.19500   Min.   :-11.0500   Min.   :-18.19500  
 1st Qu.:1995   1st Qu.: -1.23700   1st Qu.: -1.0010   1st Qu.: -1.17300  
 Median :2000   Median :  0.09900   Median :  0.2990   Median :  0.22400  
 Mean   :2000   Mean   :  0.04522   Mean   :  0.3043   Mean   :  0.09885  
 3rd Qu.:2005   3rd Qu.:  1.31300   3rd Qu.:  1.4630   3rd Qu.:  1.41700  
 Max.   :2010   Max.   : 10.70700   Max.   : 12.0260   Max.   : 12.02600  
      Lag4              Lag5              Volume           Today       
 Min.   :-9.3990   Min.   :-11.0500   Min.   :0.1251   Min.   : 0.010  
 1st Qu.:-1.1580   1st Qu.: -1.2010   1st Qu.:0.3273   1st Qu.: 0.630  
 Median : 0.2410   Median :  0.1280   Median :0.9266   Median : 1.247  
 Mean   : 0.1025   Mean   :  0.1015   Mean   :1.5475   Mean   : 1.667  
 3rd Qu.: 1.3520   3rd Qu.:  1.3380   3rd Qu.:2.0899   3rd Qu.: 2.215  
 Max.   :10.7070   Max.   : 12.0260   Max.   :8.4034   Max.   :12.026  
 Direction 
 Down:  0  
 Up  :605  

# Reshape to long format so every predictor shares one boxplot panel
Weekly_long <- reshape2::melt(Weekly, id.vars = 'Direction')
Weekly_long %>% filter(variable != 'Year') %>%
  qplot(data = ., y = value, x = variable, geom = 'boxplot', col = Direction) +
  labs(y = '') +
  theme_bw() + theme(legend.position = 'top')

Apart from Year and Volume, which are highly positively correlated (r = 0.8419416), the pairwise correlations among the variables are all low. The boxplots stratified by Direction show some separation between the two groups on Today, but no obvious differences on the other variables.
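A quick check of the one strong pairwise relationship noted above:

cor(Weekly$Year, Weekly$Volume)  # 0.8419416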

Exercise 4.10 - (b)

Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?

glm_b <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
             data = Weekly, family = binomial('logit'))
summary(glm_b)

Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
    Volume, family = binomial("logit"), data = Weekly)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6949  -1.2565   0.9913   1.0849   1.4579  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.26686    0.08593   3.106   0.0019 **
Lag1        -0.04127    0.02641  -1.563   0.1181   
Lag2         0.05844    0.02686   2.175   0.0296 * 
Lag3        -0.01606    0.02666  -0.602   0.5469   
Lag4        -0.02779    0.02646  -1.050   0.2937   
Lag5        -0.01447    0.02638  -0.549   0.5833   
Volume      -0.02274    0.03690  -0.616   0.5377   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1496.2  on 1088  degrees of freedom
Residual deviance: 1486.4  on 1082  degrees of freedom
AIC: 1500.4

Number of Fisher Scoring iterations: 4
round(confint(glm_b, 2:7, .95), 4)
         2.5 % 97.5 %
Lag1   -0.0935 0.0103
Lag2    0.0062 0.1117
Lag3   -0.0687 0.0360
Lag4   -0.0800 0.0240
Lag5   -0.0665 0.0371
Volume -0.0951 0.0498

Only Lag2 is statistically significant (p = 0.0296); consistently, its 95% confidence interval (0.0062, 0.1117) is the only one that excludes zero.
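For interpretation, the Lag2 estimate can be read as an odds ratio — a sketch reusing the fit above; exp(0.05844) ≈ 1.06, i.e. a one-unit increase in Lag2 raises the odds of an Up week by roughly 6%:

# Odds-ratio view of the Lag2 coefficient and its 95% interval
exp(cbind(OR = coef(glm_b), confint(glm_b)))['Lag2', ]  # roughly 1.06 (1.006, 1.118)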

Exercise 4.10 - (c)

Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.

# Predict Up when the fitted probability of Up is at least 0.5
y_hat_c <- factor(glm_b$fitted.values >= .5,
                  levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_c, Weekly$Direction)
       
y_hat_c Down  Up
   Down   54  48
   Up    430 557
mean(y_hat_c == Weekly$Direction)
[1] 0.5610652
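The matrix shows that the model predicts Up for almost every week: it identifies only 54/484 ≈ 11% of the true Down weeks while catching 557/605 ≈ 92% of the true Up weeks. A short sketch of these class-conditional accuracies (cm is new here):

cm <- table(y_hat_c, Weekly$Direction)
round(diag(cm) / colSums(cm), 3)  # Down ≈ 0.112, Up ≈ 0.921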

Exercise 4.10 - (d)

Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).

data_tr <- Weekly %>% filter(Year <= 2008)
data_te <- Weekly %>% filter(Year > 2008)
glm_d <- glm(Direction ~ Lag2, family=binomial('logit'), data = data_tr)
summary(glm_d)

Call:
glm(formula = Direction ~ Lag2, family = binomial("logit"), data = data_tr)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.536  -1.264   1.021   1.091   1.368  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.20326    0.06428   3.162  0.00157 **
Lag2         0.05810    0.02870   2.024  0.04298 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1350.5  on 983  degrees of freedom
AIC: 1354.5

Number of Fisher Scoring iterations: 4
y_hat_d <- factor(predict.glm(glm_d, newdata = data_te, type='response') >= 0.5,
                  levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_d, data_te$Direction)
       
y_hat_d Down Up
   Down    9  5
   Up     34 56
mean(y_hat_d == data_te$Direction)
[1] 0.625
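A quick sanity check on the split: 985 training weeks and 104 held-out weeks, matching the residual degrees of freedom and the confusion-matrix totals above.

c(train = nrow(data_tr), test = nrow(data_te))  # 985, 104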

The confusion matrix shows that the model performs poorly when the true Direction is Down, getting only about 21% (9/43) of those weeks right, but performs well when the true Direction is Up, with about 92% (56/61) correct — a distinction the overall accuracy alone cannot reveal.

Exercise 4.10 - (e)

Repeat (d) using LDA.

library(MASS)
lda_e <- lda(Direction ~ Lag2, data = data_tr)
lda_e
Call:
lda(Direction ~ Lag2, data = data_tr)

Prior probabilities of groups:
     Down        Up 
0.4477157 0.5522843 

Group means:
            Lag2
Down -0.03568254
Up    0.26036581

Coefficients of linear discriminants:
           LD1
Lag2 0.4414162
plot(lda_e)

y_hat_e <- predict(lda_e, newdata = data_te)$class
table(y_hat_e, data_te$Direction)
       
y_hat_e Down Up
   Down    9  5
   Up     34 56
mean(y_hat_e == data_te$Direction)
[1] 0.625
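The confusion matrix is identical to the logistic one in (d). Both classifiers threshold a monotone function of Lag2 alone, so they end up making the same held-out predictions; a quick check (expected TRUE):

all(y_hat_e == y_hat_d)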

Exercise 4.10 - (f)

Repeat (d) using QDA.

qda_f <- qda(Direction ~ Lag2, data = data_tr)
qda_f
Call:
qda(Direction ~ Lag2, data = data_tr)

Prior probabilities of groups:
     Down        Up 
0.4477157 0.5522843 

Group means:
            Lag2
Down -0.03568254
Up    0.26036581
y_hat_f <- predict(qda_f, newdata = data_te)$class
table(y_hat_f, data_te$Direction)
       
y_hat_f Down Up
   Down    0  0
   Up     43 61
mean(y_hat_f == data_te$Direction)
[1] 0.5865385
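Note that QDA predicts Up for every held-out week, so its 58.65% accuracy is simply the base rate of Up weeks in the 2009-2010 data (61/104):

# Accuracy of an always-Up rule on the held-out weeks
mean(data_te$Direction == 'Up')  # 0.5865385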

Exercise 4.10 - (g)

Repeat (d) using KNN with \(K = 1\).

set.seed(4028)  # knn() breaks distance ties at random, so fix the seed
y_hat_g <- class::knn(cbind(data_tr$Lag2), cbind(data_te$Lag2),
                      data_tr$Direction, k = 1)
table(y_hat_g, data_te$Direction)
       
y_hat_g Down Up
   Down   21 30
   Up     22 31
mean(y_hat_g == data_te$Direction)
[1] 0.5

Exercise 4.10 - (h)

Which of these methods appears to provide the best results on this data?

KNN looks better here. Although logistic regression and LDA have the highest overall accuracy, their accuracy on weeks where the true Direction is Down is poor; KNN achieves similar accuracy on true Down and true Up weeks.
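A compact comparison of the class-conditional accuracies behind this judgement — a sketch reusing the predictions from (d) through (g); per_class is new here:

# Accuracy on true-Down and true-Up weeks for each method
per_class <- function(pred) {
  cm <- table(pred, data_te$Direction)
  diag(cm) / colSums(cm)
}
sapply(list(logit = y_hat_d, lda = y_hat_e, qda = y_hat_f, knn1 = y_hat_g),
       per_class)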

Exercise 4.10 - (i)

Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.

Logistic regression models

#LR
glm_1 <- glm(Direction ~ Lag1, data = data_tr, family=binomial('logit'))
summary(glm_1)

Call:
glm(formula = Direction ~ Lag1, family = binomial("logit"), data = data_tr)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.519  -1.253   1.028   1.094   1.281  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.21829    0.06438   3.391 0.000697 ***
Lag1        -0.05908    0.02892  -2.043 0.041059 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1350.4  on 983  degrees of freedom
AIC: 1354.4

Number of Fisher Scoring iterations: 4
y_hat_glm_1 <- factor(predict.glm(glm_1, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_1, data_te$Direction)
           
y_hat_glm_1 Down Up
       Down    4  6
       Up     39 55
mean(y_hat_glm_1 == data_te$Direction)
[1] 0.5673077
glm_2 <- glm(Direction ~ Lag2, data = data_tr, family=binomial('logit'))
summary(glm_2)

Call:
glm(formula = Direction ~ Lag2, family = binomial("logit"), data = data_tr)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.536  -1.264   1.021   1.091   1.368  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.20326    0.06428   3.162  0.00157 **
Lag2         0.05810    0.02870   2.024  0.04298 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1350.5  on 983  degrees of freedom
AIC: 1354.5

Number of Fisher Scoring iterations: 4
y_hat_glm_2 <- factor(predict.glm(glm_2, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_2, data_te$Direction)
           
y_hat_glm_2 Down Up
       Down    9  5
       Up     34 56
mean(y_hat_glm_2 == data_te$Direction)
[1] 0.625
glm_3 <- glm(Direction ~ Lag3, data = data_tr, family=binomial('logit'))
summary(glm_3)

Call:
glm(formula = Direction ~ Lag3, family = binomial("logit"), data = data_tr)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.347  -1.265   1.072   1.092   1.173  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.21206    0.06420   3.303 0.000956 ***
Lag3        -0.01686    0.02829  -0.596 0.551330    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1354.4  on 983  degrees of freedom
AIC: 1358.4

Number of Fisher Scoring iterations: 3
y_hat_glm_3 <- factor(predict.glm(glm_3, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_3, data_te$Direction)
           
y_hat_glm_3 Down Up
       Down    0  0
       Up     43 61
mean(y_hat_glm_3 == data_te$Direction)
[1] 0.5865385
glm_4 <- glm(Direction ~ Lag4, data = data_tr, family=binomial('logit'))
summary(glm_4)

Call:
glm(formula = Direction ~ Lag4, family = binomial("logit"), data = data_tr)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.373  -1.265   1.076   1.091   1.146  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.21154    0.06419   3.296 0.000982 ***
Lag4        -0.01302    0.02827  -0.460 0.645158    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1354.5  on 983  degrees of freedom
AIC: 1358.5

Number of Fisher Scoring iterations: 3
y_hat_glm_4 <- factor(predict.glm(glm_4, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_4, data_te$Direction)
           
y_hat_glm_4 Down Up
       Down    0  0
       Up     43 61
mean(y_hat_glm_4 == data_te$Direction)
[1] 0.5865385
glm_5 <- glm(Direction ~ Lag5, data = data_tr, family=binomial('logit'))
summary(glm_5)

Call:
glm(formula = Direction ~ Lag5, family = binomial("logit"), data = data_tr)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.536  -1.261   1.055   1.092   1.255  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.21419    0.06424   3.334 0.000856 ***
Lag5        -0.03285    0.02844  -1.155 0.247978    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1353.4  on 983  degrees of freedom
AIC: 1357.4

Number of Fisher Scoring iterations: 3
y_hat_glm_5 <- factor(predict.glm(glm_5, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_5, data_te$Direction)
           
y_hat_glm_5 Down Up
       Down    0  3
       Up     43 58
mean(y_hat_glm_5 == data_te$Direction)
[1] 0.5576923
glm_6 <- glm(Direction ~ Volume, data = data_tr, family=binomial('logit'))
summary(glm_6)

Call:
glm(formula = Direction ~ Volume, family = binomial("logit"), 
    data = data_tr)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.302  -1.271   1.060   1.080   1.303  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.29405    0.08899   3.304 0.000952 ***
Volume      -0.06954    0.05089  -1.367 0.171778    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1352.8  on 983  degrees of freedom
AIC: 1356.8

Number of Fisher Scoring iterations: 3
y_hat_glm_6 <- factor(predict.glm(glm_6, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_6, data_te$Direction)
           
y_hat_glm_6 Down Up
       Down   31 47
       Up     12 14
mean(y_hat_glm_6 == data_te$Direction)
[1] 0.4326923
glm_7 <- glm(Direction ~ Lag1 + Lag2 + Volume, data = data_tr, family=binomial('logit'))
summary(glm_7)

Call:
glm(formula = Direction ~ Lag1 + Lag2 + Volume, family = binomial("logit"), 
    data = data_tr)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4681  -1.2581   0.9929   1.0840   1.5339  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  0.29792    0.09136   3.261  0.00111 **
Lag1        -0.05975    0.02917  -2.048  0.04054 * 
Lag2         0.04774    0.02941   1.624  0.10446   
Volume      -0.07093    0.05263  -1.348  0.17777   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1345.1  on 981  degrees of freedom
AIC: 1353.1

Number of Fisher Scoring iterations: 4
y_hat_glm_7 <- factor(predict.glm(glm_7, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_7, data_te$Direction)
           
y_hat_glm_7 Down Up
       Down   27 33
       Up     16 28
mean(y_hat_glm_7 == data_te$Direction)
[1] 0.5288462
glm_8 <- glm(Direction ~ Lag1*Lag2*Volume, data = data_tr, family=binomial('logit'))
summary(glm_8)

Call:
glm(formula = Direction ~ Lag1 * Lag2 * Volume, family = binomial("logit"), 
    data = data_tr)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5521  -1.2561   0.9982   1.0908   1.4513  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)   
(Intercept)       0.2895010  0.0924319   3.132  0.00174 **
Lag1             -0.0500776  0.0446962  -1.120  0.26254   
Lag2              0.0413390  0.0443998   0.931  0.35182   
Volume           -0.0674435  0.0561516  -1.201  0.22971   
Lag1:Lag2        -0.0194069  0.0146837  -1.322  0.18628   
Lag1:Volume       0.0007076  0.0216339   0.033  0.97391   
Lag2:Volume       0.0105645  0.0204421   0.517  0.60529   
Lag1:Lag2:Volume  0.0061637  0.0050393   1.223  0.22128   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1341.5  on 977  degrees of freedom
AIC: 1357.5

Number of Fisher Scoring iterations: 5
y_hat_glm_8 <- factor(predict.glm(glm_8, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_8, data_te$Direction)
           
y_hat_glm_8 Down Up
       Down   21 25
       Up     22 36
mean(y_hat_glm_8 == data_te$Direction)
[1] 0.5480769
glm_9 <- glm(Direction ~ Lag1*Lag2*Volume - Lag1:Volume, data = data_tr, family=binomial('logit'))
summary(glm_9)

Call:
glm(formula = Direction ~ Lag1 * Lag2 * Volume - Lag1:Volume, 
    family = binomial("logit"), data = data_tr)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5516  -1.2564   0.9993   1.0904   1.4520  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)   
(Intercept)       0.289757   0.092104   3.146  0.00166 **
Lag1             -0.049040   0.031477  -1.558  0.11925   
Lag2              0.041515   0.044059   0.942  0.34606   
Volume           -0.067905   0.054358  -1.249  0.21159   
Lag1:Lag2        -0.019307   0.014361  -1.344  0.17880   
Lag2:Volume       0.010392   0.019724   0.527  0.59829   
Lag1:Lag2:Volume  0.006083   0.004396   1.384  0.16644   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1341.5  on 978  degrees of freedom
AIC: 1355.5

Number of Fisher Scoring iterations: 5
y_hat_glm_9 <- factor(predict.glm(glm_9, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_9, data_te$Direction)
           
y_hat_glm_9 Down Up
       Down   21 25
       Up     22 36
mean(y_hat_glm_9 == data_te$Direction)
[1] 0.5480769
glm_10 <- glm(Direction ~ Lag1*Lag2*Volume - Lag1:Volume - Lag2:Volume, data = data_tr, family=binomial('logit'))
summary(glm_10)

Call:
glm(formula = Direction ~ Lag1 * Lag2 * Volume - Lag1:Volume - 
    Lag2:Volume, family = binomial("logit"), data = data_tr)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.566  -1.250   0.994   1.093   1.378  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)   
(Intercept)       0.289635   0.092067   3.146  0.00166 **
Lag1             -0.050275   0.031385  -1.602  0.10918   
Lag2              0.058587   0.030861   1.898  0.05764 . 
Volume           -0.070094   0.054031  -1.297  0.19454   
Lag1:Lag2        -0.019328   0.014178  -1.363  0.17280   
Lag1:Lag2:Volume  0.005562   0.003868   1.438  0.15045   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1341.8  on 979  degrees of freedom
AIC: 1353.8

Number of Fisher Scoring iterations: 5
y_hat_glm_10 <- factor(predict.glm(glm_10, type='response', newdata = data_te) >= 0.5, 
                      levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_10, data_te$Direction)
            
y_hat_glm_10 Down Up
        Down   22 31
        Up     21 30
mean(y_hat_glm_10 == data_te$Direction)
[1] 0.5
glm_11 <- glm(Direction ~ Lag1*Lag2 + Lag1:Lag2:Volume, data = data_tr, family=binomial('logit'))
summary(glm_11)

Call:
glm(formula = Direction ~ Lag1 * Lag2 + Lag1:Lag2:Volume, family = binomial("logit"), 
    data = data_tr)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6017  -1.2480   0.9974   1.0967   1.3040  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)   
(Intercept)       0.205166   0.064854   3.164  0.00156 **
Lag1             -0.044740   0.031094  -1.439  0.15018   
Lag2              0.062449   0.030647   2.038  0.04158 * 
Lag1:Lag2        -0.019756   0.014265  -1.385  0.16609   
Lag1:Lag2:Volume  0.005965   0.003979   1.499  0.13382   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1354.7  on 984  degrees of freedom
Residual deviance: 1343.4  on 980  degrees of freedom
AIC: 1353.4

Number of Fisher Scoring iterations: 5
y_hat_glm_11 <- factor(predict.glm(glm_11, type='response', newdata = data_te) >= 0.5, 
                       levels = c(F, T), labels = c('Down', 'Up'))
table(y_hat_glm_11, data_te$Direction)
            
y_hat_glm_11 Down Up
        Down    6  8
        Up     37 53
mean(y_hat_glm_11 == data_te$Direction)
[1] 0.5673077

It seems that glm_2, which uses Lag2 as the only predictor, performs best among these logistic regression models (62.5% held-out accuracy).
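One compact way to line the eleven fits up — a sketch using mget on the object names above — is to compare in-sample AIC alongside the held-out accuracies already reported:

# Training AIC for glm_1 ... glm_11 (lower is better in-sample;
# it need not agree with held-out accuracy)
sapply(mget(paste0('glm_', 1:11)), AIC)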

Linear discriminant analysis

lda_1 <- lda(Direction ~ Lag1, data = data_tr)
y_hat_lda_1 <- predict(lda_1, newdata = data_te)$class
table(y_hat_lda_1, data_te$Direction)
           
y_hat_lda_1 Down Up
       Down    4  6
       Up     39 55
mean(y_hat_lda_1 == data_te$Direction)
[1] 0.5673077
lda_2 <- lda(Direction ~ Lag2, data = data_tr)
y_hat_lda_2 <- predict(lda_2, newdata = data_te)$class
table(y_hat_lda_2, data_te$Direction)
           
y_hat_lda_2 Down Up
       Down    9  5
       Up     34 56
mean(y_hat_lda_2 == data_te$Direction)
[1] 0.625
lda_3 <- lda(Direction ~ Lag3, data = data_tr)
y_hat_lda_3 <- predict(lda_3, newdata = data_te)$class
table(y_hat_lda_3, data_te$Direction)
           
y_hat_lda_3 Down Up
       Down    0  0
       Up     43 61
mean(y_hat_lda_3 == data_te$Direction)
[1] 0.5865385
lda_4 <- lda(Direction ~ Lag4, data = data_tr)
y_hat_lda_4 <- predict(lda_4, newdata = data_te)$class
table(y_hat_lda_4, data_te$Direction)
           
y_hat_lda_4 Down Up
       Down    0  0
       Up     43 61
mean(y_hat_lda_4 == data_te$Direction)
[1] 0.5865385
lda_5 <- lda(Direction ~ Lag5, data = data_tr)
y_hat_lda_5 <- predict(lda_5, newdata = data_te)$class
table(y_hat_lda_5, data_te$Direction)
           
y_hat_lda_5 Down Up
       Down    0  3
       Up     43 58
mean(y_hat_lda_5 == data_te$Direction)
[1] 0.5576923
lda_6 <- lda(Direction ~ Volume, data = data_tr)
y_hat_lda_6 <- predict(lda_6, newdata = data_te)$class
table(y_hat_lda_6, data_te$Direction)
           
y_hat_lda_6 Down Up
       Down   33 47
       Up     10 14
mean(y_hat_lda_6 == data_te$Direction)
[1] 0.4519231
lda_7 <- lda(Direction ~ Lag2*Lag3, data = data_tr)
y_hat_lda_7 <- predict(lda_7, newdata = data_te)$class
table(y_hat_lda_7, data_te$Direction)
           
y_hat_lda_7 Down Up
       Down    8  4
       Up     35 57
mean(y_hat_lda_7 == data_te$Direction)
[1] 0.625

It seems that lda_2 performs best among these LDA models; lda_7 (Lag2*Lag3) ties it at 62.5%, but lda_2 is simpler.

Quadratic discriminant analysis

qda_1 <- qda(Direction ~ Lag1, data = data_tr)
y_hat_qda_1 <- predict(qda_1, newdata = data_te)$class
mean(y_hat_qda_1 == data_te$Direction)
[1] 0.5865385
table(y_hat_qda_1, data_te$Direction)
           
y_hat_qda_1 Down Up
       Down    0  0
       Up     43 61
qda_2 <- qda(Direction ~ Lag2, data = data_tr)
y_hat_qda_2 <- predict(qda_2, newdata = data_te)$class
mean(y_hat_qda_2 == data_te$Direction)
[1] 0.5865385
table(y_hat_qda_2, data_te$Direction)
           
y_hat_qda_2 Down Up
       Down    0  0
       Up     43 61
qda_3 <- qda(Direction ~ Lag3, data = data_tr)
y_hat_qda_3 <- predict(qda_3, newdata = data_te)$class
mean(y_hat_qda_3 == data_te$Direction)
[1] 0.5865385
table(y_hat_qda_3, data_te$Direction)
           
y_hat_qda_3 Down Up
       Down    0  0
       Up     43 61
qda_4 <- qda(Direction ~ Lag4, data = data_tr)
y_hat_qda_4 <- predict(qda_4, newdata = data_te)$class
mean(y_hat_qda_4 == data_te$Direction)
[1] 0.5384615
table(y_hat_qda_4, data_te$Direction)
           
y_hat_qda_4 Down Up
       Down    7 12
       Up     36 49
qda_5 <- qda(Direction ~ Lag5, data = data_tr)
y_hat_qda_5 <- predict(qda_5, newdata = data_te)$class
mean(y_hat_qda_5 == data_te$Direction)
[1] 0.4807692
table(y_hat_qda_5, data_te$Direction)
           
y_hat_qda_5 Down Up
       Down    2 13
       Up     41 48
qda_6 <- qda(Direction ~ Volume, data = data_tr)
y_hat_qda_6 <- predict(qda_6, newdata = data_te)$class
mean(y_hat_qda_6 == data_te$Direction)
[1] 0.4326923
table(y_hat_qda_6, data_te$Direction)
           
y_hat_qda_6 Down Up
       Down   43 59
       Up      0  2
qda_7 <- qda(Direction ~ Lag1 + Lag2, data = data_tr)
y_hat_qda_7 <- predict(qda_7, newdata = data_te)$class
mean(y_hat_qda_7 == data_te$Direction)
[1] 0.5576923
table(y_hat_qda_7, data_te$Direction)
           
y_hat_qda_7 Down Up
       Down    7 10
       Up     36 51
qda_8 <- qda(Direction ~ Lag2 + Lag3, data = data_tr)
y_hat_qda_8 <- predict(qda_8, newdata = data_te)$class
mean(y_hat_qda_8 == data_te$Direction)
[1] 0.6057692
table(y_hat_qda_8, data_te$Direction)
           
y_hat_qda_8 Down Up
       Down    4  2
       Up     39 59
qda_9 <- qda(Direction ~ Lag1 + Lag2 + Lag3, data = data_tr)
y_hat_qda_9 <- predict(qda_9, newdata = data_te)$class
mean(y_hat_qda_9 == data_te$Direction)
[1] 0.5480769
table(y_hat_qda_9, data_te$Direction)
           
y_hat_qda_9 Down Up
       Down    6 10
       Up     37 51

It seems that qda_8 (Lag2 + Lag3) performs best among the QDA models, at 60.6% held-out accuracy.

K nearest neighbor

KNN with Lag2

f_knn <- function(k, data_tr, data_te, random_state = sample(1:100, 1)) {
  set.seed(random_state)
  y_hat <- class::knn(cbind(data_tr$Lag2), cbind(data_te$Lag2),
                      data_tr$Direction, k)
  cat(paste0('k = ', k, '\n'))
  cat(paste0('Average accuracy rate: ', round(mean(y_hat == data_te$Direction), 4)))
  print(table(y_hat, data_te$Direction))
  cat('\n')
}

lst <- lapply(2:5, function(k) f_knn(k, data_tr, data_te, 4028))
k = 2
Average accuracy rate: 0.5481      
y_hat  Down Up
  Down   19 23
  Up     24 38

k = 3
Average accuracy rate: 0.5481      
y_hat  Down Up
  Down   15 19
  Up     28 42

k = 4
Average accuracy rate: 0.5673      
y_hat  Down Up
  Down   18 20
  Up     25 41

k = 5
Average accuracy rate: 0.5385      
y_hat  Down Up
  Down   16 21
  Up     27 40

KNN with Lag1 and Lag2

f_knn2 <- function(k, data_tr, data_te, random_state = sample(1:100, 1)) {
  set.seed(random_state)
  y_hat <- class::knn(cbind(data_tr$Lag1, data_tr$Lag2),
                      cbind(data_te$Lag1, data_te$Lag2),
                      data_tr$Direction, k)
  cat(paste0('k = ', k, '\n'))
  cat(paste0('Average accuracy rate: ', round(mean(y_hat == data_te$Direction), 4)))
  print(table(y_hat, data_te$Direction))
  cat('\n')
}

lst <- lapply(2:5, function(k) f_knn2(k, data_tr, data_te, 4028))
k = 2
Average accuracy rate: 0.5192      
y_hat  Down Up
  Down   20 27
  Up     23 34

k = 3
Average accuracy rate: 0.5192      
y_hat  Down Up
  Down   22 29
  Up     21 32

k = 4
Average accuracy rate: 0.5      
y_hat  Down Up
  Down   21 30
  Up     22 31

k = 5
Average accuracy rate: 0.4904      
y_hat  Down Up
  Down   22 32
  Up     21 29

KNN with Lag2 and Lag3

f_knn3 <- function(k, data_tr, data_te, random_state = sample(1:100, 1)) {
  set.seed(random_state)
  y_hat <- class::knn(cbind(data_tr$Lag2, data_tr$Lag3),
                      cbind(data_te$Lag2, data_te$Lag3),
                      data_tr$Direction, k)
  cat(paste0('k = ', k, '\n'))
  cat(paste0('Average accuracy rate: ', round(mean(y_hat == data_te$Direction), 4)))
  print(table(y_hat, data_te$Direction))
  cat('\n')
}

lst <- lapply(2:5, function(k) f_knn3(k, data_tr, data_te, 4028))
k = 2
Average accuracy rate: 0.5      
y_hat  Down Up
  Down   14 23
  Up     29 38

k = 3
Average accuracy rate: 0.5481      
y_hat  Down Up
  Down   16 20
  Up     27 41

k = 4
Average accuracy rate: 0.5481      
y_hat  Down Up
  Down   14 18
  Up     29 43

k = 5
Average accuracy rate: 0.5385      
y_hat  Down Up
  Down   15 20
  Up     28 41
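The three f_knn* helpers above differ only in which predictor columns they bind. A single parameterised version avoids the duplication — a sketch, where f_knn_vars and its vars argument are new here:

# Evaluate KNN for any set of predictor columns named in `vars`
f_knn_vars <- function(k, vars, data_tr, data_te, random_state = 4028) {
  set.seed(random_state)  # knn() breaks distance ties at random
  y_hat <- class::knn(as.matrix(data_tr[, vars, drop = FALSE]),
                      as.matrix(data_te[, vars, drop = FALSE]),
                      data_tr$Direction, k)
  cat('k =', k, 'accuracy =', round(mean(y_hat == data_te$Direction), 4), '\n')
  print(table(y_hat, data_te$Direction))
  invisible(y_hat)
}
# e.g. f_knn_vars(4, 'Lag2', data_tr, data_te)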

It seems that KNN with \(k=4\) and Lag2 as the only predictor performs best among these KNN classifiers.

Overall, KNN with Lag2 as the only predictor and \(k = 4\) looks best. Although logistic regression and LDA attain the highest overall accuracy, their accuracy on weeks where the true Direction is Down is very low. KNN's overall accuracy is only slightly lower, while its accuracy on true Down weeks is much better than that of logistic regression and LDA.