STAA 577: Laboratory Four
Classification (tidyverse)
Adapted by Tavener & Field, from James, Witten, Hastie and Tibshirani

Load necessary libraries

options(warnPartialMatchArgs = FALSE)  # don't want these warnings
library(magrittr)     # pipes
library(tibble)       # tibbles
library(dplyr)        # data wrangling
library(purrr)        # iteration
library(ggplot2)      # tidy plotting
library(broom)        # summarize model objects uniformly
library(yardstick)    # classification performance measures
library(class)        # knn()
library(MASS)         # lda() and qda()
library(ISLR)         # Smarket data set

1 S&P Stock Market (Smarket)

class(Smarket)                           # currently data frame
[1] "data.frame"
Smarket %<>% as_tibble()                 # convert to `tibble` (as.tibble() is deprecated)
class(Smarket)                           # tibbles are `tidyverse` friendly
[1] "tbl_df"     "tbl"        "data.frame"
names(Smarket)                           # List features/covariates available
[1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
[7] "Volume"    "Today"     "Direction"
dim(Smarket)                             # dimensions of data
[1] 1250    9
Smarket                                  # echo Smarket (top 10 rows)
summary(Smarket)                         # S3 summary method for class `data.frame`
      Year           Lag1                Lag2                Lag3          
 Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000  
 1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000  
 Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500  
 Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716  
 3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
 Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000  
      Lag4                Lag5              Volume           Today          
 Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000  
 1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500  
 Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500  
 Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138  
 3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750  
 Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000  
 Direction 
 Down:602  
 Up  :648  
pairs(Smarket)                           # pairs plot of 9-choose-2 combinations

# cor(Smarket)                           # Error; non-numerics in 'Direction'
class(Smarket$Direction)                 # Factor
[1] "factor"
cor(dplyr::select(Smarket, -Direction))  # remove 'Direction' and retry
             Year         Lag1         Lag2         Lag3         Lag4         Lag5
Year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718  0.029787995
Lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911 -0.005674606
Lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533 -0.003557949
Lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036 -0.018808338
Lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000 -0.027083641
Lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641  1.000000000
Volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246 -0.022002315
Today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527 -0.034860083
            Volume        Today
Year    0.53900647  0.030095229
Lag1    0.04090991 -0.026155045
Lag2   -0.04338321 -0.010250033
Lag3   -0.04182369 -0.002447647
Lag4   -0.04841425 -0.006899527
Lag5   -0.02200231 -0.034860083
Volume  1.00000000  0.014591823
Today   0.01459182  1.000000000
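An equivalent call that scales better when there are many non-numeric columns: keep every numeric column via a predicate rather than dropping Direction by name (a sketch using dplyr::select_if()):

cor(dplyr::select_if(Smarket, is.numeric))   # same matrix as above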
Smarket %>%
  ggplot(aes(x = seq_along(Volume), y = Volume)) +  # plot `Volume` over time
  geom_point(alpha = 0.5) +                         # points
  xlab("Time") +
  geom_smooth()                                     # smooth fit


2 Logistic regression

2.1 Using the entire data set

glm_fit <- stats::glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 +
                      Lag5 + Volume, data = Smarket, family = "binomial")
class(glm_fit)
[1] "glm" "lm" 
summary(glm_fit)                  # Statistical summary

Call:
stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + 
    Lag5 + Volume, family = "binomial", data = Smarket)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.446  -1.203   1.065   1.145   1.326  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.126000   0.240736  -0.523    0.601
Lag1        -0.073074   0.050167  -1.457    0.145
Lag2        -0.042301   0.050086  -0.845    0.398
Lag3         0.011085   0.049939   0.222    0.824
Lag4         0.009359   0.049974   0.187    0.851
Lag5         0.010313   0.049511   0.208    0.835
Volume       0.135441   0.158360   0.855    0.392

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1731.2  on 1249  degrees of freedom
Residual deviance: 1727.6  on 1243  degrees of freedom
AIC: 1741.6

Number of Fisher Scoring iterations: 3
coef(glm_fit)                     # Coefficients of the fit
 (Intercept)         Lag1         Lag2         Lag3         Lag4         Lag5 
-0.126000257 -0.073073746 -0.042301344  0.011085108  0.009358938  0.010313068 
      Volume 
 0.135440659 
sum_tbl <- broom::tidy(glm_fit)   # use `broom` to summarize
sum_tbl
sum_tbl %>% dplyr::pull(p.value)  # `pull` vector of p-values for each covariate
[1] 0.6006983 0.1452272 0.3983491 0.8243333 0.8514445 0.8349974 0.3924004
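broom::glance() is the model-level companion to tidy(), returning a one-row tibble:

broom::glance(glm_fit)   # null/residual deviance, AIC, BIC, logLik, etc.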
# Make predictions (training only)
glm_probs <- predict(glm_fit, type = "response")    # S3 predict method
class(glm_probs)
[1] "numeric"
glm_probs %>% head(10)
        1         2         3         4         5         6         7         8 
0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 0.5092292 
        9        10 
0.5176135 0.4888378 
Smarket$Direction %>% head(10)
 [1] Up   Up   Down Up   Up   Up   Down Up   Up   Up  
Levels: Down Up
contrasts(Smarket$Direction)
     Up
Down  0
Up    1
glm_pred <- ifelse(glm_probs > 0.5, "Up", "Down")  # "Up" is coded 1, so these are P(Up); 0.5 cutoff
# Confusion matrix
cmat <- table(truth = Smarket$Direction, pred = glm_pred)
addmargins(cmat)
      pred
truth  Down   Up  Sum
  Down  145  457  602
  Up    141  507  648
  Sum   286  964 1250
# Accuracy (training)
sum(diag(cmat)) / sum(cmat)
[1] 0.5216
# Alternative method (sanity check)
# note: the mean of a logical vector is the proportion of TRUEs,
# because TRUE counts as 1 and FALSE as 0
mean(glm_pred == Smarket$Direction)  # proportion predicted correctly
[1] 0.5216
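yardstick was loaded above but not yet used; as a sketch, its vector interface cross-checks the accuracy (class metrics expect factors with matching levels):

yardstick::accuracy_vec(truth    = Smarket$Direction,
                        estimate = factor(glm_pred, levels = levels(Smarket$Direction)))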

2.1.1 Confusion matrices

There are various (arbitrary!) ways to lay out prediction performance summaries, none more correct than any other. For consistency, this course will use the construction below, where the truth labels run down the left and the predictions along the top. Negatives (i.e. controls) occur first, followed by positives. With this construction:

  • accuracy is the diagonal sum divided by the total sum
  • specificity is the proportion of the first row (the true negatives) that is predicted correctly
  • sensitivity is the proportion of the second row (the true positives) that is predicted correctly
     Prediction
Truth neg pos
  neg TN  FP 
  pos FN  TP 

There are other predictive measures (e.g. the negative and positive predictive values, NPV and PPV) that become more important depending on the problem. For a complete guide, see the Wikipedia entry on the confusion matrix: https://en.wikipedia.org/wiki/Confusion_matrix
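A minimal base-R sketch of the three definitions above, assuming a 2 x 2 table cm laid out as shown (truth in rows, negatives first); the name confusion_summary is purely illustrative:

confusion_summary <- function(cm) {
  stopifnot(all(dim(cm) == c(2, 2)))
  c(accuracy    = sum(diag(cm)) / sum(cm),
    specificity = cm[1, 1] / sum(cm[1, ]),   # TN / (TN + FP)
    sensitivity = cm[2, 2] / sum(cm[2, ]))   # TP / (TP + FN)
}
# e.g. confusion_summary(table(truth = Smarket$Direction, pred = glm_pred))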


2.2 Using a training (sub)set

# Training on data prior to 2005
train <- dplyr::filter(Smarket, Year < 2005)
# Test on data post-2004
test  <- dplyr::filter(Smarket, Year >= 2005)
# Sanity checks
nrow(train)
[1] 998
nrow(test)
[1] 252
nrow(train) + nrow(test) == nrow(Smarket)  # sanity check
[1] TRUE
head(test$Direction)
[1] Down Down Down Up   Down Up  
Levels: Down Up
# Fit LR model to training set
# and immediately make predictions via `%>%`
glm_probs <- stats::glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                        data = train, family = "binomial")  %>%  # fit LR model
  predict(newdata = test, type = "response")     # predict on test set
# Recast predictions
cutoff   <- 0.5
glm_pred <- ifelse(glm_probs > cutoff, "Up", "Down")
# Confusion matrix
cmat <- table(truth = test$Direction, pred = glm_pred)
addmargins(cmat)
      pred
truth  Down  Up Sum
  Down   77  34 111
  Up     97  44 141
  Sum   174  78 252
# Accuracy
acc <- sum(diag(cmat)) / sum(cmat)
acc
[1] 0.4801587
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
       Up 
0.3120567 
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
     Down 
0.6936937 
# Sanity checks
mean(glm_pred == test$Direction) == acc      # recall proportion trick!
[1] TRUE
mean(glm_pred != test$Direction) == 1 - acc  # machine precision error!
[1] FALSE
# Try `dplyr::near()`; safer than `==` when floating point errors can bite
dplyr::near(mean(glm_pred != test$Direction), 1 - acc)
[1] TRUE
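For intuition, a tiny standalone example of why == is risky with doubles:

0.1 + 0.2 == 0.3                 # FALSE on most machines
dplyr::near(0.1 + 0.2, 0.3)      # TRUE; default tolerance is .Machine$double.eps^0.5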

2.3 Using a subset of predictors

# Fit using only 2 predictors; Lag1 and Lag2
glm_fit2 <- stats::glm(Direction ~ Lag1 + Lag2 ,
                       data = train,
                       family = "binomial")
glm_probs <- predict(glm_fit2, newdata = test, type = "response")
cutoff    <- 0.5
glm_pred  <- ifelse(glm_probs > cutoff, "Up", "Down")
# Confusion matrix
cmat <- table(truth = test$Direction, pred = glm_pred)
addmargins(cmat)
      pred
truth  Down  Up Sum
  Down   35  76 111
  Up     35 106 141
  Sum    70 182 252
# Accuracy
acc <- sum(diag(cmat)) / sum(cmat)
acc
[1] 0.5595238
# Sanity check
mean(glm_pred == test$Direction) == acc
[1] TRUE
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
      Up 
0.751773 
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
     Down 
0.3153153 
# Predictions for specific values of Lag1 and Lag2 are also possible
predict(glm_fit2,
        newdata = data.frame(Lag1 = c(1.2, 1.5),
                             Lag2 = c(1.1, -0.8)),
        type = "response")
        1         2 
0.4791462 0.4960939 

3 Discriminant Analyses

3.1 Linear Discriminant Analysis (LDA)

lda_fit <- MASS::lda(Direction ~ Lag1 + Lag2, data = train)
class(lda_fit)
[1] "lda"
lda_fit
Call:
lda(Direction ~ Lag1 + Lag2, data = train)

Prior probabilities of groups:
    Down       Up 
0.491984 0.508016 

Group means:
            Lag1        Lag2
Down  0.04279022  0.03389409
Up   -0.03954635 -0.03132544

Coefficients of linear discriminants:
            LD1
Lag1 -0.6420190
Lag2 -0.5135293
# plot histogram/density/both for the `training`(!) set; split by class labels
plot(lda_fit, type = "hist", col = "grey")     # S3 plot method

# Predict on the `test` set
# S3 predict method for class `lda` (above) returns 3 elements:
#   class     = the class prediction for each observation (prob. >= 0.5 cutoff)
#   x         = the linear discriminants
#   posterior = the posterior probabilities by class for each observation
lda_pred <- predict(lda_fit, newdata = test)   
head(lda_pred$x, 20)           # scores (linear discriminants) for each class
           LD1
1   0.08293096
2   0.59114102
3   1.16723063
4   0.83335022
5  -0.03792892
6  -0.08743142
7  -0.14512719
8   0.21701324
9   0.05873792
10  0.35068642
11  0.05897298
12 -0.92794134
13  0.11370190
14  0.98783874
15  0.81206862
16  0.55681363
17 -0.07452314
18 -0.51514029
19 -0.27386231
20  0.15458312
range(lda_pred$x)              # min/max score
[1] -1.309852  1.587994

Sidenote: To convince yourself that the plot() method above displays the linear discriminants of the training set by default, run the code below yourself and compare:

lda_pred_easy <- predict(lda_fit, newdata = train) # `predict()` on training set
lda_pred_easy %>%
  purrr::pluck("x") %>%            # pull out 'x'; linear discriminants
  hist(prob = TRUE, col = "grey",  # plot 'x' in histogram
       main = "", xlab = "Up/Down Combined")
head(lda_pred$posterior, 20)   # posterior probabilities each class
        Down        Up
1  0.4901792 0.5098208
2  0.4792185 0.5207815
3  0.4668185 0.5331815
4  0.4740011 0.5259989
5  0.4927877 0.5072123
6  0.4938562 0.5061438
7  0.4951016 0.5048984
8  0.4872861 0.5127139
9  0.4907013 0.5092987
10 0.4844026 0.5155974
11 0.4906963 0.5093037
12 0.5119988 0.4880012
13 0.4895152 0.5104848
14 0.4706761 0.5293239
15 0.4744593 0.5255407
16 0.4799583 0.5200417
17 0.4935775 0.5064225
18 0.5030894 0.4969106
19 0.4978806 0.5021194
20 0.4886331 0.5113669
head(lda_pred$class, 20)       # predicted class
 [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up   Up   Up  
[17] Up   Down Up   Up  
Levels: Down Up
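As a quick sketch, class is simply the posterior thresholded at 0.5 (ties at exactly 0.5 aside):

manual_class <- ifelse(lda_pred$posterior[, "Up"] > 0.5, "Up", "Down")
all(manual_class == as.character(lda_pred$class))   # expect TRUE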
lda_class <- lda_pred$class    # predicted class
# Confusion matrix
cmat <- table(truth = test$Direction, pred = lda_class)
addmargins(cmat)
      pred
truth  Down  Up Sum
  Down   35  76 111
  Up     35 106 141
  Sum    70 182 252
# Accuracy
acc <- sum(diag(cmat)) / sum(cmat)
acc
[1] 0.5595238
# Sanity check
mean(lda_class == test$Direction) == acc
[1] TRUE
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
      Up 
0.751773 
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
     Down 
0.3153153 
# Specify decision criteria
sum(lda_pred$posterior[, "Down"] >= 0.5)  # compare to addmargins() above
[1] 70
sum(lda_pred$posterior[, "Down"] < 0.5)   # compare to addmargins() above
[1] 182
sum(lda_pred$posterior[, "Down"] > 0.9)   # no "Down" posterior exceeds 0.9
[1] 0
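The decision criterion need not be 0.5. A sketch with a (purely illustrative) stricter cutoff for calling "Down":

strict_pred <- ifelse(lda_pred$posterior[, "Down"] > 0.52, "Down", "Up")
table(truth = test$Direction, pred = strict_pred)   # fewer "Down" calls than at 0.5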

3.2 Quadratic Discriminant Analysis (QDA)

qda_fit <- MASS::qda(Direction ~ Lag1 + Lag2, data = train)
qda_fit
Call:
qda(Direction ~ Lag1 + Lag2, data = train)

Prior probabilities of groups:
    Down       Up 
0.491984 0.508016 

Group means:
            Lag1        Lag2
Down  0.04279022  0.03389409
Up   -0.03954635 -0.03132544
qda_class <- predict(qda_fit, newdata = test)$class
# Confusion matrix
cmat <- table(truth = test$Direction, pred = qda_class)
addmargins(cmat)
      pred
truth  Down  Up Sum
  Down   30  81 111
  Up     20 121 141
  Sum    50 202 252
# Accuracy
acc <- sum(diag(cmat)) / sum(cmat)
acc
[1] 0.5992063
# Sanity check
mean(qda_class == test$Direction) == acc
[1] TRUE
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
      Up 
0.858156 
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
     Down 
0.2702703 
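As a sketch, the test-set accuracies of the models fit so far can be gathered into one tibble for side-by-side comparison (assumes glm_fit2, lda_fit and qda_fit are still in the workspace):

glm_acc <- mean(ifelse(predict(glm_fit2, newdata = test, type = "response") > 0.5,
                       "Up", "Down") == test$Direction)
tibble::tibble(model = c("Logistic (Lag1 + Lag2)", "LDA", "QDA"),
               acc   = c(glm_acc,
                         mean(predict(lda_fit, newdata = test)$class == test$Direction),
                         mean(predict(qda_fit, newdata = test)$class == test$Direction)))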

4 KNN: K-Nearest Neighbors

4.1 K = 1

training_classes <- train$Direction  # class::knn() takes the true classes as a separate argument
knn_train    <- dplyr::select(train, Lag1, Lag2)  # and remove `Direction`
test_classes <- test$Direction
knn_test     <- dplyr::select(test, Lag1, Lag2)
set.seed(1)
# Smallest neighborhood, K = 1
knn_pred_class <- class::knn(train = knn_train,
                             test = knn_test,
                             cl = training_classes, k = 1)
# Confusion matrix
cmat <- table(truth = test_classes, pred = knn_pred_class)
addmargins(cmat)
      pred
truth  Down  Up Sum
  Down   43  68 111
  Up     58  83 141
  Sum   101 151 252
# Accuracy: Really bad for K = 1!
acc <- sum(diag(cmat)) / sum(cmat)    # a coin flip!
acc
[1] 0.5
# Sanity check
mean(knn_pred_class == test_classes) == acc
[1] TRUE
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
       Up 
0.5886525 
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
     Down 
0.3873874 
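One caveat before moving on: class::knn() measures plain Euclidean distance, so predictors on very different scales should be standardized first. Lag1 and Lag2 share the same units here, so it matters little, but a sketch of the usual recipe (scaling the test set by the training statistics) looks like:

knn_train_std <- scale(knn_train)      # center/scale by training data
knn_test_std  <- scale(knn_test,
                       center = attr(knn_train_std, "scaled:center"),
                       scale  = attr(knn_train_std, "scaled:scale"))
# then pass knn_train_std/knn_test_std to class::knn() as before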

4.2 K = 1, 2, …, 10

Iterate over K = 1, 2, ..., 10 using purrr::map_df().

purrr::map_df(1:10, function(.x) {
  cmat <- class::knn(knn_train, knn_test, training_classes, k = .x) %>%
    table(truth = test_classes, pred = .)
  tibble::tibble(K    = .x,
                 acc  = sum(diag(cmat)) / sum(cmat),     # accuracy
                 sens = prop.table(cmat["Up", ])[2],     # sensitivity
                 spec = prop.table(cmat["Down", ])[1])   # specificity
})
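To eyeball a good K, capture the tibble and plot accuracy against K; a sketch (the object name knn_metrics is just illustrative):

knn_metrics <- purrr::map_df(1:10, function(.x) {
  cmat <- class::knn(knn_train, knn_test, training_classes, k = .x) %>%
    table(truth = test_classes, pred = .)
  tibble::tibble(K = .x, acc = sum(diag(cmat)) / sum(cmat))
})
# note: knn() breaks distance ties at random; set.seed() for reproducibility
ggplot(knn_metrics, aes(x = K, y = acc)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:10) +
  ylab("Test accuracy")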

Created on 2020-01-24 by Rmarkdown (v1.11) and R version 3.5.1 (2018-07-02).
