# Packages are loaded individually below (rather than via `library(tidyverse)`)
options(warnPartialMatchArgs = FALSE) # don't want these warnings
library(magrittr) # pipes
library(tibble) # tibbles
library(dplyr) # data wrangling
library(purrr) # iteration
library(ggplot2) # tidy plotting
library(broom) # summarize model objects uniformly
library(yardstick) # classification performance measures
library(class) # knn()
library(MASS) # lda() and qda()
library(ISLR) # Smarket data set
class(Smarket) # currently data frame
[1] "data.frame"
Smarket %<>% as_tibble() # convert to `tibble` object (`as.tibble()` is deprecated)
class(Smarket) # tibbles are `tidyverse` friendly
[1] "tbl_df" "tbl" "data.frame"
names(Smarket) # List features/covariates available
[1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
[7] "Volume" "Today" "Direction"
dim(Smarket) # dimensions of data
[1] 1250 9
Smarket # echo Smarket (top 10 rows)
summary(Smarket) # S3 summary method for class `data.frame`
Year Lag1 Lag2 Lag3
Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
Lag4 Lag5 Volume Today
Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
Direction
Down:602
Up :648
pairs(Smarket) # pairs plot of 9-choose-2 combinations
# cor(Smarket) # Error; non-numerics in 'Direction'
class(Smarket$Direction) # Factor
[1] "factor"
cor(dplyr::select(Smarket, -Direction)) # remove 'Direction' and retry
Year Lag1 Lag2 Lag3 Lag4 Lag5
Year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718 0.029787995
Lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911 -0.005674606
Lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533 -0.003557949
Lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036 -0.018808338
Lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000 -0.027083641
Lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641 1.000000000
Volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246 -0.022002315
Today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527 -0.034860083
Volume Today
Year 0.53900647 0.030095229
Lag1 0.04090991 -0.026155045
Lag2 -0.04338321 -0.010250033
Lag3 -0.04182369 -0.002447647
Lag4 -0.04841425 -0.006899527
Lag5 -0.02200231 -0.034860083
Volume 1.00000000 0.014591823
Today 0.01459182 1.000000000
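The only sizeable off-diagonal entry is the Year-Volume correlation (~0.54), which motivates plotting Volume over time below. A quick sketch to locate the largest pairwise correlation programmatically (`cors` is a name introduced here):
cors <- cor(dplyr::select(Smarket, -Direction))
diag(cors) <- 0                                    # zero out the trivial self-correlations
which(abs(cors) == max(abs(cors)), arr.ind = TRUE) # the Year/Volume pair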
Smarket %>%
  ggplot(aes(x = seq_along(Volume), y = Volume)) + # plot `Volume` over time
geom_point(alpha = 0.5) + # points
xlab("Time") +
geom_smooth() # smooth fit
glm_fit <- stats::glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 +
Lag5 + Volume, data = Smarket, family = "binomial")
class(glm_fit)
[1] "glm" "lm"
summary(glm_fit) # Statistical summary
Call:
stats::glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 +
Lag5 + Volume, family = "binomial", data = Smarket)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.446 -1.203 1.065 1.145 1.326
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.126000 0.240736 -0.523 0.601
Lag1 -0.073074 0.050167 -1.457 0.145
Lag2 -0.042301 0.050086 -0.845 0.398
Lag3 0.011085 0.049939 0.222 0.824
Lag4 0.009359 0.049974 0.187 0.851
Lag5 0.010313 0.049511 0.208 0.835
Volume 0.135441 0.158360 0.855 0.392
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1731.2 on 1249 degrees of freedom
Residual deviance: 1727.6 on 1243 degrees of freedom
AIC: 1741.6
Number of Fisher Scoring iterations: 3
coef(glm_fit) # Coefficients of the fit
(Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
-0.126000257 -0.073073746 -0.042301344 0.011085108 0.009358938 0.010313068
Volume
0.135440659
sum_tbl <- broom::tidy(glm_fit) # use `broom` to summarize
sum_tbl
sum_tbl %>% dplyr::pull(p.value) # `pull` vector of p-values for each covariate
[1] 0.6006983 0.1452272 0.3983491 0.8243333 0.8514445 0.8349974 0.3924004
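None of these p-values clears a conventional 0.05 threshold (Lag1 comes closest at ~0.145); a quick sketch confirming this from the tidied table:
sum_tbl %>%
  dplyr::arrange(p.value) %>%   # smallest p-value (Lag1) first
  dplyr::filter(p.value < 0.05) # zero rows survive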
# Make predictions (training only)
glm_probs <- predict(glm_fit, type = "response") # S3 predict method
class(glm_probs)
[1] "numeric"
glm_probs %>% head(10)
1 2 3 4 5 6 7 8
0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 0.5092292
9 10
0.5176135 0.4888378
Smarket$Direction %>% head(10)
[1] Up Up Down Up Up Up Down Up Up Up
Levels: Down Up
contrasts(Smarket$Direction)
Up
Down 0
Up 1
glm_pred <- ifelse(glm_probs > 0.5, "Up", "Down") # predict @ 0.5 cutoff
# Confusion matrix
cmat <- table(truth = Smarket$Direction, pred = glm_pred)
addmargins(cmat)
pred
truth Down Up Sum
Down 145 457 602
Up 141 507 648
Sum 286 964 1250
# Accuracy (training)
sum(diag(cmat)) / sum(cmat)
[1] 0.5216
# Alternative method (sanity check)
# note: the mean of a boolean vector is the proportion of TRUE
# b/c TRUE = 1; FALSE = 0
mean(glm_pred == Smarket$Direction) # proportion predicted correctly
[1] 0.5216
There are various (arbitrary!) ways to present prediction performance summaries, none more correct than any other. For consistency, this course will use the default construction below, where the truth standard is at left and the predictions run along the top. Negative predictions (i.e. controls) occur first, followed by positive predictions. With this construction:

- accuracy is the diagonal sum divided by the total sum
- specificity is the proportion of correct negative predictions in the first row
- sensitivity is the proportion of correct positive predictions in the second row

        Prediction
Truth   neg   pos
  neg    TN    FP
  pos    FN    TP

There are other predictive measures (e.g. negative predictive value (NPV) and positive predictive value (PPV)) that become more important depending on the problem. For a complete guide, see confusion matrix.
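Since `yardstick` was loaded above precisely for these measures, the same numbers can be computed without hand-rolled arithmetic. A minimal sketch on the training predictions (caveat: `yardstick` treats the first factor level, here Down, as the event by default, so `sens_vec()`/`spec_vec()` below are oriented to Down rather than Up):
truth    <- Smarket$Direction
estimate <- factor(glm_pred, levels = levels(truth)) # align factor levels
yardstick::accuracy_vec(truth, estimate) # same accuracy as above
yardstick::sens_vec(truth, estimate)     # sensitivity for Down (the first level)
yardstick::spec_vec(truth, estimate)     # specificity for Down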
# Training on data prior to 2005
train <- dplyr::filter(Smarket, Year < 2005)
# Test on data post-2004
test <- dplyr::filter(Smarket, Year >= 2005)
# Sanity checks
nrow(train)
[1] 998
nrow(test)
[1] 252
nrow(train) + nrow(test) == nrow(Smarket) # sanity check
[1] TRUE
head(test$Direction)
[1] Down Down Down Up Down Up
Levels: Down Up
# Fit LR model to training set
# and immediately make predictions via `%>%`
glm_probs <- stats::glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = train, family = "binomial") %>% # fit LR model
predict(newdata = test, type = "response") # predict on test set
# Recast predictions
cutoff <- 0.5
glm_pred <- ifelse(glm_probs > cutoff, "Up", "Down")
# Confusion matrix
cmat <- table(truth = test$Direction, pred = glm_pred)
addmargins(cmat)
pred
truth Down Up Sum
Down 77 34 111
Up 97 44 141
Sum 174 78 252
# Accuracy
acc <- sum(diag(cmat)) / sum(cmat)
acc
[1] 0.4801587
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
Up
0.3120567
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
Down
0.6936937
# Sanity checks
mean(glm_pred == test$Direction) == acc # recall proportion trick!
[1] TRUE
mean(glm_pred != test$Direction) == 1 - acc # machine precision error!
[1] FALSE
# Try `dplyr::near()`; safer than `==` when floating point errors can bite
dplyr::near(mean(glm_pred != test$Direction), 1 - acc)
[1] TRUE
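The same gotcha bites outside modeling code; the classic illustration:
0.1 + 0.2 == 0.3            # FALSE; neither side is stored exactly in binary floating point
dplyr::near(0.1 + 0.2, 0.3) # TRUE; equal within a small tolerance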
# Fit using only 2 predictors; Lag1 and Lag2
glm_fit2 <- stats::glm(Direction ~ Lag1 + Lag2,
data = train,
family = "binomial")
glm_probs <- predict(glm_fit2, newdata = test, type = "response")
cutoff <- 0.5
glm_pred <- ifelse(glm_probs > cutoff, "Up", "Down")
# Confusion matrix
cmat <- table(truth = test$Direction, pred = glm_pred)
addmargins(cmat)
pred
truth Down Up Sum
Down 35 76 111
Up 35 106 141
Sum 70 182 252
# Accuracy
acc <- sum(diag(cmat)) / sum(cmat)
acc
[1] 0.5595238
# Sanity check
mean(glm_pred == test$Direction) == acc
[1] TRUE
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
Up
0.751773
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
Down
0.3153153
# Predictions at specific values of Lag1 and Lag2 are also possible
predict(glm_fit2,
newdata = data.frame(Lag1 = c(1.2, 1.5),
Lag2 = c(1.1, -0.8)),
type = "response")
1 2
0.4791462 0.4960939
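These values can be hand-checked via the inverse logit: the predicted probability is plogis() applied to the linear predictor. A sketch for the first new observation:
b <- coef(glm_fit2) # (Intercept), Lag1, Lag2
plogis(b["(Intercept)"] + b["Lag1"] * 1.2 + b["Lag2"] * 1.1) # ~0.479, as above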
lda_fit <- MASS::lda(Direction ~ Lag1 + Lag2, data = train)
class(lda_fit)
[1] "lda"
lda_fit
Call:
lda(Direction ~ Lag1 + Lag2, data = train)
Prior probabilities of groups:
Down Up
0.491984 0.508016
Group means:
Lag1 Lag2
Down 0.04279022 0.03389409
Up -0.03954635 -0.03132544
Coefficients of linear discriminants:
LD1
Lag1 -0.6420190
Lag2 -0.5135293
# plot histogram/density/both for the `training`(!) set; split by class labels
plot(lda_fit, type = "hist", col = "grey") # S3 plot method
# Predict on the `test` set
# S3 predict method for class `lda` (above) returns 3 elements:
# class = the class prediction for each observation (prob. >= 0.5 cutoff)
# x = the linear discriminants
# posterior = the posterior probabilities by class for each observation
lda_pred <- predict(lda_fit, newdata = test)
head(lda_pred$x, 20) # scores (linear discriminants) for each class
LD1
1 0.08293096
2 0.59114102
3 1.16723063
4 0.83335022
5 -0.03792892
6 -0.08743142
7 -0.14512719
8 0.21701324
9 0.05873792
10 0.35068642
11 0.05897298
12 -0.92794134
13 0.11370190
14 0.98783874
15 0.81206862
16 0.55681363
17 -0.07452314
18 -0.51514029
19 -0.27386231
20 0.15458312
range(lda_pred$x) # min/max score
[1] -1.309852 1.587994
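These scores are the centered predictors multiplied by the LD1 coefficients shown above. A sketch reproducing the first test score by hand (assuming, as in MASS's predict method, centering at the prior-weighted centroid):
ctr <- colSums(lda_fit$prior * lda_fit$means) # prior-weighted Lag1/Lag2 centroid
x1  <- as.matrix(test[1, c("Lag1", "Lag2")])  # first test observation
drop((x1 - ctr) %*% lda_fit$scaling)          # should match lda_pred$x[1] (~0.083)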
Sidenote: To convince yourself that the plot() method above plots the linear discriminants of the training set by default, run the code below and compare:
lda_pred_easy <- predict(lda_fit, newdata = train) # `predict()` on training set
lda_pred_easy %>%
purrr::pluck("x") %>% # pull out 'x'; linear discriminants
hist(prob = TRUE, col = "grey", # plot 'x' in histogram
main = "", xlab = "Up/Down Combined")
head(lda_pred$posterior, 20) # posterior probabilities for each class
Down Up
1 0.4901792 0.5098208
2 0.4792185 0.5207815
3 0.4668185 0.5331815
4 0.4740011 0.5259989
5 0.4927877 0.5072123
6 0.4938562 0.5061438
7 0.4951016 0.5048984
8 0.4872861 0.5127139
9 0.4907013 0.5092987
10 0.4844026 0.5155974
11 0.4906963 0.5093037
12 0.5119988 0.4880012
13 0.4895152 0.5104848
14 0.4706761 0.5293239
15 0.4744593 0.5255407
16 0.4799583 0.5200417
17 0.4935775 0.5064225
18 0.5030894 0.4969106
19 0.4978806 0.5021194
20 0.4886331 0.5113669
head(lda_pred$class, 20) # predicted class
[1] Up Up Up Up Up Up Up Up Up Up Up Down Up Up Up Up
[17] Up Down Up Up
Levels: Down Up
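As the comment block above notes, class is derived from posterior: it is simply the column with the larger posterior probability (equivalently, a 0.5 cutoff in the two-class case). A one-liner to confirm (expect TRUE):
all(lda_pred$class == colnames(lda_pred$posterior)[max.col(lda_pred$posterior)])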
lda_class <- lda_pred$class # predicted class
# Confusion matrix
cmat <- table(truth = test$Direction, pred = lda_class)
addmargins(cmat)
pred
truth Down Up Sum
Down 35 76 111
Up 35 106 141
Sum 70 182 252
# Accuracy
acc <- sum(diag(cmat)) / sum(cmat)
acc
[1] 0.5595238
# Sanity check
mean(lda_class == test$Direction) == acc
[1] TRUE
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
Up
0.751773
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
Down
0.3153153
# Specify decision criteria
sum(lda_pred$posterior[, "Down"] >= 0.5) # compare to addmargins() above
[1] 70
sum(lda_pred$posterior[, "Down"] < 0.5) # compare to addmargins() above
[1] 182
sum(lda_pred$posterior[, "Down"] > 0.9) # none are predicted at 0.9 or above
[1] 0
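The posterior also supports decision rules other than the default 0.5 cutoff. A sketch of a slightly stricter rule, predicting Down only when its posterior reaches 0.52 (`lda_pred_strict` is a name introduced here):
lda_pred_strict <- ifelse(lda_pred$posterior[, "Down"] >= 0.52, "Down", "Up")
table(truth = test$Direction, pred = lda_pred_strict) # fewer Down calls than above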
qda_fit <- MASS::qda(Direction ~ Lag1 + Lag2, data = train)
qda_fit
Call:
qda(Direction ~ Lag1 + Lag2, data = train)
Prior probabilities of groups:
Down Up
0.491984 0.508016
Group means:
Lag1 Lag2
Down 0.04279022 0.03389409
Up -0.03954635 -0.03132544
qda_class <- predict(qda_fit, newdata = test)$class
# Confusion matrix
cmat <- table(truth = test$Direction, pred = qda_class)
addmargins(cmat)
pred
truth Down Up Sum
Down 30 81 111
Up 20 121 141
Sum 50 202 252
# Accuracy
acc <- sum(diag(cmat)) / sum(cmat)
acc
[1] 0.5992063
# Sanity check
mean(qda_class == test$Direction) == acc
[1] TRUE
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
Up
0.858156
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
Down
0.2702703
training_classes <- train$Direction # KNN models pass true classes separately
knn_train <- dplyr::select(train, Lag1, Lag2) # and remove `Direction`
test_classes <- test$Direction
knn_test <- dplyr::select(test, Lag1, Lag2)
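KNN is distance-based, so predictors on very different scales can dominate the distance calculation. Lag1 and Lag2 already share a scale, so no rescaling is done below, but standardizing is cheap insurance in general; a sketch (not used downstream; the test set is rescaled with the training statistics to avoid leakage):
knn_train_std <- scale(knn_train) # center and scale each column
knn_test_std  <- scale(knn_test,
                       center = attr(knn_train_std, "scaled:center"),
                       scale  = attr(knn_train_std, "scaled:scale"))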
set.seed(1)
# Smallest neighborhood, K = 1
knn_pred_class <- class::knn(train = knn_train,
test = knn_test,
cl = training_classes, k = 1)
# Confusion matrix
cmat <- table(truth = test_classes, pred = knn_pred_class)
addmargins(cmat)
pred
truth Down Up Sum
Down 43 68 111
Up 58 83 141
Sum 101 151 252
# Accuracy: Really bad for K = 1!
acc <- sum(diag(cmat)) / sum(cmat) # a coin flip!
acc
[1] 0.5
# Sanity check
mean(knn_pred_class == test_classes) == acc
[1] TRUE
# Sensitivity (predicting Up)
sens <- prop.table(cmat["Up", ])[2]
sens
Up
0.5886525
# Specificity (predicting Down)
spec <- prop.table(cmat["Down", ])[1]
spec
Down
0.3873874
Iterate over K = 1, 2, ..., 10 using purrr::map_df().
purrr::map_df(1:10, function(.x) {
cmat <- class::knn(knn_train, knn_test, training_classes, k = .x) %>%
table(truth = test_classes, pred = .)
tibble::tibble(K = .x,
acc = sum(diag(cmat)) / sum(cmat), # accuracy
sens = prop.table(cmat["Up", ])[2], # sensitivity
spec = prop.table(cmat["Down", ])[1]) # specificity
})
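Storing that tibble makes the sweep easy to visualize; a sketch (`knn_sweep` is a name introduced here):
knn_sweep <- purrr::map_df(1:10, function(.x) {
  cmat <- class::knn(knn_train, knn_test, training_classes, k = .x) %>%
    table(truth = test_classes, pred = .)
  tibble::tibble(K = .x, acc = sum(diag(cmat)) / sum(cmat))
})
ggplot(knn_sweep, aes(x = K, y = acc)) +
  geom_line() +  # accuracy trace over K
  geom_point() +
  ylab("Test accuracy")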
Created on 2020-01-24 by Rmarkdown (v1.11) and R version 3.5.1 (2018-07-02).