require(ISLR); require(tidyverse); require(ggthemes);
## Loading required package: ISLR
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: ggthemes
require(GGally); require(knitr); require(broom);
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Loading required package: knitr
## Loading required package: broom
require(kableExtra)
## Loading required package: kableExtra
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
set.seed(1)
theme_set(theme_tufte(base_size = 14))
data('Weekly')
require(ISLR)
require(tidyverse)
require(ggthemes)
require(GGally)
require(knitr)
require(broom)
require(kableExtra)
require(MASS)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
require(class)
## Loading required package: class
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
plot(Today~Lag1, col="darkred", data=Weekly)
simplelm = lm(Today~Lag1, data=Weekly)
abline(simplelm, lwd= 3, col= "yellow")

pairs(Weekly)

t.test(Lag1 ~ Direction, data = Weekly)
##
## Welch Two Sample t-test
##
## data: Lag1 by Direction
## t = 1.6563, df = 1047.9, p-value = 0.09795
## alternative hypothesis: true difference in means between group Down and group Up is not equal to 0
## 95 percent confidence interval:
## -0.04378476 0.51794261
## sample estimates:
## mean in group Down mean in group Up
## 0.28229545 0.04521653
logmod = glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,family = "binomial", data=Weekly)
summary(logmod)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
probs = predict(logmod, type="response")
preds = rep("Down", 1089)
preds[probs > 0.5] = "Up"
table(preds, Weekly$Direction)
##
## preds Down Up
## Down 54 48
## Up 430 557
hist(probs, breaks= 100, col= "red")
abline(v = mean(probs), lwd = 2)

plot(probs, col= ifelse(Weekly$Direction=="Down", "blue","orange"), pch=16)
abline(h = 0.5, lwd= 3)

training.data = Weekly[Weekly$Year<2009,]
test.data = Weekly[Weekly$Year>2008,]
simpglm = glm(Direction~Lag2, data= training.data, family = "binomial")
summary(simpglm)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = training.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
testprobs = predict(simpglm, type="response", newdata = test.data)
testdirs = Weekly$Direction[Weekly$Year>2008]
plot(testprobs, col= ifelse(Weekly$Direction[Weekly$Year>2008]=="Down", "red","green"), pch=16)
abline(h = 0.5, lwd= 3)

testpreds = rep("Down", 104)
testpreds[testprobs>0.5] = "Up"
mean(probs)
## [1] 0.5555556
table(testpreds, testdirs)
## testdirs
## testpreds Down Up
## Down 9 5
## Up 34 56
lda.fit = lda(Direction~Lag2, data= training.data)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = training.data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
plot(lda.fit)

lda.pred = predict(lda.fit, newdata=test.data, type="response")
lda.class = lda.pred$class
table(lda.class, test.data$Direction)
##
## lda.class Down Up
## Down 9 5
## Up 34 56
qda.fit = qda(Direction~Lag2, data= training.data)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = training.data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.pred = predict(qda.fit, newdata=test.data, type="response")
qda.class = qda.pred$class
table(qda.class, test.data$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
set.seed(1)
train.X = cbind(training.data$Lag2)
test.X = cbind(test.data$Lag2)
train.Y = cbind(training.data$Direction)
knn.pred = knn(train.X, test.X, train.Y, k=1)
table(knn.pred, test.data$Direction)
##
## knn.pred Down Up
## 1 21 30
## 2 22 31
knn3.pred = knn(train.X, test.X, train.Y, k=3)
table(knn3.pred, test.data$Direction)
##
## knn3.pred Down Up
## 1 16 19
## 2 27 42
qda.fit2 = qda(Direction~Lag1 + Lag2 + Lag4, data= training.data)
qda.fit2
## Call:
## qda(Direction ~ Lag1 + Lag2 + Lag4, data = training.data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2 Lag4
## Down 0.289444444 -0.03568254 0.15925624
## Up -0.009213235 0.26036581 0.09220956
qda.pred2 = predict(qda.fit2, newdata=test.data, type="response")
qda.class2 = qda.pred2$class
table(qda.class2, test.data$Direction)
##
## qda.class2 Down Up
## Down 9 20
## Up 34 41
lda.fit2 = lda(Direction~Lag1 + Lag2 + Lag4, data= training.data)
lda.fit2
## Call:
## lda(Direction ~ Lag1 + Lag2 + Lag4, data = training.data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2 Lag4
## Down 0.289444444 -0.03568254 0.15925624
## Up -0.009213235 0.26036581 0.09220956
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.2984478
## Lag2 0.2960224
## Lag4 -0.1113485
lda.pred2 = predict(lda.fit2, newdata=test.data, type="response")
lda.class2 = lda.pred2$class
table(lda.class2, test.data$Direction)
##
## lda.class2 Down Up
## Down 9 7
## Up 34 54