require(ISLR); require(tidyverse); require(ggthemes);
## Loading required package: ISLR
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: ggthemes
require(GGally); require(knitr); require(broom);
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Loading required package: knitr
## Loading required package: broom
require(kableExtra)
## Loading required package: kableExtra
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
set.seed(1)
theme_set(theme_tufte(base_size = 14))
data('Weekly')
require(ISLR)
require(tidyverse)
require(ggthemes)
require(GGally)
require(knitr)
require(broom)
require(kableExtra)
require(MASS)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
require(class)
## Loading required package: class
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
plot(Today~Lag1, col="darkred", data=Weekly)
simplelm = lm(Today~Lag1, data=Weekly)
abline(simplelm, lwd= 3, col= "yellow")

pairs(Weekly)

t.test(Lag1 ~ Direction, data = Weekly)
## 
##  Welch Two Sample t-test
## 
## data:  Lag1 by Direction
## t = 1.6563, df = 1047.9, p-value = 0.09795
## alternative hypothesis: true difference in means between group Down and group Up is not equal to 0
## 95 percent confidence interval:
##  -0.04378476  0.51794261
## sample estimates:
## mean in group Down   mean in group Up 
##         0.28229545         0.04521653
logmod = glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,family = "binomial", data=Weekly)
summary(logmod)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = "binomial", data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
probs = predict(logmod, type="response")
preds = rep("Down", 1089)
preds[probs > 0.5] = "Up"
table(preds, Weekly$Direction)
##       
## preds  Down  Up
##   Down   54  48
##   Up    430 557
hist(probs, breaks= 100, col= "red")
abline(v = mean(probs), lwd = 2)

plot(probs, col= ifelse(Weekly$Direction=="Down", "blue","orange"), pch=16)
abline(h = 0.5, lwd= 3)

training.data = Weekly[Weekly$Year<2009,]
test.data = Weekly[Weekly$Year>2008,]
simpglm = glm(Direction~Lag2, data= training.data, family = "binomial")
summary(simpglm)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = training.data)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.536  -1.264   1.021   1.091   1.368  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
testprobs = predict(simpglm, type="response", newdata = test.data)
testdirs = Weekly$Direction[Weekly$Year>2008]
plot(testprobs, col= ifelse(Weekly$Direction[Weekly$Year>2008]=="Down", "red","green"), pch=16)
abline(h = 0.5, lwd= 3)

testpreds = rep("Down", 104)
testpreds[testprobs>0.5] = "Up"
mean(probs)
## [1] 0.5555556
table(testpreds, testdirs)
##          testdirs
## testpreds Down Up
##      Down    9  5
##      Up     34 56
lda.fit = lda(Direction~Lag2, data= training.data)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = training.data)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162
plot(lda.fit)

lda.pred = predict(lda.fit, newdata=test.data, type="response")
lda.class = lda.pred$class
table(lda.class, test.data$Direction)
##          
## lda.class Down Up
##      Down    9  5
##      Up     34 56
qda.fit = qda(Direction~Lag2, data= training.data)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = training.data)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
qda.pred = predict(qda.fit, newdata=test.data, type="response")
qda.class = qda.pred$class
table(qda.class, test.data$Direction)
##          
## qda.class Down Up
##      Down    0  0
##      Up     43 61
set.seed(1)
train.X = cbind(training.data$Lag2)
test.X = cbind(test.data$Lag2)
train.Y = cbind(training.data$Direction)
knn.pred = knn(train.X, test.X, train.Y, k=1)
table(knn.pred, test.data$Direction)
##         
## knn.pred Down Up
##        1   21 30
##        2   22 31
knn3.pred = knn(train.X, test.X, train.Y, k=3)
table(knn3.pred, test.data$Direction)
##          
## knn3.pred Down Up
##         1   16 19
##         2   27 42
qda.fit2 = qda(Direction~Lag1 + Lag2 + Lag4, data= training.data)
qda.fit2
## Call:
## qda(Direction ~ Lag1 + Lag2 + Lag4, data = training.data)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##              Lag1        Lag2       Lag4
## Down  0.289444444 -0.03568254 0.15925624
## Up   -0.009213235  0.26036581 0.09220956
qda.pred2 = predict(qda.fit2, newdata=test.data, type="response")
qda.class2 = qda.pred2$class
table(qda.class2, test.data$Direction)
##           
## qda.class2 Down Up
##       Down    9 20
##       Up     34 41
lda.fit2 = lda(Direction~Lag1 + Lag2 + Lag4, data= training.data)
lda.fit2
## Call:
## lda(Direction ~ Lag1 + Lag2 + Lag4, data = training.data)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##              Lag1        Lag2       Lag4
## Down  0.289444444 -0.03568254 0.15925624
## Up   -0.009213235  0.26036581 0.09220956
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.2984478
## Lag2  0.2960224
## Lag4 -0.1113485
lda.pred2 = predict(lda.fit2, newdata=test.data, type="response")
lda.class2 = lda.pred2$class
table(lda.class2, test.data$Direction)
##           
## lda.class2 Down Up
##       Down    9  7
##       Up     34 54