ISLR ch. 4: Lab

data("Smarket")
glimpse(Smarket)
## Observations: 1,250
## Variables: 9
## $ Year      <dbl> 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, …
## $ Lag1      <dbl> 0.381, 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, -0.4…
## $ Lag2      <dbl> -0.192, 0.381, 0.959, 1.032, -0.623, 0.614, 0.213, 1.3…
## $ Lag3      <dbl> -2.624, -0.192, 0.381, 0.959, 1.032, -0.623, 0.614, 0.…
## $ Lag4      <dbl> -1.055, -2.624, -0.192, 0.381, 0.959, 1.032, -0.623, 0…
## $ Lag5      <dbl> 5.010, -1.055, -2.624, -0.192, 0.381, 0.959, 1.032, -0…
## $ Volume    <dbl> 1.1913, 1.2965, 1.4112, 1.2760, 1.2057, 1.3491, 1.4450…
## $ Today     <dbl> 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, -0.403, 0.0…
## $ Direction <fct> Up, Up, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down…
set.seed(2)
train <- sample(nrow(Smarket), nrow(Smarket)/2)
Smarket_train <- Smarket[train,]
Smarket_test <- Smarket[-train,]

glm.fit <- glm(Direction ~ Lag1+Lag2, data=Smarket_train, family = "binomial") 

summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Smarket_train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.359  -1.197   1.068   1.153   1.287  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.06024    0.08012   0.752    0.452
## Lag1         0.00466    0.07249   0.064    0.949
## Lag2        -0.08106    0.07065  -1.147    0.251
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 865.86  on 624  degrees of freedom
## Residual deviance: 864.53  on 622  degrees of freedom
## AIC: 870.53
## 
## Number of Fisher Scoring iterations: 3
glm.probs <- predict(glm.fit, type = "response")
glm.pred <- rep("Down", 625)
glm.pred[glm.probs > 0.5 ] = "Up"

table(glm.pred, Smarket_train$Direction)
##         
## glm.pred Down  Up
##     Down   76  56
##     Up    227 266
1-mean(glm.pred==Smarket_train$Direction) # Training error rate
## [1] 0.4528
contrasts(Smarket_train$Direction) # Show coding of Up/ Down as 1/0
##      Up
## Down  0
## Up    1

Fit model on test set

glm.probs <- predict(glm.fit, Smarket_test, type = "response")
glm.pred <- rep("Down", 625)
glm.pred[glm.probs > 0.5 ] = "Up"

table(glm.pred, Smarket_test$Direction)
##         
## glm.pred Down  Up
##     Down   68  60
##     Up    231 266
t.er.rate <- 1-mean(glm.pred==Smarket_test$Direction) # Test error rate
t.er.rate
## [1] 0.4656

Test error rate is 0.47, close to random guessing!

Lab: Linear Discriminant Analysis

library(MASS)

lda.fit = lda(Direction~Lag1+Lag2, data=Smarket_train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket_train)
## 
## Prior probabilities of groups:
##   Down     Up 
## 0.4848 0.5152 
## 
## Group means:
##              Lag1        Lag2
## Down -0.007838284  0.04465677
## Up   -0.006968944 -0.06000932
## 
## Coefficients of linear discriminants:
##              LD1
## Lag1  0.05047901
## Lag2 -0.87782155

Applied Exercises

10a)

data("Weekly")

train <- sample(nrow(Weekly), nrow(Weekly)/2)

glm.fit <- glm(Direction ~  Lag1 + Lag2 + Lag3 + Lag4 + Lag5, 
               data = Weekly, subset = train, family = "binomial")

summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, family = "binomial", 
##     data = Weekly, subset = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.385  -1.218   1.038   1.131   1.450  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.089e-01  8.733e-02   1.247    0.213
## Lag1        -5.025e-02  3.605e-02  -1.394    0.163
## Lag2         1.627e-02  3.540e-02   0.460    0.646
## Lag3         1.723e-02  3.681e-02   0.468    0.640
## Lag4         1.994e-02  3.581e-02   0.557    0.578
## Lag5         6.413e-05  3.678e-02   0.002    0.999
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 752.70  on 543  degrees of freedom
## Residual deviance: 749.44  on 538  degrees of freedom
## AIC: 761.44
## 
## Number of Fisher Scoring iterations: 4
glm.probs <- predict(glm.fit, Weekly[-train,], type = "response")
head(glm.probs, 20)
##         1         2         4         9        12        14        16 
## 0.5052108 0.5210653 0.4755437 0.5299227 0.5443445 0.4671608 0.5615190 
##        17        21        22        23        25        26        29 
## 0.5200592 0.5086513 0.5256930 0.4894065 0.5602394 0.5539951 0.5563850 
##        31        33        34        35        38        40 
## 0.5054715 0.5592364 0.5339589 0.4803530 0.5679099 0.5389703
glm.pred <- rep("Down", 544)
glm.pred[glm.probs > 0.5 ] = "Up"
glm.pred <- factor(glm.pred)

test.er.rate <- 1-mean(glm.pred==Weekly[-train,]$Direction)
test.er.rate
## [1] 0.4366972
glm.fit <- glm(Direction ~  Lag1 + Lag2, 
               data = Weekly, subset = train, family = "binomial")

summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly, 
##     subset = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.397  -1.215   1.041   1.126   1.391  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.11512    0.08676   1.327    0.185
## Lag1        -0.05294    0.03548  -1.492    0.136
## Lag2         0.01737    0.03503   0.496    0.620
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 752.7  on 543  degrees of freedom
## Residual deviance: 749.9  on 541  degrees of freedom
## AIC: 755.9
## 
## Number of Fisher Scoring iterations: 3
glm.probs <- predict(glm.fit, Weekly[-train,], type = "response")

glm.pred <- rep("Down", 544)
glm.pred[glm.probs > 0.5 ] = "Up"
glm.pred <- factor(glm.pred)

test.er.rate <- 1-mean(glm.pred==Weekly[-train,]$Direction)
test.er.rate
## [1] 0.4220183

Confusion matrix

cont.table <- table(factor(glm.pred), Weekly[-train,]$Direction)
cont.table
##       
##        Down  Up
##   Down   32  36
##   Up    194 283

10d)

Predict years 2009 and 2010 using model fit with binomial for data before year 2009

train <- Weekly %>% 
  filter(Year > 2008)
test <- Weekly %>% filter(Year < 2009)

glm.fit <- glm(Direction ~  Lag2, 
               data = train, family = "binomial")

summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5767  -1.3052   0.9242   1.0419   1.2978  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.32377    0.20136   1.608    0.108
## Lag2         0.08562    0.06707   1.277    0.202
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 141.04  on 103  degrees of freedom
## Residual deviance: 139.37  on 102  degrees of freedom
## AIC: 143.37
## 
## Number of Fisher Scoring iterations: 4
glm.probs <- predict(glm.fit, test, type = "response")

glm.pred <- rep("Down", nrow(test))
glm.pred[glm.probs > 0.5 ] = "Up"


test.er.rate.log <- 1-mean(glm.pred==test$Direction)
test.er.rate.log
## [1] 0.4497462
cont.table <- table(factor(glm.pred), test$Direction)
cont.table
##       
##        Down  Up
##   Down   18  20
##   Up    423 524

10 e)

Using LDA

train <- Weekly %>% 
  filter(Year >= 2008)
test <- Weekly %>% filter(Year < 2008)

lda.fit <- lda(Direction ~  Lag2, 
               data = train)

lda.fit
## Call:
## lda(Direction ~ Lag2, data = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4615385 0.5384615 
## 
## Group means:
##            Lag2
## Down -0.6911528
## Up    0.5100714
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.2734846
lda.pred <- predict(lda.fit, test)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class <- lda.pred$class

table(lda.class, test$Direction)
##          
## lda.class Down  Up
##      Down   64  69
##      Up    348 452
test.er.rate.lda <- 1-mean(lda.class==test$Direction)
test.er.rate.lda
## [1] 0.4469453

QDA

train <- Weekly %>% 
  filter(Year > 2008)
test <- Weekly %>% filter(Year < 2009)

qda.fit <- qda(Direction ~ Lag2, 
               data = train)

qda.fit
## Call:
## qda(Direction ~ Lag2, data = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4134615 0.5865385 
## 
## Group means:
##             Lag2
## Down -0.08904651
## Up    0.69591803
qda.pred <- predict(qda.fit, test)
names(qda.pred)
## [1] "class"     "posterior"
qda.class <- qda.pred$class

table(qda.class, test$Direction)
##          
## qda.class Down  Up
##      Down   21  20
##      Up    420 524
test.er.rate.qda <- 1-mean(qda.class==test$Direction)
test.er.rate.qda
## [1] 0.4467005

Test error rate is 0.4467005, meaning 0.5532995 are correctly predicted :)

KNN

Four inputs needed in one step.

  • Matrix containing the predictors associated with the training data train.X
  • Matrix containing the predictors associated with the test data test.X
  • A vector containing the class labels for the training observations, labeled train.Direction
  • A value of K
attach(Weekly)
train = (Year <= 2008)
Weekly.0910 = Weekly[!train, ]

library(class)

train.X <- cbind(Weekly[train,3])
test.X <- cbind(Weekly[!train,3])

train.Direction <- Weekly[train,9]
test.Direction <- Weekly[!train,9]

set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, test.Direction)
##         test.Direction
## knn.pred Down Up
##     Down   21 30
##     Up     22 31
test.error <- mean(knn.pred==test.Direction)
test.error
## [1] 0.5
detach(Weekly)

Test error rate 0.5 with \(K=1\)

11a)

data("Auto")
glimpse(Auto)
## Observations: 392
## Variables: 9
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15,…
## $ cylinders    <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 3…
## $ horsepower   <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 1…
## $ weight       <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 442…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0,…
## $ year         <dbl> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,…
## $ origin       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, …
## $ name         <fct> chevrolet chevelle malibu, buick skylark 320, plymo…
Auto <- Auto %>%
  mutate(mpg01 = factor(ifelse(mpg > median(mpg), 1, 0)),
         origin = factor(origin))

Auto %>%
  ggplot(aes(mpg01, cylinders)) + # Useful
  geom_jitter()

Auto %>%
  ggplot(aes(mpg01, displacement)) + # Useful
  geom_boxplot()

Auto %>%
  ggplot(aes(mpg01, horsepower)) + # Useful
  geom_boxplot()

Auto %>%
  ggplot(aes(mpg01, weight)) + # Useful
  geom_boxplot()

Auto %>%
  ggplot(aes(mpg01, acceleration)) + # Maybe useful 
  geom_boxplot()

Auto %>%
  ggplot(aes(mpg01, year)) + # Useful
  geom_boxplot()

Auto %>%
  ggplot(aes(mpg01, origin)) + # Useful
  geom_jitter()

set.seed(1)
train <- sample(nrow(Auto), nrow(Auto)*0.6)

A_train <- Auto[train,]
A_test <- Auto[-train,]