data("Smarket")
glimpse(Smarket)
## Observations: 1,250
## Variables: 9
## $ Year <dbl> 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, …
## $ Lag1 <dbl> 0.381, 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, -0.4…
## $ Lag2 <dbl> -0.192, 0.381, 0.959, 1.032, -0.623, 0.614, 0.213, 1.3…
## $ Lag3 <dbl> -2.624, -0.192, 0.381, 0.959, 1.032, -0.623, 0.614, 0.…
## $ Lag4 <dbl> -1.055, -2.624, -0.192, 0.381, 0.959, 1.032, -0.623, 0…
## $ Lag5 <dbl> 5.010, -1.055, -2.624, -0.192, 0.381, 0.959, 1.032, -0…
## $ Volume <dbl> 1.1913, 1.2965, 1.4112, 1.2760, 1.2057, 1.3491, 1.4450…
## $ Today <dbl> 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, -0.403, 0.0…
## $ Direction <fct> Up, Up, Down, Up, Up, Up, Down, Up, Up, Up, Down, Down…
set.seed(2)
train <- sample(nrow(Smarket), nrow(Smarket)/2)
Smarket_train <- Smarket[train,]
Smarket_test <- Smarket[-train,]
glm.fit <- glm(Direction ~ Lag1+Lag2, data=Smarket_train, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Smarket_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.359 -1.197 1.068 1.153 1.287
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.06024 0.08012 0.752 0.452
## Lag1 0.00466 0.07249 0.064 0.949
## Lag2 -0.08106 0.07065 -1.147 0.251
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 865.86 on 624 degrees of freedom
## Residual deviance: 864.53 on 622 degrees of freedom
## AIC: 870.53
##
## Number of Fisher Scoring iterations: 3
glm.probs <- predict(glm.fit, type = "response")
glm.pred <- rep("Down", 625)
glm.pred[glm.probs > 0.5 ] = "Up"
table(glm.pred, Smarket_train$Direction)
##
## glm.pred Down Up
## Down 76 56
## Up 227 266
1-mean(glm.pred==Smarket_train$Direction) # Training error rate
## [1] 0.4528
contrasts(Smarket_train$Direction) # Show coding of Up/ Down as 1/0
## Up
## Down 0
## Up 1
Fit model on test set
glm.probs <- predict(glm.fit, Smarket_test, type = "response")
glm.pred <- rep("Down", 625)
glm.pred[glm.probs > 0.5 ] = "Up"
table(glm.pred, Smarket_test$Direction)
##
## glm.pred Down Up
## Down 68 60
## Up 231 266
t.er.rate <- 1-mean(glm.pred==Smarket_test$Direction) # Test error rate
t.er.rate
## [1] 0.4656
Test error rate is 0.47, close to random guessing!
library(MASS)
lda.fit = lda(Direction~Lag1+Lag2, data=Smarket_train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket_train)
##
## Prior probabilities of groups:
## Down Up
## 0.4848 0.5152
##
## Group means:
## Lag1 Lag2
## Down -0.007838284 0.04465677
## Up -0.006968944 -0.06000932
##
## Coefficients of linear discriminants:
## LD1
## Lag1 0.05047901
## Lag2 -0.87782155
data("Weekly")
train <- sample(nrow(Weekly), nrow(Weekly)/2)
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5,
data = Weekly, subset = train, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, family = "binomial",
## data = Weekly, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.385 -1.218 1.038 1.131 1.450
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.089e-01 8.733e-02 1.247 0.213
## Lag1 -5.025e-02 3.605e-02 -1.394 0.163
## Lag2 1.627e-02 3.540e-02 0.460 0.646
## Lag3 1.723e-02 3.681e-02 0.468 0.640
## Lag4 1.994e-02 3.581e-02 0.557 0.578
## Lag5 6.413e-05 3.678e-02 0.002 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 752.70 on 543 degrees of freedom
## Residual deviance: 749.44 on 538 degrees of freedom
## AIC: 761.44
##
## Number of Fisher Scoring iterations: 4
glm.probs <- predict(glm.fit, Weekly[-train,], type = "response")
head(glm.probs, 20)
## 1 2 4 9 12 14 16
## 0.5052108 0.5210653 0.4755437 0.5299227 0.5443445 0.4671608 0.5615190
## 17 21 22 23 25 26 29
## 0.5200592 0.5086513 0.5256930 0.4894065 0.5602394 0.5539951 0.5563850
## 31 33 34 35 38 40
## 0.5054715 0.5592364 0.5339589 0.4803530 0.5679099 0.5389703
glm.pred <- rep("Down", 544)
glm.pred[glm.probs > 0.5 ] = "Up"
glm.pred <- factor(glm.pred)
test.er.rate <- 1-mean(glm.pred==Weekly[-train,]$Direction)
test.er.rate
## [1] 0.4366972
glm.fit <- glm(Direction ~ Lag1 + Lag2,
data = Weekly, subset = train, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.397 -1.215 1.041 1.126 1.391
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.11512 0.08676 1.327 0.185
## Lag1 -0.05294 0.03548 -1.492 0.136
## Lag2 0.01737 0.03503 0.496 0.620
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 752.7 on 543 degrees of freedom
## Residual deviance: 749.9 on 541 degrees of freedom
## AIC: 755.9
##
## Number of Fisher Scoring iterations: 3
glm.probs <- predict(glm.fit, Weekly[-train,], type = "response")
glm.pred <- rep("Down", 544)
glm.pred[glm.probs > 0.5 ] = "Up"
glm.pred <- factor(glm.pred)
test.er.rate <- 1-mean(glm.pred==Weekly[-train,]$Direction)
test.er.rate
## [1] 0.4220183
Confusion matrix
cont.table <- table(factor(glm.pred), Weekly[-train,]$Direction)
cont.table
##
## Down Up
## Down 32 36
## Up 194 283
Predict years 2009 and 2010 using model fit with binomial for data before year 2009
train <- Weekly %>%
filter(Year > 2008)
test <- Weekly %>% filter(Year < 2009)
glm.fit <- glm(Direction ~ Lag2,
data = train, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5767 -1.3052 0.9242 1.0419 1.2978
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.32377 0.20136 1.608 0.108
## Lag2 0.08562 0.06707 1.277 0.202
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 141.04 on 103 degrees of freedom
## Residual deviance: 139.37 on 102 degrees of freedom
## AIC: 143.37
##
## Number of Fisher Scoring iterations: 4
glm.probs <- predict(glm.fit, test, type = "response")
glm.pred <- rep("Down", nrow(test))
glm.pred[glm.probs > 0.5 ] = "Up"
test.er.rate.log <- 1-mean(glm.pred==test$Direction)
test.er.rate.log
## [1] 0.4497462
cont.table <- table(factor(glm.pred), test$Direction)
cont.table
##
## Down Up
## Down 18 20
## Up 423 524
Using LDA
train <- Weekly %>%
filter(Year >= 2008)
test <- Weekly %>% filter(Year < 2008)
lda.fit <- lda(Direction ~ Lag2,
data = train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4615385 0.5384615
##
## Group means:
## Lag2
## Down -0.6911528
## Up 0.5100714
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.2734846
lda.pred <- predict(lda.fit, test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, test$Direction)
##
## lda.class Down Up
## Down 64 69
## Up 348 452
test.er.rate.lda <- 1-mean(lda.class==test$Direction)
test.er.rate.lda
## [1] 0.4469453
train <- Weekly %>%
filter(Year > 2008)
test <- Weekly %>% filter(Year < 2009)
qda.fit <- qda(Direction ~ Lag2,
data = train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4134615 0.5865385
##
## Group means:
## Lag2
## Down -0.08904651
## Up 0.69591803
qda.pred <- predict(qda.fit, test)
names(qda.pred)
## [1] "class" "posterior"
qda.class <- qda.pred$class
table(qda.class, test$Direction)
##
## qda.class Down Up
## Down 21 20
## Up 420 524
test.er.rate.qda <- 1-mean(qda.class==test$Direction)
test.er.rate.qda
## [1] 0.4467005
Test error rate is 0.4467005, meaning 0.5532995 are correctly predicted :)
Four inputs needed in one step.
attach(Weekly)
train = (Year <= 2008)
Weekly.0910 = Weekly[!train, ]
library(class)
train.X <- cbind(Weekly[train,3])
test.X <- cbind(Weekly[!train,3])
train.Direction <- Weekly[train,9]
test.Direction <- Weekly[!train,9]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, test.Direction)
## test.Direction
## knn.pred Down Up
## Down 21 30
## Up 22 31
test.error <- mean(knn.pred==test.Direction)
test.error
## [1] 0.5
detach(Weekly)
Test error rate 0.5 with \(K=1\)
data("Auto")
glimpse(Auto)
## Observations: 392
## Variables: 9
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15,…
## $ cylinders <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 3…
## $ horsepower <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 1…
## $ weight <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 442…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0,…
## $ year <dbl> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,…
## $ origin <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, …
## $ name <fct> chevrolet chevelle malibu, buick skylark 320, plymo…
Auto <- Auto %>%
mutate(mpg01 = factor(ifelse(mpg > median(mpg), 1, 0)),
origin = factor(origin))
Auto %>%
ggplot(aes(mpg01, cylinders)) + # Useful
geom_jitter()
Auto %>%
ggplot(aes(mpg01, displacement)) + # Useful
geom_boxplot()
Auto %>%
ggplot(aes(mpg01, horsepower)) + # Useful
geom_boxplot()
Auto %>%
ggplot(aes(mpg01, weight)) + # Useful
geom_boxplot()
Auto %>%
ggplot(aes(mpg01, acceleration)) + # Maybe useful
geom_boxplot()
Auto %>%
ggplot(aes(mpg01, year)) + # Useful
geom_boxplot()
Auto %>%
ggplot(aes(mpg01, origin)) + # Useful
geom_jitter()
set.seed(1)
train <- sample(nrow(Auto), nrow(Auto)*0.6)
A_train <- Auto[train,]
A_test <- Auto[-train,]