library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(MASS)
library(class)
## Warning: package 'class' was built under R version 4.0.3
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: ggplot2
Part A.
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
Year and Volume seem to have a positive relationship.
Part B.
m12 = glm(Direction ~ ., data = Weekly[, -c(1,8)], family = binomial)
summary(m12)
##
## Call:
## glm(formula = Direction ~ ., family = binomial, data = Weekly[,
## -c(1, 8)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag 2 is statistically significant with a p-value of 0.03.
Part C.
m12.probs = predict(m12, type = "response")
m12.pred = rep("Down", length(m12.probs))
m12.pred[m12.probs > 0.5] = "Up"
table(m12.pred, Weekly$Direction)
##
## m12.pred Down Up
## Down 54 48
## Up 430 557
(54+557)/(54+557+48+430)
## [1] 0.5610652
557/(557+48)
## [1] 0.9206612
54/(430+54)
## [1] 0.1115702
56% of the predictions are correct. When the market goes up the model is right 92.07% of the time. When the market goes down the model is right 11.16% of the time.
Part D.
train <- (Weekly$Year < 2009)
Weekly0910 <- Weekly[!train, ]
Direction0901 <-Weekly$Direction[!train]
m13 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
summary(m13)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
m13probs <- predict(m13, Weekly0910, type = "response")
m13preds <- rep("Down", length(m13probs))
m13preds[m13probs> 0.5] <- "Up"
table(m13preds, Direction0901)
## Direction0901
## m13preds Down Up
## Down 9 5
## Up 34 56
(9+56)/(9+5+34+56)
## [1] 0.625
56/(56+5)
## [1] 0.9180328
9/(9+34)
## [1] 0.2093023
The model predicts 62.5% of the predictions correctly. When the market goes up, the model correctly predicts this 91.80% of the time. When the market goes down, the model is correct 20.93% of the time.
Part E
mlda = lda(Direction ~ Lag2, data = Weekly, subset = train)
mlda
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
pred.lda <- predict(mlda, Weekly0910)
table(pred.lda$class, Direction0901)
## Direction0901
## Down Up
## Down 9 5
## Up 34 56
(9+56)/(9+5+34+56)
## [1] 0.625
56/(56+5)
## [1] 0.9180328
9/(9+34)
## [1] 0.2093023
The model predicts 62.5% of the predictions correctly. When the market goes up, the model correctly predicts this 91.80% of the time. When the market goes down, the model is correct 20.93% of the time.
Part F.
mqda = qda(Direction ~ Lag2, data = Weekly, subset = train)
mqda
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
pred.qda <- predict(mqda, Weekly0910)
table(pred.qda$class, Direction0901)
## Direction0901
## Down Up
## Down 0 0
## Up 43 61
(0+61)/(0+0+43+61)
## [1] 0.5865385
61/(61 +0)
## [1] 1
0/(0+43)
## [1] 0
The model predicts 58.65% of the predictions correctly. When the market goes up, the model correctly predicts this 100% of the time. When the market goes down, the model is correct 0% of the time.
Part G.
train.X <- as.matrix(Weekly$Lag2[train])
test.X <- as.matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
set.seed(1)
pred.knn <- knn(train.X, test.X, train.Direction, k = 1)
table(pred.knn, Direction0901)
## Direction0901
## pred.knn Down Up
## Down 21 30
## Up 22 31
(21+31)/(21+30+22+31)
## [1] 0.5
31/(31+30)
## [1] 0.5081967
21/(21+22)
## [1] 0.4883721
The model predicts 50% of the predictions correctly. When the market goes up, the model correctly predicts this 50.82% of the time. When the market goes down, the model is correct 48.84% of the time.
Part H.
The logistic model and the LDA model have a higher accuracy. QDA performs better than KNN.
Part I
glm.fit = glm(Direction ~ Lag2:Lag1, data = Weekly, family = binomial, subset = train)
glm.probs = predict(glm.fit, Weekly0910, type = "response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Up"
Direction.0910 = Weekly$Direction[!train]
table(glm.pred, Direction.0910)
## Direction.0910
## glm.pred Down Up
## Down 1 1
## Up 42 60
(1+60)/(1+1+42+60)
## [1] 0.5865385
lda.fit2 = lda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
lda.pred2 = predict(lda.fit2, Weekly0910)
table(lda.pred2$class, Direction0901)
## Direction0901
## Down Up
## Down 0 1
## Up 43 60
(0+60)/(0+1+43+60)
## [1] 0.5769231
qda.fit = qda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
qda.class = predict(qda.fit, Weekly0910)$class
table(qda.class, Direction.0910)
## Direction.0910
## qda.class Down Up
## Down 16 32
## Up 27 29
(16+29)/(16+32+27+29)
## [1] 0.4326923
knn.pred = knn(train.X, test.X, train.Direction, k = 10)
table(knn.pred, Direction.0910)
## Direction.0910
## knn.pred Down Up
## Down 17 18
## Up 26 43
(17+43)/(18+17+26+43)
## [1] 0.5769231
The logistic model with the interaction of Lag1 and Lag2 predict 58.65% of the predictions correctly. The LDA model with the interaction between Lag1 and Lag2 predicts 57.69% of the predictions correctly. The QDA model with the interaction between Lag1 and Lag2 predict 43.27% of the predictions correctly. The KNN with k=10 predicts 57.69% of the predictions correctly.
Part A.
mpg01 = rep(0, length(Auto$mpg))
mpg01[Auto$mpg > median(Auto$mpg)] = 1
Auto2 = data.frame(Auto, mpg01)
pairs(Auto2)
boxplot(cylinders ~ mpg01, data = Auto2, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto2, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto2, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto2, main = "Weight vs mpg01")
boxplot(acceleration ~ mpg01, data = Auto2, main = "Acceleration vs mpg01")
There is a possible relationship between mpg01 and cylinders, weight, displacement, and horsepower.
Part C.
train = (Auto2$year%%2 == 0)
test = !train
Auto.train = Auto2[train, ]
Auto.test = Auto2[test, ]
mpg01.test = mpg01[test]
Part D.
lda.fit = lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto2,
subset = train)
lda.pred = predict(lda.fit, Auto.test)
mean(lda.pred$class != mpg01.test)
## [1] 0.1263736
There is a 12.64% test error rate.
Part E.
qda.fit = qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto2,
subset = train)
qda.pred = predict(qda.fit, Auto.test)
mean(qda.pred$class != mpg01.test)
## [1] 0.1318681
There is a 13.19% test error rate.
Part F.
glm.fit = glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto2,
family = binomial, subset = train)
glm.probs = predict(glm.fit, Auto.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
mean(glm.pred != mpg01.test)
## [1] 0.1208791
There is a 12.09% test error rate.
Part G.
train.X = cbind(Auto$cylinders, Auto$weight, Auto$displacement, Auto$horsepower)[train, ]
test.X = cbind(Auto$cylinders, Auto$weight, Auto$displacement, Auto$horsepower)[test, ]
train.mpg01 = mpg01[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
mean(knn.pred != mpg01.test)
## [1] 0.1538462
knn.pred = knn(train.X, test.X, train.mpg01, k = 10)
mean(knn.pred != mpg01.test)
## [1] 0.1648352
knn.pred = knn(train.X, test.X, train.mpg01, k = 7)
mean(knn.pred != mpg01.test)
## [1] 0.1483516
k=1 there is a 15.38% test error rate, k=10 there is a 16.48%, k=7 there is a 14.84% test error rate. 7 k had the lowest test error rate so it performed better than the others.
data("Boston")
attach(Boston)
crime01 = rep(0, length(crim))
crime01[crim > median(crim)] = 1
Boston = data.frame(Boston, crime01)
train = 1:(dim(Boston)[1]/2)
test = (dim(Boston)[1]/2 + 1):dim(Boston)[1]
Boston.train = Boston[train, ]
Boston.test = Boston[test, ]
crime01.test = crime01[test]
glm.fit = glm(crime01 ~ . - crime01 - crim, data = Boston, family = binomial,
subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs = predict(glm.fit, Boston.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
mean(glm.pred != crime01.test)
## [1] 0.1818182
Logistic regression has an error rate of 18.18%.
lda.fit = lda(crime01 ~ . - crime01 - crim - chas - tax, data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
mean(lda.pred$class != crime01.test)
## [1] 0.1225296
This LDA model has a 12.25% error rate.
train.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black,
lstat, medv)[train, ]
test.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black,
lstat, medv)[test, ]
train.crime01 = crime01[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.crime01, k = 1)
mean(knn.pred != crime01.test)
## [1] 0.458498
This KNN model where k = 1 has a 45.85 test error rate.
knn.pred = knn(train.X, test.X, train.crime01, k = 5)
mean(knn.pred != crime01.test)
## [1] 0.1699605
This KNN model where k = 5 has a 16.99% test error rate.
knn.pred = knn(train.X, test.X, train.crime01, k = 7)
mean(knn.pred != crime01.test)
## [1] 0.1146245
This KNN model where k = 7 has a 11.46% test error rate.