This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from beginning of 1900 to the end of 2010. (a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.5
data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
pairs(Weekly)
plot(Weekly$Year, Weekly$Volume)
* There is no correlation between any of the variables, including Today with any of the Lag times. There does seem to be an increase in volume over time.
glm.fit.weekly <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data = Weekly, family=binomial)
summary(glm.fit.weekly)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
glm.probs = predict(glm.fit.weekly, type="response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs>0.5]="Up"
table(glm.pred, Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
#Accuracy rate
mean(glm.pred == Weekly$Direction)
## [1] 0.5610652
(54+557)/(54+48+430+557)
## [1] 0.5610652
#Error rate
mean(glm.pred != Weekly$Direction)
## [1] 0.4389348
#Positive predictive value
(557)/(557+430)
## [1] 0.5643364
#Negative predictive value
54/(54+48)
## [1] 0.5294118
#True positive rate
557/(557+48)
## [1] 0.9206612
#False positive rate
430/(430+54)
## [1] 0.8884298
train = (Weekly$Year < 2009)
Weekly.2009=Weekly[!train,]
dim(Weekly.2009)
## [1] 104 9
Direction.2009=Weekly$Direction[!train]
glm.fits = glm(Direction~Lag2, data = Weekly, family=binomial, subset=train)
glm.probs = predict(glm.fits, Weekly.2009, type="response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs>0.5] = "Up"
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 9 5
## Up 34 56
#Accurate predictions
(9+56)/(9+5+34+56)
## [1] 0.625
library(MASS)
lda.fit = lda(Direction~Lag2, data=Weekly, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
plot(lda.fit)
lda.pred = predict(lda.fit, Weekly.2009)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class = lda.pred$class
table(lda.class, Direction.2009)
## Direction.2009
## lda.class Down Up
## Down 9 5
## Up 34 56
#Accuracy rate
(9+56)/(9+5+34+56)
## [1] 0.625
#Error rate
mean(lda.fit == Direction.2009)
## Warning in `==.default`(lda.fit, Direction.2009): longer object length is not a
## multiple of shorter object length
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## [1] 0
qda.fit = qda(Direction~Lag2, data = Weekly, subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class = predict(qda.fit, Weekly.2009)$class
table(qda.class, Direction.2009)
## Direction.2009
## qda.class Down Up
## Down 0 0
## Up 43 61
#Accuracy rate
(61)/(61+43)
## [1] 0.5865385
#Error rate
mean(qda.class != Direction.2009)
## [1] 0.4134615
library(class)
train.X = as.matrix(Weekly$Lag2[train])
test.X = as.matrix(Weekly$Lag2[!train])
train.Direction = Weekly$Direction[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 21 30
## Up 22 31
#Accuracy rate
(21+31)/(21+30+22+31)
## [1] 0.5
#Logistic regression with Lag2, volume & interaction of lag2 and volume
glm.fits = glm(Direction~Lag2+Volume+Volume*Lag2, data = Weekly, family=binomial, subset=train)
glm.probs = predict(glm.fits, Weekly.2009, type="response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs>0.5] = "Up"
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 20 25
## Up 23 36
#Error rate
mean(glm.pred != Direction.2009)
## [1] 0.4615385
#Accuracy rate
mean(glm.pred == Direction.2009)
## [1] 0.5384615
#LDA with Lag2, volume & interaction of lag2 and volume
lda.fit = lda(Direction~Lag2+Volume+Volume*Lag2, data=Weekly, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag2 + Volume + Volume * Lag2, data = Weekly,
## subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 Volume Lag2:Volume
## Down -0.03568254 1.266966 -0.70124208
## Up 0.26036581 1.156529 0.06257301
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.343334406
## Volume -0.369489705
## Lag2:Volume 0.007130365
plot(lda.fit)
lda.pred = predict(lda.fit, Weekly.2009)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class = lda.pred$class
table(lda.class, Direction.2009)
## Direction.2009
## lda.class Down Up
## Down 20 25
## Up 23 36
#Error rate
mean(lda.class != Direction.2009)
## [1] 0.4615385
#Accuracy rate
mean(lda.class == Direction.2009)
## [1] 0.5384615
#QDA with Lag2, volume & interaction of lag2 and volume
qda.fit = qda(Direction~Lag2+Volume+Volume*Lag2, data = Weekly, subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag2 + Volume + Volume * Lag2, data = Weekly,
## subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 Volume Lag2:Volume
## Down -0.03568254 1.266966 -0.70124208
## Up 0.26036581 1.156529 0.06257301
qda.class = predict(qda.fit, Weekly.2009)$class
table(qda.class, Direction.2009)
## Direction.2009
## qda.class Down Up
## Down 37 49
## Up 6 12
#Error rate
mean(qda.class != Direction.2009)
## [1] 0.5288462
#Accuracy rate
mean(qda.class == Direction.2009)
## [1] 0.4711538
#KNN with k=10
train.X = as.matrix(Weekly$Lag2[train])
test.X = as.matrix(Weekly$Lag2[!train])
train.Direction = Weekly$Direction[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 10)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 17 21
## Up 26 40
#Error rate
mean(knn.pred != Direction.2009)
## [1] 0.4519231
#Accuracy rate
mean(knn.pred == Direction.2009)
## [1] 0.5480769
#KNN with k=100
train.X = as.matrix(Weekly$Lag2[train])
test.X = as.matrix(Weekly$Lag2[!train])
train.Direction = Weekly$Direction[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 100)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 10 11
## Up 33 50
#Error rate
mean(knn.pred != Direction.2009)
## [1] 0.4230769
#Accuracy rate
mean(knn.pred == Direction.2009)
## [1] 0.5769231
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
auto <- data(Auto)
mpg01 = rep(0, length(Auto$mpg))
mpg01[Auto$mpg>median(Auto$mpg)] = 1
auto2 = data.frame(Auto, mpg01)
cor(auto2[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
pairs(auto2)
par(mfrow = c(2, 2))
boxplot(auto2$mpg01, auto2$cylinders)
boxplot(auto2$mpg01, auto2$displacement)
boxplot(auto2$mpg01, auto2$weight)
boxplot(auto2$mpg01, auto2$horsepower)
* Looking at the correlation matrix, we can see that there seems to be an negative correlation with mpg01, cylinders, displacement, weight, and a little so with horsepower.
* Of course the strongest correlation is with the mpg variable.
train <- (auto2$year %% 2 == 0)
auto.train <- auto2[train,]
auto.test <- auto2[!train,]
mpg01.test <- mpg01[!train]
lda.fit = lda(mpg01~cylinders+weight+horsepower+displacement, data=auto2, subset=train)
lda.fit
## Call:
## lda(mpg01 ~ cylinders + weight + horsepower + displacement, data = auto2,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.4571429 0.5428571
##
## Group means:
## cylinders weight horsepower displacement
## 0 6.812500 3604.823 133.14583 271.7396
## 1 4.070175 2314.763 77.92105 111.6623
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.6741402638
## weight -0.0011465750
## horsepower 0.0059035377
## displacement 0.0004481325
plot(lda.fit)
lda.pred = predict(lda.fit, auto.test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class = lda.pred$class
table(lda.class, mpg01.test)
## mpg01.test
## lda.class 0 1
## 0 86 9
## 1 14 73
#Error rate
mean(lda.pred$class != mpg01.test)
## [1] 0.1263736
#Accuracy rate
mean(lda.pred$class == mpg01.test)
## [1] 0.8736264
qda.fit = qda(mpg01~cylinders+weight+horsepower+displacement, data=auto2, subset=train)
qda.fit
## Call:
## qda(mpg01 ~ cylinders + weight + horsepower + displacement, data = auto2,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.4571429 0.5428571
##
## Group means:
## cylinders weight horsepower displacement
## 0 6.812500 3604.823 133.14583 271.7396
## 1 4.070175 2314.763 77.92105 111.6623
qda.class = predict(qda.fit, auto.test)$class
table(qda.class, mpg01.test)
## mpg01.test
## qda.class 0 1
## 0 89 13
## 1 11 69
#Accuracy rate
mean(qda.class == mpg01.test)
## [1] 0.8681319
#Error rate
mean(qda.class != mpg01.test)
## [1] 0.1318681
glm.fits = glm(mpg01~cylinders+weight+horsepower+displacement, family=binomial, data=auto2, subset=train)
summary(glm.fits)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + horsepower + displacement,
## family = binomial, data = auto2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.48027 -0.03413 0.10583 0.29634 2.57584
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 17.658730 3.409012 5.180 2.22e-07 ***
## cylinders -1.028032 0.653607 -1.573 0.1158
## weight -0.002922 0.001137 -2.569 0.0102 *
## horsepower -0.050611 0.025209 -2.008 0.0447 *
## displacement 0.002462 0.015030 0.164 0.8699
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.58 on 209 degrees of freedom
## Residual deviance: 83.24 on 205 degrees of freedom
## AIC: 93.24
##
## Number of Fisher Scoring iterations: 7
glm.probs = predict(glm.fits, auto.test, type="response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs>0.5] = 1
table(glm.pred, mpg01.test)
## mpg01.test
## glm.pred 0 1
## 0 89 11
## 1 11 71
# Accuracy rate
mean(glm.pred == mpg01.test)
## [1] 0.8791209
# Error rate
mean(glm.pred != mpg01.test)
## [1] 0.1208791
train.X = cbind(auto2$cylinders, auto2$weight, auto2$displacement, auto2$horsepower)[train, ]
test.X = cbind(auto2$cylinders, auto2$weight, auto2$displacement, auto2$horsepower)[!train, ]
train.mpg01 = auto2$mpg01[train]
set.seed(1)
#For K=1
knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 83 11
## 1 17 71
#Error rate
mean(knn.pred != mpg01.test)
## [1] 0.1538462
#For K=10
knn.pred = knn(train.X, test.X, train.mpg01, k = 10)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 77 7
## 1 23 75
#Error rate
mean(knn.pred != mpg01.test)
## [1] 0.1648352
#For K=100
knn.pred = knn(train.X, test.X, train.mpg01, k = 100)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 81 7
## 1 19 75
#Error rate
mean(knn.pred != mpg01.test)
## [1] 0.1428571
#For K=1000
knn.pred = knn(train.X, test.X, train.mpg01, k = 200)
table(knn.pred, mpg01.test)
## mpg01.test
## knn.pred 0 1
## 0 0 0
## 1 100 82
#Error rate
mean(knn.pred != mpg01.test)
## [1] 0.5494505
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
data(Boston)
crime = rep(0, length(Boston$crim))
crime[Boston$crim > median(Boston$crim)] = 1
boston = data.frame(Boston, crime)
train = 1:(length(boston$crim)/2)
test = (length(boston$crim)/2 + 1):length(boston$crim)
Boston.train = boston[train,]
Boston.test = boston[test,]
crime.test = crime[test]
#logistic regression
glm.fit.boston = glm(crime~. -crim -crime, data = boston, family = binomial, subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs = predict(glm.fit.boston, Boston.test, type="response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, crime.test)
## crime.test
## glm.pred 0 1
## 0 68 24
## 1 22 139
mean(glm.pred != crime.test)
## [1] 0.1818182
summary(glm.fit.boston)
##
## Call:
## glm(formula = crime ~ . - crim - crime, family = binomial, data = boston,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.83229 -0.06593 0.00000 0.06181 2.61513
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -91.319906 19.490273 -4.685 2.79e-06 ***
## zn -0.815573 0.193373 -4.218 2.47e-05 ***
## indus 0.354172 0.173862 2.037 0.04164 *
## chas 0.167396 0.991922 0.169 0.86599
## nox 93.706326 21.202008 4.420 9.88e-06 ***
## rm -4.719108 1.788765 -2.638 0.00833 **
## age 0.048634 0.024199 2.010 0.04446 *
## dis 4.301493 0.979996 4.389 1.14e-05 ***
## rad 3.039983 0.719592 4.225 2.39e-05 ***
## tax -0.006546 0.007855 -0.833 0.40461
## ptratio 1.430877 0.359572 3.979 6.91e-05 ***
## black -0.017552 0.006734 -2.606 0.00915 **
## lstat 0.190439 0.086722 2.196 0.02809 *
## medv 0.598533 0.185514 3.226 0.00125 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 329.367 on 252 degrees of freedom
## Residual deviance: 69.568 on 239 degrees of freedom
## AIC: 97.568
##
## Number of Fisher Scoring iterations: 10
glm.fit.boston = glm(crime~indus+nox+ptratio, data=boston, family = binomial, subset=train)
glm.probs = predict(glm.fit.boston, Boston.test, type="response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, crime.test)
## crime.test
## glm.pred 0 1
## 0 64 11
## 1 26 152
mean(glm.pred != crime.test)
## [1] 0.1462451
#LDA
lda.fit = lda(crime~. -crime - crim, data = boston, family = binomial, subset=train)
lda.pred = predict(lda.fit, Boston.test)
table(lda.pred$class, crime.test)
## crime.test
## 0 1
## 0 80 24
## 1 10 139
mean(lda.pred$class != crime.test)
## [1] 0.1343874
lda.fit = lda(crime~indus+nox+ptratio, data = boston, family = binomial, subset = train)
lda.pred = predict(lda.fit, Boston.test)
table(lda.pred$class, crime.test)
## crime.test
## 0 1
## 0 75 28
## 1 15 135
mean(lda.pred$class != crime.test)
## [1] 0.1699605
#KNN
train.X = cbind(boston$indus, boston$nox, boston$ptratio)[train,]
test.X = cbind(boston$indus, boston$nox, boston$ptratio)[test,]
train.crime = crime[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.crime, k=1)
table(knn.pred, crime.test)
## crime.test
## knn.pred 0 1
## 0 85 163
## 1 5 0
mean(knn.pred != crime.test)
## [1] 0.6640316
knn.pred = knn(train.X, test.X, train.crime, k=10)
table(knn.pred, crime.test)
## crime.test
## knn.pred 0 1
## 0 74 27
## 1 16 136
mean(knn.pred != crime.test)
## [1] 0.1699605
knn.pred = knn(train.X, test.X, train.crime, k=100)
table(knn.pred, crime.test)
## crime.test
## knn.pred 0 1
## 0 90 163
## 1 0 0
mean(knn.pred != crime.test)
## [1] 0.6442688