library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
Weekly data
set, which is part of the ISLR2 package. This data is
similar in nature to the Smarket data from this chapter’s
lab, except that it contains 1,089 weekly returns for 21 years, from the
beginning of 1990 to the end of 2010.str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
attach(Weekly)
plot(Volume)
Volume is increasing
over time.Direction as the response and the five lag variables plus
Volume as predictors. Use the summary function to print the
results. Do any of the predictors appear to be statistically
significant? If so, which ones?glm.fits=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly,family=binomial)
summary (glm.fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
glm.probs = predict(glm.fits, type = "response")
glm.pred = rep("Down", 1089)
glm.pred[glm.probs > 0.5] = "Up"
table(glm.pred, Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
mean(glm.pred==Weekly$Direction )
## [1] 0.5610652
Lag2 as the only predictor.
Compute the confusion matrix and the overall fraction of correct
predictions for the held out data (that is, the data from 2009 and
2010).train=(Year<2009)
Weekly.2009= Weekly[!train ,]
dim(Weekly.2009)
## [1] 104 9
glm.fit = glm(Direction ~ Lag2, data = Weekly, subset = train, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = Weekly,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
glm.probs = predict(glm.fit, Weekly.2009, type = "response")
glm.pred = rep("Down", 104)
glm.pred[glm.probs > 0.5] = "Up"
table(glm.pred, Weekly.2009$Direction)
##
## glm.pred Down Up
## Down 9 5
## Up 34 56
mean(glm.pred == Weekly.2009$Direction)
## [1] 0.625
Lag2 as the predictor, the model
correctly predicted the market direction for 62.5% of the weeks in the
held-out data (the data from 2009 and 2010).lda.fit = lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
plot(lda.fit)
lda.pred=predict (lda.fit , Weekly.2009)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class
table(lda.class ,Weekly.2009$Direction)
##
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class == Weekly.2009$Direction)
## [1] 0.625
Lag2 as the predictor, we ended up with an identical
confusion matrix to the one from Part (d) with the logistic regression
model. As we saw in Part (d), the model correctly predicted the market
direction for 62.5% of the weeks in the held-out data (the data from
2009 and 2010).qda.fit=qda(Direction~Lag2 ,data=Weekly ,subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class=predict(qda.fit ,Weekly.2009)$class
table(qda.class ,Weekly.2009$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==Weekly.2009$Direction)
## [1] 0.5865385
train.X = cbind(Weekly[train, ]$Lag2)
test.X = cbind(Weekly.2009$Lag2)
train.Direction = Weekly[train, ]$Direction
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Weekly.2009$Direction)
##
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred == Weekly.2009$Direction)
## [1] 0.5
NB.fits <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
NB.preds <- predict(NB.fits, Weekly.2009)
table(NB.preds, Weekly.2009$Direction)
##
## NB.preds Down Up
## Down 0 0
## Up 43 61
mean(NB.preds == Weekly.2009$Direction)
## [1] 0.5865385
lda.fit2 = lda(Direction ~ Lag1+Lag2+Lag4, data = Weekly, subset = train)
lda.fit2
## Call:
## lda(Direction ~ Lag1 + Lag2 + Lag4, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2 Lag4
## Down 0.289444444 -0.03568254 0.15925624
## Up -0.009213235 0.26036581 0.09220956
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.2984478
## Lag2 0.2960224
## Lag4 -0.1113485
plot(lda.fit2)
lda.pred=predict (lda.fit2 , Weekly.2009)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class
table(lda.class ,Weekly.2009$Direction)
##
## lda.class Down Up
## Down 9 7
## Up 34 54
mean(lda.class == Weekly.2009$Direction)
## [1] 0.6057692
qda.fit2=qda(Direction~Lag1+Lag2+Lag4 ,data=Weekly ,subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class=predict(qda.fit2 ,Weekly.2009)$class
table(qda.class ,Weekly.2009$Direction)
##
## qda.class Down Up
## Down 9 20
## Up 34 41
mean(qda.class==Weekly.2009$Direction)
## [1] 0.4807692
Auto data
setmpg01, that contains a 1
if mpg contains a value above its median, and a 0 if
mpg contains a value below its median. You can compute the
median using the median() function. Note you may find it
helpful to use the data.frame() function to create a single
data set containing both mpg01 and the other
Auto variables.mpg01 = rep(0, dim(Auto)[1])
mpg01[Auto$mpg > median(Auto$mpg)] = 1
Auto = data.frame(Auto, mpg01)
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
mpg01?
Scatterplots and boxplots may be useful tools to answer this question.
Describe your findings.par(mfrow = c(3, 2))
plot(Auto$cylinders, Auto$mpg01)
plot(Auto$displacement, Auto$mpg01)
plot(Auto$horsepower, Auto$mpg01)
plot(Auto$weight, Auto$mpg01)
plot(Auto$acceleration, Auto$mpg01)
plot(Auto$year, Auto$mpg01)
set.seed(1)
train = sample(dim(Auto)[1], size = 0.75*dim(Auto)[1])
mpg01 in
(b). What is the test error of the model obtained?lda.fit = lda(mpg01 ~ cylinders + displacement + horsepower + weight + year + origin, data = Auto, subset = train)
lda.pred = predict(lda.fit, Auto[-train, ])
table(lda.pred$class, Auto[-train, "mpg01"], dnn = c("Predicted", "Actual"))
## Actual
## Predicted 0 1
## 0 40 0
## 1 13 45
1 - mean(lda.pred$class == Auto[-train, "mpg01"])
## [1] 0.1326531
mpg01 in
(b). What is the test error of the model obtained?qda.fit = qda(mpg01 ~ cylinders + displacement + horsepower + weight + year + origin, data = Auto, subset = train)
qda.pred = predict(qda.fit, Auto[-train, ])
table(qda.pred$class, Auto[-train, "mpg01"], dnn = c("Predicted", "Actual"))
## Actual
## Predicted 0 1
## 0 46 4
## 1 7 41
1 - mean(qda.pred$class == Auto[-train, "mpg01"])
## [1] 0.1122449
glm.fit = glm(mpg01 ~ cylinders + displacement + horsepower + weight + year + origin, data = Auto, subset = train,
family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ cylinders + displacement + horsepower +
## weight + year + origin, family = "binomial", data = Auto,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.39692 -0.11768 0.03119 0.25456 2.94902
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.706955 5.528219 -3.022 0.002510 **
## cylinders -0.030542 0.476841 -0.064 0.948929
## displacement 0.001708 0.013417 0.127 0.898717
## horsepower -0.037070 0.019066 -1.944 0.051864 .
## weight -0.004289 0.001135 -3.777 0.000159 ***
## year 0.417675 0.085249 4.899 9.61e-07 ***
## origin 0.381162 0.396158 0.962 0.335977
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 407.35 on 293 degrees of freedom
## Residual deviance: 124.92 on 287 degrees of freedom
## AIC: 138.92
##
## Number of Fisher Scoring iterations: 7
glm.probs = predict(glm.fit, Auto[-train, ], type = "response")
glm.pred = rep(0, dim(Auto[-train, ])[1])
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, Auto[-train, "mpg01"], dnn = c("Predicted", "Actual"))
## Actual
## Predicted 0 1
## 0 46 2
## 1 7 43
1 - mean(glm.pred == Auto[-train, "mpg01"])
## [1] 0.09183673
train.X = cbind(Auto[train, ]$displacement, Auto[train, ]$weight, Auto[train, ]$cylinders, Auto[train, ]$year)
test.X = cbind(Auto[-train, ]$displacement, Auto[-train, ]$weight, Auto[-train, ]$cylinders, Auto[-train, ]$year)
train.Y = cbind(Auto[train, ]$mpg01)
knn.pred1 = knn(train.X, test.X, train.Y, k=1)
tab = table(knn.pred1, Auto[-train, ]$mpg01)
1 - mean(knn.pred1 == Auto[-train, "mpg01"])
## [1] 0.1530612
knn.pred2 = knn(train.X, test.X, train.Y, k=5)
tab = table(knn.pred2, Auto[-train, ]$mpg01)
1 - mean(knn.pred2 == Auto[-train, "mpg01"])
## [1] 0.1428571
knn.pred3 = knn(train.X, test.X, train.Y, k=10)
tab = table(knn.pred3, Auto[-train, ]$mpg01)
1 - mean(knn.pred3 == Auto[-train, "mpg01"])
## [1] 0.1530612
knn.pred4 = knn(train.X, test.X, train.Y, k=15)
tab = table(knn.pred4, Auto[-train, ]$mpg01)
1 - mean(knn.pred4 == Auto[-train, "mpg01"])
## [1] 0.1530612
Boston data set, fit classification models in
order to predict whether a given census tract has a crime rate above or
below the median. Explore logistic regression, LDA, naive Bayes, and KNN
models using various subsets of the predictors. Describe your
findings.crim01 <- rep(0, length(Boston$crim))
crim01[Boston$crim > median(Boston$crim)] = 1
Boston <- data.frame(Boston, crim01)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim01
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
set.seed(1)
train_num <- sample(nrow(Boston), size = (nrow(Boston) * .75))
test_num <- -train_num
boston_train <- Boston[train_num, ]
boston_test <- Boston[test_num, ]
crim01_test <- Boston$crim01[test_num]
boston.glm <- glm(crim01 ~ tax + rad + dis + age + nox,
data = Boston)
glm.probs <- predict(boston.glm, boston_test, type = 'response')
glm.preds <- round(glm.probs)
table(glm.preds, crim01_test)
## crim01_test
## glm.preds 0 1
## 0 62 15
## 1 1 49
1-mean(glm.preds == crim01_test)
## [1] 0.1259843
boston.lda <- lda(crim01 ~ tax + rad + dis + age + nox,
data = Boston)
lda.preds <- predict(boston.lda, boston_test)
table(lda.preds$class, crim01_test)
## crim01_test
## 0 1
## 0 62 15
## 1 1 49
1 - mean(lda.preds$class == crim01_test)
## [1] 0.1259843
boston.qda <- qda(crim01 ~ tax + rad + dis + age + nox + nox:dis,
data = Boston)
qda.preds <- predict(boston.qda, boston_test)
table(qda.preds$class, crim01_test)
## crim01_test
## 0 1
## 0 59 10
## 1 4 54
1-mean(qda.preds$class == crim01_test)
## [1] 0.1102362
boston.NB <- naiveBayes(crim01 ~ tax + rad + dis + age + nox,
data = Boston)
NB.preds <- predict(boston.NB, boston_test)
table(NB.preds, crim01_test)
## crim01_test
## NB.preds 0 1
## 0 59 14
## 1 4 50
1-mean(NB.preds == crim01_test)
## [1] 0.1417323