#Chapter 04: 13, 14, 16
This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data from #this chapter’s lab, except that it contains 1, 089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010. | #
# (a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
library(ISLR2)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
glm.fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
glm.probs <- predict(glm.fits, type = "response")
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Up"
table(glm.pred, Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
mean(glm.pred == Weekly$Direction)
## [1] 0.5610652
train <- (Weekly$Year <= 2008)
Weekly.test <- Weekly[!train, ]
Direction.test <- Weekly$Direction[!train]
glm.fits2 <- glm(Direction ~ Lag2, data=Weekly, family = binomial, subset = train)
glm.probs2 <- predict(glm.fits2, Weekly.test, type = "response")
glm.pred2 <- rep("Down", length(glm.probs2))
glm.pred2[glm.probs2 > 0.5] <- "Up"
table(glm.pred2, Direction.test)
## Direction.test
## glm.pred2 Down Up
## Down 9 5
## Up 34 56
mean(glm.pred2 ==Direction.test)
## [1] 0.625
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
lda.fit <- lda(Direction ~ Lag2,data = Weekly, subset = train)
lda.pred <- predict(lda.fit, Weekly.test)
table(lda.pred$class, Direction.test)
## Direction.test
## Down Up
## Down 9 5
## Up 34 56
mean(lda.pred$class ==Direction.test)
## [1] 0.625
qda.fit <- qda (Direction ~ Lag2, data = Weekly, subset = train)
qda.pred <- predict(qda.fit, Weekly.test)
table(qda.pred$class, Direction.test)
## Direction.test
## Down Up
## Down 0 0
## Up 43 61
mean(qda.pred$class == Direction.test)
## [1] 0.5865385
library(class)
train.X <- as.matrix(Weekly$Lag2[train])
test.X <- as.matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.test)
## Direction.test
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred == Direction.test)
## [1] 0.5
library(e1071)
nb.fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.class <- predict(nb.fit, Weekly.test)
table(nb.class, Direction.test)
## Direction.test
## nb.class Down Up
## Down 0 0
## Up 43 61
mean(nb.class == Direction.test)
## [1] 0.5865385
glm.fit.interact <- glm(Direction ~ Lag1 * Lag2, data = Weekly, family = binomial, subset = train)
glm.prob.interact <- predict(glm.fit.interact, Weekly.test, type = "response")
glm.pred.interact <- rep("Down", length(glm.prob.interact))
glm.pred.interact[glm.prob.interact > 0.5] <- "Up"
table(glm.pred.interact, Direction.test)
## Direction.test
## glm.pred.interact Down Up
## Down 7 8
## Up 36 53
mean(glm.pred.interact == Direction.test)
## [1] 0.5769231
lda.fit.multi <- lda(Direction ~ Lag1 + Lag2, data = Weekly, subset = train)
lda.pred.multi <- predict(lda.fit.multi, Weekly.test)
table(lda.pred.multi$class, Direction.test)
## Direction.test
## Down Up
## Down 7 8
## Up 36 53
mean(lda.pred.multi$class == Direction.test)
## [1] 0.5769231
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
library(ISLR2)
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
pairs(Auto[, -9])
boxplot(cylinders ~ mpg01, data=Auto, main="Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data=Auto, main="Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main="Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto, main="Weight vs mpg01")
pairs(Auto[, -9])
boxplot(cylinders ~ mpg01, data = Auto, main="Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto, main="Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main="Horsepower vs mpg01")
boxplot(weight ~ mpg01, data=Auto, main="Weight vs mpg01")
set.seed(42)
train_indices <- sample(1:nrow(Auto), 0.8 * nrow(Auto))
train_set <- Auto[train_indices, ]
test_set <- Auto[-train_indices, ]
mpg01_test <- test_set$mpg01
lda.fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
lda.pred <- predict(lda.fit, test_set)
table(lda.pred$class, mpg01_test)
## mpg01_test
## 0 1
## 0 29 3
## 1 2 45
mean(lda.pred$class !=mpg01_test)
## [1] 0.06329114
qda.fit <- qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
qda.pred <-predict(qda.fit, test_set)
table(qda.pred$class, mpg01_test)
## mpg01_test
## 0 1
## 0 29 4
## 1 2 44
mean(qda.pred$class !=mpg01_test)
## [1] 0.07594937
glm.fit <- glm(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set, family=binomial)
glm.probs <- predict(glm.fit, test_set, type="response")
glm.pred <- ifelse(glm.probs > 0.5, 1,0)
table(glm.pred, mpg01_test)
## mpg01_test
## glm.pred 0 1
## 0 29 3
## 1 2 45
mean(glm.pred !=mpg01_test)
## [1] 0.06329114
library(e1071)
nb.fit <- naiveBayes(mpg01 ~ cylinders + displacement + horsepower + weight, data = train_set)
nb.pred <- predict(nb.fit, test_set)
table(nb.pred, mpg01_test)
## mpg01_test
## nb.pred 0 1
## 0 29 3
## 1 2 45
mean(nb.pred != mpg01_test)
## [1] 0.06329114
train.X <- cbind(train_set$cylinders, train_set$displacement, train_set$horsepower, train_set$weight)
test.X <- cbind(test_set$cylinders, test_set$displacement, test_set$horsepower, test_set$weight)
train.Y <- train_set$mpg01
for (k in c(1, 5, 10, 50)) {
set.seed(42)
knn.pred <- knn(train.X, test.X, train.Y, k=k)
print(paste("K =", k, "Test Error:", mean(knn.pred != mpg01_test)))
}
## [1] "K = 1 Test Error: 0.113924050632911"
## [1] "K = 5 Test Error: 0.113924050632911"
## [1] "K = 10 Test Error: 0.126582278481013"
## [1] "K = 50 Test Error: 0.113924050632911"
Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings.
Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.
crime_median <- median(Boston$crim)
Boston$crim01 <- ifelse(Boston$crim > crime_median, 1, 0)
set.seed(123)
train_idx <- sample(1: nrow(Boston), 0.8 * nrow(Boston))
boston_train <- Boston[train_idx, ]
boston_test <- Boston[-train_idx, ]
crim01_test <- boston_test$crim01
# Logistic Regression
fit.glm <- glm(crim01 ~ indus + nox + age + rad + tax, data=boston_train, family=binomial)
predict.glm <- ifelse(predict(fit.glm, boston_test, type="response") > 0.5, 1,0)
err.glm <- mean(predict.glm !=crim01_test)
#LDA
fit.lda <- lda(crim01 ~ indus + nox + age + rad + tax, data = boston_train)
pred.lda <- predict(fit.lda, boston_test)$class
err.lda <- mean(pred.lda != crim01_test)
#Naive Bayes
fit.nb <- naiveBayes(crim01 ~ indus + nox + age + rad + tax, data = boston_train)
pred.nb <- predict(fit.nb, boston_test)
err.nb <- mean(pred.nb != crim01_test)
#KNN
train.X.b <- as.matrix(boston_train[, c("indus", "nox", "age", "rad", "tax")])
test.X.b <- as.matrix(boston_test[, c("indus", "nox", "age", "rad", "tax")])
pred.knn <- knn(train.X.b, test.X.b, boston_train$crim01, k=10)
err.knn <- mean(pred.knn != crim01_test)
cat("Logistic Error:", err.glm, "\nLA Error:", err.lda, "\nNaive Bayes Error:", err.nb, "\nKNN (K=10) Error:", err.knn)
## Logistic Error: 0.1078431
## LA Error: 0.1764706
## Naive Bayes Error: 0.1960784
## KNN (K=10) Error: 0.1078431