13
install.packages("ISLR2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
# a
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly[, -9]) # Numerical + graphical summaries (excluding Direction)

# b
glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(glm_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
# c
probs <- predict(glm_full, type = "response")
preds <- ifelse(probs > 0.5, "Up", "Down")
table(Predicted = preds, Actual = Weekly$Direction)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
mean(preds == Weekly$Direction) # Accuracy
## [1] 0.5610652
# d
train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008
glm_train <- glm(Direction ~ Lag2, data = Weekly, subset = train, family = binomial)
probs_test <- predict(glm_train, newdata = Weekly[test, ], type = "response")
preds_test <- ifelse(probs_test > 0.5, "Up", "Down")
table(Predicted = preds_test, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
mean(preds_test == Weekly$Direction[test])
## [1] 0.625
# e
lda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_fit, Weekly[test, ])
table(Predicted = lda_pred$class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
mean(lda_pred$class == Weekly$Direction[test])
## [1] 0.625
# f
qda_fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_fit, Weekly[test, ])
table(Predicted = qda_pred$class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(qda_pred$class == Weekly$Direction[test])
## [1] 0.5865385
# g
train_X <- as.matrix(Weekly[train, "Lag2"])
test_X <- as.matrix(Weekly[test, "Lag2"])
train_Dir <- Weekly$Direction[train]
set.seed(1)
knn_pred <- knn(train_X, test_X, train_Dir, k = 1)
table(Predicted = knn_pred, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
mean(knn_pred == Weekly$Direction[test])
## [1] 0.5
# h
nb_fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb_pred <- predict(nb_fit, Weekly[test, ])
table(Predicted = nb_pred, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(nb_pred == Weekly$Direction[test])
## [1] 0.5865385
# i Logistic Regression and LDA have the best performance with a test accuracy of 62.5%.
# j
glm_combo <- glm(Direction ~ Lag1 + Lag2, data = Weekly, subset = train, family = binomial)
probs_combo <- predict(glm_combo, newdata = Weekly[test, ], type = "response")
pred_combo <- ifelse(probs_combo > 0.5, "Up", "Down")
table(Predicted = pred_combo, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
mean(pred_combo == Weekly$Direction[test])
## [1] 0.5769231
14
data("Auto")
Auto <- na.omit(Auto) # remove missing values
# a
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto <- data.frame(Auto, mpg01)
# b
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")

boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")

boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")

boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")

pairs(Auto[, c("mpg", "horsepower", "weight", "acceleration", "displacement")])

# c
set.seed(1)
train_index <- sample(1:nrow(Auto), nrow(Auto) / 2)
train <- Auto[train_index, ]
test <- Auto[-train_index, ]
# d
lda_model <- lda(mpg01 ~ horsepower + weight + acceleration + displacement, data = train)
lda_pred <- predict(lda_model, test)
mean(lda_pred$class != test$mpg01)
## [1] 0.127551
# e
qda_model <- qda(mpg01 ~ horsepower + weight + acceleration + displacement, data = train)
qda_pred <- predict(qda_model, test)
mean(qda_pred$class != test$mpg01)
## [1] 0.127551
# f
glm_model <- glm(mpg01 ~ horsepower + weight + acceleration + displacement,
data = train, family = binomial)
glm_probs <- predict(glm_model, test, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
mean(glm_pred != test$mpg01)
## [1] 0.127551
# g
nb_model <- naiveBayes(as.factor(mpg01) ~ horsepower + weight + acceleration + displacement,
data = train)
nb_pred <- predict(nb_model, test)
mean(nb_pred != test$mpg01)
## [1] 0.127551
# h
train_X <- train[, c("horsepower", "weight", "acceleration", "displacement")]
test_X <- test[, c("horsepower", "weight", "acceleration", "displacement")]
train_Y <- train$mpg01
set.seed(1)
for (k in c(1, 3, 5, 10, 15)) {
knn_pred <- knn(train_X, test_X, train_Y, k = k)
cat("K =", k, " Test Error:", mean(knn_pred != test$mpg01), "\n")
}
## K = 1 Test Error: 0.1785714
## K = 3 Test Error: 0.122449
## K = 5 Test Error: 0.127551
## K = 10 Test Error: 0.1173469
## K = 15 Test Error: 0.127551
16
library(MASS)
data("Boston")
Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston_data <- Boston[, !(names(Boston) %in% "crim")]
set.seed(123)
train_index <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train <- Boston_data[train_index, ]
test <- Boston_data[-train_index, ]
log_fit <- glm(crim01 ~ . - crim01, data = train, family = binomial)
log_probs <- predict(log_fit, newdata = test, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)
log_acc <- mean(log_pred == test$crim01)
log_acc
## [1] 0.8881579
lda_fit <- lda(crim01 ~ . - crim01, data = train)
lda_pred <- predict(lda_fit, test)$class
lda_acc <- mean(lda_pred == test$crim01)
lda_acc
## [1] 0.8355263
library(e1071)
nb_fit <- naiveBayes(crim01 ~ . - crim01, data = train)
nb_pred <- predict(nb_fit, test)
nb_acc <- mean(nb_pred == test$crim01)
nb_acc
## [1] 0.8157895
library(class)
train_X <- scale(train[, -which(names(train) == "crim01")])
test_X <- scale(test[, -which(names(test) == "crim01")])
train_Y <- train$crim01
set.seed(123)
knn_pred <- knn(train_X, test_X, train_Y, k = 5)
knn_acc <- mean(knn_pred == test$crim01)
knn_acc
## [1] 0.9013158