13

install.packages("ISLR2")     
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(e1071)

# a
data("Weekly")
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
pairs(Weekly[, -9])  # scatterplot matrix (graphical summary), excluding the qualitative Direction variable
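# As a numerical complement to summary() and the scatterplot matrix (a sketch;
# column 9 is the qualitative Direction variable, so it is dropped before cor()):
cor(Weekly[, -9])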

# b
glm_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data = Weekly, family = binomial)
summary(glm_full)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
# c
probs <- predict(glm_full, type = "response")
preds <- ifelse(probs > 0.5, "Up", "Down")
table(Predicted = preds, Actual = Weekly$Direction)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
mean(preds == Weekly$Direction)  # Accuracy
## [1] 0.5610652
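# Reading the confusion matrix above (a sketch using the existing preds):
# the model calls almost every week "Up", so it catches most true Up weeks
# (557/605) but correctly identifies very few Down weeks (54/484).
cm <- table(Predicted = preds, Actual = Weekly$Direction)
cm["Up", "Up"] / sum(cm[, "Up"])        # fraction of Up weeks predicted correctly
cm["Down", "Down"] / sum(cm[, "Down"])  # fraction of Down weeks predicted correctly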
# d
train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008

glm_train <- glm(Direction ~ Lag2, data = Weekly, subset = train, family = binomial)
probs_test <- predict(glm_train, newdata = Weekly[test, ], type = "response")
preds_test <- ifelse(probs_test > 0.5, "Up", "Down")
table(Predicted = preds_test, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
mean(preds_test == Weekly$Direction[test])
## [1] 0.625
# e
lda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_fit, Weekly[test, ])
table(Predicted = lda_pred$class, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
mean(lda_pred$class == Weekly$Direction[test])
## [1] 0.625
# f
qda_fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_fit, Weekly[test, ])
table(Predicted = qda_pred$class, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
mean(qda_pred$class == Weekly$Direction[test])
## [1] 0.5865385
# g
train_X <- as.matrix(Weekly[train, "Lag2"])
test_X <- as.matrix(Weekly[test, "Lag2"])
train_Dir <- Weekly$Direction[train]
set.seed(1)
knn_pred <- knn(train_X, test_X, train_Dir, k = 1)
table(Predicted = knn_pred, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down   21 30
##      Up     22 31
mean(knn_pred == Weekly$Direction[test])
## [1] 0.5
# h
nb_fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb_pred <- predict(nb_fit, Weekly[test, ])
table(Predicted = nb_pred, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
mean(nb_pred == Weekly$Direction[test])
## [1] 0.5865385
# i Logistic regression (Lag2 only) and LDA tie for the best performance, each with
#   a test accuracy of 62.5% on the 2009-2010 test data. QDA and naive Bayes reach
#   58.7% but only by predicting Up for every week, and KNN with K = 1 is at 50%.
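# A small sketch collecting the test accuracies computed in parts d-h into one
# named vector for easier comparison (test_dir is a helper introduced here):
test_dir <- Weekly$Direction[test]
c(logistic = mean(preds_test == test_dir),
  lda      = mean(lda_pred$class == test_dir),
  qda      = mean(qda_pred$class == test_dir),
  knn1     = mean(knn_pred == test_dir),
  nb       = mean(nb_pred == test_dir))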

# j
glm_combo <- glm(Direction ~ Lag1 + Lag2, data = Weekly, subset = train, family = binomial)
probs_combo <- predict(glm_combo, newdata = Weekly[test, ], type = "response")
pred_combo <- ifelse(probs_combo > 0.5, "Up", "Down")
table(Predicted = pred_combo, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    7  8
##      Up     36 53
mean(pred_combo == Weekly$Direction[test])
## [1] 0.5769231
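# Further experiments for (j) could also vary K in KNN on the same split
# (a sketch reusing train_X, test_X and train_Dir from part g; results not shown):
set.seed(1)
for (k in c(3, 5, 10)) {
  knn_k <- knn(train_X, test_X, train_Dir, k = k)
  cat("K =", k, " Test Accuracy:", mean(knn_k == Weekly$Direction[test]), "\n")
}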

14

data("Auto")
Auto <- na.omit(Auto)  # remove missing values

# a 
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
Auto <- data.frame(Auto, mpg01)
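# Quick sanity check (sketch): the median split should place roughly half of
# the cars in each class.
table(Auto$mpg01)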

# b
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")

boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")

boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")

boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")

pairs(Auto[, c("mpg", "horsepower", "weight", "acceleration", "displacement")])
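# A numerical complement to the boxplots and scatterplot matrix (a sketch):
# correlations of the candidate predictors with the binary mpg01 variable.
cor(Auto[, c("horsepower", "weight", "acceleration", "displacement")], Auto$mpg01)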

# c
set.seed(1)
train_index <- sample(1:nrow(Auto), nrow(Auto) / 2)
train <- Auto[train_index, ]
test <- Auto[-train_index, ]

# d
lda_model <- lda(mpg01 ~ horsepower + weight + acceleration + displacement, data = train)
lda_pred <- predict(lda_model, test)
mean(lda_pred$class != test$mpg01)
## [1] 0.127551
# e
qda_model <- qda(mpg01 ~ horsepower + weight + acceleration + displacement, data = train)
qda_pred <- predict(qda_model, test)
mean(qda_pred$class != test$mpg01)
## [1] 0.127551
# f
glm_model <- glm(mpg01 ~ horsepower + weight + acceleration + displacement,
                 data = train, family = binomial)
glm_probs <- predict(glm_model, test, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
mean(glm_pred != test$mpg01)
## [1] 0.127551
# g
nb_model <- naiveBayes(as.factor(mpg01) ~ horsepower + weight + acceleration + displacement,
                       data = train)
nb_pred <- predict(nb_model, test)
mean(nb_pred != test$mpg01) 
## [1] 0.127551
# h
train_X <- train[, c("horsepower", "weight", "acceleration", "displacement")]
test_X <- test[, c("horsepower", "weight", "acceleration", "displacement")]
train_Y <- train$mpg01
set.seed(1)
for (k in c(1, 3, 5, 10, 15)) {
  knn_pred <- knn(train_X, test_X, train_Y, k = k)
  cat("K =", k, " Test Error:", mean(knn_pred != test$mpg01), "\n")
}
## K = 1  Test Error: 0.1785714 
## K = 3  Test Error: 0.122449 
## K = 5  Test Error: 0.127551 
## K = 10  Test Error: 0.1173469 
## K = 15  Test Error: 0.127551
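# On this split, K = 10 gives the lowest printed KNN error. A sketch of picking
# the best K programmatically and lining the KNN error up against the other
# classifiers fit above (all objects are from parts d-g):
ks <- c(1, 3, 5, 10, 15)
set.seed(1)
knn_err <- sapply(ks, function(k)
  mean(knn(train_X, test_X, train_Y, k = k) != test$mpg01))
ks[which.min(knn_err)]  # K with the lowest test error
c(lda      = mean(lda_pred$class != test$mpg01),
  qda      = mean(qda_pred$class != test$mpg01),
  logistic = mean(glm_pred != test$mpg01),
  nb       = mean(nb_pred != test$mpg01),
  knn_best = min(knn_err))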

16

library(MASS)
data("Boston")
Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston_data <- Boston[, !(names(Boston) %in% "crim")]
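# Sanity check (sketch): the median split should give two classes of roughly
# equal size.
table(Boston_data$crim01)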

set.seed(123)
train_index <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train <- Boston_data[train_index, ]
test <- Boston_data[-train_index, ]

log_fit <- glm(crim01 ~ ., data = train, family = binomial)
log_probs <- predict(log_fit, newdata = test, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)

log_acc <- mean(log_pred == test$crim01)
log_acc
## [1] 0.8881579
lda_fit <- lda(crim01 ~ ., data = train)
lda_pred <- predict(lda_fit, test)$class

lda_acc <- mean(lda_pred == test$crim01)
lda_acc
## [1] 0.8355263
library(e1071)
nb_fit <- naiveBayes(crim01 ~ ., data = train)
nb_pred <- predict(nb_fit, test)

nb_acc <- mean(nb_pred == test$crim01)
nb_acc
## [1] 0.8157895
library(class)
train_X <- scale(train[, -which(names(train) == "crim01")])
test_X <- scale(test[, -which(names(test) == "crim01")])
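# Note: scale() above standardizes the test set with its own means/SDs.
# A common alternative (a sketch) is to reuse the training means/SDs so the
# test predictors are transformed exactly as the training predictors were:
test_X_alt <- scale(test[, -which(names(test) == "crim01")],
                    center = attr(train_X, "scaled:center"),
                    scale  = attr(train_X, "scaled:scale"))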
train_Y <- train$crim01

set.seed(123)
knn_pred <- knn(train_X, test_X, train_Y, k = 5)
knn_acc <- mean(knn_pred == test$crim01)
knn_acc
## [1] 0.9013158
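# Collecting the four test accuracies computed above for a quick comparison
# (a sketch; variables defined earlier in this section). On this particular
# 70/30 split, KNN with K = 5 on the scaled predictors gives the highest
# accuracy, followed by logistic regression, then LDA and naive Bayes.
c(logistic = log_acc, lda = lda_acc, nb = nb_acc, knn5 = knn_acc)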