Question 13

(a) Numerical and Graphical Summaries

# Load libraries
library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(e1071)

# Load the Weekly dataset
data("Weekly")
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
# Plot time-series data for Volume
plot(Weekly$Volume, type = "l", main = "Volume Over Time", ylab = "Volume")

# Create boxplots for lag variables
boxplot(Weekly[, 2:6], main = "Boxplots for Lag Variables")

# Visualize Direction (Up or Down)
table(Weekly$Direction)
## 
## Down   Up 
##  484  605
barplot(table(Weekly$Direction), main = "Distribution of Direction", col = c("red", "green"))

Answer: The Weekly dataset shows an increasing trend in trading volume over time, Lag variables have similar distributions with some outliers, and the Direction variable is fairly balanced, with slightly more “Up” movements than “Down”.

(b) Logistic Regression

log_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(log_model)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Answer: Lag2 is the only statistically significant predictor (p = 0.0296), while Lag1, Lag3, Lag4, Lag5, and Volume are not significant.

(c) Confusion Matrix & Accuracy

pred_probs <- predict(log_model, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "Up", "Down")
conf_matrix <- table(Predicted = pred_class, Actual = Weekly$Direction)
conf_matrix
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
accuracy
## [1] 0.5610652

Answer: The confusion matrix shows the model correctly predicts 561 out of 1,089 cases (accuracy = 56.1%). However, it misclassifies many “Down” movements as “Up,” indicating poor predictive performance.

(d) Logistic Regression with Lag2

train_data <- Weekly[Weekly$Year < 2009, ]
test_data <- Weekly[Weekly$Year >= 2009, ]

log_model_train <- glm(Direction ~ Lag2, data = train_data, family = binomial)
test_probs <- predict(log_model_train, newdata = test_data, type = "response")
test_pred_class <- ifelse(test_probs > 0.5, "Up", "Down")
test_conf_matrix <- table(Predicted = test_pred_class, Actual = test_data$Direction)
test_conf_matrix
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
test_accuracy <- sum(diag(test_conf_matrix)) / sum(test_conf_matrix)
test_accuracy
## [1] 0.625

Answer: The logistic regression model using Lag2 as the only predictor achieves an accuracy of 62.5%, slightly improving performance but still misclassifying some “Down” movements as “Up.”

(e) Linear Discriminant Analysis (LDA)

lda_model <- lda(Direction ~ Lag2, data = train_data)
lda_pred <- predict(lda_model, test_data)
lda_conf_matrix <- table(Predicted = lda_pred$class, Actual = test_data$Direction)
lda_conf_matrix
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
lda_accuracy <- sum(diag(lda_conf_matrix)) / sum(lda_conf_matrix)
lda_accuracy
## [1] 0.625

Answer: The LDA model achieves 62.5% accuracy, identical to logistic regression, with similar misclassification patterns.

(f) Quadratic Discriminant Analysis (QDA)

qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_pred <- predict(qda_model, test_data)
qda_conf_matrix <- table(Predicted = qda_pred$class, Actual = test_data$Direction)
qda_conf_matrix
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
qda_accuracy <- sum(diag(qda_conf_matrix)) / sum(qda_conf_matrix)
qda_accuracy
## [1] 0.5865385

Answer: The QDA model achieves 58.7% accuracy, lower than logistic regression and LDA. It misclassifies all observations as “Up,” failing to classify any “Down” movements correctly.

(g) K-Nearest Neighbors (KNN)

train_X <- train_data$Lag2
test_X <- test_data$Lag2
train_Y <- train_data$Direction
test_Y <- test_data$Direction

knn_pred <- knn(train = matrix(train_X), test = matrix(test_X), cl = train_Y, k = 1)
knn_conf_matrix <- table(Predicted = knn_pred, Actual = test_Y)
knn_conf_matrix
##          Actual
## Predicted Down Up
##      Down   21 29
##      Up     22 32
knn_accuracy <- sum(diag(knn_conf_matrix)) / sum(knn_conf_matrix)
knn_accuracy
## [1] 0.5096154

Answer: The KNN model (K=1) achieves 50.96% accuracy, making it the worst performer, close to random guessing.

(h) Naïve Bayes

nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_pred <- predict(nb_model, test_data)
nb_conf_matrix <- table(Predicted = nb_pred, Actual = test_data$Direction)
nb_conf_matrix
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
nb_accuracy <- sum(diag(nb_conf_matrix)) / sum(nb_conf_matrix)
nb_accuracy
## [1] 0.5865385

Answer: The Naïve Bayes model achieves 58.7% accuracy, matching QDA, but fails to classify any “Down” movements correctly.

(i) Model Comparison

accuracy_results <- data.frame(
  Model = c("Logistic Regression", "LDA", "QDA", "KNN (K=1)", "Naïve Bayes"),
  Accuracy = c(test_accuracy, lda_accuracy, qda_accuracy, knn_accuracy, nb_accuracy)
)

print(accuracy_results)
##                 Model  Accuracy
## 1 Logistic Regression 0.6250000
## 2                 LDA 0.6250000
## 3                 QDA 0.5865385
## 4           KNN (K=1) 0.5096154
## 5         Naïve Bayes 0.5865385

Answer: The best models are Logistic Regression and LDA, both achieving 62.5% accuracy. QDA and Naïve Bayes perform worse at 58.7%, while KNN is the worst with 50.96% accuracy.

Question 14

(a) Create Binary Variable mpg01

data(Auto, package = "ISLR2")
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
table(Auto$mpg01)
## 
##   0   1 
## 196 196

Answer: The binary variable mpg01 is successfully created, with an equal distribution of 196 cars classified as “low mileage (0)” and 196 as “high mileage (1).”

(b) Data Exploration

pairs(Auto)

boxplot(Auto$horsepower ~ Auto$mpg01, main="Horsepower vs mpg01", col=c("red", "green"))

boxplot(Auto$weight ~ Auto$mpg01, main="Weight vs mpg01", col=c("blue", "yellow"))

boxplot(Auto$acceleration ~ Auto$mpg01, main="Acceleration vs mpg01", col=c("purple", "orange"))

Answer: Cars with higher horsepower and weight tend to have lower mileage, while cars with lower horsepower and weight have higher mileage. Acceleration shows a smaller difference.

(c) Train-Test Split

set.seed(1)
train_indices <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
train_data <- Auto[train_indices, ]
test_data <- Auto[-train_indices, ]

(d) Linear Discriminant Analysis (LDA)

lda_model <- lda(mpg01 ~ horsepower + weight + acceleration, data = train_data)
lda_pred <- predict(lda_model, test_data)
lda_conf_matrix <- table(Predicted = lda_pred$class, Actual = test_data$mpg01)
lda_conf_matrix
##          Actual
## Predicted  0  1
##         0 48  3
##         1 13 54
lda_accuracy <- sum(diag(lda_conf_matrix)) / sum(lda_conf_matrix)
lda_accuracy
## [1] 0.8644068

Answer: The LDA model achieves 86.44% accuracy, correctly classifying most observations with a few misclassifications.

(e) Quadratic Discriminant Analysis (QDA)

qda_model <- qda(mpg01 ~ horsepower + weight + acceleration, data = train_data)
qda_pred <- predict(qda_model, test_data)
qda_conf_matrix <- table(Predicted = qda_pred$class, Actual = test_data$mpg01)
qda_conf_matrix
##          Actual
## Predicted  0  1
##         0 53  6
##         1  8 51
qda_accuracy <- sum(diag(qda_conf_matrix)) / sum(qda_conf_matrix)
qda_accuracy
## [1] 0.8813559

Answer: The QDA model achieves 88.14% accuracy, performing slightly better than LDA.

(f) Logistic Regression

log_model <- glm(mpg01 ~ horsepower + weight + acceleration, data = train_data, family = binomial)
log_probs <- predict(log_model, test_data, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)
log_conf_matrix <- table(Predicted = log_pred, Actual = test_data$mpg01)
log_conf_matrix
##          Actual
## Predicted  0  1
##         0 53  7
##         1  8 50
log_accuracy <- sum(diag(log_conf_matrix)) / sum(log_conf_matrix)
log_accuracy
## [1] 0.8728814

Answer: The logistic regression model achieves 87.29% accuracy, slightly lower than QDA but higher than LDA.

(g) Naïve Bayes

nb_model <- naiveBayes(mpg01 ~ horsepower + weight + acceleration, data = train_data)
nb_pred <- predict(nb_model, test_data)
nb_conf_matrix <- table(Predicted = nb_pred, Actual = test_data$mpg01)
nb_conf_matrix
##          Actual
## Predicted  0  1
##         0 51  4
##         1 10 53
nb_accuracy <- sum(diag(nb_conf_matrix)) / sum(nb_conf_matrix)
nb_accuracy
## [1] 0.8813559

Answer: The Naïve Bayes model achieves 88.14% accuracy, matching QDA and performing slightly better than logistic regression and LDA.

(h) K-Nearest Neighbors (KNN)

train_X <- train_data[, c("horsepower", "weight", "acceleration")]
test_X <- test_data[, c("horsepower", "weight", "acceleration")]
train_Y <- train_data$mpg01
test_Y <- test_data$mpg01

set.seed(1)
k_values <- c(1, 3, 5, 10)
knn_results <- data.frame(K = k_values, Accuracy = NA)

for (i in 1:length(k_values)) {
  knn_pred <- knn(train = train_X, test = test_X, cl = train_Y, k = k_values[i])
  knn_conf_matrix <- table(Predicted = knn_pred, Actual = test_Y)
  knn_accuracy <- sum(diag(knn_conf_matrix)) / sum(knn_conf_matrix)
  knn_results$Accuracy[i] <- knn_accuracy
}

print(knn_results)
##    K  Accuracy
## 1  1 0.8559322
## 2  3 0.9067797
## 3  5 0.8813559
## 4 10 0.8728814

Answer: The best-performing KNN model is K = 3, achieving 90.68% accuracy, the highest among all models.

Question 16

(a) Creating the Binary Crime Rate Variable

# Load the Boston dataset
library(MASS)
data(Boston)

# Create binary variable Crime_High (1 if above median, 0 if below median)
Boston$Crime_High <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
table(Boston$Crime_High)
## 
##   0   1 
## 253 253
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv         Crime_High 
##  Min.   : 1.73   Min.   : 5.00   Min.   :0.0  
##  1st Qu.: 6.95   1st Qu.:17.02   1st Qu.:0.0  
##  Median :11.36   Median :21.20   Median :0.5  
##  Mean   :12.65   Mean   :22.53   Mean   :0.5  
##  3rd Qu.:16.95   3rd Qu.:25.00   3rd Qu.:1.0  
##  Max.   :37.97   Max.   :50.00   Max.   :1.0

Answer: The crime rate variable is successfully converted into a binary classification problem, with an equal distribution of census tracts classified as high crime (1) and low crime (0).

(b) Data Exploration

# Correlation analysis
correlation_matrix <- cor(Boston)
print(correlation_matrix["crim",])  # Show correlation of crim with other variables
##        crim          zn       indus        chas         nox          rm 
##  1.00000000 -0.20046922  0.40658341 -0.05589158  0.42097171 -0.21924670 
##         age         dis         rad         tax     ptratio       black 
##  0.35273425 -0.37967009  0.62550515  0.58276431  0.28994558 -0.38506394 
##       lstat        medv  Crime_High 
##  0.45562148 -0.38830461  0.40939545
# Boxplots to explore feature relationships
boxplot(Boston$age ~ Boston$Crime_High, main = "Age vs Crime Rate", col = c("blue", "red"))

boxplot(Boston$tax ~ Boston$Crime_High, main = "Tax Rate vs Crime Rate", col = c("green", "yellow"))

boxplot(Boston$rm ~ Boston$Crime_High, main = "Rooms per Dwelling vs Crime Rate", col = c("purple", "orange"))

Answer: Higher crime areas tend to have older housing (higher age), higher tax rates, and fewer rooms per dwelling.

(c) Train-Test Split

set.seed(1)
train_indices <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]

(d) Logistic Regression

log_model <- glm(Crime_High ~ age + tax + rm, data = train_data, family = binomial)
log_probs <- predict(log_model, test_data, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)
log_conf_matrix <- table(Predicted = log_pred, Actual = test_data$Crime_High)
log_conf_matrix
##          Actual
## Predicted  0  1
##         0 66 13
##         1  7 66
log_accuracy <- sum(diag(log_conf_matrix)) / sum(log_conf_matrix)
log_accuracy
## [1] 0.8684211

Answer: The logistic regression model achieves 86.84% accuracy, correctly classifying most observations with few misclassifications.

(e) Linear Discriminant Analysis (LDA)

lda_model <- lda(Crime_High ~ age + tax + rm, data = train_data)
lda_pred <- predict(lda_model, test_data)
lda_conf_matrix <- table(Predicted = lda_pred$class, Actual = test_data$Crime_High)
lda_conf_matrix
##          Actual
## Predicted  0  1
##         0 66 15
##         1  7 64
lda_accuracy <- sum(diag(lda_conf_matrix)) / sum(lda_conf_matrix)
lda_accuracy
## [1] 0.8552632

Answer: The LDA model achieves 85.53% accuracy, slightly lower than logistic regression but still performs well.

(f) Naïve Bayes

nb_model <- naiveBayes(Crime_High ~ age + tax + rm, data = train_data)
nb_pred <- predict(nb_model, test_data)
nb_conf_matrix <- table(Predicted = nb_pred, Actual = test_data$Crime_High)
nb_conf_matrix
##          Actual
## Predicted  0  1
##         0 65 18
##         1  8 61
nb_accuracy <- sum(diag(nb_conf_matrix)) / sum(nb_conf_matrix)
nb_accuracy
## [1] 0.8289474

Answer: The Naïve Bayes model achieves 82.89% accuracy, lower than both logistic regression and LDA.

(g) K-Nearest Neighbors (KNN)

train_X <- train_data[, c("age", "tax", "rm")]
test_X <- test_data[, c("age", "tax", "rm")]
train_Y <- train_data$Crime_High
test_Y <- test_data$Crime_High

set.seed(1)
k_values <- c(1, 3, 5, 10)
knn_results <- data.frame(K = k_values, Accuracy = NA)

for (i in 1:length(k_values)) {
  knn_pred <- knn(train = train_X, test = test_X, cl = train_Y, k = k_values[i])
  knn_conf_matrix <- table(Predicted = knn_pred, Actual = test_Y)
  knn_accuracy <- sum(diag(knn_conf_matrix)) / sum(knn_conf_matrix)
  knn_results$Accuracy[i] <- knn_accuracy
}

print(knn_results)
##    K  Accuracy
## 1  1 0.8815789
## 2  3 0.9013158
## 3  5 0.9013158
## 4 10 0.8684211

Answer: The best-performing KNN model is K=3 or K=5, achieving 90.13% accuracy, the highest among all models.

(h) Model Comparison

accuracy_results <- data.frame(
  Model = c("Logistic Regression", "LDA", "Naïve Bayes", "Best KNN"),
  Accuracy = c(log_accuracy, lda_accuracy, nb_accuracy, max(knn_results$Accuracy))
)

print(accuracy_results)
##                 Model  Accuracy
## 1 Logistic Regression 0.8684211
## 2                 LDA 0.8552632
## 3         Naïve Bayes 0.8289474
## 4            Best KNN 0.9013158

Answer: The best model is KNN (K = 3 or 5), achieving 90.13% accuracy. Logistic regression and LDA performed slightly worse, while Naïve Bayes had the lowest accuracy.