#Question 13 # (a) Produce numerical and graphical summaries

library(ISLR2)
library(ggplot2)

# Weekly dataset
data(Weekly)

summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

str(Weekly)

## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...

pairs(Weekly[, -9])  # Scatterplot matrix excluding 'Direction' (categorical variable)

# Histogram of Volume
ggplot(Weekly, aes(x = Volume)) + 
  geom_histogram(binwidth = 0.1, fill = "blue", color = "black") + 
  ggtitle("Histogram of Volume")

# Boxplot of Lag1 returns grouped by Direction
ggplot(Weekly, aes(x = Direction, y = Lag1, fill = Direction)) + 
  geom_boxplot() +
  ggtitle("Boxplot of Lag1 Returns by Direction")

#Scatterplot The scatterplot matrix shows relationships between all numerical variables in the dataset (Lag1 to Lag5, Volume, Today). There is no strong linear relationship between lagged return variables (Lag1 to Lag5) and Today. Volume appears to have an increasing trend over time #Histogram The distribution is highly right-skewed, meaning most weekly trading volumes are low, but a few weeks experienced extremely high volumes. The highest frequency of occurrences is concentrated at lower trading volumes, suggesting that only a few weeks had exceptionally high activity. #Boxplot The boxplot of Lag1 by Direction suggests that past weekly returns (Lag1) have some influence on future movement but are not distinctly different between “Up” and “Down” weeks.

(b) Perform logistic regression

logistic_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, 
                      data = Weekly, family = binomial)

# Display summary of the model
summary(logistic_model)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Lag2 is statistically significant in predicting whether the market moves up or down. Most lagged return variables (Lag1, Lag3, Lag4, Lag5) and Volume are not significant predictors.

# Predict probabilities of "Up" direction
pred_probs <- predict(logistic_model, type = "response")

# Convert probabilities to class labels (Threshold = 0.5)
pred_labels <- ifelse(pred_probs > 0.5, "Up", "Down")

# Create confusion matrix
conf_matrix <- table(Predicted = pred_labels, Actual = Weekly$Direction)
print(conf_matrix)

##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557

# Compute overall accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Overall Accuracy:", round(accuracy, 4)))

## [1] "Overall Accuracy: 0.5611"

The model correctly predicts market direction only 56.1% of the time. This is only slightly better than random guessing (50%), meaning the model has weak predictive power. The model is not reliable for making trading decisions, as it makes too many incorrect “Up” predictions (FP). It struggles to identify true “Down” weeks, making it risky for traders relying on it to avoid losses.

# Split data into training (1990-2008) and test (2009-2010)
train <- Weekly$Year < 2009
test <- !train

# Training and testing subsets
train_data <- Weekly[train, ]
test_data <- Weekly[test, ]
summary(train_data$Year)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1990    1994    1999    1999    2004    2008

# logistic regression model with Lag2 as the only predictor
logistic_model_lag2 <- glm(Direction ~ Lag2, data = train_data, family = binomial)
summary(logistic_model_lag2)

## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4

# Predict probabilities on test data
test_probs <- predict(logistic_model_lag2, test_data, type = "response")
test_preds <- ifelse(test_probs > 0.5, "Up", "Down")
# confusion matrix
conf_matrix_test <- table(Predicted = test_preds, Actual = test_data$Direction)
print(conf_matrix_test)

##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56

accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
print(paste("Overall Accuracy on Test Data:", round(accuracy_test, 4)))

## [1] "Overall Accuracy on Test Data: 0.625"

Lag2 is statistically significant, but only weakly predictive. The model is biased towards predicting “Up”, leading to many false positives. 62.5% accuracy is better than random guessing but not strong enough for a reliable trading strategy.

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ISLR2':
## 
##     Boston

lda_model <- lda(Direction ~ Lag2, data = train_data)
lda_preds <- predict(lda_model, test_data)$class
conf_matrix_lda <- table(Predicted = lda_preds, Actual = test_data$Direction)
print(conf_matrix_lda)

##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56

accuracy_lda <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(paste("LDA Accuracy:", round(accuracy_lda, 4)))

## [1] "LDA Accuracy: 0.625"

LDA performs exactly the same as logistic regression (Accuracy = 62.5%). The model is biased towards predicting “Up”, leading to a high number of False Positives (FP = 34).

qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_preds <- predict(qda_model, test_data)$class
conf_matrix_qda <- table(Predicted = qda_preds, Actual = test_data$Direction)
print(conf_matrix_qda)

##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61

accuracy_qda <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
print(paste("QDA Accuracy:", round(accuracy_qda, 4)))

## [1] "QDA Accuracy: 0.5865"

QDA never predicts “Down” (all predictions are “Up”). Model has a strong upward bias, making it unreliable for identifying downturns. Accuracy is worse than logistic regression (62.5%) and LDA (62.5%).

library(class)
train_X <- as.matrix(train_data$Lag2)
test_X <- as.matrix(test_data$Lag2)
train_Y <- train_data$Direction
knn_preds <- knn(train_X, test_X, train_Y, k = 1)
conf_matrix_knn <- table(Predicted = knn_preds, Actual = test_data$Direction)
print(conf_matrix_knn)

##          Actual
## Predicted Down Up
##      Down   21 29
##      Up     22 32

accuracy_knn <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
print(paste("KNN (K=1) Accuracy:", round(accuracy_knn, 4)))

## [1] "KNN (K=1) Accuracy: 0.5096"

Unlike QDA, KNN does not predict only “Up” but its decisions are highly unstable due to K=1 being too small.

library(e1071)
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_preds <- predict(nb_model, test_data)
conf_matrix_nb <- table(Predicted = nb_preds, Actual = test_data$Direction)
print(conf_matrix_nb)

##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61

accuracy_nb <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(paste("Naive Bayes Accuracy:", round(accuracy_nb, 4)))

## [1] "Naive Bayes Accuracy: 0.5865"

Naive Bayes never predicts “Down”, similar to QDA. Model is strongly biased towards predicting “Up”, leading to a high False Positive (FP = 43) rate. Fails to classify any “Down” weeks, making it unusable for predicting market downturns. Performs slightly better than KNN (50%) but still worse than LDA and logistic regression.

#i LDA and Logistic Regression (Lag2 only) achieved the highest accuracy: 62.5%. Both models outperform QDA, Naive Bayes, and KNN (K=1). Logistic Regression is simpler, while LDA assumes normally distributed classes. QDA and Naive Bayes failed to predict “Down” weeks and had lower accuracy (58.65%).

# Trying different predictor combinations
predictor_sets <- list(
  Lag2 = Direction ~ Lag2,
  Lag1_Lag2 = Direction ~ Lag1 + Lag2,
  Lag1_Lag2_Volume = Direction ~ Lag1 + Lag2 + Volume,
  Lag1_Lag2_Lag3_Volume = Direction ~ Lag1 + Lag2 + Lag3 + Volume,
  Lag1_Lag2_Interaction = Direction ~ Lag1 * Lag2,
  Lag2_Quadratic = Direction ~ Lag2 + I(Lag2^2)  # Quadratic term
)
# Function to fit logistic regression and evaluate performance
evaluate_logistic <- function(formula) {
  model <- glm(formula, data = train_data, family = binomial)
  preds <- ifelse(predict(model, test_data, type = "response") > 0.5, "Up", "Down")
  conf_matrix <- table(Predicted = preds, Actual = test_data$Direction)
  accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
  return(list(conf_matrix = conf_matrix, accuracy = accuracy))
}

logistic_results <- lapply(predictor_sets, evaluate_logistic)
for (name in names(logistic_results)) {
  print(paste("Logistic Regression with:", name))
  print(logistic_results[[name]]$conf_matrix)
  print(paste("Accuracy:", round(logistic_results[[name]]$accuracy, 4)))
}

## [1] "Logistic Regression with: Lag2"
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
## [1] "Accuracy: 0.625"
## [1] "Logistic Regression with: Lag1_Lag2"
##          Actual
## Predicted Down Up
##      Down    7  8
##      Up     36 53
## [1] "Accuracy: 0.5769"
## [1] "Logistic Regression with: Lag1_Lag2_Volume"
##          Actual
## Predicted Down Up
##      Down   27 33
##      Up     16 28
## [1] "Accuracy: 0.5288"
## [1] "Logistic Regression with: Lag1_Lag2_Lag3_Volume"
##          Actual
## Predicted Down Up
##      Down   30 37
##      Up     13 24
## [1] "Accuracy: 0.5192"
## [1] "Logistic Regression with: Lag1_Lag2_Interaction"
##          Actual
## Predicted Down Up
##      Down    7  8
##      Up     36 53
## [1] "Accuracy: 0.5769"
## [1] "Logistic Regression with: Lag2_Quadratic"
##          Actual
## Predicted Down Up
##      Down    8  4
##      Up     35 57
## [1] "Accuracy: 0.625"

# LDA & QDA with Different Predictors
evaluate_lda_qda <- function(formula, method) {
  model <- if (method == "LDA") lda(formula, data = train_data) else qda(formula, data = train_data)
  preds <- predict(model, test_data)$class
  conf_matrix <- table(Predicted = preds, Actual = test_data$Direction)
  accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
  return(list(conf_matrix = conf_matrix, accuracy = accuracy))
}

# Loop through predictor sets and store results
lda_results <- lapply(predictor_sets, evaluate_lda_qda, method = "LDA")
qda_results <- lapply(predictor_sets, evaluate_lda_qda, method = "QDA")

# results
for (name in names(lda_results)) {
  print(paste("LDA with:", name))
  print(lda_results[[name]]$conf_matrix)
  print(paste("Accuracy:", round(lda_results[[name]]$accuracy, 4)))
}

## [1] "LDA with: Lag2"
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
## [1] "Accuracy: 0.625"
## [1] "LDA with: Lag1_Lag2"
##          Actual
## Predicted Down Up
##      Down    7  8
##      Up     36 53
## [1] "Accuracy: 0.5769"
## [1] "LDA with: Lag1_Lag2_Volume"
##          Actual
## Predicted Down Up
##      Down   27 33
##      Up     16 28
## [1] "Accuracy: 0.5288"
## [1] "LDA with: Lag1_Lag2_Lag3_Volume"
##          Actual
## Predicted Down Up
##      Down   30 37
##      Up     13 24
## [1] "Accuracy: 0.5192"
## [1] "LDA with: Lag1_Lag2_Interaction"
##          Actual
## Predicted Down Up
##      Down    7  8
##      Up     36 53
## [1] "Accuracy: 0.5769"
## [1] "LDA with: Lag2_Quadratic"
##          Actual
## Predicted Down Up
##      Down    7  4
##      Up     36 57
## [1] "Accuracy: 0.6154"

for (name in names(qda_results)) {
  print(paste("QDA with:", name))
  print(qda_results[[name]]$conf_matrix)
  print(paste("Accuracy:", round(qda_results[[name]]$accuracy, 4)))
}

## [1] "QDA with: Lag2"
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
## [1] "Accuracy: 0.5865"
## [1] "QDA with: Lag1_Lag2"
##          Actual
## Predicted Down Up
##      Down    7 10
##      Up     36 51
## [1] "Accuracy: 0.5577"
## [1] "QDA with: Lag1_Lag2_Volume"
##          Actual
## Predicted Down Up
##      Down   31 44
##      Up     12 17
## [1] "Accuracy: 0.4615"
## [1] "QDA with: Lag1_Lag2_Lag3_Volume"
##          Actual
## Predicted Down Up
##      Down   35 47
##      Up      8 14
## [1] "Accuracy: 0.4712"
## [1] "QDA with: Lag1_Lag2_Interaction"
##          Actual
## Predicted Down Up
##      Down   23 36
##      Up     20 25
## [1] "Accuracy: 0.4615"
## [1] "QDA with: Lag2_Quadratic"
##          Actual
## Predicted Down Up
##      Down    7  3
##      Up     36 58
## [1] "Accuracy: 0.625"

#KNN with Different K-values and Predictors

library(class)  
evaluate_knn <- function(formula, k_value) {
  train_X <- model.matrix(formula, data = train_data)[, -1, drop = FALSE]  
  test_X <- model.matrix(formula, data = test_data)[, -1, drop = FALSE]    

  # Ensure both matrices have the same columns
  common_cols <- intersect(colnames(train_X), colnames(test_X))
  train_X <- as.matrix(train_X[, common_cols, drop = FALSE])  
  test_X <- as.matrix(test_X[, common_cols, drop = FALSE])   
  # KNN model
  preds <- knn(train_X, test_X, train_data$Direction, k = k_value)

  # confusion matrix and accuracy
  conf_matrix <- table(Predicted = preds, Actual = test_data$Direction)
  accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
  
  return(list(conf_matrix = conf_matrix, accuracy = accuracy))
}

k_values <- c(1, 5, 10, 20)

# predictor sets and store results
knn_results <- list()
for (k in k_values) {
  knn_results[[paste("KNN_K", k, sep = "_")]] <- lapply(
    predictor_sets, 
    function(formula) evaluate_knn(formula, k)  # Pass formula and k_value correctly
  )
}

for (k in names(knn_results)) {
  for (name in names(knn_results[[k]])) {
    print(paste(k, "with:", name))
    print(knn_results[[k]][[name]]$conf_matrix)
    print(paste("Accuracy:", round(knn_results[[k]][[name]]$accuracy, 4)))
  }
}

## [1] "KNN_K_1 with: Lag2"
##          Actual
## Predicted Down Up
##      Down   21 30
##      Up     22 31
## [1] "Accuracy: 0.5"
## [1] "KNN_K_1 with: Lag1_Lag2"
##          Actual
## Predicted Down Up
##      Down   18 29
##      Up     25 32
## [1] "Accuracy: 0.4808"
## [1] "KNN_K_1 with: Lag1_Lag2_Volume"
##          Actual
## Predicted Down Up
##      Down   26 35
##      Up     17 26
## [1] "Accuracy: 0.5"
## [1] "KNN_K_1 with: Lag1_Lag2_Lag3_Volume"
##          Actual
## Predicted Down Up
##      Down   22 31
##      Up     21 30
## [1] "Accuracy: 0.5"
## [1] "KNN_K_1 with: Lag1_Lag2_Interaction"
##          Actual
## Predicted Down Up
##      Down   18 25
##      Up     25 36
## [1] "Accuracy: 0.5192"
## [1] "KNN_K_1 with: Lag2_Quadratic"
##          Actual
## Predicted Down Up
##      Down   21 30
##      Up     22 31
## [1] "Accuracy: 0.5"
## [1] "KNN_K_5 with: Lag2"
##          Actual
## Predicted Down Up
##      Down   15 21
##      Up     28 40
## [1] "Accuracy: 0.5288"
## [1] "KNN_K_5 with: Lag1_Lag2"
##          Actual
## Predicted Down Up
##      Down   22 32
##      Up     21 29
## [1] "Accuracy: 0.4904"
## [1] "KNN_K_5 with: Lag1_Lag2_Volume"
##          Actual
## Predicted Down Up
##      Down   33 36
##      Up     10 25
## [1] "Accuracy: 0.5577"
## [1] "KNN_K_5 with: Lag1_Lag2_Lag3_Volume"
##          Actual
## Predicted Down Up
##      Down   23 28
##      Up     20 33
## [1] "Accuracy: 0.5385"
## [1] "KNN_K_5 with: Lag1_Lag2_Interaction"
##          Actual
## Predicted Down Up
##      Down   27 24
##      Up     16 37
## [1] "Accuracy: 0.6154"
## [1] "KNN_K_5 with: Lag2_Quadratic"
##          Actual
## Predicted Down Up
##      Down   16 21
##      Up     27 40
## [1] "Accuracy: 0.5385"
## [1] "KNN_K_10 with: Lag2"
##          Actual
## Predicted Down Up
##      Down   17 19
##      Up     26 42
## [1] "Accuracy: 0.5673"
## [1] "KNN_K_10 with: Lag1_Lag2"
##          Actual
## Predicted Down Up
##      Down   16 29
##      Up     27 32
## [1] "Accuracy: 0.4615"
## [1] "KNN_K_10 with: Lag1_Lag2_Volume"
##          Actual
## Predicted Down Up
##      Down   31 34
##      Up     12 27
## [1] "Accuracy: 0.5577"
## [1] "KNN_K_10 with: Lag1_Lag2_Lag3_Volume"
##          Actual
## Predicted Down Up
##      Down   24 27
##      Up     19 34
## [1] "Accuracy: 0.5577"
## [1] "KNN_K_10 with: Lag1_Lag2_Interaction"
##          Actual
## Predicted Down Up
##      Down   16 26
##      Up     27 35
## [1] "Accuracy: 0.4904"
## [1] "KNN_K_10 with: Lag2_Quadratic"
##          Actual
## Predicted Down Up
##      Down   19 18
##      Up     24 43
## [1] "Accuracy: 0.5962"
## [1] "KNN_K_20 with: Lag2"
##          Actual
## Predicted Down Up
##      Down   19 21
##      Up     24 40
## [1] "Accuracy: 0.5673"
## [1] "KNN_K_20 with: Lag1_Lag2"
##          Actual
## Predicted Down Up
##      Down   18 21
##      Up     25 40
## [1] "Accuracy: 0.5577"
## [1] "KNN_K_20 with: Lag1_Lag2_Volume"
##          Actual
## Predicted Down Up
##      Down   20 35
##      Up     23 26
## [1] "Accuracy: 0.4423"
## [1] "KNN_K_20 with: Lag1_Lag2_Lag3_Volume"
##          Actual
## Predicted Down Up
##      Down   15 24
##      Up     28 37
## [1] "Accuracy: 0.5"
## [1] "KNN_K_20 with: Lag1_Lag2_Interaction"
##          Actual
## Predicted Down Up
##      Down   14 20
##      Up     29 41
## [1] "Accuracy: 0.5288"
## [1] "KNN_K_20 with: Lag2_Quadratic"
##          Actual
## Predicted Down Up
##      Down   21 22
##      Up     22 39
## [1] "Accuracy: 0.5769"

Logistic Regression with Lag2 has the best predictive performance. LDA is a good alternative but does not outperform Logistic Regression. KNN, QDA, and Naive Bayes are not suitable for this dataset due to poor performance.

Question 14

library(ISLR2)
data("Auto")
summary(Auto$mpg)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.00   17.00   22.75   23.45   29.00   46.60

mpg_median <- median(Auto$mpg)

# Create mpg01: 1 if mpg > median, 0 otherwise
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- as.factor(Auto$mpg01)
table(Auto$mpg01)

## 
##   0   1 
## 196 196

head(Auto)

The least fuel-efficient car has 9 mpg. 25% of the cars have mpg ≤ 17. The midpoint (50% of cars have mpg ≤ 22.75). Mean (23.45): The average mpg across all cars. 3rd Quartile (29.00): 75% of cars have mpg ≤ 29. Max (46.60): The most fuel-efficient car has 46.6 mpg.

library(ISLR2)
library(ggplot2)
library(gridExtra)  
data("Auto")

# ttt6Create binary variable mpg01
mpg_median <- median(Auto$mpg)  # Compute median of mpg
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)  # Assign 1 if mpg > median, else 0
Auto$mpg01 <- as.factor(Auto$mpg01)  # Convert to factor

# Scatterplots of numerical features vs mpg01
p1 <- ggplot(Auto, aes(x = horsepower, y = as.numeric(mpg01))) +
  geom_jitter(alpha = 0.5) + ggtitle("Horsepower vs. mpg01")

p2 <- ggplot(Auto, aes(x = weight, y = as.numeric(mpg01))) +
  geom_jitter(alpha = 0.5) + ggtitle("Weight vs. mpg01")

p3 <- ggplot(Auto, aes(x = displacement, y = as.numeric(mpg01))) +
  geom_jitter(alpha = 0.5) + ggtitle("Displacement vs. mpg01")

p4 <- ggplot(Auto, aes(x = acceleration, y = as.numeric(mpg01))) +
  geom_jitter(alpha = 0.5) + ggtitle("Acceleration vs. mpg01")

# scatterplots
grid.arrange(p1, p2, p3, p4, ncol = 2)

# Boxplots of key features grouped by mpg01
p5 <- ggplot(Auto, aes(x = mpg01, y = horsepower, fill = mpg01)) + 
  geom_boxplot() + ggtitle("Horsepower vs. mpg01")

p6 <- ggplot(Auto, aes(x = mpg01, y = weight, fill = mpg01)) + 
  geom_boxplot() + ggtitle("Weight vs. mpg01")

p7 <- ggplot(Auto, aes(x = mpg01, y = displacement, fill = mpg01)) + 
  geom_boxplot() + ggtitle("Displacement vs. mpg01")

p8 <- ggplot(Auto, aes(x = mpg01, y = acceleration, fill = mpg01)) + 
  geom_boxplot() + ggtitle("Acceleration vs. mpg01")

# boxplots
grid.arrange(p5, p6, p7, p8, ncol = 2)

# Summary of findings
summary(Auto$mpg01)  # Check distribution of mpg01

##   0   1 
## 196 196

table(Auto$mpg01)    # Count of each category

## 
##   0   1 
## 196 196

cat("\nKey Findings:\n")

## 
## Key Findings:

cat("- Horsepower, Weight, and Displacement are the strongest predictors of mpg01.\n")

## - Horsepower, Weight, and Displacement are the strongest predictors of mpg01.

cat("- Lighter cars, lower horsepower, and lower displacement tend to have high MPG.\n")

## - Lighter cars, lower horsepower, and lower displacement tend to have high MPG.

cat("- Acceleration does not show a strong distinction.\n")

## - Acceleration does not show a strong distinction.

set.seed(42)  # For reproducibility
train_index <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto))

# Split data into training and test sets
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]
cat("Training Set Size:", nrow(train_data), "\n")

## Training Set Size: 274

cat("Test Set Size:", nrow(test_data), "\n")

## Test Set Size: 118

Total observations in the dataset = 274 (train) + 118 (test) = 392. 70% of the data (274 rows) was assigned to the training set. 30% of the data (118 rows) was assigned to the test set. The training set (70% of the data) is used to train the machine learning model. The test set (30% of the data) is held out and used to evaluate the model’s performance on unseen data. This ensures generalization—the model isn’t just memorizing the training data but can make accurate predictions on new data. #d

library(ISLR2)
library(MASS)  # For LDA
data("Auto")

# binary variable mpg01
mpg_median <- median(Auto$mpg)  # Compute median of mpg
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)  # Assign 1 if mpg > median, else 0
Auto$mpg01 <- as.factor(Auto$mpg01)  # Convert to factor

# Split data into training and test sets
set.seed(42)  # For reproducibility
train_index <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto))  # 70% training
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]

#  LDA model using best predictors (horsepower, weight, displacement)
lda_model <- lda(mpg01 ~ horsepower + weight + displacement, data = train_data)
print(lda_model)

## Call:
## lda(mpg01 ~ horsepower + weight + displacement, data = train_data)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5437956 0.4562044 
## 
## Group means:
##   horsepower   weight displacement
## 0   127.8121 3573.557     268.7584
## 1    78.3280 2336.640     116.9720
## 
## Coefficients of linear discriminants:
##                        LD1
## horsepower    0.0007859338
## weight       -0.0009948081
## displacement -0.0069209301

# Predictions on Test Data
lda_preds <- predict(lda_model, test_data)
lda_class <- lda_preds$class  # Extract predicted class labels

# Compute Confusion Matrix and Test Error
conf_matrix <- table(Predicted = lda_class, Actual = test_data$mpg01)
print(conf_matrix)

##          Actual
## Predicted  0  1
##         0 41  3
##         1  6 68

test_error <- 1 - sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Test Error Rate:", round(test_error, 4), "\n")

## Test Error Rate: 0.0763

High MPG cars have lower horsepower, weight, and displacement. Weight & displacement are the strongest predictors (negative impact on MPG). 92.37% Accuracy (Test Error: 7.63%) → Strong model performance

library(MASS)  # For QDA
qda_model <- qda(mpg01 ~ horsepower + weight + displacement, data = train_data)
print(qda_model)

## Call:
## qda(mpg01 ~ horsepower + weight + displacement, data = train_data)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5437956 0.4562044 
## 
## Group means:
##   horsepower   weight displacement
## 0   127.8121 3573.557     268.7584
## 1    78.3280 2336.640     116.9720

qda_preds <- predict(qda_model, test_data)
qda_class <- qda_preds$class  
conf_matrix_qda <- table(Predicted = qda_class, Actual = test_data$mpg01)
print(conf_matrix_qda)

##          Actual
## Predicted  0  1
##         0 42  4
##         1  5 67

test_error_qda <- 1 - sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
cat("Test Error Rate (QDA):", round(test_error_qda, 4), "\n")

## Test Error Rate (QDA): 0.0763

High MPG cars have lower horsepower, weight, and displacement. Group means confirm that lighter, lower displacement cars are more fuel-efficient.

library(ISLR2)
logistic_model <- glm(mpg01 ~ horsepower + weight + displacement, data = train_data, family = binomial)
summary(logistic_model)

## 
## Call:
## glm(formula = mpg01 ~ horsepower + weight + displacement, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  10.718768   1.727423   6.205 5.47e-10 ***
## horsepower   -0.046803   0.014937  -3.133  0.00173 ** 
## weight       -0.001605   0.000748  -2.145  0.03193 *  
## displacement -0.011170   0.005978  -1.869  0.06168 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 377.74  on 273  degrees of freedom
## Residual deviance: 158.30  on 270  degrees of freedom
## AIC: 166.3
## 
## Number of Fisher Scoring iterations: 7

logistic_probs <- predict(logistic_model, test_data, type = "response")
logistic_class <- ifelse(logistic_probs > 0.5, 1, 0)
logistic_class <- as.factor(logistic_class)  # Ensure factor type

conf_matrix_logistic <- table(Predicted = logistic_class, Actual = test_data$mpg01)
print(conf_matrix_logistic)

##          Actual
## Predicted  0  1
##         0 43  5
##         1  4 66

test_error_logistic <- 1 - sum(diag(conf_matrix_logistic)) / sum(conf_matrix_logistic)
cat("Test Error Rate (Logistic Regression):", round(test_error_logistic, 4), "\n")

## Test Error Rate (Logistic Regression): 0.0763

Horsepower (-0.0468, p = 0.0017) → Strong negative impact on high MPG. Weight (-0.0016, p = 0.0319) → Significant negative impact. Displacement (-0.0112, p = 0.0617) → Weak predictor (borderline significant). Accuracy: 92.37% (Test Error: 7.63%) → Same as LDA & QDA.

library(e1071)
nb_model <- naiveBayes(mpg01 ~ horsepower + weight + displacement, data = train_data)
nb_preds <- predict(nb_model, test_data)
conf_matrix_nb <- table(Predicted = nb_preds, Actual = test_data$mpg01)
print(conf_matrix_nb)

##          Actual
## Predicted  0  1
##         0 42  3
##         1  5 68

test_error_nb <- 1 - sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
cat("Test Error Rate (Naive Bayes):", round(test_error_nb, 4), "\n")

## Test Error Rate (Naive Bayes): 0.0678

Test Error: 6.78% (Accuracy: 93.22%) → Slightly better than LDA, QDA, and Logistic Regression (7.63%).

library(class)  # For KNN
train_X <- as.matrix(train_data[, c("horsepower", "weight", "displacement")])
test_X <- as.matrix(test_data[, c("horsepower", "weight", "displacement")])
train_Y <- train_data$mpg01  # Response variable for training
test_Y <- test_data$mpg01    # True labels for testing
k_values <- c(1, 5, 10, 15, 20)
compute_knn_error <- function(k) {
  knn_preds <- knn(train_X, test_X, train_Y, k = k)  # Fit KNN model
  conf_matrix_knn <- table(Predicted = knn_preds, Actual = test_Y)  # Confusion matrix
  test_error_knn <- 1 - sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)  # Test error rate
  cat("Test Error Rate (K =", k, "):", round(test_error_knn, 4), "\n")
  return(test_error_knn)
}
knn_errors <- sapply(k_values, compute_knn_error)

## Test Error Rate (K = 1 ): 0.1186 
## Test Error Rate (K = 5 ): 0.1271 
## Test Error Rate (K = 10 ): 0.1356 
## Test Error Rate (K = 15 ): 0.1102 
## Test Error Rate (K = 20 ): 0.0847

K = 20 gives the lowest test error (8.47%), making it the best-performing KNN model

lab 5

2025-02-26

(b) Perform logistic regression

Question 14