#Question 13 # (a) Produce numerical and graphical summaries
library(ISLR2)
library(ggplot2)
# Weekly dataset
data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
pairs(Weekly[, -9]) # Scatterplot matrix excluding 'Direction' (categorical variable)
# Histogram of Volume
ggplot(Weekly, aes(x = Volume)) +
geom_histogram(binwidth = 0.1, fill = "blue", color = "black") +
ggtitle("Histogram of Volume")
# Boxplot of Lag1 returns grouped by Direction
ggplot(Weekly, aes(x = Direction, y = Lag1, fill = Direction)) +
geom_boxplot() +
ggtitle("Boxplot of Lag1 Returns by Direction")
#Scatterplot The scatterplot matrix shows relationships between all
numerical variables in the dataset (Lag1 to Lag5, Volume, Today). There
is no strong linear relationship between lagged return variables (Lag1
to Lag5) and Today. Volume appears to have an increasing trend over time
#Histogram The distribution is highly right-skewed, meaning most weekly
trading volumes are low, but a few weeks experienced extremely high
volumes. The highest frequency of occurrences is concentrated at lower
trading volumes, suggesting that only a few weeks had exceptionally high
activity. #Boxplot The boxplot of Lag1 by Direction suggests that past
weekly returns (Lag1) have some influence on future movement but are not
distinctly different between “Up” and “Down” weeks.
logistic_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
# Display summary of the model
summary(logistic_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is statistically significant in predicting whether the market moves up or down. Most lagged return variables (Lag1, Lag3, Lag4, Lag5) and Volume are not significant predictors.
#c
# Predict probabilities of "Up" direction
pred_probs <- predict(logistic_model, type = "response")
# Convert probabilities to class labels (Threshold = 0.5)
pred_labels <- ifelse(pred_probs > 0.5, "Up", "Down")
# Create confusion matrix
conf_matrix <- table(Predicted = pred_labels, Actual = Weekly$Direction)
print(conf_matrix)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
# Compute overall accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Overall Accuracy:", round(accuracy, 4)))
## [1] "Overall Accuracy: 0.5611"
The model correctly predicts market direction only 56.1% of the time. This is only slightly better than random guessing (50%), meaning the model has weak predictive power. The model is not reliable for making trading decisions, as it makes too many incorrect “Up” predictions (FP). It struggles to identify true “Down” weeks, making it risky for traders relying on it to avoid losses.
#d
# Split data into training (1990-2008) and test (2009-2010)
train <- Weekly$Year < 2009
test <- !train
# Training and testing subsets
train_data <- Weekly[train, ]
test_data <- Weekly[test, ]
summary(train_data$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1990 1994 1999 1999 2004 2008
# logistic regression model with Lag2 as the only predictor
logistic_model_lag2 <- glm(Direction ~ Lag2, data = train_data, family = binomial)
summary(logistic_model_lag2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
# Predict probabilities on test data
test_probs <- predict(logistic_model_lag2, test_data, type = "response")
test_preds <- ifelse(test_probs > 0.5, "Up", "Down")
# confusion matrix
conf_matrix_test <- table(Predicted = test_preds, Actual = test_data$Direction)
print(conf_matrix_test)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
print(paste("Overall Accuracy on Test Data:", round(accuracy_test, 4)))
## [1] "Overall Accuracy on Test Data: 0.625"
Lag2 is statistically significant, but only weakly predictive. The model is biased towards predicting “Up”, leading to many false positives. 62.5% accuracy is better than random guessing but not strong enough for a reliable trading strategy.
#e
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
lda_model <- lda(Direction ~ Lag2, data = train_data)
lda_preds <- predict(lda_model, test_data)$class
conf_matrix_lda <- table(Predicted = lda_preds, Actual = test_data$Direction)
print(conf_matrix_lda)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy_lda <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(paste("LDA Accuracy:", round(accuracy_lda, 4)))
## [1] "LDA Accuracy: 0.625"
LDA performs exactly the same as logistic regression (Accuracy = 62.5%). The model is biased towards predicting “Up”, leading to a high number of False Positives (FP = 34).
#f
qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_preds <- predict(qda_model, test_data)$class
conf_matrix_qda <- table(Predicted = qda_preds, Actual = test_data$Direction)
print(conf_matrix_qda)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
accuracy_qda <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
print(paste("QDA Accuracy:", round(accuracy_qda, 4)))
## [1] "QDA Accuracy: 0.5865"
QDA never predicts “Down” (all predictions are “Up”). Model has a strong upward bias, making it unreliable for identifying downturns. Accuracy is worse than logistic regression (62.5%) and LDA (62.5%).
#g
library(class)
train_X <- as.matrix(train_data$Lag2)
test_X <- as.matrix(test_data$Lag2)
train_Y <- train_data$Direction
knn_preds <- knn(train_X, test_X, train_Y, k = 1)
conf_matrix_knn <- table(Predicted = knn_preds, Actual = test_data$Direction)
print(conf_matrix_knn)
## Actual
## Predicted Down Up
## Down 21 29
## Up 22 32
accuracy_knn <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
print(paste("KNN (K=1) Accuracy:", round(accuracy_knn, 4)))
## [1] "KNN (K=1) Accuracy: 0.5096"
Unlike QDA, KNN does not predict only “Up” but its decisions are highly unstable due to K=1 being too small.
#h
library(e1071)
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_preds <- predict(nb_model, test_data)
conf_matrix_nb <- table(Predicted = nb_preds, Actual = test_data$Direction)
print(conf_matrix_nb)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
accuracy_nb <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(paste("Naive Bayes Accuracy:", round(accuracy_nb, 4)))
## [1] "Naive Bayes Accuracy: 0.5865"
Naive Bayes never predicts “Down”, similar to QDA. Model is strongly biased towards predicting “Up”, leading to a high False Positive (FP = 43) rate. Fails to classify any “Down” weeks, making it unusable for predicting market downturns. Performs slightly better than KNN (50%) but still worse than LDA and logistic regression.
#i LDA and Logistic Regression (Lag2 only) achieved the highest accuracy: 62.5%. Both models outperform QDA, Naive Bayes, and KNN (K=1). Logistic Regression is simpler, while LDA assumes normally distributed classes. QDA and Naive Bayes failed to predict “Down” weeks and had lower accuracy (58.65%).
#j
# Trying different predictor combinations
predictor_sets <- list(
Lag2 = Direction ~ Lag2,
Lag1_Lag2 = Direction ~ Lag1 + Lag2,
Lag1_Lag2_Volume = Direction ~ Lag1 + Lag2 + Volume,
Lag1_Lag2_Lag3_Volume = Direction ~ Lag1 + Lag2 + Lag3 + Volume,
Lag1_Lag2_Interaction = Direction ~ Lag1 * Lag2,
Lag2_Quadratic = Direction ~ Lag2 + I(Lag2^2) # Quadratic term
)
# Function to fit logistic regression and evaluate performance
evaluate_logistic <- function(formula) {
model <- glm(formula, data = train_data, family = binomial)
preds <- ifelse(predict(model, test_data, type = "response") > 0.5, "Up", "Down")
conf_matrix <- table(Predicted = preds, Actual = test_data$Direction)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
return(list(conf_matrix = conf_matrix, accuracy = accuracy))
}
logistic_results <- lapply(predictor_sets, evaluate_logistic)
for (name in names(logistic_results)) {
print(paste("Logistic Regression with:", name))
print(logistic_results[[name]]$conf_matrix)
print(paste("Accuracy:", round(logistic_results[[name]]$accuracy, 4)))
}
## [1] "Logistic Regression with: Lag2"
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
## [1] "Accuracy: 0.625"
## [1] "Logistic Regression with: Lag1_Lag2"
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
## [1] "Accuracy: 0.5769"
## [1] "Logistic Regression with: Lag1_Lag2_Volume"
## Actual
## Predicted Down Up
## Down 27 33
## Up 16 28
## [1] "Accuracy: 0.5288"
## [1] "Logistic Regression with: Lag1_Lag2_Lag3_Volume"
## Actual
## Predicted Down Up
## Down 30 37
## Up 13 24
## [1] "Accuracy: 0.5192"
## [1] "Logistic Regression with: Lag1_Lag2_Interaction"
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
## [1] "Accuracy: 0.5769"
## [1] "Logistic Regression with: Lag2_Quadratic"
## Actual
## Predicted Down Up
## Down 8 4
## Up 35 57
## [1] "Accuracy: 0.625"
# LDA & QDA with Different Predictors
evaluate_lda_qda <- function(formula, method) {
model <- if (method == "LDA") lda(formula, data = train_data) else qda(formula, data = train_data)
preds <- predict(model, test_data)$class
conf_matrix <- table(Predicted = preds, Actual = test_data$Direction)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
return(list(conf_matrix = conf_matrix, accuracy = accuracy))
}
# Loop through predictor sets and store results
lda_results <- lapply(predictor_sets, evaluate_lda_qda, method = "LDA")
qda_results <- lapply(predictor_sets, evaluate_lda_qda, method = "QDA")
# results
for (name in names(lda_results)) {
print(paste("LDA with:", name))
print(lda_results[[name]]$conf_matrix)
print(paste("Accuracy:", round(lda_results[[name]]$accuracy, 4)))
}
## [1] "LDA with: Lag2"
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
## [1] "Accuracy: 0.625"
## [1] "LDA with: Lag1_Lag2"
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
## [1] "Accuracy: 0.5769"
## [1] "LDA with: Lag1_Lag2_Volume"
## Actual
## Predicted Down Up
## Down 27 33
## Up 16 28
## [1] "Accuracy: 0.5288"
## [1] "LDA with: Lag1_Lag2_Lag3_Volume"
## Actual
## Predicted Down Up
## Down 30 37
## Up 13 24
## [1] "Accuracy: 0.5192"
## [1] "LDA with: Lag1_Lag2_Interaction"
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
## [1] "Accuracy: 0.5769"
## [1] "LDA with: Lag2_Quadratic"
## Actual
## Predicted Down Up
## Down 7 4
## Up 36 57
## [1] "Accuracy: 0.6154"
for (name in names(qda_results)) {
print(paste("QDA with:", name))
print(qda_results[[name]]$conf_matrix)
print(paste("Accuracy:", round(qda_results[[name]]$accuracy, 4)))
}
## [1] "QDA with: Lag2"
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
## [1] "Accuracy: 0.5865"
## [1] "QDA with: Lag1_Lag2"
## Actual
## Predicted Down Up
## Down 7 10
## Up 36 51
## [1] "Accuracy: 0.5577"
## [1] "QDA with: Lag1_Lag2_Volume"
## Actual
## Predicted Down Up
## Down 31 44
## Up 12 17
## [1] "Accuracy: 0.4615"
## [1] "QDA with: Lag1_Lag2_Lag3_Volume"
## Actual
## Predicted Down Up
## Down 35 47
## Up 8 14
## [1] "Accuracy: 0.4712"
## [1] "QDA with: Lag1_Lag2_Interaction"
## Actual
## Predicted Down Up
## Down 23 36
## Up 20 25
## [1] "Accuracy: 0.4615"
## [1] "QDA with: Lag2_Quadratic"
## Actual
## Predicted Down Up
## Down 7 3
## Up 36 58
## [1] "Accuracy: 0.625"
#KNN with Different K-values and Predictors
library(class)
evaluate_knn <- function(formula, k_value) {
train_X <- model.matrix(formula, data = train_data)[, -1, drop = FALSE]
test_X <- model.matrix(formula, data = test_data)[, -1, drop = FALSE]
# Ensure both matrices have the same columns
common_cols <- intersect(colnames(train_X), colnames(test_X))
train_X <- as.matrix(train_X[, common_cols, drop = FALSE])
test_X <- as.matrix(test_X[, common_cols, drop = FALSE])
# KNN model
preds <- knn(train_X, test_X, train_data$Direction, k = k_value)
# confusion matrix and accuracy
conf_matrix <- table(Predicted = preds, Actual = test_data$Direction)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
return(list(conf_matrix = conf_matrix, accuracy = accuracy))
}
k_values <- c(1, 5, 10, 20)
# predictor sets and store results
knn_results <- list()
for (k in k_values) {
knn_results[[paste("KNN_K", k, sep = "_")]] <- lapply(
predictor_sets,
function(formula) evaluate_knn(formula, k) # Pass formula and k_value correctly
)
}
for (k in names(knn_results)) {
for (name in names(knn_results[[k]])) {
print(paste(k, "with:", name))
print(knn_results[[k]][[name]]$conf_matrix)
print(paste("Accuracy:", round(knn_results[[k]][[name]]$accuracy, 4)))
}
}
## [1] "KNN_K_1 with: Lag2"
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
## [1] "Accuracy: 0.5"
## [1] "KNN_K_1 with: Lag1_Lag2"
## Actual
## Predicted Down Up
## Down 18 29
## Up 25 32
## [1] "Accuracy: 0.4808"
## [1] "KNN_K_1 with: Lag1_Lag2_Volume"
## Actual
## Predicted Down Up
## Down 26 35
## Up 17 26
## [1] "Accuracy: 0.5"
## [1] "KNN_K_1 with: Lag1_Lag2_Lag3_Volume"
## Actual
## Predicted Down Up
## Down 22 31
## Up 21 30
## [1] "Accuracy: 0.5"
## [1] "KNN_K_1 with: Lag1_Lag2_Interaction"
## Actual
## Predicted Down Up
## Down 18 25
## Up 25 36
## [1] "Accuracy: 0.5192"
## [1] "KNN_K_1 with: Lag2_Quadratic"
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
## [1] "Accuracy: 0.5"
## [1] "KNN_K_5 with: Lag2"
## Actual
## Predicted Down Up
## Down 15 21
## Up 28 40
## [1] "Accuracy: 0.5288"
## [1] "KNN_K_5 with: Lag1_Lag2"
## Actual
## Predicted Down Up
## Down 22 32
## Up 21 29
## [1] "Accuracy: 0.4904"
## [1] "KNN_K_5 with: Lag1_Lag2_Volume"
## Actual
## Predicted Down Up
## Down 33 36
## Up 10 25
## [1] "Accuracy: 0.5577"
## [1] "KNN_K_5 with: Lag1_Lag2_Lag3_Volume"
## Actual
## Predicted Down Up
## Down 23 28
## Up 20 33
## [1] "Accuracy: 0.5385"
## [1] "KNN_K_5 with: Lag1_Lag2_Interaction"
## Actual
## Predicted Down Up
## Down 27 24
## Up 16 37
## [1] "Accuracy: 0.6154"
## [1] "KNN_K_5 with: Lag2_Quadratic"
## Actual
## Predicted Down Up
## Down 16 21
## Up 27 40
## [1] "Accuracy: 0.5385"
## [1] "KNN_K_10 with: Lag2"
## Actual
## Predicted Down Up
## Down 17 19
## Up 26 42
## [1] "Accuracy: 0.5673"
## [1] "KNN_K_10 with: Lag1_Lag2"
## Actual
## Predicted Down Up
## Down 16 29
## Up 27 32
## [1] "Accuracy: 0.4615"
## [1] "KNN_K_10 with: Lag1_Lag2_Volume"
## Actual
## Predicted Down Up
## Down 31 34
## Up 12 27
## [1] "Accuracy: 0.5577"
## [1] "KNN_K_10 with: Lag1_Lag2_Lag3_Volume"
## Actual
## Predicted Down Up
## Down 24 27
## Up 19 34
## [1] "Accuracy: 0.5577"
## [1] "KNN_K_10 with: Lag1_Lag2_Interaction"
## Actual
## Predicted Down Up
## Down 16 26
## Up 27 35
## [1] "Accuracy: 0.4904"
## [1] "KNN_K_10 with: Lag2_Quadratic"
## Actual
## Predicted Down Up
## Down 19 18
## Up 24 43
## [1] "Accuracy: 0.5962"
## [1] "KNN_K_20 with: Lag2"
## Actual
## Predicted Down Up
## Down 19 21
## Up 24 40
## [1] "Accuracy: 0.5673"
## [1] "KNN_K_20 with: Lag1_Lag2"
## Actual
## Predicted Down Up
## Down 18 21
## Up 25 40
## [1] "Accuracy: 0.5577"
## [1] "KNN_K_20 with: Lag1_Lag2_Volume"
## Actual
## Predicted Down Up
## Down 20 35
## Up 23 26
## [1] "Accuracy: 0.4423"
## [1] "KNN_K_20 with: Lag1_Lag2_Lag3_Volume"
## Actual
## Predicted Down Up
## Down 15 24
## Up 28 37
## [1] "Accuracy: 0.5"
## [1] "KNN_K_20 with: Lag1_Lag2_Interaction"
## Actual
## Predicted Down Up
## Down 14 20
## Up 29 41
## [1] "Accuracy: 0.5288"
## [1] "KNN_K_20 with: Lag2_Quadratic"
## Actual
## Predicted Down Up
## Down 21 22
## Up 22 39
## [1] "Accuracy: 0.5769"
Logistic Regression with Lag2 has the best predictive performance. LDA is a good alternative but does not outperform Logistic Regression. KNN, QDA, and Naive Bayes are not suitable for this dataset due to poor performance.
#a
library(ISLR2)
data("Auto")
summary(Auto$mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 17.00 22.75 23.45 29.00 46.60
mpg_median <- median(Auto$mpg)
# Create mpg01: 1 if mpg > median, 0 otherwise
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- as.factor(Auto$mpg01)
table(Auto$mpg01)
##
## 0 1
## 196 196
head(Auto)
The least fuel-efficient car has 9 mpg. 25% of the cars have mpg ≤ 17. The midpoint (50% of cars have mpg ≤ 22.75). Mean (23.45): The average mpg across all cars. 3rd Quartile (29.00): 75% of cars have mpg ≤ 29. Max (46.60): The most fuel-efficient car has 46.6 mpg.
#b
library(ISLR2)
library(ggplot2)
library(gridExtra)
data("Auto")
# ttt6Create binary variable mpg01
mpg_median <- median(Auto$mpg) # Compute median of mpg
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0) # Assign 1 if mpg > median, else 0
Auto$mpg01 <- as.factor(Auto$mpg01) # Convert to factor
# Scatterplots of numerical features vs mpg01
p1 <- ggplot(Auto, aes(x = horsepower, y = as.numeric(mpg01))) +
geom_jitter(alpha = 0.5) + ggtitle("Horsepower vs. mpg01")
p2 <- ggplot(Auto, aes(x = weight, y = as.numeric(mpg01))) +
geom_jitter(alpha = 0.5) + ggtitle("Weight vs. mpg01")
p3 <- ggplot(Auto, aes(x = displacement, y = as.numeric(mpg01))) +
geom_jitter(alpha = 0.5) + ggtitle("Displacement vs. mpg01")
p4 <- ggplot(Auto, aes(x = acceleration, y = as.numeric(mpg01))) +
geom_jitter(alpha = 0.5) + ggtitle("Acceleration vs. mpg01")
# scatterplots
grid.arrange(p1, p2, p3, p4, ncol = 2)
# Boxplots of key features grouped by mpg01
p5 <- ggplot(Auto, aes(x = mpg01, y = horsepower, fill = mpg01)) +
geom_boxplot() + ggtitle("Horsepower vs. mpg01")
p6 <- ggplot(Auto, aes(x = mpg01, y = weight, fill = mpg01)) +
geom_boxplot() + ggtitle("Weight vs. mpg01")
p7 <- ggplot(Auto, aes(x = mpg01, y = displacement, fill = mpg01)) +
geom_boxplot() + ggtitle("Displacement vs. mpg01")
p8 <- ggplot(Auto, aes(x = mpg01, y = acceleration, fill = mpg01)) +
geom_boxplot() + ggtitle("Acceleration vs. mpg01")
# boxplots
grid.arrange(p5, p6, p7, p8, ncol = 2)
# Summary of findings
summary(Auto$mpg01) # Check distribution of mpg01
## 0 1
## 196 196
table(Auto$mpg01) # Count of each category
##
## 0 1
## 196 196
cat("\nKey Findings:\n")
##
## Key Findings:
cat("- Horsepower, Weight, and Displacement are the strongest predictors of mpg01.\n")
## - Horsepower, Weight, and Displacement are the strongest predictors of mpg01.
cat("- Lighter cars, lower horsepower, and lower displacement tend to have high MPG.\n")
## - Lighter cars, lower horsepower, and lower displacement tend to have high MPG.
cat("- Acceleration does not show a strong distinction.\n")
## - Acceleration does not show a strong distinction.
#c
set.seed(42) # For reproducibility
train_index <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto))
# Split data into training and test sets
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]
cat("Training Set Size:", nrow(train_data), "\n")
## Training Set Size: 274
cat("Test Set Size:", nrow(test_data), "\n")
## Test Set Size: 118
Total observations in the dataset = 274 (train) + 118 (test) = 392. 70% of the data (274 rows) was assigned to the training set. 30% of the data (118 rows) was assigned to the test set. The training set (70% of the data) is used to train the machine learning model. The test set (30% of the data) is held out and used to evaluate the model’s performance on unseen data. This ensures generalization—the model isn’t just memorizing the training data but can make accurate predictions on new data. #d
library(ISLR2)
library(MASS) # For LDA
data("Auto")
# binary variable mpg01
mpg_median <- median(Auto$mpg) # Compute median of mpg
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0) # Assign 1 if mpg > median, else 0
Auto$mpg01 <- as.factor(Auto$mpg01) # Convert to factor
# Split data into training and test sets
set.seed(42) # For reproducibility
train_index <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto)) # 70% training
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]
# LDA model using best predictors (horsepower, weight, displacement)
lda_model <- lda(mpg01 ~ horsepower + weight + displacement, data = train_data)
print(lda_model)
## Call:
## lda(mpg01 ~ horsepower + weight + displacement, data = train_data)
##
## Prior probabilities of groups:
## 0 1
## 0.5437956 0.4562044
##
## Group means:
## horsepower weight displacement
## 0 127.8121 3573.557 268.7584
## 1 78.3280 2336.640 116.9720
##
## Coefficients of linear discriminants:
## LD1
## horsepower 0.0007859338
## weight -0.0009948081
## displacement -0.0069209301
# Predictions on Test Data
lda_preds <- predict(lda_model, test_data)
lda_class <- lda_preds$class # Extract predicted class labels
# Compute Confusion Matrix and Test Error
conf_matrix <- table(Predicted = lda_class, Actual = test_data$mpg01)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 41 3
## 1 6 68
test_error <- 1 - sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Test Error Rate:", round(test_error, 4), "\n")
## Test Error Rate: 0.0763
High MPG cars have lower horsepower, weight, and displacement. Weight & displacement are the strongest predictors (negative impact on MPG). 92.37% Accuracy (Test Error: 7.63%) → Strong model performance
#e
library(MASS) # For QDA
qda_model <- qda(mpg01 ~ horsepower + weight + displacement, data = train_data)
print(qda_model)
## Call:
## qda(mpg01 ~ horsepower + weight + displacement, data = train_data)
##
## Prior probabilities of groups:
## 0 1
## 0.5437956 0.4562044
##
## Group means:
## horsepower weight displacement
## 0 127.8121 3573.557 268.7584
## 1 78.3280 2336.640 116.9720
qda_preds <- predict(qda_model, test_data)
qda_class <- qda_preds$class
conf_matrix_qda <- table(Predicted = qda_class, Actual = test_data$mpg01)
print(conf_matrix_qda)
## Actual
## Predicted 0 1
## 0 42 4
## 1 5 67
test_error_qda <- 1 - sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
cat("Test Error Rate (QDA):", round(test_error_qda, 4), "\n")
## Test Error Rate (QDA): 0.0763
High MPG cars have lower horsepower, weight, and displacement. Group means confirm that lighter, lower displacement cars are more fuel-efficient.
#f
library(ISLR2)
logistic_model <- glm(mpg01 ~ horsepower + weight + displacement, data = train_data, family = binomial)
summary(logistic_model)
##
## Call:
## glm(formula = mpg01 ~ horsepower + weight + displacement, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 10.718768 1.727423 6.205 5.47e-10 ***
## horsepower -0.046803 0.014937 -3.133 0.00173 **
## weight -0.001605 0.000748 -2.145 0.03193 *
## displacement -0.011170 0.005978 -1.869 0.06168 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 377.74 on 273 degrees of freedom
## Residual deviance: 158.30 on 270 degrees of freedom
## AIC: 166.3
##
## Number of Fisher Scoring iterations: 7
logistic_probs <- predict(logistic_model, test_data, type = "response")
logistic_class <- ifelse(logistic_probs > 0.5, 1, 0)
logistic_class <- as.factor(logistic_class) # Ensure factor type
conf_matrix_logistic <- table(Predicted = logistic_class, Actual = test_data$mpg01)
print(conf_matrix_logistic)
## Actual
## Predicted 0 1
## 0 43 5
## 1 4 66
test_error_logistic <- 1 - sum(diag(conf_matrix_logistic)) / sum(conf_matrix_logistic)
cat("Test Error Rate (Logistic Regression):", round(test_error_logistic, 4), "\n")
## Test Error Rate (Logistic Regression): 0.0763
Horsepower (-0.0468, p = 0.0017) → Strong negative impact on high MPG. Weight (-0.0016, p = 0.0319) → Significant negative impact. Displacement (-0.0112, p = 0.0617) → Weak predictor (borderline significant). Accuracy: 92.37% (Test Error: 7.63%) → Same as LDA & QDA.
#g
library(e1071)
nb_model <- naiveBayes(mpg01 ~ horsepower + weight + displacement, data = train_data)
nb_preds <- predict(nb_model, test_data)
conf_matrix_nb <- table(Predicted = nb_preds, Actual = test_data$mpg01)
print(conf_matrix_nb)
## Actual
## Predicted 0 1
## 0 42 3
## 1 5 68
test_error_nb <- 1 - sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
cat("Test Error Rate (Naive Bayes):", round(test_error_nb, 4), "\n")
## Test Error Rate (Naive Bayes): 0.0678
Test Error: 6.78% (Accuracy: 93.22%) → Slightly better than LDA, QDA, and Logistic Regression (7.63%).
#h
library(class) # For KNN
train_X <- as.matrix(train_data[, c("horsepower", "weight", "displacement")])
test_X <- as.matrix(test_data[, c("horsepower", "weight", "displacement")])
train_Y <- train_data$mpg01 # Response variable for training
test_Y <- test_data$mpg01 # True labels for testing
k_values <- c(1, 5, 10, 15, 20)
compute_knn_error <- function(k) {
knn_preds <- knn(train_X, test_X, train_Y, k = k) # Fit KNN model
conf_matrix_knn <- table(Predicted = knn_preds, Actual = test_Y) # Confusion matrix
test_error_knn <- 1 - sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn) # Test error rate
cat("Test Error Rate (K =", k, "):", round(test_error_knn, 4), "\n")
return(test_error_knn)
}
knn_errors <- sapply(k_values, compute_knn_error)
## Test Error Rate (K = 1 ): 0.1186
## Test Error Rate (K = 5 ): 0.1271
## Test Error Rate (K = 10 ): 0.1356
## Test Error Rate (K = 15 ): 0.1102
## Test Error Rate (K = 20 ): 0.0847
K = 20 gives the lowest test error (8.47%), making it the best-performing KNN model