# Load necessary libraries
library(ISLR2)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the Weekly dataset
data("Weekly")
# View the first few rows
head(Weekly)
# Summary statistics of the dataset
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Check for missing values
sum(is.na(Weekly))
## [1] 0
# Correlation matrix (excluding categorical variable "Direction")
cor(Weekly %>% select(-Direction))
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
# Frequency distribution of Direction (Up/Down)
table(Weekly$Direction)
##
## Down Up
## 484 605
# Histogram of weekly returns
ggplot(Weekly, aes(x = Lag1)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.5) +
labs(title = "Distribution of Lag1 Returns", x = "Lag1", y = "Count")
# Boxplot of Volume over the years
ggplot(Weekly, aes(x = as.factor(Year), y = Volume)) +
geom_boxplot(fill = "red", alpha = 0.5) +
labs(title = "Volume Distribution Over the Years", x = "Year", y = "Volume") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Time series plot of Weekly returns
ggplot(Weekly, aes(x = Year, y = Today)) +
geom_line(color = "blue") +
labs(title = "Weekly Returns Over Time", x = "Year", y = "Today")
# Scatter plot of Lag1 vs Today with Direction as color
ggplot(Weekly, aes(x = Lag1, y = Today, color = Direction)) +
geom_point(alpha = 0.6) +
labs(title = "Lag1 vs Today Returns", x = "Lag1", y = "Today")
# Load necessary libraries
library(ISLR2)
# Load the Weekly dataset
data("Weekly")
# Perform logistic regression
logistic_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
# Print summary of the model
summary(logistic_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
I performed logistic regression to predict Direction (Up/Down) from the five lag variables (Lag1 through Lag5) and Volume, using glm() with family = binomial. The summary() output reports each coefficient with its standard error, z-value, and p-value. At the 0.05 level, only the intercept and Lag2 are statistically significant (Lag2: estimate 0.058, p ≈ 0.030); the remaining predictors have large p-values and show little evidence of predictive power for market direction.
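As a quick programmatic check (a minimal sketch reusing the logistic_model object fitted above), the coefficient table can be filtered for terms significant at the 0.05 level:
# Extract the coefficient matrix from the model summary
coef_table <- coef(summary(logistic_model))
# Keep rows whose p-value column "Pr(>|z|)" falls below 0.05
print(coef_table[coef_table[, "Pr(>|z|)"] < 0.05, , drop = FALSE])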
# Predict probabilities using the logistic model
predicted_probs <- predict(logistic_model, type = "response")
# Convert probabilities to class labels (threshold = 0.5)
predicted_direction <- ifelse(predicted_probs > 0.5, "Up", "Down")
# Create confusion matrix
conf_matrix <- table(Predicted = predicted_direction, Actual = Weekly$Direction)
print(conf_matrix)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
# Compute accuracy
accuracy <- mean(predicted_direction == Weekly$Direction)
print(paste("Overall Accuracy:", round(accuracy, 4)))
## [1] "Overall Accuracy: 0.5611"
I used the fitted model to predict Direction on the full dataset and built a confusion matrix of predicted vs. actual values. The diagonal entries are correct predictions and the off-diagonal entries are misclassifications. The overall accuracy is 56.11%, but the matrix reveals a strong bias: the model predicts "Up" for 987 of the 1,089 weeks, correctly identifying 557 of 605 actual "Up" weeks (92.1%) but only 54 of 484 "Down" weeks (11.2%). Since 55.6% of all weeks are "Up", the model barely beats the naive strategy of always predicting "Up".
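The bias is easy to quantify from the confusion matrix itself; here is a minimal sketch reusing the conf_matrix object built above (rows are predictions, columns are actual values):
# Fraction of actual "Up" weeks predicted correctly (sensitivity)
sensitivity <- conf_matrix["Up", "Up"] / sum(conf_matrix[, "Up"])
# Fraction of actual "Down" weeks predicted correctly (specificity)
specificity <- conf_matrix["Down", "Down"] / sum(conf_matrix[, "Down"])
print(c(Sensitivity = sensitivity, Specificity = specificity)) # roughly 0.92 and 0.11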
# Split data into training (1990-2008) and testing (2009-2010)
train_data <- subset(Weekly, Year < 2009)
test_data <- subset(Weekly, Year >= 2009)
# Fit logistic regression model using only Lag2 as a predictor
logistic_model_train <- glm(Direction ~ Lag2, data = train_data, family = binomial)
# Predict on test data
predicted_probs_test <- predict(logistic_model_train, newdata = test_data, type = "response")
# Convert probabilities to class labels (threshold = 0.5)
predicted_direction_test <- ifelse(predicted_probs_test > 0.5, "Up", "Down")
# Create confusion matrix
conf_matrix_test <- table(Predicted = predicted_direction_test, Actual = test_data$Direction)
print(conf_matrix_test)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
# Compute accuracy
accuracy_test <- mean(predicted_direction_test == test_data$Direction)
print(paste("Overall Accuracy on Test Data:", round(accuracy_test, 4)))
## [1] "Overall Accuracy on Test Data: 0.625"
I trained a logistic regression model on the 1990-2008 data with Lag2 as the only predictor, then evaluated it on the held-out 2009-2010 data. The test confusion matrix shows the model still leans heavily toward "Up" (90 of 104 predictions), but its 62.5% accuracy beats both the in-sample accuracy of the full model (56.11%) and the test base rate of 58.7% "Up" weeks, so the simpler Lag2-only model generalizes somewhat better to unseen data.
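As a sanity check, a minimal sketch (reusing the test_data split from above) compares against the naive strategy of always predicting "Up":
# Accuracy of always predicting "Up" over 2009-2010
baseline_acc <- mean(test_data$Direction == "Up")
print(paste("Baseline (always Up):", round(baseline_acc, 4))) # ~0.5865, below the model's 0.625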
# Load necessary library
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
# Fit LDA model using only Lag2 as a predictor
lda_model <- lda(Direction ~ Lag2, data = train_data)
# Predict on test data
lda_predictions <- predict(lda_model, newdata = test_data)
# Extract class predictions
predicted_lda_direction <- lda_predictions$class
# Create confusion matrix
conf_matrix_lda <- table(Predicted = predicted_lda_direction, Actual = test_data$Direction)
print(conf_matrix_lda)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
# Compute accuracy
accuracy_lda <- mean(predicted_lda_direction == test_data$Direction)
print(paste("Overall Accuracy on Test Data (LDA):", round(accuracy_lda, 4)))
## [1] "Overall Accuracy on Test Data (LDA): 0.625"
I trained a Linear Discriminant Analysis (LDA) model with Lag2 as the only predictor on the 1990-2008 data and tested it on 2009-2010. LDA produces exactly the same confusion matrix and 62.5% accuracy as the Lag2-only logistic regression, which is unsurprising: with a single predictor, both methods fit very similar linear decision boundaries, so neither has an advantage here.
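The agreement can be verified directly; a minimal sketch comparing the two prediction vectors from the previous chunks:
# Fraction of test weeks where LDA and logistic regression predict the same class
agreement <- mean(as.character(predicted_lda_direction) == predicted_direction_test)
print(paste("LDA vs. logistic agreement:", agreement)) # 1 here: identical predictions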
# Load necessary library
library(MASS)
# Fit QDA model using only Lag2 as a predictor
qda_model <- qda(Direction ~ Lag2, data = train_data)
# Predict on test data
qda_predictions <- predict(qda_model, newdata = test_data)
# Extract class predictions
predicted_qda_direction <- qda_predictions$class
# Create confusion matrix
conf_matrix_qda <- table(Predicted = predicted_qda_direction, Actual = test_data$Direction)
print(conf_matrix_qda)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
# Compute accuracy
accuracy_qda <- mean(predicted_qda_direction == test_data$Direction)
print(paste("Overall Accuracy on Test Data (QDA):", round(accuracy_qda, 4)))
## [1] "Overall Accuracy on Test Data (QDA): 0.5865"
I trained a Quadratic Discriminant Analysis (QDA) model using Lag2 as the only predictor on the 1990-2008 data and tested it on 2009-2010. The confusion matrix shows that QDA predicts "Up" for every test week, so its 58.65% accuracy is simply the proportion of "Up" weeks in the test set. Despite appearing competitive on accuracy alone, the model has no real discriminating power here.
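The all-"Up" behavior shows up in the posterior probabilities; a minimal sketch inspecting the qda_predictions object from above:
# Posterior probability of "Up" for each test week
up_posterior <- qda_predictions$posterior[, "Up"]
# If the minimum exceeds 0.5, QDA never predicts "Down"
print(paste("Min P(Up):", round(min(up_posterior), 4)))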
# Load necessary library
library(class)
# Define predictors and response for training and test sets
train_X <- train_data$Lag2
test_X <- test_data$Lag2
train_Y <- train_data$Direction
# Convert to matrix format for KNN
train_X <- matrix(train_X, ncol = 1)
test_X <- matrix(test_X, ncol = 1)
# Perform KNN classification with K = 1
knn_pred <- knn(train = train_X, test = test_X, cl = train_Y, k = 1)
# Create confusion matrix
conf_matrix_knn <- table(Predicted = knn_pred, Actual = test_data$Direction)
print(conf_matrix_knn)
## Actual
## Predicted Down Up
## Down 21 29
## Up 22 32
# Compute accuracy
accuracy_knn <- mean(knn_pred == test_data$Direction)
print(paste("Overall Accuracy on Test Data (KNN, K=1):", round(accuracy_knn, 4)))
## [1] "Overall Accuracy on Test Data (KNN, K=1): 0.5096"
I used K-Nearest Neighbors (KNN) with K = 1 to classify market direction from Lag2, training on the 1990-2008 data and testing on 2009-2010. The accuracy of 50.96% is essentially a coin flip: with K = 1, each prediction copies the single nearest training observation, which overfits the noise in a weak predictor like Lag2.
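Larger neighborhoods smooth out that noise; a minimal sketch (reusing the train_X, test_X, and train_Y objects defined above) tries a few alternatives:
# Re-run KNN on Lag2 with larger K; knn() breaks distance ties randomly, so fix a seed
set.seed(1)
for (k in c(3, 5, 10)) {
  pred_k <- knn(train = train_X, test = test_X, cl = train_Y, k = k)
  print(paste("K =", k, "accuracy:", round(mean(pred_k == test_data$Direction), 4)))
}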
# Load necessary library
library(e1071)
# Fit Naïve Bayes model using only Lag2 as a predictor
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
# Predict on test data
nb_predictions <- predict(nb_model, newdata = test_data)
# Create confusion matrix
conf_matrix_nb <- table(Predicted = nb_predictions, Actual = test_data$Direction)
print(conf_matrix_nb)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
# Compute accuracy
accuracy_nb <- mean(nb_predictions == test_data$Direction)
print(paste("Overall Accuracy on Test Data (Naïve Bayes):", round(accuracy_nb, 4)))
## [1] "Overall Accuracy on Test Data (Naïve Bayes): 0.5865"
I trained a Naïve Bayes model using Lag2 as the only predictor on the 1990-2008 data and tested it on 2009-2010. Like QDA, it predicts "Up" for every test week, giving the same 58.65% accuracy. With a single predictor this is expected: both methods estimate a class-specific Gaussian for Lag2, and since the two class distributions overlap almost completely, the larger "Up" prior decides every prediction.
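The fitted parameters make this behavior transparent; a minimal sketch inspecting the nb_model object (e1071 stores per-class Gaussian parameters in $tables and class priors in $apriori):
# Per-class mean and standard deviation of Lag2
print(nb_model$tables$Lag2)
# Class priors; the larger "Up" prior dominates when the Gaussians overlap heavily
print(nb_model$apriori)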
I select the method with the highest test accuracy as the best; when models tie, I weigh interpretability, robustness, and consistency. Here, logistic regression and LDA tie at 62.5% and make identical predictions, which points to a linear decision boundary being sufficient for this data. QDA and Naïve Bayes reach 58.65% only by predicting "Up" every week, and KNN with K = 1 (50.96%) is essentially random, so there is no evidence that a non-linear boundary helps. On these results, I would choose logistic regression or LDA.
To search for a better model, I will:
1. Try different predictor combinations – using multiple lags, Volume, and interaction terms.
2. Apply transformations – such as polynomial terms and log transformations.
3. Tune KNN – testing multiple values of K (e.g., K = 3, 5, 10, 15).
4. Compare models – using confusion matrices and accuracy on test data, as implemented below.
# Load necessary libraries
library(ISLR2)
library(MASS)
library(class)
library(e1071)
# Generate new predictor combinations
train_data$Lag1_Lag2 <- train_data$Lag1 * train_data$Lag2
test_data$Lag1_Lag2 <- test_data$Lag1 * test_data$Lag2
train_data$Lag2_squared <- train_data$Lag2^2
test_data$Lag2_squared <- test_data$Lag2^2
train_data$Log_Volume <- log(train_data$Volume + 1)
test_data$Log_Volume <- log(test_data$Volume + 1)
# ---------------- Logistic Regression with New Predictors ----------------
logistic_model_exp <- glm(Direction ~ Lag1 + Lag2 + Lag1_Lag2 + Log_Volume,
data = train_data, family = binomial)
pred_logistic <- predict(logistic_model_exp, newdata = test_data, type = "response")
pred_logistic_class <- ifelse(pred_logistic > 0.5, "Up", "Down")
conf_matrix_logistic <- table(Predicted = pred_logistic_class, Actual = test_data$Direction)
logistic_acc <- mean(pred_logistic_class == test_data$Direction)
# ---------------- LDA with New Predictors ----------------
lda_model_exp <- lda(Direction ~ Lag1 + Lag2 + Lag1_Lag2 + Log_Volume, data = train_data)
pred_lda <- predict(lda_model_exp, newdata = test_data)$class
conf_matrix_lda <- table(Predicted = pred_lda, Actual = test_data$Direction)
lda_acc <- mean(pred_lda == test_data$Direction)
# ---------------- QDA with New Predictors ----------------
qda_model_exp <- qda(Direction ~ Lag1 + Lag2 + Lag1_Lag2 + Log_Volume, data = train_data)
pred_qda <- predict(qda_model_exp, newdata = test_data)$class
conf_matrix_qda <- table(Predicted = pred_qda, Actual = test_data$Direction)
qda_acc <- mean(pred_qda == test_data$Direction)
# ---------------- KNN with Tuned K ----------------
train_X <- train_data[, c("Lag1", "Lag2", "Lag1_Lag2", "Log_Volume")]
test_X <- test_data[, c("Lag1", "Lag2", "Lag1_Lag2", "Log_Volume")]
train_Y <- train_data$Direction
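# Note: unlike the later Auto analysis, these predictors are left on their original
# scales; KNN is distance-based, so standardizing them (e.g. with scale()) is a
# common refinement and could change these results.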
knn_acc_results <- c()
for (k in c(3, 5, 10, 15)) {
pred_knn <- knn(train = train_X, test = test_X, cl = train_Y, k = k)
acc_knn <- mean(pred_knn == test_data$Direction)
knn_acc_results <- c(knn_acc_results, acc_knn)
}
# ---------------- Naïve Bayes with New Predictors ----------------
nb_model_exp <- naiveBayes(Direction ~ Lag1 + Lag2 + Lag1_Lag2 + Log_Volume, data = train_data)
pred_nb <- predict(nb_model_exp, newdata = test_data)
conf_matrix_nb <- table(Predicted = pred_nb, Actual = test_data$Direction)
nb_acc <- mean(pred_nb == test_data$Direction)
# ---------------- Compare Accuracy Scores ----------------
accuracy_results <- data.frame(
Model = c("Logistic Regression", "LDA", "QDA", "Naïve Bayes", "KNN (Best K)"),
Accuracy = c(logistic_acc, lda_acc, qda_acc, nb_acc, max(knn_acc_results))
)
print("Confusion Matrices:")
## [1] "Confusion Matrices:"
print("Logistic Regression"); print(conf_matrix_logistic)
## [1] "Logistic Regression"
## Actual
## Predicted Down Up
## Down 24 24
## Up 19 37
print("LDA"); print(conf_matrix_lda)
## [1] "LDA"
## Actual
## Predicted Down Up
## Down 24 23
## Up 19 38
print("QDA"); print(conf_matrix_qda)
## [1] "QDA"
## Actual
## Predicted Down Up
## Down 24 37
## Up 19 24
print("Naïve Bayes"); print(conf_matrix_nb)
## [1] "Naïve Bayes"
## Actual
## Predicted Down Up
## Down 34 48
## Up 9 13
print("Best K in KNN and Accuracy:"); print(knn_acc_results)
## [1] "Best K in KNN and Accuracy:"
## [1] 0.5192308 0.5288462 0.5673077 0.4230769
print("Model Comparison:")
## [1] "Model Comparison:"
print(accuracy_results)
## Model Accuracy
## 1 Logistic Regression 0.5865385
## 2 LDA 0.5961538
## 3 QDA 0.4615385
## 4 Naïve Bayes 0.4519231
## 5 KNN (Best K) 0.5673077
I experimented with new predictor combinations: an interaction term (Lag1 * Lag2), a quadratic term (Lag2^2), and a log-transformed volume (log(Volume + 1)). The models above use Lag1, Lag2, the interaction, and Log_Volume as predictors; I also tuned KNN over K = 3, 5, 10, and 15, keeping the best-performing value (K = 10).
Comparing confusion matrices and accuracies on the 2009-2010 test data, LDA comes out on top (59.6%), followed by logistic regression (58.7%) and KNN (56.7%), with QDA (46.2%) and Naïve Bayes (45.2%) far behind. Notably, none of these beats the 62.5% of the simple Lag2-only models, so the extra features did not improve generalization on this test period.
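For readability, a minimal sketch ranks the accuracy_results data frame built above:
# Rank the models from best to worst test accuracy
print(accuracy_results[order(-accuracy_results$Accuracy), ])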
# Load necessary library
library(ISLR2)
# Load the Auto dataset
data("Auto")
# Create binary variable mpg01 (1 if mpg > median, 0 otherwise)
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
# Convert mpg01 to a factor (for classification)
Auto$mpg01 <- as.factor(Auto$mpg01)
# View summary
summary(Auto$mpg01)
## 0 1
## 196 196
# View the first few rows
head(Auto)
I created a new binary variable mpg01 that is 1 if a car's mpg is above the median and 0 otherwise, using median() to find the cutoff and ifelse() to assign the labels. Converting mpg01 to a factor ensures the classification functions treat it as a class label rather than a number. summary() confirms a perfectly balanced split (196 cars in each class), and head() shows the new column.
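A quick sanity check (a minimal sketch reusing mpg_median from above) confirms the labeling:
# Every car labeled "1" must have mpg strictly above the median
stopifnot(all((Auto$mpg > mpg_median) == (Auto$mpg01 == "1")))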
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(GGally) # For pair plots
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Boxplots for numerical features grouped by mpg01
features <- c("displacement", "horsepower", "weight", "acceleration")
for (feature in features) {
print(
ggplot(Auto, aes(x = mpg01, y = .data[[feature]], fill = mpg01)) +
geom_boxplot(alpha = 0.6) +
labs(title = paste("Boxplot of", feature, "by mpg01"),
x = "mpg01 (0 = Low, 1 = High)", y = feature) +
theme_minimal()
)
}
# Scatterplot matrix for continuous variables colored by mpg01
ggpairs(Auto, columns = c("mpg01", "displacement", "horsepower", "weight", "acceleration"),
aes(color = mpg01, alpha = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
I explored how mpg01 relates to the other features using boxplots and a scatterplot matrix. The boxplots show that displacement, horsepower, and weight tend to be markedly lower for high-mpg (mpg01 = 1) cars, while the difference in acceleration is less pronounced. The scatterplot matrix confirms the same pattern: cars with mpg01 = 1 generally have lower horsepower, displacement, and weight, suggesting these three variables are the most useful predictors.
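The visual impressions can be backed up numerically; a minimal sketch on the same Auto data:
# Correlation of mpg with the candidate predictors; strongly negative values
# for displacement, horsepower, and weight support the boxplot reading
print(cor(Auto$mpg, Auto[, c("displacement", "horsepower", "weight", "acceleration")]))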
# Randomly sample indices for the training set
set.seed(1) # Set seed for reproducibility
train_indices <- sample(1:nrow(Auto), nrow(Auto) * 0.7) # 70% training data
# Create training and test sets
train_data <- Auto[train_indices, ]
test_data <- Auto[-train_indices, ]
# Check dimensions
print(dim(train_data)) # Should be ~70% of the dataset
## [1] 274 10
print(dim(test_data)) # Should be ~30% of the dataset
## [1] 118 10
I split the Auto dataset into training (70%) and test (30%) sets using sample(). The set.seed(1) ensures reproducibility, so I get the same split each time. The training set is used to build models, and the test set evaluates their performance.
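Because the split is random, it is worth confirming that both classes stay roughly balanced; a minimal sketch on the split created above:
# Class proportions of mpg01 in the training and test sets
print(prop.table(table(train_data$mpg01)))
print(prop.table(table(test_data$mpg01)))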
# Load necessary library
library(MASS)
# Fit LDA model using selected predictors (from part b)
lda_model <- lda(mpg01 ~ displacement + horsepower + weight, data = train_data)
# Predict on test data
lda_predictions <- predict(lda_model, newdata = test_data)
# Extract class predictions
predicted_lda_class <- lda_predictions$class
# Create confusion matrix
conf_matrix_lda <- table(Predicted = predicted_lda_class, Actual = test_data$mpg01)
print(conf_matrix_lda)
## Actual
## Predicted 0 1
## 0 47 1
## 1 14 56
# Compute test error rate
test_error_lda <- mean(predicted_lda_class != test_data$mpg01)
print(paste("Test Error Rate (LDA):", round(test_error_lda, 4)))
## [1] "Test Error Rate (LDA): 0.1271"
I performed Linear Discriminant Analysis (LDA) using displacement, horsepower, and weight as predictors, as they were most associated with mpg01. I trained the model on the training set and tested it on the test set. The confusion matrix shows how well LDA classified cars into high or low mpg. The test error rate (misclassification rate) tells me how often LDA made incorrect predictions. A lower test error means a better model.
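Equivalently, the error rate can be read off the confusion matrix as one minus the diagonal fraction; a minimal sketch reusing conf_matrix_lda:
# Misclassification rate = 1 - (correct predictions / total predictions)
err_from_matrix <- 1 - sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(round(err_from_matrix, 4)) # matches test_error_lda (0.1271)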
# Load necessary library
library(MASS)
# Fit QDA model using selected predictors (from part b)
qda_model <- qda(mpg01 ~ displacement + horsepower + weight, data = train_data)
# Predict on test data
qda_predictions <- predict(qda_model, newdata = test_data)
# Extract class predictions
predicted_qda_class <- qda_predictions$class
# Create confusion matrix
conf_matrix_qda <- table(Predicted = predicted_qda_class, Actual = test_data$mpg01)
print(conf_matrix_qda)
## Actual
## Predicted 0 1
## 0 51 3
## 1 10 54
# Compute test error rate
test_error_qda <- mean(predicted_qda_class != test_data$mpg01)
print(paste("Test Error Rate (QDA):", round(test_error_qda, 4)))
## [1] "Test Error Rate (QDA): 0.1102"
I performed Quadratic Discriminant Analysis (QDA) with the same three predictors, trained on the training set and evaluated on the test set. The confusion matrix shows the classification performance, and the test error of 11.02% is slightly lower than LDA's 12.71%, suggesting that a mildly non-linear decision boundary fits this data a little better.
# Fit Logistic Regression model using selected predictors (from part b)
logistic_model <- glm(mpg01 ~ displacement + horsepower + weight,
data = train_data, family = binomial)
# Predict probabilities on test data
predicted_probs <- predict(logistic_model, newdata = test_data, type = "response")
# Convert probabilities to class labels (threshold = 0.5)
predicted_logistic_class <- ifelse(predicted_probs > 0.5, 1, 0)
# Create confusion matrix
conf_matrix_logistic <- table(Predicted = predicted_logistic_class, Actual = test_data$mpg01)
print(conf_matrix_logistic)
## Actual
## Predicted 0 1
## 0 53 3
## 1 8 54
# Compute test error rate
test_error_logistic <- mean(predicted_logistic_class != test_data$mpg01)
print(paste("Test Error Rate (Logistic Regression):", round(test_error_logistic, 4)))
## [1] "Test Error Rate (Logistic Regression): 0.0932"
I performed logistic regression with displacement, horsepower, and weight as predictors, training on the training set and testing on the test set. I converted the predicted probabilities to class labels with a 0.5 threshold. At 9.32%, its test error is the lowest so far, beating both LDA (12.71%) and QDA (11.02%) on this split.
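Since logistic regression outputs probabilities, the 0.5 cutoff is a modeling choice rather than a requirement; a minimal sketch (reusing predicted_probs from above) varies the threshold to see how the test error responds:
# Test error at several classification thresholds
for (thresh in c(0.3, 0.5, 0.7)) {
  pred_class <- ifelse(predicted_probs > thresh, 1, 0)
  print(paste("Threshold", thresh, "error:", round(mean(pred_class != test_data$mpg01), 4)))
}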
# Load necessary library
library(e1071)
# Fit Naïve Bayes model using selected predictors (from part b)
nb_model <- naiveBayes(mpg01 ~ displacement + horsepower + weight, data = train_data)
# Predict on test data
nb_predictions <- predict(nb_model, newdata = test_data)
# Create confusion matrix
conf_matrix_nb <- table(Predicted = nb_predictions, Actual = test_data$mpg01)
print(conf_matrix_nb)
## Actual
## Predicted 0 1
## 0 50 2
## 1 11 55
# Compute test error rate
test_error_nb <- mean(nb_predictions != test_data$mpg01)
print(paste("Test Error Rate (Naïve Bayes):", round(test_error_nb, 4)))
## [1] "Test Error Rate (Naïve Bayes): 0.1102"
I trained a Naïve Bayes model with the same three predictors and evaluated it on the test set. Its test error of 11.02% ties QDA and sits between logistic regression (9.32%) and LDA (12.71%), so the conditional-independence assumption costs little here even though the predictors are correlated.
# Load necessary library
library(class)
# Define predictors and response for training and test sets
train_X <- train_data[, c("displacement", "horsepower", "weight")]
test_X <- test_data[, c("displacement", "horsepower", "weight")]
train_Y <- train_data$mpg01
# Standardize the predictors (KNN is sensitive to scale)
train_X <- scale(train_X)
test_X <- scale(test_X)
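# Caveat: scale() here standardizes the test set with its own mean/sd; strictly,
# the test set should reuse the training parameters, e.g.:
# test_X <- scale(test_X, center = attr(train_X, "scaled:center"),
#                 scale = attr(train_X, "scaled:scale"))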
# Try different values of K
k_values <- c(1, 3, 5, 7, 10, 15, 20)
knn_test_errors <- c()
for (k in k_values) {
knn_pred <- knn(train = train_X, test = test_X, cl = train_Y, k = k)
test_error_knn <- mean(knn_pred != test_data$mpg01)
knn_test_errors <- c(knn_test_errors, test_error_knn)
print(paste("Test Error Rate for K =", k, ":", round(test_error_knn, 4)))
}
## [1] "Test Error Rate for K = 1 : 0.1441"
## [1] "Test Error Rate for K = 3 : 0.1441"
## [1] "Test Error Rate for K = 5 : 0.1102"
## [1] "Test Error Rate for K = 7 : 0.1186"
## [1] "Test Error Rate for K = 10 : 0.1186"
## [1] "Test Error Rate for K = 15 : 0.1271"
## [1] "Test Error Rate for K = 20 : 0.1186"
# Find the best K (lowest test error)
best_k <- k_values[which.min(knn_test_errors)]
print(paste("Best K:", best_k, "with Test Error:", round(min(knn_test_errors), 4)))
## [1] "Best K: 5 with Test Error: 0.1102"
I applied K-Nearest Neighbors (KNN) to predict mpg01 from displacement, horsepower, and weight. Since KNN is distance-based, I standardized the predictors first, then tested K = 1, 3, 5, 7, 10, 15, and 20 and computed the test error for each. The lowest test error, 11.02%, occurs at K = 5. Putting everything together, logistic regression (9.32% test error) performs best on this split, with QDA, Naïve Bayes, and KNN (K = 5) tied just behind at 11.02%, and LDA last at 12.71%.
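To make that comparison explicit, a minimal sketch collects the test errors computed in the previous parts into one table:
# Gather every classifier's test error on the same 70/30 split
final_comparison <- data.frame(
  Model = c("LDA", "QDA", "Logistic Regression", "Naive Bayes",
            paste0("KNN (K = ", best_k, ")")),
  TestError = c(test_error_lda, test_error_qda, test_error_logistic,
                test_error_nb, min(knn_test_errors))
)
print(final_comparison[order(final_comparison$TestError), ]) # logistic regression lowest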