library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
#Graphical summaries
hist(Weekly$Lag1, main = "Histogram of Lag1 Weekly Returns", xlab = "Lag1 Returns", col = "skyblue", breaks = 20)
pairs(Weekly, main = "Pairs Plot of Weekly Data")
boxplot(Lag1 ~ Direction, data = Weekly, main = "Boxplot of Lag1 Returns by Direction",
xlab = "Direction", ylab = "Lag1 Returns", col = c("red", "green"))
plot(Weekly$Year, Weekly$Today, type = "l", col = "blue",
main = "Weekly Returns Over Time", xlab = "Year", ylab = "Today")
model_logistic <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(model_logistic)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Based on the output from the logistic regression model, the statistically significant predictor is Lag2 since its p-value is 0.0296 (less than significance level of 0.05). The other predictors (Lag1, Lag3, Lag4, Lag5) have p-values greater than 0.05 so they are not statistically significant.
#Prediction and confusion matrix
predicted_probs <- predict(model_logistic, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, "Up", "Down")
conf_matrix <- table(Predicted = predicted_classes, Actual = Weekly$Direction)
print(conf_matrix)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
#Accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Overall Accuracy: ", round(accuracy, 4)))
## [1] "Overall Accuracy: 0.5611"
From the confusion matrix: True Negatives (TN): The model correctly predicted “Down” 54 times when the actual value was “Down.” True Positives (TP): The model correctly predicted “Up” 557 times when the actual value was “Up.” False Positives (FP): The model incorrectly predicted “Down” 48 times when the actual value was “Up.” False Negatives (FN): The model incorrectly predicted “Up” 430 times when the actual value was “Down.”
#Split data into training and testing data sets
train_data <- Weekly[Weekly$Year <= 2008, ]
test_data <- Weekly[Weekly$Year > 2008, ]
model_train <- glm(Direction ~ Lag2, data = train_data, family = binomial)
summary(model_train)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
#Predict on test data
predicted_probs_test <- predict(model_train, newdata = test_data, type = "response")
predicted_classes_test <- ifelse(predicted_probs_test > 0.5, "Up", "Down")
#Compute confusion matrix
conf_matrix_test <- table(Predicted = predicted_classes_test, Actual = test_data$Direction)
print(conf_matrix_test)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
print(paste("Test Accuracy: ", round(accuracy_test, 4)))
## [1] "Test Accuracy: 0.625"
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
#Using LDA model
lda_model <- lda(Direction ~ Lag2, data = train_data)
print(lda_model)
## Call:
## lda(Direction ~ Lag2, data = train_data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
#Predict on test data
lda_predictions <- predict(lda_model, newdata = test_data)
conf_matrix_lda <- table(Predicted = lda_predictions$class, Actual = test_data$Direction)
print(conf_matrix_lda)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy_lda <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(paste("Test Accuracy (LDA): ", round(accuracy_lda, 4)))
## [1] "Test Accuracy (LDA): 0.625"
#Using QDA model
qda_model <- qda(Direction ~ Lag2, data = train_data)
print(qda_model)
## Call:
## qda(Direction ~ Lag2, data = train_data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
#Predict on test data
qda_predictions <- predict(qda_model, newdata = test_data)
conf_matrix_qda <- table(Predicted = qda_predictions$class, Actual = test_data$Direction)
print(conf_matrix_qda)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
accuracy_qda <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
print(paste("Test Accuracy (QDA): ", round(accuracy_qda, 4)))
## [1] "Test Accuracy (QDA): 0.5865"
library(class)
#Prepare training and testing data
train_X <- as.matrix(train_data$Lag2) # Predictor for training
test_X <- as.matrix(test_data$Lag2) # Predictor for testing
train_Y <- train_data$Direction # Response for training
knn_predictions <- knn(train = train_X, test = test_X, cl = train_Y, k = 1)
conf_matrix_knn <- table(Predicted = knn_predictions, Actual = test_data$Direction)
print(conf_matrix_knn)
## Actual
## Predicted Down Up
## Down 21 29
## Up 22 32
accuracy_knn <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
print(paste("Test Accuracy (k-NN, k=1): ", round(accuracy_knn, 4)))
## [1] "Test Accuracy (k-NN, k=1): 0.5096"
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
#Using Naive Bayes model
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_predictions <- predict(nb_model, newdata = test_data)
conf_matrix_nb <- table(Predicted = nb_predictions, Actual = test_data$Direction)
print(conf_matrix_nb)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
accuracy_nb <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(paste("Test Accuracy (Naive Bayes): ", round(accuracy_nb, 4)))
## [1] "Test Accuracy (Naive Bayes): 0.5865"
Base on the test accuracy results, the best methods are logistics regression model and LDA model.
Experimenting with different combinations of predictors, transformations, interactions, and methods.
#Logistics regression
logistic_model_alt <- glm(Direction ~ Lag2 + poly(Lag2, 2) + Lag1 * Lag2,
data = train_data, family = binomial)
predicted_probabilities_alt <- predict(logistic_model_alt, newdata = test_data, type = "response")
predicted_classes_alt <- ifelse(predicted_probabilities_alt > 0.5, "Up", "Down")
confusion_matrix_logistic_alt <- table(Prediction = predicted_classes_alt, Actual = test_data$Direction)
print(confusion_matrix_logistic_alt)
## Actual
## Prediction Down Up
## Down 6 9
## Up 37 52
accuracy_logistic_alt <- sum(diag(confusion_matrix_logistic_alt)) / sum(confusion_matrix_logistic_alt)
print(paste("Alternative Logistic Regression Accuracy: ", round(accuracy_logistic_alt, 4)))
## [1] "Alternative Logistic Regression Accuracy: 0.5577"
#LDA
lda_model_alt <- lda(Direction ~ Lag2 + Lag2 * Volume, data = train_data)
lda_predictions_alt <- predict(lda_model_alt, newdata = test_data)
confusion_matrix_lda_alt <- table(Prediction = lda_predictions_alt$class, Actual = test_data$Direction)
print(confusion_matrix_lda_alt)
## Actual
## Prediction Down Up
## Down 20 25
## Up 23 36
accuracy_lda_alt <- sum(diag(confusion_matrix_lda_alt)) / sum(confusion_matrix_lda_alt)
print(paste("Alternative LDA Accuracy: ", round(accuracy_lda_alt, 4)))
## [1] "Alternative LDA Accuracy: 0.5385"
#QDA
qda_model_alt <- qda(Direction ~ Lag2 + I(Lag2^2), data = train_data)
qda_predictions_alt <- predict(qda_model_alt, newdata = test_data)
confusion_matrix_qda_alt <- table(Prediction = qda_predictions_alt$class, Actual = test_data$Direction)
print(confusion_matrix_qda_alt)
## Actual
## Prediction Down Up
## Down 7 3
## Up 36 58
accuracy_qda_alt <- sum(diag(confusion_matrix_qda_alt)) / sum(confusion_matrix_qda_alt)
print(paste("Alternative QDA Accuracy: ", round(accuracy_qda_alt, 4)))
## [1] "Alternative QDA Accuracy: 0.625"
#KNN
training_features_alt <- scale(as.matrix(train_data$Lag2))
test_features_alt <- scale(as.matrix(test_data$Lag2))
training_labels_alt <- train_data$Direction
for (neighbors in c(1, 3, 5, 7)) {
knn_predictions_alt <- knn(train = training_features_alt,
test = test_features_alt,
cl = training_labels_alt,
k = neighbors)
confusion_matrix_knn_alt <- table(Prediction = knn_predictions_alt, Actual = test_data$Direction)
print(paste("Confusion Matrix for k =", neighbors))
print(confusion_matrix_knn_alt)
accuracy_knn_alt <- sum(diag(confusion_matrix_knn_alt)) / sum(confusion_matrix_knn_alt)
print(paste("Alternative k-NN Accuracy for k =", neighbors, ": ", round(accuracy_knn_alt, 4)))
}
## [1] "Confusion Matrix for k = 1"
## Actual
## Prediction Down Up
## Down 14 25
## Up 29 36
## [1] "Alternative k-NN Accuracy for k = 1 : 0.4808"
## [1] "Confusion Matrix for k = 3"
## Actual
## Prediction Down Up
## Down 11 25
## Up 32 36
## [1] "Alternative k-NN Accuracy for k = 3 : 0.4519"
## [1] "Confusion Matrix for k = 5"
## Actual
## Prediction Down Up
## Down 12 25
## Up 31 36
## [1] "Alternative k-NN Accuracy for k = 5 : 0.4615"
## [1] "Confusion Matrix for k = 7"
## Actual
## Prediction Down Up
## Down 16 25
## Up 27 36
## [1] "Alternative k-NN Accuracy for k = 7 : 0.5"
#Naive Bayes
train_data$Lag2_Volume <- train_data$Lag2 * train_data$Volume
test_data$Lag2_Volume <- test_data$Lag2 * test_data$Volume
nb_model_corrected <- naiveBayes(Direction ~ Lag2 + Lag2_Volume, data = train_data)
nb_predictions_corrected <- predict(nb_model_corrected, newdata = test_data)
conf_matrix_nb_corrected <- table(Predicted = nb_predictions_corrected, Actual = test_data$Direction)
print(conf_matrix_nb_corrected)
## Actual
## Predicted Down Up
## Down 19 28
## Up 24 33
accuracy_nb_corrected <- sum(diag(conf_matrix_nb_corrected)) / sum(conf_matrix_nb_corrected)
print(paste("Corrected Naive Bayes Accuracy: ", round(accuracy_nb_corrected, 4)))
## [1] "Corrected Naive Bayes Accuracy: 0.5"
data("Auto")
head(Auto) # View the first few rows of the dataset
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
#Compute median of mpg
mpg_median <- median(Auto$mpg)
print(mpg_median)
## [1] 22.75
#Create mpg01
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
#Combine mpg01 with original dataset
head(Auto) # Check the new column
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
# Summary of new variable
table(Auto$mpg01) # Counts of 0s and 1s
##
## 0 1
## 196 196
#Scatterplots
par(mfrow = c(2, 2)) # Set up a grid for multiple plots
plot(Auto$horsepower, Auto$mpg, col = Auto$mpg01 + 1, pch = 16,
xlab = "Horsepower", ylab = "MPG", main = "Horsepower vs MPG (mpg01)")
legend("topright", legend = c("Low MPG", "High MPG"), col = c(1, 2), pch = 16)
plot(Auto$displacement, Auto$mpg, col = Auto$mpg01 + 1, pch = 16,
xlab = "Displacement", ylab = "MPG", main = "Displacement vs MPG (mpg01)")
plot(Auto$weight, Auto$mpg, col = Auto$mpg01 + 1, pch = 16,
xlab = "Weight", ylab = "MPG", main = "Weight vs MPG (mpg01)")
plot(Auto$acceleration, Auto$mpg, col = Auto$mpg01 + 1, pch = 16,
xlab = "Acceleration", ylab = "MPG", main = "Acceleration vs MPG (mpg01)")
#Boxplots
par(mfrow = c(2, 2)) # Set up a grid for multiple plots
boxplot(horsepower ~ mpg01, data = Auto, col = c("red", "green"),
main = "Horsepower by MPG Category", xlab = "MPG01", ylab = "Horsepower")
boxplot(weight ~ mpg01, data = Auto, col = c("red", "green"),
main = "Weight by MPG Category", xlab = "MPG01", ylab = "Weight")
boxplot(displacement ~ mpg01, data = Auto, col = c("red", "green"),
main = "Displacement by MPG Category", xlab = "MPG01", ylab = "Displacement")
boxplot(acceleration ~ mpg01, data = Auto, col = c("red", "green"),
main = "Acceleration by MPG Category", xlab = "MPG01", ylab = "Acceleration")
Based on the scatterplots: Horsepower, Weight, and Displacement are the features that seem most strongly associated with mpg01. Acceleration may have a weaker relationship, so it might be less useful on its own.
From the boxplots, horsepower, weight, and displacement stand out as the most promising predictors of mpg01, as their distributions exhibit clear differences between the high and low MPG categories.
#Split data into training and test sets
set.seed(123) # For reproducibility
train_indices_auto <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto)) # 70% training
train_set_auto <- Auto[train_indices_auto, ] # Training set
test_set_auto <- Auto[-train_indices_auto, ] # Test set
#Verify the split
nrow(train_set_auto) # Number of rows in the training set
## [1] 274
nrow(test_set_auto) # Number of rows in the test set
## [1] 118
#Fit LDA model
lda_model_auto <- lda(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)
print(lda_model_auto)
## Call:
## lda(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)
##
## Prior probabilities of groups:
## 0 1
## 0.4963504 0.5036496
##
## Group means:
## horsepower weight displacement
## 0 130.96324 3641.022 275.2941
## 1 78.00725 2314.000 114.5290
##
## Coefficients of linear discriminants:
## LD1
## horsepower 0.005992754
## weight -0.001059313
## displacement -0.008641675
#Make predictions on test data
lda_predictions_auto <- predict(lda_model_auto, newdata = test_set_auto)
#Confusion matrix and accuracy
conf_matrix_lda_auto <- table(Predicted = lda_predictions_auto$class, Actual = test_set_auto$mpg01)
print(conf_matrix_lda_auto)
## Actual
## Predicted 0 1
## 0 49 3
## 1 11 55
accuracy_lda_auto <- sum(diag(conf_matrix_lda_auto)) / sum(conf_matrix_lda_auto)
print(paste("LDA Accuracy: ", round(accuracy_lda_auto, 4)))
## [1] "LDA Accuracy: 0.8814"
# Calculate the test error rate
test_error_lda_auto <- 1 - accuracy_lda_auto
print(paste("Test Error (LDA): ", round(test_error_lda_auto, 4)))
## [1] "Test Error (LDA): 0.1186"
#Fit QDA model
qda_model_auto <- qda(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)
print(qda_model_auto)
## Call:
## qda(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)
##
## Prior probabilities of groups:
## 0 1
## 0.4963504 0.5036496
##
## Group means:
## horsepower weight displacement
## 0 130.96324 3641.022 275.2941
## 1 78.00725 2314.000 114.5290
qda_predictions_auto <- predict(qda_model_auto, newdata = test_set_auto)
#Confusion matrix
conf_matrix_qda_auto <- table(Predicted = qda_predictions_auto$class, Actual = test_set_auto$mpg01)
print(conf_matrix_qda_auto)
## Actual
## Predicted 0 1
## 0 53 5
## 1 7 53
# Accuracy
accuracy_qda_auto <- sum(diag(conf_matrix_qda_auto)) / sum(conf_matrix_qda_auto)
print(paste("QDA Accuracy: ", round(accuracy_qda_auto, 4)))
## [1] "QDA Accuracy: 0.8983"
# Test Error
test_error_qda_auto <- 1 - accuracy_qda_auto
print(paste("Test Error (QDA): ", round(test_error_qda_auto, 4)))
## [1] "Test Error (QDA): 0.1017"
#Fit logistics regression model
logistic_model_auto <- glm(mpg01 ~ horsepower + weight + displacement,
data = train_set_auto, family = binomial)
summary(logistic_model_auto)
##
## Call:
## glm(formula = mpg01 ~ horsepower + weight + displacement, family = binomial,
## data = train_set_auto)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.0492510 2.0054314 6.008 1.87e-09 ***
## horsepower -0.0421016 0.0164145 -2.565 0.0103 *
## weight -0.0020496 0.0008526 -2.404 0.0162 *
## displacement -0.0131390 0.0064437 -2.039 0.0414 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 379.83 on 273 degrees of freedom
## Residual deviance: 138.49 on 270 degrees of freedom
## AIC: 146.49
##
## Number of Fisher Scoring iterations: 7
#Prediction on test data
logistic_predictions_probs <- predict(logistic_model_auto, newdata = test_set_auto, type = "response")
logistic_predictions_probs <- predict(logistic_model_auto, newdata = test_set_auto, type = "response")
logistic_predictions_classes <- ifelse(logistic_predictions_probs > 0.5, 1, 0)
#Confusion matrix
conf_matrix_logistic_auto <- table(Predicted = logistic_predictions_classes, Actual = test_set_auto$mpg01)
print(conf_matrix_logistic_auto)
## Actual
## Predicted 0 1
## 0 53 6
## 1 7 52
#Test error calculation
# Accuracy
accuracy_logistic_auto <- sum(diag(conf_matrix_logistic_auto)) / sum(conf_matrix_logistic_auto)
print(paste("Logistic Regression Accuracy: ", round(accuracy_logistic_auto, 4)))
## [1] "Logistic Regression Accuracy: 0.8898"
# Test Error
test_error_logistic_auto <- 1 - accuracy_logistic_auto
print(paste("Test Error (Logistic Regression): ", round(test_error_logistic_auto, 4)))
## [1] "Test Error (Logistic Regression): 0.1102"
#Fit the Naive Bayes model
naive_bayes_model_auto <- naiveBayes(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)
#Predictions on test data
nb_predictions_auto <- predict(naive_bayes_model_auto, newdata = test_set_auto)
#Confusion matrix
conf_matrix_nb_auto <- table(Predicted = nb_predictions_auto, Actual = test_set_auto$mpg01)
print(conf_matrix_nb_auto)
## Actual
## Predicted 0 1
## 0 51 4
## 1 9 54
#Test error calculation
# Accuracy
accuracy_nb_auto <- sum(diag(conf_matrix_nb_auto)) / sum(conf_matrix_nb_auto)
print(paste("Naive Bayes Accuracy: ", round(accuracy_nb_auto, 4)))
## [1] "Naive Bayes Accuracy: 0.8898"
# Test Error
test_error_nb_auto <- 1 - accuracy_nb_auto
print(paste("Test Error (Naive Bayes): ", round(test_error_nb_auto, 4)))
## [1] "Test Error (Naive Bayes): 0.1102"
#Fit KNN model
# Standardize predictors
train_features_knn <- scale(train_set_auto[, c("horsepower", "weight", "displacement")])
test_features_knn <- scale(test_set_auto[, c("horsepower", "weight", "displacement")])
# Response variable
train_labels_knn <- train_set_auto$mpg01
#Apply KNN for different values of k
# Loop through different values of k
for (k in c(1, 3, 5, 7, 9)) {
# Perform k-NN
knn_predictions_auto <- knn(train = train_features_knn,
test = test_features_knn,
cl = train_labels_knn,
k = k)
# Confusion matrix
conf_matrix_knn_auto <- table(Predicted = knn_predictions_auto, Actual = test_set_auto$mpg01)
print(paste("Confusion Matrix for k =", k))
print(conf_matrix_knn_auto)
# Accuracy
accuracy_knn_auto <- sum(diag(conf_matrix_knn_auto)) / sum(conf_matrix_knn_auto)
print(paste("Accuracy for k =", k, ": ", round(accuracy_knn_auto, 4)))
# Test error
test_error_knn_auto <- 1 - accuracy_knn_auto
print(paste("Test Error for k =", k, ": ", round(test_error_knn_auto, 4)))
}
## [1] "Confusion Matrix for k = 1"
## Actual
## Predicted 0 1
## 0 52 9
## 1 8 49
## [1] "Accuracy for k = 1 : 0.8559"
## [1] "Test Error for k = 1 : 0.1441"
## [1] "Confusion Matrix for k = 3"
## Actual
## Predicted 0 1
## 0 53 10
## 1 7 48
## [1] "Accuracy for k = 3 : 0.8559"
## [1] "Test Error for k = 3 : 0.1441"
## [1] "Confusion Matrix for k = 5"
## Actual
## Predicted 0 1
## 0 51 8
## 1 9 50
## [1] "Accuracy for k = 5 : 0.8559"
## [1] "Test Error for k = 5 : 0.1441"
## [1] "Confusion Matrix for k = 7"
## Actual
## Predicted 0 1
## 0 52 6
## 1 8 52
## [1] "Accuracy for k = 7 : 0.8814"
## [1] "Test Error for k = 7 : 0.1186"
## [1] "Confusion Matrix for k = 9"
## Actual
## Predicted 0 1
## 0 52 5
## 1 8 53
## [1] "Accuracy for k = 9 : 0.8898"
## [1] "Test Error for k = 9 : 0.1102"
k=9 seems to perform the best for this model.
#Load data
data("Boston")
head(Boston) # Preview the first few rows
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
summary(Boston) # Summary of the dataset
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
#Create response variable
# Compute the median of the crime rate
crime_median <- median(Boston$crim)
# Create binary response variable
Boston$crime_above_median <- ifelse(Boston$crim > crime_median, 1, 0)
#Split data into training and test sets (70/30)
set.seed(123) # For reproducibility
train_indices_boston <- sample(1:nrow(Boston), size = 0.7 * nrow(Boston))
train_set_boston <- Boston[train_indices_boston, ] # Training set
test_set_boston <- Boston[-train_indices_boston, ] # Test set
#Logistics Regression
# Fit the logistic regression model
logistic_model_boston <- glm(crime_above_median ~ lstat + age + indus,
data = train_set_boston, family = binomial)
# Make predictions on the test set
logistic_probs_boston <- predict(logistic_model_boston, newdata = test_set_boston, type = "response")
logistic_predictions_boston <- ifelse(logistic_probs_boston > 0.5, 1, 0)
# Confusion matrix and accuracy
conf_matrix_logistic_boston <- table(Predicted = logistic_predictions_boston, Actual = test_set_boston$crime_above_median)
accuracy_logistic_boston <- sum(diag(conf_matrix_logistic_boston)) / sum(conf_matrix_logistic_boston)
print("Logistic Regression Results:")
## [1] "Logistic Regression Results:"
print(conf_matrix_logistic_boston)
## Actual
## Predicted 0 1
## 0 59 13
## 1 16 64
print(paste("Accuracy: ", round(accuracy_logistic_boston, 4)))
## [1] "Accuracy: 0.8092"
#LDA
# Fit the LDA model
lda_model_boston <- lda(crime_above_median ~ lstat + age + indus, data = train_set_boston)
# Make predictions on the test set
lda_predictions_boston <- predict(lda_model_boston, newdata = test_set_boston)
conf_matrix_lda_boston <- table(Predicted = lda_predictions_boston$class, Actual = test_set_boston$crime_above_median)
accuracy_lda_boston <- sum(diag(conf_matrix_lda_boston)) / sum(conf_matrix_lda_boston)
print("LDA Results:")
## [1] "LDA Results:"
print(conf_matrix_lda_boston)
## Actual
## Predicted 0 1
## 0 58 14
## 1 17 63
print(paste("Accuracy: ", round(accuracy_lda_boston, 4)))
## [1] "Accuracy: 0.7961"
#Naive Bayes model
# Fit the Naive Bayes model
naive_bayes_model_boston <- naiveBayes(crime_above_median ~ lstat + age + indus, data = train_set_boston)
# Make predictions on the test set
naive_bayes_predictions_boston <- predict(naive_bayes_model_boston, newdata = test_set_boston)
conf_matrix_naive_bayes_boston <- table(Predicted = naive_bayes_predictions_boston, Actual = test_set_boston$crime_above_median)
accuracy_naive_bayes_boston <- sum(diag(conf_matrix_naive_bayes_boston)) / sum(conf_matrix_naive_bayes_boston)
print("Naive Bayes Results:")
## [1] "Naive Bayes Results:"
print(conf_matrix_naive_bayes_boston)
## Actual
## Predicted 0 1
## 0 59 18
## 1 16 59
print(paste("Accuracy: ", round(accuracy_naive_bayes_boston, 4)))
## [1] "Accuracy: 0.7763"
#KNN model
# Standardize predictors
train_features_boston <- scale(train_set_boston[, c("lstat", "age", "indus")])
test_features_boston <- scale(test_set_boston[, c("lstat", "age", "indus")])
train_labels_boston <- train_set_boston$crime_above_median
# Loop over different values of k
for (k in c(1, 3, 5, 7, 9)) {
knn_predictions_boston <- knn(train = train_features_boston,
test = test_features_boston,
cl = train_labels_boston,
k = k)
conf_matrix_knn_boston <- table(Predicted = knn_predictions_boston, Actual = test_set_boston$crime_above_median)
accuracy_knn_boston <- sum(diag(conf_matrix_knn_boston)) / sum(conf_matrix_knn_boston)
print(paste("k-NN Results for k =", k))
print(conf_matrix_knn_boston)
print(paste("Accuracy: ", round(accuracy_knn_boston, 4)))
}
## [1] "k-NN Results for k = 1"
## Actual
## Predicted 0 1
## 0 65 15
## 1 10 62
## [1] "Accuracy: 0.8355"
## [1] "k-NN Results for k = 3"
## Actual
## Predicted 0 1
## 0 64 15
## 1 11 62
## [1] "Accuracy: 0.8289"
## [1] "k-NN Results for k = 5"
## Actual
## Predicted 0 1
## 0 63 18
## 1 12 59
## [1] "Accuracy: 0.8026"
## [1] "k-NN Results for k = 7"
## Actual
## Predicted 0 1
## 0 66 16
## 1 9 61
## [1] "Accuracy: 0.8355"
## [1] "k-NN Results for k = 9"
## Actual
## Predicted 0 1
## 0 65 15
## 1 10 62
## [1] "Accuracy: 0.8355"
Findings: k-NN with k = 1, 3, 9 competitive accuracy (around 83.55%), performing slightly better than logistic regression (accuracy 79.61%) and naive Bayes (accuracy 77.63%).
LDA achieves decent performance with an accuracy of 79.61%, but k-NN outperforms it for certain values of k.