Problem 13

a.

library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.4.2

data("Weekly")

summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

#Graphical summaries

hist(Weekly$Lag1, main = "Histogram of Lag1 Weekly Returns", xlab = "Lag1 Returns", col = "skyblue", breaks = 20)

pairs(Weekly, main = "Pairs Plot of Weekly Data")

boxplot(Lag1 ~ Direction, data = Weekly, main = "Boxplot of Lag1 Returns by Direction", 
        xlab = "Direction", ylab = "Lag1 Returns", col = c("red", "green"))

plot(Weekly$Year, Weekly$Today, type = "l", col = "blue", 
     main = "Weekly Returns Over Time", xlab = "Year", ylab = "Today")

b.

model_logistic <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, 
                      data = Weekly, family = binomial)
summary(model_logistic)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Based on the output from the logistic regression model, the statistically significant predictor is Lag2 since its p-value is 0.0296 (less than significance level of 0.05).
The other predictors (Lag1, Lag3, Lag4, Lag5) have p-values greater than 0.05 so they are not statistically significant.

c.

#Prediction and confusion matrix
predicted_probs <- predict(model_logistic, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, "Up", "Down")

conf_matrix <- table(Predicted = predicted_classes, Actual = Weekly$Direction)
print(conf_matrix)

##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557

#Accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Overall Accuracy: ", round(accuracy, 4)))

## [1] "Overall Accuracy:  0.5611"

From the confusion matrix:
True Negatives (TN): The model correctly predicted “Down” 54 times when the actual value was “Down.”
True Positives (TP): The model correctly predicted “Up” 557 times when the actual value was “Up.”
False Positives (FP): The model incorrectly predicted “Down” 48 times when the actual value was “Up.”
False Negatives (FN): The model incorrectly predicted “Up” 430 times when the actual value was “Down.”

d.

#Split data into training and testing data sets
train_data <- Weekly[Weekly$Year <= 2008, ]
test_data <- Weekly[Weekly$Year > 2008, ]

model_train <- glm(Direction ~ Lag2, data = train_data, family = binomial)

summary(model_train)

## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4

#Predict on test data
predicted_probs_test <- predict(model_train, newdata = test_data, type = "response")
predicted_classes_test <- ifelse(predicted_probs_test > 0.5, "Up", "Down")

#Compute confusion matrix
conf_matrix_test <- table(Predicted = predicted_classes_test, Actual = test_data$Direction)
print(conf_matrix_test)

##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56

accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
print(paste("Test Accuracy: ", round(accuracy_test, 4)))

## [1] "Test Accuracy:  0.625"

e.

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ISLR2':
## 
##     Boston

#Using LDA model
lda_model <- lda(Direction ~ Lag2, data = train_data)
print(lda_model)

## Call:
## lda(Direction ~ Lag2, data = train_data)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162

#Predict on test data
lda_predictions <- predict(lda_model, newdata = test_data)

conf_matrix_lda <- table(Predicted = lda_predictions$class, Actual = test_data$Direction)
print(conf_matrix_lda)

##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56

accuracy_lda <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(paste("Test Accuracy (LDA): ", round(accuracy_lda, 4)))

## [1] "Test Accuracy (LDA):  0.625"

f.

#Using QDA model
qda_model <- qda(Direction ~ Lag2, data = train_data)
print(qda_model)

## Call:
## qda(Direction ~ Lag2, data = train_data)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581

#Predict on test data
qda_predictions <- predict(qda_model, newdata = test_data)

conf_matrix_qda <- table(Predicted = qda_predictions$class, Actual = test_data$Direction)
print(conf_matrix_qda)

##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61

accuracy_qda <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
print(paste("Test Accuracy (QDA): ", round(accuracy_qda, 4)))

## [1] "Test Accuracy (QDA):  0.5865"

g.

library(class)

#Prepare training and testing data

train_X <- as.matrix(train_data$Lag2)  # Predictor for training
test_X <- as.matrix(test_data$Lag2)   # Predictor for testing

train_Y <- train_data$Direction       # Response for training

knn_predictions <- knn(train = train_X, test = test_X, cl = train_Y, k = 1)

conf_matrix_knn <- table(Predicted = knn_predictions, Actual = test_data$Direction)
print(conf_matrix_knn)

##          Actual
## Predicted Down Up
##      Down   21 29
##      Up     22 32

accuracy_knn <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
print(paste("Test Accuracy (k-NN, k=1): ", round(accuracy_knn, 4)))

## [1] "Test Accuracy (k-NN, k=1):  0.5096"

h.

library(e1071)

## Warning: package 'e1071' was built under R version 4.4.2

#Using Naive Bayes model
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)

nb_predictions <- predict(nb_model, newdata = test_data)

conf_matrix_nb <- table(Predicted = nb_predictions, Actual = test_data$Direction)
print(conf_matrix_nb)

##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61

accuracy_nb <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(paste("Test Accuracy (Naive Bayes): ", round(accuracy_nb, 4)))

## [1] "Test Accuracy (Naive Bayes):  0.5865"

i.

Base on the test accuracy results, the best methods are logistics regression model and LDA model.

j.

Experimenting with different combinations of predictors, transformations, interactions, and methods.

#Logistics regression
logistic_model_alt <- glm(Direction ~ Lag2 + poly(Lag2, 2) + Lag1 * Lag2, 
                          data = train_data, family = binomial)
predicted_probabilities_alt <- predict(logistic_model_alt, newdata = test_data, type = "response")
predicted_classes_alt <- ifelse(predicted_probabilities_alt > 0.5, "Up", "Down")
confusion_matrix_logistic_alt <- table(Prediction = predicted_classes_alt, Actual = test_data$Direction)
print(confusion_matrix_logistic_alt)

##           Actual
## Prediction Down Up
##       Down    6  9
##       Up     37 52

accuracy_logistic_alt <- sum(diag(confusion_matrix_logistic_alt)) / sum(confusion_matrix_logistic_alt)
print(paste("Alternative Logistic Regression Accuracy: ", round(accuracy_logistic_alt, 4)))

## [1] "Alternative Logistic Regression Accuracy:  0.5577"

#LDA
lda_model_alt <- lda(Direction ~ Lag2 + Lag2 * Volume, data = train_data)
lda_predictions_alt <- predict(lda_model_alt, newdata = test_data)
confusion_matrix_lda_alt <- table(Prediction = lda_predictions_alt$class, Actual = test_data$Direction)
print(confusion_matrix_lda_alt)

##           Actual
## Prediction Down Up
##       Down   20 25
##       Up     23 36

accuracy_lda_alt <- sum(diag(confusion_matrix_lda_alt)) / sum(confusion_matrix_lda_alt)
print(paste("Alternative LDA Accuracy: ", round(accuracy_lda_alt, 4)))

## [1] "Alternative LDA Accuracy:  0.5385"

#QDA
qda_model_alt <- qda(Direction ~ Lag2 + I(Lag2^2), data = train_data)
qda_predictions_alt <- predict(qda_model_alt, newdata = test_data)
confusion_matrix_qda_alt <- table(Prediction = qda_predictions_alt$class, Actual = test_data$Direction)
print(confusion_matrix_qda_alt)

##           Actual
## Prediction Down Up
##       Down    7  3
##       Up     36 58

accuracy_qda_alt <- sum(diag(confusion_matrix_qda_alt)) / sum(confusion_matrix_qda_alt)
print(paste("Alternative QDA Accuracy: ", round(accuracy_qda_alt, 4)))

## [1] "Alternative QDA Accuracy:  0.625"

#KNN
training_features_alt <- scale(as.matrix(train_data$Lag2))
test_features_alt <- scale(as.matrix(test_data$Lag2))
training_labels_alt <- train_data$Direction

for (neighbors in c(1, 3, 5, 7)) {
    knn_predictions_alt <- knn(train = training_features_alt, 
                               test = test_features_alt, 
                               cl = training_labels_alt, 
                               k = neighbors)
    confusion_matrix_knn_alt <- table(Prediction = knn_predictions_alt, Actual = test_data$Direction)
    print(paste("Confusion Matrix for k =", neighbors))
    print(confusion_matrix_knn_alt)
    accuracy_knn_alt <- sum(diag(confusion_matrix_knn_alt)) / sum(confusion_matrix_knn_alt)
    print(paste("Alternative k-NN Accuracy for k =", neighbors, ": ", round(accuracy_knn_alt, 4)))
}

## [1] "Confusion Matrix for k = 1"
##           Actual
## Prediction Down Up
##       Down   14 25
##       Up     29 36
## [1] "Alternative k-NN Accuracy for k = 1 :  0.4808"
## [1] "Confusion Matrix for k = 3"
##           Actual
## Prediction Down Up
##       Down   11 25
##       Up     32 36
## [1] "Alternative k-NN Accuracy for k = 3 :  0.4519"
## [1] "Confusion Matrix for k = 5"
##           Actual
## Prediction Down Up
##       Down   12 25
##       Up     31 36
## [1] "Alternative k-NN Accuracy for k = 5 :  0.4615"
## [1] "Confusion Matrix for k = 7"
##           Actual
## Prediction Down Up
##       Down   16 25
##       Up     27 36
## [1] "Alternative k-NN Accuracy for k = 7 :  0.5"

#Naive Bayes
train_data$Lag2_Volume <- train_data$Lag2 * train_data$Volume
test_data$Lag2_Volume <- test_data$Lag2 * test_data$Volume

nb_model_corrected <- naiveBayes(Direction ~ Lag2 + Lag2_Volume, data = train_data)

nb_predictions_corrected <- predict(nb_model_corrected, newdata = test_data)

conf_matrix_nb_corrected <- table(Predicted = nb_predictions_corrected, Actual = test_data$Direction)
print(conf_matrix_nb_corrected)

##          Actual
## Predicted Down Up
##      Down   19 28
##      Up     24 33

accuracy_nb_corrected <- sum(diag(conf_matrix_nb_corrected)) / sum(conf_matrix_nb_corrected)
print(paste("Corrected Naive Bayes Accuracy: ", round(accuracy_nb_corrected, 4)))

## [1] "Corrected Naive Bayes Accuracy:  0.5"

Problem 14.

a.

data("Auto")
head(Auto)  # View the first few rows of the dataset

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

#Compute median of mpg
mpg_median <- median(Auto$mpg)
print(mpg_median)

## [1] 22.75

#Create mpg01

Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)

#Combine mpg01 with original dataset
head(Auto)  # Check the new column

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name mpg01
## 1 chevrolet chevelle malibu     0
## 2         buick skylark 320     0
## 3        plymouth satellite     0
## 4             amc rebel sst     0
## 5               ford torino     0
## 6          ford galaxie 500     0

# Summary of new variable

table(Auto$mpg01)  # Counts of 0s and 1s

## 
##   0   1 
## 196 196

b.

#Scatterplots
par(mfrow = c(2, 2))  # Set up a grid for multiple plots

plot(Auto$horsepower, Auto$mpg, col = Auto$mpg01 + 1, pch = 16,
     xlab = "Horsepower", ylab = "MPG", main = "Horsepower vs MPG (mpg01)")
legend("topright", legend = c("Low MPG", "High MPG"), col = c(1, 2), pch = 16)

plot(Auto$displacement, Auto$mpg, col = Auto$mpg01 + 1, pch = 16,
     xlab = "Displacement", ylab = "MPG", main = "Displacement vs MPG (mpg01)")

plot(Auto$weight, Auto$mpg, col = Auto$mpg01 + 1, pch = 16,
     xlab = "Weight", ylab = "MPG", main = "Weight vs MPG (mpg01)")

plot(Auto$acceleration, Auto$mpg, col = Auto$mpg01 + 1, pch = 16,
     xlab = "Acceleration", ylab = "MPG", main = "Acceleration vs MPG (mpg01)")

#Boxplots
par(mfrow = c(2, 2))  # Set up a grid for multiple plots

boxplot(horsepower ~ mpg01, data = Auto, col = c("red", "green"),
        main = "Horsepower by MPG Category", xlab = "MPG01", ylab = "Horsepower")

boxplot(weight ~ mpg01, data = Auto, col = c("red", "green"),
        main = "Weight by MPG Category", xlab = "MPG01", ylab = "Weight")

boxplot(displacement ~ mpg01, data = Auto, col = c("red", "green"),
        main = "Displacement by MPG Category", xlab = "MPG01", ylab = "Displacement")

boxplot(acceleration ~ mpg01, data = Auto, col = c("red", "green"),
        main = "Acceleration by MPG Category", xlab = "MPG01", ylab = "Acceleration")

Based on the scatterplots:
Horsepower, Weight, and Displacement are the features that seem most strongly associated with mpg01. Acceleration may have a weaker relationship, so it might be less useful on its own.

From the boxplots, horsepower, weight, and displacement stand out as the most promising predictors of mpg01, as their distributions exhibit clear differences between the high and low MPG categories.

c.

#Split data into training and test sets
set.seed(123)  # For reproducibility

train_indices_auto <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto))  # 70% training
train_set_auto <- Auto[train_indices_auto, ]  # Training set
test_set_auto <- Auto[-train_indices_auto, ]  # Test set

#Verify the split
nrow(train_set_auto)  # Number of rows in the training set

## [1] 274

nrow(test_set_auto)   # Number of rows in the test set

## [1] 118

d.

#Fit LDA model
lda_model_auto <- lda(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)

print(lda_model_auto)

## Call:
## lda(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4963504 0.5036496 
## 
## Group means:
##   horsepower   weight displacement
## 0  130.96324 3641.022     275.2941
## 1   78.00725 2314.000     114.5290
## 
## Coefficients of linear discriminants:
##                       LD1
## horsepower    0.005992754
## weight       -0.001059313
## displacement -0.008641675

#Make predictions on test data
lda_predictions_auto <- predict(lda_model_auto, newdata = test_set_auto)

#Confusion matrix and accuracy
conf_matrix_lda_auto <- table(Predicted = lda_predictions_auto$class, Actual = test_set_auto$mpg01)
print(conf_matrix_lda_auto)

##          Actual
## Predicted  0  1
##         0 49  3
##         1 11 55

accuracy_lda_auto <- sum(diag(conf_matrix_lda_auto)) / sum(conf_matrix_lda_auto)
print(paste("LDA Accuracy: ", round(accuracy_lda_auto, 4)))

## [1] "LDA Accuracy:  0.8814"

# Calculate the test error rate
test_error_lda_auto <- 1 - accuracy_lda_auto
print(paste("Test Error (LDA): ", round(test_error_lda_auto, 4)))

## [1] "Test Error (LDA):  0.1186"

e.

#Fit QDA model
qda_model_auto <- qda(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)

print(qda_model_auto)

## Call:
## qda(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4963504 0.5036496 
## 
## Group means:
##   horsepower   weight displacement
## 0  130.96324 3641.022     275.2941
## 1   78.00725 2314.000     114.5290

qda_predictions_auto <- predict(qda_model_auto, newdata = test_set_auto)

#Confusion matrix
conf_matrix_qda_auto <- table(Predicted = qda_predictions_auto$class, Actual = test_set_auto$mpg01)
print(conf_matrix_qda_auto)

##          Actual
## Predicted  0  1
##         0 53  5
##         1  7 53

# Accuracy
accuracy_qda_auto <- sum(diag(conf_matrix_qda_auto)) / sum(conf_matrix_qda_auto)
print(paste("QDA Accuracy: ", round(accuracy_qda_auto, 4)))

## [1] "QDA Accuracy:  0.8983"

# Test Error
test_error_qda_auto <- 1 - accuracy_qda_auto
print(paste("Test Error (QDA): ", round(test_error_qda_auto, 4)))

## [1] "Test Error (QDA):  0.1017"

f.

#Fit logistics regression model
logistic_model_auto <- glm(mpg01 ~ horsepower + weight + displacement, 
                           data = train_set_auto, family = binomial)

summary(logistic_model_auto)

## 
## Call:
## glm(formula = mpg01 ~ horsepower + weight + displacement, family = binomial, 
##     data = train_set_auto)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  12.0492510  2.0054314   6.008 1.87e-09 ***
## horsepower   -0.0421016  0.0164145  -2.565   0.0103 *  
## weight       -0.0020496  0.0008526  -2.404   0.0162 *  
## displacement -0.0131390  0.0064437  -2.039   0.0414 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 379.83  on 273  degrees of freedom
## Residual deviance: 138.49  on 270  degrees of freedom
## AIC: 146.49
## 
## Number of Fisher Scoring iterations: 7

#Prediction on test data
logistic_predictions_probs <- predict(logistic_model_auto, newdata = test_set_auto, type = "response")

logistic_predictions_probs <- predict(logistic_model_auto, newdata = test_set_auto, type = "response")

logistic_predictions_classes <- ifelse(logistic_predictions_probs > 0.5, 1, 0)

#Confusion matrix
conf_matrix_logistic_auto <- table(Predicted = logistic_predictions_classes, Actual = test_set_auto$mpg01)
print(conf_matrix_logistic_auto)

##          Actual
## Predicted  0  1
##         0 53  6
##         1  7 52

#Test error calculation
# Accuracy
accuracy_logistic_auto <- sum(diag(conf_matrix_logistic_auto)) / sum(conf_matrix_logistic_auto)
print(paste("Logistic Regression Accuracy: ", round(accuracy_logistic_auto, 4)))

## [1] "Logistic Regression Accuracy:  0.8898"

# Test Error
test_error_logistic_auto <- 1 - accuracy_logistic_auto
print(paste("Test Error (Logistic Regression): ", round(test_error_logistic_auto, 4)))

## [1] "Test Error (Logistic Regression):  0.1102"

g.

#Fit the Naive Bayes model

naive_bayes_model_auto <- naiveBayes(mpg01 ~ horsepower + weight + displacement, data = train_set_auto)

#Predictions on test data
nb_predictions_auto <- predict(naive_bayes_model_auto, newdata = test_set_auto)

#Confusion matrix
conf_matrix_nb_auto <- table(Predicted = nb_predictions_auto, Actual = test_set_auto$mpg01)
print(conf_matrix_nb_auto)

##          Actual
## Predicted  0  1
##         0 51  4
##         1  9 54

#Test error calculation
# Accuracy
accuracy_nb_auto <- sum(diag(conf_matrix_nb_auto)) / sum(conf_matrix_nb_auto)
print(paste("Naive Bayes Accuracy: ", round(accuracy_nb_auto, 4)))

## [1] "Naive Bayes Accuracy:  0.8898"

# Test Error
test_error_nb_auto <- 1 - accuracy_nb_auto
print(paste("Test Error (Naive Bayes): ", round(test_error_nb_auto, 4)))

## [1] "Test Error (Naive Bayes):  0.1102"

h.

#Fit KNN model
# Standardize predictors
train_features_knn <- scale(train_set_auto[, c("horsepower", "weight", "displacement")])
test_features_knn <- scale(test_set_auto[, c("horsepower", "weight", "displacement")])

# Response variable
train_labels_knn <- train_set_auto$mpg01

#Apply KNN for different values of k

# Loop through different values of k
for (k in c(1, 3, 5, 7, 9)) {
    # Perform k-NN
    knn_predictions_auto <- knn(train = train_features_knn, 
                                 test = test_features_knn, 
                                 cl = train_labels_knn, 
                                 k = k)
    
    # Confusion matrix
    conf_matrix_knn_auto <- table(Predicted = knn_predictions_auto, Actual = test_set_auto$mpg01)
    print(paste("Confusion Matrix for k =", k))
    print(conf_matrix_knn_auto)
    
    # Accuracy
    accuracy_knn_auto <- sum(diag(conf_matrix_knn_auto)) / sum(conf_matrix_knn_auto)
    print(paste("Accuracy for k =", k, ": ", round(accuracy_knn_auto, 4)))
    
    # Test error
    test_error_knn_auto <- 1 - accuracy_knn_auto
    print(paste("Test Error for k =", k, ": ", round(test_error_knn_auto, 4)))
}

## [1] "Confusion Matrix for k = 1"
##          Actual
## Predicted  0  1
##         0 52  9
##         1  8 49
## [1] "Accuracy for k = 1 :  0.8559"
## [1] "Test Error for k = 1 :  0.1441"
## [1] "Confusion Matrix for k = 3"
##          Actual
## Predicted  0  1
##         0 53 10
##         1  7 48
## [1] "Accuracy for k = 3 :  0.8559"
## [1] "Test Error for k = 3 :  0.1441"
## [1] "Confusion Matrix for k = 5"
##          Actual
## Predicted  0  1
##         0 51  8
##         1  9 50
## [1] "Accuracy for k = 5 :  0.8559"
## [1] "Test Error for k = 5 :  0.1441"
## [1] "Confusion Matrix for k = 7"
##          Actual
## Predicted  0  1
##         0 52  6
##         1  8 52
## [1] "Accuracy for k = 7 :  0.8814"
## [1] "Test Error for k = 7 :  0.1186"
## [1] "Confusion Matrix for k = 9"
##          Actual
## Predicted  0  1
##         0 52  5
##         1  8 53
## [1] "Accuracy for k = 9 :  0.8898"
## [1] "Test Error for k = 9 :  0.1102"

k=9 seems to perform the best for this model.

Problem 16.

#Load data

data("Boston")
head(Boston)  # Preview the first few rows

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

summary(Boston)  # Summary of the dataset

##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00

#Create response variable
# Compute the median of the crime rate
crime_median <- median(Boston$crim)

# Create binary response variable
Boston$crime_above_median <- ifelse(Boston$crim > crime_median, 1, 0)

#Split data into training and test sets (70/30)
set.seed(123)  # For reproducibility
train_indices_boston <- sample(1:nrow(Boston), size = 0.7 * nrow(Boston))
train_set_boston <- Boston[train_indices_boston, ]  # Training set
test_set_boston <- Boston[-train_indices_boston, ]  # Test set

#Logistics Regression

# Fit the logistic regression model
logistic_model_boston <- glm(crime_above_median ~ lstat + age + indus, 
                             data = train_set_boston, family = binomial)

# Make predictions on the test set
logistic_probs_boston <- predict(logistic_model_boston, newdata = test_set_boston, type = "response")
logistic_predictions_boston <- ifelse(logistic_probs_boston > 0.5, 1, 0)

# Confusion matrix and accuracy
conf_matrix_logistic_boston <- table(Predicted = logistic_predictions_boston, Actual = test_set_boston$crime_above_median)
accuracy_logistic_boston <- sum(diag(conf_matrix_logistic_boston)) / sum(conf_matrix_logistic_boston)

print("Logistic Regression Results:")

## [1] "Logistic Regression Results:"

print(conf_matrix_logistic_boston)

##          Actual
## Predicted  0  1
##         0 59 13
##         1 16 64

print(paste("Accuracy: ", round(accuracy_logistic_boston, 4)))

## [1] "Accuracy:  0.8092"

#LDA

# Fit the LDA model
lda_model_boston <- lda(crime_above_median ~ lstat + age + indus, data = train_set_boston)

# Make predictions on the test set
lda_predictions_boston <- predict(lda_model_boston, newdata = test_set_boston)
conf_matrix_lda_boston <- table(Predicted = lda_predictions_boston$class, Actual = test_set_boston$crime_above_median)
accuracy_lda_boston <- sum(diag(conf_matrix_lda_boston)) / sum(conf_matrix_lda_boston)

print("LDA Results:")

## [1] "LDA Results:"

print(conf_matrix_lda_boston)

##          Actual
## Predicted  0  1
##         0 58 14
##         1 17 63

print(paste("Accuracy: ", round(accuracy_lda_boston, 4)))

## [1] "Accuracy:  0.7961"

#Naive Bayes model
# Fit the Naive Bayes model
naive_bayes_model_boston <- naiveBayes(crime_above_median ~ lstat + age + indus, data = train_set_boston)

# Make predictions on the test set
naive_bayes_predictions_boston <- predict(naive_bayes_model_boston, newdata = test_set_boston)
conf_matrix_naive_bayes_boston <- table(Predicted = naive_bayes_predictions_boston, Actual = test_set_boston$crime_above_median)
accuracy_naive_bayes_boston <- sum(diag(conf_matrix_naive_bayes_boston)) / sum(conf_matrix_naive_bayes_boston)

print("Naive Bayes Results:")

## [1] "Naive Bayes Results:"

print(conf_matrix_naive_bayes_boston)

##          Actual
## Predicted  0  1
##         0 59 18
##         1 16 59

print(paste("Accuracy: ", round(accuracy_naive_bayes_boston, 4)))

## [1] "Accuracy:  0.7763"

#KNN model

# Standardize predictors
train_features_boston <- scale(train_set_boston[, c("lstat", "age", "indus")])
test_features_boston <- scale(test_set_boston[, c("lstat", "age", "indus")])
train_labels_boston <- train_set_boston$crime_above_median

# Loop over different values of k
for (k in c(1, 3, 5, 7, 9)) {
    knn_predictions_boston <- knn(train = train_features_boston, 
                                  test = test_features_boston, 
                                  cl = train_labels_boston, 
                                  k = k)
    conf_matrix_knn_boston <- table(Predicted = knn_predictions_boston, Actual = test_set_boston$crime_above_median)
    accuracy_knn_boston <- sum(diag(conf_matrix_knn_boston)) / sum(conf_matrix_knn_boston)
    
    print(paste("k-NN Results for k =", k))
    print(conf_matrix_knn_boston)
    print(paste("Accuracy: ", round(accuracy_knn_boston, 4)))
}

## [1] "k-NN Results for k = 1"
##          Actual
## Predicted  0  1
##         0 65 15
##         1 10 62
## [1] "Accuracy:  0.8355"
## [1] "k-NN Results for k = 3"
##          Actual
## Predicted  0  1
##         0 64 15
##         1 11 62
## [1] "Accuracy:  0.8289"
## [1] "k-NN Results for k = 5"
##          Actual
## Predicted  0  1
##         0 63 18
##         1 12 59
## [1] "Accuracy:  0.8026"
## [1] "k-NN Results for k = 7"
##          Actual
## Predicted  0  1
##         0 66 16
##         1  9 61
## [1] "Accuracy:  0.8355"
## [1] "k-NN Results for k = 9"
##          Actual
## Predicted  0  1
##         0 65 15
##         1 10 62
## [1] "Accuracy:  0.8355"

Findings: k-NN with k = 1, 3, 9 competitive accuracy (around 83.55%), performing slightly better than logistic regression (accuracy 79.61%) and naive Bayes (accuracy 77.63%).

LDA achieves decent performance with an accuracy of 79.61%, but k-NN outperforms it for certain values of k.

Predictive Modeling-HW3

Minh Nguyen - zpx082

2025-03-07

Problem 13

a.

b.

c.

d.

e.

f.

g.

h.

i.

j.

Problem 14.

a.

b.

c.

d.

e.

f.

g.

h.

Problem 16.