Problem # 13
This question should be answered using the Weekly data set, which is part of the ISLP package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
# Load necessary libraries
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
weekly_data <- read.csv("C:/Users/HP/OneDrive - University of Texas at San Antonio/Desktop/Semester 2/predictive modelling/ALL CSV FILES - 2nd Edition/Weekly.csv")
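# (The same data also ships with the ISLR package loaded above, so weekly_data <- ISLR::Weekly would avoid the hard-coded local path.)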
#first few rows of the dataset
head(weekly_data)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
# Structure of the dataset
str(weekly_data)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : int 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: chr "Down" "Down" "Up" "Up" ...
# Summary statistics of the dataset
summary(weekly_data)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Length:1089
## Class :character
## Mode :character
##
##
##
# Compute correlation matrix (excluding non-numeric columns)
cor(weekly_data %>% select(-Direction), use = "complete.obs")
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
# Scatterplot matrix
ggpairs(
weekly_data %>% select(Year, Lag1, Lag2, Lag3, Lag4, Lag5, Volume, Today),
upper = list(continuous = wrap("cor", size = 4)), # Reduce correlation text size
lower = list(continuous = wrap("points", alpha = 0.3)), # Adjust scatterplot transparency
title = "Scatterplot Matrix of Weekly Data"
) +
theme_bw(base_size = 12) + # Adjust base font size for clarity
theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
axis.text.y = element_text(angle = 0, hjust = 1), # Fix y-axis labels
strip.text = element_text(size = 12)) # Adjust label size
# Scatter plot to visualize the relationship between Year and Volume
ggplot(weekly_data, aes(x = Year, y = Volume)) +
geom_point(alpha = 0.6, color = "blue") +
theme_minimal() +
labs(title = "Year vs Trading Volume",
x = "Year",
y = "Trading Volume")
#part b
Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
library(dplyr)
library(ggplot2)
library(stats)
weekly_data <- read.csv("C:/Users/HP/OneDrive - University of Texas at San Antonio/Desktop/Semester 2/predictive modelling/ALL CSV FILES - 2nd Edition/Weekly.csv")
# Convert Direction to a binary response variable (Up = 1, Down = 0)
weekly_data <- weekly_data %>%
mutate(Direction = ifelse(Direction == "Up", 1, 0))
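# (Note: glm() with family = binomial also accepts a two-level factor response directly; the 0/1 coding is just a convenience here.)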
# Fit logistic regression model using Lag variables and Volume as predictors
logistic_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = weekly_data, family = binomial)
# Print the summary of the logistic regression model
summary(logistic_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = weekly_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Interpretation: Among the predictors, Lag2 is the only statistically significant variable at the 5% level (p = 0.0296). All other predictors (Lag1, Lag3, Lag4, Lag5, and Volume) have p-values greater than 0.05, meaning they are not statistically significant in predicting Direction.
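As a quick programmatic check, the significant predictors can be pulled from the coefficient matrix that summary() returns (a small sketch reusing the fitted model above):
# Keep only coefficient rows with p-values below 0.05
coefs <- summary(logistic_model)$coefficients
coefs[coefs[, "Pr(>|z|)"] < 0.05, , drop = FALSE]
This should list just the intercept and Lag2, in line with the table above.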
#part c
Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
library(dplyr)
library(caret) # For confusion matrix
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
# Predict probabilities from the logistic regression model
probs <- predict(logistic_model, type = "response")
# Convert probabilities to class labels (threshold = 0.5)
predicted_labels <- ifelse(probs > 0.5, "Up", "Down")
# Convert actual Direction values to factor for comparison
actual_labels <- ifelse(weekly_data$Direction == 1, "Up", "Down")
# Create a confusion matrix
conf_matrix <- table(Predicted = predicted_labels, Truth = actual_labels)
# Print confusion matrix
print(conf_matrix)
## Truth
## Predicted Down Up
## Down 54 48
## Up 430 557
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Print accuracy
cat("Overall Accuracy:", round(accuracy, 4) * 100, "%\n")
## Overall Accuracy: 56.11 %
The confusion matrix shows that the logistic regression model overwhelmingly predicts “Up”: 430 of the 484 weeks in which the market actually went down are misclassified as “Up” (false positives), while most true “Up” weeks are captured. The overall accuracy is 56.1%, only slightly better than always predicting “Up” (55.6%), so the model is unreliable for predicting market declines and needs improvement.
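The class-specific rates behind this can be made explicit (a short sketch operating on the conf_matrix object computed above):
# Sensitivity: true "Up" weeks predicted Up; specificity: true "Down" weeks predicted Down
sensitivity <- conf_matrix["Up", "Up"] / sum(conf_matrix[, "Up"])        # 557/605, about 0.92
specificity <- conf_matrix["Down", "Down"] / sum(conf_matrix[, "Down"])  # 54/484, about 0.11
cat("Sensitivity:", round(sensitivity, 3), " Specificity:", round(specificity, 3), "\n")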
#part d
Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
library(dplyr)
library(caret) # For confusion matrix
# Split data: Training set (1990-2008), Test set (2009-2010)
train_data <- weekly_data %>% filter(Year >= 1990 & Year <= 2008)
test_data <- weekly_data %>% filter(Year >= 2009 & Year <= 2010)
train_mean <- mean(train_data$Lag2)
train_sd <- sd(train_data$Lag2)
train_data$Lag2 <- (train_data$Lag2 - train_mean) / train_sd
test_data$Lag2 <- (test_data$Lag2 - train_mean) / train_sd
# Fit logistic regression model using only Lag2 as predictor
logistic_model_lag2 <- glm(Direction ~ Lag2, data = train_data, family = binomial)
# Predict probabilities for test set
probs <- predict(logistic_model_lag2, test_data, type = "response")
# Convert probabilities to class labels (threshold = 0.55)
predicted_labels <- ifelse(probs > 0.55, "Up", "Down")
# Convert actual Direction values to factor for comparison
actual_labels <- ifelse(test_data$Direction == 1, "Up", "Down")
# Create confusion matrix
conf_matrix <- table(Predicted = predicted_labels, Truth = actual_labels)
# Print confusion matrix
print(conf_matrix)
## Truth
## Predicted Down Up
## Down 20 24
## Up 23 37
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Print accuracy
cat("Overall Accuracy on Test Data:", round(accuracy, 4) * 100, "%\n")
## Overall Accuracy on Test Data: 54.81 %
With the 0.55 probability threshold used above, the fraction of correct predictions on the held-out data is 54.81%. (Standardizing Lag2 leaves the fitted probabilities unchanged, so the choice of cutoff is what drives these results.)
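For comparison, the conventional 0.5 cutoff can be applied to the same fitted model (a sketch reusing the probs and actual_labels objects above; with this threshold the result typically matches the LDA confusion matrix in part (e)):
# Re-threshold the same predicted probabilities at 0.5
pred_05 <- ifelse(probs > 0.5, "Up", "Down")
table(Predicted = pred_05, Truth = actual_labels)
mean(pred_05 == actual_labels)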
#part e
Repeat (d) using LDA.
library(MASS) # For LDA
## Warning: package 'MASS' was built under R version 4.4.2
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret) # For confusion matrix
# Train LDA model using training data (1990-2008)
lda_model <- lda(Direction ~ Lag2, data = train_data)
# Predict class labels using LDA
lda_predictions <- predict(lda_model, test_data)
# Extract predicted class labels
predicted_labels <- lda_predictions$class
# Convert actual values to factor for comparison
actual_labels <- factor(ifelse(test_data$Direction == 1, "Up", "Down"))
# Create confusion matrix
conf_matrix <- table(Predicted = predicted_labels, Truth = actual_labels)
# Print confusion matrix
print(conf_matrix)
## Truth
## Predicted Down Up
## 0 9 5
## 1 34 56
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Print accuracy
cat("Overall Accuracy using LDA:", round(accuracy, 4) * 100, "%\n")
## Overall Accuracy using LDA: 62.5 %
LDA achieves 62.5% accuracy, performing better than logistic regression (54.8%). However, it still overpredicts “Up”, misclassifying 34 actual “Down” movements as “Up”.
#part f
Repeat (d) using QDA.
library(MASS) # For QDA
library(caret) # For confusion matrix
# Train QDA model using training data (1990-2008)
qda_model <- qda(Direction ~ Lag2, data = train_data)
# Predict class labels using QDA
qda_predictions <- predict(qda_model, test_data)
# Extract predicted class labels
predicted_labels <- qda_predictions$class
# Convert actual values to factor for comparison
actual_labels <- factor(ifelse(test_data$Direction == 1, "Up", "Down"))
# Create confusion matrix
conf_matrix <- table(Predicted = predicted_labels, Truth = actual_labels)
# Print confusion matrix
print(conf_matrix)
## Truth
## Predicted Down Up
## 0 0 0
## 1 43 61
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Print accuracy
cat("Overall Accuracy using QDA:", round(accuracy, 4) * 100, "%\n")
## Overall Accuracy using QDA: 58.65 %
QDA achieves 58.6% accuracy, which is better than logistic regression (54.8%) but worse than LDA (62.5%). However, it predicts all observations as “Up”, failing to classify any “Down” movements correctly.
#part g
Repeat (d) using KNN with K = 1.
library(class) # For KNN
library(caret) # For confusion matrix
# Standardize Lag2 using training data mean and standard deviation
train_mean <- mean(train_data$Lag2)
train_sd <- sd(train_data$Lag2)
train_X <- scale(as.matrix(train_data$Lag2), center = train_mean, scale = train_sd)
test_X <- scale(as.matrix(test_data$Lag2), center = train_mean, scale = train_sd)
# Ensure Direction is a factor with consistent levels
train_Y <- factor(ifelse(train_data$Direction == 1, "Up", "Down"), levels = c("Down", "Up"))
test_Y <- factor(ifelse(test_data$Direction == 1, "Up", "Down"), levels = c("Down", "Up"))
# Perform KNN classification with K = 1
knn_pred <- knn(train_X, test_X, train_Y, k = 1)
# Create confusion matrix
conf_matrix <- table(Predicted = knn_pred, Truth = test_Y)
# Print confusion matrix
print(conf_matrix)
## Truth
## Predicted Down Up
## Down 21 29
## Up 22 32
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Print accuracy
cat("Overall Accuracy using KNN (K=1):", round(accuracy, 4) * 100, "%\n")
## Overall Accuracy using KNN (K=1): 50.96 %
KNN with K = 1 performs barely better than random guessing (51.0% accuracy), indicating that it overfits the training data. The model generalizes poorly and misclassifies nearly half of the test observations.
#part h
Repeat (d) using naive Bayes.
library(e1071) # For Naïve Bayes
library(caret) # For confusion matrix
# Train Naïve Bayes model using training data (1990-2008)
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
# Predict class labels using Naïve Bayes
nb_predictions <- predict(nb_model, test_data)
# Convert actual values to factor for comparison
actual_labels <- factor(ifelse(test_data$Direction == 1, "Up", "Down"))
# Create confusion matrix
conf_matrix <- table(Predicted = nb_predictions, Truth = actual_labels)
# Print confusion matrix
print(conf_matrix)
## Truth
## Predicted Down Up
## 0 0 0
## 1 43 61
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Print accuracy
cat("Overall Accuracy using Naïve Bayes:", round(accuracy, 4) * 100, "%\n")
## Overall Accuracy using Naïve Bayes: 58.65 %
Naïve Bayes achieves 58.6% accuracy, performing better than logistic regression (54.8%) but worse than LDA (62.5%). However, it predicts all observations as “Up”, failing to classify any “Down” movements correctly, similar to QDA.
#part i
Which of these methods appears to provide the best results on this data?
LDA appears to give the best results on this data set, reaching 62.5% held-out accuracy, ahead of QDA and naive Bayes (58.65% each, obtained only by always predicting “Up”), logistic regression (54.81%), and KNN with K = 1 (50.96%).
#part j
Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
library(MASS) # For LDA & QDA
library(e1071) # For Naïve Bayes
library(class) # For KNN
library(caret) # For confusion matrix
# Logistic Regression with more predictors
logistic_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = train_data, family = binomial)
# Predictions
logistic_probs <- predict(logistic_model, test_data, type = "response")
logistic_pred <- ifelse(logistic_probs > 0.5, "Up", "Down")
# Confusion Matrix
logistic_conf <- table(Predicted = logistic_pred, Truth = test_data$Direction)
print(logistic_conf)
## Truth
## Predicted 0 1
## Down 31 44
## Up 12 17
# Compute accuracy
accuracy <- sum(diag(logistic_conf)) / sum(logistic_conf)
cat("Logistic Regression Accuracy:", round(accuracy, 4) * 100, "%\n")
## Logistic Regression Accuracy: 46.15 %
#LDA with Interaction Term (Lag1 * Lag2)
lda_model <- lda(Direction ~ Lag1 + Lag2 + Lag1:Lag2, data = train_data)
# Predictions
lda_predictions <- predict(lda_model, test_data)$class
# Confusion Matrix
lda_conf <- table(Predicted = lda_predictions, Truth = test_data$Direction)
print(lda_conf)
## Truth
## Predicted 0 1
## 0 7 8
## 1 36 53
# Compute accuracy
accuracy <- sum(diag(lda_conf)) / sum(lda_conf)
cat("LDA Accuracy:", round(accuracy, 4) * 100, "%\n")
## LDA Accuracy: 57.69 %
#QDA with Additional Predictors
qda_model <- qda(Direction ~ Lag1 + Lag2 + Lag3, data = train_data)
# Predictions
qda_predictions <- predict(qda_model, test_data)$class
# Confusion Matrix
qda_conf <- table(Predicted = qda_predictions, Truth = test_data$Direction)
print(qda_conf)
## Truth
## Predicted 0 1
## 0 6 10
## 1 37 51
# Compute accuracy
accuracy <- sum(diag(qda_conf)) / sum(qda_conf)
cat("QDA Accuracy:", round(accuracy, 4) * 100, "%\n")
## QDA Accuracy: 54.81 %
#Naïve Bayes with Lag1, Lag2, and Volume
nb_model <- naiveBayes(Direction ~ Lag1 + Lag2 + Volume, data = train_data)
# Predictions
nb_predictions <- predict(nb_model, test_data)
# Confusion Matrix
nb_conf <- table(Predicted = nb_predictions, Truth = test_data$Direction)
print(nb_conf)
## Truth
## Predicted 0 1
## 0 41 58
## 1 2 3
# Compute accuracy
accuracy <- sum(diag(nb_conf)) / sum(nb_conf)
cat("Naïve Bayes Accuracy:", round(accuracy, 4) * 100, "%\n")
## Naïve Bayes Accuracy: 42.31 %
#KNN with Different K Values
# Scale Predictors for KNN
train_X <- scale(train_data[, c("Lag1", "Lag2")])
test_X <- scale(test_data[, c("Lag1", "Lag2")])
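# (Caveat: train and test are each scaled with their own mean/sd here; part (g) scaled the test
# set with the training statistics, which is the stricter choice.)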
train_Y <- factor(ifelse(train_data$Direction == 1, "Up", "Down"))
test_Y <- factor(ifelse(test_data$Direction == 1, "Up", "Down"))
# Test different values of K
for (k in c(1, 3, 5, 10)) {
knn_pred <- knn(train_X, test_X, train_Y, k = k)
knn_conf <- table(Predicted = knn_pred, Truth = test_Y)
cat("\nConfusion Matrix for K =", k, "\n")
print(knn_conf)
accuracy <- sum(diag(knn_conf)) / sum(knn_conf)
cat("Accuracy:", round(accuracy, 4) * 100, "%\n")
}
##
## Confusion Matrix for K = 1
## Truth
## Predicted Down Up
## Down 23 36
## Up 20 25
## Accuracy: 46.15 %
##
## Confusion Matrix for K = 3
## Truth
## Predicted Down Up
## Down 23 32
## Up 20 29
## Accuracy: 50 %
##
## Confusion Matrix for K = 5
## Truth
## Predicted Down Up
## Down 22 29
## Up 21 32
## Accuracy: 51.92 %
##
## Confusion Matrix for K = 10
## Truth
## Predicted Down Up
## Down 25 26
## Up 18 35
## Accuracy: 57.69 %
Interpretation:
The results indicate that LDA with the Lag1:Lag2 interaction and KNN with K = 10 tied for the highest held-out accuracy (57.69% each). KNN improved as K increased, with K = 10 clearly outperforming K = 1 (46.15%), which suffered from high variance and poor generalization. QDA (54.81%) performed better than logistic regression with all six predictors (46.15%) and naive Bayes (42.31%), but was still outperformed by LDA and KNN. Naive Bayes struggled the most, likely due to its assumption of independent predictors, making it the worst-performing method here. Notably, none of these richer models beat the simple Lag2-only LDA from part (e), which reached 62.5% on the same held-out period, so the extra predictors appear to add more noise than signal. Overall, LDA remains the most reliable method and KNN with K = 10 a reasonable alternative, while naive Bayes and KNN with K = 1 should be avoided given their poor predictive performance. A summary table of these accuracies follows.
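For readability, the held-out accuracies reported above can be collected into one small table (values transcribed from the outputs above, not recomputed):
# Summary of the part (j) held-out accuracies
results_j <- data.frame(
  Method = c("Logistic (all lags + Volume)", "LDA (Lag1, Lag2, Lag1:Lag2)",
             "QDA (Lag1, Lag2, Lag3)", "Naive Bayes (Lag1, Lag2, Volume)", "KNN, K = 10"),
  Accuracy = c(46.15, 57.69, 54.81, 42.31, 57.69)
)
results_j[order(-results_j$Accuracy), ]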
Problem # 14
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() method of the data frame. Note you may find it helpful to add a column mpg01 to the data frame by assignment.
library(ISLR) # The dataset Auto is part of ISLR
library(dplyr) # For data manipulation
# Load the Auto dataset
Auto <- ISLR::Auto
# View first few rows
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
# Compute the median of mpg
mpg_median <- median(Auto$mpg)
# Create mpg01: 1 if mpg > median, else 0
Auto <- Auto %>%
mutate(mpg01 = ifelse(mpg > mpg_median, 1, 0))
# View updated dataset
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
#part b
Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.
library(ggplot2) # For visualization
library(GGally) # For pair plot
library(dplyr) # For data manipulation
# Ensure dplyr::select() is used explicitly to avoid conflicts
selected_data <- Auto %>% dplyr::select(mpg, mpg01, cylinders, displacement, horsepower, weight, acceleration, year, origin)
# Compute correlation matrix
cor_matrix <- cor(selected_data, use = "complete.obs")
# Print correlation matrix
print(cor_matrix)
## mpg mpg01 cylinders displacement horsepower
## mpg 1.0000000 0.8369392 -0.7776175 -0.8051269 -0.7784268
## mpg01 0.8369392 1.0000000 -0.7591939 -0.7534766 -0.6670526
## cylinders -0.7776175 -0.7591939 1.0000000 0.9508233 0.8429834
## displacement -0.8051269 -0.7534766 0.9508233 1.0000000 0.8972570
## horsepower -0.7784268 -0.6670526 0.8429834 0.8972570 1.0000000
## weight -0.8322442 -0.7577566 0.8975273 0.9329944 0.8645377
## acceleration 0.4233285 0.3468215 -0.5046834 -0.5438005 -0.6891955
## year 0.5805410 0.4299042 -0.3456474 -0.3698552 -0.4163615
## origin 0.5652088 0.5136984 -0.5689316 -0.6145351 -0.4551715
## weight acceleration year origin
## mpg -0.8322442 0.4233285 0.5805410 0.5652088
## mpg01 -0.7577566 0.3468215 0.4299042 0.5136984
## cylinders 0.8975273 -0.5046834 -0.3456474 -0.5689316
## displacement 0.9329944 -0.5438005 -0.3698552 -0.6145351
## horsepower 0.8645377 -0.6891955 -0.4163615 -0.4551715
## weight 1.0000000 -0.4168392 -0.3091199 -0.5850054
## acceleration -0.4168392 1.0000000 0.2903161 0.2127458
## year -0.3091199 0.2903161 1.0000000 0.1815277
## origin -0.5850054 0.2127458 0.1815277 1.0000000
# Scatterplot matrix for selected features
ggpairs(
selected_data,
upper = list(continuous = wrap("cor", size = 3)), # Reduce correlation text size
lower = list(continuous = wrap("points", alpha = 0.5, size = 1.5)), # Increase scatterplot visibility
diag = list(continuous = wrap("densityDiag", alpha = 0.5)), # Improve density plots on diagonal
title = "Scatterplot Matrix of Selected Variables"
) +
theme_bw(base_size = 10) + # Adjust base font size
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
The correlation matrix and scatterplot matrix indicate that weight (-0.76), displacement (-0.75), and cylinders (-0.76) have the strongest negative correlations with mpg01, meaning heavier cars with larger engines tend to have lower fuel efficiency. The scatterplot matrix confirms this, showing clear separation trends between mpg01 and these predictors, making them the most useful features for classification.
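Since the exercise also suggests boxplots, one per-variable view is sketched below (weight shown; displacement and cylinders give the same picture):
# Boxplot of weight split by the two mpg01 classes
ggplot(Auto, aes(x = factor(mpg01), y = weight)) +
  geom_boxplot() +
  labs(title = "Weight by mpg01",
       x = "mpg01 (0 = below median mpg, 1 = above)",
       y = "Weight")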
#part c
Split the data into a training set and a test set.
library(caret) # For train-test split
# Set seed for reproducibility
set.seed(1)
# Define training sample (75% of the data)
train_index <- createDataPartition(Auto$mpg01, p = 0.75, list = FALSE)
# Split the dataset
train_data <- Auto[train_index, ] # Training set
test_data <- Auto[-train_index, ] # Test set
# Print the number of observations in each set
cat("Training Set Size:", nrow(train_data), "\n")
## Training Set Size: 294
cat("Test Set Size:", nrow(test_data), "\n")
## Test Set Size: 98
#part d
Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
library(MASS) # For LDA
library(caret) # For confusion matrix
# Set seed for consistency
set.seed(42)
# Remove missing values if any
Auto <- na.omit(Auto)
# Define training sample (75% of the data; this fresh split replaces the caret partition from part (c))
train_size <- floor(0.75 * nrow(Auto))
train_index <- sample(seq_len(nrow(Auto)), size = train_size)
# Create training and testing datasets
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]
# Train LDA using the most associated variables from Part (b)
lda_model <- lda(mpg01 ~ weight + displacement + cylinders, data = train_data)
# Predict class labels using LDA
lda_predictions <- predict(lda_model, test_data)$class
# Create confusion matrix
conf_matrix <- table(Predicted = lda_predictions, Truth = test_data$mpg01)
# Print confusion matrix
print(conf_matrix)
## Truth
## Predicted 0 1
## 0 35 3
## 1 4 56
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Calculate test error
test_error <- 1 - accuracy
# Print results
cat("LDA Test Accuracy:", round(accuracy, 4) * 100, "%\n")
## LDA Test Accuracy: 92.86 %
cat("LDA Test Error:", round(test_error, 4) * 100, "%\n")
## LDA Test Error: 7.14 %
The test error of the model is 7.14%.
#part e
Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
library(MASS) # For QDA
library(caret) # For train-test split and confusion matrix
# Set seed for reproducibility
set.seed(1)
# Standardize predictors before training
train_data_scaled <- train_data
test_data_scaled <- test_data
train_data_scaled[, c("weight", "displacement", "cylinders")] <- scale(train_data[, c("weight", "displacement", "cylinders")])
test_data_scaled[, c("weight", "displacement", "cylinders")] <- scale(test_data[, c("weight", "displacement", "cylinders")])
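# (Caveat: scale() standardizes each set with its own mean/sd here; reusing the training
# means/sds, as in part (d) of Problem 13, avoids train/test inconsistency. The same
# pattern recurs in the logistic regression and KNN chunks below.)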
# Train QDA Model
qda_model <- qda(mpg01 ~ weight + displacement + cylinders, data = train_data_scaled)
# Predict on Test Set
qda_predictions <- predict(qda_model, test_data_scaled)$class
# Compute Confusion Matrix
conf_matrix <- table(Predicted = qda_predictions, Truth = test_data_scaled$mpg01)
print(conf_matrix)
## Truth
## Predicted 0 1
## 0 35 4
## 1 4 55
# Calculate Accuracy & Test Error
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
test_error <- 1 - accuracy
# Print results
cat("QDA Test Accuracy:", round(accuracy, 4) * 100, "%\n")
## QDA Test Accuracy: 91.84 %
cat("QDA Test Error:", round(test_error, 4) * 100, "%\n")
## QDA Test Error: 8.16 %
The test error of the model is 8.16%.
#part f
Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
library(caret) # For train-test split and accuracy calculation
library(glmnet) # Loaded here, though the fit below uses base glm() rather than glmnet
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Set seed for reproducibility
set.seed(42)
# Standardize predictors before training
train_data_scaled <- train_data
test_data_scaled <- test_data
train_data_scaled[, c("weight", "displacement", "cylinders")] <- scale(train_data[, c("weight", "displacement", "cylinders")])
test_data_scaled[, c("weight", "displacement", "cylinders")] <- scale(test_data[, c("weight", "displacement", "cylinders")])
# Train Logistic Regression Model
logistic_model <- glm(mpg01 ~ weight + displacement + cylinders, data = train_data_scaled, family = binomial)
# Predict on Test Set
logistic_probs <- predict(logistic_model, test_data_scaled, type = "response")
logistic_predictions <- ifelse(logistic_probs > 0.5, 1, 0)
# Compute Confusion Matrix
conf_matrix <- table(Predicted = logistic_predictions, Truth = test_data_scaled$mpg01)
print(conf_matrix)
## Truth
## Predicted 0 1
## 0 37 5
## 1 2 54
# Calculate Accuracy & Test Error
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
test_error <- 1 - accuracy
# Print results
cat("Logistic Regression Test Accuracy:", round(accuracy, 4) * 100, "%\n")
## Logistic Regression Test Accuracy: 92.86 %
cat("Logistic Regression Test Error:", round(test_error, 4) * 100, "%\n")
## Logistic Regression Test Error: 7.14 %
The test error comes out to be 7.14%.
#part g
Perform naive Bayes on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
library(e1071) # For Naïve Bayes
library(caret) # For train-test split and accuracy calculation
# Train Naïve Bayes using the most associated variables from Part (b)
nb_model <- naiveBayes(mpg01 ~ weight + displacement + cylinders, data = train_data)
# Predict class labels using Naïve Bayes
nb_predictions <- predict(nb_model, test_data)
# Create confusion matrix
conf_matrix <- table(Predicted = nb_predictions, Truth = test_data$mpg01)
# Print confusion matrix
print(conf_matrix)
## Truth
## Predicted 0 1
## 0 35 3
## 1 4 56
# Calculate accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Calculate test error
test_error <- 1 - accuracy
# Print results
cat("Naïve Bayes Test Accuracy:", round(accuracy, 4) * 100, "%\n")
## Naïve Bayes Test Accuracy: 92.86 %
cat("Naïve Bayes Test Error:", round(test_error, 4) * 100, "%\n")
## Naïve Bayes Test Error: 7.14 %
The test error comes out to be 7.14% for naive Bayes.
#part h
Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?
library(class) # For KNN
library(caret) # For train-test split and accuracy calculation
# Standardize training and test data
train_data_scaled <- train_data
test_data_scaled <- test_data
train_data_scaled[, c("weight", "displacement", "cylinders")] <- scale(train_data[, c("weight", "displacement", "cylinders")])
test_data_scaled[, c("weight", "displacement", "cylinders")] <- scale(test_data[, c("weight", "displacement", "cylinders")])
# Define range of K values
k_values <- 1:100
# Initialize vectors to store accuracy and test error results
accuracy_results <- numeric(length(k_values))
test_error_results <- numeric(length(k_values))
# Loop through different values of K
for (i in seq_along(k_values)) {
k <- k_values[i]
# Perform KNN classification
knn_predictions <- knn(train = train_data_scaled[, c("weight", "displacement", "cylinders")],
test = test_data_scaled[, c("weight", "displacement", "cylinders")],
cl = train_data_scaled$mpg01, k = k)
# Compute accuracy
accuracy_results[i] <- mean(knn_predictions == test_data_scaled$mpg01)
# Compute test error
test_error_results[i] <- 1 - accuracy_results[i]
# Print results
cat("K =", k, ", Accuracy =", round(accuracy_results[i], 4), ", Test Error =", round(test_error_results[i] * 100, 2), "%\n")
}
## K = 1 , Accuracy = 0.7755 , Test Error = 22.45 %
## K = 2 , Accuracy = 0.8469 , Test Error = 15.31 %
## K = 3 , Accuracy = 0.8673 , Test Error = 13.27 %
## K = 4 , Accuracy = 0.8571 , Test Error = 14.29 %
## K = 5 , Accuracy = 0.9082 , Test Error = 9.18 %
## K = 6 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 7 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 8 , Accuracy = 0.898 , Test Error = 10.2 %
## K = 9 , Accuracy = 0.898 , Test Error = 10.2 %
## K = 10 , Accuracy = 0.898 , Test Error = 10.2 %
## K = 11 , Accuracy = 0.9082 , Test Error = 9.18 %
## K = 12 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 13 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 14 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 15 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 16 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 17 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 18 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 19 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 20 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 21 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 22 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 23 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 24 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 25 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 26 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 27 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 28 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 29 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 30 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 31 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 32 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 33 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 34 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 35 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 36 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 37 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 38 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 39 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 40 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 41 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 42 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 43 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 44 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 45 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 46 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 47 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 48 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 49 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 50 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 51 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 52 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 53 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 54 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 55 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 56 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 57 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 58 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 59 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 60 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 61 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 62 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 63 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 64 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 65 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 66 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 67 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 68 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 69 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 70 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 71 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 72 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 73 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 74 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 75 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 76 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 77 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 78 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 79 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 80 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 81 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 82 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 83 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 84 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 85 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 86 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 87 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 88 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 89 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 90 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 91 , Accuracy = 0.9286 , Test Error = 7.14 %
## K = 92 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 93 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 94 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 95 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 96 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 97 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 98 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 99 , Accuracy = 0.9184 , Test Error = 8.16 %
## K = 100 , Accuracy = 0.9184 , Test Error = 8.16 %
# Find K with the highest accuracy
best_k <- k_values[which.max(accuracy_results)]
best_accuracy <- max(accuracy_results)
best_test_error <- 1 - best_accuracy
# Print best K value and test error
cat("\nThe best K value is:", best_k, "with accuracy of", round(best_accuracy * 100, 2), "% and test error of", round(best_test_error * 100, 2), "%\n")
##
## The best K value is: 12 with accuracy of 92.86 % and test error of 7.14 %
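The sweep over K can also be summarized graphically (a small sketch using the vectors filled in the loop above):
# Test error as a function of K, with the best K marked
ggplot(data.frame(K = k_values, TestError = test_error_results),
       aes(x = K, y = TestError)) +
  geom_line(color = "blue") +
  geom_point(color = "red", size = 1) +
  geom_vline(xintercept = best_k, linetype = "dashed") +
  labs(title = "KNN Test Error vs K", x = "K", y = "Test Error") +
  theme_minimal()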
Problem # 16
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings. Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.
# Load necessary libraries
library(MASS) # For the Boston dataset
library(dplyr) # For data manipulation
library(caret) # For train-test split
library(ggplot2) # For visualization
library(class) # For KNN
library(e1071) # For Naive Bayes
# Load the Boston dataset
data("Boston")
# Summary of the dataset
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Step 1: Create a response variable
# crim_ab is 1 if crime rate (crim) is above the median, 0 otherwise
Boston$crim_ab <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
# Step 2: Explore correlation matrix to find most relevant predictors
cor_matrix <- cor(Boston)
print(cor_matrix)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim_ab 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim_ab -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## black lstat medv crim_ab
## crim -0.38506394 0.4556215 -0.3883046 0.40939545
## zn 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus -0.35697654 0.6037997 -0.4837252 0.60326017
## chas 0.04878848 -0.0539293 0.1752602 0.07009677
## nox -0.38005064 0.5908789 -0.4273208 0.72323480
## rm 0.12806864 -0.6138083 0.6953599 -0.15637178
## age -0.27353398 0.6023385 -0.3769546 0.61393992
## dis 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad -0.44441282 0.4886763 -0.3816262 0.61978625
## tax -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio -0.17738330 0.3740443 -0.5077867 0.25356836
## black 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat -0.36608690 1.0000000 -0.7376627 0.45326273
## medv 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim_ab -0.35121093 0.4532627 -0.2630167 1.00000000
# Selecting features that are highly correlated with crim_ab
selected_vars <- c("indus", "nox", "age", "dis", "rad", "tax")
X <- Boston[, selected_vars]
y <- Boston$crim_ab
# Step 3: Split data into training and testing sets (80% train, 20% test)
set.seed(1)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]
# Step 4: Logistic Regression
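# (Note: the formula crim_ab ~ . keeps crim itself among the predictors, and crim_ab is a
# deterministic function of crim, so the classes are perfectly separable -- this is what
# triggers the convergence warnings below.)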
log_model <- glm(crim_ab ~ ., data = Boston[train_index, ], family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
log_probs <- predict(log_model, Boston[-train_index, ], type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)
summary(log_model)
##
## Call:
## glm(formula = crim_ab ~ ., family = binomial, data = Boston[train_index,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.781e+02 8.461e+04 -0.002 0.998
## crim 1.118e+03 3.441e+04 0.032 0.974
## zn 8.581e-01 1.072e+02 0.008 0.994
## indus -3.837e+00 7.995e+02 -0.005 0.996
## chas -5.503e-01 1.351e+04 0.000 1.000
## nox -5.013e+02 2.278e+05 -0.002 0.998
## rm -3.110e+01 2.870e+03 -0.011 0.991
## age -4.125e-02 4.016e+01 -0.001 0.999
## dis -1.341e+01 6.927e+03 -0.002 0.998
## rad -1.252e+01 1.175e+03 -0.011 0.991
## tax 3.843e-01 3.532e+01 0.011 0.991
## ptratio 1.631e+01 3.822e+03 0.004 0.997
## black 1.642e-01 6.939e+01 0.002 0.998
## lstat -2.789e+00 4.336e+02 -0.006 0.995
## medv 1.574e+00 9.244e+02 0.002 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5.6284e+02 on 405 degrees of freedom
## Residual deviance: 4.2434e-05 on 391 degrees of freedom
## AIC: 30
##
## Number of Fisher Scoring iterations: 25
# Compute accuracy and test error
log_acc <- mean(log_pred == y_test)
log_err <- 1 - log_acc
print(paste("Logistic Regression Accuracy:", round(log_acc * 100, 2), "%"))
## [1] "Logistic Regression Accuracy: 99 %"
print(paste("Logistic Regression Test Error:", round(log_err * 100, 2), "%"))
## [1] "Logistic Regression Test Error: 1 %"
# Step 5: LDA
lda_model <- lda(crim_ab ~ ., data = Boston[train_index, ])
lda_pred <- predict(lda_model, Boston[-train_index, ])$class
lda_acc <- mean(lda_pred == y_test)
lda_err <- 1 - lda_acc
print(paste("LDA Accuracy:", round(lda_acc * 100, 2), "%"))
## [1] "LDA Accuracy: 86 %"
print(paste("LDA Test Error:", round(lda_err * 100, 2), "%"))
## [1] "LDA Test Error: 14 %"
# Step 6: Naive Bayes
nb_model <- naiveBayes(X_train, as.factor(y_train))
nb_pred <- predict(nb_model, X_test)
nb_acc <- mean(nb_pred == y_test)
nb_err <- 1 - nb_acc
print(paste("Naive Bayes Accuracy:", round(nb_acc * 100, 2), "%"))
## [1] "Naive Bayes Accuracy: 81 %"
print(paste("Naive Bayes Test Error:", round(nb_err * 100, 2), "%"))
## [1] "Naive Bayes Test Error: 19 %"
# Step 7: KNN with different values of K (1 to 30)
set.seed(1)
k_values <- 1:30
knn_results <- data.frame(K = k_values, Accuracy = rep(0, length(k_values)), Test_Error = rep(0, length(k_values)))
for (k in k_values) {
knn_pred <- knn(X_train, X_test, cl = y_train, k = k)
knn_acc <- mean(knn_pred == y_test)
knn_err <- 1 - knn_acc
knn_results$Accuracy[k] <- knn_acc
knn_results$Test_Error[k] <- knn_err
}
# Find the best K
best_k <- knn_results$K[which.max(knn_results$Accuracy)]
best_k_acc <- max(knn_results$Accuracy)
print(paste("Best K =", best_k, "with accuracy =", round(best_k_acc * 100, 2), "%"))
## [1] "Best K = 1 with accuracy = 96 %"
# Display all K values, their accuracies, and test errors
print(knn_results)
## K Accuracy Test_Error
## 1 1 0.96 0.04
## 2 2 0.95 0.05
## 3 3 0.95 0.05
## 4 4 0.95 0.05
## 5 5 0.95 0.05
## 6 6 0.94 0.06
## 7 7 0.94 0.06
## 8 8 0.95 0.05
## 9 9 0.94 0.06
## 10 10 0.96 0.04
## 11 11 0.95 0.05
## 12 12 0.95 0.05
## 13 13 0.95 0.05
## 14 14 0.94 0.06
## 15 15 0.92 0.08
## 16 16 0.92 0.08
## 17 17 0.90 0.10
## 18 18 0.91 0.09
## 19 19 0.88 0.12
## 20 20 0.88 0.12
## 21 21 0.88 0.12
## 22 22 0.88 0.12
## 23 23 0.88 0.12
## 24 24 0.88 0.12
## 25 25 0.88 0.12
## 26 26 0.87 0.13
## 27 27 0.87 0.13
## 28 28 0.88 0.12
## 29 29 0.87 0.13
## 30 30 0.87 0.13
# Visualizing KNN Accuracy
ggplot(knn_results, aes(x = K, y = Accuracy)) +
geom_line(color = "blue") +
geom_point(color = "red") +
labs(title = "KNN Accuracy vs K", x = "K", y = "Accuracy") +
theme_minimal()
Findings:
Logistic regression achieved the highest accuracy (99%, test error 1%), but this figure should be read with caution: the formula crim_ab ~ . keeps crim itself among the predictors, and since crim_ab is defined directly from crim, the two classes are perfectly separable. That is exactly what the glm convergence warnings and the enormous coefficient standard errors above indicate. LDA (also fit with the full formula) produced an accuracy of 86% with a test error of 14%. Naive Bayes, fit on the six selected predictors, performed slightly worse with an accuracy of 81% and a test error of 19%. KNN was tested across K = 1 to 30 on the same six predictors; the best performance was at K = 1 with 96% accuracy and a 4% test error (K = 10 ties at 96%), and the accuracy-vs-K plot shows accuracy declining as K grows.
Taken together, KNN with small K is the strongest model among those that exclude crim from the predictors, while the near-perfect logistic-regression result is largely an artifact of including crim itself; see the refit sketched below for a fairer comparison.
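A fairer logistic-regression baseline drops crim from the formula (a hedged sketch, not rerun here; some separation warnings may persist since other predictors also track crim closely):
# Refit without crim itself, then score on the held-out rows
log_model2 <- glm(crim_ab ~ . - crim, data = Boston[train_index, ], family = binomial)
log_probs2 <- predict(log_model2, Boston[-train_index, ], type = "response")
log_pred2 <- ifelse(log_probs2 > 0.5, 1, 0)
mean(log_pred2 == y_test)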