question 13

13a

library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(e1071)
# Load the dataset
data(Weekly)

#  Numerical and graphical summaries
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
par(mfrow = c(2, 2))
hist(Weekly$Volume, main = "Volume Histogram", col = "lightblue")
plot(Weekly$Lag1, Weekly$Lag2, col = as.numeric(Weekly$Direction), 
     main = "Lag1 vs Lag2", xlab = "Lag1", ylab = "Lag2")
boxplot(Weekly$Lag1 ~ Weekly$Direction, main = "Lag1 by Direction", col = c("red", "blue"))

insights

Yes, some patterns can be observed in the Weekly dataset:

  • Slight Upward Market Trend: the number of “Up” weeks (605) is higher than the number of “Down” weeks (484), suggesting that the market had a slightly bullish trend over the 21-year period (1990-2010).
  • Small Weekly Fluctuations with Occasional Extremes: the mean and median of Lag1 to Lag5 are close to zero, indicating that most weekly returns are small. However, the extreme minimum (-18.195) and maximum (12.026) values show that large weekly swings do occur.
  • Volume is Right-Skewed: the trading volume has a higher mean (1.57) than median (1.00), suggesting that most weeks have moderate volume while some weeks see exceptionally high trading activity (see the quick check below).
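
As a quick check on the skewness claim, one can compare the mean and median of Volume and inspect a log-scale histogram (a minimal sketch; skewness() comes from the e1071 package already loaded above):

# Mean well above median and positive skewness both indicate right-skew
mean(Weekly$Volume)      # 1.57
median(Weekly$Volume)    # 1.00
skewness(Weekly$Volume)  # positive for a right-skewed distribution

# A log transform makes the distribution far more symmetric
hist(log(Weekly$Volume), main = "log(Volume)", col = "lightblue")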

13b

# Logistic regression
log_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, 
                 data = Weekly, family = binomial)
summary(log_model)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

insights

  • A predictor is considered statistically significant if its p-value is below a chosen significance level, typically 0.05.
  • Lag2 appears to be a statistically significant predictor of Direction (p = 0.0296), while the other predictors do not show strong evidence of statistical significance; the p-values can also be extracted programmatically, as sketched below.
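
The p-values can be pulled straight out of the fitted model rather than read off the printed summary (a minimal sketch using log_model from above):

# Coefficient table: Estimate, Std. Error, z value, Pr(>|z|)
coefs <- coef(summary(log_model))

# Keep rows significant at the 5% level (the intercept also passes)
coefs[coefs[, "Pr(>|z|)"] < 0.05, , drop = FALSE]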

13c

# Confusion matrix and accuracy
pred_probs <- predict(log_model, type = "response")
pred_classes <- ifelse(pred_probs > 0.5, "Up", "Down")
table(Predicted = pred_classes, Actual = Weekly$Direction)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
mean(pred_classes == Weekly$Direction)
## [1] 0.5610652

insights

  • High False Positive Rate: the model classifies 430 of the 484 actual “Down” weeks as “Up”, so it almost always predicts an up market (987 “Up” predictions out of 1089 weeks).
  • High Recall for “Up” Weeks, but only trivially so: 557 of the 605 actual “Up” weeks are caught, largely because the model predicts “Up” so often. The rates below make this explicit.
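
Both rates can be computed directly from the confusion matrix (a minimal sketch using pred_classes from above, treating “Up” as the positive class):

cm <- table(Predicted = pred_classes, Actual = Weekly$Direction)

# Recall for "Up": fraction of actual Up weeks predicted Up (557/605, about 92%)
cm["Up", "Up"] / sum(cm[, "Up"])

# Specificity: fraction of actual Down weeks predicted Down (54/484, about 11%)
cm["Down", "Down"] / sum(cm[, "Down"])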

13d

#  Logistic regression with training data (1990-2008)
train <- Weekly$Year < 2009
test <- !train
log_model2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
pred_probs2 <- predict(log_model2, Weekly[test,], type = "response")
pred_classes2 <- ifelse(pred_probs2 > 0.5, "Up", "Down")
table(Predicted = pred_classes2, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
mean(pred_classes2 == Weekly$Direction[test])
## [1] 0.625

insights

  • Overall Accuracy = 62.5% on the held-out 2009-2010 weeks.
  • The False Positive Rate is High: 34 of the 43 actual “Down” weeks are predicted as “Up”.
  • Good at Identifying “Up” Weeks: 56 of the 61 actual “Up” weeks are caught, with only 5 false negatives.

13e

# (e) LDA
lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)
pred_lda <- predict(lda_model, Weekly[test,])
table(Predicted = pred_lda$class, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
mean(pred_lda$class == Weekly$Direction[test])
## [1] 0.625

insights

  • The confusion matrix is identical to the logistic regression model’s, so the false positive rate is just as high (34 of the 43 actual “Down” weeks are misclassified).
  • There is a strong bias towards predicting “Up”, and overall performance is no better than logistic regression. With a single predictor, LDA and logistic regression often produce nearly the same decision boundary.

13f

# (f) QDA
qda_model <- qda(Direction ~ Lag2, data = Weekly, subset = train)
pred_qda <- predict(qda_model, Weekly[test,])
table(Predicted = pred_qda$class, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
mean(pred_qda$class == Weekly$Direction[test])
## [1] 0.5865385

insights

  • The model never predicts “Down”: it classifies every test week as “Up” (verified by the posterior check below).
  • This suggests that the quadratic boundary QDA fits on Lag2 alone is degenerate here, rather than capturing any meaningful pattern in “Down” movements.
  • As a result, all 43 actual “Down” weeks are misclassified, though accuracy (58.65%) still beats a coin flip because “Up” weeks dominate the test period.
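
The degenerate behaviour can be confirmed from the posterior probabilities (a minimal sketch using pred_qda from above):

# If this minimum exceeds 0.5, the model can never predict "Down"
min(pred_qda$posterior[, "Up"])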

13g

# (g) KNN with K=1
train_X <- Weekly[train, "Lag2", drop = FALSE]
test_X <- Weekly[test, "Lag2", drop = FALSE]
train_Y <- Weekly$Direction[train]
pred_knn <- knn(train_X, test_X, train_Y, k = 1)
table(Predicted = pred_knn, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down   21 29
##      Up     22 32
mean(pred_knn == Weekly$Direction[test])
## [1] 0.5096154

insights

  • Accuracy of roughly 51% means KNN is barely better than random guessing, which suggests that Lag2 alone is not a strong predictor for the KNN model.
  • The misclassification rate is high and roughly balanced across the classes (29 false negatives against 22 false positives, with “Up” as the positive class).
  • KNN with K = 1 is highly sensitive to noise, so increasing K to a larger odd number might improve performance, as sketched below.
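
A minimal sketch of that tuning idea, reusing train_X, test_X, and train_Y from above (a seed is set because knn() breaks distance ties at random):

set.seed(1)
for (k in c(1, 3, 5, 7, 9)) {
  pred <- knn(train_X, test_X, train_Y, k = k)
  cat("K =", k, "accuracy =", round(mean(pred == Weekly$Direction[test]), 3), "\n")
}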

13h

# (h) Naive Bayes
nb_model <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
pred_nb <- predict(nb_model, Weekly[test,])
table(Predicted = pred_nb, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
mean(pred_nb == Weekly$Direction[test])
## [1] 0.5865385

insights

  • Like QDA, the model never predicts “Down”: every test week is classified as “Up”, so all 43 actual “Down” weeks are misclassified.
  • The accuracy is 58.65%, identical to QDA, worse than logistic regression and LDA (62.5%), and better than KNN (about 51%).

13i

# (i) Comparing models
accuracies <- c(
  Logistic = mean(pred_classes2 == Weekly$Direction[test]),
  LDA = mean(pred_lda$class == Weekly$Direction[test]),
  QDA = mean(pred_qda$class == Weekly$Direction[test]),
  KNN = mean(pred_knn == Weekly$Direction[test]),
  NaiveBayes = mean(pred_nb == Weekly$Direction[test])
)
print(accuracies)
##   Logistic        LDA        QDA        KNN NaiveBayes 
##  0.6250000  0.6250000  0.5865385  0.5096154  0.5865385

insights

  • Logistic Regression and LDA provide the best results on this data, tied at 62.5% test accuracy; QDA and Naive Bayes tie at 58.65%, and KNN with K = 1 trails at about 51%.

13j

# (j) Experimenting with different predictors
log_model3 <- glm(Direction ~ Lag1 + Lag2 + I(Lag2^2), 
                  data = Weekly, family = binomial, subset = train)
pred_probs3 <- predict(log_model3, Weekly[test,], type = "response")
pred_classes3 <- ifelse(pred_probs3 > 0.5, "Up", "Down")
table(Predicted = pred_classes3, Actual = Weekly$Direction[test])
##          Actual
## Predicted Down Up
##      Down    8 11
##      Up     35 50
mean(pred_classes3 == Weekly$Direction[test])
## [1] 0.5576923

insights

  • KNN Tuning: experiment with different values of K (e.g., K = 1, 3, 5, 7) and compare performance. KNN is sensitive to the range of its features, so the predictors should be scaled before fitting.
  • Feature Engineering: adding new features (e.g., interactions between existing lags, as sketched below) or removing irrelevant ones may help; here, adding Lag1 and Lag2^2 actually lowered test accuracy from 62.5% to 55.8%.
  • Accuracy Evaluation: precision and recall may matter more than raw accuracy if misclassifying “Up” and “Down” weeks carries different costs.
  • Try Other Models: beyond the classifiers compared above, decision trees or random forests could be tried on this classification problem.
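
As one example of the feature-engineering idea, an interaction between Lag1 and Lag2 can be tried on the same train/test split (a minimal sketch; there is no guarantee it beats the Lag2-only model):

# Lag1 * Lag2 expands to Lag1 + Lag2 + Lag1:Lag2
log_int <- glm(Direction ~ Lag1 * Lag2, data = Weekly,
               family = binomial, subset = train)
pred_int <- ifelse(predict(log_int, Weekly[test, ], type = "response") > 0.5,
                   "Up", "Down")
mean(pred_int == Weekly$Direction[test])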

question 14

14a

library(ISLR2)
library(MASS)
library(class)
library(e1071)
library(ggplot2)
# Load the dataset
data(Auto)

# Step 1: Calculate the median of 'mpg'
mpg_median <- median(Auto$mpg)

# Step 2: Create the binary variable 'mpg01' (1 if mpg exceeds its median, 0 otherwise)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)

# 'Auto' now contains mpg01 alongside the other variables, as required;
# wrapping it in another data.frame() call would only duplicate the column

# View the updated dataset
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name mpg01
## 1 chevrolet chevelle malibu     0
## 2         buick skylark 320     0
## 3        plymouth satellite     0
## 4             amc rebel sst     0
## 5               ford torino     0
## 6          ford galaxie 500     0

14b

# (b) Graphical exploration
par(mfrow = c(2, 2))
boxplot(Auto$horsepower ~ Auto$mpg01, main = "Horsepower vs mpg01", col = c("red", "blue"))
boxplot(Auto$weight ~ Auto$mpg01, main = "Weight vs mpg01", col = c("red", "blue"))
boxplot(Auto$acceleration ~ Auto$mpg01, main = "Acceleration vs mpg01", col = c("red", "blue"))
boxplot(Auto$displacement ~ Auto$mpg01, main = "Displacement vs mpg01", col = c("red", "blue"))

observations

  • Features like horsepower, weight, and displacement are likely to be useful in predicting mpg01, as their distributions separate the two classes fairly clearly: low-mpg cars tend to be heavier, with larger and more powerful engines.
  • Acceleration shows much less separation between the two groups, so it is likely to be a weaker predictor of mpg01. The group medians below quantify these impressions.
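
The visual impressions can be quantified with per-group medians (a minimal sketch; a large gap between the mpg01 = 0 and mpg01 = 1 rows of a column suggests a useful predictor):

aggregate(cbind(horsepower, weight, acceleration, displacement) ~ mpg01,
          data = Auto, FUN = median)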

14c

# (c) Splitting data into training and test sets
set.seed(42)
train_index <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
train <- Auto[train_index, ]
test <- Auto[-train_index, ]
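
A quick sanity check that the random 70/30 split leaves both classes well represented in each set (a minimal sketch):

# Both tables should show a reasonable number of 0s and 1s
table(train$mpg01)
table(test$mpg01)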

14d

# (d) LDA
lda_model <- lda(mpg01 ~ horsepower + weight + displacement, data = train)
pred_lda <- predict(lda_model, test)
table(Predicted = pred_lda$class, Actual = test$mpg01)
##          Actual
## Predicted  0  1
##         0 41  3
##         1  6 68
lda_error <- mean(pred_lda$class != test$mpg01)

test error calculation

FN <- 3        # false negatives: predicted 0, actually 1
FP <- 6        # false positives: predicted 1, actually 0
Total <- 118   # total test observations
test_error <- (FP + FN) / Total   # 0.076

This means the test error rate is approximately 7.6%: about 7.6% of the LDA predictions on the test set are incorrect.
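
The same figure can be obtained without hand-counting, here or for any of the models below, as one minus the proportion on the confusion-matrix diagonal (a minimal sketch using pred_lda from above):

cm <- table(Predicted = pred_lda$class, Actual = test$mpg01)
1 - sum(diag(cm)) / sum(cm)   # identical to lda_error, about 0.076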

14e

# (e) QDA
qda_model <- qda(mpg01 ~ horsepower + weight + displacement, data = train)
pred_qda <- predict(qda_model, test)
table(Predicted = pred_qda$class, Actual = test$mpg01)
##          Actual
## Predicted  0  1
##         0 42  4
##         1  5 67
qda_error <- mean(pred_qda$class != test$mpg01)

test error

FN <- 4        # false negatives: predicted 0, actually 1
FP <- 5        # false positives: predicted 1, actually 0
Total <- 118   # total test observations
test_error <- (FP + FN) / Total   # 0.076

The QDA test error rate is also approximately 7.6%.

14f

# (f) Logistic Regression
log_model <- glm(mpg01 ~ horsepower + weight + displacement, data = train, family = binomial)
pred_probs <- predict(log_model, test, type = "response")
pred_classes <- ifelse(pred_probs > 0.5, 1, 0)
table(Predicted = pred_classes, Actual = test$mpg01)
##          Actual
## Predicted  0  1
##         0 43  5
##         1  4 66
log_error <- mean(pred_classes != test$mpg01)

test error

FN <- 5        # false negatives: predicted 0, actually 1
FP <- 4        # false positives: predicted 1, actually 0
Total <- 118   # total test observations
test_error_log_reg <- (FP + FN) / Total   # 0.076

Logistic regression also achieves a test error rate of approximately 7.6%.

14g

# (g) Naive Bayes
nb_model <- naiveBayes(mpg01 ~ horsepower + weight + displacement, data = train)
pred_nb <- predict(nb_model, test)
table(Predicted = pred_nb, Actual = test$mpg01)
##          Actual
## Predicted  0  1
##         0 42  3
##         1  5 68
nb_error <- mean(pred_nb != test$mpg01)

test error

FN <- 3        # false negatives: predicted 0, actually 1
FP <- 5        # false positives: predicted 1, actually 0
Total <- 118   # total test observations
test_error_nb <- (FP + FN) / Total   # 0.068

Naive Bayes has the lowest test error so far, approximately 6.8%.

14h

# (h) KNN with different K values
train_X <- train[, c("horsepower", "weight", "displacement")]
test_X <- test[, c("horsepower", "weight", "displacement")]
train_Y <- train$mpg01
test_Y <- test$mpg01

k_values <- c(1, 3, 5, 7, 9, 11)
k_errors <- sapply(k_values, function(k) {
  pred_knn <- knn(train_X, test_X, train_Y, k = k)
  mean(pred_knn != test_Y)
})
names(k_errors) <- k_values

# Print errors for all models
errors <- c(LDA = lda_error, QDA = qda_error, Logistic = log_error, NaiveBayes = nb_error, k_errors)
print(errors)
##        LDA        QDA   Logistic NaiveBayes          1          3          5 
## 0.07627119 0.07627119 0.07627119 0.06779661 0.11864407 0.11016949 0.12711864 
##          7          9         11 
## 0.12711864 0.15254237 0.16101695

insights

  • From the results, Naive Bayes performs best with the lowest test error (6.78%), while LDA, QDA, and logistic regression sit close behind at 7.63% each.
  • KNN performs worst at every K tried, and its error grows as K increases (from 11.9% at K = 1 to 16.1% at K = 11); the sketch below checks whether feature scaling helps it.
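
Since KNN is distance-based and weight (in the thousands) dwarfs horsepower and displacement numerically, standardizing the predictors might narrow the gap (a minimal sketch; the test set is scaled with the training means and standard deviations to avoid leakage):

# Standardize using training-set statistics only
train_means <- colMeans(train_X)
train_sds   <- apply(train_X, 2, sd)
train_X_std <- scale(train_X, center = train_means, scale = train_sds)
test_X_std  <- scale(test_X,  center = train_means, scale = train_sds)

set.seed(42)
pred_knn_std <- knn(train_X_std, test_X_std, train_Y, k = 3)
mean(pred_knn_std != test_Y)   # compare with the unscaled KNN errors above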