Statistical Learning - Lab 5

Question 13

library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(e1071)
# Load Weekly dataset
data("Weekly")

# View first few rows
head(Weekly)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down

(a).

# Check structure of data
str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
# Summary statistics
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
# Scatterplot matrix of the numerical variables
pairs(Weekly[, -9], main="Scatterplot Matrix for Weekly Data")

# Plot Volume over time
plot(Weekly$Year, Weekly$Volume, type="l", main="Weekly Volume Over Time", xlab="Year", ylab="Volume")

# Histogram of Weekly Returns
hist(Weekly$Today, breaks=30, main="Histogram of Weekly Returns", xlab="Weekly Returns", col="lightblue")

Based on the numerical and graphical summaries, the following patterns can be observed:

1. Scatterplot Matrix Observations

  • The scatterplot matrix does not show strong linear relationships between the predictor variables (Lag1 to Lag5) and the response variable (Today or Direction).

  • Most points appear to be randomly scattered, indicating weak correlations between the lag variables and weekly returns.

  • However, any weak trends in individual lag variables are hard to judge by eye and need formal testing, such as the logistic regression fit in part (b).

2. Summary Statistics Observations

  • The dataset consists of 605 “Up” movements and 484 “Down” movements, suggesting a slightly higher frequency of upward market movements.

  • The Volume variable has increased over time, as seen in the time-series plot.

3. Weekly Volume Over Time

  • The trading volume has increased significantly over the years, especially after 2005.

  • There are noticeable spikes in 2008–2010, which may correspond to financial events such as the 2008 financial crisis.

4. Histogram of Weekly Returns

  • The histogram shows that weekly returns are approximately normally distributed, with most values centered around 0.

  • There is a slight left skew (the mean of 0.15 sits below the median of 0.24), indicating that large negative movements occur occasionally even though most returns are close to zero.

  • The distribution has some extreme values (outliers) on both ends.
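
A quick numerical check of these pairwise relationships (a sketch, not part of the original output): among the numeric variables, the only substantial correlation is between Year and Volume, consistent with the time-series plot above.

# Correlation matrix of the numeric columns (Direction, column 9, excluded)
round(cor(Weekly[, -9]), 3)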

(b).

# Fit logistic regression model
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, 
               data=Weekly, family=binomial)

# Print summary
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
# Identifying significant predictors (p-value < 0.05)
significant_predictors <- summary(glm.fit)$coefficients[,4] < 0.05
names(significant_predictors)[significant_predictors]
## [1] "(Intercept)" "Lag2"

From the logistic regression output:

  • Lag2 (p = 0.0296, significant at 5% level)

  • Intercept (p = 0.0019, highly significant)

All other predictors (Lag1, Lag3, Lag4, Lag5, Volume) have p-values > 0.05, meaning they are not statistically significant.
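
On the odds scale, the Lag2 estimate of 0.05844 means each one-unit increase in Lag2 multiplies the odds of an “Up” week by exp(0.05844) ≈ 1.06, a modest effect. A one-line check:

# Odds ratio for Lag2 (approx. 1.06, i.e., ~6% higher odds of "Up" per unit of Lag2)
exp(coef(glm.fit)["Lag2"])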

(c).

# Predict probabilities
glm.probs <- predict(glm.fit, type="response")

# Convert to class labels
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Create confusion matrix
conf_matrix <- table(glm.pred, Weekly$Direction)
print(conf_matrix)
##         
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Overall Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Overall Accuracy: 56.11 %"

Confusion Matrix Interpretation

  • True Negatives (TN) = 54 → Correctly predicted “Down”.

  • False Negatives (FN) = 48 → Market went “Up,” but predicted “Down” (missed opportunities).

  • False Positives (FP) = 430 → Market went “Down,” but predicted “Up” (high mistake rate).

  • True Positives (TP) = 557 → Correctly predicted “Up”.

  • Overall Accuracy = 56.11% → only marginally better than a naive rule that always predicts “Up” (605/1089 ≈ 55.6%), and barely better than random guessing (50%).

  • Key Issue: High false positive rate (430 FP) → The model frequently predicts “Up” when the market actually goes “Down”.
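
The asymmetry shows up clearly in the class-wise rates, computed here from the confusion matrix above (treating “Up” as the positive class):

# Sensitivity and specificity from the training confusion matrix
sensitivity <- 557 / (557 + 48)   # TPR ~ 92.1%: almost every "Up" week is caught
specificity <- 54 / (54 + 430)    # TNR ~ 11.2%: almost no "Down" week is caught
c(sensitivity = sensitivity, specificity = specificity)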

(d).

# Split data into training (1990-2008) and test (2009-2010)
train <- Weekly$Year < 2009
Weekly.train <- Weekly[train, ]
Weekly.test <- Weekly[!train, ]

# Fit logistic regression with Lag2 as predictor
glm.fit2 <- glm(Direction ~ Lag2, data=Weekly.train, family=binomial)

# Predict on test set
glm.probs2 <- predict(glm.fit2, Weekly.test, type="response")
glm.pred2 <- ifelse(glm.probs2 > 0.5, "Up", "Down")

# Confusion matrix
conf_matrix2 <- table(glm.pred2, Weekly.test$Direction)
print(conf_matrix2)
##          
## glm.pred2 Down Up
##      Down    9  5
##      Up     34 56
# Accuracy
accuracy2 <- sum(diag(conf_matrix2)) / sum(conf_matrix2)
print(paste("Accuracy for Logistic Regression (Lag2 only):", round(accuracy2 * 100, 2), "%"))
## [1] "Accuracy for Logistic Regression (Lag2 only): 62.5 %"

(e).

lda.fit <- lda(Direction ~ Lag2, data=Weekly.train)
lda.pred <- predict(lda.fit, Weekly.test)
conf_matrix_lda <- table(lda.pred$class, Weekly.test$Direction)

print(conf_matrix_lda)
##       
##        Down Up
##   Down    9  5
##   Up     34 56
accuracy_lda <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(paste("Accuracy for LDA:", round(accuracy_lda * 100, 2), "%"))
## [1] "Accuracy for LDA: 62.5 %"

(f).

qda.fit <- qda(Direction ~ Lag2, data=Weekly.train)
qda.pred <- predict(qda.fit, Weekly.test)
conf_matrix_qda <- table(qda.pred$class, Weekly.test$Direction)

print(conf_matrix_qda)
##       
##        Down Up
##   Down    0  0
##   Up     43 61
accuracy_qda <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
print(paste("Accuracy for QDA:", round(accuracy_qda * 100, 2), "%"))
## [1] "Accuracy for QDA: 58.65 %"

(g).

# Create feature matrix and labels
train.X <- Weekly.train$Lag2
test.X <- Weekly.test$Lag2
train.Y <- Weekly.train$Direction

# Use KNN with k=1
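# (note: knn() breaks distance ties at random; calling set.seed() first makes the result reproducible)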
knn.pred <- knn(as.matrix(train.X), as.matrix(test.X), train.Y, k=1)

# Confusion matrix
conf_matrix_knn <- table(knn.pred, Weekly.test$Direction)

print(conf_matrix_knn)
##         
## knn.pred Down Up
##     Down   21 29
##     Up     22 32
accuracy_knn <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
print(paste("Accuracy for KNN (k=1):", round(accuracy_knn * 100, 2), "%"))
## [1] "Accuracy for KNN (k=1): 50.96 %"

(h).

nb.fit <- naiveBayes(Direction ~ Lag2, data=Weekly.train)
nb.pred <- predict(nb.fit, Weekly.test)
conf_matrix_nb <- table(nb.pred, Weekly.test$Direction)

print(conf_matrix_nb)
##        
## nb.pred Down Up
##    Down    0  0
##    Up     43 61
accuracy_nb <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(paste("Accuracy for Naive Bayes:", round(accuracy_nb * 100, 2), "%"))
## [1] "Accuracy for Naive Bayes: 58.65 %"

(i).

# Print all accuracy results
accuracy_results <- data.frame(
  Method = c("Logistic Regression", "LDA", "QDA", "KNN (k=1)", "Naive Bayes"),
  Accuracy = c(accuracy2, accuracy_lda, accuracy_qda, accuracy_knn, accuracy_nb)
)

print(accuracy_results)
##                Method  Accuracy
## 1 Logistic Regression 0.6250000
## 2                 LDA 0.6250000
## 3                 QDA 0.5865385
## 4           KNN (k=1) 0.5096154
## 5         Naive Bayes 0.5865385
  • Logistic Regression and LDA (both 62.5%) provide the best results in terms of accuracy.

  • QDA and Naive Bayes perform slightly worse (~58.65% accuracy).

  • KNN (k=1) performs the worst (~51%), barely better than a coin flip, indicating poor predictive power for this dataset.

Thus, Logistic Regression or LDA would be the best choices for this dataset.

(j).

# Try using multiple predictors in logistic regression
glm.fit3 <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Volume, data=Weekly.train, family=binomial)
glm.probs3 <- predict(glm.fit3, Weekly.test, type="response")
glm.pred3 <- ifelse(glm.probs3 > 0.5, "Up", "Down")

conf_matrix3 <- table(glm.pred3, Weekly.test$Direction)
accuracy3 <- sum(diag(conf_matrix3)) / sum(conf_matrix3)

print(paste("Accuracy for Logistic Regression (Lag1, Lag2, Lag3, Volume):", round(accuracy3 * 100, 2), "%"))
## [1] "Accuracy for Logistic Regression (Lag1, Lag2, Lag3, Volume): 51.92 %"
  • The Lag2-only logistic regression and LDA models from parts (d) and (e) remain the best performers (~62.5% accuracy).
  • Adding Lag1, Lag3, and Volume as predictors reduces test accuracy to 51.92%, so the extra variables hurt rather than help.
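
Part (j) invites further experimentation. Two natural follow-ups, sketched below but not run as part of the original lab, are an interaction term in the logistic regression and a sweep over K for KNN:

# Hypothetical variant: logistic regression with a Lag1:Lag2 interaction
glm.fit4 <- glm(Direction ~ Lag2 + Lag1:Lag2, data=Weekly.train, family=binomial)
glm.pred4 <- ifelse(predict(glm.fit4, Weekly.test, type="response") > 0.5, "Up", "Down")
mean(glm.pred4 == Weekly.test$Direction)  # test accuracy

# Hypothetical variant: KNN on Lag2 for several values of K
for (k in c(1, 3, 5, 10)) {
  pred <- knn(as.matrix(Weekly.train$Lag2), as.matrix(Weekly.test$Lag2),
              Weekly.train$Direction, k = k)
  print(paste("K =", k, "Accuracy:", round(mean(pred == Weekly.test$Direction) * 100, 2), "%"))
}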

Question 14

(a).

# Load the Auto dataset
data("Auto")

# Create the binary variable mpg01
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)

# Convert mpg01 to a factor (binary classification)
Auto$mpg01 <- as.factor(Auto$mpg01)

# View dataset with new mpg01 column
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name mpg01
## 1 chevrolet chevelle malibu     0
## 2         buick skylark 320     0
## 3        plymouth satellite     0
## 4             amc rebel sst     0
## 5               ford torino     0
## 6          ford galaxie 500     0
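
By construction, splitting at the median produces balanced classes (the summary in part (b) confirms 196 cars in each group); a quick check:

# mpg01 is balanced by construction of the median split
table(Auto$mpg01)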

(b).

# Summary of dataset
summary(Auto)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365  
##  mpg01  
##  0:196  
##  1:196  
##         
##         
##         
##         
## 
# Pairs plot to visualize how the candidate predictors relate to mpg01
pairs(Auto[, c("mpg01", "cylinders", "displacement", "horsepower", "weight", "acceleration")])

# Boxplots to compare mpg01 with other variables
boxplot(Auto$weight ~ Auto$mpg01, main="Weight vs mpg01", xlab="mpg01", ylab="Weight", col=c("red", "blue"))

boxplot(Auto$horsepower ~ Auto$mpg01, main="Horsepower vs mpg01", xlab="mpg01", ylab="Horsepower", col=c("red", "blue"))

boxplot(Auto$displacement ~ Auto$mpg01, main="Displacement vs mpg01", xlab="mpg01", ylab="Displacement", col=c("red", "blue"))

# Correlation matrix (using the numeric mpg, since mpg01 is a factor)
cor(Auto[, c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration")])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
##              acceleration
## mpg             0.4233285
## cylinders      -0.5046834
## displacement   -0.5438005
## horsepower     -0.6891955
## weight         -0.4168392
## acceleration    1.0000000
  • Weight, horsepower, and displacement seem to have a strong association with mpg01 (lower values correspond to mpg01=1, meaning higher fuel efficiency).
  • Cylinders might also be a useful predictor.
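
The boxplot impressions can also be summarized numerically; a sketch (output omitted) of the median of each candidate predictor within the two mpg01 groups:

# Group medians: cars with mpg01 = 1 should show lower weight, horsepower,
# displacement, and cylinder counts
aggregate(cbind(weight, horsepower, displacement, cylinders) ~ mpg01,
          data = Auto, FUN = median)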

(c).

set.seed(42)  # Set seed for reproducibility
train <- sample(1:nrow(Auto), nrow(Auto) * 0.7)  # 70% training data
Auto.train <- Auto[train, ]
Auto.test <- Auto[-train, ]
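
A quick sanity check (a sketch, output omitted) that the random 70/30 split keeps the two mpg01 classes roughly balanced in both halves:

# Class counts in the training and test sets
table(Auto.train$mpg01)
table(Auto.test$mpg01)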

(d).

lda.fit <- lda(mpg01 ~ weight + horsepower + displacement, data=Auto.train)
lda.pred <- predict(lda.fit, Auto.test)
conf_matrix_lda <- table(lda.pred$class, Auto.test$mpg01)

# Compute test error
lda_accuracy <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
lda_error <- 1 - lda_accuracy

print(conf_matrix_lda)
##    
##      0  1
##   0 41  3
##   1  6 68
print(paste("LDA Test Error:", round(lda_error * 100, 2), "%"))
## [1] "LDA Test Error: 7.63 %"

(e).

qda.fit <- qda(mpg01 ~ weight + horsepower + displacement, data=Auto.train)
qda.pred <- predict(qda.fit, Auto.test)
conf_matrix_qda <- table(qda.pred$class, Auto.test$mpg01)

# Compute test error
qda_accuracy <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
qda_error <- 1 - qda_accuracy

print(conf_matrix_qda)
##    
##      0  1
##   0 42  4
##   1  5 67
print(paste("QDA Test Error:", round(qda_error * 100, 2), "%"))
## [1] "QDA Test Error: 7.63 %"

(f).

glm.fit <- glm(mpg01 ~ weight + horsepower + displacement, data=Auto.train, family=binomial)
glm.probs <- predict(glm.fit, Auto.test, type="response")
glm.pred <- ifelse(glm.probs > 0.5, 1, 0)
conf_matrix_glm <- table(glm.pred, Auto.test$mpg01)

# Compute test error
glm_accuracy <- sum(diag(conf_matrix_glm)) / sum(conf_matrix_glm)
glm_error <- 1 - glm_accuracy

print(conf_matrix_glm)
##         
## glm.pred  0  1
##        0 43  5
##        1  4 66
print(paste("Logistic Regression Test Error:", round(glm_error * 100, 2), "%"))
## [1] "Logistic Regression Test Error: 7.63 %"

(g).

nb.fit <- naiveBayes(mpg01 ~ weight + horsepower + displacement, data=Auto.train)
nb.pred <- predict(nb.fit, Auto.test)
conf_matrix_nb <- table(nb.pred, Auto.test$mpg01)

# Compute test error
nb_accuracy <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
nb_error <- 1 - nb_accuracy

print(conf_matrix_nb)
##        
## nb.pred  0  1
##       0 42  3
##       1  5 68
print(paste("Naive Bayes Test Error:", round(nb_error * 100, 2), "%"))
## [1] "Naive Bayes Test Error: 6.78 %"

(h).

# Prepare feature matrix and labels
train.X <- Auto.train[, c("weight", "horsepower", "displacement")]
test.X <- Auto.test[, c("weight", "horsepower", "displacement")]
train.Y <- Auto.train$mpg01

# Try different K values
k_values <- c(1, 3, 5, 10)
knn_results <- data.frame(K = k_values, Accuracy = NA, Error = NA)

for (i in 1:length(k_values)) {
  knn.pred <- knn(train.X, test.X, train.Y, k = k_values[i])
  conf_matrix_knn <- table(knn.pred, Auto.test$mpg01)
  
  # Compute test error
  knn_accuracy <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
  knn_error <- 1 - knn_accuracy
  
  knn_results$Accuracy[i] <- knn_accuracy
  knn_results$Error[i] <- knn_error
  
  print(paste("K =", k_values[i], "Test Error:", round(knn_error * 100, 2), "%"))
}
## [1] "K = 1 Test Error: 11.86 %"
## [1] "K = 3 Test Error: 11.02 %"
## [1] "K = 5 Test Error: 12.71 %"
## [1] "K = 10 Test Error: 15.25 %"
# Find best K
best_k <- knn_results$K[which.max(knn_results$Accuracy)]
print(paste("Best K value:", best_k))
## [1] "Best K value: 3"