Statistical Learning - Lab 5

Question 13

library(ISLR2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
library(class)
library(e1071)
# Load Weekly dataset
data("Weekly")

# View first few rows
head(Weekly)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down

(a).

# Check structure of data
str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
# Summary statistics
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
# Scatterplot matrix of the numerical variables
pairs(Weekly[, -9], main="Scatterplot Matrix for Weekly Data")

# Plot Volume over time
plot(Weekly$Year, Weekly$Volume, type="l", main="Weekly Volume Over Time", xlab="Year", ylab="Volume")

# Histogram of Weekly Returns
hist(Weekly$Today, breaks=30, main="Histogram of Weekly Returns", xlab="Weekly Returns", col="lightblue")

Based on the numerical and graphical summaries, the following patterns can be observed:

1. Scatterplot Matrix Observations

  • The scatterplot matrix does not show strong linear relationships between the predictor variables (Lag1 to Lag5) and the response variable (Today or Direction).

  • Most points appear to be randomly scattered, indicating weak correlations between the lag variables and weekly returns.

  • However, any weak trends in individual lag variables are hard to judge by eye and need formal testing, such as the logistic regression fit in part (b).

2. Summary Statistics Observations

  • The dataset consists of 605 “Up” movements and 484 “Down” movements, suggesting a slightly higher frequency of upward market movements.

  • The Volume variable has increased over time, as seen in the time-series plot.

3. Weekly Volume Over Time

  • The trading volume has increased significantly over the years, especially after 2005.

  • There are noticeable spikes in 2008–2010, which may correspond to financial events such as the 2008 financial crisis.

4. Histogram of Weekly Returns

  • The histogram shows that weekly returns are approximately normally distributed, with most values centered around 0.

  • There is a slight left skew (the mean of 0.15 sits below the median of 0.24), indicating that large negative movements occur occasionally even though most returns are close to zero.

  • The distribution has some extreme values (outliers) on both ends.
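
A quick numerical check of these pairwise relationships (a sketch, not part of the original output): among the numeric variables, the only substantial correlation is between Year and Volume, consistent with the time-series plot above.

# Correlation matrix of the numeric columns (Direction, column 9, excluded)
round(cor(Weekly[, -9]), 3)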

(b).

# Fit logistic regression model
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, 
               data=Weekly, family=binomial)

# Print summary
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
# Identifying significant predictors (p-value < 0.05)
significant_predictors <- summary(glm.fit)$coefficients[,4] < 0.05
names(significant_predictors)[significant_predictors]
## [1] "(Intercept)" "Lag2"

From the logistic regression output:

  • Lag2 (p = 0.0296, significant at 5% level)

  • Intercept (p = 0.0019, highly significant)

All other predictors (Lag1, Lag3, Lag4, Lag5, Volume) have p-values > 0.05, meaning they are not statistically significant.
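
On the odds scale, the Lag2 estimate of 0.05844 means each one-unit increase in Lag2 multiplies the odds of an “Up” week by exp(0.05844) ≈ 1.06, a modest effect. A one-line check:

# Odds ratio for Lag2 (approx. 1.06, i.e., ~6% higher odds of "Up" per unit of Lag2)
exp(coef(glm.fit)["Lag2"])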

(c).

# Predict probabilities
glm.probs <- predict(glm.fit, type="response")

# Convert to class labels
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Create confusion matrix
conf_matrix <- table(glm.pred, Weekly$Direction)
print(conf_matrix)
##         
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557
# Compute accuracy
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Overall Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Overall Accuracy: 56.11 %"

Confusion Matrix Interpretation

  • True Negatives (TN) = 54 → Correctly predicted “Down”.

  • False Negatives (FN) = 48 → Market went “Up,” but predicted “Down” (missed opportunities).

  • False Positives (FP) = 430 → Market went “Down,” but predicted “Up” (high mistake rate).

  • True Positives (TP) = 557 → Correctly predicted “Up”.

  • Overall Accuracy = 56.11% → only marginally better than a naive rule that always predicts “Up” (605/1089 ≈ 55.6%), and barely better than random guessing (50%).

  • Key Issue: High false positive rate (430 FP) → The model frequently predicts “Up” when the market actually goes “Down”.
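
The asymmetry shows up clearly in the class-wise rates, computed here from the confusion matrix above (treating “Up” as the positive class):

# Sensitivity and specificity from the training confusion matrix
sensitivity <- 557 / (557 + 48)   # TPR ~ 92.1%: almost every "Up" week is caught
specificity <- 54 / (54 + 430)    # TNR ~ 11.2%: almost no "Down" week is caught
c(sensitivity = sensitivity, specificity = specificity)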

(d).

# Split data into training (1990-2008) and test (2009-2010)
train <- Weekly$Year < 2009
Weekly.train <- Weekly[train, ]
Weekly.test <- Weekly[!train, ]

# Fit logistic regression with Lag2 as predictor
glm.fit2 <- glm(Direction ~ Lag2, data=Weekly.train, family=binomial)

# Predict on test set
glm.probs2 <- predict(glm.fit2, Weekly.test, type="response")
glm.pred2 <- ifelse(glm.probs2 > 0.5, "Up", "Down")

# Confusion matrix
conf_matrix2 <- table(glm.pred2, Weekly.test$Direction)
print(conf_matrix2)
##          
## glm.pred2 Down Up
##      Down    9  5
##      Up     34 56
# Accuracy
accuracy2 <- sum(diag(conf_matrix2)) / sum(conf_matrix2)
print(paste("Accuracy for Logistic Regression (Lag2 only):", round(accuracy2 * 100, 2), "%"))
## [1] "Accuracy for Logistic Regression (Lag2 only): 62.5 %"

(e).

lda.fit <- lda(Direction ~ Lag2, data=Weekly.train)
lda.pred <- predict(lda.fit, Weekly.test)
conf_matrix_lda <- table(lda.pred$class, Weekly.test$Direction)

print(conf_matrix_lda)
##       
##        Down Up
##   Down    9  5
##   Up     34 56
accuracy_lda <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(paste("Accuracy for LDA:", round(accuracy_lda * 100, 2), "%"))
## [1] "Accuracy for LDA: 62.5 %"

(f).

qda.fit <- qda(Direction ~ Lag2, data=Weekly.train)
qda.pred <- predict(qda.fit, Weekly.test)
conf_matrix_qda <- table(qda.pred$class, Weekly.test$Direction)

print(conf_matrix_qda)
##       
##        Down Up
##   Down    0  0
##   Up     43 61
accuracy_qda <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
print(paste("Accuracy for QDA:", round(accuracy_qda * 100, 2), "%"))
## [1] "Accuracy for QDA: 58.65 %"

(g).

# Create feature matrix and labels
train.X <- Weekly.train$Lag2
test.X <- Weekly.test$Lag2
train.Y <- Weekly.train$Direction

# Use KNN with k=1
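# (note: knn() breaks distance ties at random; calling set.seed() first makes the result reproducible)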
knn.pred <- knn(as.matrix(train.X), as.matrix(test.X), train.Y, k=1)

# Confusion matrix
conf_matrix_knn <- table(knn.pred, Weekly.test$Direction)

print(conf_matrix_knn)
##         
## knn.pred Down Up
##     Down   21 29
##     Up     22 32
accuracy_knn <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
print(paste("Accuracy for KNN (k=1):", round(accuracy_knn * 100, 2), "%"))
## [1] "Accuracy for KNN (k=1): 50.96 %"

(h).

nb.fit <- naiveBayes(Direction ~ Lag2, data=Weekly.train)
nb.pred <- predict(nb.fit, Weekly.test)
conf_matrix_nb <- table(nb.pred, Weekly.test$Direction)

print(conf_matrix_nb)
##        
## nb.pred Down Up
##    Down    0  0
##    Up     43 61
accuracy_nb <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(paste("Accuracy for Naive Bayes:", round(accuracy_nb * 100, 2), "%"))
## [1] "Accuracy for Naive Bayes: 58.65 %"

(i).

# Print all accuracy results
accuracy_results <- data.frame(
  Method = c("Logistic Regression", "LDA", "QDA", "KNN (k=1)", "Naive Bayes"),
  Accuracy = c(accuracy2, accuracy_lda, accuracy_qda, accuracy_knn, accuracy_nb)
)

print(accuracy_results)
##                Method  Accuracy
## 1 Logistic Regression 0.6250000
## 2                 LDA 0.6250000
## 3                 QDA 0.5865385
## 4           KNN (k=1) 0.5096154
## 5         Naive Bayes 0.5865385
  • Logistic Regression and LDA (both 62.5%) provide the best results in terms of accuracy.

  • QDA and Naive Bayes perform slightly worse (~58.65% accuracy).

  • KNN (k=1) performs the worst (~51%), barely better than a coin flip, indicating poor predictive power for this dataset.

Thus, Logistic Regression or LDA would be the best choices for this dataset.

(j).

# Try using multiple predictors in logistic regression
glm.fit3 <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Volume, data=Weekly.train, family=binomial)
glm.probs3 <- predict(glm.fit3, Weekly.test, type="response")
glm.pred3 <- ifelse(glm.probs3 > 0.5, "Up", "Down")

conf_matrix3 <- table(glm.pred3, Weekly.test$Direction)
accuracy3 <- sum(diag(conf_matrix3)) / sum(conf_matrix3)

print(paste("Accuracy for Logistic Regression (Lag1, Lag2, Lag3, Volume):", round(accuracy3 * 100, 2), "%"))
## [1] "Accuracy for Logistic Regression (Lag1, Lag2, Lag3, Volume): 51.92 %"
  • The Lag2-only logistic regression and LDA models from parts (d) and (e) remain the best performers (~62.5% accuracy).
  • Adding Lag1, Lag3, and Volume as predictors reduces test accuracy to 51.92%, so the extra variables hurt rather than help.
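
Part (j) invites further experimentation. Two natural follow-ups, sketched below but not run as part of the original lab, are an interaction term in the logistic regression and a sweep over K for KNN:

# Hypothetical variant: logistic regression with a Lag1:Lag2 interaction
glm.fit4 <- glm(Direction ~ Lag2 + Lag1:Lag2, data=Weekly.train, family=binomial)
glm.pred4 <- ifelse(predict(glm.fit4, Weekly.test, type="response") > 0.5, "Up", "Down")
mean(glm.pred4 == Weekly.test$Direction)  # test accuracy

# Hypothetical variant: KNN on Lag2 for several values of K
for (k in c(1, 3, 5, 10)) {
  pred <- knn(as.matrix(Weekly.train$Lag2), as.matrix(Weekly.test$Lag2),
              Weekly.train$Direction, k = k)
  print(paste("K =", k, "Accuracy:", round(mean(pred == Weekly.test$Direction) * 100, 2), "%"))
}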

Question 14

(a).

# Load the Auto dataset
data("Auto")

# Create the binary variable mpg01
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)

# Convert mpg01 to a factor (binary classification)
Auto$mpg01 <- as.factor(Auto$mpg01)

# View dataset with new mpg01 column
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name mpg01
## 1 chevrolet chevelle malibu     0
## 2         buick skylark 320     0
## 3        plymouth satellite     0
## 4             amc rebel sst     0
## 5               ford torino     0
## 6          ford galaxie 500     0
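
By construction, splitting at the median produces balanced classes (the summary in part (b) confirms 196 cars in each group); a quick check:

# mpg01 is balanced by construction of the median split
table(Auto$mpg01)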

(b).

# Summary of dataset
summary(Auto)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365  
##  mpg01  
##  0:196  
##  1:196  
##         
##         
##         
##         
## 
# Pairs plot to visualize how the candidate predictors relate to mpg01
pairs(Auto[, c("mpg01", "cylinders", "displacement", "horsepower", "weight", "acceleration")])

# Boxplots to compare mpg01 with other variables
boxplot(Auto$weight ~ Auto$mpg01, main="Weight vs mpg01", xlab="mpg01", ylab="Weight", col=c("red", "blue"))

boxplot(Auto$horsepower ~ Auto$mpg01, main="Horsepower vs mpg01", xlab="mpg01", ylab="Horsepower", col=c("red", "blue"))

boxplot(Auto$displacement ~ Auto$mpg01, main="Displacement vs mpg01", xlab="mpg01", ylab="Displacement", col=c("red", "blue"))

# Correlation matrix (using the numeric mpg, since mpg01 is a factor)
cor(Auto[, c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration")])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
##              acceleration
## mpg             0.4233285
## cylinders      -0.5046834
## displacement   -0.5438005
## horsepower     -0.6891955
## weight         -0.4168392
## acceleration    1.0000000
  • Weight, horsepower, and displacement seem to have a strong association with mpg01 (lower values correspond to mpg01=1, meaning higher fuel efficiency).
  • Cylinders might also be a useful predictor.
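
The boxplot impressions can also be summarized numerically; a sketch (output omitted) of the median of each candidate predictor within the two mpg01 groups:

# Group medians: cars with mpg01 = 1 should show lower weight, horsepower,
# displacement, and cylinder counts
aggregate(cbind(weight, horsepower, displacement, cylinders) ~ mpg01,
          data = Auto, FUN = median)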

(c).

set.seed(42)  # Set seed for reproducibility
train <- sample(1:nrow(Auto), nrow(Auto) * 0.7)  # 70% training data
Auto.train <- Auto[train, ]
Auto.test <- Auto[-train, ]
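
A quick sanity check (a sketch, output omitted) that the random 70/30 split keeps the two mpg01 classes roughly balanced in both halves:

# Class counts in the training and test sets
table(Auto.train$mpg01)
table(Auto.test$mpg01)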

(d).

lda.fit <- lda(mpg01 ~ weight + horsepower + displacement, data=Auto.train)
lda.pred <- predict(lda.fit, Auto.test)
conf_matrix_lda <- table(lda.pred$class, Auto.test$mpg01)

# Compute test error
lda_accuracy <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
lda_error <- 1 - lda_accuracy

print(conf_matrix_lda)
##    
##      0  1
##   0 41  3
##   1  6 68
print(paste("LDA Test Error:", round(lda_error * 100, 2), "%"))
## [1] "LDA Test Error: 7.63 %"

(e).

qda.fit <- qda(mpg01 ~ weight + horsepower + displacement, data=Auto.train)
qda.pred <- predict(qda.fit, Auto.test)
conf_matrix_qda <- table(qda.pred$class, Auto.test$mpg01)

# Compute test error
qda_accuracy <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
qda_error <- 1 - qda_accuracy

print(conf_matrix_qda)
##    
##      0  1
##   0 42  4
##   1  5 67
print(paste("QDA Test Error:", round(qda_error * 100, 2), "%"))
## [1] "QDA Test Error: 7.63 %"

(f).

glm.fit <- glm(mpg01 ~ weight + horsepower + displacement, data=Auto.train, family=binomial)
glm.probs <- predict(glm.fit, Auto.test, type="response")
glm.pred <- ifelse(glm.probs > 0.5, 1, 0)
conf_matrix_glm <- table(glm.pred, Auto.test$mpg01)

# Compute test error
glm_accuracy <- sum(diag(conf_matrix_glm)) / sum(conf_matrix_glm)
glm_error <- 1 - glm_accuracy

print(conf_matrix_glm)
##         
## glm.pred  0  1
##        0 43  5
##        1  4 66
print(paste("Logistic Regression Test Error:", round(glm_error * 100, 2), "%"))
## [1] "Logistic Regression Test Error: 7.63 %"

(g).

nb.fit <- naiveBayes(mpg01 ~ weight + horsepower + displacement, data=Auto.train)
nb.pred <- predict(nb.fit, Auto.test)
conf_matrix_nb <- table(nb.pred, Auto.test$mpg01)

# Compute test error
nb_accuracy <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
nb_error <- 1 - nb_accuracy

print(conf_matrix_nb)
##        
## nb.pred  0  1
##       0 42  3
##       1  5 68
print(paste("Naive Bayes Test Error:", round(nb_error * 100, 2), "%"))
## [1] "Naive Bayes Test Error: 6.78 %"

(h).

# Prepare feature matrix and labels
train.X <- Auto.train[, c("weight", "horsepower", "displacement")]
test.X <- Auto.test[, c("weight", "horsepower", "displacement")]
train.Y <- Auto.train$mpg01

# Try different K values
k_values <- c(1, 3, 5, 10)
knn_results <- data.frame(K = k_values, Accuracy = NA, Error = NA)

for (i in 1:length(k_values)) {
  knn.pred <- knn(train.X, test.X, train.Y, k = k_values[i])
  conf_matrix_knn <- table(knn.pred, Auto.test$mpg01)
  
  # Compute test error
  knn_accuracy <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
  knn_error <- 1 - knn_accuracy
  
  knn_results$Accuracy[i] <- knn_accuracy
  knn_results$Error[i] <- knn_error
  
  print(paste("K =", k_values[i], "Test Error:", round(knn_error * 100, 2), "%"))
}
## [1] "K = 1 Test Error: 11.86 %"
## [1] "K = 3 Test Error: 11.02 %"
## [1] "K = 5 Test Error: 12.71 %"
## [1] "K = 10 Test Error: 15.25 %"
# Find best K
best_k <- knn_results$K[which.max(knn_results$Accuracy)]
print(paste("Best K value:", best_k))
## [1] "Best K value: 3"