library(ISLR2)
data(Weekly)
Assignment 3
Problem 13
Part A
summary(Weekly)
Year Lag1 Lag2 Lag3
Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
Lag4 Lag5 Volume Today
Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
Direction
Down:484
Up :605
cor(Weekly[, -9]) # exclude Direction, which is qualitative
Year Lag1 Lag2 Lag3 Lag4
Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
Lag5 Volume Today
Year -0.030519101 0.84194162 -0.032459894
Lag1 -0.008183096 -0.06495131 -0.075031842
Lag2 -0.072499482 -0.08551314 0.059166717
Lag3 0.060657175 -0.06928771 -0.071243639
Lag4 -0.075675027 -0.06107462 -0.007825873
Lag5 1.000000000 -0.05851741 0.011012698
Volume -0.058517414 1.00000000 -0.033077783
Today 0.011012698 -0.03307778 1.000000000
pairs(Weekly[, -9])
plot(Weekly$Volume, type = "l", main = "Volume Over Time", ylab = "Volume")
boxplot(Today ~ Direction, data = Weekly,
main = "Today’s Return vs Direction",
ylab = "Today",
xlab = "Market Direction")
Pairs plot: Shows that the lag variables (Lag1 to Lag5) don’t have strong patterns with the target variable Today, but some relationships between lag values themselves are slightly visible.
Volume over time: There’s a noticeable upward trend in trading volume over the years, especially after the mid-1990s, which could reflect increasing market activity.
Today’s return vs. direction: The boxplot shows that when the market goes Up, returns tend to be higher and more positive. When it goes Down, returns are more negative and spread out, with some extreme outliers.
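The upward trend in volume can be checked numerically: the Year–Volume correlation, already visible in the correlation matrix above, is about 0.84.
cor(Weekly$Year, Weekly$Volume)  # ~0.84, consistent with the trend in the plot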
Part B
# Fit logistic regression model
log_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                 data = Weekly,
                 family = binomial)
summary(log_model)
Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
Volume, family = binomial, data = Weekly)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.26686 0.08593 3.106 0.0019 **
Lag1 -0.04127 0.02641 -1.563 0.1181
Lag2 0.05844 0.02686 2.175 0.0296 *
Lag3 -0.01606 0.02666 -0.602 0.5469
Lag4 -0.02779 0.02646 -1.050 0.2937
Lag5 -0.01447 0.02638 -0.549 0.5833
Volume -0.02274 0.03690 -0.616 0.5377
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1496.2 on 1088 degrees of freedom
Residual deviance: 1486.4 on 1082 degrees of freedom
AIC: 1500.4
Number of Fisher Scoring iterations: 4
Statistically significant predictor: Lag2 (estimate = 0.05844, p-value = 0.0296). Since the p-value is below 0.05, Lag2 is the only predictor that is statistically significant at the 5% level.
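Since logistic regression coefficients are on the log-odds scale, exponentiating the Lag2 estimate gives an odds ratio, which makes the effect size easier to read; a quick check against the fitted model above:
exp(coef(log_model)["Lag2"])  # ~1.06: a one-unit rise in Lag2 multiplies the odds of "Up" by about 1.06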
Part C
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               family = binomial, data = Weekly)
# Step 1: Predict probabilities
glm.probs <- predict(glm.fit, type = "response")
# Step 2: Convert probabilities to "Up"/"Down"
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
# Step 3: Create confusion matrix
confusion <- table(Predicted = glm.pred, Actual = Weekly$Direction)
print(confusion)
Actual
Predicted Down Up
Down 54 48
Up 430 557
# Step 4: Compute accuracy
accuracy <- mean(glm.pred == Weekly$Direction)
print(accuracy)
[1] 0.5610652
The logistic regression model correctly predicted the market direction about 56% of the time. It mostly predicts “Up,” and while it gets many of those right, it performs poorly when the market actually goes “Down,” often misclassifying them. This suggests the model isn’t very reliable for detecting downward trends.
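The class-specific rates can be read straight off the confusion matrix above and confirm this imbalance:
557 / (557 + 48)  # "Up" recall (sensitivity): ~0.92
54 / (54 + 430)   # "Down" recall (specificity): ~0.11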
Part D
# Create training and test indicators
train <- Weekly$Year <= 2008
# Fit logistic regression on training data with Lag2 only
glm.fit <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
# Predict probabilities on test data (2009–2010)
glm.probs <- predict(glm.fit, newdata = Weekly[!train, ], type = "response")
# Convert probabilities to class labels
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
glm.pred <- factor(glm.pred, levels = levels(Weekly$Direction)) # ensure matching factor levels
# Actual values for test data
actual <- Weekly$Direction[!train]
# Confusion matrix
table(Predicted = glm.pred, Actual = actual)
Actual
Predicted Down Up
Down 9 5
Up 34 56
# Accuracy
mean(glm.pred == actual)
[1] 0.625
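For context, a naive rule that always predicts "Up" would already be right on 61 of the 104 test weeks (read from the confusion matrix above), so the Lag2-only model improves on that baseline only modestly:
(5 + 56) / (9 + 5 + 34 + 56)  # ~0.5865 vs. 0.625 for the fitted model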
Parts E through H
library(ISLR2)
library(MASS) # for LDA, QDA
Attaching package: 'MASS'
The following object is masked from 'package:ISLR2':
Boston
library(class) # for KNN
library(e1071) # for Naive Bayes
# Train/test split
train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008
# Training and test sets
train.X <- Weekly$Lag2[train]
test.X <- Weekly$Lag2[test]
train.Y <- Weekly$Direction[train]
test.Y <- Weekly$Direction[test]
# Must be data frames for LDA/QDA/Bayes
train.df <- data.frame(Direction = train.Y, Lag2 = train.X)
test.df <- data.frame(Lag2 = test.X)
LDA
lda.fit <- lda(Direction ~ Lag2, data = train.df)
lda.pred <- predict(lda.fit, test.df)$class
# Confusion matrix and accuracy
table(Predicted = lda.pred, Actual = test.Y)
Actual
Predicted Down Up
Down 9 5
Up 34 56
mean(lda.pred == test.Y)
[1] 0.625
QDA
qda.fit <- qda(Direction ~ Lag2, data = train.df)
qda.pred <- predict(qda.fit, test.df)$class
# Confusion matrix and accuracy
table(Predicted = qda.pred, Actual = test.Y)
Actual
Predicted Down Up
Down 0 0
Up 43 61
mean(qda.pred == test.Y)
[1] 0.5865385
KNN with K=1
train.X.knn <- as.matrix(train.X)
test.X.knn <- as.matrix(test.X)
knn.pred <- knn(train = train.X.knn, test = test.X.knn, cl = train.Y, k = 1)
# Confusion matrix and accuracy
table(Predicted = knn.pred, Actual = test.Y)
Actual
Predicted Down Up
Down 21 29
Up 22 32
mean(knn.pred == test.Y)
[1] 0.5096154
Naive Bayes
nb.fit <- naiveBayes(Direction ~ Lag2, data = train.df)
nb.pred <- predict(nb.fit, test.df)
# Confusion matrix and accuracy
table(Predicted = nb.pred, Actual = test.Y)
Actual
Predicted Down Up
Down 0 0
Up 43 61
mean(nb.pred == test.Y)
[1] 0.5865385
Logistic regression and LDA gave the best results, each correctly predicting about 62.5% of market directions in the test period. QDA and Naive Bayes predicted "Up" for every test week (58.7% accuracy), and KNN with K = 1 was barely better than chance (51.0%). Overall, logistic regression or LDA are the most reliable choices for this data.
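For reference, the test accuracies reported above can be collected into a single table (values copied from the outputs, not recomputed):
data.frame(
  Model    = c("Logistic (Lag2)", "LDA", "QDA", "KNN (K = 1)", "Naive Bayes"),
  Accuracy = c(0.6250, 0.6250, 0.5865, 0.5096, 0.5865)
)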
Part J
data("Weekly")
# Split data: train = before 2009, test = 2009–2010
train <- subset(Weekly, Year <= 2008)
test <- subset(Weekly, Year > 2008)
glm.fit <- glm(Direction ~ Lag1 * Lag2, data = train, family = binomial)
glm.probs <- predict(glm.fit, test, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
# Make prediction factors to match "Direction"
glm.pred <- factor(glm.pred, levels = levels(test$Direction))
# Confusion matrix
table(Predicted = glm.pred, Actual = test$Direction)
Actual
Predicted Down Up
Down 7 8
Up 36 53
# Accuracy
mean(glm.pred == test$Direction)
[1] 0.5769231
LDA with Lag2 and Volume (Transformed)
train$logVolume <- log(train$Volume + 1)
test$logVolume <- log(test$Volume + 1)
lda.fit <- lda(Direction ~ Lag2 + logVolume, data = train)
lda.pred <- predict(lda.fit, test)$class
table(Predicted = lda.pred, Actual = test$Direction)
Actual
Predicted Down Up
Down 17 19
Up 26 42
mean(lda.pred == test$Direction)
[1] 0.5673077
QDA with Transformed Lag1
train$sqrtLag1 <- sqrt(abs(train$Lag1))
test$sqrtLag1 <- sqrt(abs(test$Lag1))
qda.fit <- qda(Direction ~ sqrtLag1 + Lag2, data = train)
qda.pred <- predict(qda.fit, test)$class
table(Predicted = qda.pred, Actual = test$Direction)
Actual
Predicted Down Up
Down 0 0
Up 43 61
mean(qda.pred == test$Direction)
[1] 0.5865385
KNN with Optimized K
library(class)
train.X <- cbind(train$Lag2, train$Lag1)
test.X <- cbind(test$Lag2, test$Lag1)
train.Direction <- train$Direction
# Try k = 1 to 10
for (k in 1:10) {
  knn.pred <- knn(train.X, test.X, train.Direction, k = k)
  acc <- mean(knn.pred == test$Direction)
  cat("K =", k, "Accuracy =", acc, "\n")
}
K = 1 Accuracy = 0.4807692
K = 2 Accuracy = 0.4903846
K = 3 Accuracy = 0.5192308
K = 4 Accuracy = 0.5673077
K = 5 Accuracy = 0.4903846
K = 6 Accuracy = 0.4807692
K = 7 Accuracy = 0.5288462
K = 8 Accuracy = 0.5865385
K = 9 Accuracy = 0.5384615
K = 10 Accuracy = 0.4711538
best.knn <- knn(train.X, test.X, train.Direction, k = 6)
table(Predicted = best.knn, Actual = test$Direction)
Actual
Predicted Down Up
Down 23 31
Up 20 30
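One caveat: the accuracy listing above actually peaks at K = 8, so a programmatic selection is safer than picking K by eye; a small sketch reusing the objects above (class::knn breaks ties at random, so accuracies for a given K can shift slightly between runs):
accs <- sapply(1:10, function(k) mean(knn(train.X, test.X, train.Direction, k = k) == test$Direction))
which.max(accs)  # K = 8 in the run reported above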
Problem 14
Part A
library(ISLR2)
# Calculate the median of mpg
mpg_median <- median(Auto$mpg)
# Create binary variable: 1 if mpg > median, else 0
mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
# Add mpg01 to the Auto dataset
Auto.MPG01 <- data.frame(Auto, mpg01)
summary(Auto.MPG01)
mpg cylinders displacement horsepower weight
Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
acceleration year origin name
Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
(Other) :365
mpg01
Min. :0.0
1st Qu.:0.0
Median :0.5
Mean :0.5
3rd Qu.:1.0
Max. :1.0
Part B
boxplot(displacement ~ mpg01, data = Auto.MPG01, main = "Displacement vs mpg01", xlab = "mpg01", ylab = "Displacement")
boxplot(horsepower ~ mpg01, data = Auto.MPG01, main = "Horsepower vs mpg01", xlab = "mpg01", ylab = "Horsepower")
boxplot(weight ~ mpg01, data = Auto.MPG01, main = "Weight vs mpg01", xlab = "mpg01", ylab = "Weight")
boxplot(acceleration ~ mpg01, data = Auto.MPG01, main = "Acceleration vs mpg01", xlab = "mpg01", ylab = "Acceleration")
Cars with high mpg (mpg01 = 1) usually have lower displacement, horsepower, and weight. These three features show the biggest difference between high and low mpg cars. Acceleration also shows a small difference, but it’s not as strong. So, displacement, horsepower, and weight are the most useful for predicting fuel efficiency.
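The boxplot impressions can be checked numerically with grouped means; a quick sketch:
aggregate(cbind(displacement, horsepower, weight, acceleration) ~ mpg01,
          data = Auto.MPG01, FUN = mean)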
Part C
set.seed(1)
train_index <- sample(1:nrow(Auto.MPG01), 0.7 * nrow(Auto.MPG01))
train <- Auto.MPG01[train_index, ]
test <- Auto.MPG01[-train_index, ]
Part D
Step 1: Identify variables most associated with mpg01
From the boxplots in part (b), the strongest predictors of mpg01 appear to be:
displacement
horsepower
weight
acceleration
Step 2: Run LDA on training data
library(MASS)
lda.fit <- lda(mpg01 ~ displacement + horsepower + weight + acceleration, data = train)
Step 3: Predict on test data
lda.pred <- predict(lda.fit, test)
lda.class <- lda.pred$class
Step 4: Create confusion matrix & calculate test error
table(Predicted = lda.class, Actual = test$mpg01)
Actual
Predicted 0 1
0 47 1
1 14 56
mean(lda.class != test$mpg01) # test error rate
[1] 0.1271186
Part E
Run QDA
# Fit QDA model on training data
qda.model <- qda(mpg01 ~ displacement + horsepower + weight + acceleration, data = train)
# Predict on test set
qda.pred <- predict(qda.model, test)$class
# Create confusion matrix
table(Predicted = qda.pred, Actual = test$mpg01)
Actual
Predicted 0 1
0 49 5
1 12 52
# Calculate test error rate
mean(qda.pred != test$mpg01)
[1] 0.1440678
Part F
glm.fit <- glm(mpg01 ~ weight + horsepower + displacement + acceleration,
               data = train, family = binomial)
glm.probs <- predict(glm.fit, newdata = test, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, 1, 0)
table(Predicted = glm.pred, Actual = test$mpg01)
Actual
Predicted 0 1
0 53 4
1 8 53
mean(glm.pred != test$mpg01)
[1] 0.1016949
Part G
library(e1071)
nb.model <- naiveBayes(mpg01 ~ displacement + horsepower + weight + acceleration, data = train)
nb.pred <- predict(nb.model, test)
table(Predicted = nb.pred, Actual = test$mpg01)
Actual
Predicted 0 1
0 49 3
1 12 54
mean(nb.pred != test$mpg01)
[1] 0.1271186
Part H
# Load required library
library(class)
# Select only the relevant predictors
train.X <- train[, c("displacement", "horsepower", "weight", "acceleration")]
test.X <- test[, c("displacement", "horsepower", "weight", "acceleration")]
# Set up the response variable
train.Y <- train$mpg01
# Try different values of K and calculate test error
for (k in 1:10) {
  knn.pred <- knn(train.X, test.X, train.Y, k = k)
  error_rate <- mean(knn.pred != test$mpg01)
  cat("K =", k, "Test Error Rate =", round(error_rate, 4), "\n")
}
K = 1 Test Error Rate = 0.1356
K = 2 Test Error Rate = 0.1525
K = 3 Test Error Rate = 0.1102
K = 4 Test Error Rate = 0.1102
K = 5 Test Error Rate = 0.1271
K = 6 Test Error Rate = 0.1356
K = 7 Test Error Rate = 0.1271
K = 8 Test Error Rate = 0.1356
K = 9 Test Error Rate = 0.1441
K = 10 Test Error Rate = 0.1441
K = 3 and K = 4 gave the best results here, with the lowest test error rate of about 11%.
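The same pick can be made programmatically; a small sketch reusing the objects above (error rates for even K can vary slightly between runs because class::knn breaks ties at random):
errs <- sapply(1:10, function(k) mean(knn(train.X, test.X, train.Y, k = k) != test$mpg01))
which.min(errs)  # returns 3 here, the first K achieving the minimum error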
Problem 16
data("Boston")
median_crim <- median(Boston$crim)
Boston$crim01 <- ifelse(Boston$crim > median_crim, 1, 0)
Boston$crim01 <- as.factor(Boston$crim01)
set.seed(1)
train_index <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
log_model <- glm(crim01 ~ nox + rad + tax + dis + lstat, data = train, family = binomial)
log_probs <- predict(log_model, test, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
mean(log_preds != test$crim01)
[1] 0.1907895
lda_model <- lda(crim01 ~ nox + rad + tax + dis + lstat, data = train)
lda_preds <- predict(lda_model, test)$class
mean(lda_preds != test$crim01)
[1] 0.1513158
nb_model <- naiveBayes(crim01 ~ nox + rad + tax + dis + lstat, data = train)
nb_preds <- predict(nb_model, test)
mean(nb_preds != test$crim01)
[1] 0.1644737
# Normalize predictors
normalize <- function(x) (x - min(x)) / (max(x) - min(x))
Boston_norm <- as.data.frame(lapply(Boston[, c("nox", "rad", "tax", "dis", "lstat")], normalize))
Boston_norm$crim01 <- Boston$crim01
train_knn <- Boston_norm[train_index, ]
test_knn <- Boston_norm[-train_index, ]
# Try different K values
for (k in 1:10) {
  knn_pred <- knn(train_knn[, 1:5], test_knn[, 1:5], train_knn$crim01, k = k)
  cat("K =", k, "Test Error Rate =", mean(knn_pred != test_knn$crim01), "\n")
}
K = 1 Test Error Rate = 0.05921053
K = 2 Test Error Rate = 0.05921053
K = 3 Test Error Rate = 0.06578947
K = 4 Test Error Rate = 0.05921053
K = 5 Test Error Rate = 0.07236842
K = 6 Test Error Rate = 0.09868421
K = 7 Test Error Rate = 0.08552632
K = 8 Test Error Rate = 0.09210526
K = 9 Test Error Rate = 0.07236842
K = 10 Test Error Rate = 0.07894737
KNN clearly outperformed the other models in this case, especially with K = 1 or K = 2 (test error about 5.9%). That suggests the local patterns in these predictors are very strong for classifying crime rate, and a simple distance-based model captures them well. Logistic regression and LDA worked, but with noticeably higher error rates.
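For a side-by-side view, the test error rates gathered from the runs above (values copied from the outputs, not recomputed):
data.frame(
  Model     = c("Logistic", "LDA", "Naive Bayes", "KNN (K = 1)"),
  TestError = c(0.1908, 0.1513, 0.1645, 0.0592)
)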