library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
# Load the dataset
data(Weekly)
# Numerical and graphical summaries
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
par(mfrow = c(2, 2))
hist(Weekly$Volume, main = "Volume Histogram", col = "lightblue")
plot(Weekly$Lag1, Weekly$Lag2, col = as.numeric(Weekly$Direction),
main = "Lag1 vs Lag2", xlab = "Lag1", ylab = "Lag2")
boxplot(Weekly$Lag1 ~ Weekly$Direction, main = "Lag1 by Direction", col = c("red", "blue"))
Yes, some patterns can be observed in the Weekly dataset: - Slight Upward Market Trend: The number of “Up” weeks (605) is higher than “Down” weeks (484), suggesting that the market had a slightly bullish trend over the 21-year period. - Small Weekly Fluctuations with Occasional Extremes: The mean and median of Lag1 to Lag5 are close to zero, indicating that most weekly returns are small. However, the large minimum (-18.195) and maximum (12.026) values suggest that extreme weekly changes do occur. - Volume is Right-Skewed: The trading volume has a higher mean (1.57) than the median (1.00), suggesting that most weeks have moderate volume, but some weeks experience exceptionally high trading activity.
# Logistic regression
log_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(log_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
# Confusion matrix and accuracy
pred_probs <- predict(log_model, type = "response")
pred_classes <- ifelse(pred_probs > 0.5, "Up", "Down")
table(Predicted = pred_classes, Actual = Weekly$Direction)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
mean(pred_classes == Weekly$Direction)
## [1] 0.5610652
# Logistic regression with training data (1990-2008)
train <- Weekly$Year < 2009
test <- !train
log_model2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
pred_probs2 <- predict(log_model2, Weekly[test,], type = "response")
pred_classes2 <- ifelse(pred_probs2 > 0.5, "Up", "Down")
table(Predicted = pred_classes2, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
mean(pred_classes2 == Weekly$Direction[test])
## [1] 0.625
# (e) LDA
lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)
pred_lda <- predict(lda_model, Weekly[test,])
table(Predicted = pred_lda$class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
mean(pred_lda$class == Weekly$Direction[test])
## [1] 0.625
# (f) QDA
qda_model <- qda(Direction ~ Lag2, data = Weekly, subset = train)
pred_qda <- predict(qda_model, Weekly[test,])
table(Predicted = pred_qda$class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(pred_qda$class == Weekly$Direction[test])
## [1] 0.5865385
# (g) KNN with K=1
train_X <- Weekly[train, "Lag2", drop = FALSE]
test_X <- Weekly[test, "Lag2", drop = FALSE]
train_Y <- Weekly$Direction[train]
pred_knn <- knn(train_X, test_X, train_Y, k = 1)
table(Predicted = pred_knn, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 21 29
## Up 22 32
mean(pred_knn == Weekly$Direction[test])
## [1] 0.5096154
# (h) Naive Bayes
nb_model <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
pred_nb <- predict(nb_model, Weekly[test,])
table(Predicted = pred_nb, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(pred_nb == Weekly$Direction[test])
## [1] 0.5865385
# (i) Comparing models
accuracies <- c(
Logistic = mean(pred_classes2 == Weekly$Direction[test]),
LDA = mean(pred_lda$class == Weekly$Direction[test]),
QDA = mean(pred_qda$class == Weekly$Direction[test]),
KNN = mean(pred_knn == Weekly$Direction[test]),
NaiveBayes = mean(pred_nb == Weekly$Direction[test])
)
print(accuracies)
## Logistic LDA QDA KNN NaiveBayes
## 0.6250000 0.6250000 0.5865385 0.5096154 0.5865385
# (j) Experimenting with different predictors
log_model3 <- glm(Direction ~ Lag1 + Lag2 + I(Lag2^2),
data = Weekly, family = binomial, subset = train)
pred_probs3 <- predict(log_model3, Weekly[test,], type = "response")
pred_classes3 <- ifelse(pred_probs3 > 0.5, "Up", "Down")
table(Predicted = pred_classes3, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 8 11
## Up 35 50
mean(pred_classes3 == Weekly$Direction[test])
## [1] 0.5576923
library(ISLR2)
library(MASS)
library(class)
library(e1071)
library(ggplot2)
# Load the dataset
data(Auto)
# Assuming your dataset is named 'Auto' and the mpg column exists
# Step 1: Calculate the median of 'mpg'
mpg_median <- median(Auto$mpg)
# Step 2: Create the binary variable 'mpg01'
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
# Step 3: Create a new data frame containing 'mpg01' and other variables
Auto_with_mpg01 <- data.frame(Auto, mpg01 = Auto$mpg01)
# View the new dataset
head(Auto_with_mpg01)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01 mpg01.1
## 1 chevrolet chevelle malibu 0 0
## 2 buick skylark 320 0 0
## 3 plymouth satellite 0 0
## 4 amc rebel sst 0 0
## 5 ford torino 0 0
## 6 ford galaxie 500 0 0
# (b) Graphical exploration
par(mfrow = c(2, 2))
boxplot(Auto$horsepower ~ Auto$mpg01, main = "Horsepower vs mpg01", col = c("red", "blue"))
boxplot(Auto$weight ~ Auto$mpg01, main = "Weight vs mpg01", col = c("red", "blue"))
boxplot(Auto$acceleration ~ Auto$mpg01, main = "Acceleration vs mpg01", col = c("red", "blue"))
boxplot(Auto$displacement ~ Auto$mpg01, main = "Displacement vs mpg01", col = c("red", "blue"))
# (c) Splitting data into training and test sets
set.seed(42)
train_index <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
train <- Auto[train_index, ]
test <- Auto[-train_index, ]
# (d) LDA
lda_model <- lda(mpg01 ~ horsepower + weight + displacement, data = train)
pred_lda <- predict(lda_model, test)
table(Predicted = pred_lda$class, Actual = test$mpg01)
## Actual
## Predicted 0 1
## 0 41 3
## 1 6 68
lda_error <- mean(pred_lda$class != test$mpg01)
FP <- 3 # False Positives FN <- 6 # False Negatives Total <- 118 # Total observations test_error <- (FP + FN) / Total test_error= 0.076. - This means that the test error rate is approximately 7.6%. In other words, about 7.6% of the predictions made by the model are incorrect.
# (e) QDA
qda_model <- qda(mpg01 ~ horsepower + weight + displacement, data = train)
pred_qda <- predict(qda_model, test)
table(Predicted = pred_qda$class, Actual = test$mpg01)
## Actual
## Predicted 0 1
## 0 42 4
## 1 5 67
qda_error <- mean(pred_qda$class != test$mpg01)
FP <- 4 # False Positives FN <- 5 # False Negatives Total <- 118 # Total observations (sum of all entries in the confusion matrix) test_error <- (FP + FN) / Total test_error= 0.076
# (f) Logistic Regression
log_model <- glm(mpg01 ~ horsepower + weight + displacement, data = train, family = binomial)
pred_probs <- predict(log_model, test, type = "response")
pred_classes <- ifelse(pred_probs > 0.5, 1, 0)
table(Predicted = pred_classes, Actual = test$mpg01)
## Actual
## Predicted 0 1
## 0 43 5
## 1 4 66
log_error <- mean(pred_classes != test$mpg01)
FP <- 5 # False Positives FN <- 4 # False Negatives Total <- 118 # Total observations (sum of all entries in the confusion matrix) test_error_log_reg <- (FP + FN) / Total test_error_log_reg= 0.076
# (g) Naive Bayes
nb_model <- naiveBayes(mpg01 ~ horsepower + weight + displacement, data = train)
pred_nb <- predict(nb_model, test)
table(Predicted = pred_nb, Actual = test$mpg01)
## Actual
## Predicted 0 1
## 0 42 3
## 1 5 68
nb_error <- mean(pred_nb != test$mpg01)
FP <- 3 # False Positives FN <- 5 # False Negatives Total <- 118 # Total observations (sum of all entries in the confusion matrix) test_error_nb <- (FP + FN) / Total test_error_nb= 0.068
# (h) KNN with different K values
train_X <- train[, c("horsepower", "weight", "displacement")]
test_X <- test[, c("horsepower", "weight", "displacement")]
train_Y <- train$mpg01
test_Y <- test$mpg01
k_values <- c(1, 3, 5, 7, 9, 11)
k_errors <- sapply(k_values, function(k) {
pred_knn <- knn(train_X, test_X, train_Y, k = k)
mean(pred_knn != test_Y)
})
names(k_errors) <- k_values
# Print errors for all models
errors <- c(LDA = lda_error, QDA = qda_error, Logistic = log_error, NaiveBayes = nb_error, k_errors)
print(errors)
## LDA QDA Logistic NaiveBayes 1 3 5
## 0.07627119 0.07627119 0.07627119 0.06779661 0.11864407 0.11016949 0.12711864
## 7 9 11
## 0.12711864 0.15254237 0.16101695