library(ISLR2)
library(corrplot)
## corrplot 0.95 loaded
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Keeping only the numeric columns (this drops Direction but keeps Year):
weekly_numeric <- Weekly[, sapply(Weekly, is.numeric)]
# Correlation matrix:
cor_matrix <- cor(weekly_numeric)
# Correlation plot:
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black",
number.cex = 0.7)
Here, a summary and a correlation plot of the Weekly data set have been produced to obtain numerical and graphical summaries. From the correlation plot, we can clearly see that the variables “Year” and “Volume” have a strong positive linear relationship, while the remaining pairwise correlations are close to zero. “Volume” represents the average number of shares traded daily, in billions.
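Since the Year–Volume relationship is the main pattern here, a quick plot of Volume over time (a minimal sketch using base graphics, not part of the original output) makes it concrete:
# Sketch: weekly trading volume over time; the upward trend mirrors the Year-Volume correlation
plot(Weekly$Year, Weekly$Volume,
     xlab = "Year", ylab = "Volume (billions of shares)",
     main = "Weekly Trading Volume over Time")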
# Logistic regression model:
logistic_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(logistic_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
From the above results, we can see that “Lag2” appears to be statistically significant, as evidenced by its small p-value of 0.0296, which is less than α = 0.05. All other predictors are statistically insignificant at that level.
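As a rough sketch of how to read the Lag2 estimate, we can put it on the odds scale (this interpretation step is an addition, not part of the original output):
# Odds ratio for Lag2: a one-unit increase in Lag2 multiplies the odds of an "Up" week by this factor
exp(coef(logistic_model)["Lag2"])
# Approximate (Wald) 95% confidence interval on the odds-ratio scale
exp(confint.default(logistic_model, "Lag2"))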
probabilities <- predict(logistic_model, type = "response")
predictions <- ifelse(probabilities > 0.5, "Up", "Down")
# Confusion matrix:
conf_matrix <- table(Predicted = predictions, Actual = Weekly$Direction)
print(conf_matrix)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Overall Accuracy:", accuracy, "\n")
## Overall Accuracy: 0.5610652
From the above results, the logistic model correctly predicts 557 Up weeks and 54 Down weeks, but it misclassifies 48 Up weeks as Down and 430 Down weeks as Up.
This means the model does a good job of predicting Up (557 correct) while performing very poorly on Down (430 wrong).
Although the overall accuracy, the fraction of correct predictions, is above 50%, this figure can be misleading because one class (Up) dominates the predictions. Looking at each class separately, the model correctly predicts Up weeks at a rate of 557/(557 + 48) = 0.92 but Down weeks at a rate of only 54/(430 + 54) = 0.11. This pattern reflects the class imbalance in the data (605 Up weeks versus 484 Down weeks) together with the model’s strong tendency to predict Up.
Hence, the logistic model leans heavily toward predicting one class at the expense of the other, which is the main weakness the confusion matrix reveals.
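To make the per-class rates quoted above reproducible, a small helper (a sketch; the name class_rates is introduced here only for illustration) can compute them directly from the confusion matrix:
# Sketch of a helper that computes per-class correct-prediction rates
# from a confusion matrix with rows = Predicted and columns = Actual
class_rates <- function(cm) {
  c(Up   = cm["Up", "Up"]     / sum(cm[, "Up"]),    # fraction of actual Up weeks predicted Up
    Down = cm["Down", "Down"] / sum(cm[, "Down"]))  # fraction of actual Down weeks predicted Down
}
class_rates(conf_matrix)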
train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008
# Logistic Model with only Lag2:
logistic_model_lag2 <- glm(Direction ~ Lag2, data = Weekly, subset = train, family = binomial)
# Predicting probabilities on test data (2009-2010):
prob_test <- predict(logistic_model_lag2, newdata = Weekly[test, ], type = "response")
pred_test <- ifelse(prob_test > 0.5, "Up", "Down")
# Confusion matrix:
conf_matrix_test <- table(Predicted = pred_test, Actual = Weekly$Direction[test])
print(conf_matrix_test)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
cat("Test Set Accuracy:", accuracy_test, "\n")
## Test Set Accuracy: 0.625
From the above results, the logistic model with only “Lag2” correctly predicts 56 Up weeks and 9 Down weeks, but it misclassifies 5 Up weeks as Down and 34 Down weeks as Up.
The test accuracy is now 62.5% using only “Lag2”, which is a little better than before. Looking at each class separately, the model correctly predicts Up weeks at a rate of 56/(56 + 5) = 0.918 and Down weeks at a rate of only 9/(9 + 34) = 0.21.
The model continues to predict Up far more accurately than Down, but the overall performance has improved, making this model more effective than the previous one.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_model, newdata = Weekly[test, ])
# Confusion matrix:
conf_matrix <- table(Predicted = lda_pred$class, Actual = Weekly$Direction[test])
print(conf_matrix)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy <- mean(lda_pred$class == Weekly$Direction[test])
print(paste("Overall Accuracy:", round(accuracy, 3)))
## [1] "Overall Accuracy: 0.625"
After using LDA, the model yields essentially the same results as the logistic regression in (d): the confusion matrix and the 62.5% test accuracy are identical.
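Since the confusion matrix and accuracy match (d) exactly, a quick cross-tabulation (a sketch) can confirm that the LDA and logistic test predictions agree week by week:
# Cross-tabulate the logistic and LDA test predictions; zero off-diagonal counts mean full agreement
table(Logistic = pred_test, LDA = lda_pred$class)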
qda_model <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_model, newdata = Weekly[test, ])
# Confusion matrix:
conf_matrix_qda <- table(Predicted = qda_pred$class, Actual = Weekly$Direction[test])
print(conf_matrix_qda)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
accuracy_qda <- mean(qda_pred$class == Weekly$Direction[test])
print(paste("Overall Accuracy:", round(accuracy_qda, 3)))
## [1] "Overall Accuracy: 0.587"
Here, we obtain an interesting result. The QDA model has an accuracy of approximately 59%, which is lower than that of LDA. Moreover, QDA predicts only Up and never predicts Down. It captures the actual Up weeks better than the LDA model does, but completely ignoring the Down class calls the robustness of the QDA model into question.
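One way to see why QDA never predicts Down here is to inspect the posterior probabilities it assigns to the test weeks (a quick sketch):
# Summary of the QDA posterior probability of "Up" for the 2009-2010 weeks;
# if the minimum exceeds 0.5, every week is classified as Up
summary(qda_pred$posterior[, "Up"])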
library(class)
knn_pred <- knn(train = as.matrix(Weekly[train, c("Lag2")]),
test = as.matrix(Weekly[test, c("Lag2")]),
cl = Weekly$Direction[train],
k = 1)
# Confusion matrix:
confusion_matrix_knn <- table(Predicted = knn_pred, Actual = Weekly$Direction[test])
print(confusion_matrix_knn)
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
accuracy_knn <- sum(diag(confusion_matrix_knn)) / sum(confusion_matrix_knn)
print(paste("Overall Accuracy: ", round(accuracy_knn, 3)))
## [1] "Overall Accuracy: 0.5"
From the above results, the KNN model with k = 1 achieves an accuracy of 50%, which is essentially no better than random guessing: the number of correct predictions equals the number of incorrect ones. This does not make the model useful, since it provides no real improvement in accuracy.
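Note that knn() breaks distance ties at random, so the k = 1 result can vary slightly between runs; setting a seed before the call (a sketch of the same fit, made reproducible) pins it down:
# Same k = 1 fit as above, but with a seed so the random tie-breaking is reproducible
set.seed(1)
knn_pred_seeded <- knn(train = as.matrix(Weekly[train, "Lag2"]),
                       test  = as.matrix(Weekly[test, "Lag2"]),
                       cl = Weekly$Direction[train], k = 1)
mean(knn_pred_seeded == Weekly$Direction[test])  # test accuracy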
library(e1071)
Weekly$Lag2 <- as.numeric(Weekly$Lag2)
nb_model <- naiveBayes(Direction ~ Lag2, data = Weekly[train, ])
# Extracting the test data and ensuring Lag2 is numeric
test_data <- Weekly[test, , drop = FALSE]
test_data$Lag2 <- as.numeric(test_data$Lag2)
pred_nb <- predict(nb_model, newdata = test_data)
# Confusion Matrix:
confusion_nb <- table(Predicted = pred_nb, Actual = Weekly[test, "Direction"])
print(confusion_nb)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
accuracy_nb <- sum(diag(confusion_nb)) / sum(confusion_nb)
cat("Overall Accuracy:", accuracy_nb, "\n")
## Overall Accuracy: 0.5865385
From the above results, we can say that naive Bayes yields results very similar to those of QDA: it also predicts Up for every test week and reaches about 58.7% accuracy.
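This similarity is expected: with a single predictor, Gaussian naive Bayes and QDA both fit a class-specific normal distribution to Lag2, so they estimate essentially the same model. A quick sketch comparing the fitted parameters:
# Class-conditional mean and sd of Lag2 estimated by naive Bayes...
nb_model$tables$Lag2
# ...versus the per-class means estimated by QDA (variances are likewise estimated per class)
qda_model$means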
Since the logistic regression with only “Lag2” and LDA both achieved 62.5% test accuracy, they appear to be the best methods on this data.
After “Lag2”, the predictor with the next smallest p-value in the full logistic fit is “Lag1”, so let’s experiment with using “Lag1” and “Lag2” together as predictors.
Weekly$Lag1 <- as.numeric(Weekly$Lag1)
Weekly$Lag2 <- as.numeric(Weekly$Lag2)
train <- Weekly$Year < 2009
test <- !train
# Logistic Regression:
logistic_model <- glm(Direction ~ Lag1 + Lag2, data = Weekly[train, ], family = "binomial")
prob_logistic <- predict(logistic_model, newdata = Weekly[test, ], type = "response")
pred_logistic <- ifelse(prob_logistic > 0.5, "Up", "Down")
# Confusion Matrix and Accuracy for Logistic Regression:
confusion_logistic <- table(Predicted = pred_logistic, Actual = Weekly[test, "Direction"])
accuracy_logistic <- sum(diag(confusion_logistic)) / sum(confusion_logistic)
cat("Logistic Regression\n")
## Logistic Regression
print(confusion_logistic)
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
cat("Overall Accuracy:", accuracy_logistic, "\n\n")
## Overall Accuracy: 0.5769231
# LDA:
lda_model <- lda(Direction ~ Lag1 + Lag2, data = Weekly[train, ])
pred_lda <- predict(lda_model, newdata = Weekly[test, ])$class
# Confusion Matrix and Accuracy for LDA:
confusion_lda <- table(Predicted = pred_lda, Actual = Weekly[test, "Direction"])
accuracy_lda <- sum(diag(confusion_lda)) / sum(confusion_lda)
cat("LDA\n")
## LDA
print(confusion_lda)
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
cat("Overall Accuracy:", accuracy_lda, "\n\n")
## Overall Accuracy: 0.5769231
# QDA:
qda_model <- qda(Direction ~ Lag1 + Lag2, data = Weekly[train, ])
pred_qda <- predict(qda_model, newdata = Weekly[test, ])$class
# Confusion Matrix and Accuracy for QDA:
confusion_qda <- table(Predicted = pred_qda, Actual = Weekly[test, "Direction"])
accuracy_qda <- sum(diag(confusion_qda)) / sum(confusion_qda)
cat("QDA\n")
## QDA
print(confusion_qda)
## Actual
## Predicted Down Up
## Down 7 10
## Up 36 51
cat("Overall Accuracy:", accuracy_qda, "\n\n")
## Overall Accuracy: 0.5576923
# Experimenting with different values of K in KNN:
k_values <- c(1, 3, 5, 7, 9) # Different values for k
accuracy_knn <- numeric(length(k_values))
for (k in k_values) {
knn_model <- knn(train = Weekly[train, c("Lag1", "Lag2")],
test = Weekly[test, c("Lag1", "Lag2")],
cl = Weekly[train, "Direction"],
k = k)
# Confusion Matrix and Accuracy for KNN:
confusion_knn <- table(Predicted = knn_model, Actual = Weekly[test, "Direction"])
accuracy_knn[k == k_values] <- sum(diag(confusion_knn)) / sum(confusion_knn)
cat("KNN (k =", k, ")\n")
print(confusion_knn)
cat("Overall Accuracy:", accuracy_knn[k == k_values], "\n\n")
}
## KNN (k = 1 )
## Actual
## Predicted Down Up
## Down 18 29
## Up 25 32
## Overall Accuracy: 0.4807692
##
## KNN (k = 3 )
## Actual
## Predicted Down Up
## Down 22 29
## Up 21 32
## Overall Accuracy: 0.5192308
##
## KNN (k = 5 )
## Actual
## Predicted Down Up
## Down 22 32
## Up 21 29
## Overall Accuracy: 0.4903846
##
## KNN (k = 7 )
## Actual
## Predicted Down Up
## Down 22 28
## Up 21 33
## Overall Accuracy: 0.5288462
##
## KNN (k = 9 )
## Actual
## Predicted Down Up
## Down 18 23
## Up 25 38
## Overall Accuracy: 0.5384615
# Naive Bayes:
nb_model <- naiveBayes(Direction ~ Lag1 + Lag2, data = Weekly[train, ])
pred_nb <- predict(nb_model, newdata = Weekly[test, ])
# Confusion Matrix and Accuracy for Naive Bayes:
confusion_nb <- table(Predicted = pred_nb, Actual = Weekly[test, "Direction"])
accuracy_nb <- sum(diag(confusion_nb)) / sum(confusion_nb)
cat("Naive Bayes\n")
## Naive Bayes
print(confusion_nb)
## Actual
## Predicted Down Up
## Down 3 8
## Up 40 53
cat("Overall Accuracy:", accuracy_nb, "\n")
## Overall Accuracy: 0.5384615
From the above results, we can clearly see that the highest accuracy is achieved by the logistic regression and LDA models, at roughly 58%; QDA follows with approximately 56%. Even with different values of k, KNN does not reach a higher accuracy. An interesting pattern is that all models besides KNN predict Up much more accurately than Down, while KNN, as k increases, keeps hovering around 50% without any clear improvement.
Hence, logistic regression and LDA provide the best results even when “Lag1” is used alongside “Lag2”.
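The models above use Lag1 and Lag2 additively; an explicit interaction term could also be tried. A minimal sketch, fit on the same 1990-2008 training split (object names here are illustrative, and the resulting accuracy is not reported above):
# Logistic regression with a Lag1 x Lag2 interaction (sketch)
logistic_int <- glm(Direction ~ Lag1 * Lag2, data = Weekly[train, ], family = binomial)
prob_int <- predict(logistic_int, newdata = Weekly[test, ], type = "response")
pred_int <- ifelse(prob_int > 0.5, "Up", "Down")
mean(pred_int == Weekly[test, "Direction"])  # test accuracy for the interaction model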
median_mpg <- median(Auto$mpg)
# Creating the binary variable mpg01
Auto$mpg01 <- ifelse(Auto$mpg > median_mpg, 1, 0)
# Since mpg01 was added directly to Auto, the data frame already contains
# both mpg01 and the other Auto variables; no separate cbind() is needed.
Auto_with_mpg01 <- Auto
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
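Because mpg01 is defined relative to the median of mpg, the two classes should split roughly in half; a quick check (a sketch):
# mpg01 should be (nearly) balanced, since it is defined by a median split of mpg
table(Auto$mpg01)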
library(ggplot2)
# Scatterplots for horsepower vs mpg01
ggplot(Auto, aes(x = horsepower, y = mpg01)) +
geom_jitter(width = 0.1, height = 0.05) +
labs(title = "Scatterplot: Horsepower vs mpg01", x = "Horsepower", y = "mpg01") +
theme_minimal()
# Scatterplots for weight vs mpg01
ggplot(Auto, aes(x = weight, y = mpg01)) +
geom_jitter(width = 0.1, height = 0.05) +
labs(title = "Scatterplot: Weight vs mpg01", x = "Weight", y = "mpg01") +
theme_minimal()
# Scatterplots for displacement vs mpg01
ggplot(Auto, aes(x = displacement, y = mpg01)) +
geom_jitter(width = 0.1, height = 0.05) +
labs(title = "Scatterplot: Displacement vs mpg01", x = "Displacement", y = "mpg01") +
theme_minimal()
The three scatter plots above show how horsepower, weight, and displacement (engine size) relate to low versus high mileage. These predictors were chosen because they are continuous numerical variables; others such as cylinders, year, and acceleration are set aside for this part of the analysis. Examining these predictors can also help us understand their relationships with the remaining variables.
The first scatter plot shows that cars with lower horsepower tend to achieve higher mileage, and vice versa. Both classes are mixed for horsepower between roughly 75 and 100, but the denser clustering of points at mpg01 = 1 than at mpg01 = 0 suggests that horsepower in this range still tends to go with higher mileage.
The second scatter plot shows that lighter cars tend to achieve higher mileage, and vice versa, mirroring the horsepower relationship. The tight cluster of points at mpg01 = 1 corresponds to a narrow range of lighter cars, while the wider spread of weights from roughly 2300 up to 5000 corresponds to lower mileage.
The third scatter plot shows that cars with lower displacement tend to achieve higher mileage, but the pattern for high-displacement cars is harder to read because the points are unevenly distributed, and there may be outliers among the high-mileage cars. Hence, an appropriate model is needed to analyze the relationship between displacement and mpg01 more carefully.
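Boxplots of the same predictors split by mpg01 give another quick graphical view of these associations (a minimal sketch using base graphics):
# Boxplots of each predictor by mpg01 (0 = below-median mpg, 1 = above-median mpg)
par(mfrow = c(1, 3))
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower by mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "Weight by mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement by mpg01")
par(mfrow = c(1, 1))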
set.seed(42)
train_index <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto))
# Splitting the data into training and test sets:
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]
dim(train_data)
## [1] 274 10
dim(test_data)
## [1] 118 10
lda_model <- lda(mpg01 ~ cylinders + displacement + horsepower + weight + acceleration, data = train_data)
lda_pred <- predict(lda_model, newdata = test_data)
lda_pred_class <- lda_pred$class
# Confusion matrix:
conf_matrix <- table(Predicted = lda_pred_class, Actual = test_data$mpg01)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 43 5
## 1 4 66
test_error <- mean(lda_pred_class != test_data$mpg01)
cat("Test Error: ", test_error, "\n")
## Test Error: 0.07627119
From the above results, we obtain a test error of about 7.6%, which indicates that the LDA model predicts high versus low mileage accurately with the predictors listed above.
qda_model <- qda(mpg01 ~ cylinders + displacement + horsepower + weight + acceleration, data = train_data)
qda_pred <- predict(qda_model, newdata = test_data)
qda_pred_class <- qda_pred$class
# Confusion matrix:
conf_matrix_qda <- table(Predicted = qda_pred_class, Actual = test_data$mpg01)
print(conf_matrix_qda)
## Actual
## Predicted 0 1
## 0 43 6
## 1 4 65
test_error_qda <- mean(qda_pred_class != test_data$mpg01)
cat("Test Error: ", test_error_qda, "\n")
## Test Error: 0.08474576
From the above results, we obtain a test error of about 8.47%, which again indicates good predictive performance with these predictors. However, LDA is slightly better than QDA when comparing their respective test errors.
# Perform logistic regression on the training data
logistic_model <- glm(mpg01 ~ displacement + horsepower + weight,
data = train_data, family = binomial)
logistic_pred <- predict(logistic_model, newdata = test_data, type = "response")
logistic_pred_class <- ifelse(logistic_pred > 0.5, 1, 0)
# Confusion matrix:
conf_matrix_logistic <- table(Predicted = logistic_pred_class, Actual = test_data$mpg01)
print(conf_matrix_logistic)
## Actual
## Predicted 0 1
## 0 43 5
## 1 4 66
test_error_logistic <- mean(logistic_pred_class != test_data$mpg01)
cat("Test Error for Logistic Regression: ", test_error_logistic, "\n")
## Test Error for Logistic Regression: 0.07627119
The logistic regression model using displacement, horsepower, and weight yields a test error of about 7.6%, which is lower than QDA’s and the same as LDA’s.
naive_bayes_model <- naiveBayes(mpg01 ~ displacement + horsepower + weight,
data = train_data)
naive_bayes_pred <- predict(naive_bayes_model, newdata = test_data)
# Confusion matrix:
conf_matrix_nb <- table(Predicted = naive_bayes_pred, Actual = test_data$mpg01)
print(conf_matrix_nb)
## Actual
## Predicted 0 1
## 0 42 3
## 1 5 68
test_error_nb <- mean(naive_bayes_pred != test_data$mpg01)
cat("Test Error for Naive Bayes: ", test_error_nb, "\n")
## Test Error for Naive Bayes: 0.06779661
From the above results, naive Bayes yields an even lower test error of about 6.8%, the lowest among all the methods tried so far.
train_data_knn <- train_data[, c("displacement", "horsepower", "weight")]
test_data_knn <- test_data[, c("displacement", "horsepower", "weight")]
train_target <- train_data$mpg01
test_target <- test_data$mpg01
k_values <- c(1, 3, 5, 7, 9)
test_errors <- numeric(length(k_values))
for (i in 1:length(k_values)) {
k <- k_values[i]
knn_pred <- knn(train = train_data_knn, test = test_data_knn, cl = train_target, k = k)
test_errors[i] <- mean(knn_pred != test_target)
}
# Printing the test errors for different values of K:
for (i in 1:length(k_values)) {
cat("Test Error for K =", k_values[i], ": ", test_errors[i], "\n")
}
## Test Error for K = 1 : 0.1186441
## Test Error for K = 3 : 0.1101695
## Test Error for K = 5 : 0.1271186
## Test Error for K = 7 : 0.1271186
## Test Error for K = 9 : 0.1525424
# Determining the best K based on the minimum test error:
best_k <- k_values[which.min(test_errors)]
cat("Best value of K:", best_k, "\n")
## Best value of K: 3
From the above results, the KNN model performs best with k = 3, yielding a test error of about 11%. This test error is still higher than that of the other methods.
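Because displacement, horsepower, and weight are on very different scales, standardizing them before running KNN is worth trying; a sketch (standardizing with the training-set means and standard deviations, so results may differ from those above):
# Standardize the KNN predictors using the training-set centering and scaling (sketch)
train_scaled <- scale(train_data_knn)
test_scaled  <- scale(test_data_knn,
                      center = attr(train_scaled, "scaled:center"),
                      scale  = attr(train_scaled, "scaled:scale"))
set.seed(42)  # knn() breaks distance ties at random
knn_pred_scaled <- knn(train = train_scaled, test = test_scaled, cl = train_target, k = best_k)
mean(knn_pred_scaled != test_target)  # test error with standardized predictors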