library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
ggplot(Weekly, aes(x = Year, y = Volume)) + geom_line() + ggtitle("Trading Volume Over Time")
ggplot(Weekly, aes(x = Year, y = Lag1, color = Direction)) + geom_point() + ggtitle("Lag1 vs Year")
Trading volume shows a steady increase from 1990 to 2010, with major spikes appearing around 2007 to 2010.
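Because volume spans roughly two orders of magnitude over the period, a log scale makes it easier to judge whether the increase is steady; a minimal sketch:
# Re-plot volume on a log10 scale so multiplicative growth reads as roughly linear
ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_line() +
  scale_y_log10() +
  ggtitle("Trading Volume Over Time (log scale)")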
glm_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(glm_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only one predictor appears statistically significant: Lag2, with a p-value of 0.0296, below the 0.05 threshold. The remaining predictors all have p-values above 0.05, so they show no significant evidence of association with market direction.
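Exponentiating the Lag2 coefficient turns it into a multiplicative effect on the odds of an Up week (exp(0.0584) ≈ 1.06 per unit of Lag2). A short sketch pulling this from the fitted model, using the usual normal-approximation 95% interval (coefs is an illustrative name):
# Lag2 row of the coefficient table, plus an approximate 95% CI for its odds ratio
coefs <- summary(glm_model)$coefficients
coefs["Lag2", ]
exp(coefs["Lag2", "Estimate"] + c(-1.96, 1.96) * coefs["Lag2", "Std. Error"])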
predicted_probs <- predict(glm_model, type = "response")
predicted_class <- ifelse(predicted_probs > 0.5, "Up", "Down")
confusion_matrix <- table(Predicted = predicted_class, Actual = Weekly$Direction)
print(confusion_matrix)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
The confusion matrix shows that the model is heavily biased toward predicting Up (market increase) and rarely predicts Down (market decline). It classifies most Up weeks correctly (557 of 605) but misclassifies most Down weeks (430 of 484), so the overall accuracy of (54 + 557) / 1089 ≈ 56.1% is only slightly better than always guessing Up. This bias is the main weakness of the logistic regression fit.
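The class-specific rates can be read straight off the matrix; a quick sketch:
# Overall accuracy plus the fraction of Up and Down weeks classified correctly
overall_acc <- sum(diag(confusion_matrix)) / sum(confusion_matrix)              # (54 + 557) / 1089
up_rate <- confusion_matrix["Up", "Up"] / sum(confusion_matrix[, "Up"])         # 557 / 605
down_rate <- confusion_matrix["Down", "Down"] / sum(confusion_matrix[, "Down"]) # 54 / 484
round(c(accuracy = overall_acc, up = up_rate, down = down_rate), 3)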
train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008
glm_train <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm_probs <- predict(glm_train, newdata = Weekly[test, ], type = "response")
glm_pred <- ifelse(glm_probs > 0.5, "Up", "Down")
conf_matrix_glm <- table(Predicted = glm_pred, Actual = Weekly$Direction[test])
print(conf_matrix_glm)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
glm_accuracy <- sum(diag(conf_matrix_glm)) / sum(conf_matrix_glm)
print(paste("Logistic Regression Accuracy:", glm_accuracy))
## [1] "Logistic Regression Accuracy: 0.625"
lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_model, Weekly[test, ])
conf_matrix_lda <- table(Predicted = lda_pred$class, Actual = Weekly$Direction[test])
print(conf_matrix_lda)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
lda_accuracy <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(paste("LDA Accuracy:", lda_accuracy))
## [1] "LDA Accuracy: 0.625"
qda_model <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_model, Weekly[test, ])
conf_matrix_qda <- table(Predicted = qda_pred$class, Actual = Weekly$Direction[test])
print(conf_matrix_qda)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
qda_accuracy <- sum(diag(conf_matrix_qda)) / sum(conf_matrix_qda)
print(paste("QDA Accuracy:", qda_accuracy))
## [1] "QDA Accuracy: 0.586538461538462"
train_X <- Weekly[train, "Lag2", drop = FALSE]
test_X <- Weekly[test, "Lag2", drop = FALSE]
train_Y <- Weekly$Direction[train]
knn_pred <- knn(train = train_X, test = test_X, cl = train_Y, k = 1)
conf_matrix_knn <- table(Predicted = knn_pred, Actual = Weekly$Direction[test])
print(conf_matrix_knn)
## Actual
## Predicted Down Up
## Down 21 29
## Up 22 32
knn_accuracy <- sum(diag(conf_matrix_knn)) / sum(conf_matrix_knn)
print(paste("KNN (k=1) Accuracy:", knn_accuracy))
## [1] "KNN (k=1) Accuracy: 0.509615384615385"
nb_model <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb_pred <- predict(nb_model, Weekly[test, ])
conf_matrix_nb <- table(Predicted = nb_pred, Actual = Weekly$Direction[test])
print(conf_matrix_nb)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
nb_accuracy <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(paste("Naive Bayes Accuracy:", nb_accuracy))
## [1] "Naive Bayes Accuracy: 0.586538461538462"
accuracy_results <- data.frame(
Method = c("Logistic Regression", "LDA", "QDA", "KNN (k=1)", "Naive Bayes"),
Accuracy = c(glm_accuracy, lda_accuracy, qda_accuracy, knn_accuracy, nb_accuracy)
)
print(accuracy_results)
## Method Accuracy
## 1 Logistic Regression 0.6250000
## 2 LDA 0.6250000
## 3 QDA 0.5865385
## 4 KNN (k=1) 0.5096154
## 5 Naive Bayes 0.5865385
Based on the accuracy scores, logistic regression and LDA perform best, both reaching 62.5% accuracy on the held-out 2009–2010 data.
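In fact, logistic regression and LDA produce identical confusion matrices on this test set, which is unsurprising given that both fit linear decision boundaries in Lag2. A one-line check that their predictions coincide:
# Fraction of 2009-2010 test weeks on which the two methods agree (1 = identical)
mean(glm_pred == as.character(lda_pred$class))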
glm_exp <- glm(Direction ~ Lag2 + I(Lag2^2), data = Weekly, family = binomial, subset = train)
glm_exp_probs <- predict(glm_exp, newdata = Weekly[test, ], type = "response")
glm_exp_pred <- ifelse(glm_exp_probs > 0.5, "Up", "Down")
conf_matrix_glm_exp <- table(Predicted = glm_exp_pred, Actual = Weekly$Direction[test])
print(conf_matrix_glm_exp)
## Actual
## Predicted Down Up
## Down 8 4
## Up 35 57
glm_exp_accuracy <- sum(diag(conf_matrix_glm_exp)) / sum(conf_matrix_glm_exp)
print(paste("Improved Logistic Regression Accuracy:", glm_exp_accuracy))
## [1] "Improved Logistic Regression Accuracy: 0.625"
knn_pred_5 <- knn(train = train_X, test = test_X, cl = train_Y, k = 5)
conf_matrix_knn_5 <- table(Predicted = knn_pred_5, Actual = Weekly$Direction[test])
print(conf_matrix_knn_5)
## Actual
## Predicted Down Up
## Down 15 21
## Up 28 40
knn_5_accuracy <- sum(diag(conf_matrix_knn_5)) / sum(conf_matrix_knn_5)
print(paste("KNN (k=5) Accuracy:", knn_5_accuracy))
## [1] "KNN (k=5) Accuracy: 0.528846153846154"
The logistic regression model with the quadratic transformation of Lag2 gives the better results here: its accuracy (0.625) is higher than that of the KNN model with k = 5 (about 0.529).
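A further experiment worth trying is a sweep over k for KNN, mirroring what is done for the Auto data later; a sketch (the seed only controls knn's random tie-breaking, and k_grid/knn_acc are illustrative names):
set.seed(1)
k_grid <- c(1, 3, 5, 7, 9, 15)
knn_acc <- sapply(k_grid, function(k) {
  # Test-set accuracy on the 2009-2010 weeks for each candidate k
  mean(knn(train = train_X, test = test_X, cl = train_Y, k = k) ==
         Weekly$Direction[test])
})
data.frame(k = k_grid, accuracy = knn_acc)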
library(ISLR2)
data(Auto)
median_mpg <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > median_mpg, 1, 0)
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
pairs(Auto[, c("mpg", "horsepower", "weight", "acceleration", "displacement")],
col = Auto$mpg01 + 1, pch = 19,
main = "Scatterplot Matrix")
par(mfrow = c(2, 2))
# Boxplot of each candidate predictor split by the binary response mpg01
boxplot(horsepower ~ mpg01, data = Auto, main = "horsepower vs mpg01",
        col = "lightblue", ylab = "horsepower")
boxplot(weight ~ mpg01, data = Auto, main = "weight vs mpg01",
        col = "lightblue", ylab = "weight")
boxplot(acceleration ~ mpg01, data = Auto, main = "acceleration vs mpg01",
        col = "lightblue", ylab = "acceleration")
boxplot(displacement ~ mpg01, data = Auto, main = "displacement vs mpg01",
        col = "lightblue", ylab = "displacement")
Based on these plots, horsepower, weight, and displacement all look useful for predicting mpg01: each is markedly lower for the high-mileage group (mpg01 = 1), and the separation between the two groups is much stronger than in the acceleration plot.
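As a numeric complement to the plots, comparing group means of each candidate predictor gives a rough sense of the separation; a minimal sketch:
# Mean of each candidate predictor within the low-mpg (0) and high-mpg (1) groups
aggregate(cbind(horsepower, weight, displacement, acceleration) ~ mpg01,
          data = Auto, FUN = mean)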
set.seed(42)
train_index <- sample(1:nrow(Auto), nrow(Auto)*0.8)
train_data <- Auto[train_index, ]
test_data <- Auto[-train_index, ]
library(MASS)
lda_model <- lda(mpg01 ~ horsepower + weight + displacement, data = train_data)
lda_pred <- predict(lda_model, test_data)$class
conf_matrix_lda <- table(Predicted = lda_pred, Actual = test_data$mpg01)
test_error_lda <- mean(lda_pred != test_data$mpg01)
print(conf_matrix_lda)
## Actual
## Predicted 0 1
## 0 27 1
## 1 4 47
print(paste("LDA Test Error:", test_error_lda))
## [1] "LDA Test Error: 0.0632911392405063"
The LDA test error is about 6.3% (0.0633).
qda_model <- qda(mpg01 ~ horsepower + weight + displacement, data = train_data)
qda_pred <- predict(qda_model, test_data)$class
conf_matrix_qda <- table(Predicted = qda_pred, Actual = test_data$mpg01)
test_error_qda <- mean(qda_pred != test_data$mpg01)
print(conf_matrix_qda)
## Actual
## Predicted 0 1
## 0 28 3
## 1 3 45
print(paste("QDA Test Error:", test_error_qda))
## [1] "QDA Test Error: 0.0759493670886076"
The QDA test error is about 7.6% (0.0759), slightly worse than LDA.
logreg_model <- glm(mpg01 ~ horsepower + weight + displacement,
data = train_data, family = binomial)
logreg_prob <- predict(logreg_model, newdata = test_data, type = "response")
logreg_pred <- ifelse(logreg_prob > 0.5, 1, 0)
conf_matrix_logreg <- table(Predicted = logreg_pred, Actual = test_data$mpg01)
test_error_logreg <- mean(logreg_pred != test_data$mpg01)
print(conf_matrix_logreg)
## Actual
## Predicted 0 1
## 0 29 3
## 1 2 45
print(paste("Logistic Regression Test Error:", test_error_logreg))
## [1] "Logistic Regression Test Error: 0.0632911392405063"
The logistic regression test error is about 6.3% (0.0633), matching LDA.
library(e1071)
nb_model <- naiveBayes(mpg01 ~ horsepower + weight + displacement, data = train_data)
nb_pred <- predict(nb_model, test_data)
conf_matrix_nb <- table(Predicted = nb_pred, Actual = test_data$mpg01)
test_error_nb <- mean(nb_pred != test_data$mpg01)
print(conf_matrix_nb)
## Actual
## Predicted 0 1
## 0 28 3
## 1 3 45
print(paste("Naive Bayes Test Error:", test_error_nb))
## [1] "Naive Bayes Test Error: 0.0759493670886076"
The naive Bayes test error is about 7.6% (0.0759), matching QDA.
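Collecting the four test errors into one table, in the same style as the Weekly comparison earlier, makes the ranking easy to see:
# Side-by-side test errors for the four classifiers on this Auto split
data.frame(
  Method = c("LDA", "QDA", "Logistic Regression", "Naive Bayes"),
  Test_Error = c(test_error_lda, test_error_qda, test_error_logreg, test_error_nb)
)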
library(class)
train_X <- train_data[, c("horsepower", "weight", "acceleration")]
test_X <- test_data[, c("horsepower", "weight", "acceleration")]
train_Y <- train_data$mpg01
test_Y <- test_data$mpg01
k_values <- c(1, 3, 5, 7, 9)
knn_errors <- sapply(k_values, function(k) {
knn_pred <- knn(train_X, test_X, train_Y, k = k)
mean(knn_pred != test_Y)
})
print(data.frame(K = k_values, Test_Error = knn_errors))
## K Test_Error
## 1 1 0.11392405
## 2 3 0.08860759
## 3 5 0.11392405
## 4 7 0.13924051
## 5 9 0.15189873
plot(k_values, knn_errors, type = "b", col = "blue",
xlab = "k", ylab = "Test Error", main = "KNN Test Errors for different k")
Looking at the test errors across the different values of k, k = 3 performs best on this data set, with the lowest test error (about 8.9%).
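One caveat about the KNN fits above: the three features sit on very different scales (weight is in the thousands, while acceleration stays below about 25), so unscaled Euclidean distances are dominated almost entirely by weight. A sketch of the same sweep with standardized features, scaling the test set by the training set's center and spread (the _sc names are illustrative):
# Standardize using training-set statistics only, then repeat the k sweep
train_X_sc <- scale(train_X)
test_X_sc <- scale(test_X,
                   center = attr(train_X_sc, "scaled:center"),
                   scale  = attr(train_X_sc, "scaled:scale"))
knn_errors_sc <- sapply(k_values, function(k) {
  mean(knn(train_X_sc, test_X_sc, train_Y, k = k) != test_Y)
})
data.frame(K = k_values, Test_Error_Scaled = knn_errors_sc)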
library(MASS)
library(class)
library(e1071)
library(ggplot2)
data("Boston")
crime_rate_median <- median(Boston$crim)
Boston$crime_rate_above_median <- ifelse(Boston$crim > crime_rate_median, 1, 0)
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv crime_rate_above_median
## 1 24.0 0
## 2 21.6 0
## 3 34.7 0
## 4 33.4 0
## 5 36.2 0
## 6 28.7 0
set.seed(42)
train_index <- sample(1:nrow(Boston), 0.7 * nrow(Boston))
train_data <- Boston[train_index, ]
test_data <- Boston[-train_index, ]
dim(train_data)
## [1] 354 15
dim(test_data)
## [1] 152 15
log_reg_model <- glm(crime_rate_above_median ~ ., data = train_data, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
log_reg_probs <- predict(log_reg_model, newdata = test_data, type = "response")
log_reg_pred <- ifelse(log_reg_probs > 0.5, 1, 0)
conf_matrix_log_reg <- table(Predicted = log_reg_pred, Actual = test_data$crime_rate_above_median)
log_reg_accuracy <- sum(diag(conf_matrix_log_reg)) / sum(conf_matrix_log_reg)
print(conf_matrix_log_reg)
## Actual
## Predicted 0 1
## 0 77 0
## 1 1 74
print(paste("Logistic Regression Accuracy:", log_reg_accuracy))
## [1] "Logistic Regression Accuracy: 0.993421052631579"
lda_model <- lda(crime_rate_above_median ~ ., data = train_data)
lda_pred <- predict(lda_model, newdata = test_data)$class
conf_matrix_lda <- table(Predicted = lda_pred, Actual = test_data$crime_rate_above_median)
lda_accuracy <- sum(diag(conf_matrix_lda)) / sum(conf_matrix_lda)
print(conf_matrix_lda)
## Actual
## Predicted 0 1
## 0 76 19
## 1 2 55
print(paste("LDA Accuracy:", lda_accuracy))
## [1] "LDA Accuracy: 0.861842105263158"
nb_model <- naiveBayes(crime_rate_above_median ~ ., data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)
conf_matrix_nb <- table(Predicted = nb_pred, Actual = test_data$crime_rate_above_median)
nb_accuracy <- sum(diag(conf_matrix_nb)) / sum(conf_matrix_nb)
print(conf_matrix_nb)
## Actual
## Predicted 0 1
## 0 77 4
## 1 1 70
print(paste("Naive Bayes Accuracy:", nb_accuracy))
## [1] "Naive Bayes Accuracy: 0.967105263157895"
train_X <- scale(train_data[, -c(ncol(train_data))])
test_X <- scale(test_data[, -c(ncol(test_data))])
knn_pred_3 <- knn(train = train_X, test = test_X, cl = train_data$crime_rate_above_median, k = 3)
conf_matrix_knn_3 <- table(Predicted = knn_pred_3, Actual = test_data$crime_rate_above_median)
knn_accuracy_3 <- sum(diag(conf_matrix_knn_3)) / sum(conf_matrix_knn_3)
print(conf_matrix_knn_3)
## Actual
## Predicted 0 1
## 0 75 8
## 1 3 66
print(paste("KNN (k=3) Accuracy:", knn_accuracy_3))
## [1] "KNN (k=3) Accuracy: 0.927631578947368"
accuracy_results <- data.frame(
Method = c("Logistic Regression", "LDA", "Naive Bayes", "KNN (k=3)"),
Accuracy = c(log_reg_accuracy, lda_accuracy, nb_accuracy, knn_accuracy_3)
)
print(accuracy_results)
## Method Accuracy
## 1 Logistic Regression 0.9934211
## 2 LDA 0.8618421
## 3 Naive Bayes 0.9671053
## 4 KNN (k=3) 0.9276316
Comparing the four models, logistic regression performs best, with the highest accuracy (99.34%) and only a single misclassification on the test set. Its near-perfect score should be read with the earlier caveat in mind: crim, the variable that defines the response, is included among the predictors, which is why the fit separates the classes almost perfectly. Naive Bayes comes second at 96.71%, still performing well but with a few more errors. KNN with k = 3 is third at 92.76%, and LDA performs worst at 86.18%, with noticeably more misclassifications than the other methods, most of them high-crime suburbs predicted as low-crime.