knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
13.
a.The S&P 500 returns seem to be more positive than negative from 1990 - 2010. There is also a significant positive correlation between Volume and Year suggesting that as the years went by the number of shares traded increased.
library(ISLR2)
library(GGally)
library(MASS)
library(class)
library(e1071)
data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
numeric_vars <- Weekly[sapply(Weekly, is.numeric)]
ggpairs(numeric_vars)
b. Lag2 is just slightly significant.
model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = "binomial")
summary(model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
c. The confusion matrix shows that the logistic regression model is much better at predicting when the stock market will go up than when it will go down. The model gets the direction right about 56% of the time. So, while the model does okay at spotting up weeks, it’s not very reliable for catching down weeks.
probs <- predict(model, type = "response")
actual_class <- ifelse(Weekly$Direction == "Up", 1, 0)
predicted_class <- ifelse(probs > 0.5, 1, 0)
conf_matrix <- table(Predicted = predicted_class, Actual = actual_class)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 54 48
## 1 430 557
accuracy <- mean(predicted_class == actual_class)
print(paste("Overall Accuracy:", round(accuracy, 4)))
## [1] "Overall Accuracy: 0.5611"
d.
train_data <- subset(Weekly, Year <= 2008)
test_data <- subset(Weekly, Year > 2008)
model1 <- glm(Direction ~ Lag2, data = train_data, family = "binomial")
probs1 <- predict(model1, newdata = test_data, type = "response")
predicted_class1 <- ifelse(probs1 > 0.5, 1, 0)
actual_class1 <- ifelse(test_data$Direction == "Up", 1, 0)
conf_matrix1 <- table(Predicted = predicted_class1, Actual = actual_class1)
print(conf_matrix1)
## Actual
## Predicted 0 1
## 0 9 5
## 1 34 56
accuracy1 <- mean(predicted_class1 == actual_class1)
print(paste("Overall Accuracy:", round(accuracy1, 4)))
## [1] "Overall Accuracy: 0.625"
e.
lda_model <- lda(Direction ~ Lag2, data = train_data)
lda_pred <- predict(lda_model, newdata = test_data)
predicted_classl <- lda_pred$class
actual_classl <- test_data$Direction
conf_matrixl <- table(Predicted = predicted_classl, Actual = actual_classl)
print(conf_matrixl)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracyl <- mean(predicted_classl == actual_classl)
print(paste("Overall Accuracy:", round(accuracyl, 4)))
## [1] "Overall Accuracy: 0.625"
f.
qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_pred <- predict(qda_model, newdata = test_data)
predicted_classq <- qda_pred$class
actual_classq <- test_data$Direction
conf_matrixq <- table(Predicted = predicted_classq, Actual = actual_classq)
print(conf_matrixq)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
accuracyq <- mean(predicted_classq == actual_classq)
print(paste("Overall Accuracy:", round(accuracyq, 4)))
## [1] "Overall Accuracy: 0.5865"
g.
train_X <- data.frame(Lag2 = train_data$Lag2)
test_X <- data.frame(Lag2 = test_data$Lag2)
train_y <- train_data$Direction
test_y <- test_data$Direction
set.seed(1)
knn_pred <- knn(train = train_X, test = test_X, cl = train_y, k = 1)
conf_matrix_knn <- table(Predicted = knn_pred, Actual = test_y)
print(conf_matrix_knn)
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
accuracy_knn <- mean(knn_pred == test_y)
print(paste("Overall Accuracy:", round(accuracy_knn, 4)))
## [1] "Overall Accuracy: 0.5"
h.
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)
conf_matrix_nb <- table(Predicted = nb_pred, Actual = test_data$Direction)
print(conf_matrix_nb)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
accuracy_nb <- mean(nb_pred == test_data$Direction)
print(paste("Overall Accuracy:", round(accuracy_nb, 4)))
## [1] "Overall Accuracy: 0.5865"
i. The lda model and the train and test logistic regression tied for the most accurate models at 62.5%
j.
log_model <- glm(Direction ~ Lag2 + Lag1 + Lag1:Lag2, data = train_data, family = "binomial")
log_pred <- predict(log_model, newdata = test_data, type = "response")
log_class <- ifelse(log_pred > 0.5, "Up", "Down")
conf_log <- table(Predicted = log_class, Actual = test_y)
acc_log <- mean(log_class == test_y)
print(paste("Overall Accuracy:", round(acc_log, 4)))
## [1] "Overall Accuracy: 0.5769"
lda_model1 <- lda(Direction ~ Lag2 + Lag5, data = train_data)
lda_pred1 <- predict(lda_model1, newdata = test_data)$class
conf_lda <- table(Predicted = lda_pred1, Actual = test_y)
acc_lda <- mean(lda_pred1 == test_y)
print(paste("Overall Accuracy:", round(acc_lda, 4)))
## [1] "Overall Accuracy: 0.5962"
train_data$Lag2_sq <- train_data$Lag2^2
test_data$Lag2_sq <- test_data$Lag2^2
qda_model1 <- qda(Direction ~ Lag2 + Lag2_sq, data = train_data)
qda_pred1 <- predict(qda_model1, newdata = test_data)$class
conf_qda <- table(Predicted = qda_pred1, Actual = test_y)
acc_qda <- mean(qda_pred1 == test_y)
print(paste("Overall Accuracy:", round(acc_qda, 4)))
## [1] "Overall Accuracy: 0.625"
train_X1 <- data.frame(Lag2 = train_data$Lag2, Lag5 = train_data$Lag5)
test_X1 <- data.frame(Lag2 = test_data$Lag2, Lag5 = test_data$Lag5)
acc_knn <- c()
conf_knn_list <- list()
best_k <- NULL
best_acc <- 0
for (k in c(1, 3, 5, 10)) {
set.seed(2)
knn_pred1 <- knn(train_X1, test_X1, train_y, k = k)
acc <- mean(knn_pred1 == test_y)
acc_knn <- c(acc_knn, acc)
conf_knn_list[[as.character(k)]] <- table(Predicted = knn_pred1, Actual = test_y)
if (acc > best_acc) {
best_acc <- acc
best_k <- k
}
}
print(paste("Overall Accuracy:", round(best_acc, 4)))
## [1] "Overall Accuracy: 0.5577"
print(paste("Best K:", best_k))
## [1] "Best K: 10"
nb_model1 <- naiveBayes(Direction ~ Lag2 + Lag1, data = train_data)
nb_pred1 <- predict(nb_model1, newdata = test_data)
conf_nb1 <- table(Predicted = nb_pred1, Actual = test_y)
acc_nb <- mean(nb_pred1 == test_y)
print(paste("Overall Accuracy:", round(acc_nb, 4)))
## [1] "Overall Accuracy: 0.5385"
14.
data(Auto)
a.
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
head(Auto[c("mpg", "mpg01")])
## mpg mpg01
## 1 18 0
## 2 15 0
## 3 18 0
## 4 16 0
## 5 17 0
## 6 15 0
b. Weight, horsepower, displacement, and cylinders all have a strong negative correlation with mpg01. As they go up, mpg01 goes down.
numeric_varsA <- Auto[sapply(Auto, is.numeric)]
ggpairs(numeric_varsA)
par(mfrow = c(2, 2))
boxplot(horsepower ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Horsepower")
boxplot(weight ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Weight")
boxplot(acceleration ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Acceleration")
boxplot(displacement ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Displacement")
par(mfrow = c(2, 2))
boxplot(mpg ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Mpg")
boxplot(cylinders ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Cylinders")
boxplot(year ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Year")
boxplot(origin ~ mpg01, data = Auto, xlab = "mpg01", ylab = "Origin")
c.
set.seed(1)
train_index <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto))
train_data1 <- Auto[train_index, ]
test_data1 <- Auto[-train_index, ]
nrow(train_data1)
## [1] 274
nrow(test_data1)
## [1] 118
d.
lda_automodel <- lda(mpg01 ~ weight + horsepower + displacement + cylinders, data = train_data1)
lda_autopred <- predict(lda_automodel, newdata = test_data1)
pred_autoclass <- lda_autopred$class
conf_automatrix <- table(Predicted = pred_autoclass, Actual = test_data1$mpg01)
print(conf_automatrix)
## Actual
## Predicted 0 1
## 0 50 3
## 1 11 54
test_error <- mean(pred_autoclass != test_data1$mpg01)
print(paste("Test Error:", round(test_error, 4)))
## [1] "Test Error: 0.1186"
e.
qda_automodel <- qda(mpg01 ~ weight + horsepower + displacement + cylinders, data = train_data1)
qda_autopred <- predict(qda_automodel, newdata = test_data1)
pred_autoclassq <- qda_autopred$class
conf_automatrixq <- table(Predicted = pred_autoclassq, Actual = test_data1$mpg01)
print(conf_automatrixq)
## Actual
## Predicted 0 1
## 0 52 5
## 1 9 52
test_autoerror <- mean(pred_autoclassq != test_data1$mpg01)
print(paste("Test Error:", round(test_autoerror, 4)))
## [1] "Test Error: 0.1186"
f.
log_automodel <- glm(mpg01 ~ weight + horsepower + displacement + cylinders, data = train_data1, family = "binomial")
log_autopred <- predict(log_automodel, newdata = test_data1, type = "response")
log_predautoclass <- ifelse(log_autopred > 0.5, 1, 0)
conf_autolog <- table(Predicted = log_predautoclass, Actual = test_data1$mpg01)
test_logerror <- mean(log_predautoclass != test_data1$mpg01)
print(paste("Test Error:", round(test_logerror, 4)))
## [1] "Test Error: 0.0932"
g.
nb_automodel <- naiveBayes(mpg01 ~ weight + horsepower + displacement + cylinders, data = train_data1)
nb_autopred <- predict(nb_automodel, newdata = test_data1)
conf_autonb <- table(Predicted = nb_autopred, Actual = test_data1$mpg01)
print(conf_autonb)
## Actual
## Predicted 0 1
## 0 52 4
## 1 9 53
test_nberror <- mean(nb_autopred != test_data1$mpg01)
print(paste("Test Error:", round(test_nberror, 4)))
## [1] "Test Error: 0.1102"
h.
predictors <- c("cylinders", "weight", "horsepower", "displacement")
train_XA <- scale(train_data1[, predictors])
test_XA <- scale(test_data1[, predictors],
center = attr(train_XA, "scaled:center"),
scale = attr(train_XA, "scaled:scale"))
train_yA <- train_data1$mpg01
test_yA <- test_data1$mpg01
k_values <- 1:20
error_rates <- numeric(length(k_values))
set.seed(1)
for (i in seq_along(k_values)) {
kauto <- k_values[i]
pred <- knn(train = train_XA, test = test_XA, cl = train_yA, k = kauto)
error_rates[i] <- mean(pred != test_yA)
}
best_kauto <- k_values[which.min(error_rates)]
best_error <- min(error_rates)
cat("Best K:", best_k, "\n")
## Best K: 10
cat("Lowest Test Error Rate:", round(best_error, 4), "\n")
## Lowest Test Error Rate: 0.1102
16.
I tested four different methods to predict whether a neighborhood in Boston has a high crime rate or not. Logistic regression has an accuracy of about 82%. Linear Discriminant Analysis (LDA) did slightly better with 84% accuracy. Naive Bayes and K-Nearest Neighbors (KNN) tied for the best performance, each correctly predicting about 86% of the time. This shows that Naive Bayes and KNN were the most effective at spotting high-crime areas in this case.
data(Boston)
Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
set.seed(1)
train_indexB <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train_dataB <- Boston[train_indexB, ]
test_dataB <- Boston[-train_indexB, ]
log_modelB <- glm(crim01 ~ lstat + rm + dis + nox + age, data = train_dataB, family = "binomial")
log_probs <- predict(log_modelB, newdata = test_dataB, type = "response")
log_predB <- ifelse(log_probs > 0.5, 1, 0)
log_error <- mean(log_predB != test_dataB$crim01)
lda_modelB <- lda(crim01 ~ lstat + rm + dis + nox + age, data = train_dataB)
lda_predB <- predict(lda_modelB, newdata = test_dataB)$class
lda_error <- mean(lda_predB != test_dataB$crim01)
nb_modelB <- naiveBayes(crim01 ~ lstat + rm + dis + nox + age, data = train_dataB)
nb_predB <- predict(nb_modelB, newdata = test_dataB)
nb_error <- mean(nb_predB != test_dataB$crim01)
predictorsB <- c("lstat", "rm", "dis", "nox", "age")
train_XB <- scale(train_dataB[, predictorsB])
test_XB <- scale(test_dataB[, predictorsB],
center = attr(train_XB, "scaled:center"),
scale = attr(train_XB, "scaled:scale"))
train_yB <- train_dataB$crim01
test_yB <- test_dataB$crim01
set.seed(1)
knn_predB <- knn(train_XB, test_XB, cl = train_yB, k = 5)
knn_error <- mean(knn_predB != test_yB)
log_accB <- 1 - log_error
lda_accB <- 1 - lda_error
nb_accB <- 1 - nb_error
knn_accB <- 1 - knn_error
cat("Test Error and Accuracy Rates:\n")
## Test Error and Accuracy Rates:
cat("----------------------------------\n")
## ----------------------------------
cat("Logistic Regression:\n Error =", round(log_error, 4),
"| Accuracy =", round(log_accB, 4), "\n")
## Logistic Regression:
## Error = 0.1776 | Accuracy = 0.8224
cat("LDA:\n Error =", round(lda_error, 4),
"| Accuracy =", round(lda_accB, 4), "\n")
## LDA:
## Error = 0.1579 | Accuracy = 0.8421
cat("Naive Bayes:\n Error =", round(nb_error, 4),
"| Accuracy =", round(nb_accB, 4), "\n")
## Naive Bayes:
## Error = 0.1447 | Accuracy = 0.8553
cat("KNN (k = 5):\n Error =", round(knn_error, 4),
"| Accuracy =", round(knn_accB, 4), "\n")
## KNN (k = 5):
## Error = 0.1447 | Accuracy = 0.8553