library(ISLR2)
data("Weekly")
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
cor(Weekly[, -c(1, 9)])
## Lag1 Lag2 Lag3 Lag4 Lag5
## Lag1 1.000000000 -0.07485305 0.05863568 -0.071273876 -0.008183096
## Lag2 -0.074853051 1.00000000 -0.07572091 0.058381535 -0.072499482
## Lag3 0.058635682 -0.07572091 1.00000000 -0.075395865 0.060657175
## Lag4 -0.071273876 0.05838153 -0.07539587 1.000000000 -0.075675027
## Lag5 -0.008183096 -0.07249948 0.06065717 -0.075675027 1.000000000
## Volume -0.064951313 -0.08551314 -0.06928771 -0.061074617 -0.058517414
## Today -0.075031842 0.05916672 -0.07124364 -0.007825873 0.011012698
## Volume Today
## Lag1 -0.06495131 -0.075031842
## Lag2 -0.08551314 0.059166717
## Lag3 -0.06928771 -0.071243639
## Lag4 -0.06107462 -0.007825873
## Lag5 -0.05851741 0.011012698
## Volume 1.00000000 -0.033077783
## Today -0.03307778 1.000000000
hist(Weekly$Today, breaks = 20, col = "blue", main = "Histogram of Weekly Returns", xlab = "Return")
hist(Weekly$Volume, breaks = 20, col = "red", main = "Histogram of Weekly Returns", xlab = "Volume")
Histogram for Volume shows a right-skewed distribution, meaning that the trading volume is low but occassionally has large values.
Volume has low correlation with Today so it may not be a strong predictor of market movement.
The of Up(605) vs Down(484) suggests some bias.
Weekly$Direction <- as.factor(Weekly$Direction)
model1 <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(model1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
prob_pred <- predict(model1, type = "response")
predictions <- ifelse(prob_pred > 0.5, "Up", "Down")
predictions <- factor(predictions, levels = c("Down", "Up"))
conf_matrix <- table(Predicted = predictions, Actual = Weekly$Direction)
print(conf_matrix)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
accuracy1 <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Overall correct predictions:", accuracy1)
## Overall correct predictions: 0.5610652
train_data <- subset(Weekly, Year <= 2008)
test_data <- subset(Weekly, Year > 2008)
model2 <- glm(Direction ~ Lag2, data = train_data, family = binomial)
summary(model2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
prob_pred2 <- predict(model2, newdata = test_data, type = "response")
predictions2 <- ifelse(prob_pred2 > 0.5, "Up", "Down")
predictions2 <- factor(predictions2, levels = c("Down", "Up"))
conf_matrix_test <- table(Predicted = predictions2, Actual = test_data$Direction)
print(conf_matrix_test)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
accuracy2 <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
cat("Overall correct predictions:", accuracy2)
## Overall correct predictions: 0.625
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
train_X <- train_data$Lag2
train_Y <- train_data$Direction
test_X <- test_data$Lag2
test_Y <- test_data$Direction
lda_model <- lda(Direction ~ Lag2, data = train_data)
lda_pred <- predict(lda_model, newdata = test_data)
lda_class <- lda_pred$class
lda_conf_matrix <- table(Predicted = lda_class, Actual = test_Y)
print(lda_conf_matrix)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
lda_accuracy <- sum(diag(lda_conf_matrix)) / sum(lda_conf_matrix)
cat("LDA Accuracy:", lda_accuracy, "\n")
## LDA Accuracy: 0.625
qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_pred <- predict(qda_model, newdata = test_data)
qda_class <- qda_pred$class
qda_conf_matrix <- table(Predicted = qda_class, Actual = test_Y)
print(qda_conf_matrix)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
qda_accuracy <- sum(diag(qda_conf_matrix)) / sum(qda_conf_matrix)
cat("QDA Accuracy:", qda_accuracy, "\n")
## QDA Accuracy: 0.5865385
knn_pred <- knn(train = matrix(train_X), test = matrix(test_X),
cl = train_Y, k = 1)
knn_conf_matrix <- table(Predicted = knn_pred, Actual = test_Y)
print(knn_conf_matrix)
## Actual
## Predicted Down Up
## Down 21 29
## Up 22 32
knn_accuracy <- sum(diag(knn_conf_matrix)) / sum(knn_conf_matrix)
cat("KNN with K=1 Accuracy:", knn_accuracy, "\n")
## KNN with K=1 Accuracy: 0.5096154
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)
nb_conf_matrix <- table(Predicted = nb_pred, Actual = test_Y)
print(nb_conf_matrix)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
nb_accuracy <- sum(diag(nb_conf_matrix)) / sum(nb_conf_matrix)
cat("Naïve Bayes Accuracy:", nb_accuracy, "\n")
## Naïve Bayes Accuracy: 0.5865385
transformed and interaction variables:
train_data$Lag2_sq <- train_data$Lag2^2
train_data$Lag1_Lag2 <- train_data$Lag1 * train_data$Lag2
train_data$Log_Volume <- log(train_data$Volume + 1)
test_data$Lag2_sq <- test_data$Lag2^2
test_data$Lag1_Lag2 <- test_data$Lag1 * test_data$Lag2
test_data$Log_Volume <- log(test_data$Volume + 1)
log_model <- glm(Direction ~ Lag2 + Lag2_sq + Lag1_Lag2 + Log_Volume, data = train_data, family = binomial)
logit_pred_prob <- predict(log_model, newdata = test_data, type = "response")
logit_pred <- ifelse(logit_pred_prob > 0.5, "Up", "Down")
logit_pred <- factor(logit_pred, levels = c("Down", "Up"))
logit_conf_matrix <- table(Predicted = logit_pred, Actual = test_data$Direction)
logit_accuracy <- sum(diag(logit_conf_matrix)) / sum(logit_conf_matrix)
print(logit_conf_matrix)
## Actual
## Predicted Down Up
## Down 22 31
## Up 21 30
cat("Logistic Regression Accuracy:", logit_accuracy, "\n")
## Logistic Regression Accuracy: 0.5
lda_model2 <- lda(Direction ~ Lag2 + Lag2_sq + Lag1_Lag2 + Log_Volume, data = train_data)
lda_pred2 <- predict(lda_model2, newdata = test_data)$class
lda_conf_matrix2 <- table(Predicted = lda_pred2, Actual = test_data$Direction)
lda_accuracy2 <- sum(diag(lda_conf_matrix2)) / sum(lda_conf_matrix2)
print(lda_conf_matrix2)
## Actual
## Predicted Down Up
## Down 22 31
## Up 21 30
cat("LDA Accuracy:", lda_accuracy2, "\n")
## LDA Accuracy: 0.5
qda_model2 <- qda(Direction ~ Lag2 + Lag2_sq + Lag1_Lag2 + Log_Volume, data = train_data)
qda_pred2 <- predict(qda_model2, newdata = test_data)$class
qda_conf_matrix2 <- table(Predicted = qda_pred2, Actual = test_data$Direction)
qda_accuracy2 <- sum(diag(qda_conf_matrix2)) / sum(qda_conf_matrix2)
print(qda_conf_matrix2)
## Actual
## Predicted Down Up
## Down 25 38
## Up 18 23
cat("QDA Accuracy:", qda_accuracy2, "\n")
## QDA Accuracy: 0.4615385
# scale predictors
train_knn <- scale(train_data[, c("Lag2", "Lag2_sq", "Lag1_Lag2", "Log_Volume")])
test_knn <- scale(test_data[, c("Lag2", "Lag2_sq", "Lag1_Lag2", "Log_Volume")],
center = attr(train_knn, "scaled:center"),
scale = attr(train_knn, "scaled:scale"))
train_Y <- train_data$Direction
test_Y <- test_data$Direction
k_values <- c(1, 3, 5, 7, 10)
knn_accuracies <- c()
for (k in k_values) {
knn_pred <- knn(train_knn, test_knn, train_Y, k = k)
knn_conf_matrix <- table(Predicted = knn_pred, Actual = test_Y)
knn_acc <- sum(diag(knn_conf_matrix)) / sum(knn_conf_matrix)
knn_accuracies <- c(knn_accuracies, knn_acc)
cat("K =", k, "Accuracy:", knn_acc, "\n")
}
## K = 1 Accuracy: 0.5384615
## K = 3 Accuracy: 0.5
## K = 5 Accuracy: 0.5673077
## K = 7 Accuracy: 0.5192308
## K = 10 Accuracy: 0.5096154
best_k <- k_values[which.max(knn_accuracies)]
cat("Best K:", best_k, "with accuracy:", max(knn_accuracies), "\n")
## Best K: 5 with accuracy: 0.5673077
nb_model <- naiveBayes(Direction ~ Lag2 + Lag2_sq + Lag1_Lag2 + Log_Volume, data = train_data)
nb_pred <- predict(nb_model, newdata = test_data)
nb_conf_matrix <- table(Predicted = nb_pred, Actual = test_data$Direction)
nb_accuracy <- sum(diag(nb_conf_matrix)) / sum(nb_conf_matrix)
print(nb_conf_matrix)
## Actual
## Predicted Down Up
## Down 25 37
## Up 18 24
cat("Naïve Bayes Accuracy:", nb_accuracy, "\n")
## Naïve Bayes Accuracy: 0.4711538
Experiment with different combinations of predictors, includ- ing possible transformations and interactions, for each of the methods. Report the variables, method, and associated confu- sion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
14:
data("Auto")
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
Auto$mpg01 <- as.factor(Auto$mpg01)
table(Auto$mpg01)
##
## 0 1
## 196 196
summary(Auto$mpg01)
## 0 1
## 196 196
Auto_new <- data.frame(mpg01 = Auto$mpg01, Auto)
head(Auto_new)
## mpg01 mpg cylinders displacement horsepower weight acceleration year origin
## 1 0 18 8 307 130 3504 12.0 70 1
## 2 0 15 8 350 165 3693 11.5 70 1
## 3 0 18 8 318 150 3436 11.0 70 1
## 4 0 16 8 304 150 3433 12.0 70 1
## 5 0 17 8 302 140 3449 10.5 70 1
## 6 0 15 8 429 198 4341 10.0 70 1
## name mpg01.1
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
library(ggplot2)
ggplot(Auto, aes(x = mpg01, y = cylinders, fill = mpg01)) +
geom_boxplot() +
ggtitle("Cylinders vs mpg01") +
xlab("mpg01 (0 = Low, 1 = High)") +
ylab("Cylinders")
ggplot(Auto, aes(x = mpg01, y = displacement, fill = mpg01)) +
geom_boxplot() +
ggtitle("Displacement vs mpg01") +
xlab("mpg01 (0 = Low, 1 = High)") +
ylab("Displacement")
ggplot(Auto, aes(x = mpg01, y = horsepower, fill = mpg01)) +
geom_boxplot() +
ggtitle("Horsepower vs mpg01") +
xlab("mpg01 (0 = Low, 1 = High)") +
ylab("Horsepower")
ggplot(Auto, aes(x = mpg01, y = weight, fill = mpg01)) +
geom_boxplot() +
ggtitle("Weight vs mpg01") +
xlab("mpg01 (0 = Low, 1 = High)") +
ylab("Weight")
ggplot(Auto, aes(x = mpg01, y = acceleration, fill = mpg01)) +
geom_boxplot() +
ggtitle("Acceleration vs mpg01") +
xlab("mpg01 (0 = Low, 1 = High)") +
ylab("Acceleration")
ggplot(Auto, aes(x = mpg01, y = year, fill = mpg01)) +
geom_boxplot() +
ggtitle("Year vs mpg01") +
xlab("mpg01 (0 = Low, 1 = High)") +
ylab("Year")
ggplot(Auto, aes(x = displacement, y = mpg, color = mpg01)) +
geom_point(alpha = 0.7) +
ggtitle("Displacement vs MPG") +
xlab("Displacement") +
ylab("MPG")
ggplot(Auto, aes(x = horsepower, y = mpg, color = mpg01)) +
geom_point(alpha = 0.7) +
ggtitle("Horsepower vs MPG") +
xlab("Horsepower") +
ylab("MPG")
ggplot(Auto, aes(x = weight, y = mpg, color = mpg01)) +
geom_point(alpha = 0.7) +
ggtitle("Weight vs MPG") +
xlab("Weight") +
ylab("MPG")
cor_matrix <- cor(Auto[, sapply(Auto, is.numeric)])
print(cor_matrix["mpg",])
## mpg cylinders displacement horsepower weight acceleration
## 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442 0.4233285
## year origin
## 0.5805410 0.5652088
Higher displacments leads to lower mpg
higiher horsepower tend ot be in lower mpg category
higher weights have lower mpg
set.seed(123)
train_indices <- sample(1:nrow(Auto), size = 0.7 * nrow(Auto))
train_data2 <- Auto[train_indices, ]
test_data2 <- Auto[-train_indices, ]
lda_model3 <- lda(mpg01 ~ weight + displacement + horsepower + cylinders + year, data = train_data2)
print(lda_model3)
## Call:
## lda(mpg01 ~ weight + displacement + horsepower + cylinders +
## year, data = train_data2)
##
## Prior probabilities of groups:
## 0 1
## 0.4963504 0.5036496
##
## Group means:
## weight displacement horsepower cylinders year
## 0 3641.022 275.2941 130.96324 6.786765 74.39706
## 1 2314.000 114.5290 78.00725 4.188406 77.84058
##
## Coefficients of linear discriminants:
## LD1
## weight -0.001196208
## displacement -0.001452526
## horsepower 0.010098725
## cylinders -0.398890632
## year 0.133073372
lda_pred3 <- predict(lda_model3, newdata = test_data2)
lda_class3 <- lda_pred3$class
lda_conf_matrix3 <- table(Predicted = lda_class3, Actual = test_data2$mpg01)
print(lda_conf_matrix)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
test_error <- 1 - sum(diag(lda_conf_matrix3)) / sum(lda_conf_matrix3)
cat("Test Error Rate:", test_error, "\n")
## Test Error Rate: 0.1016949
qda_model3 <- qda(mpg01 ~ weight + displacement + horsepower + cylinders + year, data = train_data2)
qda_pred3 <- predict(qda_model3, newdata = test_data2)$class
qda_conf_matrix3 <- table(Predicted = qda_pred3, Actual = test_data2$mpg01)
qda_test_error3 <- 1 - sum(diag(qda_conf_matrix3)) / sum(qda_conf_matrix3)
print(qda_conf_matrix3)
## Actual
## Predicted 0 1
## 0 53 5
## 1 7 53
cat("QDA Test Error:", qda_test_error3, "\n")
## QDA Test Error: 0.1016949
logit_model3 <- glm(mpg01 ~ weight + displacement + horsepower + cylinders + year,
data = train_data2, family = binomial)
logit_pred_prob2 <- predict(logit_model3, newdata = test_data2, type = "response")
logit_pred2 <- ifelse(logit_pred_prob2 > 0.5, 1, 0)
logit_pred2 <- factor(logit_pred2, levels = c(0, 1))
logit_conf_matrix2 <- table(Predicted = logit_pred2, Actual = test_data2$mpg01)
logit_test_error2 <- 1 - sum(diag(logit_conf_matrix2)) / sum(logit_conf_matrix2)
print(logit_conf_matrix2)
## Actual
## Predicted 0 1
## 0 56 9
## 1 4 49
cat("Logistic Regression Test Error:", logit_test_error2, "\n")
## Logistic Regression Test Error: 0.1101695
nb_model3 <- naiveBayes(mpg01 ~ weight + displacement + horsepower + cylinders + year, data = train_data2)
nb_pred3 <- predict(nb_model3, newdata = test_data2)
nb_conf_matrix3 <- table(Predicted = nb_pred3, Actual = test_data2$mpg01)
nb_test_error3 <- 1 - sum(diag(nb_conf_matrix3)) / sum(nb_conf_matrix3)
print(nb_conf_matrix3)
## Actual
## Predicted 0 1
## 0 52 4
## 1 8 54
cat("Naïve Bayes Test Error:", nb_test_error3, "\n")
## Naïve Bayes Test Error: 0.1016949
train_knn2 <- scale(train_data2[, c("weight", "displacement", "horsepower", "cylinders", "year")])
test_knn2 <- scale(test_data2[, c("weight", "displacement", "horsepower", "cylinders", "year")],
center = attr(train_knn2, "scaled:center"),
scale = attr(train_knn2, "scaled:scale"))
train_Y2 <- train_data2$mpg01
test_Y2 <- test_data2$mpg01
# Try different values of K
k_values2 <- c(1, 3, 5, 7, 10, 15, 20)
knn_errors2 <- c()
for (k in k_values2) {
knn_pred2 <- knn(train_knn2, test_knn2, train_Y2, k = k)
knn_conf_matrix2 <- table(Predicted = knn_pred2, Actual = test_Y2)
knn_error2 <- 1 - sum(diag(knn_conf_matrix2)) / sum(knn_conf_matrix2)
knn_errors2 <- c(knn_errors2, knn_error2)
cat("K =", k, "Test Error:", knn_error2, "\n")
}
## K = 1 Test Error: 0.05932203
## K = 3 Test Error: 0.1186441
## K = 5 Test Error: 0.08474576
## K = 7 Test Error: 0.09322034
## K = 10 Test Error: 0.08474576
## K = 15 Test Error: 0.1016949
## K = 20 Test Error: 0.1016949
best_k2 <- k_values2[which.min(knn_errors2)]
cat("Best K:", best_k2, "with Test Error:", min(knn_errors2), "\n")
## Best K: 1 with Test Error: 0.05932203
data("Boston")
crim_median <- median(Boston$crim)
Boston$crim01 <- ifelse(Boston$crim > crim_median, 1, 0)
Boston$crim01 <- as.factor(Boston$crim01)
str(Boston)
## 'data.frame': 506 obs. of 15 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ crim01 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
cor_matrix <- cor(Boston[, sapply(Boston, is.numeric)])
print(cor_matrix["crim",])
## crim zn indus chas nox rm
## 1.00000000 -0.20046922 0.40658341 -0.05589158 0.42097171 -0.21924670
## age dis rad tax ptratio black
## 0.35273425 -0.37967009 0.62550515 0.58276431 0.28994558 -0.38506394
## lstat medv
## 0.45562148 -0.38830461
set.seed(123)
train_indices3 <- sample(1:nrow(Boston), size = 0.7 * nrow(Boston))
train_data3 <- Boston[train_indices3, ]
test_data3 <- Boston[-train_indices3, ]
Logistic Regression:
logit_model4 <- glm(crim01 ~ lstat + dis + rm + age + indus + nox,
data = train_data3, family = binomial)
logit_pred_prob3 <- predict(logit_model4, newdata = test_data3, type = "response")
logit_pred3 <- ifelse(logit_pred_prob3 > 0.5, 1, 0)
logit_pred3 <- factor(logit_pred3, levels = c(0, 1))
logit_conf_matrix3 <- table(Predicted = logit_pred3, Actual = test_data3$crim01)
logit_test_error3 <- 1 - sum(diag(logit_conf_matrix3)) / sum(logit_conf_matrix3)
print(logit_conf_matrix3)
## Actual
## Predicted 0 1
## 0 60 9
## 1 15 68
cat("Logistic Regression Test Error:", logit_test_error3, "\n")
## Logistic Regression Test Error: 0.1578947
15.8% is moderate accuracy, with 9 high crime areas and 15 misclassified low crime areas
LDA:
lda_model4 <- lda(crim01 ~ lstat + dis + rm + age + indus + nox, data = train_data3)
lda_pred3 <- predict(lda_model4, newdata = test_data3)$class
lda_conf_matrix3 <- table(Predicted = lda_pred3, Actual = test_data3$crim01)
lda_test_error3 <- 1 - sum(diag(lda_conf_matrix3)) / sum(lda_conf_matrix3)
print(lda_conf_matrix3)
## Actual
## Predicted 0 1
## 0 65 11
## 1 10 66
cat("LDA Test Error:", lda_test_error3, "\n")
## LDA Test Error: 0.1381579
13.8% test error is better than logistic regression. Misclassification is slightly lower.
nb_model4 <- naiveBayes(crim01 ~ lstat + dis + rm + age + indus + nox, data = train_data3)
nb_pred3 <- predict(nb_model4, newdata = test_data3)
nb_conf_matrix3 <- table(Predicted = nb_pred3, Actual = test_data3$crim01)
nb_test_error3 <- 1 - sum(diag(nb_conf_matrix3)) / sum(nb_conf_matrix3)
print(nb_conf_matrix3)
## Actual
## Predicted 0 1
## 0 64 12
## 1 11 65
cat("Naïve Bayes Test Error:", nb_test_error3, "\n")
## Naïve Bayes Test Error: 0.1513158
Error rate is similar to logistic regression.
train_knn3 <- scale(train_data3[, c("lstat", "dis", "rm", "age", "indus", "nox")])
test_knn3 <- scale(test_data3[, c("lstat", "dis", "rm", "age", "indus", "nox")],
center = attr(train_knn3, "scaled:center"),
scale = attr(train_knn3, "scaled:scale"))
train_Y3 <- train_data3$crim01
test_Y3 <- test_data3$crim01
k_values3 <- c(1, 3, 5, 7, 10, 15, 20)
knn_errors3 <- c()
for (k in k_values3) {
knn_pred3 <- knn(train_knn3, test_knn3, train_Y3, k = k)
knn_conf_matrix3 <- table(Predicted = knn_pred3, Actual = test_Y3)
knn_error3 <- 1 - sum(diag(knn_conf_matrix3)) / sum(knn_conf_matrix3)
knn_errors3 <- c(knn_errors3, knn_error3)
cat("K =", k, "Test Error:", knn_error3, "\n")
}
## K = 1 Test Error: 0.1052632
## K = 3 Test Error: 0.08552632
## K = 5 Test Error: 0.09868421
## K = 7 Test Error: 0.1118421
## K = 10 Test Error: 0.1315789
## K = 15 Test Error: 0.1578947
## K = 20 Test Error: 0.1513158
best_k3 <- k_values3[which.min(knn_errors3)]
cat("Best K:", best_k3, "with Test Error:", min(knn_errors3), "\n")
## Best K: 3 with Test Error: 0.08552632
The best performing value of K is 3, provides the lowest test error rate at 8.6% compared to all other models above.