library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data(Weekly)
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
ggplot(Weekly, aes(x = Today)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
theme_minimal() +
labs(title = "Distribution of Weekly Returns", x = "Today's Return (%)", y = "Count")
ggplot(Weekly, aes(x = Volume)) +
geom_density(fill = "purple", alpha = 0.4) +
theme_minimal() +
labs(title = "Density Plot of Trading Volume", x = "Volume (in billions)", y = "Density")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
cor_matrix <- cor(Weekly[, -9])
corrplot(cor_matrix, method = "color", type = "upper", tl.col = "black", tl.srt = 45)
We do see some patterns appear. Volume predictor seems to be normally distributed. We can also see from the correlation plot that there is almost no correlation between each Lag predictor and the Today variable.
log_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(log_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only predictor that is statistically significant.
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.3
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
train <- Weekly[Weekly$Year < 2009, ]
test <- Weekly[Weekly$Year >= 2009, ]
prob <- predict(log_model, newdata = test, type = "response")
pred_class <- ifelse(prob > 0.5, "Up", "Down")
pred_class <- factor(pred_class, levels = c("Down", "Up"))
actual_class <- factor(test$Direction, levels = c("Down", "Up"))
table(Predicted = pred_class, Actual = actual_class)
## Actual
## Predicted Down Up
## Down 17 13
## Up 26 48
The log model is showing more false positives than false positives. It predicted up when the market was down 26 times compared to opposite which was only 13 times. The model is better at identifying Up days than Down days.
train <- (Weekly$Year < 2009)
test <- Weekly[Weekly$Year >= 2009, ]
log_model1 <- glm(Direction ~ Lag2, data = Weekly, family = binomial)
prob1 <- predict(log_model1, newdata = test, type = "response")
pred_class <- ifelse(prob1 > 0.5, "Up", "Down")
pred_class <- factor(pred_class, levels = c("Down", "Up"))
actual_class <- factor(test$Direction, levels = c("Down", "Up"))
table(Predicted = pred_class, Actual = actual_class)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
Direction.test <- Weekly$Direction[!train]
lda_model <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda_pred <- predict(lda_model, test)
lda.class <- lda_pred$class
table(Predicted = lda.class, Actual = Direction.test)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
Direction.test <- Weekly$Direction[!train]
qda_model <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda_pred <- predict(qda_model, test)
qda.class <- qda_pred$class
table(Predicted = qda.class, Actual = Direction.test)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
library(class)
## Warning: package 'class' was built under R version 4.3.3
train.index <- (Weekly$Year < 2009)
X.train <- Weekly[train.index, "Lag2", drop = FALSE]
X.test <- Weekly[!train.index, "Lag2", drop = FALSE]
Y.train <- Weekly$Direction[train.index]
Direction.test <- Weekly$Direction[!train.index]
set.seed(1)
knn.pred <- knn(train = X.train, test = X.test, cl = Y.train, k = 1)
table(Predicted = knn.pred, Actual = Direction.test)
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
train <- (Weekly$Year < 2009)
Weekly.test <- Weekly[!train, ]
Direction.test <- Weekly$Direction[!train]
nb.fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.class <- predict(nb.fit, Weekly.test)
table(Predicted = nb.class, Actual = Direction.test)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
Log regression and LDA tie for the best results.
train <- (Weekly$Year < 2009)
Weekly.test <- Weekly[!train, ]
Direction.test <- Weekly$Direction[!train]
best.logit <- glm(Direction ~ Lag1 + Lag2,
data = Weekly,
family = binomial,
subset = train)
best.probs <- predict(best.logit, Weekly.test, type = "response")
best.pred <- rep("Down", length(best.probs))
best.pred[best.probs > 0.5] <- "Up"
table(Predicted = best.pred, Actual = Direction.test)
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
X.train <- Weekly[train, "Lag2", drop = FALSE]
X.test <- Weekly[!train, "Lag2", drop = FALSE]
Y.train <- Weekly$Direction[train]
set.seed(1)
knn.pred.9 <- knn(train = X.train, test = X.test, cl = Y.train, k = 9)
table(Predicted = knn.pred.9, Actual = Direction.test)
## Actual
## Predicted Down Up
## Down 17 20
## Up 26 41
mpg_median <- median(Auto$mpg)
mpg01 <- as.integer(Auto$mpg > mpg_median)
Auto_new <- data.frame(mpg01, Auto)
head(Auto_new)
## mpg01 mpg cylinders displacement horsepower weight acceleration year origin
## 1 0 18 8 307 130 3504 12.0 70 1
## 2 0 15 8 350 165 3693 11.5 70 1
## 3 0 18 8 318 150 3436 11.0 70 1
## 4 0 16 8 304 150 3433 12.0 70 1
## 5 0 17 8 302 140 3449 10.5 70 1
## 6 0 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
boxplot(cylinders ~ mpg01, data = Auto_new, main = "Cylinders vs mpg01", xlab = "mpg01", ylab = "Cylinders", col = "lightblue")
boxplot(displacement ~ mpg01, data = Auto_new, main = "Displacement vs mpg01", xlab = "mpg01", ylab = "Displacement", col = "lightgreen")
boxplot(horsepower ~ mpg01, data = Auto_new, main = "Horsepower vs mpg01", xlab = "mpg01", ylab = "Horsepower", col = "lightpink")
boxplot(weight ~ mpg01, data = Auto_new, main = "Weight vs mpg01", xlab = "mpg01", ylab = "Weight", col = "lightyellow")
boxplot(acceleration ~ mpg01, data = Auto_new, main = "Acceleration vs mpg01", xlab = "mpg01", ylab = "Acceleration", col = "lavender")
boxplot(year ~ mpg01, data = Auto_new, main = "Year vs mpg01", xlab = "mpg01", ylab = "Year", col = "wheat")
pairs(Auto_new[, c("mpg01", "cylinders", "displacement", "horsepower", "weight", "acceleration", "year")])
library(caret)
set.seed(42)
train_size <- floor(0.80 * nrow(Auto_new))
train_indices <- sample(seq_len(nrow(Auto_new)), size = train_size)
train_set <- Auto_new[train_indices, ]
test_set <- Auto_new[-train_indices, ]
lda_automodel <- lda(mpg01 ~ weight + displacement + horsepower + cylinders, data = train_set)
print(lda_automodel)
## Call:
## lda(mpg01 ~ weight + displacement + horsepower + cylinders, data = train_set)
##
## Prior probabilities of groups:
## 0 1
## 0.5271565 0.4728435
##
## Group means:
## weight displacement horsepower cylinders
## 0 3597.970 272.5394 129.38788 6.751515
## 1 2345.905 117.0912 78.66892 4.202703
##
## Coefficients of linear discriminants:
## LD1
## weight -0.0009597286
## displacement -0.0016031063
## horsepower 0.0013528155
## cylinders -0.3798340772
lda_autopred <- predict(lda_automodel, newdata = test_set)
lda_autoclass <- lda_autopred$class
test_error <- mean(lda_autoclass != test_set$mpg01)
print(test_error)
## [1] 0.06329114
Test error is .0633
qda_automodel <- qda(mpg01 ~ weight + displacement + horsepower + cylinders, data = train_set)
print(qda_automodel)
## Call:
## qda(mpg01 ~ weight + displacement + horsepower + cylinders, data = train_set)
##
## Prior probabilities of groups:
## 0 1
## 0.5271565 0.4728435
##
## Group means:
## weight displacement horsepower cylinders
## 0 3597.970 272.5394 129.38788 6.751515
## 1 2345.905 117.0912 78.66892 4.202703
qda_autopred <- predict(qda_automodel, newdata = test_set)
qda_autoclass <- qda_autopred$class
test_error_qda <- mean(qda_autoclass != test_set$mpg01)
print(test_error_qda)
## [1] 0.07594937
Test error is .0760
log_automodel <- glm(mpg01 ~ weight + displacement + horsepower + cylinders, data = train_set, family = binomial)
summary(log_automodel)
##
## Call:
## glm(formula = mpg01 ~ weight + displacement + horsepower + cylinders,
## family = binomial, data = train_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.4938653 1.8908145 6.079 1.21e-09 ***
## weight -0.0018412 0.0007409 -2.485 0.0130 *
## displacement -0.0109249 0.0088261 -1.238 0.2158
## horsepower -0.0471302 0.0148478 -3.174 0.0015 **
## cylinders -0.0045738 0.3720303 -0.012 0.9902
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 432.99 on 312 degrees of freedom
## Residual deviance: 174.26 on 308 degrees of freedom
## AIC: 184.26
##
## Number of Fisher Scoring iterations: 7
prob_autolog <- predict(log_automodel, newdata = test_set, type = "response")
pred_autolog <- ifelse(prob_autolog > 0.5, 1, 0)
test_error_log <- mean(pred_autolog != test_set$mpg01)
nb_automodel <- naiveBayes(mpg01 ~ weight + displacement + horsepower + cylinders, data = train_set)
print(nb_automodel)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5271565 0.4728435
##
## Conditional probabilities:
## weight
## Y [,1] [,2]
## 0 3597.970 670.9549
## 1 2345.905 409.4496
##
## displacement
## Y [,1] [,2]
## 0 272.5394 89.45761
## 1 117.0912 41.22017
##
## horsepower
## Y [,1] [,2]
## 0 129.38788 36.80148
## 1 78.66892 16.11328
##
## cylinders
## Y [,1] [,2]
## 0 6.751515 1.4542036
## 1 4.202703 0.7186482
nb_autopred <- predict(nb_automodel, newdata = test_set)
test_error_nb <- mean(nb_autopred != test_set$mpg01)
print(test_error_nb)
## [1] 0.06329114
predictors <- c("weight", "displacement", "horsepower", "cylinders")
scaled_predictors <- scale(Auto_new[, predictors])
train_X <- scaled_predictors[train_indices, ]
test_X <- scaled_predictors[-train_indices, ]
train_y <- Auto_new$mpg01[train_indices]
test_y <- Auto_new$mpg01[-train_indices]
k_values <- c(1, 3, 5, 7, 10, 15, 20, 50)
knn_errors <- numeric(length(k_values))
for (i in seq_along(k_values)) {
set.seed(42)
knn_pred <- knn(train = train_X, test = test_X, cl = train_y, k = k_values[i])
knn_errors[i] <- mean(knn_pred != test_y)
}
knn_results <- data.frame(K = k_values, Test_Error = round(knn_errors, 4))
print(knn_results)
## K Test_Error
## 1 1 0.1266
## 2 3 0.0886
## 3 5 0.0759
## 4 7 0.0633
## 5 10 0.0506
## 6 15 0.0633
## 7 20 0.0633
## 8 50 0.0633
The best K-value would 10 followe by 15, 20, and 50.
crime_median <- median(Boston$crim)
crim01 <- as.integer(Boston$crim > crime_median)
Boston_new <- data.frame(crim01, Boston)
Boston_new$crim <- NULL
cor(Boston_new)[, "crim01"]
## crim01 zn indus chas nox rm
## 1.00000000 -0.43615103 0.60326017 0.07009677 0.72323480 -0.15637178
## age dis rad tax ptratio black
## 0.61393992 -0.61634164 0.61978625 0.60874128 0.25356836 -0.35121093
## lstat medv
## 0.45326273 -0.26301673
boxplot(rad ~ crim01, data = Boston_new, main = "rad vs crim01", xlab = "crim01", ylab = "rad", col = "lightblue")
boxplot(indus ~ crim01, data = Boston_new, main = "indus vs crim01", xlab = "crim01", ylab = "indus", col = "lightgreen")
boxplot(nox ~ crim01, data = Boston_new, main = "nox vs crim01", xlab = "crim01", ylab = "nox", col = "lightpink")
boxplot(age ~ crim01, data = Boston_new, main = "age vs crim01", xlab = "crim01", ylab = "age", col = "lightyellow")
boxplot(dis ~ crim01, data = Boston_new, main = "dis vs crim01", xlab = "crim01", ylab = "dis", col = "lavender")
boxplot(tax ~ crim01, data = Boston_new, main = "Year vs crim01", xlab = "crim01", ylab = "tax", col = "wheat")
set.seed(123)
train_indices <- sample(1:nrow(Boston_new), 0.8 * nrow(Boston_new))
bostontrain_set <- Boston_new[train_indices, ]
bostontest_set <- Boston_new[-train_indices, ]
boston_log <- glm(crim01 ~ nox + rad + tax + dis, data = bostontrain_set, family = binomial)
summary(boston_log)
##
## Call:
## glm(formula = crim01 ~ nox + rad + tax + dis, family = binomial,
## data = bostontrain_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.432095 3.566290 -6.010 1.86e-09 ***
## nox 39.659745 6.067602 6.536 6.31e-11 ***
## rad 0.552769 0.123378 4.480 7.45e-06 ***
## tax -0.009133 0.002500 -3.654 0.000258 ***
## dis 0.077851 0.156504 0.497 0.618880
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 560.02 on 403 degrees of freedom
## Residual deviance: 202.12 on 399 degrees of freedom
## AIC: 212.12
##
## Number of Fisher Scoring iterations: 8
bostonlog_prob <- predict(boston_log, newdata = bostontest_set, type = "response")
bostonlog_pred <- ifelse(bostonlog_prob > 0.5, 1, 0)
bostonlog_error <- mean(bostonlog_pred != test_set$crim01)
print(bostonlog_error)
## [1] NaN
boston_lda <- lda(crim01 ~ nox + rad + tax + dis, data = bostontrain_set)
print(boston_lda)
## Call:
## lda(crim01 ~ nox + rad + tax + dis, data = bostontrain_set)
##
## Prior probabilities of groups:
## 0 1
## 0.5049505 0.4950495
##
## Group means:
## nox rad tax dis
## 0 0.4699054 4.22549 309.6765 5.147072
## 1 0.6361000 14.79000 508.6700 2.502883
##
## Coefficients of linear discriminants:
## LD1
## nox 8.311749661
## rad 0.088010880
## tax -0.001745742
## dis -0.141699629
bostonpred_lda <- predict(boston_lda, newdata = bostontest_set)$class
lda_bostonerror <- mean(bostonpred_lda != bostontest_set$crim01)
print(lda_bostonerror)
## [1] 0.1568627
boston_nb <- naiveBayes(crim01 ~ nox + rad + tax + dis, data = bostontrain_set)
print(boston_nb)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5049505 0.4950495
##
## Conditional probabilities:
## nox
## Y [,1] [,2]
## 0 0.4699054 0.05497320
## 1 0.6361000 0.09875222
##
## rad
## Y [,1] [,2]
## 0 4.22549 1.618288
## 1 14.79000 9.563115
##
## tax
## Y [,1] [,2]
## 0 309.6765 91.01661
## 1 508.6700 168.01885
##
## dis
## Y [,1] [,2]
## 0 5.147072 2.09731
## 1 2.502883 1.07322
boston_nbpred <- predict(boston_nb, newdata = test_set)
## Warning in predict.naiveBayes(boston_nb, newdata = test_set): Type mismatch
## between training and new data for variable 'nox'. Did you use factors with
## numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(boston_nb, newdata = test_set): Type mismatch
## between training and new data for variable 'rad'. Did you use factors with
## numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(boston_nb, newdata = test_set): Type mismatch
## between training and new data for variable 'tax'. Did you use factors with
## numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(boston_nb, newdata = test_set): Type mismatch
## between training and new data for variable 'dis'. Did you use factors with
## numeric labels for training, and numeric values for new data?
boston_nberror <- mean(boston_nbpred != bostontest_set$crim01)
## Warning in `!=.default`(boston_nbpred, bostontest_set$crim01): longer object
## length is not a multiple of shorter object length
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
print(boston_nberror)
## [1] 0.5196078
bostonscaled_features <- scale(Boston_new[, -1])
train_boston <- bostonscaled_features[train_indices, c("nox", "rad", "tax", "dis")]
test_boston <- bostonscaled_features[-train_indices, c("nox", "rad", "tax", "dis")]
train_y <- Boston_new$crim01[train_indices]
test_y <- Boston_new$crim01[-train_indices]
set.seed(123)
knn_pred <- knn(train_boston, test_boston, train_y, k = 5)
err_knn <- mean(knn_pred != test_y)
print(err_knn)
## [1] 0.06862745