library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(MASS)
## Warning: package 'MASS' was built under R version 4.3.2
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
## Warning: package 'class' was built under R version 4.3.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly[, -9]) # Exclude Direction (factor)
cor(Weekly[, -9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
plot(Weekly$Volume, type='l', main="Volume Over Time")
table(Weekly$Direction)
##
## Down Up
## 484 605
Patterns are that there are more down than ups in the data. There is a gradual increase in weekly volume over time. There are weak relationships between Lag1 to Lag2 and Today as well as in Lag1 to Lag2 and Year. Strong correlation between year and volume.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The statistically significant predictor is Lag2 with a p-value less than 0.05.
probs <- predict(glm.fit, type = "response")
pred <- ifelse(probs > 0.5, "Up", "Down")
table(Predicted = pred, Actual = Weekly$Direction)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
accuracy <- (54 + 557) / 1089 # TP + TN / total
accuracy
## [1] 0.5610652
This is telling us that this model often predicts “Up” more than “down.” The false negative is very high. There is a moderate accuracy of 56.11% with a bias towards predicting “Up” direction.
train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008
glm.fit2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
probs2 <- predict(glm.fit2, newdata = Weekly[test, ], type = "response")
pred2 <- ifelse(probs2 > 0.5, "Up", "Down")
table(Predicted = pred2, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
mean(pred2 == Weekly$Direction[test])
## [1] 0.625
lda.fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred <- predict(lda.fit, Weekly[test, ])
table(Predicted = lda.pred$class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
mean(lda.pred$class == Weekly$Direction[test])
## [1] 0.625
qda.fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred <- predict(qda.fit, Weekly[test, ])
table(Predicted = qda.pred$class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(qda.pred$class == Weekly$Direction[test])
## [1] 0.5865385
train.X <- as.matrix(Weekly[train, "Lag2"])
test.X <- as.matrix(Weekly[test, "Lag2"])
train.Direction <- Weekly$Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(Predicted = knn.pred, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 21 30
## Up 22 31
mean(knn.pred == Weekly$Direction[test])
## [1] 0.5
nb.fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.pred <- predict(nb.fit, Weekly[test, ])
table(Predicted = nb.pred, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(nb.pred == Weekly$Direction[test])
## [1] 0.5865385
The best method and the highest accuracy was LDA with a 62.50% accuracy.
train <- Weekly$Year <= 2008
test <- Weekly$Year > 2008
glm.fit <- glm(Direction ~ Lag2 * Lag1 + Volume,
data = Weekly, family = binomial, subset = train)
probs <- predict(glm.fit, Weekly[test, ], type = "response")
pred <- ifelse(probs > 0.5, "Up", "Down")
table(Predicted = pred, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 27 32
## Up 16 29
mean(pred == Weekly$Direction[test])
## [1] 0.5384615
lda.fit <- lda(Direction ~ Lag2 + Lag1 + I(Lag1*Lag2), data = Weekly, subset = train)
lda.pred <- predict(lda.fit, Weekly[test, ])
table(Predicted = lda.pred$class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 7 8
## Up 36 53
mean(lda.pred$class == Weekly$Direction[test])
## [1] 0.5769231
qda.fit <- qda(Direction ~ Lag2 + I(Lag2^2) + I(Lag1*Lag2), data = Weekly, subset = train)
qda.pred <- predict(qda.fit, Weekly[test, ])
table(Predicted = qda.pred$class, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 18 29
## Up 25 32
mean(qda.pred$class == Weekly$Direction[test])
## [1] 0.4807692
train.X <- scale(as.matrix(Weekly[train, c("Lag1", "Lag2")]))
test.X <- scale(as.matrix(Weekly[test, c("Lag1", "Lag2")]))
train.Direction <- Weekly$Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 3)
table(Predicted = knn.pred, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 23 32
## Up 20 29
mean(knn.pred == Weekly$Direction[test])
## [1] 0.5
nb.fit <- naiveBayes(Direction ~ Lag2 + I(Lag2^2), data = Weekly, subset = train)
nb.pred <- predict(nb.fit, Weekly[test, ])
## Warning in predict.naiveBayes(nb.fit, Weekly[test, ]): Type mismatch between
## training and new data for variable 'I(Lag2^2)'. Did you use factors with
## numeric labels for training, and numeric values for new data?
table(Predicted = nb.pred, Actual = Weekly$Direction[test])
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
mean(nb.pred == Weekly$Direction[test])
## [1] 0.5865385
After these transformations performed on this subset of the data, the naive bayes method proved to have the highest accuracy of 58.65% therefore it is considered the best method in this case.
Auto <- read.csv("C:/Users/lclha/Documents/MSDA Program 2024-2026/Summer 2025/Predictive Modeling/Datasets/Auto.csv", header = TRUE, na.strings = "?")
Auto <- na.omit(Auto)
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
par(mfrow = c(2, 2))
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01")
cor(Auto[, sapply(Auto, is.numeric)])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
Useful predictors would be horsepower, weight,displacement,cylinder have far apart medians between the binary responses for mpg01 and they all have strong negative correlations against mpg01 variable.
set.seed(1)
train_index <- sample(1:nrow(Auto), nrow(Auto) / 2)
train <- Auto[train_index, ]
test <- Auto[-train_index, ]
vars <- c("horsepower", "weight", "displacement", "cylinders")
library(MASS)
lda.fit <- lda(mpg01 ~ horsepower + weight + displacement + cylinders, data = train)
lda.pred <- predict(lda.fit, test)
lda_error <- mean(lda.pred$class != test$mpg01)
lda_error
## [1] 0.127551
qda.fit <- qda(mpg01 ~ horsepower + weight + displacement + cylinders, data = train)
qda.pred <- predict(qda.fit, test)
qda_error <- mean(qda.pred$class != test$mpg01)
qda_error
## [1] 0.1173469
glm.fit <- glm(mpg01 ~ horsepower + weight + displacement + cylinders, data = train, family = binomial)
probs <- predict(glm.fit, test, type = "response")
glm.pred <- ifelse(probs > 0.5, 1, 0)
logit_error <- mean(glm.pred != test$mpg01)
logit_error
## [1] 0.122449
library(e1071)
nb.fit <- naiveBayes(as.factor(mpg01) ~ horsepower + weight + displacement + cylinders, data = train)
nb.pred <- predict(nb.fit, test)
nb_error <- mean(nb.pred != test$mpg01)
nb_error
## [1] 0.1173469
library(class)
train.X <- scale(train[, vars])
test.X <- scale(test[, vars], center = attr(train.X, "scaled:center"), scale = attr(train.X, "scaled:scale"))
train.Y <- train$mpg01
# Try different K values
set.seed(1)
knn_errors <- sapply(1:10, function(k) {
knn.pred <- knn(train.X, test.X, train.Y, k = k)
mean(knn.pred != test$mpg01)
})
knn_errors
## [1] 0.1530612 0.1275510 0.1122449 0.1173469 0.1173469 0.1275510 0.1173469
## [8] 0.1224490 0.1173469 0.1173469
best_k <- which.min(knn_errors)
best_knn_error <- knn_errors[best_k]
best_k
## [1] 3
best_knn_error
## [1] 0.1122449
cat("LDA error:", lda_error, "\n")
## LDA error: 0.127551
cat("QDA error:", qda_error, "\n")
## QDA error: 0.1173469
cat("Logistic Regression error:", logit_error, "\n")
## Logistic Regression error: 0.122449
cat("Naive Bayes error:", nb_error, "\n")
## Naive Bayes error: 0.1173469
cat("Best KNN error:", best_knn_error, "at K =", best_k, "\n")
## Best KNN error: 0.1122449 at K = 3
library(MASS)
library(class)
library(e1071)
# Load Boston data
data("Boston")
# Create binary target variable: 1 if crime rate > median, else 0
Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
?Boston
## starting httpd help server ... done
correlations <- cor(Boston[, sapply(Boston, is.numeric)])
correlations
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim01 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## black lstat medv crim01
## crim -0.38506394 0.4556215 -0.3883046 0.40939545
## zn 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus -0.35697654 0.6037997 -0.4837252 0.60326017
## chas 0.04878848 -0.0539293 0.1752602 0.07009677
## nox -0.38005064 0.5908789 -0.4273208 0.72323480
## rm 0.12806864 -0.6138083 0.6953599 -0.15637178
## age -0.27353398 0.6023385 -0.3769546 0.61393992
## dis 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad -0.44441282 0.4886763 -0.3816262 0.61978625
## tax -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio -0.17738330 0.3740443 -0.5077867 0.25356836
## black 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat -0.36608690 1.0000000 -0.7376627 0.45326273
## medv 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim01 -0.35121093 0.4532627 -0.2630167 1.00000000
boxplot(Boston$tax ~ Boston$crim01, main = "Tax vs Crime01")
boxplot(Boston$nox ~ Boston$crim01, main = "NOx vs Crime01")
boxplot(Boston$dis ~ Boston$crim01, main = "Distance vs Crime01")
boxplot(Boston$indus ~ Boston$crim01, main = "Industry vs Crime01")
boxplot(Boston$age ~ Boston$crim01, main = "Age of Units Prior 1940 vs Crime01")
boxplot(Boston$rad ~ Boston$crim01, main = "Radical Highways vs Crime01")
Useful predictors would be tax, nox, dis,indus,age, rad variables have
far apart medians between the binary responses for crim01 and they all
have strong correlations against crim01 variable.
set.seed(1)
train_index <- sample(1:nrow(Boston), nrow(Boston) / 2)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
glm.fit <- glm(crim01 ~ nox + rad + dis + tax + indus + age, data = train, family = binomial)
glm.probs <- predict(glm.fit, newdata = test, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, 1, 0)
logit_error2 <- mean(glm.pred != test$crim01)
logit_error2
## [1] 0.1146245
lda.fit <- lda(crim01 ~ nox + rad + dis + tax + indus + age, data = train)
lda.pred <- predict(lda.fit, test)
lda_error2 <- mean(lda.pred$class != test$crim01)
lda_error2
## [1] 0.1581028
nb.fit <- naiveBayes(as.factor(crim01) ~nox + rad + dis + tax + indus + age, data = train)
nb.pred <- predict(nb.fit, test)
nb_error2 <- mean(nb.pred != test$crim01)
nb_error2
## [1] 0.1778656
vars <- c("nox", "rad", "dis", "tax", "age","indus")
train.X <- scale(train[, vars])
test.X <- scale(test[, vars], center = attr(train.X, "scaled:center"), scale = attr(train.X, "scaled:scale"))
train.Y <- train$crim01
# Try K = 1 to 10
knn.errors <- sapply(1:10, function(k) {
pred <- knn(train.X, test.X, cl = train.Y, k = k)
mean(pred != test$crim01)
})
# Best K
best_k2 <- which.min(knn.errors)
best_knn_error2 <- knn.errors[which.min(knn.errors)]
best_k2
## [1] 3
best_knn_error2
## [1] 0.07509881
cat("LDA error:", lda_error2, "\n")
## LDA error: 0.1581028
cat("Logistic Regression error:", logit_error2, "\n")
## Logistic Regression error: 0.1146245
cat("Naive Bayes error:", nb_error2, "\n")
## Naive Bayes error: 0.1778656
cat("Best KNN error:", best_knn_error2, "at K =", best_k2, "\n")
## Best KNN error: 0.07509881 at K = 3
Based on test error rates, KNN (with K = 3) performed best on this dataset. Variables like nox, rad, dis, tax, age,indus were the most predictive of whether a suburb’s crime rate is above the median. LDA and Naive Bayes also performed reasonably well, while logistic regression was slightly better.