# Load libraries
library(ISLR2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
# Load the Weekly dataset
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Plot time-series data for Volume
plot(Weekly$Volume, type = "l", main = "Volume Over Time", ylab = "Volume")
# Create boxplots for lag variables
boxplot(Weekly[, 2:6], main = "Boxplots for Lag Variables")
# Visualize Direction (Up or Down)
table(Weekly$Direction)
##
## Down Up
## 484 605
barplot(table(Weekly$Direction), main = "Distribution of Direction", col = c("red", "green"))
Answer: The Weekly dataset shows an increasing trend in trading
volume over time. The lag variables have similar distributions with some
outliers, and the Direction variable is fairly balanced, with slightly
more “Up” movements (605) than “Down” (484).
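As a numerical check on the rising-volume claim, the pairwise correlations among the numeric columns can be inspected (a quick sketch, not part of the output above; Direction, the ninth column, is a factor and is dropped):
# Correlations among the numeric variables; Year vs. Volume should be strongly
# positive, consistent with the upward trend in the time-series plot
round(cor(Weekly[, -9]), 2)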
log_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(log_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Answer: Lag2 is the only statistically significant predictor at the 5% level (p = 0.0296); Lag1, Lag3, Lag4, Lag5, and Volume are not significant. The intercept is also significant (p = 0.0019), reflecting the overall excess of “Up” weeks.
pred_probs <- predict(log_model, type = "response")
pred_class <- ifelse(pred_probs > 0.5, "Up", "Down")
conf_matrix <- table(Predicted = pred_class, Actual = Weekly$Direction)
conf_matrix
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
accuracy
## [1] 0.5610652
Answer: The confusion matrix shows the model correctly predicts 611 out of 1,089 weeks (accuracy = 56.1%). However, it predicts “Up” for 987 of the 1,089 weeks and misclassifies 430 of the 484 “Down” movements, indicating poor predictive performance.
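Class-specific rates make this imbalance explicit. Treating “Up” as the positive class (a labeling choice made here for illustration):
# Fraction of actual "Up" weeks caught (557/605) and actual "Down" weeks caught (54/484)
sensitivity <- conf_matrix["Up", "Up"] / sum(conf_matrix[, "Up"])
specificity <- conf_matrix["Down", "Down"] / sum(conf_matrix[, "Down"])
c(sensitivity = sensitivity, specificity = specificity)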
train_data <- Weekly[Weekly$Year < 2009, ]
test_data <- Weekly[Weekly$Year >= 2009, ]
log_model_train <- glm(Direction ~ Lag2, data = train_data, family = binomial)
test_probs <- predict(log_model_train, newdata = test_data, type = "response")
test_pred_class <- ifelse(test_probs > 0.5, "Up", "Down")
test_conf_matrix <- table(Predicted = test_pred_class, Actual = test_data$Direction)
test_conf_matrix
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
test_accuracy <- sum(diag(test_conf_matrix)) / sum(test_conf_matrix)
test_accuracy
## [1] 0.625
Answer: The logistic regression model using Lag2 as the only predictor achieves 62.5% accuracy on the 2009–2010 hold-out data, a modest improvement, but it still predicts “Up” for most weeks and misses 34 of the 43 actual “Down” movements.
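To see what the fitted model implies, P(Up) can be evaluated at a few illustrative Lag2 values (the values -2, 0, and 2 are arbitrary choices for illustration; output not shown):
# Predicted probability of an "Up" week at hypothetical Lag2 values
predict(log_model_train, newdata = data.frame(Lag2 = c(-2, 0, 2)), type = "response")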
lda_model <- lda(Direction ~ Lag2, data = train_data)
lda_pred <- predict(lda_model, test_data)
lda_conf_matrix <- table(Predicted = lda_pred$class, Actual = test_data$Direction)
lda_conf_matrix
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
lda_accuracy <- sum(diag(lda_conf_matrix)) / sum(lda_conf_matrix)
lda_accuracy
## [1] 0.625
Answer: The LDA model achieves 62.5% accuracy, identical to logistic regression, with similar misclassification patterns.
qda_model <- qda(Direction ~ Lag2, data = train_data)
qda_pred <- predict(qda_model, test_data)
qda_conf_matrix <- table(Predicted = qda_pred$class, Actual = test_data$Direction)
qda_conf_matrix
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
qda_accuracy <- sum(diag(qda_conf_matrix)) / sum(qda_conf_matrix)
qda_accuracy
## [1] 0.5865385
Answer: The QDA model achieves 58.7% accuracy, lower than logistic regression and LDA. It predicts “Up” for every test observation, so it never classifies a “Down” movement correctly.
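That 58.7% is exactly the no-information baseline: the share of “Up” weeks in the test set.
# Accuracy of always predicting the majority class "Up" on the 2009-2010 test set (61/104)
mean(test_data$Direction == "Up")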
train_X <- train_data$Lag2
test_X <- test_data$Lag2
train_Y <- train_data$Direction
test_Y <- test_data$Direction
# Note: knn() breaks distance ties at random, so K = 1 results may vary slightly across runs
knn_pred <- knn(train = matrix(train_X, ncol = 1), test = matrix(test_X, ncol = 1), cl = train_Y, k = 1)
knn_conf_matrix <- table(Predicted = knn_pred, Actual = test_Y)
knn_conf_matrix
## Actual
## Predicted Down Up
## Down 21 29
## Up 22 32
knn_accuracy <- sum(diag(knn_conf_matrix)) / sum(knn_conf_matrix)
knn_accuracy
## [1] 0.5096154
Answer: The KNN model (K=1) achieves 50.96% accuracy, making it the worst performer, close to random guessing.
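Since K = 1 is highly variable, a natural follow-up is a quick sweep over larger K values (a sketch whose output is not reproduced here; the seed fixes knn()'s random tie-breaking):
set.seed(1)
for (k in c(1, 3, 5, 10)) {
  pred_k <- knn(train = matrix(train_X, ncol = 1), test = matrix(test_X, ncol = 1),
                cl = train_Y, k = k)
  cat("K =", k, "accuracy =", round(mean(pred_k == test_Y), 4), "\n")
}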
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
nb_pred <- predict(nb_model, test_data)
nb_conf_matrix <- table(Predicted = nb_pred, Actual = test_data$Direction)
nb_conf_matrix
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
nb_accuracy <- sum(diag(nb_conf_matrix)) / sum(nb_conf_matrix)
nb_accuracy
## [1] 0.5865385
Answer: The Naïve Bayes model achieves 58.7% accuracy, matching QDA; like QDA, it predicts “Up” for every test week and therefore classifies no “Down” movements correctly.
accuracy_results <- data.frame(
Model = c("Logistic Regression", "LDA", "QDA", "KNN (K=1)", "Naïve Bayes"),
Accuracy = c(test_accuracy, lda_accuracy, qda_accuracy, knn_accuracy, nb_accuracy)
)
print(accuracy_results)
## Model Accuracy
## 1 Logistic Regression 0.6250000
## 2 LDA 0.6250000
## 3 QDA 0.5865385
## 4 KNN (K=1) 0.5096154
## 5 Naïve Bayes 0.5865385
Answer: The best models are Logistic Regression and LDA, both achieving 62.5% accuracy. QDA and Naïve Bayes perform worse at 58.7%, while KNN is the worst with 50.96% accuracy.
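For a quick visual comparison, the tabulated accuracies can also be plotted in the same base-graphics style used above (an optional addition):
barplot(accuracy_results$Accuracy, names.arg = accuracy_results$Model,
        main = "Test Accuracy by Model (Weekly, 2009-2010)",
        ylab = "Accuracy", ylim = c(0, 1), las = 2, cex.names = 0.8)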
# Create the binary variable mpg01 from the Auto dataset
data(Auto, package = "ISLR2")
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
table(Auto$mpg01)
##
## 0 1
## 196 196
Answer: The binary variable mpg01 is successfully created, with an equal distribution of 196 cars classified as “low mileage” (0) and 196 as “high mileage” (1).
# Scatterplot matrix of all Auto variables (the factor column name is shown as numeric codes)
pairs(Auto)
boxplot(Auto$horsepower ~ Auto$mpg01, main="Horsepower vs mpg01", col=c("red", "green"))
boxplot(Auto$weight ~ Auto$mpg01, main="Weight vs mpg01", col=c("blue", "yellow"))
boxplot(Auto$acceleration ~ Auto$mpg01, main="Acceleration vs mpg01", col=c("purple", "orange"))
Answer: Cars with higher horsepower and weight tend to have
lower mileage, while cars with lower horsepower and weight have higher
mileage. Acceleration shows a smaller difference.
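The boxplot impressions can be quantified by correlating the numeric columns with mpg01 (a sketch; output not shown, and name, a factor, is excluded):
# Correlation of each numeric variable with mpg01
num_cols <- sapply(Auto, is.numeric)
round(cor(Auto[, num_cols])["mpg01", ], 2)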
set.seed(1)
train_indices <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
train_data <- Auto[train_indices, ]
test_data <- Auto[-train_indices, ]
lda_model <- lda(mpg01 ~ horsepower + weight + acceleration, data = train_data)
lda_pred <- predict(lda_model, test_data)
lda_conf_matrix <- table(Predicted = lda_pred$class, Actual = test_data$mpg01)
lda_conf_matrix
## Actual
## Predicted 0 1
## 0 48 3
## 1 13 54
lda_accuracy <- sum(diag(lda_conf_matrix)) / sum(lda_conf_matrix)
lda_accuracy
## [1] 0.8644068
Answer: The LDA model achieves 86.44% accuracy, correctly classifying most observations with a few misclassifications.
qda_model <- qda(mpg01 ~ horsepower + weight + acceleration, data = train_data)
qda_pred <- predict(qda_model, test_data)
qda_conf_matrix <- table(Predicted = qda_pred$class, Actual = test_data$mpg01)
qda_conf_matrix
## Actual
## Predicted 0 1
## 0 53 6
## 1 8 51
qda_accuracy <- sum(diag(qda_conf_matrix)) / sum(qda_conf_matrix)
qda_accuracy
## [1] 0.8813559
Answer: The QDA model achieves 88.14% accuracy, performing slightly better than LDA.
log_model <- glm(mpg01 ~ horsepower + weight + acceleration, data = train_data, family = binomial)
log_probs <- predict(log_model, test_data, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)
log_conf_matrix <- table(Predicted = log_pred, Actual = test_data$mpg01)
log_conf_matrix
## Actual
## Predicted 0 1
## 0 53 7
## 1 8 50
log_accuracy <- sum(diag(log_conf_matrix)) / sum(log_conf_matrix)
log_accuracy
## [1] 0.8728814
Answer: The logistic regression model achieves 87.29% accuracy, slightly lower than QDA but higher than LDA.
nb_model <- naiveBayes(mpg01 ~ horsepower + weight + acceleration, data = train_data)
nb_pred <- predict(nb_model, test_data)
nb_conf_matrix <- table(Predicted = nb_pred, Actual = test_data$mpg01)
nb_conf_matrix
## Actual
## Predicted 0 1
## 0 51 4
## 1 10 53
nb_accuracy <- sum(diag(nb_conf_matrix)) / sum(nb_conf_matrix)
nb_accuracy
## [1] 0.8813559
Answer: The Naïve Bayes model achieves 88.14% accuracy, matching QDA and performing slightly better than logistic regression and LDA.
train_X <- train_data[, c("horsepower", "weight", "acceleration")]
test_X <- test_data[, c("horsepower", "weight", "acceleration")]
train_Y <- train_data$mpg01
test_Y <- test_data$mpg01
set.seed(1)
k_values <- c(1, 3, 5, 10)
knn_results <- data.frame(K = k_values, Accuracy = NA)
for (i in 1:length(k_values)) {
knn_pred <- knn(train = train_X, test = test_X, cl = train_Y, k = k_values[i])
knn_conf_matrix <- table(Predicted = knn_pred, Actual = test_Y)
knn_accuracy <- sum(diag(knn_conf_matrix)) / sum(knn_conf_matrix)
knn_results$Accuracy[i] <- knn_accuracy
}
print(knn_results)
## K Accuracy
## 1 1 0.8559322
## 2 3 0.9067797
## 3 5 0.8813559
## 4 10 0.8728814
Answer: The best-performing KNN model is K = 3, achieving 90.68% accuracy, the highest among all models.
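One caveat: KNN is distance-based, and the unscaled predictors let weight (values in the thousands) dominate horsepower and acceleration. A sketch of standardizing with training-set statistics before rerunning KNN (output not shown):
# Standardize using the training set's means and SDs; apply the same transform to the test set
train_means <- colMeans(train_X)
train_sds <- apply(train_X, 2, sd)
train_X_scaled <- scale(train_X, center = train_means, scale = train_sds)
test_X_scaled <- scale(test_X, center = train_means, scale = train_sds)
set.seed(1)
knn_pred_scaled <- knn(train = train_X_scaled, test = test_X_scaled, cl = train_Y, k = 3)
mean(knn_pred_scaled == test_Y)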
# Load the Boston dataset
library(MASS)
data(Boston)
# Create binary variable Crime_High (1 if above median, 0 if below median)
Boston$Crime_High <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
table(Boston$Crime_High)
##
## 0 1
## 253 253
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv Crime_High
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
Answer: The crime rate variable is successfully converted into a binary classification problem, with an equal distribution of census tracts classified as high crime (1) and low crime (0).
# Correlation analysis
correlation_matrix <- cor(Boston)
print(correlation_matrix["crim",]) # Show correlation of crim with other variables
## crim zn indus chas nox rm
## 1.00000000 -0.20046922 0.40658341 -0.05589158 0.42097171 -0.21924670
## age dis rad tax ptratio black
## 0.35273425 -0.37967009 0.62550515 0.58276431 0.28994558 -0.38506394
## lstat medv Crime_High
## 0.45562148 -0.38830461 0.40939545
# Boxplots to explore feature relationships
boxplot(Boston$age ~ Boston$Crime_High, main = "Age vs Crime Rate", col = c("blue", "red"))
boxplot(Boston$tax ~ Boston$Crime_High, main = "Tax Rate vs Crime Rate", col = c("green", "yellow"))
boxplot(Boston$rm ~ Boston$Crime_High, main = "Rooms per Dwelling vs Crime Rate", col = c("purple", "orange"))
Answer: Higher crime areas tend to have older housing
(higher age), higher tax rates, and fewer rooms per
dwelling.
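The correlation row printed above also flags rad (0.63), tax (0.58), and nox (0.42) as strong candidates; a quick look at one of them (plot not shown):
# NOx concentration by crime class
boxplot(Boston$nox ~ Boston$Crime_High, main = "NOx vs Crime Rate", col = c("lightblue", "salmon"))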
set.seed(1)
train_indices <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]
log_model <- glm(Crime_High ~ age + tax + rm, data = train_data, family = binomial)
log_probs <- predict(log_model, test_data, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)
log_conf_matrix <- table(Predicted = log_pred, Actual = test_data$Crime_High)
log_conf_matrix
## Actual
## Predicted 0 1
## 0 66 13
## 1 7 66
log_accuracy <- sum(diag(log_conf_matrix)) / sum(log_conf_matrix)
log_accuracy
## [1] 0.8684211
Answer: The logistic regression model achieves 86.84% accuracy, correctly classifying most observations with few misclassifications.
lda_model <- lda(Crime_High ~ age + tax + rm, data = train_data)
lda_pred <- predict(lda_model, test_data)
lda_conf_matrix <- table(Predicted = lda_pred$class, Actual = test_data$Crime_High)
lda_conf_matrix
## Actual
## Predicted 0 1
## 0 66 15
## 1 7 64
lda_accuracy <- sum(diag(lda_conf_matrix)) / sum(lda_conf_matrix)
lda_accuracy
## [1] 0.8552632
Answer: The LDA model achieves 85.53% accuracy, slightly lower than logistic regression but still performs well.
nb_model <- naiveBayes(Crime_High ~ age + tax + rm, data = train_data)
nb_pred <- predict(nb_model, test_data)
nb_conf_matrix <- table(Predicted = nb_pred, Actual = test_data$Crime_High)
nb_conf_matrix
## Actual
## Predicted 0 1
## 0 65 18
## 1 8 61
nb_accuracy <- sum(diag(nb_conf_matrix)) / sum(nb_conf_matrix)
nb_accuracy
## [1] 0.8289474
Answer: The Naïve Bayes model achieves 82.89% accuracy, lower than both logistic regression and LDA.
train_X <- train_data[, c("age", "tax", "rm")]
test_X <- test_data[, c("age", "tax", "rm")]
train_Y <- train_data$Crime_High
test_Y <- test_data$Crime_High
set.seed(1)
k_values <- c(1, 3, 5, 10)
knn_results <- data.frame(K = k_values, Accuracy = NA)
for (i in 1:length(k_values)) {
knn_pred <- knn(train = train_X, test = test_X, cl = train_Y, k = k_values[i])
knn_conf_matrix <- table(Predicted = knn_pred, Actual = test_Y)
knn_accuracy <- sum(diag(knn_conf_matrix)) / sum(knn_conf_matrix)
knn_results$Accuracy[i] <- knn_accuracy
}
print(knn_results)
## K Accuracy
## 1 1 0.8815789
## 2 3 0.9013158
## 3 5 0.9013158
## 4 10 0.8684211
Answer: The best-performing KNN models are K = 3 and K = 5, both achieving 90.13% accuracy, the highest among all models.
accuracy_results <- data.frame(
Model = c("Logistic Regression", "LDA", "Naïve Bayes", "Best KNN"),
Accuracy = c(log_accuracy, lda_accuracy, nb_accuracy, max(knn_results$Accuracy))
)
print(accuracy_results)
## Model Accuracy
## 1 Logistic Regression 0.8684211
## 2 LDA 0.8552632
## 3 Naïve Bayes 0.8289474
## 4 Best KNN 0.9013158
Answer: The best model is KNN (K = 3 or 5), achieving 90.13% accuracy. Logistic regression and LDA performed slightly worse, while Naïve Bayes had the lowest accuracy.
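Because every accuracy above comes from a single random split, a cautious follow-up is to repeat the split and average; a sketch for KNN with K = 5 (output not shown; this check is illustrative and not part of the original analysis):
# Average test accuracy of KNN (K = 5) over 20 random 70/30 splits
set.seed(1)
accs <- replicate(20, {
  idx <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
  tr <- Boston[idx, ]
  te <- Boston[-idx, ]
  pred <- knn(train = tr[, c("age", "tax", "rm")], test = te[, c("age", "tax", "rm")],
              cl = tr$Crime_High, k = 5)
  mean(pred == te$Crime_High)
})
mean(accs)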