library(knitr)
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(class)
library(MASS)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
# Structure and summary statistics of the Weekly data set.
# The data frame is intentionally not attach()ed: every later call either
# passes `data = Weekly` or indexes with `Weekly$...`, so attaching would
# only add a stale copy of the columns to the search path.
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Correlation matrix of the numeric columns only (the Direction factor is
# dropped). vapply() guarantees a logical mask regardless of input, unlike
# sapply(); the outer parentheses are kept so the matrix is printed.
(cor(Weekly[vapply(Weekly, is.numeric, logical(1))]))
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
# Scatterplot matrix of every column in Weekly (Direction included).
pairs(x = Weekly)
From the correlation matrix and the scatterplot matrix, the only strong
relationship is between Year and Volume (correlation of about 0.84):
trading volume increases steadily as the years progress, which points to
sustained growth rather than random fluctuation. The lag variables and
Today show essentially no correlation with one another.
# Bar chart of how many weeks fall in each Direction class.
ggplot(data = Weekly, mapping = aes(x = Direction)) +
  geom_bar()
The bar plot shows the count of each category of the
Direction variable. There are
more “Up” weeks (605) than
“Down” weeks (484).
# Logistic regression of Direction on the five lag variables and Volume,
# fitted on the full data set; summary() prints the coefficient table.
Weekly_glm <- glm(
  formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = binomial,
  data = Weekly
)
summary(Weekly_glm)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Looking at the results, we can see that the predictor
Lag2 is significant, having a p-value less than 0.05.
# In-sample fitted probabilities from the full logistic model. They are
# stored under their own name so the fitted model in `Weekly_glm` is not
# overwritten by a numeric vector.
Weekly_glm_probs <- predict(Weekly_glm, newdata = Weekly, type = "response")
# Classify with a 0.5 cut-off and cross-tabulate predictions (rows)
# against the observed Direction (columns).
Weekly_glm_pred <- ifelse(Weekly_glm_probs > 0.5, "Up", "Down")
table(Weekly_glm_pred, Weekly$Direction)
##
## Weekly_glm_pred Down Up
## Down 54 48
## Up 430 557
# Confusion matrix and summary statistics for the in-sample predictions,
# treating "Up" as the positive class.
confusionMatrix(
  data = as.factor(Weekly_glm_pred),
  reference = Weekly$Direction,
  positive = "Up"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 48
## Up 430 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : 0.369
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9207
## Specificity : 0.1116
## Pos Pred Value : 0.5643
## Neg Pred Value : 0.5294
## Prevalence : 0.5556
## Detection Rate : 0.5115
## Detection Prevalence : 0.9063
## Balanced Accuracy : 0.5161
##
## 'Positive' Class : Up
##
56.1% of the predictions were correct, barely above the 55.6% obtained by always predicting “Up.”
False Positives (430): These are cases where the model predicted “Up” but the true label was actually “Down.” This is the model's main error: it predicts “Up” for almost every week.
False Negatives (48): These are cases where the model predicted “Down” but the true label was actually “Up.”
# Logistic regression on Lag2 only, trained on 1990-2008 and evaluated on
# the held-out weeks (every row that is not in the training period).
Weekly_glm2 <- glm(
  formula = Direction ~ Lag2,
  family = binomial,
  data = Weekly,
  subset = (Year <= 2008)
)
Weekly_glm_prob <- predict(
  Weekly_glm2,
  newdata = Weekly[!(Weekly$Year <= 2008), ],
  type = "response"
)
Weekly_glm_pred2 <- ifelse(Weekly_glm_prob > 0.5, "Up", "Down")
confusionMatrix(
  data = as.factor(Weekly_glm_pred2),
  reference = Weekly$Direction[!(Weekly$Year <= 2008)],
  positive = "Up"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
Accuracy score is 62%
The model correctly predicted 9 “Down” (True Negatives) and 56 “Up” cases (True Positives), but it misclassified 5 “Up” cases as “Down” (False Negatives) and 34 “Down” cases as “Up” (False Positives).
# LDA on Lag2, trained on 1990-2008; predicted classes for the held-out
# weeks are compared with the observed Direction.
Weekly_lda <- lda(Direction ~ Lag2, data = Weekly, subset = (Year <= 2008))
Weekly_lda_pred <- predict(
  Weekly_lda,
  newdata = Weekly[!(Weekly$Year <= 2008), ]
)[["class"]]
confusionMatrix(
  data = Weekly_lda_pred,
  reference = Weekly$Direction[!(Weekly$Year <= 2008)],
  positive = "Up"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
Accuracy score is 62%.
The result is the same as (d)
# QDA on Lag2 with the same 1990-2008 training period and held-out weeks.
Weekly_qda <- qda(Direction ~ Lag2, data = Weekly, subset = (Year <= 2008))
Weekly_qda_pred <- predict(
  Weekly_qda,
  newdata = Weekly[!(Weekly$Year <= 2008), ]
)[["class"]]
confusionMatrix(
  data = Weekly_qda_pred,
  reference = Weekly$Direction[!(Weekly$Year <= 2008)],
  positive = "Up"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
Accuracy score is 58%
The model correctly predicted 61 ‘Up’ cases (True Positives), but it misclassified 43 ‘Down’ cases as ‘Up’ (False Positives). It did not misclassify any ‘Up’ cases as ‘Down’ (False Negatives), and it did not correctly predict any ‘Down’ cases as ‘Down’ (True Negatives).
# KNN (k = 1) on Lag2. The seed is fixed because knn() breaks distance
# ties at random.
set.seed(1)
# Logical mask of the training period; its complement is the test set.
train <- Weekly$Year <= 2008
# knn() expects matrix-like predictors, so Lag2 is shaped as one column.
Week_train <- matrix(Weekly$Lag2[train], ncol = 1)
Week_test <- matrix(Weekly$Lag2[!train], ncol = 1)
train_Direction <- Weekly$Direction[train]
test_Direction <- Weekly$Direction[!train]
Weekknn_pred <- knn(
  train = Week_train,
  test = Week_test,
  cl = train_Direction,
  k = 1
)
confusionMatrix(
  data = Weekknn_pred,
  reference = Weekly$Direction[!train],
  positive = "Up"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 30
## Up 22 31
##
## Accuracy : 0.5
## 95% CI : (0.4003, 0.5997)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.9700
##
## Kappa : -0.0033
##
## Mcnemar's Test P-Value : 0.3317
##
## Sensitivity : 0.5082
## Specificity : 0.4884
## Pos Pred Value : 0.5849
## Neg Pred Value : 0.4118
## Prevalence : 0.5865
## Detection Rate : 0.2981
## Detection Prevalence : 0.5096
## Balanced Accuracy : 0.4983
##
## 'Positive' Class : Up
##
Accuracy score is 50%
The model correctly predicted 21 ‘Down’ cases (True Negatives) and 31 ‘Up’ cases (True Positives). However, it misclassified 22 ‘Down’ cases as ‘Up’ (False Positives) and 30 ‘Up’ cases as ‘Down’ (False Negatives).
# Naive Bayes on Lag2, trained on the rows flagged by `train` and evaluated
# on the remaining rows.
weekly_nabay <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
weekly_nabay_pred <- predict(weekly_nabay, newdata = Weekly[!train, ])
# "Up" is set as the positive class so that sensitivity and specificity are
# comparable with the logistic, LDA, QDA and KNN results (caret otherwise
# takes the first factor level, "Down").
confusionMatrix(weekly_nabay_pred, test_Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
Accuracy score is 58%
Result is the same as the QDA
After performing logistic regression, LDA, QDA, KNN, and Naive Bayes, logistic regression and LDA tie for the best results, each with a test accuracy of 62.5%.
LDA
# LDA using the squared third lag as the only predictor, trained on
# 1990-2008. `family` is a glm() argument, not an lda() one (it was
# silently ignored), so it is no longer passed.
Weekly_lda2 <- lda(Direction ~ I(Lag3^2), data = Weekly, subset = (Year <= 2008))
Weekly_lda_pred2 <- predict(Weekly_lda2, Weekly[Weekly$Year >= 2009, ])$class
confusionMatrix(
  as.factor(Weekly_lda_pred2),
  as.factor(Weekly$Direction[Weekly$Year >= 2009]),
  positive = "Up"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
Using the predictor Lag3^2, LDA achieved an accuracy
score of 58%.
QDA
# QDA with a single predictor: in an R formula `Lag1:Lag5` is the
# interaction (product) of Lag1 and Lag5, not the range Lag1 through Lag5.
Weekly_qda2 <- qda(Direction ~ Lag1:Lag5, data = Weekly, subset = (Year <= 2008))
Weekly_qda_pred2 <- predict(
  Weekly_qda2,
  newdata = Weekly[!(Weekly$Year <= 2008), ]
)[["class"]]
confusionMatrix(
  data = Weekly_qda_pred2,
  reference = Weekly$Direction[!(Weekly$Year <= 2008)],
  positive = "Up"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 10 14
## Up 33 47
##
## Accuracy : 0.5481
## 95% CI : (0.4474, 0.6459)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.81516
##
## Kappa : 0.0033
##
## Mcnemar's Test P-Value : 0.00865
##
## Sensitivity : 0.7705
## Specificity : 0.2326
## Pos Pred Value : 0.5875
## Neg Pred Value : 0.4167
## Prevalence : 0.5865
## Detection Rate : 0.4519
## Detection Prevalence : 0.7692
## Balanced Accuracy : 0.5015
##
## 'Positive' Class : Up
##
Using the interaction term Lag1:Lag5 (the product of Lag1 and Lag5) as
the predictor, QDA achieved an accuracy score of 54%, which is about
4 percentage points lower than our previous QDA.
KNN
# KNN on Lag2 again, now with k = 13; reseeded so random tie-breaking is
# reproducible. Reuses the train/test matrices built for the k = 1 run.
set.seed(1)
Weekknn_pred2 <- knn(
  train = Week_train,
  test = Week_test,
  cl = train_Direction,
  k = 13
)
confusionMatrix(
  data = Weekknn_pred2,
  reference = Weekly$Direction[!train],
  positive = "Up"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 20 19
## Up 23 42
##
## Accuracy : 0.5962
## 95% CI : (0.4954, 0.6913)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.4626
##
## Kappa : 0.1558
##
## Mcnemar's Test P-Value : 0.6434
##
## Sensitivity : 0.6885
## Specificity : 0.4651
## Pos Pred Value : 0.6462
## Neg Pred Value : 0.5128
## Prevalence : 0.5865
## Detection Rate : 0.4038
## Detection Prevalence : 0.6250
## Balanced Accuracy : 0.5768
##
## 'Positive' Class : Up
##
After performing KNN with K=13, our accuracy score increased to 59%, which was better than our previous accuracy in part (g) when using K=1.
# Auto ships with ISLR2. The data frame is not attach()ed: the attached copy
# would not contain mpg01 (it is created afterwards) and would mask
# ggplot2's mpg data set, so every call below names `Auto` explicitly.
library(ISLR2)
# Binary response: 1 when a car's mpg is above the median, 0 otherwise.
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
# Scatterplot matrix of the numeric columns (the car name is dropped).
pairs(Auto[, vapply(Auto, is.numeric, logical(1))])
As the plot shows, displacement,
horsepower, and weight are predictors with
useful predictive power for mpg01, and there are linear
negative relationships between these variables and
mpg01.
# One boxplot per candidate predictor, split by mpg01; all columns are read
# from the same data frame through `data = Auto`.
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")
boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = Auto, main = "Year vs mpg01")
Looking at the box plots, displacement, horsepower, and
weight still show the clearest (negative) relationship with mpg01
compared to the others. However, we do observe that
acceleration and year have a positive
relationship with mpg01. Additionally, when looking at cylinders,
there is a large gap between the two groups.
# 70/30 train/test split of Auto (seeded for reproducibility), followed by
# an LDA fit on the training rows and evaluation on the test rows.
set.seed(27)
Auto_trainIndex <- createDataPartition(y = Auto$mpg01, p = 0.7, list = FALSE)
Auto_train_data <- Auto[Auto_trainIndex, ]
Auto_test_data <- Auto[-Auto_trainIndex, ]
# NOTE(review): horsepower is not included here although the QDA, glm,
# naive Bayes and KNN models below all use it -- confirm this is intended.
Auto_lda <- lda(
  mpg01 ~ cylinders + displacement + weight,
  data = Auto_train_data
)
Auto_lda_pred <- predict(Auto_lda, newdata = Auto_test_data)
Auto_lda_class <- Auto_lda_pred[["class"]]
confusionMatrix(
  data = Auto_lda_class,
  reference = as.factor(Auto_test_data$mpg01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 5
## 1 6 53
##
## Accuracy : 0.9052
## 95% CI : (0.8367, 0.9517)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8103
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8966
## Specificity : 0.9138
## Pos Pred Value : 0.9123
## Neg Pred Value : 0.8983
## Prevalence : 0.5000
## Detection Rate : 0.4483
## Detection Prevalence : 0.4914
## Balanced Accuracy : 0.9052
##
## 'Positive' Class : 0
##
After performing LDA, we achieved a test accuracy of 90%.
# QDA on cylinders, displacement, horsepower and weight; test-set classes
# are compared with the observed mpg01.
Auto_qda <- qda(
  mpg01 ~ cylinders + displacement + horsepower + weight,
  data = Auto_train_data
)
Auto_qda_pred <- predict(Auto_qda, newdata = Auto_test_data)
Auto_qda_class <- Auto_qda_pred[["class"]]
confusionMatrix(
  data = Auto_qda_class,
  reference = as.factor(Auto_test_data$mpg01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 53 5
## 1 5 53
##
## Accuracy : 0.9138
## 95% CI : (0.8472, 0.9579)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8276
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9138
## Specificity : 0.9138
## Pos Pred Value : 0.9138
## Neg Pred Value : 0.9138
## Prevalence : 0.5000
## Detection Rate : 0.4569
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9138
##
## 'Positive' Class : 0
##
After performing QDA, we achieved a test accuracy of 91%.
# Logistic regression for mpg01. `family = binomial` is required: without
# it glm() defaults to the Gaussian family and fits an ordinary linear
# regression on the 0/1 response instead of a logistic model.
# The confusion matrix recorded after this chunk came from the earlier
# Gaussian fit; re-knit the document to refresh it.
Auto_glm <- glm(
  mpg01 ~ cylinders + displacement + horsepower + weight,
  data = Auto_train_data,
  family = binomial
)
# Predicted probability that mpg01 == 1, classified with a 0.5 cut-off.
Auto_glm_prob <- predict(Auto_glm, Auto_test_data, type = "response")
Auto_glm_pred <- ifelse(Auto_glm_prob > 0.5, 1, 0)
confusionMatrix(as.factor(Auto_glm_pred), as.factor(Auto_test_data$mpg01))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 3
## 1 6 55
##
## Accuracy : 0.9224
## 95% CI : (0.8578, 0.9639)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8448
##
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.8966
## Specificity : 0.9483
## Pos Pred Value : 0.9455
## Neg Pred Value : 0.9016
## Prevalence : 0.5000
## Detection Rate : 0.4483
## Detection Prevalence : 0.4741
## Balanced Accuracy : 0.9224
##
## 'Positive' Class : 0
##
After performing logistic regression, we achieved a test accuracy of 92%.
# Naive Bayes on the same four predictors, evaluated on the test rows.
Auto_nabay <- naiveBayes(
  mpg01 ~ cylinders + displacement + horsepower + weight,
  data = Auto_train_data
)
Auto_nabay_pred <- predict(Auto_nabay, newdata = Auto_test_data)
confusionMatrix(
  data = as.factor(Auto_nabay_pred),
  reference = as.factor(Auto_test_data$mpg01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 5
## 1 6 53
##
## Accuracy : 0.9052
## 95% CI : (0.8367, 0.9517)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8103
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8966
## Specificity : 0.9138
## Pos Pred Value : 0.9123
## Neg Pred Value : 0.8983
## Prevalence : 0.5000
## Detection Rate : 0.4483
## Detection Prevalence : 0.4914
## Balanced Accuracy : 0.9052
##
## 'Positive' Class : 0
##
After performing naive Bayes, we achieved a test accuracy of 90%.
# KNN for mpg01 with k = 2, 4, 6 and 8 on the same four predictors.
# Seeded because knn() breaks ties at random; the four fits run in this
# order so the random stream is consumed identically on every run.
# NOTE(review): the predictors are used on their raw scales, so weight
# will dominate the distances -- consider standardising them.
set.seed(30)
Auto_Knn_train1 <- Auto_train_data[c("cylinders", "displacement", "horsepower", "weight")]
Auto_Knn_test1 <- Auto_test_data[c("cylinders", "displacement", "horsepower", "weight")]
Auto_Knn_train2 <- Auto_train_data[["mpg01"]]
Auto_Knn_test2 <- Auto_test_data[["mpg01"]]
Auto_knn_preds1 <- knn(train = Auto_Knn_train1, test = Auto_Knn_test1, cl = Auto_Knn_train2, k = 2)
Auto_knn_preds2 <- knn(train = Auto_Knn_train1, test = Auto_Knn_test1, cl = Auto_Knn_train2, k = 4)
Auto_knn_preds3 <- knn(train = Auto_Knn_train1, test = Auto_Knn_test1, cl = Auto_Knn_train2, k = 6)
Auto_knn_preds4 <- knn(train = Auto_Knn_train1, test = Auto_Knn_test1, cl = Auto_Knn_train2, k = 8)
# Test-set performance of the k = 2 fit.
confusionMatrix(
  data = Auto_knn_preds1,
  reference = as.factor(Auto_Knn_test2)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 48 7
## 1 10 51
##
## Accuracy : 0.8534
## 95% CI : (0.7758, 0.9122)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.478e-15
##
## Kappa : 0.7069
##
## Mcnemar's Test P-Value : 0.6276
##
## Sensitivity : 0.8276
## Specificity : 0.8793
## Pos Pred Value : 0.8727
## Neg Pred Value : 0.8361
## Prevalence : 0.5000
## Detection Rate : 0.4138
## Detection Prevalence : 0.4741
## Balanced Accuracy : 0.8534
##
## 'Positive' Class : 0
##
K = 2 gave an accuracy score of 85%
K = 4
# Test-set performance of the k = 4 fit.
confusionMatrix(
  data = Auto_knn_preds2,
  reference = as.factor(Auto_Knn_test2)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 4
## 1 6 54
##
## Accuracy : 0.9138
## 95% CI : (0.8472, 0.9579)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8276
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.8966
## Specificity : 0.9310
## Pos Pred Value : 0.9286
## Neg Pred Value : 0.9000
## Prevalence : 0.5000
## Detection Rate : 0.4483
## Detection Prevalence : 0.4828
## Balanced Accuracy : 0.9138
##
## 'Positive' Class : 0
##
K = 4 gave an accuracy score of 91%
K= 6
# Test-set performance of the k = 6 fit.
confusionMatrix(
  data = Auto_knn_preds3,
  reference = as.factor(Auto_Knn_test2)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 53 6
## 1 5 52
##
## Accuracy : 0.9052
## 95% CI : (0.8367, 0.9517)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8103
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9138
## Specificity : 0.8966
## Pos Pred Value : 0.8983
## Neg Pred Value : 0.9123
## Prevalence : 0.5000
## Detection Rate : 0.4569
## Detection Prevalence : 0.5086
## Balanced Accuracy : 0.9052
##
## 'Positive' Class : 0
##
K = 6 gave an accuracy score of 90%
K = 8
# Test-set performance of the k = 8 fit.
confusionMatrix(
  data = Auto_knn_preds4,
  reference = as.factor(Auto_Knn_test2)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 53 5
## 1 5 53
##
## Accuracy : 0.9138
## 95% CI : (0.8472, 0.9579)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8276
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9138
## Specificity : 0.9138
## Pos Pred Value : 0.9138
## Neg Pred Value : 0.9138
## Prevalence : 0.5000
## Detection Rate : 0.4569
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9138
##
## 'Positive' Class : 0
##
K = 8 gave an accuracy score of 91%
# Both MASS and ISLR2 provide a data set called Boston. Name the package
# explicitly so the ISLR2 version (the 13-column one summarised below) is
# loaded regardless of the order in which the packages were attached.
data("Boston", package = "ISLR2")
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
1 = High crime
0 = Low crime
# Binary response: 1 when the crime rate is above the median, 0 otherwise.
Boston$crim01 <- as.numeric(Boston$crim > median(Boston$crim))
# Scatterplot matrix of all columns, including the new crim01 indicator.
pairs(x = Boston)
Convert crim01 to a factor for visualization
# Treat crim01 as a categorical variable so ggplot2 maps it to discrete
# colours / fills.
Boston$crim01 <- factor(Boston$crim01)

# Scatterplot: dis against lstat, coloured by crime level.
ggplot(data = Boston, mapping = aes(x = dis, y = lstat, color = crim01)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Distance vs. Lower Status",
    x = "Distance to Jobs",
    y = "% Lower Status"
  ) +
  theme_minimal()

# Scatterplot: rm against ptratio, coloured by crime level.
ggplot(data = Boston, mapping = aes(x = rm, y = ptratio, color = crim01)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Rooms vs. Pupil-Teacher Ratio",
    x = "Avg. Rooms",
    y = "Pupil-Teacher Ratio"
  ) +
  theme_minimal()

# Boxplot: tax for each crime level.
ggplot(data = Boston, mapping = aes(x = crim01, y = tax, fill = crim01)) +
  geom_boxplot() +
  labs(
    title = "Tax Rate by Crime Level",
    x = "Crime Level",
    y = "Tax Rate"
  ) +
  theme_minimal()

# Boxplot: age for each crime level.
ggplot(data = Boston, mapping = aes(x = crim01, y = age, fill = crim01)) +
  geom_boxplot() +
  labs(
    title = "Older Homes by Crime Level",
    x = "Crime Level",
    y = "% Older Homes"
  ) +
  theme_minimal()
Identify Significant Predictors Using Logistic Regression
# Logistic regression of crim01 on every remaining column. The raw crim
# column is dropped from the data first because crim01 is derived from it.
Boston_glm <- glm(
  formula = crim01 ~ .,
  family = binomial,
  data = Boston[, -which(names(Boston) == "crim")]
)
summary(Boston_glm)
##
## Call:
## glm(formula = crim01 ~ ., family = binomial, data = Boston[,
## -which(names(Boston) == "crim")])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -38.832417 6.086280 -6.380 1.77e-10 ***
## zn -0.086228 0.034090 -2.529 0.0114 *
## indus -0.052438 0.042817 -1.225 0.2207
## chas 0.619914 0.722150 0.858 0.3907
## nox 47.913820 7.344213 6.524 6.84e-11 ***
## rm -0.271941 0.676239 -0.402 0.6876
## age 0.021474 0.012105 1.774 0.0761 .
## dis 0.669991 0.214618 3.122 0.0018 **
## rad 0.669240 0.151742 4.410 1.03e-05 ***
## tax -0.006165 0.002622 -2.351 0.0187 *
## ptratio 0.326433 0.116296 2.807 0.0050 **
## lstat 0.053537 0.047105 1.137 0.2557
## medv 0.147987 0.064347 2.300 0.0215 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 218.75 on 493 degrees of freedom
## AIC: 244.75
##
## Number of Fisher Scoring iterations: 9
After analyzing the results, variables such as nox,
dis, rad, and ptratio appear to
be the most influential in predicting whether a census tract has a high
or low crime rate.
# Seeded 70/30 train/test split of Boston, partitioned on crim01.
set.seed(25)
Boston_train_index <- createDataPartition(y = Boston$crim01, p = 0.7, list = FALSE)
Boston_train <- Boston[Boston_train_index, ]
Boston_test <- Boston[-Boston_train_index, ]
Since we have already performed logistic regression to identify significant variables, let’s now use those variables to predict the outcome using Logistic Regression, LDA, QDA, Naive Bayes, and KNN.
# Logistic regression on nox, dis, rad and ptratio; test-set probabilities
# are classified with a 0.5 cut-off.
Boston_glm_model <- glm(
  formula = crim01 ~ nox + dis + rad + ptratio,
  family = binomial,
  data = Boston_train
)
Boston_glm_probs <- predict(Boston_glm_model, newdata = Boston_test, type = "response")
Boston_glm_preds <- ifelse(Boston_glm_probs > 0.5, 1, 0)
confusionMatrix(
  data = as.factor(Boston_glm_preds),
  reference = as.factor(Boston_test$crim01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 69 12
## 1 6 63
##
## Accuracy : 0.88
## 95% CI : (0.817, 0.9273)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.76
##
## Mcnemar's Test P-Value : 0.2386
##
## Sensitivity : 0.9200
## Specificity : 0.8400
## Pos Pred Value : 0.8519
## Neg Pred Value : 0.9130
## Prevalence : 0.5000
## Detection Rate : 0.4600
## Detection Prevalence : 0.5400
## Balanced Accuracy : 0.8800
##
## 'Positive' Class : 0
##
The logistic regression model has an accuracy score of 88%.
# LDA on the same four predictors, evaluated on the test rows.
Boston_lda_model <- lda(
  crim01 ~ nox + dis + rad + ptratio,
  data = Boston_train
)
Boston_lda_preds <- predict(Boston_lda_model, newdata = Boston_test)[["class"]]
confusionMatrix(
  data = as.factor(Boston_lda_preds),
  reference = as.factor(Boston_test$crim01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 17
## 1 5 58
##
## Accuracy : 0.8533
## 95% CI : (0.7864, 0.9057)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7067
##
## Mcnemar's Test P-Value : 0.01902
##
## Sensitivity : 0.9333
## Specificity : 0.7733
## Pos Pred Value : 0.8046
## Neg Pred Value : 0.9206
## Prevalence : 0.5000
## Detection Rate : 0.4667
## Detection Prevalence : 0.5800
## Balanced Accuracy : 0.8533
##
## 'Positive' Class : 0
##
The LDA model has an accuracy score of 85%.
# QDA on the same four predictors, evaluated on the test rows.
Boston_qda_model <- qda(
  crim01 ~ nox + dis + rad + ptratio,
  data = Boston_train
)
Boston_qda_preds <- predict(Boston_qda_model, newdata = Boston_test)[["class"]]
confusionMatrix(
  data = as.factor(Boston_qda_preds),
  reference = as.factor(Boston_test$crim01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 73 18
## 1 2 57
##
## Accuracy : 0.8667
## 95% CI : (0.8016, 0.9166)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7333
##
## Mcnemar's Test P-Value : 0.0007962
##
## Sensitivity : 0.9733
## Specificity : 0.7600
## Pos Pred Value : 0.8022
## Neg Pred Value : 0.9661
## Prevalence : 0.5000
## Detection Rate : 0.4867
## Detection Prevalence : 0.6067
## Balanced Accuracy : 0.8667
##
## 'Positive' Class : 0
##
The QDA model has an accuracy score of 86%.
# Naive Bayes on the same four predictors, evaluated on the test rows.
Boston_nb_model <- naiveBayes(
  crim01 ~ nox + dis + rad + ptratio,
  data = Boston_train
)
Boston_nb_preds <- predict(Boston_nb_model, newdata = Boston_test)
confusionMatrix(
  data = as.factor(Boston_nb_preds),
  reference = as.factor(Boston_test$crim01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 68 16
## 1 7 59
##
## Accuracy : 0.8467
## 95% CI : (0.7789, 0.9002)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6933
##
## Mcnemar's Test P-Value : 0.09529
##
## Sensitivity : 0.9067
## Specificity : 0.7867
## Pos Pred Value : 0.8095
## Neg Pred Value : 0.8939
## Prevalence : 0.5000
## Detection Rate : 0.4533
## Detection Prevalence : 0.5600
## Balanced Accuracy : 0.8467
##
## 'Positive' Class : 0
##
The Naive Bayes model has an accuracy score of 84%.
KNN with k = 3
# KNN for crim01 with k = 3, 6 and 9 on nox, dis, rad and ptratio.
# The three fits run in this order; no seed is set in this chunk, so
# random tie-breaking continues from the earlier random-number state.
# NOTE(review): predictors are on their raw scales -- consider
# standardising them before computing distances.
bosKNN_train_x <- Boston_train[c("nox", "dis", "rad", "ptratio")]
bosKNNtest_x <- Boston_test[c("nox", "dis", "rad", "ptratio")]
bosKNNtrain_y <- Boston_train[["crim01"]]
bosKNN_preds_3 <- knn(train = bosKNN_train_x, test = bosKNNtest_x, cl = bosKNNtrain_y, k = 3)
bosKNN_preds_6 <- knn(train = bosKNN_train_x, test = bosKNNtest_x, cl = bosKNNtrain_y, k = 6)
bosKNN_preds_9 <- knn(train = bosKNN_train_x, test = bosKNNtest_x, cl = bosKNNtrain_y, k = 9)
# Test-set performance of the k = 3 fit.
confusionMatrix(
  data = as.factor(bosKNN_preds_3),
  reference = as.factor(Boston_test$crim01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 71 3
## 1 4 72
##
## Accuracy : 0.9533
## 95% CI : (0.9062, 0.981)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9067
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9467
## Specificity : 0.9600
## Pos Pred Value : 0.9595
## Neg Pred Value : 0.9474
## Prevalence : 0.5000
## Detection Rate : 0.4733
## Detection Prevalence : 0.4933
## Balanced Accuracy : 0.9533
##
## 'Positive' Class : 0
##
K = 3 gave an accuracy score of 95%
KNN with k = 6
# Test-set performance of the k = 6 fit.
confusionMatrix(
  data = as.factor(bosKNN_preds_6),
  reference = as.factor(Boston_test$crim01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 73 4
## 1 2 71
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.92
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9733
## Specificity : 0.9467
## Pos Pred Value : 0.9481
## Neg Pred Value : 0.9726
## Prevalence : 0.5000
## Detection Rate : 0.4867
## Detection Prevalence : 0.5133
## Balanced Accuracy : 0.9600
##
## 'Positive' Class : 0
##
K = 6 gave an accuracy score of 96%
KNN with k = 9
# Test-set performance of the k = 9 fit.
confusionMatrix(
  data = as.factor(bosKNN_preds_9),
  reference = as.factor(Boston_test$crim01)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 72 4
## 1 3 71
##
## Accuracy : 0.9533
## 95% CI : (0.9062, 0.981)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9067
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9600
## Specificity : 0.9467
## Pos Pred Value : 0.9474
## Neg Pred Value : 0.9595
## Prevalence : 0.5000
## Detection Rate : 0.4800
## Detection Prevalence : 0.5067
## Balanced Accuracy : 0.9533
##
## 'Positive' Class : 0
##
K = 9 gave an accuracy score of 95%
Findings
After evaluating all the prediction models, KNN
delivered the best test accuracy: K = 3 achieved an
accuracy score of 95%, K = 6 achieved
96%, and K = 9 achieved
95%, so the highest accuracy was at K = 6.
Among the other models, the lowest accuracy score came from Naive
Bayes, which had a score of 84%.