library(ISLR2)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
library(car)
## Loading required package: carData
d1<-Weekly
summary(d1)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Box plots for the five Lag variables
par(mfrow = c(1, 5))
boxplot(d1$Lag1,
main = 'Lag 1')
boxplot(d1$Lag2,
main = 'Lag 2')
boxplot(d1$Lag3,
main = 'Lag 3')
boxplot(d1$Lag4,
main = 'Lag 4')
boxplot(d1$Lag5,
main = 'Lag 5')
par(mfrow = c(1, 1))
par(mfrow = c(1, 2))
boxplot(d1$Volume,
main = 'Volume')
boxplot(d1$Today,
main = 'Today')
par(mfrow = c(1, 1))
Looking at the summary statistics and the box plots above, several things stand out about the predictor variables. Most notably, the Lag variables and Today are all roughly symmetric, with medians close to 0, and they share essentially the same minimum and maximum values. The one exception is Volume, which is heavily right-skewed, with outliers only on the high side; the other predictors have outliers on both sides. As the logistic regression output below will show, most of the predictors also fail to reach statistical significance, as indicated by their high p-values.
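As a quick numeric check on the skew described above, the skewness() function from the already-loaded e1071 package can be used (a sketch, shown for Volume and Lag1):
# Quantify the skew noted above using e1071::skewness
skewness(d1$Volume) # expected to be strongly positive (right-skewed)
skewness(d1$Lag1)   # expected to be near 0 (roughly symmetric)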
logit_model1 = glm(as.factor(Direction) ~ . - Year - Today, data = d1, family = 'binomial')
summary(logit_model1)
##
## Call:
## glm(formula = as.factor(Direction) ~ . - Year - Today, family = "binomial",
## data = d1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The summary of the logistic regression model above shows that Lag2 is the only predictor that is statistically significant for Direction, with a p-value of 0.0296.
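The same conclusion can be reached programmatically by pulling the p-value column out of the coefficient table (a small sketch against the model fitted above):
# Extract the p-value column from the coefficient table
pvals <- coef(summary(logit_model1))[, "Pr(>|z|)"]
pvals[pvals < 0.05] # only the intercept and Lag2 fall below 0.05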
glm.probs <- predict(logit_model1, type = "response")
glm.probs[1:10]
## 1 2 3 4 5 6 7 8
## 0.6086249 0.6010314 0.5875699 0.4816416 0.6169013 0.5684190 0.5786097 0.5151972
## 9 10
## 0.5715200 0.5554287
contrasts(d1$Direction)
## Up
## Down 0
## Up 1
glm.pred <- rep("Down", 1089)
glm.pred[glm.probs > .5] = "Up"
glm.pred <- as.factor(glm.pred)
confusionMatrix(glm.pred, d1$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 48
## Up 430 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : 0.369
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9207
## Specificity : 0.1116
## Pos Pred Value : 0.5643
## Neg Pred Value : 0.5294
## Prevalence : 0.5556
## Detection Rate : 0.5115
## Detection Prevalence : 0.9063
## Balanced Accuracy : 0.5161
##
## 'Positive' Class : Up
##
The confusion matrix above shows that the model is only 56.11% accurate. The errors come from the 430 Down weeks that were classified as Up and the 48 Up weeks that were classified as Down. This also tells us that the model is far better at predicting Up than Down.
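The accuracy that confusionMatrix() reports can also be verified directly from the predictions (a sketch):
# Overall accuracy and error rate computed by hand
mean(glm.pred == d1$Direction) # 0.5611, matching confusionMatrix()
mean(glm.pred != d1$Direction) # the corresponding training error rate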
train<- subset(d1, Year < 2009)
test <- subset(d1, Year > 2008)
logit_model2 = glm(Direction ~ Lag2, data = train, family = 'binomial')
summary(logit_model2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
glm.probs2 <- predict(logit_model2, test, type = "response")
glm.pred2 <- rep("Down", 104)
glm.pred2[glm.probs2 > .5] = "Up"
glm.pred2 <- as.factor(glm.pred2)
confusionMatrix(glm.pred2, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
The confusion matrix above shows that the logistic regression model correctly predicted 65 of the 104 observations (9 + 56) in the test data, an accuracy of 62.5%.
lda.model = lda(Direction ~ Lag2, data = train)
lda.model
## Call:
## lda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
lda.pred <- predict(lda.model, test)
lda.class <- lda.pred$class
lda.class <- as.factor(lda.class)
confusionMatrix(lda.class, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
The confusion matrix above shows that the LDA model also correctly predicted 65 of the 104 test observations, exactly matching the logistic regression model.
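The agreement is not surprising, since logistic regression and LDA both produce linear decision boundaries. A quick sketch to confirm they assign the same class to each test week:
# Compare the LDA and logistic regression class assignments directly
mean(lda.class == glm.pred2) # 1 if the two models agree on every week
table(LDA = lda.class, Logistic = glm.pred2)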
qda.model = qda(Direction ~ Lag2, data = train)
qda.model
## Call:
## qda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class = predict(qda.model, test)$class
qda.class <- as.factor(qda.class)
confusionMatrix(qda.class, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
The confusion matrix above shows that the QDA model correctly predicted 61 of the 104 test observations. However, it is important to note that this model predicted Up for every observation, so it would not be an ideal model to pick.
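One way to see why QDA collapses to a single class here is to inspect its posterior probabilities on the test set (a sketch):
# Posterior probability of "Up" for each test week; if the minimum exceeds
# .5, every observation gets classified as Up
post.up <- predict(qda.model, test)$posterior[, "Up"]
summary(post.up)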
train.X = cbind(train$Lag2)
test.X = cbind(test$Lag2)
knn.pred = knn(train.X, test.X, train$Direction, k = 1)
confusionMatrix(knn.pred, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 29
## Up 22 32
##
## Accuracy : 0.5096
## 95% CI : (0.4097, 0.609)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.9540
##
## Kappa : 0.0127
##
## Mcnemar's Test P-Value : 0.4008
##
## Sensitivity : 0.5246
## Specificity : 0.4884
## Pos Pred Value : 0.5926
## Neg Pred Value : 0.4200
## Prevalence : 0.5865
## Detection Rate : 0.3077
## Detection Prevalence : 0.5192
## Balanced Accuracy : 0.5065
##
## 'Positive' Class : Up
##
The confusion matrix above shows that the KNN model correctly predicted 53 of the 104 test observations. Of all the models so far, it is the most balanced between predicting Down and Up, which makes it stand out from the others.
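One caveat with knn(): it breaks ties among equidistant neighbors at random, so results with small k can vary between runs. Setting a seed makes them reproducible (a sketch):
# knn() resolves distance ties randomly; seed first for reproducibility
set.seed(1)
knn.pred.seeded <- knn(train.X, test.X, train$Direction, k = 1)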
bayes.model = naiveBayes(Direction ~ Lag2, data = train)
bayes.model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
bayes.class = predict(bayes.model, test)
confusionMatrix(bayes.class, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
The confusion matrix above shows that the naive Bayes model correctly predicted 61 of the 104 test observations. Like the QDA model, it predicted Up for every observation, which makes it a somewhat undesirable model.
Out of all of the models above, I would say that KNN has the best overall results: it offers the best balance of accuracy, sensitivity, and specificity. In particular, it has the highest specificity, meaning it is the best at correctly predicting Down, which matters for this problem.
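For reference, the test accuracies reported above can be gathered into one table (a sketch that hard-codes the values from the confusion matrices):
# Side-by-side view of the test accuracies reported above
data.frame(
  Model = c("Logistic", "LDA", "QDA", "KNN (k = 1)", "Naive Bayes"),
  Accuracy = c(0.6250, 0.6250, 0.5865, 0.5096, 0.5865)
)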
logit_model2 = glm(Direction ~ Lag2 + Lag1, data = train, family = 'binomial')
summary(logit_model2)
##
## Call:
## glm(formula = Direction ~ Lag2 + Lag1, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.21109 0.06456 3.269 0.00108 **
## Lag2 0.05384 0.02905 1.854 0.06379 .
## Lag1 -0.05421 0.02886 -1.878 0.06034 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1347.0 on 982 degrees of freedom
## AIC: 1353
##
## Number of Fisher Scoring iterations: 4
glm.probs2 <- predict(logit_model2, test, type = "response")
glm.pred2 <- rep("Down", 104)
glm.pred2[glm.probs2 > .5] = "Up"
glm.pred2 <- as.factor(glm.pred2)
confusionMatrix(glm.pred2, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 8
## Up 36 53
##
## Accuracy : 0.5769
## 95% CI : (0.4761, 0.6732)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.6193
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : 4.693e-05
##
## Sensitivity : 0.8689
## Specificity : 0.1628
## Pos Pred Value : 0.5955
## Neg Pred Value : 0.4667
## Prevalence : 0.5865
## Detection Rate : 0.5096
## Detection Prevalence : 0.8558
## Balanced Accuracy : 0.5158
##
## 'Positive' Class : Up
##
lda.model = lda(Direction ~ Lag2 + Lag1, data = train)
lda.model
## Call:
## lda(Direction ~ Lag2 + Lag1, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 Lag1
## Down -0.03568254 0.289444444
## Up 0.26036581 -0.009213235
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.2982579
## Lag1 -0.3013148
lda.pred <- predict(lda.model, test)
lda.class <- lda.pred$class
lda.class <- as.factor(lda.class)
confusionMatrix(lda.class, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 8
## Up 36 53
##
## Accuracy : 0.5769
## 95% CI : (0.4761, 0.6732)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.6193
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : 4.693e-05
##
## Sensitivity : 0.8689
## Specificity : 0.1628
## Pos Pred Value : 0.5955
## Neg Pred Value : 0.4667
## Prevalence : 0.5865
## Detection Rate : 0.5096
## Detection Prevalence : 0.8558
## Balanced Accuracy : 0.5158
##
## 'Positive' Class : Up
##
qda.model = qda(Direction ~ Lag2 + Lag1, data = train)
qda.model
## Call:
## qda(Direction ~ Lag2 + Lag1, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 Lag1
## Down -0.03568254 0.289444444
## Up 0.26036581 -0.009213235
qda.class = predict(qda.model, test)$class
qda.class <- as.factor(qda.class)
confusionMatrix(qda.class, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 10
## Up 36 51
##
## Accuracy : 0.5577
## 95% CI : (0.457, 0.655)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.7579156
##
## Kappa : -0.0013
##
## Mcnemar's Test P-Value : 0.0002278
##
## Sensitivity : 0.8361
## Specificity : 0.1628
## Pos Pred Value : 0.5862
## Neg Pred Value : 0.4118
## Prevalence : 0.5865
## Detection Rate : 0.4904
## Detection Prevalence : 0.8365
## Balanced Accuracy : 0.4994
##
## 'Positive' Class : Up
##
train.X = cbind(train$Lag2)
test.X = cbind(test$Lag2)
k_values <- c(1, 3, 5, 7, 10, 15, 20)
test_errors <- numeric(length(k_values))
for (i in seq_along(k_values)) {
k <- k_values[i]
# Perform KNN classification
knn.pred <- knn(train.X, test.X, train$Direction, k = k)
cm <- confusionMatrix(knn.pred, test$Direction, positive = 'Up')
test_errors[i] <- 1 - cm$overall["Accuracy"]
}
results <- data.frame(K = k_values, Test_Error = test_errors)
print(results)
## K Test_Error
## 1 1 0.5000000
## 2 3 0.4615385
## 3 5 0.4711538
## 4 7 0.4519231
## 5 10 0.4326923
## 6 15 0.4134615
## 7 20 0.4134615
bayes.model = naiveBayes(Direction ~ Lag2 + Lag1, data = train)
bayes.model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
##
## Lag1
## Y [,1] [,2]
## Down 0.289444444 2.211721
## Up -0.009213235 2.308387
bayes.class = predict(bayes.model, test)
confusionMatrix(bayes.class, test$Direction, positive = 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 3 8
## Up 40 53
##
## Accuracy : 0.5385
## 95% CI : (0.438, 0.6367)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.8631
##
## Kappa : -0.069
##
## Mcnemar's Test P-Value : 7.66e-06
##
## Sensitivity : 0.86885
## Specificity : 0.06977
## Pos Pred Value : 0.56989
## Neg Pred Value : 0.27273
## Prevalence : 0.58654
## Detection Rate : 0.50962
## Detection Prevalence : 0.89423
## Balanced Accuracy : 0.46931
##
## 'Positive' Class : Up
##
The only model that improved after experimenting with different variables and k-values was KNN. Keeping Lag2 as the lone predictor still seemed like the best option, but with k = 15 the test error dropped to 0.4135, better than the original k = 1 result, and this model still predicts Down better than all of the other models.
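Refitting that chosen configuration in one place (a sketch; a seed is set because knn() breaks ties randomly, so the error can differ slightly from the value reported above):
# Final Weekly model: KNN on Lag2 alone with k = 15
set.seed(42)
knn.final <- knn(train.X, test.X, train$Direction, k = 15)
mean(knn.final != test$Direction) # test error; reported above as 0.4135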
Auto <- read.table("Auto.data", header = T, na.strings = "?", stringsAsFactors = T)
mpg_median <- median(Auto$mpg)
Auto$mpg01 <- ifelse(Auto$mpg > mpg_median, 1, 0)
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.50 1st Qu.:4.000 1st Qu.:104.0 1st Qu.: 75.0 1st Qu.:2223
## Median :23.00 Median :4.000 Median :146.0 Median : 93.5 Median :2800
## Mean :23.52 Mean :5.458 Mean :193.5 Mean :104.5 Mean :2970
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:262.0 3rd Qu.:126.0 3rd Qu.:3609
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
## NA's :5
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 ford pinto : 6
## 1st Qu.:13.80 1st Qu.:73.00 1st Qu.:1.000 amc matador : 5
## Median :15.50 Median :76.00 Median :1.000 ford maverick : 5
## Mean :15.56 Mean :75.99 Mean :1.574 toyota corolla: 5
## 3rd Qu.:17.10 3rd Qu.:79.00 3rd Qu.:2.000 amc gremlin : 4
## Max. :24.80 Max. :82.00 Max. :3.000 amc hornet : 4
## (Other) :368
## mpg01
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4811
## 3rd Qu.:1.0000
## Max. :1.0000
##
# Box plots for cylinders, displacement, weight, acceleration, and horsepower
par(mfrow = c(1, 5))
boxplot(Auto$cylinders,
main = 'Cylinders')
boxplot(Auto$displacement,
main = 'displacement')
boxplot(Auto$weight,
main = 'Weight')
boxplot(Auto$acceleration,
main = 'Acceleration')
boxplot(Auto$horsepower,
main = 'Horsepower')
par(mfrow = c(1, 1))
plot(Auto$year, Auto$mpg01,
main = "Year vs. mpg01",
xlab = "Year",
ylab = "mpg01",
col = "blue",
pch = 16)
plot(Auto$weight, Auto$mpg01,
main = "Weight vs. mpg01",
xlab = "Weight",
ylab = "mpg01",
col = "blue",
pch = 16)
plot(Auto$displacement, Auto$mpg01,
main = "Displacement vs. mpg01",
xlab = "Displacement",
ylab = "mpg01",
col = "blue",
pch = 16)
Looking at the statistical summaries as well as the box plots and scatter plots above, certain variables do appear to have a relationship with mpg01, most clearly in the scatter plots comparing Displacement and Weight to mpg01. For both variables, the observations with mpg01 = 1 are concentrated toward the lower end of the range, and for Weight there is also a visible concentration of mpg01 = 0 observations toward the right.
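Because mpg01 is binary, side-by-side boxplots of each predictor grouped by class can make these relationships even easier to read than the scatter plots (a sketch):
# Alternative view: each predictor's distribution within each mpg01 class
par(mfrow = c(1, 3))
boxplot(weight ~ mpg01, data = Auto, main = 'Weight by mpg01')
boxplot(displacement ~ mpg01, data = Auto, main = 'Displacement by mpg01')
boxplot(horsepower ~ mpg01, data = Auto, main = 'Horsepower by mpg01')
par(mfrow = c(1, 1))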
set.seed(42)
# Create an index for training data (80% of data)
train_index <- createDataPartition(Auto$mpg01, p = 0.8, list = FALSE)
# Subset into training and test sets
train_set <- Auto[train_index, ]
test_set <- Auto[-train_index, ]
train_set <- na.omit(train_set)
test_set <- na.omit(test_set)
# Check the dimensions
dim(train_set)
## [1] 315 10
dim(test_set)
## [1] 77 10
lda.model2 = lda(mpg01 ~ displacement + weight + horsepower, data = train_set)
lda.model2
## Call:
## lda(mpg01 ~ displacement + weight + horsepower, data = train_set)
##
## Prior probabilities of groups:
## 0 1
## 0.5238095 0.4761905
##
## Group means:
## displacement weight horsepower
## 0 264.9636 3545.533 126.87879
## 1 114.3567 2313.547 77.70667
##
## Coefficients of linear discriminants:
## LD1
## displacement -0.005820541
## weight -0.001193985
## horsepower 0.002566988
lda.class2 <- predict(lda.model2, test_set)$class
#lda.class2 <- as.factor(lda.class2)
lda.class2 <- factor(lda.class2, levels = c("0", "1"))
test_set$mpg01 <- factor(test_set$mpg01, levels = c("0", "1"))
confusionMatrix(lda.class2, test_set$mpg01, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 1
## 1 5 36
##
## Accuracy : 0.9221
## 95% CI : (0.8381, 0.9709)
## No Information Rate : 0.5195
## P-Value [Acc > NIR] : 2.046e-14
##
## Kappa : 0.8445
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9730
## Specificity : 0.8750
## Pos Pred Value : 0.8780
## Neg Pred Value : 0.9722
## Prevalence : 0.4805
## Detection Rate : 0.4675
## Detection Prevalence : 0.5325
## Balanced Accuracy : 0.9240
##
## 'Positive' Class : 1
##
The test error of this model is 1 - .9221 = .0779.
qda.model2 = qda(mpg01 ~ displacement + weight + horsepower, data = train_set)
qda.model2
## Call:
## qda(mpg01 ~ displacement + weight + horsepower, data = train_set)
##
## Prior probabilities of groups:
## 0 1
## 0.5238095 0.4761905
##
## Group means:
## displacement weight horsepower
## 0 264.9636 3545.533 126.87879
## 1 114.3567 2313.547 77.70667
qda.class2 = predict(qda.model2, test_set)$class
qda.class2 <- as.factor(qda.class2)
confusionMatrix(qda.class2, test_set$mpg01, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 6
## 1 3 31
##
## Accuracy : 0.8831
## 95% CI : (0.7897, 0.9451)
## No Information Rate : 0.5195
## P-Value [Acc > NIR] : 1.165e-11
##
## Kappa : 0.7652
##
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.8378
## Specificity : 0.9250
## Pos Pred Value : 0.9118
## Neg Pred Value : 0.8605
## Prevalence : 0.4805
## Detection Rate : 0.4026
## Detection Prevalence : 0.4416
## Balanced Accuracy : 0.8814
##
## 'Positive' Class : 1
##
The test error of this model is 1 - .8831 = .1169.
logit_model3 = glm(mpg01 ~ displacement + weight + horsepower, data = train_set, family = 'binomial')
summary(logit_model3)
##
## Call:
## glm(formula = mpg01 ~ displacement + weight + horsepower, family = "binomial",
## data = train_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.927068 1.768020 6.746 1.52e-11 ***
## displacement -0.011006 0.005881 -1.871 0.061278 .
## weight -0.001817 0.000736 -2.468 0.013582 *
## horsepower -0.055490 0.016242 -3.417 0.000634 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 435.97 on 314 degrees of freedom
## Residual deviance: 177.71 on 311 degrees of freedom
## AIC: 185.71
##
## Number of Fisher Scoring iterations: 7
glm.probs3 <- predict(logit_model3, test_set, type = "response")
glm.pred3 <- rep("0", 77)
glm.pred3[glm.probs3 > .5] = "1"
glm.pred3 <- as.factor(glm.pred3)
confusionMatrix(glm.pred3, test_set$mpg01, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 6
## 1 3 31
##
## Accuracy : 0.8831
## 95% CI : (0.7897, 0.9451)
## No Information Rate : 0.5195
## P-Value [Acc > NIR] : 1.165e-11
##
## Kappa : 0.7652
##
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.8378
## Specificity : 0.9250
## Pos Pred Value : 0.9118
## Neg Pred Value : 0.8605
## Prevalence : 0.4805
## Detection Rate : 0.4026
## Detection Prevalence : 0.4416
## Balanced Accuracy : 0.8814
##
## 'Positive' Class : 1
##
The test error of this model is 1 - .8831 = .1169.
bayes.model2 = naiveBayes(mpg01 ~ displacement + weight + horsepower, data = train_set)
bayes.model2
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5238095 0.4761905
##
## Conditional probabilities:
## displacement
## Y [,1] [,2]
## 0 264.9636 93.87907
## 1 114.3567 36.25702
##
## weight
## Y [,1] [,2]
## 0 3545.533 687.6779
## 1 2313.547 390.8718
##
## horsepower
## Y [,1] [,2]
## 0 126.87879 37.01941
## 1 77.70667 14.58764
bayes.class2 = predict(bayes.model2, test_set)
confusionMatrix(bayes.class2, test_set$mpg01, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 4
## 1 3 33
##
## Accuracy : 0.9091
## 95% CI : (0.8216, 0.9627)
## No Information Rate : 0.5195
## P-Value [Acc > NIR] : 1.954e-13
##
## Kappa : 0.8177
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8919
## Specificity : 0.9250
## Pos Pred Value : 0.9167
## Neg Pred Value : 0.9024
## Prevalence : 0.4805
## Detection Rate : 0.4286
## Detection Prevalence : 0.4675
## Balanced Accuracy : 0.9084
##
## 'Positive' Class : 1
##
The test error of this model is 1 - .9091 = .0909.
train.X2 = cbind(train_set$displacement, train_set$weight, train_set$horsepower)
test.X2 = cbind(test_set$displacement, test_set$weight, test_set$horsepower)
train.Y <- factor(train_set$mpg01, levels = c("0", "1"))
test.Y <- factor(test_set$mpg01, levels = c("0", "1"))
colSums(is.na(train.X2)) # Check missing values in training set
## [1] 0 0 0
colSums(is.na(test.X2))
## [1] 0 0 0
k_values <- c(1, 3, 5, 7, 10, 15, 20)
test_errors <- numeric(length(k_values))
for (i in seq_along(k_values)) {
k <- k_values[i]
# Perform KNN classification
knn.pred2 <- knn(train.X2, test.X2, train.Y, k = k)
cm <- confusionMatrix(knn.pred2, test.Y, positive = '1')
test_errors[i] <- 1 - cm$overall["Accuracy"]
}
results <- data.frame(K = k_values, Test_Error = test_errors)
print(results)
## K Test_Error
## 1 1 0.11688312
## 2 3 0.07792208
## 3 5 0.09090909
## 4 7 0.09090909
## 5 10 0.10389610
## 6 15 0.09090909
## 7 20 0.09090909
After trying KNN with multiple values of k, I found that the lowest test error occurs at k = 3, which produces a test error of 0.0779.
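One refinement worth noting: weight is on a much larger scale than displacement or horsepower, so it dominates the Euclidean distances KNN relies on. Standardizing the predictors with the training-set means and standard deviations might improve these results (a sketch under that assumption):
# Standardize using training-set statistics only, then refit KNN at k = 3
ctr <- colMeans(train.X2)
sds <- apply(train.X2, 2, sd)
train.Xs <- scale(train.X2, center = ctr, scale = sds)
test.Xs <- scale(test.X2, center = ctr, scale = sds)
set.seed(42)
knn.scaled <- knn(train.Xs, test.Xs, train.Y, k = 3)
mean(knn.scaled != test.Y) # test error on the standardized features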
Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings.
Boston_df <- Boston
crime_median <- median(Boston_df$crim)
Boston_df$CrimeLevel <- ifelse(Boston_df$crim > crime_median, 'High', 'Low')
Boston_df$CrimeLevel <- factor(Boston_df$CrimeLevel, levels = c("Low", "High"))
head(Boston_df)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv CrimeLevel
## 1 24.0 Low
## 2 21.6 Low
## 3 34.7 Low
## 4 33.4 Low
## 5 36.2 Low
## 6 28.7 Low
summary(Boston_df)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv CrimeLevel
## Min. : 1.73 Min. : 5.00 Low :253
## 1st Qu.: 6.95 1st Qu.:17.02 High:253
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Box plots for nox, rad, tax, ptratio, and medv
par(mfrow = c(1, 5))
boxplot(Boston_df$nox,
main = 'Nox')
boxplot(Boston_df$rad,
main = 'Rad')
boxplot(Boston_df$tax,
main = 'Tax')
boxplot(Boston_df$ptratio,
main = 'PT-Ratio')
boxplot(Boston_df$medv,
main = 'Medv')
par(mfrow = c(1, 1))
par(mfrow = c(1, 3))
boxplot(Boston_df$zn,
main = 'Zn')
boxplot(Boston_df$indus,
main = 'Indus')
boxplot(Boston_df$age,
main = 'Age')
par(mfrow = c(1, 1))
set.seed(42)
# Create an index for training data (80% of data)
train_index2 <- createDataPartition(Boston_df$CrimeLevel, p = 0.8, list = FALSE)
# Subset into training and test sets
train_set2 <- Boston_df[train_index2, ]
test_set2 <- Boston_df[-train_index2, ]
train_set2 <- na.omit(train_set2)
test_set2 <- na.omit(test_set2)
# Check the dimensions
dim(train_set2)
## [1] 406 15
dim(test_set2)
## [1] 100 15
logit_model4 = glm(CrimeLevel ~ . - crim - medv, data = train_set2, family = 'binomial')
summary(logit_model4)
##
## Call:
## glm(formula = CrimeLevel ~ . - crim - medv, family = "binomial",
## data = train_set2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -36.331263 7.254285 -5.008 5.49e-07 ***
## zn -0.096393 0.040634 -2.372 0.01768 *
## indus -0.052732 0.047850 -1.102 0.27045
## chas 0.779005 0.812902 0.958 0.33791
## nox 49.200655 8.192152 6.006 1.90e-09 ***
## rm 1.090307 0.501277 2.175 0.02963 *
## age 0.001185 0.012033 0.098 0.92157
## dis 0.626720 0.226851 2.763 0.00573 **
## rad 0.674765 0.165176 4.085 4.41e-05 ***
## tax -0.006635 0.002805 -2.366 0.01800 *
## ptratio 0.153465 0.108831 1.410 0.15850
## black -0.009018 0.005771 -1.563 0.11811
## lstat 0.061706 0.051469 1.199 0.23057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 562.84 on 405 degrees of freedom
## Residual deviance: 173.55 on 393 degrees of freedom
## AIC: 199.55
##
## Number of Fisher Scoring iterations: 9
vif(logit_model4)
## zn indus chas nox rm age dis rad
## 1.992748 2.631929 1.187205 3.999250 2.211386 1.961204 3.430697 1.847246
## tax ptratio black lstat
## 1.740734 1.545320 1.065309 2.496497
glm.probs4 <- predict(logit_model4, test_set2, type = "response")
glm.pred4 <- rep("Low", 100)
glm.pred4[glm.probs4 > .5] = "High"
glm.pred4 <- as.factor(glm.pred4)
confusionMatrix(glm.pred4, test_set2$CrimeLevel, positive = 'High')
## Warning in confusionMatrix.default(glm.pred4, test_set2$CrimeLevel, positive =
## "High"): Levels are not in the same order for reference and data. Refactoring
## data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 46 4
## High 4 46
##
## Accuracy : 0.92
## 95% CI : (0.8484, 0.9648)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.84
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.92
## Specificity : 0.92
## Pos Pred Value : 0.92
## Neg Pred Value : 0.92
## Prevalence : 0.50
## Detection Rate : 0.46
## Detection Prevalence : 0.50
## Balanced Accuracy : 0.92
##
## 'Positive' Class : High
##
I fit a logistic regression on all of the predictors except crim and medv. crim had to be excluded because CrimeLevel is derived directly from it, and medv was dropped because of multicollinearity concerns. I then left all of the other variables in, regardless of significance, to see what the prediction accuracy would be. After running predictions on the test split of the data, I got an error rate of 1 - .92 = .08.
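Dropping crim is in fact required rather than optional: CrimeLevel is constructed directly from it, so leaving it in would leak the label into the predictors. A quick check (sketch):
# CrimeLevel is a deterministic function of crim, so crim must be excluded
with(Boston_df, table(CrimeLevel, crim > crime_median)) # off-diagonals are 0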
lda.model3 = lda(CrimeLevel ~ . - crim - medv, data = train_set2)
lda.model3
## Call:
## lda(CrimeLevel ~ . - crim - medv, data = train_set2)
##
## Prior probabilities of groups:
## Low High
## 0.5 0.5
##
## Group means:
## zn indus chas nox rm age dis
## Low 20.768473 7.037044 0.05911330 0.4729690 6.407394 53.75074 5.020854
## High 1.103448 15.466404 0.08866995 0.6399064 6.154266 85.65813 2.496151
## rad tax ptratio black lstat
## Low 4.147783 305.0936 17.87783 388.4625 9.560443
## High 15.211823 516.4729 19.01034 322.0028 15.842956
##
## Coefficients of linear discriminants:
## LD1
## zn -0.0050050404
## indus 0.0167892409
## chas 0.0261930745
## nox 7.3425964483
## rm 0.1426337043
## age 0.0089883387
## dis 0.0022607675
## rad 0.0841676436
## tax -0.0010584670
## ptratio -0.0139125840
## black -0.0005291344
## lstat -0.0032503657
lda.class3 <- predict(lda.model3, test_set2)$class
lda.class3 <- as.factor(lda.class3)
confusionMatrix(lda.class3, test_set2$CrimeLevel, positive = 'High')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 50 14
## High 0 36
##
## Accuracy : 0.86
## 95% CI : (0.7763, 0.9213)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 4.142e-14
##
## Kappa : 0.72
##
## Mcnemar's Test P-Value : 0.000512
##
## Sensitivity : 0.7200
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.7812
## Prevalence : 0.5000
## Detection Rate : 0.3600
## Detection Prevalence : 0.3600
## Balanced Accuracy : 0.8600
##
## 'Positive' Class : High
##
I used the same formula as the logistic regression model for the sake of consistency. After running predictions and creating a confusion matrix, I got an error rate of 1 - .86 = .14.
qda.model3 = qda(CrimeLevel ~ . - crim - medv, data = train_set2)
qda.model3
## Call:
## qda(CrimeLevel ~ . - crim - medv, data = train_set2)
##
## Prior probabilities of groups:
## Low High
## 0.5 0.5
##
## Group means:
## zn indus chas nox rm age dis
## Low 20.768473 7.037044 0.05911330 0.4729690 6.407394 53.75074 5.020854
## High 1.103448 15.466404 0.08866995 0.6399064 6.154266 85.65813 2.496151
## rad tax ptratio black lstat
## Low 4.147783 305.0936 17.87783 388.4625 9.560443
## High 15.211823 516.4729 19.01034 322.0028 15.842956
qda.class3 = predict(qda.model3, test_set2)$class
qda.class3 <- as.factor(qda.class3)
confusionMatrix(qda.class3, test_set2$CrimeLevel, positive = 'High')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 50 12
## High 0 38
##
## Accuracy : 0.88
## 95% CI : (0.7998, 0.9364)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 9.557e-16
##
## Kappa : 0.76
##
## Mcnemar's Test P-Value : 0.001496
##
## Sensitivity : 0.7600
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8065
## Prevalence : 0.5000
## Detection Rate : 0.3800
## Detection Prevalence : 0.3800
## Balanced Accuracy : 0.8800
##
## 'Positive' Class : High
##
Using the same variables as the previous models, I trained a QDA model. After making predictions on the test set and creating the confusion matrix shown above, I got a test error of 1 - .88 = .12.
bayes.model3 = naiveBayes(CrimeLevel ~ . - crim - medv, data = train_set2)
bayes.model3
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Low High
## 0.5 0.5
##
## Conditional probabilities:
## zn
## Y [,1] [,2]
## Low 20.768473 28.317517
## High 1.103448 4.624958
##
## indus
## Y [,1] [,2]
## Low 7.037044 5.601236
## High 15.466404 5.290022
##
## chas
## Y [,1] [,2]
## Low 0.05911330 0.2364197
## High 0.08866995 0.2849695
##
## nox
## Y [,1] [,2]
## Low 0.4729690 0.05562693
## High 0.6399064 0.09869383
##
## rm
## Y [,1] [,2]
## Low 6.407394 0.5648336
## High 6.154266 0.7621746
##
## age
## Y [,1] [,2]
## Low 53.75074 25.18766
## High 85.65813 17.63208
##
## dis
## Y [,1] [,2]
## Low 5.020854 2.079226
## High 2.496151 1.111658
##
## rad
## Y [,1] [,2]
## Low 4.147783 1.682123
## High 15.211823 9.530018
##
## tax
## Y [,1] [,2]
## Low 305.0936 88.40235
## High 516.4729 166.54543
##
## ptratio
## Y [,1] [,2]
## Low 17.87783 1.876549
## High 19.01034 2.328119
##
## black
## Y [,1] [,2]
## Low 388.4625 24.73457
## High 322.0028 119.94687
##
## lstat
## Y [,1] [,2]
## Low 9.560443 4.922170
## High 15.842956 7.293663
bayes.class3 = predict(bayes.model3, test_set2)
confusionMatrix(bayes.class3, test_set2$CrimeLevel, positive = 'High')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 46 14
## High 4 36
##
## Accuracy : 0.82
## 95% CI : (0.7305, 0.8897)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 3.074e-11
##
## Kappa : 0.64
##
## Mcnemar's Test P-Value : 0.03389
##
## Sensitivity : 0.7200
## Specificity : 0.9200
## Pos Pred Value : 0.9000
## Neg Pred Value : 0.7667
## Prevalence : 0.5000
## Detection Rate : 0.3600
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.8200
##
## 'Positive' Class : High
##
I trained a naive Bayes model using the same predictors as the previous models. After predicting on the test data and creating a confusion matrix, I got a test error of 1 - .82 = .18.
train.X3 = cbind(train_set2$zn, train_set2$nox, train_set2$rm, train_set2$dis, train_set2$rad, train_set2$tax, train_set2$ptratio, train_set2$black)
test.X3 = cbind(test_set2$zn, test_set2$nox, test_set2$rm, test_set2$dis, test_set2$rad, test_set2$tax, test_set2$ptratio, test_set2$black)
k_values <- c(1, 3, 5, 7, 10, 15, 20)
test_errors <- numeric(length(k_values))
for (i in seq_along(k_values)) {
k <- k_values[i]
# Perform KNN classification
knn.pred <- knn(train.X3, test.X3, train_set2$CrimeLevel, k = k)
cm <- confusionMatrix(knn.pred, test_set2$CrimeLevel, positive = 'High')
test_errors[i] <- 1 - cm$overall["Accuracy"]
}
results <- data.frame(K = k_values, Test_Error = test_errors)
print(results)
## K Test_Error
## 1 1 0.08
## 2 3 0.12
## 3 5 0.12
## 4 7 0.11
## 5 10 0.10
## 6 15 0.11
## 7 20 0.18
For the KNN model, I used a subset of the predictors (zn, nox, rm, dis, rad, tax, ptratio, and black) and ran a loop over different k-values, calculating and printing the test error for each fit. As shown above, the lowest test error was at k = 1, with a value of .08.
Looking at all of the models above, logistic regression and KNN (k = 1) tied for the lowest test error at .08. It would be worth looking deeper into those two models to see whether some other combination of predictors makes for an even better model.
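As a first step in that direction, a reduced logistic regression using only the predictors that were significant at the 5% level above could be compared against the full model (a sketch):
# Sketch: reduced logistic model on the predictors significant above
logit_reduced <- glm(CrimeLevel ~ zn + nox + rm + dis + rad + tax,
                     data = train_set2, family = 'binomial')
probs <- predict(logit_reduced, test_set2, type = "response")
pred <- factor(ifelse(probs > .5, "High", "Low"), levels = c("Low", "High"))
mean(pred != test_set2$CrimeLevel) # compare to the .08 of the full model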