# Load dependencies
pacman::p_load(ISLR2, dplyr, car, caret, MASS, class, e1071, pROC)
Produce some numerical and graphical summaries of the
Weekly data. Do there appear to be any patterns?
Based on the numerical summaries Lag 1, 2, and 3 have the same
median. All stock returns have the same maximum and minimum value. The
correlation matrix reveals that most unique predictor pairs have
correlation coefficients near zero, except for the strong correlation
between year and volume, which has an r-value of 0.84. This suggests
minimal to no correlation between today’s returns and previous days’
returns, but Volume is increasing over time.
The graphical summaries, in the form of box plots, shows all
Lags and Today are normally distributed. They
also seem to all have outliers below Q1. In Volume, we can
see it is heavily skewed right.
# View the data
df = Weekly
head(df)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
# Summary stats of numerical columns
summary(df)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Correlation Matrix
cor(dplyr::select(df, where(is.numeric)))
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
# Boxplots of numerical columns
par (mfrow = c(1 ,5))
boxplot(df$Lag1, main= 'Lag1')
boxplot(df$Lag2, main= 'Lag2')
boxplot(df$Lag3, main= 'Lag3')
boxplot(df$Lag4, main= 'Lag4')
boxplot(df$Lag5, main= 'Lag5')
boxplot(df$Volume, main= 'Volume')
boxplot(df$Today, main= 'Today')
Use the full data set to perform a logistic regression with
Direction as the response and the five lag variables plus
Volume as predictors. Use the summary function to print the
results. Do any of the predictors appear to be statistically
significant? If so, which ones?
Yes, only one predictor is statistically significant. With a
significance level of 0.05 we can see Lag2 is significant
with a p-value of 0.0296.
# First Logistic Regression Model
logit_model = glm(Direction ~ . -Today -Year, data=df, family= 'binomial')
summary(logit_model)
##
## Call:
## glm(formula = Direction ~ . - Today - Year, family = "binomial",
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
The logistic regression model produces an overall accuracy of 56.11% where 611 observations were correctly predicted out 1089. With a sensitivity of 92.07% and specificity of 11.16%, this indicates that the model is performing well at identifying when the return will go up, but struggling significantly when returns will go down.
# Prediction probabilities
pred_prob = predict(logit_model, type= 'response')
# Classifications
pred_class = rep('Down', 1089)
pred_class[pred_prob > 0.5] = 'Up'
pred_class = as.factor(pred_class)
# Confusion Matrix
confusionMatrix(pred_class, df$Direction, , positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 48
## Up 430 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : 0.369
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9207
## Specificity : 0.1116
## Pos Pred Value : 0.5643
## Neg Pred Value : 0.5294
## Prevalence : 0.5556
## Detection Rate : 0.5115
## Detection Prevalence : 0.9063
## Balanced Accuracy : 0.5161
##
## 'Positive' Class : Up
##
Now ft the logistic regression model using a training data period
from 1990 to 2008, with Lag2 as the only predictor. Compute
the confusion matrix and the overall fraction of correct predictions for
the held out data (that is, the data from 2009 and 2010).
The second logistic regression model produces an overall accuracy of 62.5% where 65 observations were correctly predicted out 104. With a sensitivity of 91.8% and specificity of 20.93%. A better model than the first but not by much as the specificity is still pretty low.
# Split data into train and test
train = subset(df, Year <= 2008)
test = subset(df, Year > 2008)
# Second Logistic Regression Model
logit_model_2 = glm(Direction ~ Lag2, data= train, family= 'binomial')
summary(logit_model_2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
# Prediction probabilities
logit_probs = predict(logit_model_2, test, type= 'response')
# Classifications
logit_class = ifelse(logit_probs > 0.5, 'Up', 'Down')
logit_class = as.factor(logit_class)
# Confusion Matrix
confusionMatrix(logit_class, test$Direction, positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
Repeat (d) using LDA.
The Linear Discriminate Analysis model produces an overall accuracy of 62.5% where 65 observations were correctly predicted out 104 producing a similar model to the previous Logistic Regression model. With a sensitivity of 91.8% and specificity of 20.93%.
# LDA Model
lda_model = lda(Direction ~ Lag2, data= train)
lda_model
## Call:
## lda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
# Classifications
lda_class = predict(lda_model, test)$class
# Confusion Matrix
confusionMatrix(lda_class, test$Direction, positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
Repeat (d) using QDA.
The Quadratic Discriminate Analysis model produces an overall accuracy of 58.65% where 61 observations were correctly predicted out 104. With a sensitivity of 100% and specificity of 0%. The best resulting sensitivity, but at the cost of unable to predict cases that are “Down”.
# QDA Model
qda_model = qda(Direction ~ Lag2, data= train)
qda_model
## Call:
## qda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
# Classifications
qda_class = predict(qda_model, test)$class
# Confusion Matrix
confusionMatrix(qda_class, test$Direction, positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
Repeat (d) using KNN with K = 1.
The K-Nearest Neighbors model produces an overall accuracy of 50% where 52 observations were correctly predicted out 104. With a sensitivity of 50.82% and specificity of 48.84%. This model produces the lowest accuracy, however sees a significant improvement in predicting when the returns go down.
# Combine predictors from train and test data into a single matrix
train.X = cbind(train$Lag2)
test.X = cbind(test$Lag2)
# KNN Model classifications
knn_class = knn(train.X, test.X, train$Direction, k=1)
# Confusion Matrix
confusionMatrix(knn_class, test$Direction, positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 29
## Up 22 32
##
## Accuracy : 0.5096
## 95% CI : (0.4097, 0.609)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.9540
##
## Kappa : 0.0127
##
## Mcnemar's Test P-Value : 0.4008
##
## Sensitivity : 0.5246
## Specificity : 0.4884
## Pos Pred Value : 0.5926
## Neg Pred Value : 0.4200
## Prevalence : 0.5865
## Detection Rate : 0.3077
## Detection Prevalence : 0.5192
## Balanced Accuracy : 0.5065
##
## 'Positive' Class : Up
##
Repeat (d) using naive Bayes.
The naive Bayes model produces an overall accuracy of 58.65% where 61 observations were correctly predicted out 104. With a sensitivity of 100% and specificity of 0%. This results were similar to the QDA model.
# naive Bayes Model
bayes_model = naiveBayes(Direction ~ Lag2, data=train)
bayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
# Classifications
bayes_class = predict(bayes_model, test)
# Confusion Matrix
confusionMatrix(bayes_class, test$Direction, positive='Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
Which of these methods appears to provide the best results on this data?
The model that appears to provide the best results is the Logistic
Regression model with Lag2 as the single predictor on the
split test and training dataset. Although, the LDA model did produce the
same results with both having an accuracy of 62.5%, sensitivity of
91.8%, and specificity of 20.93%. These models can predict well when a
stock return would go up, but not so much if it were to go down.
Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier
Formal normality tests were conducted on the numerical predictor variables at a significance level of 0.05, revealing p-values below this threshold for all variables. As a result, log transformations were applied to address potential skewness in their distributions. Two-way interaction terms were also conducted.
After experimenting with both two-way interaction terms and log()
transformations, I found that the Logistic Regression model with
transformations of all lag features and an interaction term between
Lag5 and Volume produced the best results.
This model achieved an accuracy of 61.54%, with a sensitivity of 63.93%
and specificity of 58.14%. While the new model showed a relative
decrease of 1.54% in accuracy, it showed a much more balanced
sensitivity and specificity, as the previous model had a high
sensitivity of 91.8% with a much lower specificity of 20.93%.
For the K-Nearest Neighbors (KNN) model, experimenting with different values of K revealed that K = 3 gave the best results. However, it could only achieve a test error of 47.11%, or 52.88% accuracy, indicating that it didn’t perform as well as the tuned logistic regression model.
# Test for normality
shapiro.test(train$Lag1)
##
## Shapiro-Wilk normality test
##
## data: train$Lag1
## W = 0.94096, p-value < 2.2e-16
shapiro.test(train$Lag2)
##
## Shapiro-Wilk normality test
##
## data: train$Lag2
## W = 0.94084, p-value < 2.2e-16
shapiro.test(train$Lag3)
##
## Shapiro-Wilk normality test
##
## data: train$Lag3
## W = 0.94141, p-value < 2.2e-16
shapiro.test(train$Lag4)
##
## Shapiro-Wilk normality test
##
## data: train$Lag4
## W = 0.94145, p-value < 2.2e-16
shapiro.test(train$Lag5)
##
## Shapiro-Wilk normality test
##
## data: train$Lag5
## W = 0.94168, p-value < 2.2e-16
shapiro.test(train$Volume)
##
## Shapiro-Wilk normality test
##
## data: train$Volume
## W = 0.7677, p-value < 2.2e-16
# Transforming all numerical features with log()
train$Lag1 = log(train$Lag1 + 18.195 + 1)
train$Lag2 = log(train$Lag2 + 18.195 + 1)
train$Lag3 = log(train$Lag3 + 18.195 + 1)
train$Lag4 = log(train$Lag4 + 18.195 + 1)
train$Lag5 = log(train$Lag5 + 18.195 + 1)
train$Volume = log(train$Volume)
test$Lag1 = log(test$Lag1 + 18.195 + 1)
test$Lag2 = log(test$Lag2 + 18.195 + 1)
test$Lag3 = log(test$Lag3 + 18.195 + 1)
test$Lag4 = log(test$Lag4 + 18.195 + 1)
test$Lag5 = log(test$Lag5 + 18.195 + 1)
test$Volume = log(test$Volume)
# Stepwise Logistic Regression with transformations
stepwise_model = step(glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data= train, family= 'binomial'), direction= 'both', trace= 0)
summary(stepwise_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4816 2.1661 0.222 0.8241
## Lag1 -0.9674 0.5050 -1.916 0.0554 .
## Lag2 0.8757 0.5106 1.715 0.0864 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1346.8 on 982 degrees of freedom
## AIC: 1352.8
##
## Number of Fisher Scoring iterations: 4
# Prediction probabilities
pred_prob = predict(stepwise_model, test, type= 'response')
# Classifications
pred_class = ifelse(pred_prob > 0.5, 'Up', 'Down')
pred_class = as.factor(pred_class)
# Confusion Matrix
confusionMatrix(pred_class, test$Direction, , positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 6 6
## Up 37 55
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0.0461
##
## Mcnemar's Test P-Value : 4.763e-06
##
## Sensitivity : 0.9016
## Specificity : 0.1395
## Pos Pred Value : 0.5978
## Neg Pred Value : 0.5000
## Prevalence : 0.5865
## Detection Rate : 0.5288
## Detection Prevalence : 0.8846
## Balanced Accuracy : 0.5206
##
## 'Positive' Class : Up
##
# Check VIF
vif(stepwise_model)
## Lag1 Lag2
## 1.00167 1.00167
# Stepwise Logistic Regression with transformations and two-way interactions
stepwise_model = step(glm(Direction ~ (Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume)^2, data= train, family= 'binomial'), direction= 'both', trace= 0)
summary(stepwise_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume + Lag3:Lag4 + Lag5:Volume, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 46.4780 25.8458 1.798 0.0721 .
## Lag1 -1.0922 0.5212 -2.095 0.0361 *
## Lag2 0.8451 0.5025 1.682 0.0926 .
## Lag3 -13.9783 8.4649 -1.651 0.0987 .
## Lag4 -14.5777 8.7634 -1.663 0.0962 .
## Lag5 -0.8739 0.5290 -1.652 0.0986 .
## Volume -2.6039 1.3001 -2.003 0.0452 *
## Lag3:Lag4 4.7432 2.9047 1.633 0.1025
## Lag5:Volume 0.8492 0.4395 1.932 0.0534 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1335.0 on 976 degrees of freedom
## AIC: 1353
##
## Number of Fisher Scoring iterations: 5
# Check VIF
vif(stepwise_model, type = 'predictor')
## Warning in vif.lm(stepwise_model, type = "predictor"): type = 'predictor' is available only for unweighted linear models;
## type = 'terms' will be used
## Lag1 Lag2 Lag3 Lag4 Lag5 Volume
## 1.019757 1.029104 226.311626 286.593598 1.259099 394.104782
## Lag3:Lag4 Lag5:Volume
## 451.843448 392.449032
# Model with no multicollinearity
logit_model = glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Lag5:Volume,
family = "binomial", data = train)
summary(logit_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Lag5:Volume, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.59168 3.15878 0.820 0.4119
## Lag1 -1.01109 0.51166 -1.976 0.0481 *
## Lag2 0.78404 0.50826 1.543 0.1229
## Lag3 -0.31769 0.47990 -0.662 0.5080
## Lag4 -0.14421 0.43221 -0.334 0.7386
## Lag5 -0.12634 0.45252 -0.279 0.7801
## Lag5:Volume -0.03126 0.02248 -1.390 0.1644
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1344.4 on 978 degrees of freedom
## AIC: 1358.4
##
## Number of Fisher Scoring iterations: 4
# Prediction probabilities
pred_prob = predict(logit_model, test, type= 'response')
# Classifications
pred_class = ifelse(pred_prob > 0.5, 'Up', 'Down')
pred_class = as.factor(pred_class)
# Confusion Matrix
confusionMatrix(pred_class, test$Direction, , positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 25 22
## Up 18 39
##
## Accuracy : 0.6154
## 95% CI : (0.5149, 0.7091)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.3110
##
## Kappa : 0.2178
##
## Mcnemar's Test P-Value : 0.6353
##
## Sensitivity : 0.6393
## Specificity : 0.5814
## Pos Pred Value : 0.6842
## Neg Pred Value : 0.5319
## Prevalence : 0.5865
## Detection Rate : 0.3750
## Detection Prevalence : 0.5481
## Balanced Accuracy : 0.6104
##
## 'Positive' Class : Up
##
# Check VIF
vif(logit_model, type= 'predictor')
## Warning in vif.lm(logit_model, type = "predictor"): type = 'predictor' is available only for unweighted linear models;
## type = 'terms' will be used
## Lag1 Lag2 Lag3 Lag4 Lag5 Lag5:Volume
## 1.022782 1.030132 1.025558 1.028924 1.015863 1.040996
# LDA Model with transformations
lda_model = lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data= train)
# Classifications
lda_class = predict(lda_model, test)$class
# Confusion Matrix
confusionMatrix(lda_class, test$Direction, positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 25 24
## Up 18 37
##
## Accuracy : 0.5962
## 95% CI : (0.4954, 0.6913)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.4626
##
## Kappa : 0.1842
##
## Mcnemar's Test P-Value : 0.4404
##
## Sensitivity : 0.6066
## Specificity : 0.5814
## Pos Pred Value : 0.6727
## Neg Pred Value : 0.5102
## Prevalence : 0.5865
## Detection Rate : 0.3558
## Detection Prevalence : 0.5288
## Balanced Accuracy : 0.5940
##
## 'Positive' Class : Up
##
# LDA Model with transformations and two-way interactions
lda_model = lda(Direction ~ (Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume)^2, data= train)
# Classifications
lda_class = predict(lda_model, test)$class
# Confusion Matrix
confusionMatrix(lda_class, test$Direction, positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 18 18
## Up 25 43
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0.1266
##
## Mcnemar's Test P-Value : 0.3602
##
## Sensitivity : 0.7049
## Specificity : 0.4186
## Pos Pred Value : 0.6324
## Neg Pred Value : 0.5000
## Prevalence : 0.5865
## Detection Rate : 0.4135
## Detection Prevalence : 0.6538
## Balanced Accuracy : 0.5618
##
## 'Positive' Class : Up
##
# QDA Model with transformations
qda_model = qda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data= train)
# Classifications
qda_class = predict(qda_model, test)$class
# Confusion Matrix
confusionMatrix(qda_class, test$Direction, positive= 'Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 17 26
## Up 26 35
##
## Accuracy : 0.5
## 95% CI : (0.4003, 0.5997)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.97
##
## Kappa : -0.0309
##
## Mcnemar's Test P-Value : 1.00
##
## Sensitivity : 0.5738
## Specificity : 0.3953
## Pos Pred Value : 0.5738
## Neg Pred Value : 0.3953
## Prevalence : 0.5865
## Detection Rate : 0.3365
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.4846
##
## 'Positive' Class : Up
##
# Combine predictors from train and test data into a single matrix
train.X = cbind(train$Lag1, train$Lag2, train$Lag3, train$Lag4, train$Lag5, train$Volume)
test.X = cbind(test$Lag1, test$Lag2, test$Lag3, test$Lag4, test$Lag5, test$Volume)
# KNN Model with transformations
k_values = c(1, 3, 5, 7, 10, 15, 20)
test_errors = numeric(length(k_values))
for(i in seq_along(k_values)) {
# Set K
k = k_values[i]
# Classification
knn_class = knn(train.X, test.X, train$Direction, k=k)
# Confusion Matrix
confusion_m = confusionMatrix(knn_class, test$Direction, positive= 'Up')
# Test Errors
test_errors[i] = 1 - confusion_m$overall['Accuracy']
}
results = data.frame(K = k_values, Test_Error= test_errors)
print(results)
## K Test_Error
## 1 1 0.5192308
## 2 3 0.4711538
## 3 5 0.5192308
## 4 7 0.5096154
## 5 10 0.4903846
## 6 15 0.5288462
## 7 20 0.5480769
# naive Bayes Model with transformations
bayes_model = naiveBayes(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data=train)
bayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag1
## Y [,1] [,2]
## Down 2.963004 0.1165716
## Up 2.943562 0.1750842
##
## Lag2
## Y [,1] [,2]
## Down 2.942547 0.1771446
## Up 2.960461 0.1275173
##
## Lag3
## Y [,1] [,2]
## Down 2.956503 0.1211246
## Up 2.948639 0.1732251
##
## Lag4
## Y [,1] [,2]
## Down 2.951281 0.1863346
## Up 2.952810 0.1174156
##
## Lag5
## Y [,1] [,2]
## Down 2.954334 0.1850852
## Up 2.950197 0.1191650
##
## Volume
## Y [,1] [,2]
## Down -0.2386605 1.0047308
## Up -0.3281402 0.9924812
# Classifications
bayes_class = predict(bayes_model, test)
# Confusion Matrix
confusionMatrix(bayes_class, test$Direction, positive='Up')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 17 28
## Up 26 33
##
## Accuracy : 0.4808
## 95% CI : (0.3817, 0.5809)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.9885
##
## Kappa : -0.0632
##
## Mcnemar's Test P-Value : 0.8918
##
## Sensitivity : 0.5410
## Specificity : 0.3953
## Pos Pred Value : 0.5593
## Neg Pred Value : 0.3778
## Prevalence : 0.5865
## Detection Rate : 0.3173
## Detection Prevalence : 0.5673
## Balanced Accuracy : 0.4682
##
## 'Positive' Class : Up
##
Create a binary variable, mpg01, that contains a 1
if mpg contains a value above its median, and a 0 if mpg contains a
value below its median. You can compute the median using the median()
function. Note you may find it helpful to use the data.frame() function
to create a single data set containing both mpg01 and the
other Auto variables.
The best performing model was QDA, with a test error of 6.33% or
accuracy of 93.67%. With predictors displacement,
horsepower, and weight produced a sensitivity
of 94.74% and specificity of 92.68% proving to be a balanced model in
predicting if mpg would be above or below the median. With 95%
confidence, the model’s true accuracy is expected to fall between 85.84%
and 97.91% when applied to a larger dataset.
# Read in Auto data
auto = read.csv('Auto.csv', na.strings= '?', stringsAsFactors= T)
head(auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
# Create mpg01 the response variable
mpg_median = median(auto$mpg)
auto$mpg01 = ifelse(auto$mpg > mpg_median, 1, 0)
# Convert categorical columns to factor
auto$cylinders = as.factor(auto$cylinders)
auto$origin = as.factor(auto$origin)
auto$mpg01 = as.factor(auto$mpg01)
glimpse(auto)
## Rows: 397
## Columns: 10
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders <fct> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…
## $ mpg01 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
Explore the data graphically in order to investigate the
association between mpg01 and the other features. Which of
the other features seem most likely to be useful in predicting
mpg01? Scatter plots and box plots may be useful tools to
answer this question. Describe your findings.
The scatter plot matrix suggests a negative linear relationship
between mpg and the variables displacement,
horsepower, weight, and
acceleration, meaning as these features increase, mpg tends
to decrease. Additionally, the box plots comparing the two levels of
mpg01 (whether a vehicle has mpg above or below the median) show
significant differences for displacement,
horsepower, and weight. These features are
likely important predictors, as vehicles with mpg above the median tend
to have lower values for these three features.
# Scatter plot
pairs(Filter(is.numeric, (auto)))
# Box plots of numerical predictors
par (mfrow = c(1 ,5))
boxplot(auto$displacement, main= 'Displacement')
boxplot(auto$horsepower, main= 'Horsepower')
boxplot(auto$weight, main= 'Weight')
boxplot(auto$acceleration, main= 'Acceleration')
boxplot(auto$year, main= 'Year')
# Box plots across the two levels
par (mfrow = c(1 ,5))
boxplot(displacement ~ mpg01, data= auto, xlab= '')
boxplot(horsepower ~ mpg01, data= auto, xlab= '')
boxplot(weight ~ mpg01, data= auto, xlab= '')
boxplot(acceleration ~ mpg01, data= auto, xlab= '')
boxplot(year ~ mpg01, data= auto, xlab= '')
Split the data into a training set and a test set.
set.seed(42)
# Create an index for training data
index = createDataPartition(auto$mpg01, p=0.8, list= FALSE)
# Subset into train and test
train = auto[index, ]
test = auto[-index, ]
# Remove null values
train = na.omit(train)
test = na.omit(test)
# Check dimensions
dim(train)
## [1] 313 10
dim(test)
## [1] 79 10
Perform LDA on the training data in order to predict
mpg01 using the variables that seemed most associated with
mpg01 in (b). What is the test error of the model
obtained?
The test error for the LDA model is 10.13%.
# LDA Model
lda_model = lda(mpg01 ~ displacement + horsepower + weight, data= train)
lda_model
## Call:
## lda(mpg01 ~ displacement + horsepower + weight, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5239617 0.4760383
##
## Group means:
## displacement horsepower weight
## 0 263.4512 125.6037 3551.067
## 1 114.9497 78.1745 2323.262
##
## Coefficients of linear discriminants:
## LD1
## displacement -0.007006962
## horsepower 0.005550055
## weight -0.001181492
# Classifications
lda_class = predict(lda_model, test)$class
# Confusion Matrix
confusionMatrix(lda_class, test$mpg01, positive= '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 34 1
## 1 7 37
##
## Accuracy : 0.8987
## 95% CI : (0.8102, 0.9553)
## No Information Rate : 0.519
## P-Value [Acc > NIR] : 5.061e-13
##
## Kappa : 0.7983
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.9737
## Specificity : 0.8293
## Pos Pred Value : 0.8409
## Neg Pred Value : 0.9714
## Prevalence : 0.4810
## Detection Rate : 0.4684
## Detection Prevalence : 0.5570
## Balanced Accuracy : 0.9015
##
## 'Positive' Class : 1
##
Perform QDA on the training data in order to predict
mpg01 using the variables that seemed most associated with
mpg01 in (b). What is the test error of the model
obtained?
The test error for the QDA model is 6.33%.
# QDA Model
qda_model = qda(mpg01 ~ displacement + horsepower + weight, data= train)
qda_model
## Call:
## qda(mpg01 ~ displacement + horsepower + weight, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5239617 0.4760383
##
## Group means:
## displacement horsepower weight
## 0 263.4512 125.6037 3551.067
## 1 114.9497 78.1745 2323.262
# Classifications
qda_class = predict(qda_model, test)$class
# Confusion Matrix
confusionMatrix(qda_class, test$mpg01, positive= '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38 2
## 1 3 36
##
## Accuracy : 0.9367
## 95% CI : (0.8584, 0.9791)
## No Information Rate : 0.519
## P-Value [Acc > NIR] : 5.214e-16
##
## Kappa : 0.8734
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9474
## Specificity : 0.9268
## Pos Pred Value : 0.9231
## Neg Pred Value : 0.9500
## Prevalence : 0.4810
## Detection Rate : 0.4557
## Detection Prevalence : 0.4937
## Balanced Accuracy : 0.9371
##
## 'Positive' Class : 1
##
Perform logistic regression on the training data in order to
predict mpg01 using the variables that seemed most
associated with mpg01 in (b). What is the test error of the
model obtained?
The test error for the Logistic Regression model is 11.39%.
# Logistic Regression Model
logit_model = glm(mpg01 ~ displacement + horsepower + weight, data= train, family= 'binomial')
summary(logit_model)
##
## Call:
## glm(formula = mpg01 ~ displacement + horsepower + weight, family = "binomial",
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.2126397 1.6814695 6.668 2.59e-11 ***
## displacement -0.0107743 0.0059813 -1.801 0.07165 .
## horsepower -0.0368146 0.0148766 -2.475 0.01334 *
## weight -0.0021747 0.0007484 -2.906 0.00366 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 433.19 on 312 degrees of freedom
## Residual deviance: 181.68 on 309 degrees of freedom
## AIC: 189.68
##
## Number of Fisher Scoring iterations: 7
# Probabilities
pred_prob = predict(logit_model, test, type= 'response')
# Classifications
pred_class = ifelse(pred_prob > 0.5, '1', '0')
pred_class = as.factor(pred_class)
# Confusion Matrix
confusionMatrix(pred_class, test$mpg01, positive= '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36 4
## 1 5 34
##
## Accuracy : 0.8861
## 95% CI : (0.7947, 0.9466)
## No Information Rate : 0.519
## P-Value [Acc > NIR] : 3.771e-12
##
## Kappa : 0.772
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8947
## Specificity : 0.8780
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.9000
## Prevalence : 0.4810
## Detection Rate : 0.4304
## Detection Prevalence : 0.4937
## Balanced Accuracy : 0.8864
##
## 'Positive' Class : 1
##
Perform naive Bayes on the training data in order to predict
mpg01 using the variables that seemed most associated with
mpg01 in (b). What is the test error of the model
obtained?
The test error for the naive Bayes model is 8.86%.
# naive Bayes Model
bayes_model = naiveBayes(mpg01 ~ displacement + horsepower + weight, data= train)
bayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5239617 0.4760383
##
## Conditional probabilities:
## displacement
## Y [,1] [,2]
## 0 263.4512 91.50422
## 1 114.9497 36.85544
##
## horsepower
## Y [,1] [,2]
## 0 125.6037 36.81701
## 1 78.1745 15.45776
##
## weight
## Y [,1] [,2]
## 0 3551.067 688.9551
## 1 2323.262 385.2206
# Classifications
bayes_class = predict(bayes_model, test)
# Confusion Matrix
confusionMatrix(bayes_class, test$mpg01, positive= '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36 2
## 1 5 36
##
## Accuracy : 0.9114
## 95% CI : (0.8259, 0.9636)
## No Information Rate : 0.519
## P-Value [Acc > NIR] : 5.959e-14
##
## Kappa : 0.823
##
## Mcnemar's Test P-Value : 0.4497
##
## Sensitivity : 0.9474
## Specificity : 0.8780
## Pos Pred Value : 0.8780
## Neg Pred Value : 0.9474
## Prevalence : 0.4810
## Detection Rate : 0.4557
## Detection Prevalence : 0.5190
## Balanced Accuracy : 0.9127
##
## 'Positive' Class : 1
##
Perform KNN on the training data, with several values of K, in
order to predict mpg01. Use only the variables that seemed
most associated with mpg01 in (b). What test errors do you
obtain? Which value of K seems to perform the best on this data
set?
The test errors for the KNN model are 8.86%, 10.13%, 10.13%, 10.13%, 11.39%, 12.66%, 12.66%. The value of K that performed the best was K = 1.
# Combine predictors from train and test data into a single matrix
train.X = cbind(train$displacement, train$horsepower, train$weight)
test.X = cbind(test$displacement, test$horsepower, test$weight)
# KNN Model
k_values = c(1, 3, 5, 7, 10, 15, 20)
test_errors = numeric(length(k_values))
for(i in seq_along(k_values)) {
# Set K
k = k_values[i]
# Classification
knn_class = knn(train.X, test.X, train$mpg01, k=k)
# Confusion Matrix
confusion_m = confusionMatrix(knn_class, test$mpg01, positive= '1')
# Test Errors
test_errors[i] = 1 - confusion_m$overall['Accuracy']
}
results = data.frame(K = k_values, Test_Error= test_errors)
print(results)
## K Test_Error
## 1 1 0.08860759
## 2 3 0.10126582
## 3 5 0.10126582
## 4 7 0.10126582
## 5 10 0.11392405
## 6 15 0.12658228
## 7 20 0.12658228
Using the Boston data set, fit classification models
in order to predict whether a given census tract has a crime rate above
or below the median. Explore logistic regression, LDA, KNN models, and
naive Bayes using various subsets of the predictors. Describe your
findings. Hint: You will have to create the response variable yourself,
using the variables that are contained in the Boston data
set.
The best performing model was QDA, with a test error of 12% or
accuracy of 88%. With predictors zn, indus,
chas, nox, dis, tax,
ptratio, black, lstat, and
medv produced a sensitivity of 86% and specificity of 90%
proving to be a balanced model in predicting if crime rate would be
above or below the median. With 95% confidence, the model’s true
accuracy is expected to fall between 79.98% and 93.64% when applied to a
larger dataset.
Create a binary variable.
# Read in Boston data
boston = Boston
head(boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Check feature that looks to be binary
count(boston, chas)
## chas n
## 1 0 471
## 2 1 35
# Create Crime Level, response variable
crime_median = median(boston$crim)
boston$CrimeLevel = ifelse(boston$crim > crime_median, 'High', 'Low')
boston$CrimeLevel = as.factor(boston$CrimeLevel)
# Convert chas('Next to Charles River') column to factor
boston$chas = as.factor(boston$chas)
Explore the data.
Based on the scatter plot matrix, age, dis,
and medv appear to be the strongest potential predictors of
crime level. The correlation matrix shows rad to have the
higher r-value at 0.6255. However this feature was excluded from all
models due to perfect correlation in certain values, which could lead to
numerical instability and separation issues. The second highest r-value
found was tax at 0.5827.
The box plots of each numerical feature indicates zn,
indus, age, dis,
rad, tax, ptratio,
black, lstat, and medv are not
normally distributed. The box plots across the two levels of crime level
indicates a significant difference in indus,
nox, age, and dis highlighting
them as potential predictors.
glimpse(boston)
## Rows: 506
## Columns: 15
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.088…
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5…
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87,…
## $ chas <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.5…
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.6…
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9…
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9…
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4,…
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311,…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2,…
## $ black <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396…
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17…
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9,…
## $ CrimeLevel <fct> Low, Low, Low, Low, Low, Low, Low, Low, Low, Low, Low, Low,…
# Scatter plot matrix
pairs(Filter(is.numeric, boston))
# Correlation Matrix
cor(dplyr::select(boston, where(is.numeric)))
## crim zn indus nox rm age
## crim 1.0000000 -0.2004692 0.4065834 0.4209717 -0.2192467 0.3527343
## zn -0.2004692 1.0000000 -0.5338282 -0.5166037 0.3119906 -0.5695373
## indus 0.4065834 -0.5338282 1.0000000 0.7636514 -0.3916759 0.6447785
## nox 0.4209717 -0.5166037 0.7636514 1.0000000 -0.3021882 0.7314701
## rm -0.2192467 0.3119906 -0.3916759 -0.3021882 1.0000000 -0.2402649
## age 0.3527343 -0.5695373 0.6447785 0.7314701 -0.2402649 1.0000000
## dis -0.3796701 0.6644082 -0.7080270 -0.7692301 0.2052462 -0.7478805
## rad 0.6255051 -0.3119478 0.5951293 0.6114406 -0.2098467 0.4560225
## tax 0.5827643 -0.3145633 0.7207602 0.6680232 -0.2920478 0.5064556
## ptratio 0.2899456 -0.3916785 0.3832476 0.1889327 -0.3555015 0.2615150
## black -0.3850639 0.1755203 -0.3569765 -0.3800506 0.1280686 -0.2735340
## lstat 0.4556215 -0.4129946 0.6037997 0.5908789 -0.6138083 0.6023385
## medv -0.3883046 0.3604453 -0.4837252 -0.4273208 0.6953599 -0.3769546
## dis rad tax ptratio black lstat
## crim -0.3796701 0.6255051 0.5827643 0.2899456 -0.3850639 0.4556215
## zn 0.6644082 -0.3119478 -0.3145633 -0.3916785 0.1755203 -0.4129946
## indus -0.7080270 0.5951293 0.7207602 0.3832476 -0.3569765 0.6037997
## nox -0.7692301 0.6114406 0.6680232 0.1889327 -0.3800506 0.5908789
## rm 0.2052462 -0.2098467 -0.2920478 -0.3555015 0.1280686 -0.6138083
## age -0.7478805 0.4560225 0.5064556 0.2615150 -0.2735340 0.6023385
## dis 1.0000000 -0.4945879 -0.5344316 -0.2324705 0.2915117 -0.4969958
## rad -0.4945879 1.0000000 0.9102282 0.4647412 -0.4444128 0.4886763
## tax -0.5344316 0.9102282 1.0000000 0.4608530 -0.4418080 0.5439934
## ptratio -0.2324705 0.4647412 0.4608530 1.0000000 -0.1773833 0.3740443
## black 0.2915117 -0.4444128 -0.4418080 -0.1773833 1.0000000 -0.3660869
## lstat -0.4969958 0.4886763 0.5439934 0.3740443 -0.3660869 1.0000000
## medv 0.2499287 -0.3816262 -0.4685359 -0.5077867 0.3334608 -0.7376627
## medv
## crim -0.3883046
## zn 0.3604453
## indus -0.4837252
## nox -0.4273208
## rm 0.6953599
## age -0.3769546
## dis 0.2499287
## rad -0.3816262
## tax -0.4685359
## ptratio -0.5077867
## black 0.3334608
## lstat -0.7376627
## medv 1.0000000
# Check feature that may be perfectly correlated
table(boston$rad, boston$CrimeLevel)
##
## High Low
## 1 0 20
## 2 1 23
## 3 1 37
## 4 49 61
## 5 47 68
## 6 3 23
## 7 2 15
## 8 18 6
## 24 132 0
# Box plots of numerical predictors
par (mfrow = c(1 ,1))
boxplot(boston$crim, main= 'Crime Rate')
boxplot(boston$zn, main= 'Proportion of Residential Land Zoned')
boxplot(boston$indus, main= 'Proportion of non-retail Business Acres')
boxplot(boston$nox, main= 'Nitrogen Oxide Concentration')
boxplot(boston$rm, main= 'Average Number of Rooms')
boxplot(boston$age, main= 'Proportion of Owner-Occupied Units Built Prior to 1940')
boxplot(boston$dis, main= 'Weighted Distance to Five Boston Employment Centers')
boxplot(boston$rad, main= 'Accessibility to Radial Highways')
boxplot(boston$tax, main= 'Property Tax per $10,000')
boxplot(boston$ptratio, main= 'Pupil-Teacher Ratio')
boxplot(boston$black, main= 'Proportion of Black Residents')
boxplot(boston$lstat, main= 'Precentange of Lower-Status Population')
boxplot(boston$medv, main= 'Median Value of Owner-Occupied Homes (in $1,000s)')
# Box plots across the two levels
par (mfrow = c(1 ,5))
boxplot(zn ~ CrimeLevel, data= boston, xlab= '')
boxplot(indus ~ CrimeLevel, data= boston, xlab= '')
boxplot(nox ~ CrimeLevel, data= boston, xlab= 'Crime Level')
boxplot(rm ~ CrimeLevel, data= boston, xlab= '')
boxplot(age ~ CrimeLevel, data= boston, xlab= '')
boxplot(dis ~ CrimeLevel, data= boston, xlab= '')
boxplot(rad ~ CrimeLevel, data= boston, xlab= '')
boxplot(tax ~ CrimeLevel, data= boston, xlab= 'Crime Level')
boxplot(ptratio ~ CrimeLevel, data= boston, xlab= '')
boxplot(black ~ CrimeLevel, data= boston, xlab= '')
boxplot(lstat ~ CrimeLevel, data= boston, xlab= 'Crime Level')
boxplot(medv ~ CrimeLevel, data= boston, xlab= '')
Split the data into a training set and a test set.
# Using createDataPartition, set seed ensures data is reproducible
set.seed(42)
# Create an index for training data
index = createDataPartition(boston$CrimeLevel, p=0.8, list= FALSE)
# Subset into train and test
train = boston[index, ]
test = boston[-index, ]
# Remove null values
train = na.omit(train)
test = na.omit(test)
# Check dimensions
dim(train)
## [1] 406 15
dim(test)
## [1] 100 15
Perform Logistic Regression on the training data.
The test error for the Logistic Regression model is 88%.
# Stepwise Logistic Regression Model
stepwise_model = step(glm(CrimeLevel ~ . -crim -rad, data= train, family= 'binomial'), direction= 'both', trace=0)
summary(stepwise_model)
##
## Call:
## glm(formula = CrimeLevel ~ zn + indus + chas + nox + dis + tax +
## ptratio + black + lstat + medv, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 22.157524 7.382962 3.001 0.002689 **
## zn 0.081505 0.034155 2.386 0.017015 *
## indus 0.110505 0.046631 2.370 0.017798 *
## chas1 -1.907805 0.685150 -2.785 0.005361 **
## nox -49.346780 7.258741 -6.798 1.06e-11 ***
## dis -0.767629 0.214203 -3.584 0.000339 ***
## tax -0.003539 0.001813 -1.953 0.050875 .
## ptratio -0.345220 0.123943 -2.785 0.005347 **
## black 0.047570 0.015868 2.998 0.002719 **
## lstat -0.064698 0.045032 -1.437 0.150799
## medv -0.174111 0.040141 -4.338 1.44e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 562.84 on 405 degrees of freedom
## Residual deviance: 191.80 on 395 degrees of freedom
## AIC: 213.8
##
## Number of Fisher Scoring iterations: 9
# Check VIF for multicollinearity
vif(stepwise_model)
## zn indus chas nox dis tax ptratio black
## 1.645794 2.675880 1.291171 4.035112 3.145968 1.533963 1.792019 1.186190
## lstat medv
## 2.099531 2.801855
# Tuned Logistic Regression
logit_model = glm(CrimeLevel ~ zn + indus + chas + nox + dis + tax +
ptratio + black + lstat + medv, data= train, family= "binomial")
# Prediction probabilities
logit_probs = predict(logit_model, test, type= 'response')
# Classifications
logit_class = ifelse(logit_probs > 0.5, 'High', 'Low')
logit_class = as.factor(logit_class)
# Confusion Matrix
confusionMatrix(logit_class, test$CrimeLevel, positive= 'High')
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 6 44
## Low 44 6
##
## Accuracy : 0.12
## 95% CI : (0.0636, 0.2002)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.76
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.12
## Specificity : 0.12
## Pos Pred Value : 0.12
## Neg Pred Value : 0.12
## Prevalence : 0.50
## Detection Rate : 0.06
## Detection Prevalence : 0.50
## Balanced Accuracy : 0.12
##
## 'Positive' Class : High
##
Perform LDA on the training data.
The test error for the LDA model is 17%.
# LDA Model
lda_model = lda(CrimeLevel ~ zn + indus + chas + nox + dis + tax +
ptratio + black + lstat + medv, data= train)
summary(lda_model)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 20 -none- numeric
## scaling 10 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 1 -none- list
# Classifications
lda_class = predict(lda_model, test)$class
# Confusion Matrix
confusionMatrix(lda_class, test$CrimeLevel, positive= 'High')
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 39 6
## Low 11 44
##
## Accuracy : 0.83
## 95% CI : (0.7418, 0.8977)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 6.549e-12
##
## Kappa : 0.66
##
## Mcnemar's Test P-Value : 0.332
##
## Sensitivity : 0.7800
## Specificity : 0.8800
## Pos Pred Value : 0.8667
## Neg Pred Value : 0.8000
## Prevalence : 0.5000
## Detection Rate : 0.3900
## Detection Prevalence : 0.4500
## Balanced Accuracy : 0.8300
##
## 'Positive' Class : High
##
Perform QDA on the training data.
The test error for the QDA model is 12%.
# QDA Model
qda_model = qda(CrimeLevel ~ zn + indus + chas + nox + dis + tax +
ptratio + black + lstat + medv, data= train)
qda_model
## Call:
## qda(CrimeLevel ~ zn + indus + chas + nox + dis + tax + ptratio +
## black + lstat + medv, data = train)
##
## Prior probabilities of groups:
## High Low
## 0.5 0.5
##
## Group means:
## zn indus chas1 nox dis tax ptratio
## High 1.29064 15.196453 0.08866995 0.6319064 2.514907 506.8719 19.01724
## Low 22.05665 6.909951 0.05418719 0.4702695 5.080020 307.8522 17.88374
## black lstat medv
## High 322.3817 15.525222 20.62167
## Low 390.1891 9.244433 25.00345
# Classifications
qda_class = predict(qda_model, test)$class
# Confusion Matrix
confusionMatrix(qda_class, test$CrimeLevel, positive= 'High')
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 43 5
## Low 7 45
##
## Accuracy : 0.88
## 95% CI : (0.7998, 0.9364)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 9.557e-16
##
## Kappa : 0.76
##
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.8600
## Specificity : 0.9000
## Pos Pred Value : 0.8958
## Neg Pred Value : 0.8654
## Prevalence : 0.5000
## Detection Rate : 0.4300
## Detection Prevalence : 0.4800
## Balanced Accuracy : 0.8800
##
## 'Positive' Class : High
##
Perform KNN on the training data.
The test errors for the KNN model are 6%, 5%, 6%, 8%, 8%, 10%, 12%. The value of K that performed the best was K = 3.
#Combine predictors from train and test data into a single matrix
train.X = cbind(train$zn, train$indus, train$chas, train$nox, train$dis, train$tax, train$ptratio, train$black, train$lstat, train$medv)
test.X = cbind(test$zn, test$indus, test$chas, test$nox, test$dis, test$tax, test$ptratio, test$black, test$lstat, test$medv)
# KNN model
k_values = c(1, 3, 5, 7, 10, 15, 20)
test_errors = numeric(length(k_values))
for(i in seq_along(k_values)) {
# Set K
k = k_values[i]
# Classification
knn_class = knn(train.X, test.X, train$CrimeLevel, k=k)
# Confusion Matrix
confusion_m = confusionMatrix(knn_class, test$CrimeLevel, positive= 'High')
# Test Errors
test_errors[i] = 1 - confusion_m$overall['Accuracy']
}
results = data.frame(K = k_values, Test_Error= test_errors)
print(results)
## K Test_Error
## 1 1 0.06
## 2 3 0.05
## 3 5 0.06
## 4 7 0.08
## 5 10 0.08
## 6 15 0.10
## 7 20 0.12
Perform naive Bayes on the training data.
The test error for the naive Bayes model is 16%.
# naive Bayes Model
bayes_model = naiveBayes(CrimeLevel ~ zn + indus + chas + nox + dis + tax +
ptratio + black + lstat + medv, data= train)
bayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## High Low
## 0.5 0.5
##
## Conditional probabilities:
## zn
## Y [,1] [,2]
## High 1.29064 4.948173
## Low 22.05665 29.655831
##
## indus
## Y [,1] [,2]
## High 15.196453 5.549251
## Low 6.909951 5.351984
##
## chas
## Y 0 1
## High 0.91133005 0.08866995
## Low 0.94581281 0.05418719
##
## nox
## Y [,1] [,2]
## High 0.6319064 0.09188756
## Low 0.4702695 0.05564134
##
## dis
## Y [,1] [,2]
## High 2.514907 1.064752
## Low 5.080020 2.097736
##
## tax
## Y [,1] [,2]
## High 506.8719 169.32877
## Low 307.8522 88.18852
##
## ptratio
## Y [,1] [,2]
## High 19.01724 2.343740
## Low 17.88374 1.798509
##
## black
## Y [,1] [,2]
## High 322.3817 121.073494
## Low 390.1891 9.894726
##
## lstat
## Y [,1] [,2]
## High 15.525222 7.655475
## Low 9.244433 4.702821
##
## medv
## Y [,1] [,2]
## High 20.62167 10.564293
## Low 25.00345 7.129359
# Classifications
bayes_class = predict(bayes_model, test)
# Confusion Matrix
confusionMatrix(bayes_class, test$CrimeLevel, positive= 'High')
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 40 6
## Low 10 44
##
## Accuracy : 0.84
## 95% CI : (0.7532, 0.9057)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.303e-12
##
## Kappa : 0.68
##
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.8000
## Specificity : 0.8800
## Pos Pred Value : 0.8696
## Neg Pred Value : 0.8148
## Prevalence : 0.5000
## Detection Rate : 0.4000
## Detection Prevalence : 0.4600
## Balanced Accuracy : 0.8400
##
## 'Positive' Class : High
##