# Load dependencies
pacman::p_load(ISLR2, dplyr, car, caret, MASS, class, e1071, pROC)

Question 13

Part a)

Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?

Based on the numerical summaries Lag 1, 2, and 3 have the same median. All stock returns have the same maximum and minimum value. The correlation matrix reveals that most unique predictor pairs have correlation coefficients near zero, except for the strong correlation between year and volume, which has an r-value of 0.84. This suggests minimal to no correlation between today’s returns and previous days’ returns, but Volume is increasing over time.

The graphical summaries, in the form of box plots, shows all Lags and Today are normally distributed. They also seem to all have outliers below Q1. In Volume, we can see it is heavily skewed right.

# View the data
df = Weekly
head(df)

##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down

# Summary stats of numerical columns
summary(df)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

# Correlation Matrix
cor(dplyr::select(df, where(is.numeric)))

##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

# Boxplots of numerical columns
par (mfrow = c(1 ,5))
boxplot(df$Lag1, main= 'Lag1')
boxplot(df$Lag2, main= 'Lag2')
boxplot(df$Lag3, main= 'Lag3')
boxplot(df$Lag4, main= 'Lag4')
boxplot(df$Lag5, main= 'Lag5')

boxplot(df$Volume, main= 'Volume')
boxplot(df$Today, main= 'Today')

Part b)

Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?

Yes, only one predictor is statistically significant. With a significance level of 0.05 we can see Lag2 is significant with a p-value of 0.0296.

# First Logistic Regression Model
logit_model = glm(Direction ~ . -Today -Year, data=df, family= 'binomial')
summary(logit_model)

## 
## Call:
## glm(formula = Direction ~ . - Today - Year, family = "binomial", 
##     data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Part c)

Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.

The logistic regression model produces an overall accuracy of 56.11% where 611 observations were correctly predicted out 1089. With a sensitivity of 92.07% and specificity of 11.16%, this indicates that the model is performing well at identifying when the return will go up, but struggling significantly when returns will go down.

# Prediction probabilities
pred_prob = predict(logit_model, type= 'response')

# Classifications
pred_class = rep('Down', 1089)
pred_class[pred_prob > 0.5] = 'Up'
pred_class = as.factor(pred_class)

# Confusion Matrix
confusionMatrix(pred_class, df$Direction, , positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down  Up
##       Down   54  48
##       Up    430 557
##                                          
##                Accuracy : 0.5611         
##                  95% CI : (0.531, 0.5908)
##     No Information Rate : 0.5556         
##     P-Value [Acc > NIR] : 0.369          
##                                          
##                   Kappa : 0.035          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9207         
##             Specificity : 0.1116         
##          Pos Pred Value : 0.5643         
##          Neg Pred Value : 0.5294         
##              Prevalence : 0.5556         
##          Detection Rate : 0.5115         
##    Detection Prevalence : 0.9063         
##       Balanced Accuracy : 0.5161         
##                                          
##        'Positive' Class : Up             
##

Part d)

Now ft the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).

The second logistic regression model produces an overall accuracy of 62.5% where 65 observations were correctly predicted out 104. With a sensitivity of 91.8% and specificity of 20.93%. A better model than the first but not by much as the specificity is still pretty low.

# Split data into train and test
train = subset(df, Year <= 2008)
test = subset(df, Year > 2008)

# Second Logistic Regression Model
logit_model_2 = glm(Direction ~ Lag2, data= train, family= 'binomial')
summary(logit_model_2)

## 
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4

# Prediction probabilities
logit_probs = predict(logit_model_2, test, type= 'response')

# Classifications
logit_class = ifelse(logit_probs > 0.5, 'Up', 'Down')
logit_class = as.factor(logit_class)

# Confusion Matrix
confusionMatrix(logit_class, test$Direction, positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    9  5
##       Up     34 56
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.5247, 0.718)
##     No Information Rate : 0.5865         
##     P-Value [Acc > NIR] : 0.2439         
##                                          
##                   Kappa : 0.1414         
##                                          
##  Mcnemar's Test P-Value : 7.34e-06       
##                                          
##             Sensitivity : 0.9180         
##             Specificity : 0.2093         
##          Pos Pred Value : 0.6222         
##          Neg Pred Value : 0.6429         
##              Prevalence : 0.5865         
##          Detection Rate : 0.5385         
##    Detection Prevalence : 0.8654         
##       Balanced Accuracy : 0.5637         
##                                          
##        'Positive' Class : Up             
##

Part e)

Repeat (d) using LDA.

The Linear Discriminate Analysis model produces an overall accuracy of 62.5% where 65 observations were correctly predicted out 104 producing a similar model to the previous Logistic Regression model. With a sensitivity of 91.8% and specificity of 20.93%.

# LDA Model
lda_model = lda(Direction ~ Lag2, data= train)
lda_model

## Call:
## lda(Direction ~ Lag2, data = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162

# Classifications
lda_class = predict(lda_model, test)$class

# Confusion Matrix
confusionMatrix(lda_class, test$Direction, positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    9  5
##       Up     34 56
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.5247, 0.718)
##     No Information Rate : 0.5865         
##     P-Value [Acc > NIR] : 0.2439         
##                                          
##                   Kappa : 0.1414         
##                                          
##  Mcnemar's Test P-Value : 7.34e-06       
##                                          
##             Sensitivity : 0.9180         
##             Specificity : 0.2093         
##          Pos Pred Value : 0.6222         
##          Neg Pred Value : 0.6429         
##              Prevalence : 0.5865         
##          Detection Rate : 0.5385         
##    Detection Prevalence : 0.8654         
##       Balanced Accuracy : 0.5637         
##                                          
##        'Positive' Class : Up             
##

Part f)

Repeat (d) using QDA.

The Quadratic Discriminate Analysis model produces an overall accuracy of 58.65% where 61 observations were correctly predicted out 104. With a sensitivity of 100% and specificity of 0%. The best resulting sensitivity, but at the cost of unable to predict cases that are “Down”.

# QDA Model
qda_model = qda(Direction ~ Lag2, data= train)
qda_model

## Call:
## qda(Direction ~ Lag2, data = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581

# Classifications
qda_class = predict(qda_model, test)$class

# Confusion Matrix
confusionMatrix(qda_class, test$Direction, positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    0  0
##       Up     43 61
##                                           
##                Accuracy : 0.5865          
##                  95% CI : (0.4858, 0.6823)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.5419          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.504e-10       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.5865          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.5865          
##          Detection Rate : 0.5865          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Up              
##

Part g)

Repeat (d) using KNN with K = 1.

The K-Nearest Neighbors model produces an overall accuracy of 50% where 52 observations were correctly predicted out 104. With a sensitivity of 50.82% and specificity of 48.84%. This model produces the lowest accuracy, however sees a significant improvement in predicting when the returns go down.

# Combine predictors from train and test data into a single matrix
train.X = cbind(train$Lag2)
test.X = cbind(test$Lag2)

# KNN Model classifications
knn_class = knn(train.X, test.X, train$Direction, k=1)

# Confusion Matrix
confusionMatrix(knn_class, test$Direction, positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   21 29
##       Up     22 32
##                                          
##                Accuracy : 0.5096         
##                  95% CI : (0.4097, 0.609)
##     No Information Rate : 0.5865         
##     P-Value [Acc > NIR] : 0.9540         
##                                          
##                   Kappa : 0.0127         
##                                          
##  Mcnemar's Test P-Value : 0.4008         
##                                          
##             Sensitivity : 0.5246         
##             Specificity : 0.4884         
##          Pos Pred Value : 0.5926         
##          Neg Pred Value : 0.4200         
##              Prevalence : 0.5865         
##          Detection Rate : 0.3077         
##    Detection Prevalence : 0.5192         
##       Balanced Accuracy : 0.5065         
##                                          
##        'Positive' Class : Up             
##

Part h)

Repeat (d) using naive Bayes.

The naive Bayes model produces an overall accuracy of 58.65% where 61 observations were correctly predicted out 104. With a sensitivity of 100% and specificity of 0%. This results were similar to the QDA model.

# naive Bayes Model
bayes_model = naiveBayes(Direction ~ Lag2, data=train)
bayes_model

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Conditional probabilities:
##       Lag2
## Y             [,1]     [,2]
##   Down -0.03568254 2.199504
##   Up    0.26036581 2.317485

# Classifications
bayes_class = predict(bayes_model, test)

# Confusion Matrix
confusionMatrix(bayes_class, test$Direction, positive='Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    0  0
##       Up     43 61
##                                           
##                Accuracy : 0.5865          
##                  95% CI : (0.4858, 0.6823)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.5419          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.504e-10       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.5865          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.5865          
##          Detection Rate : 0.5865          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Up              
##

Part i)

Which of these methods appears to provide the best results on this data?

The model that appears to provide the best results is the Logistic Regression model with Lag2 as the single predictor on the split test and training dataset. Although, the LDA model did produce the same results with both having an accuracy of 62.5%, sensitivity of 91.8%, and specificity of 20.93%. These models can predict well when a stock return would go up, but not so much if it were to go down.

Part j)

Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier

Formal normality tests were conducted on the numerical predictor variables at a significance level of 0.05, revealing p-values below this threshold for all variables. As a result, log transformations were applied to address potential skewness in their distributions. Two-way interaction terms were also conducted.

After experimenting with both two-way interaction terms and log() transformations, I found that the Logistic Regression model with transformations of all lag features and an interaction term between Lag5 and Volume produced the best results. This model achieved an accuracy of 61.54%, with a sensitivity of 63.93% and specificity of 58.14%. While the new model showed a relative decrease of 1.54% in accuracy, it showed a much more balanced sensitivity and specificity, as the previous model had a high sensitivity of 91.8% with a much lower specificity of 20.93%.

For the K-Nearest Neighbors (KNN) model, experimenting with different values of K revealed that K = 3 gave the best results. However, it could only achieve a test error of 47.11%, or 52.88% accuracy, indicating that it didn’t perform as well as the tuned logistic regression model.

# Test for normality
shapiro.test(train$Lag1)

## 
##  Shapiro-Wilk normality test
## 
## data:  train$Lag1
## W = 0.94096, p-value < 2.2e-16

shapiro.test(train$Lag2)

## 
##  Shapiro-Wilk normality test
## 
## data:  train$Lag2
## W = 0.94084, p-value < 2.2e-16

shapiro.test(train$Lag3)

## 
##  Shapiro-Wilk normality test
## 
## data:  train$Lag3
## W = 0.94141, p-value < 2.2e-16

shapiro.test(train$Lag4)

## 
##  Shapiro-Wilk normality test
## 
## data:  train$Lag4
## W = 0.94145, p-value < 2.2e-16

shapiro.test(train$Lag5)

## 
##  Shapiro-Wilk normality test
## 
## data:  train$Lag5
## W = 0.94168, p-value < 2.2e-16

shapiro.test(train$Volume)

## 
##  Shapiro-Wilk normality test
## 
## data:  train$Volume
## W = 0.7677, p-value < 2.2e-16

# Transforming all numerical features with log()
train$Lag1 = log(train$Lag1 + 18.195 + 1)
train$Lag2 = log(train$Lag2 + 18.195 + 1)
train$Lag3 = log(train$Lag3 + 18.195 + 1)
train$Lag4 = log(train$Lag4 + 18.195 + 1)
train$Lag5 = log(train$Lag5 + 18.195 + 1)
train$Volume = log(train$Volume)

test$Lag1 = log(test$Lag1 + 18.195 + 1)
test$Lag2 = log(test$Lag2 + 18.195 + 1)
test$Lag3 = log(test$Lag3 + 18.195 + 1)
test$Lag4 = log(test$Lag4 + 18.195 + 1)
test$Lag5 = log(test$Lag5 + 18.195 + 1)
test$Volume = log(test$Volume)

# Stepwise Logistic Regression with transformations
stepwise_model = step(glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data= train, family= 'binomial'), direction= 'both', trace= 0)

summary(stepwise_model)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.4816     2.1661   0.222   0.8241  
## Lag1         -0.9674     0.5050  -1.916   0.0554 .
## Lag2          0.8757     0.5106   1.715   0.0864 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1346.8  on 982  degrees of freedom
## AIC: 1352.8
## 
## Number of Fisher Scoring iterations: 4

# Prediction probabilities
pred_prob = predict(stepwise_model, test, type= 'response')

# Classifications
pred_class = ifelse(pred_prob > 0.5, 'Up', 'Down')
pred_class = as.factor(pred_class)

# Confusion Matrix
confusionMatrix(pred_class, test$Direction, , positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    6  6
##       Up     37 55
##                                           
##                Accuracy : 0.5865          
##                  95% CI : (0.4858, 0.6823)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.5419          
##                                           
##                   Kappa : 0.0461          
##                                           
##  Mcnemar's Test P-Value : 4.763e-06       
##                                           
##             Sensitivity : 0.9016          
##             Specificity : 0.1395          
##          Pos Pred Value : 0.5978          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.5865          
##          Detection Rate : 0.5288          
##    Detection Prevalence : 0.8846          
##       Balanced Accuracy : 0.5206          
##                                           
##        'Positive' Class : Up              
##

# Check VIF
vif(stepwise_model)

##    Lag1    Lag2 
## 1.00167 1.00167

# Stepwise Logistic Regression with transformations and two-way interactions
stepwise_model = step(glm(Direction ~ (Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume)^2, data= train, family= 'binomial'), direction= 'both', trace= 0)

summary(stepwise_model)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume + Lag3:Lag4 + Lag5:Volume, family = "binomial", data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  46.4780    25.8458   1.798   0.0721 .
## Lag1         -1.0922     0.5212  -2.095   0.0361 *
## Lag2          0.8451     0.5025   1.682   0.0926 .
## Lag3        -13.9783     8.4649  -1.651   0.0987 .
## Lag4        -14.5777     8.7634  -1.663   0.0962 .
## Lag5         -0.8739     0.5290  -1.652   0.0986 .
## Volume       -2.6039     1.3001  -2.003   0.0452 *
## Lag3:Lag4     4.7432     2.9047   1.633   0.1025  
## Lag5:Volume   0.8492     0.4395   1.932   0.0534 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1335.0  on 976  degrees of freedom
## AIC: 1353
## 
## Number of Fisher Scoring iterations: 5

# Check VIF
vif(stepwise_model, type = 'predictor')

## Warning in vif.lm(stepwise_model, type = "predictor"): type = 'predictor' is available only for unweighted linear models;
##   type = 'terms' will be used

##        Lag1        Lag2        Lag3        Lag4        Lag5      Volume 
##    1.019757    1.029104  226.311626  286.593598    1.259099  394.104782 
##   Lag3:Lag4 Lag5:Volume 
##  451.843448  392.449032

# Model with no multicollinearity
logit_model = glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Lag5:Volume, 
    family = "binomial", data = train)

summary(logit_model)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Lag5:Volume, family = "binomial", data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  2.59168    3.15878   0.820   0.4119  
## Lag1        -1.01109    0.51166  -1.976   0.0481 *
## Lag2         0.78404    0.50826   1.543   0.1229  
## Lag3        -0.31769    0.47990  -0.662   0.5080  
## Lag4        -0.14421    0.43221  -0.334   0.7386  
## Lag5        -0.12634    0.45252  -0.279   0.7801  
## Lag5:Volume -0.03126    0.02248  -1.390   0.1644  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1344.4  on 978  degrees of freedom
## AIC: 1358.4
## 
## Number of Fisher Scoring iterations: 4

# Prediction probabilities
pred_prob = predict(logit_model, test, type= 'response')

# Classifications
pred_class = ifelse(pred_prob > 0.5, 'Up', 'Down')
pred_class = as.factor(pred_class)

# Confusion Matrix
confusionMatrix(pred_class, test$Direction, , positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   25 22
##       Up     18 39
##                                           
##                Accuracy : 0.6154          
##                  95% CI : (0.5149, 0.7091)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.3110          
##                                           
##                   Kappa : 0.2178          
##                                           
##  Mcnemar's Test P-Value : 0.6353          
##                                           
##             Sensitivity : 0.6393          
##             Specificity : 0.5814          
##          Pos Pred Value : 0.6842          
##          Neg Pred Value : 0.5319          
##              Prevalence : 0.5865          
##          Detection Rate : 0.3750          
##    Detection Prevalence : 0.5481          
##       Balanced Accuracy : 0.6104          
##                                           
##        'Positive' Class : Up              
##

# Check VIF
vif(logit_model, type= 'predictor')

## Warning in vif.lm(logit_model, type = "predictor"): type = 'predictor' is available only for unweighted linear models;
##   type = 'terms' will be used

##        Lag1        Lag2        Lag3        Lag4        Lag5 Lag5:Volume 
##    1.022782    1.030132    1.025558    1.028924    1.015863    1.040996

# LDA Model with transformations
lda_model = lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data= train)

# Classifications
lda_class = predict(lda_model, test)$class

# Confusion Matrix
confusionMatrix(lda_class, test$Direction, positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   25 24
##       Up     18 37
##                                           
##                Accuracy : 0.5962          
##                  95% CI : (0.4954, 0.6913)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.4626          
##                                           
##                   Kappa : 0.1842          
##                                           
##  Mcnemar's Test P-Value : 0.4404          
##                                           
##             Sensitivity : 0.6066          
##             Specificity : 0.5814          
##          Pos Pred Value : 0.6727          
##          Neg Pred Value : 0.5102          
##              Prevalence : 0.5865          
##          Detection Rate : 0.3558          
##    Detection Prevalence : 0.5288          
##       Balanced Accuracy : 0.5940          
##                                           
##        'Positive' Class : Up              
##

# LDA Model with transformations and two-way interactions 
lda_model = lda(Direction ~ (Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume)^2, data= train)

# Classifications
lda_class = predict(lda_model, test)$class

# Confusion Matrix
confusionMatrix(lda_class, test$Direction, positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   18 18
##       Up     25 43
##                                           
##                Accuracy : 0.5865          
##                  95% CI : (0.4858, 0.6823)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.5419          
##                                           
##                   Kappa : 0.1266          
##                                           
##  Mcnemar's Test P-Value : 0.3602          
##                                           
##             Sensitivity : 0.7049          
##             Specificity : 0.4186          
##          Pos Pred Value : 0.6324          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.5865          
##          Detection Rate : 0.4135          
##    Detection Prevalence : 0.6538          
##       Balanced Accuracy : 0.5618          
##                                           
##        'Positive' Class : Up              
##

# QDA Model with transformations
qda_model = qda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data= train)

# Classifications
qda_class = predict(qda_model, test)$class

# Confusion Matrix
confusionMatrix(qda_class, test$Direction, positive= 'Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   17 26
##       Up     26 35
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.4003, 0.5997)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.97            
##                                           
##                   Kappa : -0.0309         
##                                           
##  Mcnemar's Test P-Value : 1.00            
##                                           
##             Sensitivity : 0.5738          
##             Specificity : 0.3953          
##          Pos Pred Value : 0.5738          
##          Neg Pred Value : 0.3953          
##              Prevalence : 0.5865          
##          Detection Rate : 0.3365          
##    Detection Prevalence : 0.5865          
##       Balanced Accuracy : 0.4846          
##                                           
##        'Positive' Class : Up              
##

# Combine predictors from train and test data into a single matrix
train.X = cbind(train$Lag1, train$Lag2, train$Lag3, train$Lag4, train$Lag5, train$Volume)
test.X = cbind(test$Lag1, test$Lag2, test$Lag3, test$Lag4, test$Lag5, test$Volume)

# KNN Model with transformations
k_values = c(1, 3, 5, 7, 10, 15, 20)
test_errors = numeric(length(k_values))

for(i in seq_along(k_values)) {
  # Set K
  k = k_values[i]
  
  # Classification
  knn_class = knn(train.X, test.X, train$Direction, k=k)
  
  # Confusion Matrix
  confusion_m = confusionMatrix(knn_class, test$Direction, positive= 'Up')
  
  # Test Errors
  test_errors[i] = 1 - confusion_m$overall['Accuracy']
}

results = data.frame(K = k_values, Test_Error= test_errors)
print(results)

##    K Test_Error
## 1  1  0.5192308
## 2  3  0.4711538
## 3  5  0.5192308
## 4  7  0.5096154
## 5 10  0.4903846
## 6 15  0.5288462
## 7 20  0.5480769

# naive Bayes Model with transformations
bayes_model = naiveBayes(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data=train)
bayes_model

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Conditional probabilities:
##       Lag1
## Y          [,1]      [,2]
##   Down 2.963004 0.1165716
##   Up   2.943562 0.1750842
## 
##       Lag2
## Y          [,1]      [,2]
##   Down 2.942547 0.1771446
##   Up   2.960461 0.1275173
## 
##       Lag3
## Y          [,1]      [,2]
##   Down 2.956503 0.1211246
##   Up   2.948639 0.1732251
## 
##       Lag4
## Y          [,1]      [,2]
##   Down 2.951281 0.1863346
##   Up   2.952810 0.1174156
## 
##       Lag5
## Y          [,1]      [,2]
##   Down 2.954334 0.1850852
##   Up   2.950197 0.1191650
## 
##       Volume
## Y            [,1]      [,2]
##   Down -0.2386605 1.0047308
##   Up   -0.3281402 0.9924812

# Classifications
bayes_class = predict(bayes_model, test)

# Confusion Matrix
confusionMatrix(bayes_class, test$Direction, positive='Up')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   17 28
##       Up     26 33
##                                           
##                Accuracy : 0.4808          
##                  95% CI : (0.3817, 0.5809)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.9885          
##                                           
##                   Kappa : -0.0632         
##                                           
##  Mcnemar's Test P-Value : 0.8918          
##                                           
##             Sensitivity : 0.5410          
##             Specificity : 0.3953          
##          Pos Pred Value : 0.5593          
##          Neg Pred Value : 0.3778          
##              Prevalence : 0.5865          
##          Detection Rate : 0.3173          
##    Detection Prevalence : 0.5673          
##       Balanced Accuracy : 0.4682          
##                                           
##        'Positive' Class : Up              
##

Question 14

Part a)

Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.

The best performing model was QDA, with a test error of 6.33% or accuracy of 93.67%. With predictors displacement, horsepower, and weight produced a sensitivity of 94.74% and specificity of 92.68% proving to be a balanced model in predicting if mpg would be above or below the median. With 95% confidence, the model’s true accuracy is expected to fall between 85.84% and 97.91% when applied to a larger dataset.

# Read in Auto data
auto = read.csv('Auto.csv', na.strings= '?', stringsAsFactors= T)
head(auto)

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

# Create mpg01 the response variable
mpg_median = median(auto$mpg)
auto$mpg01 = ifelse(auto$mpg > mpg_median, 1, 0)

# Convert categorical columns to factor
auto$cylinders = as.factor(auto$cylinders)
auto$origin = as.factor(auto$origin)
auto$mpg01 = as.factor(auto$mpg01)

glimpse(auto)

## Rows: 397
## Columns: 10
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders    <fct> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower   <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight       <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year         <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin       <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name         <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…
## $ mpg01        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …

Part b)

Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatter plots and box plots may be useful tools to answer this question. Describe your findings.

The scatter plot matrix suggests a negative linear relationship between mpg and the variables displacement, horsepower, weight, and acceleration, meaning as these features increase, mpg tends to decrease. Additionally, the box plots comparing the two levels of mpg01 (whether a vehicle has mpg above or below the median) show significant differences for displacement, horsepower, and weight. These features are likely important predictors, as vehicles with mpg above the median tend to have lower values for these three features.

# Scatter plot
pairs(Filter(is.numeric, (auto)))

# Box plots of numerical predictors
par (mfrow = c(1 ,5))
boxplot(auto$displacement, main= 'Displacement')
boxplot(auto$horsepower, main= 'Horsepower')
boxplot(auto$weight, main= 'Weight')
boxplot(auto$acceleration, main= 'Acceleration')
boxplot(auto$year, main= 'Year')

# Box plots across the two levels
par (mfrow = c(1 ,5))
boxplot(displacement ~ mpg01, data= auto, xlab= '')
boxplot(horsepower ~ mpg01, data= auto, xlab= '')
boxplot(weight ~ mpg01, data= auto, xlab= '')
boxplot(acceleration ~ mpg01, data= auto, xlab= '')
boxplot(year ~ mpg01, data= auto, xlab= '')

Part c)

Split the data into a training set and a test set.

set.seed(42)

# Create an index for training data
index = createDataPartition(auto$mpg01, p=0.8, list= FALSE)

# Subset into train and test
train = auto[index, ]
test = auto[-index, ]

# Remove null values
train = na.omit(train)
test = na.omit(test)

# Check dimensions
dim(train)

## [1] 313  10

dim(test)

## [1] 79 10

Part d)

Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

The test error for the LDA model is 10.13%.

# LDA Model
lda_model = lda(mpg01 ~ displacement + horsepower + weight, data= train)
lda_model

## Call:
## lda(mpg01 ~ displacement + horsepower + weight, data = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5239617 0.4760383 
## 
## Group means:
##   displacement horsepower   weight
## 0     263.4512   125.6037 3551.067
## 1     114.9497    78.1745 2323.262
## 
## Coefficients of linear discriminants:
##                       LD1
## displacement -0.007006962
## horsepower    0.005550055
## weight       -0.001181492

# Classifications
lda_class = predict(lda_model, test)$class

# Confusion Matrix
confusionMatrix(lda_class, test$mpg01, positive= '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 34  1
##          1  7 37
##                                           
##                Accuracy : 0.8987          
##                  95% CI : (0.8102, 0.9553)
##     No Information Rate : 0.519           
##     P-Value [Acc > NIR] : 5.061e-13       
##                                           
##                   Kappa : 0.7983          
##                                           
##  Mcnemar's Test P-Value : 0.0771          
##                                           
##             Sensitivity : 0.9737          
##             Specificity : 0.8293          
##          Pos Pred Value : 0.8409          
##          Neg Pred Value : 0.9714          
##              Prevalence : 0.4810          
##          Detection Rate : 0.4684          
##    Detection Prevalence : 0.5570          
##       Balanced Accuracy : 0.9015          
##                                           
##        'Positive' Class : 1               
##

Part e)

Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

The test error for the QDA model is 6.33%.

# QDA Model
qda_model = qda(mpg01 ~ displacement + horsepower + weight, data= train)
qda_model

## Call:
## qda(mpg01 ~ displacement + horsepower + weight, data = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5239617 0.4760383 
## 
## Group means:
##   displacement horsepower   weight
## 0     263.4512   125.6037 3551.067
## 1     114.9497    78.1745 2323.262

# Classifications
qda_class = predict(qda_model, test)$class

# Confusion Matrix
confusionMatrix(qda_class, test$mpg01, positive= '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 38  2
##          1  3 36
##                                           
##                Accuracy : 0.9367          
##                  95% CI : (0.8584, 0.9791)
##     No Information Rate : 0.519           
##     P-Value [Acc > NIR] : 5.214e-16       
##                                           
##                   Kappa : 0.8734          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9474          
##             Specificity : 0.9268          
##          Pos Pred Value : 0.9231          
##          Neg Pred Value : 0.9500          
##              Prevalence : 0.4810          
##          Detection Rate : 0.4557          
##    Detection Prevalence : 0.4937          
##       Balanced Accuracy : 0.9371          
##                                           
##        'Positive' Class : 1               
##

Part f)

Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

The test error for the Logistic Regression model is 11.39%.

# Logistic Regression Model
logit_model = glm(mpg01 ~ displacement + horsepower + weight, data= train, family= 'binomial')
summary(logit_model)

## 
## Call:
## glm(formula = mpg01 ~ displacement + horsepower + weight, family = "binomial", 
##     data = train)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  11.2126397  1.6814695   6.668 2.59e-11 ***
## displacement -0.0107743  0.0059813  -1.801  0.07165 .  
## horsepower   -0.0368146  0.0148766  -2.475  0.01334 *  
## weight       -0.0021747  0.0007484  -2.906  0.00366 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 433.19  on 312  degrees of freedom
## Residual deviance: 181.68  on 309  degrees of freedom
## AIC: 189.68
## 
## Number of Fisher Scoring iterations: 7

# Probabilities
pred_prob = predict(logit_model, test, type= 'response')

# Classifications
pred_class = ifelse(pred_prob > 0.5, '1', '0')
pred_class = as.factor(pred_class)

# Confusion Matrix
confusionMatrix(pred_class, test$mpg01, positive= '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 36  4
##          1  5 34
##                                           
##                Accuracy : 0.8861          
##                  95% CI : (0.7947, 0.9466)
##     No Information Rate : 0.519           
##     P-Value [Acc > NIR] : 3.771e-12       
##                                           
##                   Kappa : 0.772           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8947          
##             Specificity : 0.8780          
##          Pos Pred Value : 0.8718          
##          Neg Pred Value : 0.9000          
##              Prevalence : 0.4810          
##          Detection Rate : 0.4304          
##    Detection Prevalence : 0.4937          
##       Balanced Accuracy : 0.8864          
##                                           
##        'Positive' Class : 1               
##

Part g)

Perform naive Bayes on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

The test error for the naive Bayes model is 8.86%.

# naive Bayes Model
bayes_model = naiveBayes(mpg01 ~ displacement + horsepower + weight, data= train)
bayes_model

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         0         1 
## 0.5239617 0.4760383 
## 
## Conditional probabilities:
##    displacement
## Y       [,1]     [,2]
##   0 263.4512 91.50422
##   1 114.9497 36.85544
## 
##    horsepower
## Y       [,1]     [,2]
##   0 125.6037 36.81701
##   1  78.1745 15.45776
## 
##    weight
## Y       [,1]     [,2]
##   0 3551.067 688.9551
##   1 2323.262 385.2206

# Classifications
bayes_class = predict(bayes_model, test)

# Confusion Matrix
confusionMatrix(bayes_class, test$mpg01, positive= '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 36  2
##          1  5 36
##                                           
##                Accuracy : 0.9114          
##                  95% CI : (0.8259, 0.9636)
##     No Information Rate : 0.519           
##     P-Value [Acc > NIR] : 5.959e-14       
##                                           
##                   Kappa : 0.823           
##                                           
##  Mcnemar's Test P-Value : 0.4497          
##                                           
##             Sensitivity : 0.9474          
##             Specificity : 0.8780          
##          Pos Pred Value : 0.8780          
##          Neg Pred Value : 0.9474          
##              Prevalence : 0.4810          
##          Detection Rate : 0.4557          
##    Detection Prevalence : 0.5190          
##       Balanced Accuracy : 0.9127          
##                                           
##        'Positive' Class : 1               
##

Part h)

Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?

The test errors for the KNN model are 8.86%, 10.13%, 10.13%, 10.13%, 11.39%, 12.66%, 12.66%. The value of K that performed the best was K = 1.

# Combine predictors from train and test data into a single matrix
train.X = cbind(train$displacement, train$horsepower, train$weight)
test.X = cbind(test$displacement, test$horsepower, test$weight)

# KNN Model
k_values = c(1, 3, 5, 7, 10, 15, 20)
test_errors = numeric(length(k_values))

for(i in seq_along(k_values)) {
  # Set K
  k = k_values[i]
  
  # Classification
  knn_class = knn(train.X, test.X, train$mpg01, k=k)
  
  # Confusion Matrix
  confusion_m = confusionMatrix(knn_class, test$mpg01, positive= '1')
  
  # Test Errors
  test_errors[i] = 1 - confusion_m$overall['Accuracy']
}

results = data.frame(K = k_values, Test_Error= test_errors)
print(results)

##    K Test_Error
## 1  1 0.08860759
## 2  3 0.10126582
## 3  5 0.10126582
## 4  7 0.10126582
## 5 10 0.11392405
## 6 15 0.12658228
## 7 20 0.12658228

Question 16.

Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, KNN models, and naive Bayes using various subsets of the predictors. Describe your findings. Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.

The best performing model was QDA, with a test error of 12% or accuracy of 88%. With predictors zn, indus, chas, nox, dis, tax, ptratio, black, lstat, and medv produced a sensitivity of 86% and specificity of 90% proving to be a balanced model in predicting if crime rate would be above or below the median. With 95% confidence, the model’s true accuracy is expected to fall between 79.98% and 93.64% when applied to a larger dataset.

Part a)

Create a binary variable.

# Read in Boston data
boston = Boston
head(boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

# Check feature that looks to be binary
count(boston, chas)

##   chas   n
## 1    0 471
## 2    1  35

# Create Crime Level, response variable
crime_median = median(boston$crim)
boston$CrimeLevel = ifelse(boston$crim > crime_median, 'High', 'Low')
boston$CrimeLevel = as.factor(boston$CrimeLevel)

# Convert chas('Next to Charles River') column to factor
boston$chas = as.factor(boston$chas)

Part b)

Explore the data.

Based on the scatter plot matrix, age, dis, and medv appear to be the strongest potential predictors of crime level. The correlation matrix shows rad to have the higher r-value at 0.6255. However this feature was excluded from all models due to perfect correlation in certain values, which could lead to numerical instability and separation issues. The second highest r-value found was tax at 0.5827.

The box plots of each numerical feature indicates zn, indus, age, dis, rad, tax, ptratio, black, lstat, and medv are not normally distributed. The box plots across the two levels of crime level indicates a significant difference in indus, nox, age, and dis highlighting them as potential predictors.

glimpse(boston)

## Rows: 506
## Columns: 15
## $ crim       <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.088…
## $ zn         <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5…
## $ indus      <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87,…
## $ chas       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox        <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.5…
## $ rm         <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.6…
## $ age        <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9…
## $ dis        <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9…
## $ rad        <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4,…
## $ tax        <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311,…
## $ ptratio    <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2,…
## $ black      <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396…
## $ lstat      <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17…
## $ medv       <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9,…
## $ CrimeLevel <fct> Low, Low, Low, Low, Low, Low, Low, Low, Low, Low, Low, Low,…

# Scatter plot matrix
pairs(Filter(is.numeric, boston))

# Correlation Matrix
cor(dplyr::select(boston, where(is.numeric)))

##               crim         zn      indus        nox         rm        age
## crim     1.0000000 -0.2004692  0.4065834  0.4209717 -0.2192467  0.3527343
## zn      -0.2004692  1.0000000 -0.5338282 -0.5166037  0.3119906 -0.5695373
## indus    0.4065834 -0.5338282  1.0000000  0.7636514 -0.3916759  0.6447785
## nox      0.4209717 -0.5166037  0.7636514  1.0000000 -0.3021882  0.7314701
## rm      -0.2192467  0.3119906 -0.3916759 -0.3021882  1.0000000 -0.2402649
## age      0.3527343 -0.5695373  0.6447785  0.7314701 -0.2402649  1.0000000
## dis     -0.3796701  0.6644082 -0.7080270 -0.7692301  0.2052462 -0.7478805
## rad      0.6255051 -0.3119478  0.5951293  0.6114406 -0.2098467  0.4560225
## tax      0.5827643 -0.3145633  0.7207602  0.6680232 -0.2920478  0.5064556
## ptratio  0.2899456 -0.3916785  0.3832476  0.1889327 -0.3555015  0.2615150
## black   -0.3850639  0.1755203 -0.3569765 -0.3800506  0.1280686 -0.2735340
## lstat    0.4556215 -0.4129946  0.6037997  0.5908789 -0.6138083  0.6023385
## medv    -0.3883046  0.3604453 -0.4837252 -0.4273208  0.6953599 -0.3769546
##                dis        rad        tax    ptratio      black      lstat
## crim    -0.3796701  0.6255051  0.5827643  0.2899456 -0.3850639  0.4556215
## zn       0.6644082 -0.3119478 -0.3145633 -0.3916785  0.1755203 -0.4129946
## indus   -0.7080270  0.5951293  0.7207602  0.3832476 -0.3569765  0.6037997
## nox     -0.7692301  0.6114406  0.6680232  0.1889327 -0.3800506  0.5908789
## rm       0.2052462 -0.2098467 -0.2920478 -0.3555015  0.1280686 -0.6138083
## age     -0.7478805  0.4560225  0.5064556  0.2615150 -0.2735340  0.6023385
## dis      1.0000000 -0.4945879 -0.5344316 -0.2324705  0.2915117 -0.4969958
## rad     -0.4945879  1.0000000  0.9102282  0.4647412 -0.4444128  0.4886763
## tax     -0.5344316  0.9102282  1.0000000  0.4608530 -0.4418080  0.5439934
## ptratio -0.2324705  0.4647412  0.4608530  1.0000000 -0.1773833  0.3740443
## black    0.2915117 -0.4444128 -0.4418080 -0.1773833  1.0000000 -0.3660869
## lstat   -0.4969958  0.4886763  0.5439934  0.3740443 -0.3660869  1.0000000
## medv     0.2499287 -0.3816262 -0.4685359 -0.5077867  0.3334608 -0.7376627
##               medv
## crim    -0.3883046
## zn       0.3604453
## indus   -0.4837252
## nox     -0.4273208
## rm       0.6953599
## age     -0.3769546
## dis      0.2499287
## rad     -0.3816262
## tax     -0.4685359
## ptratio -0.5077867
## black    0.3334608
## lstat   -0.7376627
## medv     1.0000000

# Check feature that may be perfectly correlated
table(boston$rad, boston$CrimeLevel)

##     
##      High Low
##   1     0  20
##   2     1  23
##   3     1  37
##   4    49  61
##   5    47  68
##   6     3  23
##   7     2  15
##   8    18   6
##   24  132   0

# Box plots of numerical predictors
par (mfrow = c(1 ,1))
boxplot(boston$crim, main= 'Crime Rate')

boxplot(boston$zn, main= 'Proportion of Residential Land Zoned')

boxplot(boston$indus, main= 'Proportion of non-retail Business Acres')

boxplot(boston$nox, main= 'Nitrogen Oxide Concentration')

boxplot(boston$rm, main= 'Average Number of Rooms')

boxplot(boston$age, main= 'Proportion of Owner-Occupied Units Built Prior to 1940')

boxplot(boston$dis, main= 'Weighted Distance to Five Boston Employment Centers')

boxplot(boston$rad, main= 'Accessibility to Radial Highways')

boxplot(boston$tax, main= 'Property Tax per $10,000')

boxplot(boston$ptratio, main= 'Pupil-Teacher Ratio')

boxplot(boston$black, main= 'Proportion of Black Residents')

boxplot(boston$lstat, main= 'Precentange of Lower-Status Population')

boxplot(boston$medv, main= 'Median Value of Owner-Occupied Homes (in $1,000s)')

# Box plots across the two levels
par (mfrow = c(1 ,5))
boxplot(zn ~ CrimeLevel, data= boston, xlab= '')
boxplot(indus ~ CrimeLevel, data= boston, xlab= '')
boxplot(nox ~ CrimeLevel, data= boston, xlab= 'Crime Level')
boxplot(rm ~ CrimeLevel, data= boston, xlab= '')
boxplot(age ~ CrimeLevel, data= boston, xlab= '')

boxplot(dis ~ CrimeLevel, data= boston, xlab= '')
boxplot(rad ~ CrimeLevel, data= boston, xlab= '')
boxplot(tax ~ CrimeLevel, data= boston, xlab= 'Crime Level')
boxplot(ptratio ~ CrimeLevel, data= boston, xlab= '')
boxplot(black ~ CrimeLevel, data= boston, xlab= '')

boxplot(lstat ~ CrimeLevel, data= boston, xlab= 'Crime Level')
boxplot(medv ~ CrimeLevel, data= boston, xlab= '')

Part c)

Split the data into a training set and a test set.

# Using createDataPartition, set seed ensures data is reproducible
set.seed(42)

# Create an index for training data
index = createDataPartition(boston$CrimeLevel, p=0.8, list= FALSE)

# Subset into train and test
train = boston[index, ]
test = boston[-index, ]

# Remove null values
train = na.omit(train)
test = na.omit(test)

# Check dimensions
dim(train)

## [1] 406  15

dim(test)

## [1] 100  15

Part d)

Perform Logistic Regression on the training data.

The test error for the Logistic Regression model is 88%.

# Stepwise Logistic Regression Model
stepwise_model = step(glm(CrimeLevel ~ . -crim -rad, data= train, family= 'binomial'), direction= 'both', trace=0)

summary(stepwise_model)

## 
## Call:
## glm(formula = CrimeLevel ~ zn + indus + chas + nox + dis + tax + 
##     ptratio + black + lstat + medv, family = "binomial", data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  22.157524   7.382962   3.001 0.002689 ** 
## zn            0.081505   0.034155   2.386 0.017015 *  
## indus         0.110505   0.046631   2.370 0.017798 *  
## chas1        -1.907805   0.685150  -2.785 0.005361 ** 
## nox         -49.346780   7.258741  -6.798 1.06e-11 ***
## dis          -0.767629   0.214203  -3.584 0.000339 ***
## tax          -0.003539   0.001813  -1.953 0.050875 .  
## ptratio      -0.345220   0.123943  -2.785 0.005347 ** 
## black         0.047570   0.015868   2.998 0.002719 ** 
## lstat        -0.064698   0.045032  -1.437 0.150799    
## medv         -0.174111   0.040141  -4.338 1.44e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 562.84  on 405  degrees of freedom
## Residual deviance: 191.80  on 395  degrees of freedom
## AIC: 213.8
## 
## Number of Fisher Scoring iterations: 9

# Check VIF for multicollinearity
vif(stepwise_model)

##       zn    indus     chas      nox      dis      tax  ptratio    black 
## 1.645794 2.675880 1.291171 4.035112 3.145968 1.533963 1.792019 1.186190 
##    lstat     medv 
## 2.099531 2.801855

# Tuned Logistic Regression
logit_model = glm(CrimeLevel ~ zn + indus + chas + nox + dis + tax + 
    ptratio + black + lstat + medv, data= train, family= "binomial")

# Prediction probabilities
logit_probs = predict(logit_model, test, type= 'response')

# Classifications 
logit_class = ifelse(logit_probs > 0.5, 'High', 'Low')
logit_class = as.factor(logit_class)

# Confusion Matrix
confusionMatrix(logit_class, test$CrimeLevel, positive= 'High')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High    6  44
##       Low    44   6
##                                           
##                Accuracy : 0.12            
##                  95% CI : (0.0636, 0.2002)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.76           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.12            
##             Specificity : 0.12            
##          Pos Pred Value : 0.12            
##          Neg Pred Value : 0.12            
##              Prevalence : 0.50            
##          Detection Rate : 0.06            
##    Detection Prevalence : 0.50            
##       Balanced Accuracy : 0.12            
##                                           
##        'Positive' Class : High            
##

Part e)

Perform LDA on the training data.

The test error for the LDA model is 17%.

# LDA Model
lda_model = lda(CrimeLevel ~ zn + indus + chas + nox + dis + tax + 
    ptratio + black + lstat + medv, data= train)
summary(lda_model)

##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   20     -none- numeric  
## scaling 10     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  1     -none- list

# Classifications
lda_class = predict(lda_model, test)$class

# Confusion Matrix
confusionMatrix(lda_class, test$CrimeLevel, positive= 'High')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High   39   6
##       Low    11  44
##                                           
##                Accuracy : 0.83            
##                  95% CI : (0.7418, 0.8977)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 6.549e-12       
##                                           
##                   Kappa : 0.66            
##                                           
##  Mcnemar's Test P-Value : 0.332           
##                                           
##             Sensitivity : 0.7800          
##             Specificity : 0.8800          
##          Pos Pred Value : 0.8667          
##          Neg Pred Value : 0.8000          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3900          
##    Detection Prevalence : 0.4500          
##       Balanced Accuracy : 0.8300          
##                                           
##        'Positive' Class : High            
##

Part f)

Perform QDA on the training data.

The test error for the QDA model is 12%.

# QDA Model
qda_model = qda(CrimeLevel ~ zn + indus + chas + nox + dis + tax + 
    ptratio + black + lstat + medv, data= train)
qda_model

## Call:
## qda(CrimeLevel ~ zn + indus + chas + nox + dis + tax + ptratio + 
##     black + lstat + medv, data = train)
## 
## Prior probabilities of groups:
## High  Low 
##  0.5  0.5 
## 
## Group means:
##            zn     indus      chas1       nox      dis      tax  ptratio
## High  1.29064 15.196453 0.08866995 0.6319064 2.514907 506.8719 19.01724
## Low  22.05665  6.909951 0.05418719 0.4702695 5.080020 307.8522 17.88374
##         black     lstat     medv
## High 322.3817 15.525222 20.62167
## Low  390.1891  9.244433 25.00345

# Classifications
qda_class = predict(qda_model, test)$class

# Confusion Matrix
confusionMatrix(qda_class, test$CrimeLevel, positive= 'High')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High   43   5
##       Low     7  45
##                                           
##                Accuracy : 0.88            
##                  95% CI : (0.7998, 0.9364)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 9.557e-16       
##                                           
##                   Kappa : 0.76            
##                                           
##  Mcnemar's Test P-Value : 0.7728          
##                                           
##             Sensitivity : 0.8600          
##             Specificity : 0.9000          
##          Pos Pred Value : 0.8958          
##          Neg Pred Value : 0.8654          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4300          
##    Detection Prevalence : 0.4800          
##       Balanced Accuracy : 0.8800          
##                                           
##        'Positive' Class : High            
##

Part g)

Perform KNN on the training data.

The test errors for the KNN model are 6%, 5%, 6%, 8%, 8%, 10%, 12%. The value of K that performed the best was K = 3.

#Combine predictors from train and test data into a single matrix
train.X = cbind(train$zn, train$indus, train$chas, train$nox, train$dis, train$tax, train$ptratio, train$black, train$lstat,  train$medv)
test.X = cbind(test$zn, test$indus, test$chas, test$nox, test$dis, test$tax, test$ptratio, test$black, test$lstat, test$medv)

# KNN model
k_values = c(1, 3, 5, 7, 10, 15, 20)
test_errors = numeric(length(k_values))

for(i in seq_along(k_values)) {
  # Set K
  k = k_values[i]

  # Classification
  knn_class = knn(train.X, test.X, train$CrimeLevel, k=k)

  # Confusion Matrix
  confusion_m = confusionMatrix(knn_class, test$CrimeLevel, positive= 'High')

  # Test Errors
  test_errors[i] = 1 - confusion_m$overall['Accuracy']
}

results = data.frame(K = k_values, Test_Error= test_errors)
print(results)

##    K Test_Error
## 1  1       0.06
## 2  3       0.05
## 3  5       0.06
## 4  7       0.08
## 5 10       0.08
## 6 15       0.10
## 7 20       0.12

Part h)

Perform naive Bayes on the training data.

The test error for the naive Bayes model is 16%.

# naive Bayes Model
bayes_model = naiveBayes(CrimeLevel ~ zn + indus + chas + nox + dis + tax + 
    ptratio + black + lstat + medv, data= train)
bayes_model

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
## High  Low 
##  0.5  0.5 
## 
## Conditional probabilities:
##       zn
## Y          [,1]      [,2]
##   High  1.29064  4.948173
##   Low  22.05665 29.655831
## 
##       indus
## Y           [,1]     [,2]
##   High 15.196453 5.549251
##   Low   6.909951 5.351984
## 
##       chas
## Y               0          1
##   High 0.91133005 0.08866995
##   Low  0.94581281 0.05418719
## 
##       nox
## Y           [,1]       [,2]
##   High 0.6319064 0.09188756
##   Low  0.4702695 0.05564134
## 
##       dis
## Y          [,1]     [,2]
##   High 2.514907 1.064752
##   Low  5.080020 2.097736
## 
##       tax
## Y          [,1]      [,2]
##   High 506.8719 169.32877
##   Low  307.8522  88.18852
## 
##       ptratio
## Y          [,1]     [,2]
##   High 19.01724 2.343740
##   Low  17.88374 1.798509
## 
##       black
## Y          [,1]       [,2]
##   High 322.3817 121.073494
##   Low  390.1891   9.894726
## 
##       lstat
## Y           [,1]     [,2]
##   High 15.525222 7.655475
##   Low   9.244433 4.702821
## 
##       medv
## Y          [,1]      [,2]
##   High 20.62167 10.564293
##   Low  25.00345  7.129359

# Classifications
bayes_class = predict(bayes_model, test)

# Confusion Matrix
confusionMatrix(bayes_class, test$CrimeLevel, positive= 'High')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High   40   6
##       Low    10  44
##                                           
##                Accuracy : 0.84            
##                  95% CI : (0.7532, 0.9057)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.303e-12       
##                                           
##                   Kappa : 0.68            
##                                           
##  Mcnemar's Test P-Value : 0.4533          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.8800          
##          Pos Pred Value : 0.8696          
##          Neg Pred Value : 0.8148          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4000          
##    Detection Prevalence : 0.4600          
##       Balanced Accuracy : 0.8400          
##                                           
##        'Positive' Class : High            
##

Assignment 3

Mario Martinez

Question 13

Part a)

Part b)

Part c)

Part d)

Part e)

Part f)

Part g)

Part h)

Part i)

Part j)

Question 14

Part a)

Part b)

Part c)

Part d)

Part e)

Part f)

Part g)

Part h)

Question 16.

Part a)

Part b)

Part c)

Part d)

Part e)

Part f)

Part g)

Part h)