## A: Descriptive Data
data <- Weekly  # ISLR: weekly S&P 500 percentage returns, 1990-2010
## descriptive statistics (psych::describe)
a <- describe(data)
descriptive <- as_tibble(a)
descriptive$vars <- c("Year", "Lag1", "Lag2", "Lag3", "Lag4", "Lag5",
                      "Volume", "Today", "Direction")
descriptive
## # A tibble: 9 × 13
## vars n mean sd median trimmed mad min max range skew
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Year 1089 2.00e+3 6.03 2 e+3 2.00e+3 7.41 1.99e+3 2.01e+3 20 -0.00210
## 2 Lag1 1089 1.51e-1 2.36 2.41e-1 1.78e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.481
## 3 Lag2 1089 1.51e-1 2.36 2.41e-1 1.78e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.481
## 4 Lag3 1089 1.47e-1 2.36 2.41e-1 1.75e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.479
## 5 Lag4 1089 1.46e-1 2.36 2.38e-1 1.74e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.477
## 6 Lag5 1089 1.40e-1 2.36 2.34e-1 1.67e-1 1.88 -1.82e+1 1.20e+1 30.2 -0.474
## 7 Volu… 1089 1.57e+0 1.69 1.00e+0 1.25e+0 1.04 8.75e-2 9.33e+0 9.24 1.62
## 8 Today 1089 1.50e-1 2.36 2.41e-1 1.77e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.480
## 9 Dire… 1089 1.56e+0 0.497 2 e+0 1.57e+0 0 1 e+0 2 e+0 1 -0.223
## # … with 2 more variables: kurtosis <dbl>, se <dbl>
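The Lag1-Lag5 rows are nearly identical to Today because each Lag column is simply Today shifted back in time; a quick check of this (assuming, as in ISLR's Weekly data, that rows are in chronological order):
# Lag1 at week t should equal Today at week t-1; TRUE confirms the shift
isTRUE(all.equal(tail(data$Lag1, -1), head(data$Today, -1)))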
## correlation heatmap
cormat <- round(cor(data[1:8]), 2)  # numeric columns only; Direction (a factor) is excluded
melted_cormat <- melt(cormat)       # reshape2::melt into long format for ggplot
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
ggheatmap +
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank())
As the full logistic regression below shows, Lag2 is the only significant predictor, at the 5% level.
set.seed(123)
# 80/20 train-test split, stratified on Direction to preserve the class balance
data_split <- initial_split(data, prop = 0.8, strata = Direction)
data_training <- training(data_split)
data_testing <- testing(data_split)
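Because the split is stratified on Direction, the Up/Down proportions should be nearly identical in both partitions; a quick sanity check (output omitted, roughly 44% Down / 56% Up in each, matching the prior probabilities LDA reports below):
# class balance in each partition of the stratified split
prop.table(table(data_training$Direction))
prop.table(table(data_testing$Direction))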
# full logistic regression: all five lag returns plus trading Volume
model_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                  family = "binomial", data = data_training)
summary(model_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = data_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9327 -1.2578 0.9857 1.0834 1.5301
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.249765 0.095918 2.604 0.00922 **
## Lag1 -0.040920 0.030005 -1.364 0.17264
## Lag2 0.060941 0.029936 2.036 0.04178 *
## Lag3 -0.007894 0.030042 -0.263 0.79273
## Lag4 -0.009456 0.031188 -0.303 0.76174
## Lag5 -0.038853 0.030989 -1.254 0.20992
## Volume -0.012404 0.042095 -0.295 0.76825
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1196.6 on 870 degrees of freedom
## Residual deviance: 1187.5 on 864 degrees of freedom
## AIC: 1201.5
##
## Number of Fisher Scoring iterations: 4
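The significant terms can also be pulled out programmatically; a minimal sketch, assuming the broom package is available:
# keep only coefficients with p < 0.05; here that is the intercept and Lag2
broom::tidy(model_full) %>% filter(p.value < 0.05)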
The confusion matrix indicates that the model's accuracy is 52.75%, i.e. 52.75% of test observations are classified correctly. Sensitivity is the ability to correctly identify true Down weeks: TP/(TP + FN) = 9/(9 + 88) = 9.28%. Specificity is the ability to correctly identify true Up weeks: TN/(TN + FP) = 106/(106 + 15) = 87.6%. The model's ability to predict true Down weeks is therefore very poor, at only 9.28%.
predict_c <- predict(model_full, type = "response", newdata = data_testing)
# classify as "Up" when the predicted probability exceeds 0.5
predict_binary_c <- ifelse(predict_c > 0.5, "Up", "Down")
predict_result_c <- tibble(
  predicted_value = factor(predict_binary_c, levels = levels(data_testing$Direction)),
  actual_value = data_testing$Direction
)
confusion_c <- confusionMatrix(predict_result_c$predicted_value, predict_result_c$actual_value)
confusion_c
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 15
## Up 88 106
##
## Accuracy : 0.5275
## 95% CI : (0.459, 0.5953)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.8123
##
## Kappa : -0.0337
##
## Mcnemar's Test P-Value : 1.299e-12
##
## Sensitivity : 0.09278
## Specificity : 0.87603
## Pos Pred Value : 0.37500
## Neg Pred Value : 0.54639
## Prevalence : 0.44495
## Detection Rate : 0.04128
## Detection Prevalence : 0.11009
## Balanced Accuracy : 0.48441
##
## 'Positive' Class : Down
##
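As a sanity check, the headline metrics can be recomputed by hand from the counts stored in confusion_c (the positive class is Down):
tab <- confusion_c$table     # rows = predicted, columns = actual
TP <- tab["Down", "Down"]    # correctly predicted Down
FN <- tab["Up", "Down"]      # missed Down
TN <- tab["Up", "Up"]        # correctly predicted Up
FP <- tab["Down", "Up"]      # false Down
(TP + TN) / sum(tab)         # accuracy    = 115/218  ~ 0.5275
TP / (TP + FN)               # sensitivity = 9/97     ~ 0.0928
TN / (TN + FP)               # specificity = 106/121  ~ 0.8760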
Compared with the full model, the Lag2-only model increases accuracy to 55.5%. Specificity rises to 93.4% while sensitivity drops to 8.25%. The new model is therefore better overall, but slightly worse at detecting Down weeks.
# reduced model: logistic regression with the single significant predictor
logistic_lag2 <- glm(Direction ~ Lag2, family = "binomial", data = data_training)
summary(logistic_lag2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = data_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.586 -1.266 1.003 1.086 1.413
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.21148 0.06857 3.084 0.00204 **
## Lag2 0.06779 0.02928 2.315 0.02061 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1196.6 on 870 degrees of freedom
## Residual deviance: 1191.1 on 869 degrees of freedom
## AIC: 1195.1
##
## Number of Fisher Scoring iterations: 4
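Because the Lag2-only model is nested within the full model, the two can be compared formally with a likelihood-ratio test (output omitted; from the deviances above the difference is 1191.1 - 1187.5 = 3.6 on 5 degrees of freedom, far from significant, so the dropped predictors add essentially nothing):
# chi-squared likelihood-ratio test: reduced (Lag2 only) vs full logistic model
anova(logistic_lag2, model_full, test = "Chisq")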
predict_d <- predict(logistic_lag2, type = "response", newdata = data_testing)
predict_binary_d <- ifelse(predict_d > 0.5, "Up", "Down")
predict_result_d <- tibble(
  predicted_value = factor(predict_binary_d, levels = levels(data_testing$Direction)),
  actual_value = data_testing$Direction
)
confusion_d <- confusionMatrix(predict_result_d$predicted_value, predict_result_d$actual_value)
confusion_d
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 8 8
## Up 89 113
##
## Accuracy : 0.555
## 95% CI : (0.4864, 0.6222)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.5281
##
## Kappa : 0.0178
##
## Mcnemar's Test P-Value : 4.557e-16
##
## Sensitivity : 0.08247
## Specificity : 0.93388
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.55941
## Prevalence : 0.44495
## Detection Rate : 0.03670
## Detection Prevalence : 0.07339
## Balanced Accuracy : 0.50818
##
## 'Positive' Class : Down
##
Compared with the logistic regression, LDA achieves the same accuracy, 55.5%. Its specificity is 94.22% and its sensitivity is 7.22%: the two models are equivalent in overall accuracy, but the logistic model is slightly more sensitive while LDA is slightly more specific.
lda_lag2 <- lda(Direction ~ Lag2, data = data_training)
lda_lag2
## Call:
## lda(Direction ~ Lag2, data = data_training)
##
## Prior probabilities of groups:
## Down Up
## 0.4443169 0.5556831
##
## Group means:
## Lag2
## Down -0.01276227
## Up 0.36730992
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4199863
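With a single predictor, the LDA rule reduces to one cutoff on Lag2; it can be recovered numerically from the fitted posterior probabilities (a sketch, output omitted):
# find the Lag2 value at which P(Up | Lag2) crosses 0.5
grid <- data.frame(Lag2 = seq(-5, 5, by = 0.001))
post_up <- predict(lda_lag2, newdata = grid)$posterior[, "Up"]
grid$Lag2[which.min(abs(post_up - 0.5))]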
# predict.lda returns the class directly; no type argument is needed
predict_e <- predict(lda_lag2, newdata = data_testing)$class
predict_e
## [1] Up Up Up Up Up Up Up Down Up Up Up Up Up Up Up
## [16] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [31] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [46] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [61] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [76] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [91] Down Up Up Up Up Up Up Up Down Up Up Up Up Up Up
## [106] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [121] Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up
## [136] Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up
## [151] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [166] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [181] Up Down Down Up Up Up Up Up Up Up Up Up Up Down Up
## [196] Down Up Up Down Up Up Up Up Up Up Up Down Up Up Down
## [211] Up Down Down Up Up Up Up Up
## Levels: Down Up
predict_result_e <- tibble(predicted_value = predict_e,
                           actual_value = data_testing$Direction)
confusion_e <- confusionMatrix(predict_result_e$predicted_value, predict_result_e$actual_value)
confusion_e
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 7
## Up 90 114
##
## Accuracy : 0.555
## 95% CI : (0.4864, 0.6222)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.5281
##
## Kappa : 0.0156
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07216
## Specificity : 0.94215
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.55882
## Prevalence : 0.44495
## Detection Rate : 0.03211
## Detection Prevalence : 0.06422
## Balanced Accuracy : 0.50716
##
## 'Positive' Class : Down
##
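How close the logistic and LDA classifiers are can be seen by cross-tabulating their test-set predictions (a sketch using objects built above; output omitted):
# the two classifiers disagree on only a handful of test weeks,
# which is why their confusion matrices are nearly identical
table(logistic = predict_result_d$predicted_value,
      lda = predict_result_e$predicted_value)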
Compared with the models above, QDA shows the best result, with accuracy of 55.96%. Specificity improves to 95.04% while sensitivity stays at 7.22%, so QDA edges out the others overall but is still almost unable to detect Down weeks.
qda_lag2 <- qda(Direction ~ Lag2, data = data_training)
qda_lag2
## Call:
## qda(Direction ~ Lag2, data = data_training)
##
## Prior probabilities of groups:
## Down Up
## 0.4443169 0.5556831
##
## Group means:
## Lag2
## Down -0.01276227
## Up 0.36730992
# as with LDA, predict.qda returns the class directly
predict_f <- predict(qda_lag2, newdata = data_testing)$class
predict_f
## [1] Up Up Up Up Up Up Up Down Up Up Up Up Up Up Up
## [16] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [31] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [46] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [61] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [76] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [91] Down Up Up Up Up Up Up Up Down Up Up Up Up Up Up
## [106] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [121] Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up
## [136] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [151] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [166] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [181] Up Down Down Up Up Up Up Up Up Up Up Up Up Down Up
## [196] Down Up Up Down Up Up Up Up Up Up Up Down Up Up Down
## [211] Up Down Down Up Up Up Up Up
## Levels: Down Up
predict_result_f <- tibble(predicted_value = predict_f,
                           actual_value = data_testing$Direction)
confusion_f <- confusionMatrix(predict_result_f$predicted_value, predict_result_f$actual_value)
confusion_f
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 6
## Up 90 115
##
## Accuracy : 0.5596
## 95% CI : (0.491, 0.6266)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.4738
##
## Kappa : 0.0247
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07216
## Specificity : 0.95041
## Pos Pred Value : 0.53846
## Neg Pred Value : 0.56098
## Prevalence : 0.44495
## Detection Rate : 0.03211
## Detection Prevalence : 0.05963
## Balanced Accuracy : 0.51129
##
## 'Positive' Class : Down
##
Compared with the other models, KNN behaves very differently. It is the worst model in overall accuracy, at only 46.33% (from the confusion matrix below, (50 + 51)/218). However, it detects true Down weeks 51.55% of the time, while the other models stay below 10%.
# k-nearest neighbours (class::knn) on the single predictor Lag2 with k = 1;
# note knn() breaks distance ties at random, so results may vary between runs
knn_lag2 <- knn(as.matrix(data_training$Lag2), as.matrix(data_testing$Lag2),
                data_training$Direction, k = 1)
confusion_g <- table(knn_lag2, data_testing$Direction)
confusion_g
##
## knn_lag2 Down Up
## Down 50 70
## Up 47 51
accuracy_g <- (50 + 51) / (50 + 51 + 47 + 70)  # correct predictions over all test cases
sensitivity_g <- 50 / (50 + 47)                # true Down rate
specificity_g <- 51 / (51 + 70)                # true Up rate
accuracy_g
## [1] 0.4633028
sensitivity_g
## [1] 0.5154639
specificity_g
## [1] 0.4214876
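Since k = 1 is the most flexible and noisiest choice, a natural follow-up is to try a few larger values of k (a sketch, not part of the original analysis; a proper comparison would tune k on a validation set or by cross-validation rather than on the test set):
set.seed(123)  # knn() breaks distance ties at random
# test accuracy for a few odd values of k
sapply(c(1, 3, 5, 7, 9), function(k) {
  pred <- knn(as.matrix(data_training$Lag2), as.matrix(data_testing$Lag2),
              data_training$Direction, k = k)
  mean(pred == data_testing$Direction)
})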
The naive Bayes model produces exactly the same test-set results as the QDA model.
# naive Bayes (e1071::naiveBayes) with the single predictor Lag2
naivebayes_lag2 <- naiveBayes(Direction ~ Lag2, data = data_training)
naivebayes_lag2
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4443169 0.5556831
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.01276227 2.346627
## Up 0.36730992 2.408171
predict_h <- predict(naivebayes_lag2, type = "class", newdata = data_testing)
predict_h
## [1] Up Up Up Up Up Up Up Down Up Up Up Up Up Up Up
## [16] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [31] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [46] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [61] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [76] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [91] Down Up Up Up Up Up Up Up Down Up Up Up Up Up Up
## [106] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [121] Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up
## [136] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [151] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [166] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [181] Up Down Down Up Up Up Up Up Up Up Up Up Up Down Up
## [196] Down Up Up Down Up Up Up Up Up Up Up Down Up Up Down
## [211] Up Down Down Up Up Up Up Up
## Levels: Down Up
predict_result_h <- tibble(predicted_value = predict_h,
                           actual_value = data_testing$Direction)
confusion_h <- confusionMatrix(predict_result_h$predicted_value, predict_result_h$actual_value)
confusion_h
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 6
## Up 90 115
##
## Accuracy : 0.5596
## 95% CI : (0.491, 0.6266)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.4738
##
## Kappa : 0.0247
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07216
## Specificity : 0.95041
## Pos Pred Value : 0.53846
## Neg Pred Value : 0.56098
## Prevalence : 0.44495
## Detection Rate : 0.03211
## Detection Prevalence : 0.05963
## Balanced Accuracy : 0.51129
##
## 'Positive' Class : Down
##
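That the two models match can be verified directly by comparing the prediction vectors (output omitted):
# TRUE confirms naive Bayes and QDA make identical test-set predictions
all(predict_h == predict_f)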
Based on the results above, QDA and naive Bayes show the best results, with test accuracy of 55.96%.