## A: Descriptive Data
data <- Weekly  # ISLR: weekly S&P 500 percentage returns, 1990-2010
## descriptive statistics (psych::describe)
a <- describe(data)
descriptive <- as_tibble(a)
descriptive$vars <- c("Year", "Lag1", "Lag2", "Lag3", "Lag4", "Lag5",
                      "Volume", "Today", "Direction")
descriptive
## # A tibble: 9 × 13
## vars n mean sd median trimmed mad min max range skew
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Year 1089 2.00e+3 6.03 2 e+3 2.00e+3 7.41 1.99e+3 2.01e+3 20 -0.00210
## 2 Lag1 1089 1.51e-1 2.36 2.41e-1 1.78e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.481
## 3 Lag2 1089 1.51e-1 2.36 2.41e-1 1.78e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.481
## 4 Lag3 1089 1.47e-1 2.36 2.41e-1 1.75e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.479
## 5 Lag4 1089 1.46e-1 2.36 2.38e-1 1.74e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.477
## 6 Lag5 1089 1.40e-1 2.36 2.34e-1 1.67e-1 1.88 -1.82e+1 1.20e+1 30.2 -0.474
## 7 Volu… 1089 1.57e+0 1.69 1.00e+0 1.25e+0 1.04 8.75e-2 9.33e+0 9.24 1.62
## 8 Today 1089 1.50e-1 2.36 2.41e-1 1.77e-1 1.87 -1.82e+1 1.20e+1 30.2 -0.480
## 9 Dire… 1089 1.56e+0 0.497 2 e+0 1.57e+0 0 1 e+0 2 e+0 1 -0.223
## # … with 2 more variables: kurtosis <dbl>, se <dbl>
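The Lag1-Lag5 rows are nearly identical to Today because each Lag column is simply Today shifted back in time; a quick check of this (assuming, as in ISLR's Weekly data, that rows are in chronological order):
# Lag1 at week t should equal Today at week t-1; TRUE confirms the shift
isTRUE(all.equal(tail(data$Lag1, -1), head(data$Today, -1)))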
## correlation heatmap
cormat <- round(cor(data[1:8]), 2)  # numeric columns only; Direction (a factor) is excluded
melted_cormat <- melt(cormat)       # reshape2::melt into long format for ggplot
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
ggheatmap +
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank())
As the full logistic regression below shows, Lag2 is the only significant predictor, at the 5% level.
set.seed(123)
# 80/20 train-test split, stratified on Direction to preserve the class balance
data_split <- initial_split(data, prop = 0.8, strata = Direction)
data_training <- training(data_split)
data_testing <- testing(data_split)
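Because the split is stratified on Direction, the Up/Down proportions should be nearly identical in both partitions; a quick sanity check (output omitted, roughly 44% Down / 56% Up in each, matching the prior probabilities LDA reports below):
# class balance in each partition of the stratified split
prop.table(table(data_training$Direction))
prop.table(table(data_testing$Direction))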
# full logistic regression: all five lag returns plus trading Volume
model_full <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                  family = "binomial", data = data_training)
summary(model_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = data_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9327 -1.2578 0.9857 1.0834 1.5301
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.249765 0.095918 2.604 0.00922 **
## Lag1 -0.040920 0.030005 -1.364 0.17264
## Lag2 0.060941 0.029936 2.036 0.04178 *
## Lag3 -0.007894 0.030042 -0.263 0.79273
## Lag4 -0.009456 0.031188 -0.303 0.76174
## Lag5 -0.038853 0.030989 -1.254 0.20992
## Volume -0.012404 0.042095 -0.295 0.76825
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1196.6 on 870 degrees of freedom
## Residual deviance: 1187.5 on 864 degrees of freedom
## AIC: 1201.5
##
## Number of Fisher Scoring iterations: 4
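The significant terms can also be pulled out programmatically; a minimal sketch, assuming the broom package is available:
# keep only coefficients with p < 0.05; here that is the intercept and Lag2
broom::tidy(model_full) %>% filter(p.value < 0.05)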
The confusion matrix indicates that the model's accuracy is 52.75%, i.e. 52.75% of test observations are classified correctly. Sensitivity is the ability to correctly identify true Down weeks: TP/(TP + FN) = 9/(9 + 88) = 9.28%. Specificity is the ability to correctly identify true Up weeks: TN/(TN + FP) = 106/(106 + 15) = 87.6%. The model's ability to predict true Down weeks is therefore very poor, at only 9.28%.
predict_c <- predict(model_full, type = "response", newdata = data_testing)
# classify as "Up" when the predicted probability exceeds 0.5
predict_binary_c <- ifelse(predict_c > 0.5, "Up", "Down")
predict_result_c <- tibble(
  predicted_value = factor(predict_binary_c, levels = levels(data_testing$Direction)),
  actual_value = data_testing$Direction
)
confusion_c <- confusionMatrix(predict_result_c$predicted_value, predict_result_c$actual_value)
confusion_c
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 15
## Up 88 106
##
## Accuracy : 0.5275
## 95% CI : (0.459, 0.5953)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.8123
##
## Kappa : -0.0337
##
## Mcnemar's Test P-Value : 1.299e-12
##
## Sensitivity : 0.09278
## Specificity : 0.87603
## Pos Pred Value : 0.37500
## Neg Pred Value : 0.54639
## Prevalence : 0.44495
## Detection Rate : 0.04128
## Detection Prevalence : 0.11009
## Balanced Accuracy : 0.48441
##
## 'Positive' Class : Down
##
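As a sanity check, the headline metrics can be recomputed by hand from the counts stored in confusion_c (the positive class is Down):
tab <- confusion_c$table     # rows = predicted, columns = actual
TP <- tab["Down", "Down"]    # correctly predicted Down
FN <- tab["Up", "Down"]      # missed Down
TN <- tab["Up", "Up"]        # correctly predicted Up
FP <- tab["Down", "Up"]      # false Down
(TP + TN) / sum(tab)         # accuracy    = 115/218  ~ 0.5275
TP / (TP + FN)               # sensitivity = 9/97     ~ 0.0928
TN / (TN + FP)               # specificity = 106/121  ~ 0.8760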
Compared with the full model, the Lag2-only model increases accuracy to 55.5%. Specificity rises to 93.4% while sensitivity drops to 8.25%. The new model is therefore better overall, but slightly worse at detecting Down weeks.
# reduced model: logistic regression with the single significant predictor
logistic_lag2 <- glm(Direction ~ Lag2, family = "binomial", data = data_training)
summary(logistic_lag2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = data_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.586 -1.266 1.003 1.086 1.413
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.21148 0.06857 3.084 0.00204 **
## Lag2 0.06779 0.02928 2.315 0.02061 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1196.6 on 870 degrees of freedom
## Residual deviance: 1191.1 on 869 degrees of freedom
## AIC: 1195.1
##
## Number of Fisher Scoring iterations: 4
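Because the Lag2-only model is nested within the full model, the two can be compared formally with a likelihood-ratio test (output omitted; from the deviances above the difference is 1191.1 - 1187.5 = 3.6 on 5 degrees of freedom, far from significant, so the dropped predictors add essentially nothing):
# chi-squared likelihood-ratio test: reduced (Lag2 only) vs full logistic model
anova(logistic_lag2, model_full, test = "Chisq")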
predict_d <- predict(logistic_lag2, type = "response", newdata = data_testing)
predict_binary_d <- ifelse(predict_d > 0.5, "Up", "Down")
predict_result_d <- tibble(
  predicted_value = factor(predict_binary_d, levels = levels(data_testing$Direction)),
  actual_value = data_testing$Direction
)
confusion_d <- confusionMatrix(predict_result_d$predicted_value, predict_result_d$actual_value)
confusion_d
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 8 8
## Up 89 113
##
## Accuracy : 0.555
## 95% CI : (0.4864, 0.6222)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.5281
##
## Kappa : 0.0178
##
## Mcnemar's Test P-Value : 4.557e-16
##
## Sensitivity : 0.08247
## Specificity : 0.93388
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.55941
## Prevalence : 0.44495
## Detection Rate : 0.03670
## Detection Prevalence : 0.07339
## Balanced Accuracy : 0.50818
##
## 'Positive' Class : Down
##
Compared with the logistic regression, LDA achieves the same accuracy, 55.5%. Its specificity is 94.22% and its sensitivity is 7.22%: the two models are equivalent in overall accuracy, but the logistic model is slightly more sensitive while LDA is slightly more specific.
lda_lag2 <- lda(Direction ~ Lag2, data = data_training)
lda_lag2
## Call:
## lda(Direction ~ Lag2, data = data_training)
##
## Prior probabilities of groups:
## Down Up
## 0.4443169 0.5556831
##
## Group means:
## Lag2
## Down -0.01276227
## Up 0.36730992
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4199863
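With a single predictor, the LDA rule reduces to one cutoff on Lag2; it can be recovered numerically from the fitted posterior probabilities (a sketch, output omitted):
# find the Lag2 value at which P(Up | Lag2) crosses 0.5
grid <- data.frame(Lag2 = seq(-5, 5, by = 0.001))
post_up <- predict(lda_lag2, newdata = grid)$posterior[, "Up"]
grid$Lag2[which.min(abs(post_up - 0.5))]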
# predict.lda returns the class directly; no type argument is needed
predict_e <- predict(lda_lag2, newdata = data_testing)$class
predict_e
## [1] Up Up Up Up Up Up Up Down Up Up Up Up Up Up Up
## [16] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [31] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [46] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [61] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [76] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [91] Down Up Up Up Up Up Up Up Down Up Up Up Up Up Up
## [106] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [121] Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up
## [136] Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up
## [151] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [166] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [181] Up Down Down Up Up Up Up Up Up Up Up Up Up Down Up
## [196] Down Up Up Down Up Up Up Up Up Up Up Down Up Up Down
## [211] Up Down Down Up Up Up Up Up
## Levels: Down Up
predict_result_e <- tibble(predicted_value = predict_e,
                           actual_value = data_testing$Direction)
confusion_e <- confusionMatrix(predict_result_e$predicted_value, predict_result_e$actual_value)
confusion_e
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 7
## Up 90 114
##
## Accuracy : 0.555
## 95% CI : (0.4864, 0.6222)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.5281
##
## Kappa : 0.0156
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07216
## Specificity : 0.94215
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.55882
## Prevalence : 0.44495
## Detection Rate : 0.03211
## Detection Prevalence : 0.06422
## Balanced Accuracy : 0.50716
##
## 'Positive' Class : Down
##
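How close the logistic and LDA classifiers are can be seen by cross-tabulating their test-set predictions (a sketch using objects built above; output omitted):
# the two classifiers disagree on only a handful of test weeks,
# which is why their confusion matrices are nearly identical
table(logistic = predict_result_d$predicted_value,
      lda = predict_result_e$predicted_value)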
Compared with the models above, QDA shows the best result, with accuracy of 55.96%. Specificity improves to 95.04% while sensitivity stays at 7.22%, so QDA edges out the others overall but is still almost unable to detect Down weeks.
qda_lag2 <- qda(Direction ~ Lag2, data = data_training)
qda_lag2
## Call:
## qda(Direction ~ Lag2, data = data_training)
##
## Prior probabilities of groups:
## Down Up
## 0.4443169 0.5556831
##
## Group means:
## Lag2
## Down -0.01276227
## Up 0.36730992
# as with LDA, predict.qda returns the class directly
predict_f <- predict(qda_lag2, newdata = data_testing)$class
predict_f
## [1] Up Up Up Up Up Up Up Down Up Up Up Up Up Up Up
## [16] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [31] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [46] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [61] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [76] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [91] Down Up Up Up Up Up Up Up Down Up Up Up Up Up Up
## [106] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [121] Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up
## [136] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [151] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [166] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [181] Up Down Down Up Up Up Up Up Up Up Up Up Up Down Up
## [196] Down Up Up Down Up Up Up Up Up Up Up Down Up Up Down
## [211] Up Down Down Up Up Up Up Up
## Levels: Down Up
predict_result_f <- tibble(predicted_value = predict_f,
                           actual_value = data_testing$Direction)
confusion_f <- confusionMatrix(predict_result_f$predicted_value, predict_result_f$actual_value)
confusion_f
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 6
## Up 90 115
##
## Accuracy : 0.5596
## 95% CI : (0.491, 0.6266)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.4738
##
## Kappa : 0.0247
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07216
## Specificity : 0.95041
## Pos Pred Value : 0.53846
## Neg Pred Value : 0.56098
## Prevalence : 0.44495
## Detection Rate : 0.03211
## Detection Prevalence : 0.05963
## Balanced Accuracy : 0.51129
##
## 'Positive' Class : Down
##
Compared with the other models, KNN behaves very differently. It is the worst model in overall accuracy, at only 46.33% (from the confusion matrix below, (50 + 51)/218). However, it detects true Down weeks 51.55% of the time, while the other models stay below 10%.
# k-nearest neighbours (class::knn) on the single predictor Lag2 with k = 1;
# note knn() breaks distance ties at random, so results may vary between runs
knn_lag2 <- knn(as.matrix(data_training$Lag2), as.matrix(data_testing$Lag2),
                data_training$Direction, k = 1)
confusion_g <- table(knn_lag2, data_testing$Direction)
confusion_g
##
## knn_lag2 Down Up
## Down 50 70
## Up 47 51
accuracy_g <- (50 + 51) / (50 + 51 + 47 + 70)  # correct predictions over all test cases
sensitivity_g <- 50 / (50 + 47)                # true Down rate
specificity_g <- 51 / (51 + 70)                # true Up rate
accuracy_g
## [1] 0.4633028
sensitivity_g
## [1] 0.5154639
specificity_g
## [1] 0.4214876
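Since k = 1 is the most flexible and noisiest choice, a natural follow-up is to try a few larger values of k (a sketch, not part of the original analysis; a proper comparison would tune k on a validation set or by cross-validation rather than on the test set):
set.seed(123)  # knn() breaks distance ties at random
# test accuracy for a few odd values of k
sapply(c(1, 3, 5, 7, 9), function(k) {
  pred <- knn(as.matrix(data_training$Lag2), as.matrix(data_testing$Lag2),
              data_training$Direction, k = k)
  mean(pred == data_testing$Direction)
})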
The naive Bayes model produces exactly the same test-set results as the QDA model.
# naive Bayes (e1071::naiveBayes) with the single predictor Lag2
naivebayes_lag2 <- naiveBayes(Direction ~ Lag2, data = data_training)
naivebayes_lag2
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4443169 0.5556831
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.01276227 2.346627
## Up 0.36730992 2.408171
predict_h <- predict(naivebayes_lag2, type = "class", newdata = data_testing)
predict_h
## [1] Up Up Up Up Up Up Up Down Up Up Up Up Up Up Up
## [16] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [31] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [46] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [61] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [76] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [91] Down Up Up Up Up Up Up Up Down Up Up Up Up Up Up
## [106] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [121] Up Up Up Up Up Up Up Up Up Down Up Up Up Up Up
## [136] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [151] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [166] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [181] Up Down Down Up Up Up Up Up Up Up Up Up Up Down Up
## [196] Down Up Up Down Up Up Up Up Up Up Up Down Up Up Down
## [211] Up Down Down Up Up Up Up Up
## Levels: Down Up
predict_result_h <- tibble(predicted_value = predict_h,
                           actual_value = data_testing$Direction)
confusion_h <- confusionMatrix(predict_result_h$predicted_value, predict_result_h$actual_value)
confusion_h
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 7 6
## Up 90 115
##
## Accuracy : 0.5596
## 95% CI : (0.491, 0.6266)
## No Information Rate : 0.555
## P-Value [Acc > NIR] : 0.4738
##
## Kappa : 0.0247
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07216
## Specificity : 0.95041
## Pos Pred Value : 0.53846
## Neg Pred Value : 0.56098
## Prevalence : 0.44495
## Detection Rate : 0.03211
## Detection Prevalence : 0.05963
## Balanced Accuracy : 0.51129
##
## 'Positive' Class : Down
##
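That the two models match can be verified directly by comparing the prediction vectors (output omitted):
# TRUE confirms naive Bayes and QDA make identical test-set predictions
all(predict_h == predict_f)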
Based on the results above, QDA and naive Bayes show the best results, with test accuracy of 55.96%.