This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Template example chunk: prints a summary of the `cars` object.
summary(cars)
require(tidyquant)
require(tidyverse)
require(caret)
# This is the first part of a series on using machine learning methods to predict market direction a given number of days into the future. Here we compare the performance of five algorithms through the caret wrapper: random forest; support vector machine with a radial kernel; extreme gradient boosting; multivariate adaptive regression splines; and KNN. We used a medium-sized data set, and the results show that these algorithms achieved a test accuracy of roughly 80% or higher. The next post will look at deep learning algorithms such as LSTM.
#Get data & construct some technical indicators
# Download three years of daily prices for `stock` and build the modelling
# table: technical indicators plus a forward-looking direction label.
#
# Args:
#   stock:  ticker symbol passed to tq_get() (e.g. "^GSPC").
#   period: positive whole number of rows (trading days) to look ahead when
#           labelling each day.
#
# Returns:
#   A tibble containing `date`, the indicator columns, `lag.chg` and `trend`.
#   `lag.chg` is close[t + period] - close[t]; the column name is kept so
#   existing callers that drop `lag.chg` keep working. `trend` is "UP" when
#   that change is positive and "DOWN" otherwise. Rows containing any NA
#   (indicator warm-up and the last `period` rows, whose future close is
#   unknown) are removed, and every numeric column is standardised.
stock.dt <- function(stock, period) {
  # A zero, negative or fractional horizon cannot produce a meaningful label.
  stopifnot(
    is.numeric(period), length(period) == 1L, !is.na(period),
    period >= 1, period == round(period)
  )

  dx <- tq_get(stock, get = "stock.prices",
               from = Sys.Date() - years(3), to = Sys.Date() + 1)
  # NOTE(review): tq_get() appears to return a non-data-frame value when the
  # download fails; stop here with a clear message rather than deep inside
  # the indicator code.
  if (!is.data.frame(dx) || nrow(dx) == 0L) {
    stop("No price data returned for '", paste(stock, collapse = ", "), "'.",
         call. = FALSE)
  }

  # Average true range.
  # NOTE(review): `matype` (lower-case t) is not ATR's `maType` argument, so it
  # is presumably swallowed by `...` and ATR uses its default smoothing.
  # Left untouched to keep the numbers unchanged; confirm, then either drop it
  # or spell it `maType`.
  dx <- dx %>%
    tq_mutate(c(high, low, close), mutate_fun = ATR, n = 14, matype = "EMA")
  # Keep only the ATR value itself. The names of the helper columns seem to
  # depend on the TTR/tidyquant version, so drop whichever variant is present
  # (TODO confirm against the installed versions).
  dx <- dx %>%
    select(-any_of(c("tr", "ATR", "ATR..1", "trueHigh", "trueLow")))

  # Momentum indicators computed from the close.
  dx <- dx %>%
    tq_mutate(close, mutate_fun = RSI, n = 14, maType = "EMA") %>%
    tq_mutate(close, mutate_fun = MACD, maType = "EMA", percent = FALSE) %>%
    mutate(diff = macd - signal)

  # SMI also returns a `signal` column; tq_mutate renames the duplicate to
  # `signal..1`, which is dropped so that only the MACD signal remains.
  dx <- dx %>%
    tq_mutate(c(high, low, close), mutate_fun = SMI, maType = "EMA") %>%
    select(-(signal..1))

  # Money flow index, computed outside tq_mutate because it needs price and
  # volume inputs together.
  dx <- dx %>%
    mutate(mfi = MFI(dx[, c("high", "low", "close")], dx[, "volume"], n = 14))

  # Williams %R, rescaled by -100.
  dx <- dx %>%
    tq_mutate(c(high, low, close), mutate_fun = WPR, col_rename = "wrp") %>%
    mutate(wrp = -100 * wrp)

  dx <- dx %>%
    tq_mutate(c(high, low, close), mutate_fun = ADX, maType = "EMA") %>%
    tq_mutate(c(high, low, close), mutate_fun = CCI, n = 14, maType = "EMA") %>%
    tq_mutate(close, mutate_fun = CMO, n = 14) %>%
    tq_mutate_xy(close, volume, mutate_fun = OBV)
  # Of the ADX outputs keep only ADX; drop the DIp..DX columns.
  dx <- dx %>% select(-(DIp:DX))

  # Label each day with the direction of the close `period` rows AHEAD.
  # The original used lag(close, period), i.e. the change over the preceding
  # `period` days, which the indicators computed on the same day already
  # reflect - the model was then explaining the past instead of predicting.
  dx <- dx %>%
    mutate(
      lag.chg = dplyr::lead(close, period) - close,
      trend = ifelse(lag.chg > 0, "UP", "DOWN")
    )

  # Raw prices are no longer needed once indicators and label exist.
  dx <- dx %>% select(-c(symbol, open:adjusted))

  # Drop incomplete rows, then standardise every numeric column.
  # scale() returns a one-column matrix, so flatten it back to a plain vector.
  # The label was derived before scaling, so it is unaffected.
  na.omit(dx) %>%
    mutate_if(is.numeric, function(t) as.numeric(scale(t)))
}
#Application to SP500
stock <- "^GSPC"
period <- 2 # number of days in the future
# Use the variables defined above instead of repeating the literals, so that
# changing the ticker or horizon in one place is enough.
df <- stock.dt(stock, period)
# Drop `date` (not a predictor) and `lag.chg`: `trend` is derived from the
# sign of `lag.chg` inside stock.dt, so keeping it would hand the label to
# the models.
df <- df %>% select(-c(date, lag.chg))
# Keep a copy of the last `period` rows. slice_tail() does not remove them
# from `df`, so the same rows can also end up in the training partition.
slc <- df %>% slice_tail(n = period)
# Correlation between predictors only. `trend` is excluded by name rather than
# by a hard-coded column position, which would silently break if an indicator
# were added or removed.
dkor <- cor(df %>% select(-trend), method = "pearson", use = "complete.obs")
corrplot::corrplot(dkor, type = "upper", order = "hclust",
                   tl.col = "black", tl.srt = 45)
#End of ^GSPC indicators segment
#Split data +trainControl
# Fix the RNG state so the train/test partition below is the same on every
# run; without it the reported metrics change each time the document is knit.
set.seed(123)
# 80/20 split created by caret on the label column.
# NOTE(review): the rows are time-ordered daily observations; a random split
# puts later days into training and earlier days into test. Consider a
# chronological split (e.g. caret::createTimeSlices) - confirm intent.
ind <- createDataPartition(df$trend, p = .8, list = FALSE)
x.train <- df[ind, ]
x.test <- df[-ind, ]
# 5-fold cross-validation repeated 3 times. classProbs + twoClassSummary make
# caret report ROC / Sens / Spec for the resamples.
tc <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  classProbs = TRUE,
  savePredictions = TRUE,
  summaryFunction = twoClassSummary,
  allowParallel = TRUE,
  returnData = FALSE,
  verboseIter = FALSE
)
#End data split
#knn algorithm with caret:
# k-nearest neighbours: predictors are centred and scaled, 20 candidate
# values of k are tried, and resampling follows the scheme stored in `tc`.
knn.fit <- train(
  as.factor(trend) ~ .,
  data = x.train,
  method = "knn",
  trControl = tc,
  preProcess = c("center", "scale"),
  tuneLength = 20
)
knn.fit
## k-Nearest Neighbors
##
## Pre-processing: centered (12), scaled (12)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 462, 463, 463, 464, 464, 464, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 5 0.8170122 0.6772026 0.8099611
## 7 0.8290312 0.6785098 0.8150583
## 9 0.8348863 0.6798693 0.8283450
## 11 0.8408546 0.6705621 0.8354468
## 13 0.8430866 0.6625882 0.8455633
## 15 0.8474199 0.6812026 0.8527273
## 17 0.8498542 0.6759477 0.8557576
## 19 0.8487667 0.6625882 0.8669930
## 21 0.8491473 0.6626144 0.8649262
## 23 0.8480886 0.6612288 0.8638850
## 25 0.8433967 0.6625882 0.8628749
## 27 0.8433774 0.6493333 0.8638850
## 29 0.8444201 0.6440000 0.8629060
## 31 0.8465326 0.6546667 0.8669930
## 33 0.8440754 0.6506667 0.8578399
## 35 0.8456907 0.6506667 0.8558353
## 37 0.8453190 0.6413856 0.8599068
## 39 0.8424080 0.6387451 0.8548252
## 41 0.8416741 0.6387712 0.8538306
## 43 0.8421302 0.6441046 0.8467599
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 17.
knn.pred <- predict(knn.fit, newdata = x.test)
# confusionMatrix() reads a table as rows = predictions, columns = reference.
# The original passed the true labels as rows, which swaps sensitivity with
# positive predictive value (and specificity with negative predictive value).
confusionMatrix(table(knn.pred, x.test$trend))
## Confusion Matrix and Statistics
##
## knn.pred
## DOWN UP
## DOWN 44 18
## UP 11 71
##
## Accuracy : 0.7986
## 95% CI : (0.7237, 0.8608)
## No Information Rate : 0.6181
## P-Value [Acc > NIR] : 2.484e-06
##
## Kappa : 0.5836
##
## Mcnemar's Test P-Value : 0.2652
##
## Sensitivity : 0.8000
## Specificity : 0.7978
## Pos Pred Value : 0.7097
## Neg Pred Value : 0.8659
## Prevalence : 0.3819
## Detection Rate : 0.3056
## Detection Prevalence : 0.4306
## Balanced Accuracy : 0.7989
##
## 'Positive' Class : DOWN
##
# Score the held-back tail slice with the tuned KNN model and cross-tabulate
# the predicted direction against the slice's own labels.
slc.pred <- predict(knn.fit, newdata = slc)
table(slc.pred, slc$trend)
##
## slc.pred DOWN
## DOWN 2
## UP 0
# Resampled performance across the candidate values of k.
plot(knn.fit)
#End knn segment
require(xgboost)
#xgboost with caret segment:
# Tuning grid: nrounds, max_depth and colsample_bytree vary
# (2 x 4 x 5 = 40 combinations); the remaining parameters are held fixed.
xgbgrid <- expand.grid(
  nrounds = c(100, 200),
  max_depth = c(10, 15, 20, 25),
  eta = .05,
  gamma = .01,
  colsample_bytree = seq(.5, .9, length.out = 5),
  min_child_weight = 1,
  subsample = 1
)
# Predictor-only matrices (label removed) wrapped as xgb.DMatrix objects.
xgb.train <- xgb.DMatrix(data = as.matrix(select(x.train, -trend)))
xgb.test <- xgb.DMatrix(data = as.matrix(select(x.test, -trend)))
# x/y interface of train(): the label is supplied separately as a factor.
xgb.fit <- train(
  x = xgb.train,
  y = as.factor(x.train$trend),
  method = "xgbTree",
  trControl = tc,
  tuneGrid = xgbgrid
)
xgb.fit
## eXtreme Gradient Boosting
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 463, 464, 463, 463, 463, 463, ...
## Resampling results across tuning parameters:
##
## max_depth colsample_bytree nrounds ROC Sens Spec
## 10 0.5 100 0.9270642 0.8099869 0.8768143
## 10 0.5 200 0.9288019 0.8166275 0.8656410
## 10 0.6 100 0.9302230 0.8312418 0.8768609
## 10 0.6 200 0.9307707 0.8286275 0.8667133
## 10 0.7 100 0.9313590 0.8311895 0.8727584
## 10 0.7 200 0.9308033 0.8299085 0.8636364
## 10 0.8 100 0.9341013 0.8405229 0.8757887
## 10 0.8 200 0.9323791 0.8365752 0.8584926
## 10 0.9 100 0.9334167 0.8351895 0.8666667
## 10 0.9 200 0.9320606 0.8338562 0.8636053
## 15 0.5 100 0.9277049 0.8206536 0.8687024
## 15 0.5 200 0.9284060 0.8219869 0.8687490
## 15 0.6 100 0.9280260 0.8179346 0.8727739
## 15 0.6 200 0.9298264 0.8232680 0.8666822
## 15 0.7 100 0.9310358 0.8378039 0.8676457
## 15 0.7 200 0.9309899 0.8338562 0.8575291
## 15 0.8 100 0.9327115 0.8378039 0.8737840
## 15 0.8 200 0.9320326 0.8391895 0.8646309
## 15 0.9 100 0.9336894 0.8392157 0.8697280
## 15 0.9 200 0.9330972 0.8352418 0.8625796
## 20 0.5 100 0.9267130 0.8179346 0.8676457
## 20 0.5 200 0.9286623 0.8165752 0.8656721
## 20 0.6 100 0.9304764 0.8245752 0.8727584
## 20 0.6 200 0.9315080 0.8286013 0.8687179
## 20 0.7 100 0.9330737 0.8338301 0.8748252
## 20 0.7 200 0.9319828 0.8378039 0.8656566
## 20 0.8 100 0.9335343 0.8404967 0.8778710
## 20 0.8 200 0.9325366 0.8404967 0.8584926
## 20 0.9 100 0.9327703 0.8458301 0.8687335
## 20 0.9 200 0.9317723 0.8365229 0.8636208
## 25 0.5 100 0.9270714 0.8180131 0.8768454
## 25 0.5 200 0.9275133 0.8219608 0.8687179
## 25 0.6 100 0.9318617 0.8272680 0.8768143
## 25 0.6 200 0.9313691 0.8286013 0.8717483
## 25 0.7 100 0.9320185 0.8364706 0.8677078
## 25 0.7 200 0.9321826 0.8365229 0.8717793
## 25 0.8 100 0.9327945 0.8458301 0.8737374
## 25 0.8 200 0.9321811 0.8392418 0.8656099
## 25 0.9 100 0.9325222 0.8365490 0.8748096
## 25 0.9 200 0.9320278 0.8339608 0.8646620
##
## Tuning parameter 'eta' was held constant at a value of 0.05
## Tuning
## parameter 'min_child_weight' was held constant at a value of 1
##
## Tuning parameter 'subsample' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 100, max_depth = 10, eta
## = 0.05, gamma = 0.01, colsample_bytree = 0.8, min_child_weight = 1
## and subsample = 1.
# Raw scores from the underlying booster (bypasses caret's predict method).
xgb.pred <- predict(xgb.fit$finalModel, newdata = xgb.test)
# NOTE(review): scores above 0.5 are mapped to "DOWN", which presumes the
# booster's output is the probability of the first factor level - confirm
# against caret's xgbTree label coding.
# Levels are fixed explicitly: the original relevel(y.pred, "DOWN") stops
# with an error whenever no test row is predicted "DOWN", because the level
# would not exist.
y.pred <- factor(ifelse(xgb.pred > .5, "DOWN", "UP"), levels = c("DOWN", "UP"))
# Give the reference the same levels so the table is always 2 x 2.
confusionMatrix(table(y.pred, factor(x.test$trend, levels = c("DOWN", "UP"))))
## Confusion Matrix and Statistics
##
##
## y.pred DOWN UP
## DOWN 45 6
## UP 17 76
##
## Accuracy : 0.8403
## 95% CI : (0.77, 0.896)
## No Information Rate : 0.5694
## P-Value [Acc > NIR] : 3.453e-12
##
## Kappa : 0.6671
##
## Mcnemar's Test P-Value : 0.03706
##
## Sensitivity : 0.7258
## Specificity : 0.9268
## Pos Pred Value : 0.8824
## Neg Pred Value : 0.8172
## Prevalence : 0.4306
## Detection Rate : 0.3125
## Detection Prevalence : 0.3542
## Balanced Accuracy : 0.8263
##
## 'Positive' Class : DOWN
##
# Number of test rows assigned to each predicted class.
table(y.pred)
## y.pred
## DOWN UP
## 51 93
# Resampled performance across the xgboost tuning grid.
plot(xgb.fit)
#End xgboost segment:
#Earth segment with caret:
# Multivariate adaptive regression splines with caret's default tuning grid;
# predictors are centred and scaled, resampling follows `tc`.
earth.fit <- train(
  as.factor(trend) ~ .,
  data = x.train,
  method = "earth",
  trControl = tc,
  preProcess = c("center", "scale")
)
earth.fit
## Multivariate Adaptive Regression Spline
##
## Pre-processing: centered (12), scaled (12)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 464, 463, 462, 463, 464, 463, ...
## Resampling results across tuning parameters:
##
## nprune ROC Sens Spec
## 2 0.7880275 0.7055948 0.7193939
## 9 0.9377342 0.8461961 0.8830769
## 17 0.9340638 0.8381438 0.8758664
##
## Tuning parameter 'degree' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 9 and degree = 1.
earth.pred <- predict(earth.fit, newdata = x.test)
# confusionMatrix() reads a table as rows = predictions, columns = reference.
# The original passed the true labels as rows, which swaps sensitivity with
# positive predictive value (and specificity with negative predictive value).
confusionMatrix(table(earth.pred, x.test$trend))
## Confusion Matrix and Statistics
##
## earth.pred
## DOWN UP
## DOWN 55 7
## UP 7 75
##
## Accuracy : 0.9028
## 95% CI : (0.8423, 0.9458)
## No Information Rate : 0.5694
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8017
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8871
## Specificity : 0.9146
## Pos Pred Value : 0.8871
## Neg Pred Value : 0.9146
## Prevalence : 0.4306
## Detection Rate : 0.3819
## Detection Prevalence : 0.4306
## Balanced Accuracy : 0.9009
##
## 'Positive' Class : DOWN
##
# Score the held-back tail slice with the tuned MARS model and cross-tabulate
# the predicted direction against the slice's own labels.
slc.pred <- predict(earth.fit, newdata = slc)
table(slc.pred, slc$trend)
##
## slc.pred DOWN
## DOWN 2
## UP 0
# Variable importance, requested without rescaling.
earth.imp <- varImp(earth.fit, scale = FALSE)
earth.imp
## earth variable importance
##
## Overall
## rsi 100.00
## macd 68.85
## diff 49.08
## SMI 35.61
## atr 26.82
## wrp 19.24
# Importance chart, then resampled performance across the tuning grid.
plot(earth.imp)
plot(earth.fit)
#End earth segment
# SVM segment with caret:
# Support vector machine with a radial basis kernel and caret's default
# tuning grid; predictors are centred and scaled, resampling follows `tc`.
svm.fit <- train(
  as.factor(trend) ~ .,
  data = x.train,
  method = "svmRadial",
  trControl = tc,
  preProcess = c("center", "scale")
)
svm.fit
## Support Vector Machines with Radial Basis Function Kernel
##
## Pre-processing: centered (12), scaled (12)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 463, 463, 464, 463, 463, 464, ...
## Resampling results across tuning parameters:
##
## C ROC Sens Spec
## 0.25 0.8866921 0.7448889 0.8466667
## 0.50 0.9022516 0.7582745 0.8660140
## 1.00 0.9137215 0.7702745 0.8802176
##
## Tuning parameter 'sigma' was held constant at a value of 0.1042768
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1042768 and C = 1.
svm.pred <- predict(svm.fit, newdata = x.test)
# confusionMatrix() reads a table as rows = predictions, columns = reference.
# The original passed the true labels as rows, which swaps sensitivity with
# positive predictive value (and specificity with negative predictive value).
confusionMatrix(table(svm.pred, x.test$trend))
## Confusion Matrix and Statistics
##
## svm.pred
## DOWN UP
## DOWN 50 12
## UP 5 77
##
## Accuracy : 0.8819
## 95% CI : (0.8177, 0.9297)
## No Information Rate : 0.6181
## P-Value [Acc > NIR] : 1.485e-12
##
## Kappa : 0.7559
##
## Mcnemar's Test P-Value : 0.1456
##
## Sensitivity : 0.9091
## Specificity : 0.8652
## Pos Pred Value : 0.8065
## Neg Pred Value : 0.9390
## Prevalence : 0.3819
## Detection Rate : 0.3472
## Detection Prevalence : 0.4306
## Balanced Accuracy : 0.8871
##
## 'Positive' Class : DOWN
##
# Score the held-back tail slice with the tuned SVM and cross-tabulate the
# predicted direction against the slice's own labels.
slc.pred <- predict(svm.fit, newdata = slc)
table(slc.pred, slc$trend)
##
## slc.pred DOWN
## DOWN 2
## UP 0
# Resampled performance across the candidate cost values.
plot(svm.fit)
#End SVM segment
#Random forest with caret segment:
# Random forest with caret's default tuning grid; predictors are centred and
# scaled, resampling follows `tc`.
rf.fit <- train(
  as.factor(trend) ~ .,
  data = x.train,
  method = "rf",
  trControl = tc,
  preProcess = c("center", "scale")
)
rf.fit
## Random Forest
##
## Pre-processing: centered (12), scaled (12)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 464, 464, 463, 463, 462, 463, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.9143318 0.8006013 0.8770008
## 7 0.9210346 0.8204967 0.8729915
## 12 0.9187709 0.8323922 0.8729915
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
rf.pred <- predict(rf.fit, newdata = x.test)
# confusionMatrix() reads a table as rows = predictions, columns = reference.
# The original passed the true labels as rows, which swaps sensitivity with
# positive predictive value (and specificity with negative predictive value).
confusionMatrix(table(rf.pred, x.test$trend))
## Confusion Matrix and Statistics
##
## rf.pred
## DOWN UP
## DOWN 47 15
## UP 7 75
##
## Accuracy : 0.8472
## 95% CI : (0.7779, 0.9017)
## No Information Rate : 0.625
## P-Value [Acc > NIR] : 3.758e-09
##
## Kappa : 0.6835
##
## Mcnemar's Test P-Value : 0.1356
##
## Sensitivity : 0.8704
## Specificity : 0.8333
## Pos Pred Value : 0.7581
## Neg Pred Value : 0.9146
## Prevalence : 0.3750
## Detection Rate : 0.3264
## Detection Prevalence : 0.4306
## Balanced Accuracy : 0.8519
##
## 'Positive' Class : DOWN
##
# Score the held-back tail slice with the tuned random forest and
# cross-tabulate the predicted direction against the slice's own labels.
slc.pred <- predict(rf.fit, newdata = slc)
table(slc.pred, slc$trend)
##
## slc.pred DOWN
## DOWN 2
## UP 0
# Variable importance, requested without rescaling, then the importance chart
# and the resampled performance across the candidate mtry values.
rf.imp <- varImp(rf.fit, scale = FALSE)
plot(rf.imp)
plot(rf.fit)
#End random forest segment
#Comparing models
# Collect the resampling results of all five fitted models and compare their
# metric distributions in box-and-whisker plots.
model_perf <- resamples(
  list(
    RF = rf.fit,
    EARTH = earth.fit,
    SVM = svm.fit,
    XGB = xgb.fit,
    KNN = knn.fit
  )
)
bwplot(model_perf)
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.