Stock Market Machine Learning Prediction: Part One

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

require(tidyquant)
require(tidyverse)
require(caret)
# This is the first part of a series on using machine learning methods to predict market direction a given number of days into the future. Here we compare the performance of five algorithms through the caret wrapper: random forest, support vector machine with a radial kernel, extreme gradient boosting, multivariate adaptive regression splines (MARS), and k-nearest neighbours (KNN). We used a medium-sized data set, and the results show test-set accuracies of roughly 80% to 90%. The next post will look at deep learning algorithms such as LSTM.
#Get data & construct some  technical indicators
#' Download prices for a symbol and build a scaled indicator data set
#'
#' Fetches daily prices from three years ago up to tomorrow's date with
#' `tq_get()`, appends a series of technical indicators (ATR, RSI, MACD and
#' its signal gap, SMI, MFI, Williams %R, ADX, CCI, CMO, OBV), labels each
#' row "UP" or "DOWN" from a `period`-row change in the close, removes the
#' symbol and raw price/volume columns, drops rows containing `NA`, and
#' standardises every numeric column with `scale()`.
#'
#' The steps are order-sensitive: several `select()` calls rely on the
#' column names and positions produced by the preceding `tq_mutate()` call.
#'
#' @param stock Ticker symbol handed to `tq_get()`, e.g. "^GSPC".
#' @param period A single whole number >= 1: how many rows (trading days)
#'   apart the two closes used for the label are.
#' @param forward If `FALSE` (the default, and the original behaviour) the
#'   label compares today's close with the close `period` rows EARLIER, so
#'   it describes a move that has already happened. If `TRUE` the label
#'   compares the close `period` rows AHEAD with today's close, which is the
#'   target needed to forecast direction; the last `period` rows then have
#'   no label and are removed together with the other `NA` rows.
#' @return A data frame holding `date`, the indicator columns, `lag.chg`
#'   (scaled like every other numeric column) and the character label
#'   `trend`.
stock.dt <- function(stock, period, forward = FALSE) {
  stopifnot(
    is.numeric(period), length(period) == 1, !is.na(period),
    period >= 1, period %% 1 == 0,
    is.logical(forward), length(forward) == 1, !is.na(forward)
  )

  dx <- stock %>%
    tq_get(get = "stock.prices", from = Sys.Date() - years(3), to = Sys.Date() + 1)
  # Guard against a failed download, where no data frame comes back.
  if (!is.data.frame(dx)) {
    stop("No price data returned for symbol: ", paste(stock, collapse = ", "),
      call. = FALSE
    )
  }

  # NOTE(review): `matype` is lower-case here while every other call spells
  # it `maType`; left untouched because changing it may alter the smoothing
  # that ATR applies. Confirm which was intended.
  dx <- dx %>% tq_mutate(c(high, low, close), mutate_fun = ATR, n = 14, matype = "EMA")
  dx <- dx %>% select(-c(tr, ATR, ATR..1))
  dx <- dx %>% tq_mutate(close, mutate_fun = RSI, n = 14, maType = "EMA")
  dx <- dx %>% tq_mutate(close, mutate_fun = MACD, maType = "EMA", percent = FALSE)
  # Gap between the MACD line and its signal line.
  dx <- dx %>% mutate(diff = macd - signal)
  dx <- dx %>% tq_mutate(c(high, low, close), mutate_fun = SMI, maType = "EMA")
  # MACD already supplied a `signal` column, so the one added by SMI carries
  # the `..1` suffix; only the MACD signal is kept.
  dx <- dx %>% select(-(signal..1))
  dx <- dx %>% mutate(mfi = MFI(dx[, c("high", "low", "close")], dx[, "volume"], n = 14))
  dx <- dx %>%
    tq_mutate(c(high, low, close), mutate_fun = WPR, col_rename = "wrp") %>%
    mutate(wrp = -100 * wrp)
  dx <- dx %>% tq_mutate(c(high, low, close), mutate_fun = ADX, maType = "EMA")
  dx <- dx %>% tq_mutate(c(high, low, close), mutate_fun = CCI, n = 14, maType = "EMA")
  dx <- dx %>% tq_mutate(close, mutate_fun = CMO, n = 14)
  dx <- dx %>% tq_mutate_xy(close, volume, mutate_fun = OBV)
  # Drop the positional range DIp..DX added by ADX; the ADX column stays.
  dx <- dx %>% select(-(DIp:DX))

  # Price change that defines the label (see `forward` above). dplyr's
  # lag()/lead() are named explicitly so stats::lag() can never be picked up.
  dx <- if (forward) {
    dx %>% mutate(lag.chg = dplyr::lead(close, period) - close)
  } else {
    dx %>% mutate(lag.chg = close - dplyr::lag(close, period))
  }
  dx <- dx %>% mutate(trend = ifelse(lag.chg > 0, "UP", "DOWN"))
  dx <- dx %>% select(-c(symbol, open:adjusted))

  # Remove warm-up rows with NA, then standardise. scale() returns a
  # one-column matrix, so it is flattened back to a plain numeric vector.
  # NOTE(review): scaling uses the whole series, i.e. it happens before any
  # train/test split performed by the caller.
  dx <- na.omit(dx) %>%
    mutate(across(where(is.numeric), function(col) as.numeric(scale(col))))
  dx
}
# Application to the S&P 500 index
stock <- "^GSPC"
period <- 2 # number of trading days used for the label in stock.dt()
# Pass the variables defined above instead of repeating the literals, so
# changing `stock` or `period` actually changes the data set.
df <- stock.dt(stock, period)
# Drop `date`, and drop lag.chg because the trend label is derived from it
df <- df %>% select(-c(date, lag.chg))
# Keep the last `period` rows as a small slice for spot-check predictions.
# NOTE(review): these rows stay in `df`, so they also take part in the
# train/test split further down.
slc <- df %>% slice_tail(n = period)
# Pearson correlation between the predictors. The label is excluded by name
# rather than by column position, so adding or removing an indicator in
# stock.dt() cannot silently drop the wrong column.
dkor <- cor(df %>% select(-trend), method = "pearson", use = "complete.obs")
corrplot::corrplot(dkor, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

#End of ^GSPC indicators segment
# Split data + trainControl
# Fix the RNG so the partition and the CV folds are reproducible.
set.seed(2023)
# 80/20 split on the label.
# NOTE(review): rows are sampled at random, so test rows are interleaved in
# time with training rows; consider a chronological split for time series.
ind <- createDataPartition(df$trend, p = 0.8, list = FALSE)
x.train <- df[ind, ]
x.test <- df[-ind, ]
# Build the 5-fold x 3-repeat training indices once and hand them to
# trainControl(), so every model is resampled on exactly the same folds and
# the resamples() comparison at the end of the script is paired.
cv.index <- createMultiFolds(factor(x.train$trend), k = 5, times = 3)
tc <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  index = cv.index,
  classProbs = TRUE,
  savePredictions = TRUE,
  summaryFunction = twoClassSummary,
  allowParallel = TRUE,
  returnData = FALSE,
  verboseIter = FALSE
)
#End data split
# k-nearest neighbours through caret: predictors are centred and scaled,
# and 20 candidate values of k are evaluated with the shared trainControl.
knn.fit <- train(
  as.factor(trend) ~ .,
  data = x.train,
  method = "knn",
  trControl = tc,
  preProcess = c("center", "scale"),
  tuneLength = 20
)
knn.fit

## k-Nearest Neighbors 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 462, 463, 463, 464, 464, 464, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec     
##    5  0.8170122  0.6772026  0.8099611
##    7  0.8290312  0.6785098  0.8150583
##    9  0.8348863  0.6798693  0.8283450
##   11  0.8408546  0.6705621  0.8354468
##   13  0.8430866  0.6625882  0.8455633
##   15  0.8474199  0.6812026  0.8527273
##   17  0.8498542  0.6759477  0.8557576
##   19  0.8487667  0.6625882  0.8669930
##   21  0.8491473  0.6626144  0.8649262
##   23  0.8480886  0.6612288  0.8638850
##   25  0.8433967  0.6625882  0.8628749
##   27  0.8433774  0.6493333  0.8638850
##   29  0.8444201  0.6440000  0.8629060
##   31  0.8465326  0.6546667  0.8669930
##   33  0.8440754  0.6506667  0.8578399
##   35  0.8456907  0.6506667  0.8558353
##   37  0.8453190  0.6413856  0.8599068
##   39  0.8424080  0.6387451  0.8548252
##   41  0.8416741  0.6387712  0.8538306
##   43  0.8421302  0.6441046  0.8467599
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 17.

# Predict the held-out rows and score them.
# confusionMatrix() expects predictions as `data` and the true labels as
# `reference`; passing table(truth, prediction) transposes the matrix and
# swaps sensitivity/specificity and the predictive values.
knn.pred <- predict(knn.fit, newdata = x.test)
confusionMatrix(
  data = knn.pred,
  reference = factor(x.test$trend, levels = levels(knn.pred))
)

## Confusion Matrix and Statistics
## 
##       knn.pred
##        DOWN UP
##   DOWN   44 18
##   UP     11 71
##                                           
##                Accuracy : 0.7986          
##                  95% CI : (0.7237, 0.8608)
##     No Information Rate : 0.6181          
##     P-Value [Acc > NIR] : 2.484e-06       
##                                           
##                   Kappa : 0.5836          
##                                           
##  Mcnemar's Test P-Value : 0.2652          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.7978          
##          Pos Pred Value : 0.7097          
##          Neg Pred Value : 0.8659          
##              Prevalence : 0.3819          
##          Detection Rate : 0.3056          
##    Detection Prevalence : 0.4306          
##       Balanced Accuracy : 0.7989          
##                                           
##        'Positive' Class : DOWN            
##

# Spot-check the KNN model on the rows kept in `slc`
slc.pred <- predict(object = knn.fit, newdata = slc)
table(slc.pred, slc$trend)

##         
## slc.pred DOWN
##     DOWN    2
##     UP      0

# Plot the tuning results stored in knn.fit
plot(knn.fit)

#End knn segment
require(xgboost)
# xgboost with caret segment:
# Tuning grid: nrounds, max_depth and colsample_bytree vary; eta, gamma,
# min_child_weight and subsample are held at a single value.
xgbgrid <- expand.grid(
  nrounds = c(100, 200),
  max_depth = c(10, 15, 20, 25),
  eta = .05,
  gamma = .01,
  colsample_bytree = seq(.5, .9, length.out = 5),
  min_child_weight = 1,
  subsample = 1
)
# Predictor matrices (label removed) wrapped as xgb.DMatrix objects.
xgb.train <- xgb.DMatrix(data = as.matrix(x.train %>% select(-trend)))
xgb.test <- xgb.DMatrix(data = as.matrix(x.test %>% select(-trend)))
xgb.fit <- train(
  xgb.train,
  as.factor(x.train$trend),
  trControl = tc,
  tuneGrid = xgbgrid,
  method = "xgbTree"
)
xgb.fit

## eXtreme Gradient Boosting 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 463, 464, 463, 463, 463, 463, ... 
## Resampling results across tuning parameters:
## 
##   max_depth  colsample_bytree  nrounds  ROC        Sens       Spec     
##   10         0.5               100      0.9270642  0.8099869  0.8768143
##   10         0.5               200      0.9288019  0.8166275  0.8656410
##   10         0.6               100      0.9302230  0.8312418  0.8768609
##   10         0.6               200      0.9307707  0.8286275  0.8667133
##   10         0.7               100      0.9313590  0.8311895  0.8727584
##   10         0.7               200      0.9308033  0.8299085  0.8636364
##   10         0.8               100      0.9341013  0.8405229  0.8757887
##   10         0.8               200      0.9323791  0.8365752  0.8584926
##   10         0.9               100      0.9334167  0.8351895  0.8666667
##   10         0.9               200      0.9320606  0.8338562  0.8636053
##   15         0.5               100      0.9277049  0.8206536  0.8687024
##   15         0.5               200      0.9284060  0.8219869  0.8687490
##   15         0.6               100      0.9280260  0.8179346  0.8727739
##   15         0.6               200      0.9298264  0.8232680  0.8666822
##   15         0.7               100      0.9310358  0.8378039  0.8676457
##   15         0.7               200      0.9309899  0.8338562  0.8575291
##   15         0.8               100      0.9327115  0.8378039  0.8737840
##   15         0.8               200      0.9320326  0.8391895  0.8646309
##   15         0.9               100      0.9336894  0.8392157  0.8697280
##   15         0.9               200      0.9330972  0.8352418  0.8625796
##   20         0.5               100      0.9267130  0.8179346  0.8676457
##   20         0.5               200      0.9286623  0.8165752  0.8656721
##   20         0.6               100      0.9304764  0.8245752  0.8727584
##   20         0.6               200      0.9315080  0.8286013  0.8687179
##   20         0.7               100      0.9330737  0.8338301  0.8748252
##   20         0.7               200      0.9319828  0.8378039  0.8656566
##   20         0.8               100      0.9335343  0.8404967  0.8778710
##   20         0.8               200      0.9325366  0.8404967  0.8584926
##   20         0.9               100      0.9327703  0.8458301  0.8687335
##   20         0.9               200      0.9317723  0.8365229  0.8636208
##   25         0.5               100      0.9270714  0.8180131  0.8768454
##   25         0.5               200      0.9275133  0.8219608  0.8687179
##   25         0.6               100      0.9318617  0.8272680  0.8768143
##   25         0.6               200      0.9313691  0.8286013  0.8717483
##   25         0.7               100      0.9320185  0.8364706  0.8677078
##   25         0.7               200      0.9321826  0.8365229  0.8717793
##   25         0.8               100      0.9327945  0.8458301  0.8737374
##   25         0.8               200      0.9321811  0.8392418  0.8656099
##   25         0.9               100      0.9325222  0.8365490  0.8748096
##   25         0.9               200      0.9320278  0.8339608  0.8646620
## 
## Tuning parameter 'eta' was held constant at a value of 0.05
## Tuning
##  parameter 'min_child_weight' was held constant at a value of 1
## 
## Tuning parameter 'subsample' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 100, max_depth = 10, eta
##  = 0.05, gamma = 0.01, colsample_bytree = 0.8, min_child_weight = 1
##  and subsample = 1.

# Score the test set with the underlying booster rather than through caret.
# NOTE(review): the 0.5 cut-off assumes the raw prediction is the
# probability of "DOWN" (the first factor level) - confirm against caret's
# xgbTree model definition.
xgb.pred <- predict(xgb.fit$finalModel, newdata = xgb.test)
# Declare both levels up front: building the factor from the predictions
# alone and then calling relevel(., "DOWN") errors whenever every prediction
# falls in the same class.
y.pred <- factor(ifelse(xgb.pred > .5, "DOWN", "UP"), levels = c("DOWN", "UP"))
# Predictions are `data`, true labels are `reference`.
confusionMatrix(
  data = y.pred,
  reference = factor(x.test$trend, levels = levels(y.pred))
)

## Confusion Matrix and Statistics
## 
##       
## y.pred DOWN UP
##   DOWN   45  6
##   UP     17 76
##                                        
##                Accuracy : 0.8403       
##                  95% CI : (0.77, 0.896)
##     No Information Rate : 0.5694       
##     P-Value [Acc > NIR] : 3.453e-12    
##                                        
##                   Kappa : 0.6671       
##                                        
##  Mcnemar's Test P-Value : 0.03706      
##                                        
##             Sensitivity : 0.7258       
##             Specificity : 0.9268       
##          Pos Pred Value : 0.8824       
##          Neg Pred Value : 0.8172       
##              Prevalence : 0.4306       
##          Detection Rate : 0.3125       
##    Detection Prevalence : 0.3542       
##       Balanced Accuracy : 0.8263       
##                                        
##        'Positive' Class : DOWN         
##

# Number of test rows predicted in each class
table(y.pred)

## y.pred
## DOWN   UP 
##   51   93

# Plot the tuning results stored in xgb.fit
plot(xgb.fit)

#End xgboost segment:
# Earth (multivariate adaptive regression splines) segment with caret:
# predictors are centred and scaled; caret's default tuning grid is used.
earth.fit <- train(
  as.factor(trend) ~ .,
  data = x.train,
  method = "earth",
  trControl = tc,
  preProcess = c("center", "scale")
)
earth.fit

## Multivariate Adaptive Regression Spline 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 464, 463, 462, 463, 464, 463, ... 
## Resampling results across tuning parameters:
## 
##   nprune  ROC        Sens       Spec     
##    2      0.7880275  0.7055948  0.7193939
##    9      0.9377342  0.8461961  0.8830769
##   17      0.9340638  0.8381438  0.8758664
## 
## Tuning parameter 'degree' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 9 and degree = 1.

# Predict the held-out rows and score them.
# confusionMatrix() expects predictions as `data` and the true labels as
# `reference`; passing table(truth, prediction) transposes the matrix and
# swaps sensitivity/specificity and the predictive values.
earth.pred <- predict(earth.fit, newdata = x.test)
confusionMatrix(
  data = earth.pred,
  reference = factor(x.test$trend, levels = levels(earth.pred))
)

## Confusion Matrix and Statistics
## 
##       earth.pred
##        DOWN UP
##   DOWN   55  7
##   UP      7 75
##                                           
##                Accuracy : 0.9028          
##                  95% CI : (0.8423, 0.9458)
##     No Information Rate : 0.5694          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8017          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8871          
##             Specificity : 0.9146          
##          Pos Pred Value : 0.8871          
##          Neg Pred Value : 0.9146          
##              Prevalence : 0.4306          
##          Detection Rate : 0.3819          
##    Detection Prevalence : 0.4306          
##       Balanced Accuracy : 0.9009          
##                                           
##        'Positive' Class : DOWN            
##

# Spot-check the earth model on the rows kept in `slc`
slc.pred <- predict(object = earth.fit, newdata = slc)
table(slc.pred, slc$trend)

##         
## slc.pred DOWN
##     DOWN    2
##     UP      0

# Variable importance for the earth model, requested with scale = FALSE.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
earth.imp <- varImp(earth.fit, scale = FALSE)
earth.imp

## earth variable importance
## 
##      Overall
## rsi   100.00
## macd   68.85
## diff   49.08
## SMI    35.61
## atr    26.82
## wrp    19.24

plot(earth.imp)

# Plot the tuning results stored in earth.fit
plot(earth.fit)

#End earth segment
# SVM segment with caret: radial-kernel support vector machine on centred
# and scaled predictors, tuned over caret's default grid.
svm.fit <- train(
  as.factor(trend) ~ .,
  data = x.train,
  method = "svmRadial",
  trControl = tc,
  preProcess = c("center", "scale")
)
svm.fit

## Support Vector Machines with Radial Basis Function Kernel 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 463, 463, 464, 463, 463, 464, ... 
## Resampling results across tuning parameters:
## 
##   C     ROC        Sens       Spec     
##   0.25  0.8866921  0.7448889  0.8466667
##   0.50  0.9022516  0.7582745  0.8660140
##   1.00  0.9137215  0.7702745  0.8802176
## 
## Tuning parameter 'sigma' was held constant at a value of 0.1042768
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1042768 and C = 1.

# Predict the held-out rows and score them.
# confusionMatrix() expects predictions as `data` and the true labels as
# `reference`; passing table(truth, prediction) transposes the matrix and
# swaps sensitivity/specificity and the predictive values.
svm.pred <- predict(svm.fit, newdata = x.test)
confusionMatrix(
  data = svm.pred,
  reference = factor(x.test$trend, levels = levels(svm.pred))
)

## Confusion Matrix and Statistics
## 
##       svm.pred
##        DOWN UP
##   DOWN   50 12
##   UP      5 77
##                                           
##                Accuracy : 0.8819          
##                  95% CI : (0.8177, 0.9297)
##     No Information Rate : 0.6181          
##     P-Value [Acc > NIR] : 1.485e-12       
##                                           
##                   Kappa : 0.7559          
##                                           
##  Mcnemar's Test P-Value : 0.1456          
##                                           
##             Sensitivity : 0.9091          
##             Specificity : 0.8652          
##          Pos Pred Value : 0.8065          
##          Neg Pred Value : 0.9390          
##              Prevalence : 0.3819          
##          Detection Rate : 0.3472          
##    Detection Prevalence : 0.4306          
##       Balanced Accuracy : 0.8871          
##                                           
##        'Positive' Class : DOWN            
##

# Spot-check the SVM model on the rows kept in `slc`
slc.pred <- predict(object = svm.fit, newdata = slc)
table(slc.pred, slc$trend)

##         
## slc.pred DOWN
##     DOWN    2
##     UP      0

# Plot the tuning results stored in svm.fit
plot(svm.fit)

#End SVM segment
# Random forest with caret segment: predictors are centred and scaled;
# caret's default tuning grid is used.
rf.fit <- train(
  as.factor(trend) ~ .,
  data = x.train,
  method = "rf",
  trControl = tc,
  preProcess = c("center", "scale")
)
rf.fit

## Random Forest 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 464, 464, 463, 463, 462, 463, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.9143318  0.8006013  0.8770008
##    7    0.9210346  0.8204967  0.8729915
##   12    0.9187709  0.8323922  0.8729915
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

# Predict the held-out rows and score them.
# confusionMatrix() expects predictions as `data` and the true labels as
# `reference`; passing table(truth, prediction) transposes the matrix and
# swaps sensitivity/specificity and the predictive values.
rf.pred <- predict(rf.fit, newdata = x.test)
confusionMatrix(
  data = rf.pred,
  reference = factor(x.test$trend, levels = levels(rf.pred))
)

## Confusion Matrix and Statistics
## 
##       rf.pred
##        DOWN UP
##   DOWN   47 15
##   UP      7 75
##                                           
##                Accuracy : 0.8472          
##                  95% CI : (0.7779, 0.9017)
##     No Information Rate : 0.625           
##     P-Value [Acc > NIR] : 3.758e-09       
##                                           
##                   Kappa : 0.6835          
##                                           
##  Mcnemar's Test P-Value : 0.1356          
##                                           
##             Sensitivity : 0.8704          
##             Specificity : 0.8333          
##          Pos Pred Value : 0.7581          
##          Neg Pred Value : 0.9146          
##              Prevalence : 0.3750          
##          Detection Rate : 0.3264          
##    Detection Prevalence : 0.4306          
##       Balanced Accuracy : 0.8519          
##                                           
##        'Positive' Class : DOWN            
##

# Spot-check the random forest on the rows kept in `slc`
slc.pred <- predict(object = rf.fit, newdata = slc)
table(slc.pred, slc$trend)

##         
## slc.pred DOWN
##     DOWN    2
##     UP      0

# Variable importance for the random forest, requested with scale = FALSE.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
rf.imp <- varImp(rf.fit, scale = FALSE)
plot(rf.imp)

# Plot the tuning results stored in rf.fit
plot(rf.fit)

#End random forest segment
# Comparing models: gather the resampling results of the five fits and
# draw them side by side as box-and-whisker plots.
model_perf <- resamples(
  list(
    RF = rf.fit,
    EARTH = earth.fit,
    SVM = svm.fit,
    XGB = xgb.fit,
    KNN = knn.fit
  )
)
bwplot(model_perf)

Stock Market Machine Learning Prediction: Part One

ABA

2023-08-09

R Markdown

Including Plots