Long/short trade-signal classification for the closing period of trading: the Santander case, using Cubist

# Demo helper: emits one ordinary message and two package-startup messages
# (with a one-second pause in between) so the effect of
# suppressPackageStartupMessages() can be observed.
testit <- function() {
  message("testing package startup messages")
  packageStartupMessage("initializing ...", appendLF = FALSE)
  Sys.sleep(time = 1)
  packageStartupMessage(" done", appendLF = TRUE)
}

# Only the plain message() survives; the startup messages are muffled.
suppressPackageStartupMessages(expr = testit())
## testing package startup messages
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
# Load the Santander tick data without changing the session's working
# directory (setwd() in a script silently affects every later relative path).
# load() returns the names of the restored objects; the script relies on one
# of them being called `h05n`, so fail early with a clear error if it is not.
DATAITX <- load(path.expand(file.path("~", "git", "SAN_270412.RData")))
stopifnot("h05n" %in% DATAITX)

SAN <- h05n

head(SAN)
##                      hora precio volumen broker.comprador broker.vendedor
## 1 2012-04-27 09:00:00.000   4.59      60             9838            8830
## 2 2012-04-27 09:00:00.000   4.59      33             9816            8830
## 3 2012-04-27 09:00:00.000   4.59     100             9838            8830
## 4 2012-04-27 09:00:00.000   4.59   13300             9838            8830
## 5 2012-04-27 09:00:00.000   4.59     200             9843            8830
## 6 2012-04-27 09:00:00.000   4.59     680             9838            8830

Converting the tick data into OHLCV format

# Split each "YYYY-MM-DD HH:MM:SS.mmm" stamp into its date and time parts.
# (The temporary is no longer called `list`, which shadowed base::list.)
hora_parts <- strsplit(as.character(SAN$hora), " ", fixed = TRUE)

library("plyr")
SANhft <- ldply(hora_parts)

SANHFT <- cbind(SANhft[, 1:2], SAN[, 2:3])
colnames(SANHFT) <- c("Date", "Time", "Price", "Volume")

# Index the ticks by their full timestamp. Parsing the time-of-day alone makes
# as.POSIXct() fill in the date on which the script is run, so the bars ended
# up stamped with the wrong day. "%S" keeps whole-second resolution, as before.
tick_time <- as.POSIXct(
  paste(SANHFT[, 1], SANHFT[, 2]),
  format = "%Y-%m-%d %H:%M:%S"
)

SANxts <- as.xts(SANHFT[, 3:4], order.by = tick_time)
Santander <- to.minutes(SANxts)
chartSeries(Santander)

plot of chunk unnamed-chunk-3

Data aggregation for the HFT (high-frequency) time series

# Count the number of ticks (rows of SAN) falling in each one-minute bin.
# NOTE(review): the format string contains a literal ".000"; stamps with a
# different fractional part would presumably fail to parse - confirm.
X <- strptime(SAN$hora, format = "%Y-%m-%d %H:%M:%S.000")

time <- cut(X, breaks = "1 min")
timestamp <- table(time)

# Drop ten hard-coded one-minute bins by position, then restore the
# one-column matrix shape that the row subsetting collapses.
# NOTE(review): presumably these are bins that have no matching bar in
# `Santander` - TODO confirm against the data.
z <- as.matrix(timestamp)
z <- z[-c(312:315, 506, 511:515), ]
Morders <- as.matrix(z)

Adding indicators as independent variables for the train/test set

# Technical indicators on the one-minute bars. `Santander` is the output of
# to.minutes(): columns 1-5 are Open, High, Low, Close, Volume.
#
# TTR's ATR() and ADX() read their input positionally as High, Low, Close.
# Passing the whole OHLCV object made them use Open/High/Low instead, so they
# now receive columns 2:4, like SMI() and BBands() below.
# NOTE(review): any other window whose indicators feed the same analysis must
# be computed from the same columns for the features to stay comparable.

## Average True Range indicator
ATRindicator <- ATR(Santander[, 2:4], n = 1)

## Welles Wilder's Directional Movement Index
ADXIndicator <- ADX(Santander[, 2:4], n = 2)

## Aroon indicator (High/Low)
AroonIndicator <- aroon(Santander[, 2:3], n = 1)

## Stochastic Momentum Index
SMIindicator <- SMI(
  Santander[, 2:4],
  n = 2, nFast = 2, nSlow = 2, nSig = 2,
  maType = SMA,
  bounded = TRUE
)

## Bollinger Bands
BBandIndicator <- BBands(Santander[, 2:4])

## EMA of the close
EMAIndicator <- EMA(Santander[, 4], n = 1)

## MACD of the close
MACDindicator <- MACD(Santander[, 4])

## Parabolic SAR (High/Low)
sarindicator <- SAR(Santander[, 2:3], accel = c(0.02, 0.2))

## Rate of change
# "continuous" is what the previous two-element `type` vector resolved to.
# NOTE(review): column 5 is Volume, not a price - confirm this is intended.
roc <- ROC(Santander[, 5], n = 1, type = "continuous", na.pad = TRUE)

## Relative strength index
# NOTE(review): also computed on column 5 (Volume) - confirm this is intended.
rsi <- RSI(Santander[, 5], n = 1, maType = "WMA")


CombData <- cbind(
  Santander, ATRindicator, ADXIndicator, AroonIndicator, SMIindicator,
  BBandIndicator, EMAIndicator, MACDindicator, sarindicator, roc, rsi
)

# Data-frame copies of the indicator table and of the per-minute order counts.
CombdataM <- as.data.frame(CombData)
MordersM <- as.data.frame(Morders)

# Join bars 36..507 with order-count rows 35..506.
# NOTE(review): the two row ranges are offset by one - confirm that pairing
# each bar with the previous order-count row is intended.
RealCombdata <- cbind(CombData[36:507, ], MordersM[35:506, ])

SANCOMBD <- data.frame(
  date = index(RealCombdata),
  RealCombdata,
  row.names = NULL
)

colnames(SANCOMBD) <- c(
  "Date", "OpenPrice", "HighPrice", "LowPrice", "ClosePrice", "Volume",
  "TR", "ATR", "trueHigh", "trueLow",
  "+dirInd", "-dirInd", "dirind", "ADX",
  "Aroonup", "AroonDown", "AroonOscillator",
  "SMI", "SMISignal",
  "dnBB", "MaAvg", "UpBB", "pctB",
  "EMA", "MACD", "MACDsignal", "SAR", "ROC", "RSI",
  "MLimitorders"
)

## Plot the per-minute order counts (MLimitorders) against time

p <- ggplot(data = SANCOMBD, mapping = aes(Date, MLimitorders))

p + geom_line()

plot of chunk unnamed-chunk-5

Preprocessing of the data using hierarchical clustering and a correlation-coefficient cut-off

# Remove columns 1 and 5 of SANCOMBD (named "Date" and "ClosePrice" above) so
# that only candidate predictors enter the correlation matrix.
Filtereddata <- SANCOMBD[, -c(1, 5)]

library(corrplot)
correlations <- cor(x = Filtereddata)

### Before selection of variables: correlation plot, ordered by hclust

corrplot(corr = correlations, order = "hclust")

plot of chunk unnamed-chunk-6

# Indices of columns flagged by caret::findCorrelation() at the 0.70 cut-off.
highCorr <- findCorrelation(correlations, cutoff = 0.70)
length(highCorr)
## [1] 15
nearZeroVar(Filtereddata)
## integer(0)
head(highCorr)
## [1] 25 19 20 22  2 18

# Negative indexing with an empty vector selects *no* columns, so when nothing
# is flagged the data must be kept as is. drop = FALSE keeps a data frame even
# if a single column survives.
filteredCombdata <- if (length(highCorr) > 0) {
  Filtereddata[, -highCorr, drop = FALSE]
} else {
  Filtereddata
}

corMatfterFiler <- cor(filteredCombdata)


### After the highly correlated independent variables are removed
corrplot(corMatfterFiler, order = "hclust")

plot of chunk unnamed-chunk-6

Dividing dataset for in sample and out of sample

# In-sample window: bars 1..452 feed the indicator calculations; the volume
# (column 5) and close (column 4) series start at bar 34.
FirstInsampleSet <- Santander[seq_len(452), ]
FirstInsampleSetVol <- Santander[seq.int(34, 452), 5]
FirstInsampleSetCprice <- Santander[seq.int(34, 452), 4]

Calculating indicators for the period from 9:30 to 4:35 pm

# Redraw the active quantmod chart restricted to the subset "first 6 hours".
reChart(subset = "first 6 hours")

plot of chunk unnamed-chunk-8

TA indicators

# Technical indicators for the in-sample window (columns of FirstInsampleSet:
# 1 Open, 2 High, 3 Low, 4 Close, 5 Volume).
#
# TTR's ATR() and ADX() read their input positionally as High, Low, Close.
# Passing the whole OHLCV object made them use Open/High/Low instead, so they
# now receive columns 2:4, like SMI() and BBands() below.
# NOTE(review): the data the fitted model is later scored on must derive its
# indicators from the same columns, otherwise the features are not comparable.
ATRindicatorIS <- ATR(FirstInsampleSet[, 2:4], n = 1)

ADXIndicatorIS <- ADX(FirstInsampleSet[, 2:4], n = 2)

AroonIndicatorIS <- aroon(FirstInsampleSet[, 2:3], n = 1)

SMIindicatorIS <- SMI(
  FirstInsampleSet[, 2:4],
  n = 2, nFast = 2, nSlow = 2, nSig = 2,
  maType = SMA,
  bounded = TRUE
)

BBandIndicatorIS <- BBands(FirstInsampleSet[, 2:4])

MACDindicatorIS <- MACD(FirstInsampleSet[, 4])

# NOTE(review): column 5 is Volume, not a price - confirm this is intended.
rsiIS <- RSI(FirstInsampleSet[, 5], n = 1, maType = "WMA")


InSampleInput <- cbind(
  ATRindicatorIS, ADXIndicatorIS, AroonIndicatorIS, SMIindicatorIS,
  BBandIndicatorIS, MACDindicatorIS, rsiIS
)

InSampleInputData <- data.frame(
  date = index(InSampleInput), InSampleInput, row.names = NULL
)

# Keep bars 34..452, the same rows as the in-sample close/volume series.
InSampleInputDataM <- InSampleInputData[34:452, ]

# Positional drop; eleven columns remain, matching the eleven indicator names
# assigned further down (TrueRange ... Rsi).
InSampleInputDataM <- InSampleInputDataM[, -c(1, 3, 5, 8, 12, 13, 15, 16, 17, 19)]

adding volume and matching limit orders variable

# Build the in-sample modelling table: the close price of each bar next to the
# one-bar-lagged values of volume, the retained indicators and the per-minute
# order count.
#
# Every predictor is lagged in a single pass. The previous version repeated an
# extract/Lag pair per column and, as a side effect, bound data to the names
# `ADX` and `RSI`, shadowing the TTR functions of the same name.
feature_names <- c(
  "volume", "TrueRange", "TrueHigh", "PDirInd", "NDirInd", "ADX", "Aroonup",
  "AroonDown", "SMIsignal", "pctB", "MACDSignal", "Rsi", "MlimitOrders"
)

FirstInsampleSetVolM <- data.frame(
  date = index(FirstInsampleSetVol), FirstInsampleSetVol, row.names = NULL
)

lagdataFS <- cbind(FirstInsampleSetVolM, InSampleInputDataM, MordersM[34:452, ])

# Drop the leading date column; thirteen predictor columns remain.
lagdataFSM <- lagdataFS[, -c(1)]
colnames(lagdataFSM) <- feature_names

# quantmod::Lag() shifts each series down by one row, so the first row is NA.
InsamplelaggedData <- as.data.frame(
  lapply(lagdataFSM, function(feature) {
    as.numeric(Lag(as.numeric(feature), k = 1))
  })
)
colnames(InsamplelaggedData) <- feature_names

FirstInsampleSetCpriceM <- as.matrix(FirstInsampleSetCprice)

lagInputDATAInSa <- cbind(FirstInsampleSetCpriceM, InsamplelaggedData)
colnames(lagInputDATAInSa) <- c("Cprice", feature_names)

head(lagInputDATAInSa)
##                     Cprice volume TrueRange TrueHigh  PDirInd   NDirInd
## 2015-01-09 09:33:59  4.652     NA        NA       NA       NA        NA
## 2015-01-09 09:34:58  4.653 163499     0.020    4.648 80.85587 7.1345550
## 2015-01-09 09:35:45  4.655  79453     0.005    4.650 69.36511 5.1279537
## 2015-01-09 09:36:57  4.660  41362     0.007    4.654 63.98047 2.8687805
## 2015-01-09 09:37:59  4.650 119249     0.004    4.655 50.92653 1.9080712
## 2015-01-09 09:38:57  4.635 102763     0.016    4.659 32.04703 0.5186282
##                          ADX Aroonup AroonDown SMIsignal      pctB
## 2015-01-09 09:33:59       NA      NA        NA        NA        NA
## 2015-01-09 09:34:58 68.43514     100         0  48.23377 0.9632944
## 2015-01-09 09:35:45 77.33377     100         0  69.23404 0.9159944
## 2015-01-09 09:36:57 84.37547     100         0  64.65340 0.8988301
## 2015-01-09 09:37:59 88.57633     100       100  56.26551 0.8528020
## 2015-01-09 09:38:57 92.69560       0         0  56.62393 0.8000342
##                     MACDSignal Rsi MlimitOrders
## 2015-01-09 09:33:59         NA  NA           NA
## 2015-01-09 09:34:58  0.2499475   0           98
## 2015-01-09 09:35:45  0.2627466   0           52
## 2015-01-09 09:36:57  0.2756341   0           29
## 2015-01-09 09:37:59  0.2889856 100           80
## 2015-01-09 09:38:57  0.2978238   0           71

Time series cross-validation

# Resampling scheme handed to caret::train(): "timeslice" with a fixed
# 380-row training window and a 38-row horizon.
myTimeControl <- trainControl(
  method = "timeslice",
  initialWindow = 380,
  horizon = 38,
  fixedWindow = TRUE
)

Insample training and testing with cubist method

# Tuning grid: 4 committee sizes x 4 neighbour counts.
CubistGrid <- expand.grid(
  .committees = c(1, 10, 50, 100),
  .neighbors = c(0, 1, 5, 9)
)

library(Cubist)
set.seed(100)

# The lagged predictors are NA in the first row. train()'s formula interface
# defaults to na.action = na.fail, so request the row drop explicitly; this
# matches the recorded run, where Cubist read 418 of the 419 rows.
CbModel <- train(
  Cprice ~ volume + TrueRange + TrueHigh + PDirInd + NDirInd + ADX + Aroonup +
    AroonDown + SMIsignal + pctB + MACDSignal + Rsi + MlimitOrders,
  data = lagInputDATAInSa,
  method = "cubist",
  tuneGrid = CubistGrid,
  trControl = myTimeControl,
  na.action = na.omit
)

CbModel
## Cubist 
## 
## 419 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Rolling Forecasting Origin Resampling (38 held-out with a fixed window) 
## 
## Summary of sample sizes: 380 
## 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE        Rsquared 
##     1         0          0.01104401  0.5957053
##     1         1          0.01283212  0.4801673
##     1         5          0.01153085  0.5913831
##     1         9          0.01098597  0.6149879
##    10         0          0.01086826  0.5968469
##    10         1          0.01277495  0.4818908
##    10         5          0.01159198  0.5869536
##    10         9          0.01092600  0.6159503
##    50         0          0.01111725  0.5954274
##    50         1          0.01285443  0.4843449
##    50         5          0.01175726  0.5872429
##    50         9          0.01111485  0.6150267
##   100         0          0.01114953  0.5952398
##   100         1          0.01286488  0.4846352
##   100         5          0.01177832  0.5872675
##   100         9          0.01113896  0.6148999
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were committees = 10 and neighbors = 0.
# Print the summary of the fitted model; the recorded output below lists the
# committee models' rules and the attribute usage.
summary(CbModel)
## 
## Call:
## cubist.default(x = x, y = y, committees = param$committees)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Fri Jan  9 16:32:26 2015
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## Read 418 cases (14 attributes) from undefined.data
## 
## Model 1:
## 
##   Rule 1/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0072]
## 
##  outcome = 0.1581 + 0.966 TrueHigh + 0.01 pctB - 0.018 MACDSignal
## 
## Model 2:
## 
##   Rule 2/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0074]
## 
##  outcome = 0.1341 + 0.972 TrueHigh + 0.011 pctB - 0.00011 PDirInd
## 
## Model 3:
## 
##   Rule 3/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0079]
## 
##  outcome = 0.2074 + 0.955 TrueHigh - 0.037 MACDSignal + 0.011 pctB
##            + 6e-05 Aroonup
## 
## Model 4:
## 
##   Rule 4/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0075]
## 
##  outcome = 0.1351 + 0.972 TrueHigh + 0.012 pctB - 0.00014 PDirInd
## 
## Model 5:
## 
##   Rule 5/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0078]
## 
##  outcome = 0.2284 + 0.951 TrueHigh - 0.043 MACDSignal + 0.014 pctB
## 
## Model 6:
## 
##   Rule 6/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0075]
## 
##  outcome = 0.1351 + 0.972 TrueHigh + 0.012 pctB - 0.00014 PDirInd
## 
## Model 7:
## 
##   Rule 7/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0078]
## 
##  outcome = 0.2284 + 0.951 TrueHigh - 0.043 MACDSignal + 0.014 pctB
## 
## Model 8:
## 
##   Rule 8/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0075]
## 
##  outcome = 0.1351 + 0.972 TrueHigh + 0.012 pctB - 0.00014 PDirInd
## 
## Model 9:
## 
##   Rule 9/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0078]
## 
##  outcome = 0.2284 + 0.951 TrueHigh - 0.043 MACDSignal + 0.014 pctB
## 
## Model 10:
## 
##   Rule 10/1: [418 cases, mean 4.7701, range 4.605 to 4.874, est err 0.0075]
## 
##  outcome = 0.1351 + 0.972 TrueHigh + 0.012 pctB - 0.00014 PDirInd
## 
## 
## Evaluation on training data (418 cases):
## 
##     Average  |error|             0.0090
##     Relative |error|               0.18
##     Correlation coefficient        0.98
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##           100%    TrueHigh
##           100%    pctB
##            50%    PDirInd
##            50%    MACDSignal
##            10%    Aroonup
## 
## 
## Time: 0.3 secs
# Draw the plot method's figure for the fitted train object.
plot(CbModel)

plot of chunk unnamed-chunk-12

Preparation of the out-of-sample data from 4:30 to 5:30 pm

# Out-of-sample window: bars 453..507 feed the indicator calculations; the
# volume (column 5) and close (column 4) series use bars 486..507 only.
FirstOutsampleSet <- Santander[seq.int(453, 507), ]
FirstOutsampleSetVol <- Santander[seq.int(486, 507), 5]
FirstOutsampleSetCprice <- Santander[seq.int(486, 507), 4]

# Plain matrix copy of the close prices.
FirstOutsampleSetCpriceM <- as.matrix(FirstOutsampleSetCprice)

Calculation of technical Indicators

# Technical indicators for the out-of-sample window (columns of
# FirstOutsampleSet: 1 Open, 2 High, 3 Low, 4 Close, 5 Volume).
#
# TTR's ATR() and ADX() read their input positionally as High, Low, Close.
# Passing the whole OHLCV object made them use Open/High/Low instead, so they
# now receive columns 2:4, like SMI() and BBands() below.
# NOTE(review): the data the model was trained on must derive its indicators
# from the same columns, otherwise the features are not comparable.
ATRindicatorOS <- ATR(FirstOutsampleSet[, 2:4], n = 1)

ADXIndicatorOS <- ADX(FirstOutsampleSet[, 2:4], n = 2)

AroonIndicatorOS <- aroon(FirstOutsampleSet[, 2:3], n = 1)

SMIindicatorOS <- SMI(
  FirstOutsampleSet[, 2:4],
  n = 2, nFast = 2, nSlow = 2, nSig = 2,
  maType = SMA,
  bounded = TRUE
)

BBandIndicatorOS <- BBands(FirstOutsampleSet[, 2:4])

MACDindicatorOS <- MACD(FirstOutsampleSet[, 4])

# NOTE(review): column 5 is Volume, not a price - confirm this is intended.
rsiOS <- RSI(FirstOutsampleSet[, 5], n = 1, maType = "WMA")


OutSampleInput <- cbind(
  ATRindicatorOS, ADXIndicatorOS, AroonIndicatorOS, SMIindicatorOS,
  BBandIndicatorOS, MACDindicatorOS, rsiOS
)

OutSampleInputData <- data.frame(
  date = index(OutSampleInput), OutSampleInput, row.names = NULL
)

# Rows 34..55 of the 55-row window, i.e. bars 486..507 of `Santander`.
OutSampleInputDataM <- OutSampleInputData[34:55, ]

# Positional drop; eleven columns remain, matching the eleven indicator names
# assigned further down (TrueRange ... Rsi).
OutSampleInputDataM <- OutSampleInputDataM[, -c(1, 3, 5, 8, 12, 13, 15, 16, 17, 19)]

adding volume and matching limit orders variable

# Build the out-of-sample modelling table: the close price of each bar next to
# the one-bar-lagged values of volume, the retained indicators and the
# per-minute order count.
#
# Every predictor is lagged in a single pass instead of one copy-pasted
# extract/Lag pair per column.
feature_names <- c(
  "volume", "TrueRange", "TrueHigh", "PDirInd", "NDirInd", "ADX", "Aroonup",
  "AroonDown", "SMIsignal", "pctB", "MACDSignal", "Rsi", "MlimitOrders"
)

FirstOutsampleSetVolM <- data.frame(
  date = index(FirstOutsampleSetVol), FirstOutsampleSetVol, row.names = NULL
)

lagdataOFS <- cbind(FirstOutsampleSetVolM, OutSampleInputDataM, MordersM[486:507, ])

# Drop the leading date column; thirteen predictor columns remain.
lagdataOFS <- lagdataOFS[, -c(1)]
colnames(lagdataOFS) <- feature_names

# quantmod::Lag() shifts each series down by one row, so the first row is NA.
OutsamplelaggedData <- as.data.frame(
  lapply(lagdataOFS, function(feature) {
    as.numeric(Lag(as.numeric(feature), k = 1))
  })
)
colnames(OutsamplelaggedData) <- feature_names

lagInputDATAInOut <- cbind(FirstOutsampleSetCpriceM, OutsamplelaggedData)
colnames(lagInputDATAInOut) <- c("Cprice", feature_names)

head(lagInputDATAInOut)
##                     Cprice volume TrueRange TrueHigh  PDirInd   NDirInd
## 2015-01-09 17:09:59  4.803     NA        NA       NA       NA        NA
## 2015-01-09 17:10:59  4.805  17483     0.001    4.801 18.13819  2.592819
## 2015-01-09 17:11:53  4.801  91247     0.005    4.805 51.91585  1.177094
## 2015-01-09 17:12:58  4.804  82720     0.002    4.805 36.13261 31.220821
## 2015-01-09 17:13:59  4.806  47689     0.002    4.803 22.47008 19.415551
## 2015-01-09 17:14:58  4.803 101686     0.005    4.806 47.01660  6.716770
##                          ADX Aroonup AroonDown SMIsignal      pctB
## 2015-01-09 17:09:59       NA      NA        NA        NA        NA
## 2015-01-09 17:10:59 72.39325     100       100  78.93278 0.4663065
## 2015-01-09 17:11:53 83.97958     100         0  64.18919 0.5529743
## 2015-01-09 17:12:58 45.63607       0       100  27.00000 0.3952218
## 2015-01-09 17:13:59 26.46431     100       100 -13.21739 0.4624164
## 2015-01-09 17:14:58 50.73197     100         0 -19.38406 0.6912891
##                     MACDSignal Rsi MlimitOrders
## 2015-01-09 17:09:59         NA  NA           NA
## 2015-01-09 17:10:59 0.10572681   0           31
## 2015-01-09 17:11:53 0.09889191 100           50
## 2015-01-09 17:12:58 0.09200165   0           58
## 2015-01-09 17:13:59 0.08622499   0           45
## 2015-01-09 17:14:58 0.08191694 100          108

Out-of-sample set for validation

 # Score the out-of-sample table with the fitted model.
set.seed(100)
CubistPredict <- predict(CbModel, lagInputDATAInOut)

# predict() omits rows whose predictors contain NA (here the first, lagged-out
# row). Select the observed prices for exactly the rows that were scored
# instead of hard-coding 2:22, and fail loudly if the two ever disagree.
scored_rows <- complete.cases(lagInputDATAInOut[, -1, drop = FALSE])
outPrice <- lagInputDATAInOut[scored_rows, 1]
stopifnot(length(outPrice) == length(CubistPredict))

cor(outPrice, CubistPredict)
## [1] 0.9580667
library(forecast)
## Loading required package: timeDate
## This is forecast 5.7
accuracy(CubistPredict, outPrice, d = NULL)
##                  ME        RMSE         MAE        MPE       MAPE
## Test set 0.00100074 0.003368813 0.002930114 0.02074392 0.06080772

Buy Hold Sell signal generator

# Trade logic - follow the direction of the one-step log return:
#   log return > 0  -> "Long"
#   log return < 0  -> "Short"
#   log return = 0  -> no label (tradingfunc() returns NA, not "Hold")

# One-step log returns of the predicted prices ...
signal <- Delt(CubistPredict, k = 1, type = "log")

# ... and of the observed out-of-sample prices.
MarketReturn <- Delt(outPrice, k = 1, type = "log")


# Map returns to trade labels.
#
# x: numeric vector (or matrix) of returns.
# Returns "Long" where x > 0, "Short" where x < 0, and NA where x is zero
# or missing.
tradingfunc <- function(x) {
  short_or_none <- ifelse(x < 0, "Short", NA)
  ifelse(x > 0, "Long", short_or_none)
}

# Label every period for the model's signal and for the realised market move.
# tradingfunc() is vectorised, so the one-column return matrices are flattened
# and labelled in one call instead of row by row.
AlgoTRADE <- tradingfunc(as.numeric(signal))

MarketTRADE <- tradingfunc(as.numeric(MarketReturn))

Table <- table(AlgoTRADE, MarketTRADE)

# confusionMatrix() requires factors with identical levels; plain character
# vectors are rejected by current caret. Periods labelled NA (first period,
# zero returns) drop out of the cross-tabulation.
trade_levels <- c("Long", "Short")

confusionMatrix(
  factor(AlgoTRADE, levels = trade_levels),
  factor(MarketTRADE, levels = trade_levels)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Long Short
##      Long    11     3
##      Short    2     2
##                                           
##                Accuracy : 0.7222          
##                  95% CI : (0.4652, 0.9031)
##     No Information Rate : 0.7222          
##     P-Value [Acc > NIR] : 0.6175          
##                                           
##                   Kappa : 0.2623          
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.8462          
##             Specificity : 0.4000          
##          Pos Pred Value : 0.7857          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.7222          
##          Detection Rate : 0.6111          
##    Detection Prevalence : 0.7778          
##       Balanced Accuracy : 0.6231          
##                                           
##        'Positive' Class : Long            
##