Predict Stock Return with Cluster

Reproducible notes for Predict Stock Return with Cluster

Anil Kumar

Source File code Connect connect

PRELIMINARIES

Load the libraries required for this assignment:

library("tm")
library("SnowballC")

library("caTools")
library("rpart")
library("rpart.plot")
library("ROCR")
library("randomForest")
library("caret")

INTRODUCTION

In this analysis we first cluster the observations and then build cluster-specific prediction models.

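This dataset contains the following variables: ReturnJan through ReturnNov (numeric, one stock-return column for each month from January to November) and PositiveDec (integer 0/1, where 1 marks an observation with a positive return in December).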

Load Data

# Read the stock-return data from the working directory and inspect its
# structure (column names, types and the first few values).
stocks <- read.csv(file = "StocksCluster.csv")
str(object = stocks)
## 'data.frame':    11580 obs. of  12 variables:
##  $ ReturnJan  : num  0.0807 -0.0107 0.0477 -0.074 -0.031 ...
##  $ ReturnFeb  : num  0.0663 0.1021 0.036 -0.0482 -0.2127 ...
##  $ ReturnMar  : num  0.0329 0.1455 0.0397 0.0182 0.0915 ...
##  $ ReturnApr  : num  0.1831 -0.0844 -0.1624 -0.0247 0.1893 ...
##  $ ReturnMay  : num  0.13033 -0.3273 -0.14743 -0.00604 -0.15385 ...
##  $ ReturnJune : num  -0.0176 -0.3593 0.0486 -0.0253 -0.1061 ...
##  $ ReturnJuly : num  -0.0205 -0.0253 -0.1354 -0.094 0.3553 ...
##  $ ReturnAug  : num  0.0247 0.2113 0.0334 0.0953 0.0568 ...
##  $ ReturnSep  : num  -0.0204 -0.58 0 0.0567 0.0336 ...
##  $ ReturnOct  : num  -0.1733 -0.2671 0.0917 -0.0963 0.0363 ...
##  $ ReturnNov  : num  -0.0254 -0.1512 -0.0596 -0.0405 -0.0853 ...
##  $ PositiveDec: int  0 0 0 1 1 1 1 0 0 0 ...

How many observations have positive returns in December (PositiveDec = 1)?

# Count the observations at each value of PositiveDec (0 or 1).
table(stocks$PositiveDec)
## 
##    0    1 
## 5256 6324

What is the maximum correlation between any two return variables in the dataset? You should look at the pairwise correlations between ReturnJan, ReturnFeb, ReturnMar, ReturnApr, ReturnMay, ReturnJune, ReturnJuly, ReturnAug, ReturnSep, ReturnOct, and ReturnNov.

# Correlation matrix of every column in `stocks`. PositiveDec is included as
# well, so ignore its row and column (and the diagonal of 1s) when looking for
# the largest correlation between two return variables; in the output below
# that value is 0.19167279, between ReturnOct and ReturnNov.
cor(stocks)
##                ReturnJan   ReturnFeb    ReturnMar    ReturnApr
## ReturnJan    1.000000000  0.06677458 -0.090496798 -0.037678006
## ReturnFeb    0.066774583  1.00000000 -0.155983263 -0.191351924
## ReturnMar   -0.090496798 -0.15598326  1.000000000  0.009726288
## ReturnApr   -0.037678006 -0.19135192  0.009726288  1.000000000
## ReturnMay   -0.044411417 -0.09552092 -0.003892789  0.063822504
## ReturnJune   0.092238307  0.16999448 -0.085905486 -0.011027752
## ReturnJuly  -0.081429765 -0.06177851  0.003374160  0.080631932
## ReturnAug   -0.022792019  0.13155979 -0.022005400 -0.051756051
## ReturnSep   -0.026437153  0.04350177  0.076518327 -0.028920972
## ReturnOct    0.142977229 -0.08732427 -0.011923758  0.048540025
## ReturnNov    0.067632333 -0.15465828  0.037323535  0.031761837
## PositiveDec  0.004728518 -0.03817318  0.022408661  0.094353528
##                ReturnMay  ReturnJune    ReturnJuly     ReturnAug
## ReturnJan   -0.044411417  0.09223831 -0.0814297650 -0.0227920187
## ReturnFeb   -0.095520920  0.16999448 -0.0617785094  0.1315597863
## ReturnMar   -0.003892789 -0.08590549  0.0033741597 -0.0220053995
## ReturnApr    0.063822504 -0.01102775  0.0806319317 -0.0517560510
## ReturnMay    1.000000000 -0.02107454  0.0908502642 -0.0331256580
## ReturnJune  -0.021074539  1.00000000 -0.0291525996  0.0107105260
## ReturnJuly   0.090850264 -0.02915260  1.0000000000  0.0007137558
## ReturnAug   -0.033125658  0.01071053  0.0007137558  1.0000000000
## ReturnSep    0.021962862  0.04474727  0.0689478037  0.0007407139
## ReturnOct    0.017166728 -0.02263599 -0.0547089088 -0.0755945614
## ReturnNov    0.048046590 -0.06527054 -0.0483738369 -0.1164890345
## PositiveDec  0.058201934  0.02340975  0.0743642097  0.0041669657
##                 ReturnSep   ReturnOct   ReturnNov  PositiveDec
## ReturnJan   -0.0264371526  0.14297723  0.06763233  0.004728518
## ReturnFeb    0.0435017706 -0.08732427 -0.15465828 -0.038173184
## ReturnMar    0.0765183267 -0.01192376  0.03732353  0.022408661
## ReturnApr   -0.0289209718  0.04854003  0.03176184  0.094353528
## ReturnMay    0.0219628623  0.01716673  0.04804659  0.058201934
## ReturnJune   0.0447472692 -0.02263599 -0.06527054  0.023409745
## ReturnJuly   0.0689478037 -0.05470891 -0.04837384  0.074364210
## ReturnAug    0.0007407139 -0.07559456 -0.11648903  0.004166966
## ReturnSep    1.0000000000 -0.05807924 -0.01971980  0.041630286
## ReturnOct   -0.0580792362  1.00000000  0.19167279 -0.052574956
## ReturnNov   -0.0197197998  0.19167279  1.00000000 -0.062346556
## PositiveDec  0.0416302863 -0.05257496 -0.06234656  1.000000000

INITIAL LOGISTIC REGRESSION MODEL

# Fix the seed so the random 70/30 train/test split is reproducible.
set.seed(144)
spl <- sample.split(stocks$PositiveDec, SplitRatio = 0.7)
stocksTrain <- subset(stocks, spl == TRUE)
stocksTest <- subset(stocks, spl == FALSE)
# Logistic regression of PositiveDec on all remaining columns.
Model <- glm(PositiveDec ~ ., data = stocksTrain, family = binomial)
# Training-set confusion matrix at a 0.5 probability threshold
# (rows: actual PositiveDec, columns: predicted probability > 0.5).
PredictTrain <- predict(Model, type = "response")
table(stocksTrain$PositiveDec, PredictTrain > 0.5)
##    
##     FALSE TRUE
##   0   990 2689
##   1   787 3640
# Accuracy = share of observations whose thresholded prediction matches the
# outcome. Computed from the data instead of retyping the counts above, so it
# stays correct if the seed, split ratio or data change.
mean((PredictTrain > 0.5) == stocksTrain$PositiveDec)
## [1] 0.5711818
# Same evaluation on the held-out test set.
PredictTest <- predict(Model, newdata = stocksTest, type = "response")
table(stocksTest$PositiveDec, PredictTest > 0.5)
##    
##     FALSE TRUE
##   0   417 1160
##   1   344 1553
mean((PredictTest > 0.5) == stocksTest$PositiveDec)
## [1] 0.5670697

Accuracy of the baseline model

What is the test-set accuracy of a baseline model that always predicts the most common outcome (PositiveDec = 1)?

# Distribution of the outcome in the test set.
table(stocksTest$PositiveDec)
## 
##    0    1 
## 1577 1897
# Accuracy of always predicting PositiveDec = 1: the share of test
# observations whose outcome is 1. Computed from the data rather than from
# the counts typed in by hand.
mean(stocksTest$PositiveDec == 1)
## [1] 0.5460564

CLUSTERING STOCKS

# Drop the outcome column so that clustering only sees the return variables.
limitedTrain <- stocksTrain[names(stocksTrain) != "PositiveDec"]
limitedTest <- stocksTest[names(stocksTest) != "PositiveDec"]
library(caret)
# Fit the preprocessing on the training set only, then apply that same
# transformation to both the training and the test set.
preproc <- preProcess(limitedTrain)
normTrain <- predict(preproc, limitedTrain)
normTest <- predict(preproc, limitedTest)
# Compare the mean of ReturnJan in the two transformed sets.
mean(normTrain$ReturnJan)
## [1] 2.100586e-17
mean(normTest$ReturnJan)
## [1] -0.0004185886

K-means clustering

# Seed the random number generator so the k-means result is reproducible.
set.seed(144)
# Three clusters on the normalised training data.
km <- kmeans(x = normTrain, centers = 3)
library(flexclust)
# Convert the k-means fit to a kcca object so that cluster membership can be
# predicted for new (test) observations as well.
km.kcca <- as.kcca(km, normTrain)
clusterTrain <- predict(km.kcca)
clusterTest <- predict(km.kcca, newdata = normTest)

Subset the training and test sets by cluster and fit a logistic regression model on each cluster

# Training observations, one data frame per cluster.
stocksTrain1 <- subset(stocksTrain, subset = clusterTrain == 1)
stocksTrain2 <- subset(stocksTrain, subset = clusterTrain == 2)
stocksTrain3 <- subset(stocksTrain, subset = clusterTrain == 3)
# Test observations, split by their predicted cluster.
stocksTest1 <- subset(stocksTest, subset = clusterTest == 1)
stocksTest2 <- subset(stocksTest, subset = clusterTest == 2)
stocksTest3 <- subset(stocksTest, subset = clusterTest == 3)
# One logistic regression per cluster. The calls are written out one by one
# (not looped) because the call is echoed in each model's summary() output.
StocksModel1 <- glm(PositiveDec ~ ., data = stocksTrain1, family = binomial)
StocksModel2 <- glm(PositiveDec ~ ., data = stocksTrain2, family = binomial)
StocksModel3 <- glm(PositiveDec ~ ., data = stocksTrain3, family = binomial)

Model summaries and test-set predictions for each cluster

# Coefficients and fit statistics of the cluster 1 logistic regression.
summary(StocksModel1)
## 
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7307  -1.2910   0.8878   1.0280   1.5023  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.17224    0.06302   2.733  0.00628 ** 
## ReturnJan    0.02498    0.29306   0.085  0.93206    
## ReturnFeb   -0.37207    0.29123  -1.278  0.20139    
## ReturnMar    0.59555    0.23325   2.553  0.01067 *  
## ReturnApr    1.19048    0.22439   5.305 1.12e-07 ***
## ReturnMay    0.30421    0.22845   1.332  0.18298    
## ReturnJune  -0.01165    0.29993  -0.039  0.96901    
## ReturnJuly   0.19769    0.27790   0.711  0.47685    
## ReturnAug    0.51273    0.30858   1.662  0.09660 .  
## ReturnSep    0.58833    0.28133   2.091  0.03651 *  
## ReturnOct   -1.02254    0.26007  -3.932 8.43e-05 ***
## ReturnNov   -0.74847    0.28280  -2.647  0.00813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4243.0  on 3156  degrees of freedom
## Residual deviance: 4172.9  on 3145  degrees of freedom
## AIC: 4196.9
## 
## Number of Fisher Scoring iterations: 4
# Coefficients and fit statistics of the cluster 2 logistic regression.
summary(StocksModel2)
## 
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2012  -1.1941   0.8583   1.1334   1.9424  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.10293    0.03785   2.719 0.006540 ** 
## ReturnJan    0.88451    0.20276   4.362 1.29e-05 ***
## ReturnFeb    0.31762    0.26624   1.193 0.232878    
## ReturnMar   -0.37978    0.24045  -1.579 0.114231    
## ReturnApr    0.49291    0.22460   2.195 0.028189 *  
## ReturnMay    0.89655    0.25492   3.517 0.000436 ***
## ReturnJune   1.50088    0.26014   5.770 7.95e-09 ***
## ReturnJuly   0.78315    0.26864   2.915 0.003554 ** 
## ReturnAug   -0.24486    0.27080  -0.904 0.365876    
## ReturnSep    0.73685    0.24820   2.969 0.002989 ** 
## ReturnOct   -0.27756    0.18400  -1.509 0.131419    
## ReturnNov   -0.78747    0.22458  -3.506 0.000454 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6506.3  on 4695  degrees of freedom
## Residual deviance: 6362.2  on 4684  degrees of freedom
## AIC: 6386.2
## 
## Number of Fisher Scoring iterations: 4
# Coefficients and fit statistics of the cluster 3 logistic regression.
summary(StocksModel3)
## 
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9146  -1.0393  -0.7689   1.1921   1.6939  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.181896   0.325182  -0.559   0.5759  
## ReturnJan   -0.009789   0.448943  -0.022   0.9826  
## ReturnFeb   -0.046883   0.213432  -0.220   0.8261  
## ReturnMar    0.674179   0.564790   1.194   0.2326  
## ReturnApr    1.281466   0.602672   2.126   0.0335 *
## ReturnMay    0.762512   0.647783   1.177   0.2392  
## ReturnJune   0.329434   0.408038   0.807   0.4195  
## ReturnJuly   0.774164   0.729360   1.061   0.2885  
## ReturnAug    0.982605   0.533158   1.843   0.0653 .
## ReturnSep    0.363807   0.627774   0.580   0.5622  
## ReturnOct    0.782242   0.733123   1.067   0.2860  
## ReturnNov   -0.873752   0.738480  -1.183   0.2367  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 346.92  on 252  degrees of freedom
## Residual deviance: 328.29  on 241  degrees of freedom
## AIC: 352.29
## 
## Number of Fisher Scoring iterations: 4
# Predicted probabilities on the test set, each cluster scored by the model
# fitted on the matching training cluster.
PredictTest1 <- predict(StocksModel1, type = "response", newdata = stocksTest1)
PredictTest2 <- predict(StocksModel2, type = "response", newdata = stocksTest2)
PredictTest3 <- predict(StocksModel3, type = "response", newdata = stocksTest3)

And the classification matrices can be computed with:

# Test-set confusion matrix for each cluster at a 0.5 threshold
# (rows: actual PositiveDec, columns: predicted probability > 0.5).
# Cluster 1:
table(stocksTest1$PositiveDec, PredictTest1 > 0.5)
##    
##     FALSE TRUE
##   0    30  471
##   1    23  774
# Cluster 2:
table(stocksTest2$PositiveDec, PredictTest2 > 0.5)
##    
##     FALSE TRUE
##   0   388  626
##   1   309  757
# Cluster 3:
table(stocksTest3$PositiveDec, PredictTest3 > 0.5)
##    
##     FALSE TRUE
##   0    49   13
##   1    21   13

CLUSTER-SPECIFIC PREDICTIONS

# Pool the per-cluster test predictions and the matching outcomes; both
# vectors are concatenated in the same cluster order so they stay aligned.
AllPredictions <- c(PredictTest1, PredictTest2, PredictTest3)
AllOutcomes <- c(stocksTest1$PositiveDec, stocksTest2$PositiveDec, stocksTest3$PositiveDec)
# Overall confusion matrix at a 0.5 threshold.
table(AllOutcomes, AllPredictions > 0.5)
##            
## AllOutcomes FALSE TRUE
##           0   467 1110
##           1   353 1544
# Overall test-set accuracy of the cluster-then-predict approach, computed
# from the pooled vectors instead of retyping the counts above.
mean((AllPredictions > 0.5) == AllOutcomes)
## [1] 0.5788716