Predict Stock Return with Cluster

Reproducible notes for predicting stock returns with clustering

Anil Kumar

Source File code Connect connect

PRELIMINARIES

Load the libraries that are required for the assignment:

library("tm")
library("SnowballC")

library("caTools")
library("rpart")
library("rpart.plot")
library("ROCR")
library("randomForest")
library("caret")

INTRODUCTION

We first cluster the observations and then build cluster-specific prediction models to predict whether a stock has a positive return in December.

This dataset contains the following variables:

ReturnJan = the return for the company's stock during January (in the year of the observation).
ReturnFeb = the return for the company's stock during February (in the year of the observation).
ReturnMar = the return for the company's stock during March (in the year of the observation).
ReturnApr = the return for the company's stock during April (in the year of the observation).
ReturnMay = the return for the company's stock during May (in the year of the observation).
ReturnJune = the return for the company's stock during June (in the year of the observation).
ReturnJuly = the return for the company's stock during July (in the year of the observation).
ReturnAug = the return for the company's stock during August (in the year of the observation).
ReturnSep = the return for the company's stock during September (in the year of the observation).
ReturnOct = the return for the company's stock during October (in the year of the observation).
ReturnNov = the return for the company's stock during November (in the year of the observation).
PositiveDec = whether or not the company's stock had a positive return in December (in the year of the observation). This variable takes value 1 if the return was positive, and value 0 if the return was not positive.

Load Data

# Read the monthly stock-return data and inspect the column types.
stocks <- read.csv(file = "StocksCluster.csv")
str(stocks)

## 'data.frame':    11580 obs. of  12 variables:
##  $ ReturnJan  : num  0.0807 -0.0107 0.0477 -0.074 -0.031 ...
##  $ ReturnFeb  : num  0.0663 0.1021 0.036 -0.0482 -0.2127 ...
##  $ ReturnMar  : num  0.0329 0.1455 0.0397 0.0182 0.0915 ...
##  $ ReturnApr  : num  0.1831 -0.0844 -0.1624 -0.0247 0.1893 ...
##  $ ReturnMay  : num  0.13033 -0.3273 -0.14743 -0.00604 -0.15385 ...
##  $ ReturnJune : num  -0.0176 -0.3593 0.0486 -0.0253 -0.1061 ...
##  $ ReturnJuly : num  -0.0205 -0.0253 -0.1354 -0.094 0.3553 ...
##  $ ReturnAug  : num  0.0247 0.2113 0.0334 0.0953 0.0568 ...
##  $ ReturnSep  : num  -0.0204 -0.58 0 0.0567 0.0336 ...
##  $ ReturnOct  : num  -0.1733 -0.2671 0.0917 -0.0963 0.0363 ...
##  $ ReturnNov  : num  -0.0254 -0.1512 -0.0596 -0.0405 -0.0853 ...
##  $ PositiveDec: int  0 0 0 1 1 1 1 0 0 0 ...

How many observations have positive returns in December?

# Count observations by December outcome (0 = not positive, 1 = positive).
table(stocks[["PositiveDec"]])

## 
##    0    1 
## 5256 6324

What is the maximum correlation between any two return variables in the dataset? You should look at the pairwise correlations between ReturnJan, ReturnFeb, ReturnMar, ReturnApr, ReturnMay, ReturnJune, ReturnJuly, ReturnAug, ReturnSep, ReturnOct, and ReturnNov.

# Pairwise Pearson correlations across every column. The matrix also includes
# PositiveDec, so only the Return* rows/columns answer the question above
# (off-diagonal entries; the diagonal is always 1).
cor(stocks, method = "pearson")

##                ReturnJan   ReturnFeb    ReturnMar    ReturnApr
## ReturnJan    1.000000000  0.06677458 -0.090496798 -0.037678006
## ReturnFeb    0.066774583  1.00000000 -0.155983263 -0.191351924
## ReturnMar   -0.090496798 -0.15598326  1.000000000  0.009726288
## ReturnApr   -0.037678006 -0.19135192  0.009726288  1.000000000
## ReturnMay   -0.044411417 -0.09552092 -0.003892789  0.063822504
## ReturnJune   0.092238307  0.16999448 -0.085905486 -0.011027752
## ReturnJuly  -0.081429765 -0.06177851  0.003374160  0.080631932
## ReturnAug   -0.022792019  0.13155979 -0.022005400 -0.051756051
## ReturnSep   -0.026437153  0.04350177  0.076518327 -0.028920972
## ReturnOct    0.142977229 -0.08732427 -0.011923758  0.048540025
## ReturnNov    0.067632333 -0.15465828  0.037323535  0.031761837
## PositiveDec  0.004728518 -0.03817318  0.022408661  0.094353528
##                ReturnMay  ReturnJune    ReturnJuly     ReturnAug
## ReturnJan   -0.044411417  0.09223831 -0.0814297650 -0.0227920187
## ReturnFeb   -0.095520920  0.16999448 -0.0617785094  0.1315597863
## ReturnMar   -0.003892789 -0.08590549  0.0033741597 -0.0220053995
## ReturnApr    0.063822504 -0.01102775  0.0806319317 -0.0517560510
## ReturnMay    1.000000000 -0.02107454  0.0908502642 -0.0331256580
## ReturnJune  -0.021074539  1.00000000 -0.0291525996  0.0107105260
## ReturnJuly   0.090850264 -0.02915260  1.0000000000  0.0007137558
## ReturnAug   -0.033125658  0.01071053  0.0007137558  1.0000000000
## ReturnSep    0.021962862  0.04474727  0.0689478037  0.0007407139
## ReturnOct    0.017166728 -0.02263599 -0.0547089088 -0.0755945614
## ReturnNov    0.048046590 -0.06527054 -0.0483738369 -0.1164890345
## PositiveDec  0.058201934  0.02340975  0.0743642097  0.0041669657
##                 ReturnSep   ReturnOct   ReturnNov  PositiveDec
## ReturnJan   -0.0264371526  0.14297723  0.06763233  0.004728518
## ReturnFeb    0.0435017706 -0.08732427 -0.15465828 -0.038173184
## ReturnMar    0.0765183267 -0.01192376  0.03732353  0.022408661
## ReturnApr   -0.0289209718  0.04854003  0.03176184  0.094353528
## ReturnMay    0.0219628623  0.01716673  0.04804659  0.058201934
## ReturnJune   0.0447472692 -0.02263599 -0.06527054  0.023409745
## ReturnJuly   0.0689478037 -0.05470891 -0.04837384  0.074364210
## ReturnAug    0.0007407139 -0.07559456 -0.11648903  0.004166966
## ReturnSep    1.0000000000 -0.05807924 -0.01971980  0.041630286
## ReturnOct   -0.0580792362  1.00000000  0.19167279 -0.052574956
## ReturnNov   -0.0197197998  0.19167279  1.00000000 -0.062346556
## PositiveDec  0.0416302863 -0.05257496 -0.06234656  1.000000000

INITIAL LOGISTIC REGRESSION MODEL

# Fix the seed so the 70/30 train/test split on the outcome is reproducible.
set.seed(144)
spl <- sample.split(stocks$PositiveDec, SplitRatio = 0.7)
stocksTrain <- subset(stocks, spl)
stocksTest <- subset(stocks, !spl)

# Logistic regression of the December outcome on all monthly returns.
Model <- glm(PositiveDec ~ ., family = binomial, data = stocksTrain)

# With no newdata, predict() returns fitted probabilities for the training rows.
PredictTrain <- predict(Model, type = "response")
table(stocksTrain$PositiveDec, PredictTrain > 0.5)

##    
##     FALSE TRUE
##   0   990 2689
##   1   787 3640

# Training-set accuracy at the 0.5 threshold, computed from the predictions
# rather than from hand-copied confusion-matrix counts so it cannot drift out
# of sync with the table above. The logical prediction is coerced to 0/1 for
# the comparison with the integer outcome.
mean((PredictTrain > 0.5) == stocksTrain$PositiveDec)

## [1] 0.5711818

# Score the held-out observations and cross-tabulate actual outcome against
# the thresholded prediction.
PredictTest <- predict(Model, type = "response", newdata = stocksTest)
table(stocksTest$PositiveDec, PredictTest > 0.5)

##    
##     FALSE TRUE
##   0   417 1160
##   1   344 1553

# Test-set accuracy at the 0.5 threshold, computed from the predictions
# rather than from hand-copied confusion-matrix counts.
mean((PredictTest > 0.5) == stocksTest$PositiveDec)

## [1] 0.5670697

Baseline model accuracy

What is the test-set accuracy of a baseline model that always predicts the most common outcome (PositiveDec = 1)?

# Distribution of the December outcome in the test set.
table(stocksTest[["PositiveDec"]])

## 
##    0    1 
## 1577 1897

# Accuracy of always predicting PositiveDec = 1 on the test set: the share of
# test observations whose outcome is 1, computed from the data instead of
# hand-copied counts.
mean(stocksTest$PositiveDec == 1)

## [1] 0.5460564

CLUSTERING STOCKS

# Clustering must not see the outcome, so keep every column except PositiveDec.
limitedTrain <- stocksTrain[setdiff(names(stocksTrain), "PositiveDec")]
limitedTest <- stocksTest[setdiff(names(stocksTest), "PositiveDec")]

library(caret)
# Learn the normalisation from the training data only, then apply the same
# transformation to both the training and the test predictors.
preproc <- preProcess(limitedTrain)
normTrain <- predict(preproc, newdata = limitedTrain)
normTest <- predict(preproc, newdata = limitedTest)
mean(normTrain[["ReturnJan"]])

## [1] 2.100586e-17

# The test set was transformed with the training-set preprocessing object, so
# its mean need not be exactly zero.
mean(normTest[["ReturnJan"]])

## [1] -0.0004185886

K-means clustering

# Seed the random start so the three-cluster k-means solution is reproducible.
set.seed(144)
km <- kmeans(normTrain, centers = 3)

# Convert the kmeans result to a kcca object so predict() can assign clusters
# to both the training data and the new (test) observations.
library(flexclust)
km.kcca <- as.kcca(km, normTrain)
clusterTrain <- predict(km.kcca)
clusterTest <- predict(km.kcca, newdata = normTest)

Subset the training and test sets by cluster and fit one logistic regression model per cluster

# Split the original (un-normalised) training and test data by assigned cluster.
stocksTrain1 <- subset(stocksTrain, clusterTrain == 1)
stocksTrain2 <- subset(stocksTrain, clusterTrain == 2)
stocksTrain3 <- subset(stocksTrain, clusterTrain == 3)
stocksTest1 <- subset(stocksTest, clusterTest == 1)
stocksTest2 <- subset(stocksTest, clusterTest == 2)
stocksTest3 <- subset(stocksTest, clusterTest == 3)

# One logistic regression per cluster, each using all monthly returns.
StocksModel1 <- glm(PositiveDec ~ ., family = binomial, data = stocksTrain1)
StocksModel2 <- glm(PositiveDec ~ ., family = binomial, data = stocksTrain2)
StocksModel3 <- glm(PositiveDec ~ ., family = binomial, data = stocksTrain3)

Model summaries

# Coefficients and fit statistics for the cluster 1 model.
summary(object = StocksModel1)

## 
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7307  -1.2910   0.8878   1.0280   1.5023  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.17224    0.06302   2.733  0.00628 ** 
## ReturnJan    0.02498    0.29306   0.085  0.93206    
## ReturnFeb   -0.37207    0.29123  -1.278  0.20139    
## ReturnMar    0.59555    0.23325   2.553  0.01067 *  
## ReturnApr    1.19048    0.22439   5.305 1.12e-07 ***
## ReturnMay    0.30421    0.22845   1.332  0.18298    
## ReturnJune  -0.01165    0.29993  -0.039  0.96901    
## ReturnJuly   0.19769    0.27790   0.711  0.47685    
## ReturnAug    0.51273    0.30858   1.662  0.09660 .  
## ReturnSep    0.58833    0.28133   2.091  0.03651 *  
## ReturnOct   -1.02254    0.26007  -3.932 8.43e-05 ***
## ReturnNov   -0.74847    0.28280  -2.647  0.00813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4243.0  on 3156  degrees of freedom
## Residual deviance: 4172.9  on 3145  degrees of freedom
## AIC: 4196.9
## 
## Number of Fisher Scoring iterations: 4

# Coefficients and fit statistics for the cluster 2 model.
summary(object = StocksModel2)

## 
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2012  -1.1941   0.8583   1.1334   1.9424  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.10293    0.03785   2.719 0.006540 ** 
## ReturnJan    0.88451    0.20276   4.362 1.29e-05 ***
## ReturnFeb    0.31762    0.26624   1.193 0.232878    
## ReturnMar   -0.37978    0.24045  -1.579 0.114231    
## ReturnApr    0.49291    0.22460   2.195 0.028189 *  
## ReturnMay    0.89655    0.25492   3.517 0.000436 ***
## ReturnJune   1.50088    0.26014   5.770 7.95e-09 ***
## ReturnJuly   0.78315    0.26864   2.915 0.003554 ** 
## ReturnAug   -0.24486    0.27080  -0.904 0.365876    
## ReturnSep    0.73685    0.24820   2.969 0.002989 ** 
## ReturnOct   -0.27756    0.18400  -1.509 0.131419    
## ReturnNov   -0.78747    0.22458  -3.506 0.000454 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6506.3  on 4695  degrees of freedom
## Residual deviance: 6362.2  on 4684  degrees of freedom
## AIC: 6386.2
## 
## Number of Fisher Scoring iterations: 4

# Coefficients and fit statistics for the cluster 3 model.
summary(object = StocksModel3)

## 
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9146  -1.0393  -0.7689   1.1921   1.6939  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.181896   0.325182  -0.559   0.5759  
## ReturnJan   -0.009789   0.448943  -0.022   0.9826  
## ReturnFeb   -0.046883   0.213432  -0.220   0.8261  
## ReturnMar    0.674179   0.564790   1.194   0.2326  
## ReturnApr    1.281466   0.602672   2.126   0.0335 *
## ReturnMay    0.762512   0.647783   1.177   0.2392  
## ReturnJune   0.329434   0.408038   0.807   0.4195  
## ReturnJuly   0.774164   0.729360   1.061   0.2885  
## ReturnAug    0.982605   0.533158   1.843   0.0653 .
## ReturnSep    0.363807   0.627774   0.580   0.5622  
## ReturnOct    0.782242   0.733123   1.067   0.2860  
## ReturnNov   -0.873752   0.738480  -1.183   0.2367  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 346.92  on 252  degrees of freedom
## Residual deviance: 328.29  on 241  degrees of freedom
## AIC: 352.29
## 
## Number of Fisher Scoring iterations: 4

# Predicted probabilities for each cluster's test observations, using the
# model fitted on the matching training cluster.
PredictTest1 <- predict(StocksModel1, type = "response", newdata = stocksTest1)
PredictTest2 <- predict(StocksModel2, type = "response", newdata = stocksTest2)
PredictTest3 <- predict(StocksModel3, type = "response", newdata = stocksTest3)

And the classification matrices can be computed with:

# Cluster 1: actual outcome (rows) vs. prediction above 0.5 (columns).
table(stocksTest1[["PositiveDec"]], PredictTest1 > 0.5)

##    
##     FALSE TRUE
##   0    30  471
##   1    23  774

# Cluster 2: actual outcome (rows) vs. prediction above 0.5 (columns).
table(stocksTest2[["PositiveDec"]], PredictTest2 > 0.5)

##    
##     FALSE TRUE
##   0   388  626
##   1   309  757

# Cluster 3: actual outcome (rows) vs. prediction above 0.5 (columns).
table(stocksTest3[["PositiveDec"]], PredictTest3 > 0.5)

##    
##     FALSE TRUE
##   0    49   13
##   1    21   13

CLUSTER-SPECIFIC PREDICTIONS

# Pool the per-cluster predictions and outcomes in the same cluster order so
# the two vectors stay aligned element by element.
AllPredictions <- c(PredictTest1, PredictTest2, PredictTest3)
AllOutcomes <- c(
  stocksTest1$PositiveDec,
  stocksTest2$PositiveDec,
  stocksTest3$PositiveDec
)
table(AllOutcomes, AllPredictions > 0.5)

##            
## AllOutcomes FALSE TRUE
##           0   467 1110
##           1   353 1544

# Overall test-set accuracy of the cluster-then-predict approach at the 0.5
# threshold, computed from the pooled vectors rather than hand-copied counts.
mean((AllPredictions > 0.5) == AllOutcomes)

## [1] 0.5788716