Exploratory Data Analysis

Load data

stocks <- read.csv("StocksCluster.csv")
str(stocks)
## 'data.frame':    11580 obs. of  12 variables:
##  $ ReturnJan  : num  0.0807 -0.0107 0.0477 -0.074 -0.031 ...
##  $ ReturnFeb  : num  0.0663 0.1021 0.036 -0.0482 -0.2127 ...
##  $ ReturnMar  : num  0.0329 0.1455 0.0397 0.0182 0.0915 ...
##  $ ReturnApr  : num  0.1831 -0.0844 -0.1624 -0.0247 0.1893 ...
##  $ ReturnMay  : num  0.13033 -0.3273 -0.14743 -0.00604 -0.15385 ...
##  $ ReturnJune : num  -0.0176 -0.3593 0.0486 -0.0253 -0.1061 ...
##  $ ReturnJuly : num  -0.0205 -0.0253 -0.1354 -0.094 0.3553 ...
##  $ ReturnAug  : num  0.0247 0.2113 0.0334 0.0953 0.0568 ...
##  $ ReturnSep  : num  -0.0204 -0.58 0 0.0567 0.0336 ...
##  $ ReturnOct  : num  -0.1733 -0.2671 0.0917 -0.0963 0.0363 ...
##  $ ReturnNov  : num  -0.0254 -0.1512 -0.0596 -0.0405 -0.0853 ...
##  $ PositiveDec: int  0 0 0 1 1 1 1 0 0 0 ...

Proportion of stocks with a positive December return (PositiveDec)

sum(stocks$PositiveDec) / nrow(stocks)
## [1] 0.546114
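Since PositiveDec is a 0/1 indicator, mean() gives the same proportion directly; a quick equivalent, not in the original code:

mean(stocks$PositiveDec)  # should print the same 0.546114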

Maximum pairwise correlation among the monthly returns, January through November

cm <- cor(stocks[1:11])
diag(cm) <- -1  # mask the diagonal so self-correlations (all 1) are ignored
max(cm)
## [1] 0.1916728
remove(cm)
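To identify which pair of months attains that maximum, the position can be looked up in the correlation matrix; a minimal sketch, with the arr.ind lookup added here:

cm <- cor(stocks[1:11])
diag(cm) <- -1                       # mask self-correlations again
which(cm == max(cm), arr.ind=TRUE)   # row/column indices name the month pair
remove(cm)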

Months with largest and smallest mean returns

cm <- colMeans(stocks[1:11])  # mean return for each month
which.max(cm)
## ReturnApr 
##         4
which.min(cm)
## ReturnSep 
##         9
remove(cm)
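For context, the full ordering of mean monthly returns is easy to inspect; a small addition:

sort(colMeans(stocks[1:11]))  # ascending, so ReturnSep should come first and ReturnApr last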

Logistic Regression Analysis

Split the data into training and test sets, then fit a logistic regression model

library(caTools)
set.seed(144)
spl <- sample.split(stocks$PositiveDec, SplitRatio=0.7)
training <- subset(stocks, spl==TRUE)
test <- subset(stocks, spl==FALSE)
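sample.split() stratifies on the outcome variable, so both sets should keep roughly the original 54.6% positive rate; a quick check, added here:

mean(training$PositiveDec)  # expected to be close to 0.546
mean(test$PositiveDec)      # likewise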

Train the logistic regression model and compute its accuracy on the training set using a threshold of 0.5:

stocksModel <- glm(PositiveDec ~ ., family=binomial, data=training)
predictLR <- predict(stocksModel, type="response")
(conf.mat <- table(training$PositiveDec, predictLR > 0.5))
##    
##     FALSE TRUE
##   0   990 2689
##   1   787 3640
sum(diag(conf.mat)) / nrow(training)
## [1] 0.5711818
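The threshold-then-tabulate pattern recurs several times below, so it could be wrapped in a helper; a hypothetical convenience function, not part of the original analysis:

# hypothetical helper: accuracy of probability predictions at a cutoff
# (assumes both predicted classes occur, so the table is 2x2)
thresholdAccuracy <- function(actual, prob, cutoff=0.5) {
  conf.mat <- table(actual, prob > cutoff)
  sum(diag(conf.mat)) / sum(conf.mat)
}
thresholdAccuracy(training$PositiveDec, predictLR)  # should reproduce 0.5711818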

Overall accuracy on test set

predictLR <- predict(stocksModel, newdata=test, type="response")
(conf.mat <- table(test$PositiveDec, predictLR > 0.5))
##    
##     FALSE TRUE
##   0   417 1160
##   1   344 1553
sum(diag(conf.mat)) / nrow(test)
## [1] 0.5670697

Accuracy of the baseline model on the test set (the baseline always predicts the most common training-set outcome, PositiveDec = 1)

table(training$PositiveDec)  # 1 is the majority class in the training set
## 
##    0    1 
## 3679 4427
(t <- table(test$PositiveDec))
## 
##    0    1 
## 1577 1897
t[2] / sum(t)  # baseline accuracy: fraction of test stocks with PositiveDec == 1
##         1 
## 0.5460564

Clustering stocks

Remove the dependent variable from the data, because the goal is to use clustering to “discover” it rather than cluster on it directly

limitedTrain <- training
limitedTrain$PositiveDec <- NULL
limitedTest <- test
limitedTest$PositiveDec <- NULL

Normalize the monthly returns using the mean and standard deviation of the training set

library(caret)
preproc <- preProcess(limitedTrain)        # defaults to centering and scaling
normTrain <- predict(preproc, limitedTrain)
normTest <- predict(preproc, limitedTest)  # scaled with training-set statistics
mean(normTrain$ReturnJan)  # essentially zero after centering on the training set
## [1] 2.100586e-17
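Because the test set is scaled with the training set's statistics, its normalized columns are not expected to have mean exactly zero; a quick check, added here:

mean(normTest$ReturnJan)  # near zero, but not exactly zero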

Perform k-means clustering with 3 clusters

set.seed(144)
km <- kmeans(normTrain, centers=3, iter.max=1000)
km$size  # number of observations in each cluster
## [1] 3157 4696  253
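Inspecting the centroids helps interpret the three clusters; a small addition, rounded for readability:

round(km$centers, 2)  # one row of average normalized monthly returns per cluster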

Obtain training set and testing set cluster assignments

library(flexclust)
km.kcca <- as.kcca(km, normTrain)                  # convert so flexclust can predict on new data
clusterTrain <- predict(km.kcca)                   # training-set cluster assignments
clusterTest <- predict(km.kcca, newdata=normTest)  # test-set cluster assignments
sum(clusterTest == 2)
## [1] 2080
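As a sanity check, the flexclust assignments on the training data should agree with the original k-means cluster sizes if the centroid conversion was faithful; an added check:

table(clusterTrain)  # expected to match km$size: 3157 4696 253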

Build stocksTrain[[1]] through stocksTrain[[3]] by splitting the training set on clusterTrain, and likewise stocksTest[[1]] through stocksTest[[3]] from clusterTest

stocksTrain <- split(training, clusterTrain)
stocksTest <- split(test, clusterTest)

Which training cluster has the highest average value of the dependent variable?

sapply(stocksTrain, function(s) { mean(s$PositiveDec) })
##         1         2         3 
## 0.6024707 0.5140545 0.4387352
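Cluster 1 is the most bullish, with 60.2% of its stocks posting a positive December return, while cluster 3 is the lowest at 43.9%.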

Fit logistic regression models 1, 2, and 3, one per cluster, using stocksTrain[[1]] through stocksTrain[[3]]

stocksModels <- lapply(stocksTrain, function(s) {
  glm(PositiveDec ~ ., family=binomial, data=s)
})

Which variables have a positive coefficient in at least one model and a negative coefficient in at least one other?

sapply(stocksModels, coef)
##                       1          2            3
## (Intercept)  0.17223985  0.1029318 -0.181895809
## ReturnJan    0.02498357  0.8845148 -0.009789345
## ReturnFeb   -0.37207369  0.3176221 -0.046883260
## ReturnMar    0.59554957 -0.3797811  0.674179495
## ReturnApr    1.19047752  0.4929105  1.281466189
## ReturnMay    0.30420906  0.8965492  0.762511555
## ReturnJune  -0.01165375  1.5008787  0.329433917
## ReturnJuly   0.19769226  0.7831487  0.774164370
## ReturnAug    0.51272941 -0.2448602  0.982605385
## ReturnSep    0.58832685  0.7368522  0.363806823
## ReturnOct   -1.02253506 -0.2775631  0.782242086
## ReturnNov   -0.74847186 -0.7874737 -0.873752144
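The sign question can also be answered programmatically instead of by eye; a minimal sketch (this scan is an addition to the original analysis):

coefs <- sapply(stocksModels, coef)   # 12 x 3 matrix of coefficients
flips <- apply(sign(coefs), 1, function(r) any(r > 0) && any(r < 0))
names(flips)[flips]                   # variables with both signs across the models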

Get the accuracy of stocksModels[[i]] on the matching test-set cluster stocksTest[[i]]

predictions <- sapply(1:3, function(i) {
  p <- predict(stocksModels[[i]], newdata=stocksTest[[i]], type="response")
  conf.mat <- table(stocksTest[[i]]$PositiveDec, p > 0.5)  # cluster-specific confusion matrix
  accuracy <- sum(diag(conf.mat)) / sum(conf.mat)
  list(predict=p, accuracy=accuracy)
})

predictions
##          [,1]         [,2]         [,3]      
## predict  Numeric,1298 Numeric,2080 Numeric,96
## accuracy 0.6194145    0.5504808    0.6458333

Compute overall test-set accuracy of the cluster-then-predict approach

allPredictions <- unlist(predictions[1,])  # stack the three per-cluster prediction vectors
allOutcomes <- unlist(lapply(stocksTest, function(s) s$PositiveDec))  # outcomes in matching order

(conf.mat <- table(allOutcomes, allPredictions > 0.5))
##            
## allOutcomes FALSE TRUE
##           0   467 1110
##           1   353 1544
sum(diag(conf.mat)) / sum(conf.mat)
## [1] 0.5788716
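The cluster-then-predict approach raises overall test-set accuracy to 0.5788716, compared with 0.5670697 for the single pooled logistic regression model and 0.5460564 for the baseline.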