Load data
stocks <- read.csv("StocksCluster.csv")
str(stocks)
## 'data.frame': 11580 obs. of 12 variables:
## $ ReturnJan : num 0.0807 -0.0107 0.0477 -0.074 -0.031 ...
## $ ReturnFeb : num 0.0663 0.1021 0.036 -0.0482 -0.2127 ...
## $ ReturnMar : num 0.0329 0.1455 0.0397 0.0182 0.0915 ...
## $ ReturnApr : num 0.1831 -0.0844 -0.1624 -0.0247 0.1893 ...
## $ ReturnMay : num 0.13033 -0.3273 -0.14743 -0.00604 -0.15385 ...
## $ ReturnJune : num -0.0176 -0.3593 0.0486 -0.0253 -0.1061 ...
## $ ReturnJuly : num -0.0205 -0.0253 -0.1354 -0.094 0.3553 ...
## $ ReturnAug : num 0.0247 0.2113 0.0334 0.0953 0.0568 ...
## $ ReturnSep : num -0.0204 -0.58 0 0.0567 0.0336 ...
## $ ReturnOct : num -0.1733 -0.2671 0.0917 -0.0963 0.0363 ...
## $ ReturnNov : num -0.0254 -0.1512 -0.0596 -0.0405 -0.0853 ...
## $ PositiveDec: int 0 0 0 1 1 1 1 0 0 0 ...
Proportion of stocks with a positive December return (PositiveDec)
sum(stocks$PositiveDec) / nrow(stocks)
## [1] 0.546114
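Since PositiveDec is a 0/1 indicator, an equivalent one-liner is simply its mean:
mean(stocks$PositiveDec)  # same value, 0.546114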
Maximum correlation between any pair of months from January to November
cm <- cor(stocks[1:11])
diag(cm) <- -1  # mask the self-correlations (all 1) so they don't dominate the max
max(cm)
## [1] 0.1916728
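To see which pair of months attains this maximum, a quick sketch (run before cm is removed):
which(cm == max(cm), arr.ind=TRUE)  # the pair appears twice, since cm is symmetric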
remove(cm)
Months with largest and smallest mean returns
cm <- colMeans(stocks[1:11])
(which.max(cm))
## ReturnApr
## 4
(which.min(cm))
## ReturnSep
## 9
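For the full ranking rather than just the two extremes:
round(sort(cm), 4)  # mean returns from smallest (ReturnSep) to largest (ReturnApr)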
remove(cm)
Split into training and test sets, then fit a logistic regression model
library(caTools)
set.seed(144)
spl <- sample.split(stocks$PositiveDec, SplitRatio=0.7)
training <- subset(stocks, spl==TRUE)
test <- subset(stocks, spl==FALSE)
Train the logistic regression model and compute its accuracy on the training set using a threshold of 0.5:
stocksModel <- glm(PositiveDec ~ ., family=binomial, data=training)
predictLR <- predict(stocksModel, type="response")  # fitted probabilities on the training set
(conf.mat <- table(training$PositiveDec, predictLR > 0.5))
##
## FALSE TRUE
## 0 990 2689
## 1 787 3640
sum(diag(conf.mat)) / nrow(training)
## [1] 0.5711818
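The same accuracy can be computed without building the confusion matrix:
mean((predictLR > 0.5) == training$PositiveDec)  # 0.5711818, as above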
Overall accuracy on test set
predictLR <- predict(stocksModel, newdata=test, type="response")
(conf.mat <- table(test$PositiveDec, predictLR > 0.5))
##
## FALSE TRUE
## 0 417 1160
## 1 344 1553
sum(diag(conf.mat)) / nrow(test)
## [1] 0.5670697
Accuracy on the test set of the baseline model, which always predicts the more common training-set outcome
(table(training$PositiveDec))
##
## 0 1
## 3679 4427
(t <- table(test$PositiveDec))
##
## 0 1
## 1577 1897
t[2] / sum(t)
## 1
## 0.5460564
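Since a positive December is the majority outcome in the training set, the baseline always predicts 1, so its test-set accuracy is just the proportion of ones:
mean(test$PositiveDec)  # 0.5460564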
Remove the dependent variable from the data: the goal is to use clustering to “discover” it, so it must not be available to the clustering algorithm
limitedTrain <- training
limitedTrain$PositiveDec <- NULL
limitedTest <- test
limitedTest$PositiveDec <- NULL
Normalize the monthly returns using the means and standard deviations of the training set
library(caret)
preproc <- preProcess(limitedTrain)  # defaults to centering and scaling each column
normTrain <- predict(preproc, limitedTrain)
normTest <- predict(preproc, limitedTest)
mean(normTrain$ReturnJan)
## [1] 2.100586e-17
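As a sanity check: the training set is centred and scaled exactly, while the test set, normalized with training-set parameters, is only approximately so:
sd(normTrain$ReturnJan)   # exactly 1 by construction
mean(normTest$ReturnJan)  # near, but not exactly, 0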
Perform k-means clustering with 3 clusters
set.seed(144)
km <- kmeans(normTrain, centers=3, iter.max=1000)
km$size
## [1] 3157 4696 253
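The cluster centres (in normalized units) show what distinguishes the three groups:
round(km$centers, 2)  # one row per cluster, one column per month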
Obtain the training-set and test-set cluster assignments
library(flexclust)
km.kcca <- as.kcca(km, normTrain)  # convert to a flexclust object so predict() can assign clusters
clusterTrain <- predict(km.kcca)
clusterTest <- predict(km.kcca, newdata=normTest)
sum(clusterTest == 2)
## [1] 2080
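flexclust is needed here only because stats::kmeans provides no predict() method. As a sketch, the same assignments can be reproduced by hand as the nearest cluster centre in squared Euclidean distance (nearestCenter is an illustrative helper, not a function from either package):
nearestCenter <- function(df, centers) {
  # squared Euclidean distance from every row of df to every centre
  d <- apply(centers, 1, function(ctr) rowSums(sweep(as.matrix(df), 2, ctr)^2))
  apply(d, 1, which.min)  # index of the closest centre for each row
}
# all(nearestCenter(normTest, km$centers) == clusterTest) should be TRUE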
Build stocksTrain[[1]]–[[3]] from the training data using clusterTrain, and stocksTest[[1]]–[[3]] from the test data using clusterTest
stocksTrain <- split(training, clusterTrain)
stocksTest <- split(test, clusterTest)
Which training cluster has the highest average value of the dependent variable?
sapply(stocksTrain, function(s) { mean(s$PositiveDec) })
## 1 2 3
## 0.6024707 0.5140545 0.4387352
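Cluster 1 is the most bullish, with about 60% of its stocks posting a positive December, while cluster 3 is the least at about 44%.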
Create logistic regression models 1, 2, and 3, one per cluster, from stocksTrain[[1]] through stocksTrain[[3]]
stocksModels <- lapply(stocksTrain, function(s) {
  glm(PositiveDec ~ ., family=binomial, data=s)
})
Which variables have a positive sign in at least one model and a negative sign in at least one other?
sapply(stocksModels, function(m) { m$coefficients })
## 1 2 3
## (Intercept) 0.17223985 0.1029318 -0.181895809
## ReturnJan 0.02498357 0.8845148 -0.009789345
## ReturnFeb -0.37207369 0.3176221 -0.046883260
## ReturnMar 0.59554957 -0.3797811 0.674179495
## ReturnApr 1.19047752 0.4929105 1.281466189
## ReturnMay 0.30420906 0.8965492 0.762511555
## ReturnJune -0.01165375 1.5008787 0.329433917
## ReturnJuly 0.19769226 0.7831487 0.774164370
## ReturnAug 0.51272941 -0.2448602 0.982605385
## ReturnSep 0.58832685 0.7368522 0.363806823
## ReturnOct -1.02253506 -0.2775631 0.782242086
## ReturnNov -0.74847186 -0.7874737 -0.873752144
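Rather than scanning the table by eye, the sign changes can be found programmatically:
signs <- sign(sapply(stocksModels, coef))
rownames(signs)[apply(signs, 1, function(r) any(r > 0) && any(r < 0))]  # mixed-sign variables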
Get the accuracy of stocksModels[[i]] on the corresponding test-set cluster stocksTest[[i]]
predictions <- sapply(1:3, function(i) {
  p <- predict(stocksModels[[i]], newdata=stocksTest[[i]], type="response")
  conf.mat <- table(stocksTest[[i]]$PositiveDec, p > 0.5)  # per-cluster confusion matrix
  accuracy <- sum(diag(conf.mat)) / sum(conf.mat)
  list(predict=p, accuracy=accuracy)
})
predictions
## [,1] [,2] [,3]
## predict Numeric,1298 Numeric,2080 Numeric,96
## accuracy 0.6194145 0.5504808 0.6458333
Compute overall test-set accuracy of the cluster-then-predict approach
allPredictions <- unlist(predictions[1,])
allOutcomes <- unlist(sapply(stocksTest, function(s) {
s$PositiveDec
}))
(conf.mat <- table(allOutcomes, allPredictions > 0.5))
##
## allOutcomes FALSE TRUE
## 0 467 1110
## 1 353 1544
sum(diag(conf.mat)) / sum(conf.mat)
## [1] 0.5788716
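At 0.5789, the cluster-then-predict approach modestly outperforms the 0.5671 test-set accuracy of the single logistic regression model.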