Source File code Connect connect
Load the libraries that are required for the assignment:
library("tm")
library("SnowballC")
library("caTools")
library("rpart")
library("rpart.plot")
library("ROCR")
library("randomForest")
library("caret")
We first cluster the observations and then build cluster-specific prediction models.
This dataset contains the following variables:
# Read the stock-return data from the working directory and show its
# column names and types.
stocks <- read.csv(file = "StocksCluster.csv")
str(stocks)
## 'data.frame': 11580 obs. of 12 variables:
## $ ReturnJan : num 0.0807 -0.0107 0.0477 -0.074 -0.031 ...
## $ ReturnFeb : num 0.0663 0.1021 0.036 -0.0482 -0.2127 ...
## $ ReturnMar : num 0.0329 0.1455 0.0397 0.0182 0.0915 ...
## $ ReturnApr : num 0.1831 -0.0844 -0.1624 -0.0247 0.1893 ...
## $ ReturnMay : num 0.13033 -0.3273 -0.14743 -0.00604 -0.15385 ...
## $ ReturnJune : num -0.0176 -0.3593 0.0486 -0.0253 -0.1061 ...
## $ ReturnJuly : num -0.0205 -0.0253 -0.1354 -0.094 0.3553 ...
## $ ReturnAug : num 0.0247 0.2113 0.0334 0.0953 0.0568 ...
## $ ReturnSep : num -0.0204 -0.58 0 0.0567 0.0336 ...
## $ ReturnOct : num -0.1733 -0.2671 0.0917 -0.0963 0.0363 ...
## $ ReturnNov : num -0.0254 -0.1512 -0.0596 -0.0405 -0.0853 ...
## $ PositiveDec: int 0 0 0 1 1 1 1 0 0 0 ...
# Count how many rows of the full dataset fall in each PositiveDec class.
table(stocks$PositiveDec)
##
## 0 1
## 5256 6324
What is the maximum correlation between any two return variables in the dataset? You should look at the pairwise correlations between ReturnJan, ReturnFeb, ReturnMar, ReturnApr, ReturnMay, ReturnJune, ReturnJuly, ReturnAug, ReturnSep, ReturnOct, and ReturnNov.
# Pairwise correlations over every column of stocks. This includes the
# PositiveDec outcome, so its row and column are not return-vs-return pairs
# and should be ignored when looking for the largest return correlation.
cor(stocks)
## ReturnJan ReturnFeb ReturnMar ReturnApr
## ReturnJan 1.000000000 0.06677458 -0.090496798 -0.037678006
## ReturnFeb 0.066774583 1.00000000 -0.155983263 -0.191351924
## ReturnMar -0.090496798 -0.15598326 1.000000000 0.009726288
## ReturnApr -0.037678006 -0.19135192 0.009726288 1.000000000
## ReturnMay -0.044411417 -0.09552092 -0.003892789 0.063822504
## ReturnJune 0.092238307 0.16999448 -0.085905486 -0.011027752
## ReturnJuly -0.081429765 -0.06177851 0.003374160 0.080631932
## ReturnAug -0.022792019 0.13155979 -0.022005400 -0.051756051
## ReturnSep -0.026437153 0.04350177 0.076518327 -0.028920972
## ReturnOct 0.142977229 -0.08732427 -0.011923758 0.048540025
## ReturnNov 0.067632333 -0.15465828 0.037323535 0.031761837
## PositiveDec 0.004728518 -0.03817318 0.022408661 0.094353528
## ReturnMay ReturnJune ReturnJuly ReturnAug
## ReturnJan -0.044411417 0.09223831 -0.0814297650 -0.0227920187
## ReturnFeb -0.095520920 0.16999448 -0.0617785094 0.1315597863
## ReturnMar -0.003892789 -0.08590549 0.0033741597 -0.0220053995
## ReturnApr 0.063822504 -0.01102775 0.0806319317 -0.0517560510
## ReturnMay 1.000000000 -0.02107454 0.0908502642 -0.0331256580
## ReturnJune -0.021074539 1.00000000 -0.0291525996 0.0107105260
## ReturnJuly 0.090850264 -0.02915260 1.0000000000 0.0007137558
## ReturnAug -0.033125658 0.01071053 0.0007137558 1.0000000000
## ReturnSep 0.021962862 0.04474727 0.0689478037 0.0007407139
## ReturnOct 0.017166728 -0.02263599 -0.0547089088 -0.0755945614
## ReturnNov 0.048046590 -0.06527054 -0.0483738369 -0.1164890345
## PositiveDec 0.058201934 0.02340975 0.0743642097 0.0041669657
## ReturnSep ReturnOct ReturnNov PositiveDec
## ReturnJan -0.0264371526 0.14297723 0.06763233 0.004728518
## ReturnFeb 0.0435017706 -0.08732427 -0.15465828 -0.038173184
## ReturnMar 0.0765183267 -0.01192376 0.03732353 0.022408661
## ReturnApr -0.0289209718 0.04854003 0.03176184 0.094353528
## ReturnMay 0.0219628623 0.01716673 0.04804659 0.058201934
## ReturnJune 0.0447472692 -0.02263599 -0.06527054 0.023409745
## ReturnJuly 0.0689478037 -0.05470891 -0.04837384 0.074364210
## ReturnAug 0.0007407139 -0.07559456 -0.11648903 0.004166966
## ReturnSep 1.0000000000 -0.05807924 -0.01971980 0.041630286
## ReturnOct -0.0580792362 1.00000000 0.19167279 -0.052574956
## ReturnNov -0.0197197998 0.19167279 1.00000000 -0.062346556
## PositiveDec 0.0416302863 -0.05257496 -0.06234656 1.000000000
# Reproducible 70/30 train/test split driven by the outcome column.
set.seed(144)
spl <- sample.split(stocks$PositiveDec, SplitRatio = 0.7)
stocksTrain <- stocks[spl, ]
stocksTest <- stocks[!spl, ]

# Logistic regression of PositiveDec on all other columns.
Model <- glm(PositiveDec ~ ., family = binomial, data = stocksTrain)

# Fitted probabilities on the training rows, tabulated against the true
# outcome at a 0.5 threshold.
PredictTrain <- predict(Model, type = "response")
table(stocksTrain$PositiveDec, PredictTrain > 0.5)
##
## FALSE TRUE
## 0 990 2689
## 1 787 3640
# Training-set accuracy at the 0.5 threshold: the share of rows whose
# thresholded prediction agrees with the observed outcome. Computed from the
# predictions themselves rather than from counts copied out of the printed
# table, so it stays correct if the data, seed, or model changes.
mean((PredictTrain > 0.5) == (stocksTrain$PositiveDec == 1))
## [1] 0.5711818
# Score the held-out rows with the model fitted on the training set, then
# tabulate true outcome against the 0.5-thresholded prediction.
PredictTest <- predict(Model, type = "response", newdata = stocksTest)
table(stocksTest$PositiveDec, PredictTest > 0.5)
##
## FALSE TRUE
## 0 417 1160
## 1 344 1553
# Test-set accuracy at the 0.5 threshold, derived directly from the
# predictions instead of from hand-transcribed confusion-matrix counts.
mean((PredictTest > 0.5) == (stocksTest$PositiveDec == 1))
## [1] 0.5670697
What is the accuracy on the test set of a baseline model that always predicts the most common outcome (PositiveDec = 1)?
# Count how many test-set rows fall in each PositiveDec class.
table(stocksTest$PositiveDec)
##
## 0 1
## 1577 1897
# Baseline accuracy: the fraction of test rows with PositiveDec equal to 1,
# i.e. the hit rate of always predicting 1. Computed from the data rather
# than from counts copied out of the printed table.
mean(stocksTest$PositiveDec == 1)
## [1] 0.5460564
# Keep every column except the outcome, so only the monthly return
# variables are used from here on.
limitedTrain <- stocksTrain[setdiff(names(stocksTrain), "PositiveDec")]
limitedTest <- stocksTest[setdiff(names(stocksTest), "PositiveDec")]

# Build the preprocessing object from the training rows only, then apply the
# same transformation to both training and test rows.
library(caret)
preproc <- preProcess(limitedTrain)
normTrain <- predict(preproc, newdata = limitedTrain)
normTest <- predict(preproc, newdata = limitedTest)

# Mean of ReturnJan in the transformed training set.
mean(normTrain$ReturnJan)
## [1] 2.100586e-17
# Mean of ReturnJan in the test set after applying the preprocessing object
# that was built from limitedTrain.
mean(normTest$ReturnJan)
## [1] -0.0004185886
# k-means with three clusters on the normalized training returns; the seed
# is set first so the run is repeatable.
set.seed(144)
km <- kmeans(normTrain, centers = 3)

# Convert the fit to a kcca object so that predict() can assign both
# training and test rows to clusters.
library(flexclust)
km.kcca <- as.kcca(km, normTrain)
clusterTrain <- predict(km.kcca)
clusterTest <- predict(km.kcca, newdata = normTest)

# Partition the original (un-normalized) train and test data by cluster.
stocksTrain1 <- stocksTrain[clusterTrain == 1, ]
stocksTrain2 <- stocksTrain[clusterTrain == 2, ]
stocksTrain3 <- stocksTrain[clusterTrain == 3, ]
stocksTest1 <- stocksTest[clusterTest == 1, ]
stocksTest2 <- stocksTest[clusterTest == 2, ]
stocksTest3 <- stocksTest[clusterTest == 3, ]

# One logistic regression per cluster, each using all return variables.
StocksModel1 <- glm(PositiveDec ~ ., family = binomial, data = stocksTrain1)
StocksModel2 <- glm(PositiveDec ~ ., family = binomial, data = stocksTrain2)
StocksModel3 <- glm(PositiveDec ~ ., family = binomial, data = stocksTrain3)

summary(StocksModel1)
##
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7307 -1.2910 0.8878 1.0280 1.5023
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.17224 0.06302 2.733 0.00628 **
## ReturnJan 0.02498 0.29306 0.085 0.93206
## ReturnFeb -0.37207 0.29123 -1.278 0.20139
## ReturnMar 0.59555 0.23325 2.553 0.01067 *
## ReturnApr 1.19048 0.22439 5.305 1.12e-07 ***
## ReturnMay 0.30421 0.22845 1.332 0.18298
## ReturnJune -0.01165 0.29993 -0.039 0.96901
## ReturnJuly 0.19769 0.27790 0.711 0.47685
## ReturnAug 0.51273 0.30858 1.662 0.09660 .
## ReturnSep 0.58833 0.28133 2.091 0.03651 *
## ReturnOct -1.02254 0.26007 -3.932 8.43e-05 ***
## ReturnNov -0.74847 0.28280 -2.647 0.00813 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4243.0 on 3156 degrees of freedom
## Residual deviance: 4172.9 on 3145 degrees of freedom
## AIC: 4196.9
##
## Number of Fisher Scoring iterations: 4
# Coefficient table and fit statistics for the model fitted on stocksTrain2.
summary(StocksModel2)
##
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2012 -1.1941 0.8583 1.1334 1.9424
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.10293 0.03785 2.719 0.006540 **
## ReturnJan 0.88451 0.20276 4.362 1.29e-05 ***
## ReturnFeb 0.31762 0.26624 1.193 0.232878
## ReturnMar -0.37978 0.24045 -1.579 0.114231
## ReturnApr 0.49291 0.22460 2.195 0.028189 *
## ReturnMay 0.89655 0.25492 3.517 0.000436 ***
## ReturnJune 1.50088 0.26014 5.770 7.95e-09 ***
## ReturnJuly 0.78315 0.26864 2.915 0.003554 **
## ReturnAug -0.24486 0.27080 -0.904 0.365876
## ReturnSep 0.73685 0.24820 2.969 0.002989 **
## ReturnOct -0.27756 0.18400 -1.509 0.131419
## ReturnNov -0.78747 0.22458 -3.506 0.000454 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6506.3 on 4695 degrees of freedom
## Residual deviance: 6362.2 on 4684 degrees of freedom
## AIC: 6386.2
##
## Number of Fisher Scoring iterations: 4
# Coefficient table and fit statistics for the model fitted on stocksTrain3.
summary(StocksModel3)
##
## Call:
## glm(formula = PositiveDec ~ ., family = binomial, data = stocksTrain3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9146 -1.0393 -0.7689 1.1921 1.6939
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.181896 0.325182 -0.559 0.5759
## ReturnJan -0.009789 0.448943 -0.022 0.9826
## ReturnFeb -0.046883 0.213432 -0.220 0.8261
## ReturnMar 0.674179 0.564790 1.194 0.2326
## ReturnApr 1.281466 0.602672 2.126 0.0335 *
## ReturnMay 0.762512 0.647783 1.177 0.2392
## ReturnJune 0.329434 0.408038 0.807 0.4195
## ReturnJuly 0.774164 0.729360 1.061 0.2885
## ReturnAug 0.982605 0.533158 1.843 0.0653 .
## ReturnSep 0.363807 0.627774 0.580 0.5622
## ReturnOct 0.782242 0.733123 1.067 0.2860
## ReturnNov -0.873752 0.738480 -1.183 0.2367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 346.92 on 252 degrees of freedom
## Residual deviance: 328.29 on 241 degrees of freedom
## AIC: 352.29
##
## Number of Fisher Scoring iterations: 4
# Score each cluster's test rows with the model fitted on the matching
# training cluster.
PredictTest1 <- predict(StocksModel1, type = "response", newdata = stocksTest1)
PredictTest2 <- predict(StocksModel2, type = "response", newdata = stocksTest2)
PredictTest3 <- predict(StocksModel3, type = "response", newdata = stocksTest3)
And the classification matrices can be computed with:
# True outcome vs. 0.5-thresholded prediction for the cluster-1 test rows.
table(stocksTest1$PositiveDec, PredictTest1 > 0.5)
##
## FALSE TRUE
## 0 30 471
## 1 23 774
# True outcome vs. 0.5-thresholded prediction for the cluster-2 test rows.
table(stocksTest2$PositiveDec, PredictTest2 > 0.5)
##
## FALSE TRUE
## 0 388 626
## 1 309 757
# True outcome vs. 0.5-thresholded prediction for the cluster-3 test rows.
table(stocksTest3$PositiveDec, PredictTest3 > 0.5)
##
## FALSE TRUE
## 0 49 13
## 1 21 13
# Pool the per-cluster test predictions and outcomes. Both vectors are
# concatenated in the same cluster order (1, 2, 3) so positions line up.
AllPredictions <- c(PredictTest1, PredictTest2, PredictTest3)
AllOutcomes <- c(
  stocksTest1$PositiveDec,
  stocksTest2$PositiveDec,
  stocksTest3$PositiveDec
)

# Combined confusion matrix at the 0.5 threshold.
table(AllOutcomes, AllPredictions > 0.5)
##
## AllOutcomes FALSE TRUE
## 0 467 1110
## 1 353 1544
# Overall test-set accuracy of the cluster-then-predict models at the 0.5
# threshold, computed from the pooled vectors rather than from counts copied
# out of the printed table.
mean((AllPredictions > 0.5) == (AllOutcomes == 1))
## [1] 0.5788716