# Load the pre-processed AMZN RSI data set (AMZN_rsi_cc1_preprocess.csv).
vol_data <- read.csv("AMZN_rsi_cc1_preprocess.csv", header = TRUE)
# Factor analysis needs numeric input with no missing values, so drop every
# row that has an NA in any column.
vol_data <- vol_data[complete.cases(vol_data), ]
# Keep only columns 8 to 29 (the variables used for the factor analysis).
# This removes the date variable and the dependent variable y_cc1, which
# factor analysis does not need.
vol_data <- vol_data[, 8:29, drop = FALSE]
# Choosing the number of factors with the chi-square test from factanal().
# Null hypothesis: the model described by the factors found so far is good
# enough, i.e. it predicts the data well.
#   - low p-value  -> the model does not predict well -> more factors needed
#   - high p-value -> does not by itself mean you can comfortably stop
# Go for one or two factors and look at the gain before settling on a number.
# Procedure:
#   1. start with one factor;
#   2. if the p-value is significant (low), try more factors;
#   3. keep adding factors until the p-value is insignificant.
# The fit below uses 10 factors.
fact <- factanal(x = vol_data, factors = 10)
print(fact)
##
## Call:
## factanal(x = vol_data, factors = 10)
##
## Uniquenesses:
## Rsi2 Rsi3 Rsi4 Rsi5 Rsi6 Rsi7 Rsi8 Rsi9 Rsi10 Rsi15
## 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005
## Rsi20 RsiD2 RsiD3 RsiD4 RsiD5 RsiD6 RsiD7 RsiD8 RsiD9 RsiD10
## 0.005 0.186 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005
## RsiD15 RsiD20
## 0.005 0.008
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7 Factor8
## Rsi2 0.538 0.693 0.114 0.171 0.418
## Rsi3 0.680 0.632 0.202 0.290
## Rsi4 0.771 0.536 0.252 0.204
## Rsi5 0.835 0.445 0.268 0.145
## Rsi6 0.881 0.367 0.262 0.102
## Rsi7 0.916 0.302 0.244
## Rsi8 0.941 0.249 0.217
## Rsi9 0.959 0.204 0.187
## Rsi10 0.972 0.167 0.155
## Rsi15 0.993
## Rsi20 0.981 -0.103 -0.130
## RsiD2 0.894
## RsiD3 0.470 0.738 0.467
## RsiD4 0.772 0.411 0.460
## RsiD5 0.123 0.921 0.216 0.232 -0.156
## RsiD6 0.174 0.967 0.112
## RsiD7 0.223 0.959 0.147
## RsiD8 0.268 0.919 0.247 0.108
## RsiD9 0.311 0.863 0.341 0.170
## RsiD10 0.351 0.800 0.425 -0.107 0.191
## RsiD15 0.522 0.504 0.673
## RsiD20 0.641 0.299 0.692
## Factor9 Factor10
## Rsi2
## Rsi3
## Rsi4
## Rsi5
## Rsi6
## Rsi7
## Rsi8
## Rsi9
## Rsi10
## Rsi15
## Rsi20
## RsiD2
## RsiD3
## RsiD4
## RsiD5
## RsiD6
## RsiD7
## RsiD8
## RsiD9
## RsiD10
## RsiD15
## RsiD20
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7
## SS loadings 9.439 7.818 1.757 1.621 0.533 0.353 0.134
## Proportion Var 0.429 0.355 0.080 0.074 0.024 0.016 0.006
## Cumulative Var 0.429 0.784 0.864 0.938 0.962 0.978 0.984
## Factor8 Factor9 Factor10
## SS loadings 0.061 0.019 0.014
## Proportion Var 0.003 0.001 0.001
## Cumulative Var 0.987 0.988 0.989
##
## Test of the hypothesis that 10 factors are sufficient.
## The chi square statistic is 74172.81 on 56 degrees of freedom.
## The p-value is 0
# The number of factors has to be set by hand; choosing more than 15 here
# makes factanal() raise an error.
fact <- factanal(x = vol_data, factors = 15)
print(fact)
##
## Call:
## factanal(x = vol_data, factors = 15)
##
## Uniquenesses:
## Rsi2 Rsi3 Rsi4 Rsi5 Rsi6 Rsi7 Rsi8 Rsi9 Rsi10 Rsi15
## 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005
## Rsi20 RsiD2 RsiD3 RsiD4 RsiD5 RsiD6 RsiD7 RsiD8 RsiD9 RsiD10
## 0.005 0.211 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005
## RsiD15 RsiD20
## 0.005 0.005
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7 Factor8
## Rsi2 0.538 0.693 0.114 0.176 0.418
## Rsi3 0.680 0.632 0.202 0.289
## Rsi4 0.771 0.536 0.251 0.204
## Rsi5 0.835 0.445 0.268 0.145
## Rsi6 0.881 0.367 0.262 0.102
## Rsi7 0.916 0.303 0.243
## Rsi8 0.941 0.249 0.217
## Rsi9 0.959 0.205 0.187
## Rsi10 0.972 0.168 0.155
## Rsi15 0.993
## Rsi20 0.981 -0.103 -0.131
## RsiD2 0.883
## RsiD3 0.468 0.754 0.441
## RsiD4 0.771 0.427 0.448
## RsiD5 0.123 0.920 0.225 0.227 -0.153
## RsiD6 0.174 0.966 0.117
## RsiD7 0.223 0.959 0.147
## RsiD8 0.268 0.919 0.247 0.106
## RsiD9 0.311 0.863 0.340 -0.100 0.169
## RsiD10 0.351 0.801 0.424 -0.108 0.191
## RsiD15 0.522 0.505 0.672
## RsiD20 0.641 0.299 0.694
## Factor9 Factor10 Factor11 Factor12 Factor13 Factor14 Factor15
## Rsi2
## Rsi3
## Rsi4
## Rsi5
## Rsi6
## Rsi7
## Rsi8
## Rsi9
## Rsi10
## Rsi15
## Rsi20
## RsiD2
## RsiD3
## RsiD4
## RsiD5
## RsiD6
## RsiD7
## RsiD8
## RsiD9
## RsiD10
## RsiD15
## RsiD20
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7
## SS loadings 9.438 7.815 1.757 1.647 0.498 0.353 0.132
## Proportion Var 0.429 0.355 0.080 0.075 0.023 0.016 0.006
## Cumulative Var 0.429 0.784 0.864 0.939 0.962 0.978 0.984
## Factor8 Factor9 Factor10 Factor11 Factor12 Factor13
## SS loadings 0.062 0.018 0.014 0.001 0.000 0.000
## Proportion Var 0.003 0.001 0.001 0.000 0.000 0.000
## Cumulative Var 0.986 0.987 0.988 0.988 0.988 0.988
## Factor14 Factor15
## SS loadings 0.000 0.000
## Proportion Var 0.000 0.000
## Cumulative Var 0.988 0.988
##
## Test of the hypothesis that 15 factors are sufficient.
## The chi square statistic is 72288.53 on 6 degrees of freedom.
## The p-value is 0
# Check the correlation of the variables with the factors.
# Rule of thumb: a loading greater than 0.7 means the variable is well
# explained by that factor.
# plot() and text() read only the first two columns of a matrix, so select
# Factor1 and Factor2 explicitly rather than hard-coding all 15 columns
# (which fails as soon as the fit has fewer factors). The object is not
# called `load`, so it no longer shadows base::load().
factor_loadings <- fact$loadings[, 1:2]
plot(factor_loadings, type = "n") # empty frame: Factor1 (x) vs Factor2 (y)
text(factor_loadings, labels = names(vol_data), cex = 0.7) # variable names
