#Principal Component Analysis and Hierarchical Clustering
# ref: http://moderndata.plot.ly/principal-component-analysis-cluster-plotly/
# Fit a PCA to the built-in mtcars data. cor = TRUE runs the decomposition on
# the correlation matrix, so every variable is put on a common scale first.
pcaCars <- princomp(x = mtcars, cor = TRUE)

# Components stored on the fitted object.
names(pcaCars)
## [1] "sdev"     "loadings" "center"   "scale"    "n.obs"    "scores"  
## [7] "call"

# Standard deviation and (cumulative) share of variance for each component.
summary(pcaCars)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     2.5706809 1.6280258 0.79195787 0.51922773
## Proportion of Variance 0.6007637 0.2409516 0.05701793 0.02450886
## Cumulative Proportion  0.6007637 0.8417153 0.89873322 0.92324208
##                            Comp.5     Comp.6     Comp.7     Comp.8
## Standard deviation     0.47270615 0.45999578 0.36777981 0.35057301
## Proportion of Variance 0.02031374 0.01923601 0.01229654 0.01117286
## Cumulative Proportion  0.94355581 0.96279183 0.97508837 0.98626123
##                             Comp.9     Comp.10     Comp.11
## Standard deviation     0.277572792 0.228112781 0.148473587
## Proportion of Variance 0.007004241 0.004730495 0.002004037
## Cumulative Proportion  0.993265468 0.997995963 1.000000000

# Component variances drawn as a line chart (plot() on a princomp fit
# produces a scree plot).
plot(pcaCars, type = "lines")

# Agglomerative clustering (Ward's criterion, "ward.D2") on the distances
# between the cars' principal-component scores, followed by the dendrogram.
# dist() is kept inline: plot() on the hclust result uses that call as the
# x-axis label.
carsHC <- hclust(d = dist(pcaCars$scores), method = "ward.D2")
plot(carsHC)

# Cut the tree into three groups and show each car's group number.
carsClusters <- cutree(carsHC, k = 3)
plot(carsClusters)

# One row per car: all component scores, the cluster as a factor, and a
# readable "Cluster <n>" label for legends.
carsDf <- data.frame(
  pcaCars$scores,
  cluster = factor(carsClusters),
  cluster_name = paste("Cluster", carsClusters)
)
library(ggplot2)

# Static scatter of the first two principal components, coloured by cluster,
# with gray reference lines through the origin drawn beneath the points.
p1 <- ggplot(data = carsDf, mapping = aes(x = Comp.1, y = Comp.2)) +
  geom_hline(yintercept = 0, colour = "gray70") +
  geom_vline(xintercept = 0, colour = "gray70") +
  geom_point(mapping = aes(colour = cluster), size = 3, alpha = 0.55) +
  xlim(-5, 6) +
  labs(
    x = "PC1",
    y = "PC2",
    title = "PCA Clusters from Hierarchical Clustering of Cars Data"
  ) +
  theme_classic()

# Same plot with each car's name printed 0.25 units above its point.
p1 + geom_text(mapping = aes(y = Comp.2 + 0.25, label = rownames(carsDf)))

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive version of the PC1/PC2 scatter, coloured by cluster label and
# showing the car name on hover.
#
# plotly >= 4.0 maps data-frame columns through one-sided formulas (~column).
# Bare column names are evaluated as ordinary variables and fail with
# "object 'Comp.1' not found". The trace type is stated explicitly so plotly
# does not have to guess it.
p <- plot_ly(
  carsDf,
  x = ~Comp.1,
  y = ~Comp.2,
  text = rownames(carsDf),
  type = "scatter",
  mode = "markers",
  color = ~cluster_name,
  marker = list(size = 11)
)

# Namespace-qualified because graphics::layout() has the same name.
p <- plotly::layout(
  p,
  title = "PCA Clusters from Hierarchical Clustering of Cars Data",
  xaxis = list(title = "PC1"),
  yaxis = list(title = "PC2")
)
p