Principal Component & Clustering by Kmeans

library(MASS)

## Warning: package 'MASS' was built under R version 3.1.2

data(crabs)
str(crabs)

## 'data.frame':    200 obs. of  8 variables:
##  $ sp   : Factor w/ 2 levels "B","O": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex  : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ index: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ FL   : num  8.1 8.8 9.2 9.6 9.8 10.8 11.1 11.6 11.8 11.8 ...
##  $ RW   : num  6.7 7.7 7.8 7.9 8 9 9.9 9.1 9.6 10.5 ...
##  $ CL   : num  16.1 18.1 19 20.1 20.3 23 23.8 24.5 24.2 25.2 ...
##  $ CW   : num  19 20.8 22.4 23.1 23 26.5 27.1 28.4 27.8 29.3 ...
##  $ BD   : num  7 7.4 7.7 8.2 8.2 9.8 9.8 10.4 9.7 10.3 ...

library(psych)

## Warning: package 'psych' was built under R version 3.1.2

## Data visualization



KMO(crabs[, 4:8])

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = crabs[, 4:8])
## Overall MSA =  0.79
## MSA for each item = 
##   FL   RW   CL   CW   BD 
## 0.86 0.83 0.71 0.72 0.84

# function to standardize data on a similar scale
rangeStd <- function(x) {(x-min(x))/(max(x)-min(x))}
stdData <- apply(subset(crabs, , 4:8),2, rangeStd)
# enough correlation in the dataset or dedundancy to run PCA will help reduce number of dimensions
# range standardize the data is necessary since PCA is a scale sensistive
## we use simple range std to be able to compare apple to apple
pc <- principal(stdData, nfactors=4, rotate='varimax')

## Loading required namespace: GPArotation

par(mfrow=c(1,1))
plot(pc$values[1:4], type="b", main="Scree Plot")
#each of these numbers amount of vari encapsulated in each PC of this dataset

pc$weights # 1st PC 4.7 is picking up most variance of the dataset

##           RC1         RC2        RC3        RC4
## FL  0.9648563 -0.08042853 -1.7996115  6.7565207
## RW -1.1955483  1.97290349 -0.2410677 -0.7801158
## CL  0.1036548 -0.53688183  1.5040413 -0.1133102
## CW -0.5526257 -0.48999546  3.0254492  0.6363241
## BD  1.3735976 -0.16161771 -2.2017850 -6.5418717

head(pc$scores) # this is a new dataset with 4 PC as x1,x2,x3,x4

##          RC1       RC2        RC3         RC4
## 1 -1.0904824 -1.834537 -1.1361717 -0.35771000
## 2 -1.2986549 -1.365723 -0.7330716  0.04185194
## 3 -1.2135927 -1.479885 -0.3361774  0.32670438
## 4 -0.9822213 -1.562546 -0.3715082  0.15359009
## 5 -0.9635397 -1.499340 -0.4800274  0.49861131
## 6 -0.7167580 -1.252667 -0.2016098 -0.68784017

#PC1 is a linear combination of 0.96 of FL, -1.19 of RW so on for all X
pc$loadings #Important: this is a correlation of original variable with PC

## 
## Loadings:
##    RC1    RC2    RC3    RC4   
## FL  0.805  0.551  0.199       
## RW  0.496  0.836  0.236       
## CL  0.781  0.499  0.373       
## CW  0.734  0.518  0.437       
## BD  0.831  0.511  0.213       
## 
##                  RC1   RC2   RC3   RC4
## SS loadings    2.734 1.781 0.471 0.012
## Proportion Var 0.547 0.356 0.094 0.002
## Cumulative Var 0.547 0.903 0.997 1.000

# PC1 picked up 0.805 of squared value of variance in FL
# cumulative variance of 0.903 for PC2 means, 90% of variance in the dataset is captured by PC1 & PC2
# which variables makes most of my PCs


# LAST STEP use k-means clustering to examine the data relationships
head(stdData)

##           FL         RW         CL         CW         BD
## 1 0.05660377 0.01459854 0.04255319 0.05066667 0.05806452
## 2 0.10062893 0.08759124 0.10334347 0.09866667 0.08387097
## 3 0.12578616 0.09489051 0.13069909 0.14133333 0.10322581
## 4 0.15094340 0.10218978 0.16413374 0.16000000 0.13548387
## 5 0.16352201 0.10948905 0.17021277 0.15733333 0.13548387
## 6 0.22641509 0.18248175 0.25227964 0.25066667 0.23870968

cl <- kmeans(stdData, 3)
stdData1 <- cbind(stdData, as.factor(cl$cluster))
library(ggplot2)

## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:psych':
## 
##     %+%

ggplot(as.data.frame(pc$scores), aes(x=RC1, y=RC2, color=as.factor(stdData1[,6]))) + 
    geom_point(size=5) + theme_bw() + 
    ggtitle('First 2 prinipal components clustering w/ kmeans') + 
    xlab("Principal Component 1") + ylab("Principal Component 2")

# SUMMARY OF PCA-Kmeans
# 1.Check the relevance of our dataset for PCA
# 2. Standardize data
# 3. Run PCA
# 4. Compressed our variables to just n components
# 5. Interpret variable loadings on our components
# 6. use k-means clustering to examine the data relationships
# 7. plot 2d or 3d view