library(MASS)
## Warning: package 'MASS' was built under R version 3.1.2
data(crabs)
str(crabs)
## 'data.frame': 200 obs. of 8 variables:
## $ sp : Factor w/ 2 levels "B","O": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ index: int 1 2 3 4 5 6 7 8 9 10 ...
## $ FL : num 8.1 8.8 9.2 9.6 9.8 10.8 11.1 11.6 11.8 11.8 ...
## $ RW : num 6.7 7.7 7.8 7.9 8 9 9.9 9.1 9.6 10.5 ...
## $ CL : num 16.1 18.1 19 20.1 20.3 23 23.8 24.5 24.2 25.2 ...
## $ CW : num 19 20.8 22.4 23.1 23 26.5 27.1 28.4 27.8 29.3 ...
## $ BD : num 7 7.4 7.7 8.2 8.2 9.8 9.8 10.4 9.7 10.3 ...
library(psych)
## Warning: package 'psych' was built under R version 3.1.2
## Data visualization
KMO(crabs[, 4:8])
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = crabs[, 4:8])
## Overall MSA = 0.79
## MSA for each item =
## FL RW CL CW BD
## 0.86 0.83 0.71 0.72 0.84
# function to standardize data on a similar scale
rangeStd <- function(x) {(x-min(x))/(max(x)-min(x))}
stdData <- apply(subset(crabs, , 4:8),2, rangeStd)
# enough correlation in the dataset or dedundancy to run PCA will help reduce number of dimensions
# range standardize the data is necessary since PCA is a scale sensistive
## we use simple range std to be able to compare apple to apple
pc <- principal(stdData, nfactors=4, rotate='varimax')
## Loading required namespace: GPArotation
par(mfrow=c(1,1))
plot(pc$values[1:4], type="b", main="Scree Plot")
#each of these numbers amount of vari encapsulated in each PC of this dataset
pc$weights # 1st PC 4.7 is picking up most variance of the dataset
## RC1 RC2 RC3 RC4
## FL 0.9648563 -0.08042853 -1.7996115 6.7565207
## RW -1.1955483 1.97290349 -0.2410677 -0.7801158
## CL 0.1036548 -0.53688183 1.5040413 -0.1133102
## CW -0.5526257 -0.48999546 3.0254492 0.6363241
## BD 1.3735976 -0.16161771 -2.2017850 -6.5418717
head(pc$scores) # this is a new dataset with 4 PC as x1,x2,x3,x4
## RC1 RC2 RC3 RC4
## 1 -1.0904824 -1.834537 -1.1361717 -0.35771000
## 2 -1.2986549 -1.365723 -0.7330716 0.04185194
## 3 -1.2135927 -1.479885 -0.3361774 0.32670438
## 4 -0.9822213 -1.562546 -0.3715082 0.15359009
## 5 -0.9635397 -1.499340 -0.4800274 0.49861131
## 6 -0.7167580 -1.252667 -0.2016098 -0.68784017
#PC1 is a linear combination of 0.96 of FL, -1.19 of RW so on for all X
pc$loadings #Important: this is a correlation of original variable with PC
##
## Loadings:
## RC1 RC2 RC3 RC4
## FL 0.805 0.551 0.199
## RW 0.496 0.836 0.236
## CL 0.781 0.499 0.373
## CW 0.734 0.518 0.437
## BD 0.831 0.511 0.213
##
## RC1 RC2 RC3 RC4
## SS loadings 2.734 1.781 0.471 0.012
## Proportion Var 0.547 0.356 0.094 0.002
## Cumulative Var 0.547 0.903 0.997 1.000
# PC1 picked up 0.805 of squared value of variance in FL
# cumulative variance of 0.903 for PC2 means, 90% of variance in the dataset is captured by PC1 & PC2
# which variables makes most of my PCs
# LAST STEP use k-means clustering to examine the data relationships
head(stdData)
## FL RW CL CW BD
## 1 0.05660377 0.01459854 0.04255319 0.05066667 0.05806452
## 2 0.10062893 0.08759124 0.10334347 0.09866667 0.08387097
## 3 0.12578616 0.09489051 0.13069909 0.14133333 0.10322581
## 4 0.15094340 0.10218978 0.16413374 0.16000000 0.13548387
## 5 0.16352201 0.10948905 0.17021277 0.15733333 0.13548387
## 6 0.22641509 0.18248175 0.25227964 0.25066667 0.23870968
cl <- kmeans(stdData, 3)
stdData1 <- cbind(stdData, as.factor(cl$cluster))
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:psych':
##
## %+%

ggplot(as.data.frame(pc$scores), aes(x=RC1, y=RC2, color=as.factor(stdData1[,6]))) +
geom_point(size=5) + theme_bw() +
ggtitle('First 2 prinipal components clustering w/ kmeans') +
xlab("Principal Component 1") + ylab("Principal Component 2")

# SUMMARY OF PCA-Kmeans
# 1.Check the relevance of our dataset for PCA
# 2. Standardize data
# 3. Run PCA
# 4. Compressed our variables to just n components
# 5. Interpret variable loadings on our components
# 6. use k-means clustering to examine the data relationships
# 7. plot 2d or 3d view