Loading packages that we will be using

# devtools::install_github("vqv/ggbiplot")
# devtools::install_github("bwlewis/rthreejs")
library(RCurl)
library(knitr)
library(plyr)
library(ggbiplot)
library(rCharts)
library(qcc)
library(threejs)
library(rgl)
library(pca3d)
library(gridExtra)

Load a sample data set

We would like to look at the Breast Cancer data set from UCI Machine Learning dataset repository

link1 = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
names1 = c('ID','Clump.Thickness','Cell.Size','Cell.Shape','Adhesion','Epithelial',
          'Bare.Nuclei','Chromatin','Nucleoli','Mitoses','Class')

#    #  Attribute                     Domain
#    -- -----------------------------------------
#    1. Sample code number            id number
#    2. Clump Thickness               1 - 10
#    3. Uniformity of Cell Size       1 - 10
#    4. Uniformity of Cell Shape      1 - 10
#    5. Marginal Adhesion             1 - 10
#    6. Single Epithelial Cell Size   1 - 10
#    7. Bare Nuclei                   1 - 10
#    8. Bland Chromatin               1 - 10
#    9. Normal Nucleoli               1 - 10
#   10. Mitoses                       1 - 10
#   11. Class:                        (2 for benign, 4 for malignant)
dataBC = read.csv(link1, header=F)
names(dataBC) = names1
str(dataBC)
## 'data.frame':    699 obs. of  11 variables:
##  $ ID             : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ Clump.Thickness: int  5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.Size      : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.Shape     : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ Adhesion       : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ Epithelial     : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.Nuclei    : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
##  $ Chromatin      : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ Nucleoli       : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : int  2 2 2 2 2 4 2 2 2 2 ...

Clean up the data for our analysis

dataBC = subset(dataBC, Bare.Nuclei!='?')
dataBC$Bare.Nuclei = as.integer(dataBC$Bare.Nuclei)
dataBC = mutate(dataBC, Color=ifelse(Class==2, 'green','red'))
dataBC = mutate(dataBC, Label=ifelse(Class==2, 'Benign','Malignant'))
dataBC$Label = as.factor(dataBC$Label)
str(dataBC)
## 'data.frame':    683 obs. of  13 variables:
##  $ ID             : int  1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
##  $ Clump.Thickness: int  5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.Size      : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.Shape     : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ Adhesion       : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ Epithelial     : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.Nuclei    : int  2 3 4 6 2 3 3 2 2 2 ...
##  $ Chromatin      : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ Nucleoli       : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : int  2 2 2 2 2 4 2 2 2 2 ...
##  $ Color          : chr  "green" "green" "green" "green" ...
##  $ Label          : Factor w/ 2 levels "Benign","Malignant": 1 1 1 1 1 2 1 1 1 1 ...

Mean centering data

head(dataBC[,2:10])
##   Clump.Thickness Cell.Size Cell.Shape Adhesion Epithelial Bare.Nuclei
## 1               5         1          1        1          2           2
## 2               5         4          4        5          7           3
## 3               3         1          1        1          2           4
## 4               6         8          8        1          3           6
## 5               4         1          1        3          2           2
## 6               8        10         10        8          7           3
##   Chromatin Nucleoli Mitoses
## 1         3        1       1
## 2         3        2       1
## 3         3        1       1
## 4         3        7       1
## 5         3        1       1
## 6         9        7       1
dataMC = apply(dataBC[,2:10], 2, function(y) y - mean(y))
head(dataMC)
##   Clump.Thickness  Cell.Size Cell.Shape   Adhesion Epithelial Bare.Nuclei
## 1       0.5578331 -2.1508053 -2.2152269 -1.8301611 -1.2342606  -1.2166911
## 2       0.5578331  0.8491947  0.7847731  2.1698389  3.7657394  -0.2166911
## 3      -1.4421669 -2.1508053 -2.2152269 -1.8301611 -1.2342606   0.7833089
## 4       1.5578331  4.8491947  4.7847731 -1.8301611 -0.2342606   2.7833089
## 5      -0.4421669 -2.1508053 -2.2152269  0.1698389 -1.2342606  -1.2166911
## 6       3.5578331  6.8491947  6.7847731  5.1698389  3.7657394  -0.2166911
##    Chromatin   Nucleoli    Mitoses
## 1 -0.4450952 -1.8696925 -0.6032211
## 2 -0.4450952 -0.8696925 -0.6032211
## 3 -0.4450952 -1.8696925 -0.6032211
## 4 -0.4450952  4.1303075 -0.6032211
## 5 -0.4450952 -1.8696925 -0.6032211
## 6  5.5549048  4.1303075 -0.6032211

Apply PCA

pca = prcomp(dataBC[,2:10], center=T, scale.=T)

Investigate PCA

str(pca)
## List of 5
##  $ sdev    : num [1:9] 2.351 0.896 0.847 0.729 0.635 ...
##  $ rotation: num [1:9, 1:9] -0.311 -0.395 -0.39 -0.337 -0.35 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:9] "Clump.Thickness" "Cell.Size" "Cell.Shape" "Adhesion" ...
##   .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:9] 4.44 3.15 3.22 2.83 3.23 ...
##   ..- attr(*, "names")= chr [1:9] "Clump.Thickness" "Cell.Size" "Cell.Shape" "Adhesion" ...
##  $ scale   : Named num [1:9] 2.82 3.07 2.99 2.86 2.22 ...
##   ..- attr(*, "names")= chr [1:9] "Clump.Thickness" "Cell.Size" "Cell.Shape" "Adhesion" ...
##  $ x       : num [1:683, 1:9] 1.407 -0.849 1.416 -1.79 1.282 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:683] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"

Eigen Values

round(pca$sdev^2, 2)
## [1] 5.53 0.80 0.72 0.53 0.40 0.37 0.30 0.26 0.09

Extract PCs

pcs = data.frame(pca$x)
str(pcs)
## 'data.frame':    683 obs. of  9 variables:
##  $ PC1: num  1.407 -0.849 1.416 -1.79 1.282 ...
##  $ PC2: num  0.071 0.227 -0.577 -1.479 0.25 ...
##  $ PC3: num  0.207 0.417 -0.339 -0.128 0.331 ...
##  $ PC4: num  -0.6238 0.1466 0.1639 -0.3106 -0.0838 ...
##  $ PC5: num  0.15 -0.882 0.151 -0.726 0.493 ...
##  $ PC6: num  0.0825 -1.1726 -0.0347 0.8743 -0.17 ...
##  $ PC7: num  -0.27519 0.64359 -0.34249 0.27309 0.00218 ...
##  $ PC8: num  0.354 0.574 0.248 -1.594 0.366 ...
##  $ PC9: num  -0.0212 0.08511 -0.00891 -0.19165 0.00122 ...

Percentage of variance explained by each PC

cov = round(pca$sdev^2/sum(pca$sdev^2)*100, 2)
cov = data.frame(c(1:9),cov)
names(cov)[1] = 'PCs'
names(cov)[2] = 'Variance'
cov
##   PCs Variance
## 1   1    61.41
## 2   2     8.92
## 3   3     7.98
## 4   4     5.90
## 5   5     4.48
## 6   6     4.12
## 7   7     3.29
## 8   8     2.90
## 9   9     1.00
p1 = hPlot(Variance~PCs, data=cov, type=c('line'), radius=7)
p1$print(include_assets=T)

PCA Pareto Chart

PCA = pca$sdev^2
names(PCA) = paste0('PC', cov$PCs)
qcc::pareto.chart(PCA)

##      
## Pareto chart analysis for PCA
##        Frequency Cum.Freq. Percentage Cum.Percent.
##   PC1 5.52692008  5.526920  61.410223     61.41022
##   PC2 0.80235707  6.329277   8.915079     70.32530
##   PC3 0.71804127  7.047318   7.978236     78.30354
##   PC4 0.53138409  7.578703   5.904268     84.20781
##   PC5 0.40352916  7.982232   4.483657     88.69146
##   PC6 0.37046840  8.352700   4.116316     92.80778
##   PC7 0.29654175  8.649242   3.294908     96.10269
##   PC8 0.26069582  8.909938   2.896620     98.99931
##   PC9 0.09006237  9.000000   1.000693    100.00000

Percentage of variance explained by PC1 and PC2

sum(cov$Variance[1:2])
## [1] 70.33

Percentage of variance explained by PC1, PC2 and PC3

sum(cov$Variance[1:3])
## [1] 78.31

Classification using PC1 and PC2

** R base graphics plot **

biplot(pca,cex=0.8)

Based on ggplot2

g = ggbiplot(pca, choices = 1:2, obs.scale = 1, var.scale = 1, groups = dataBC[,13], ellipse = T, circle = T)
g+theme(legend.position='top') + scale_colour_manual(values=c('green','red'))

Visualizing relationship between different PCs

pcs$Label = dataBC$Label
p1 = ggplot(pcs, aes(PC1, PC2, color=Label))+ geom_point() + 
     scale_colour_manual(values=c('green','red')) + theme(legend.position='none') 
p2 = ggplot(pcs, aes(PC2, PC3, color=Label))+ geom_point() +
     scale_colour_manual(values=c('green','red')) + theme(legend.position='none')
p3 = ggplot(pcs, aes(PC1, PC3, color=Label))+ geom_point() +
     scale_colour_manual(values=c('green','red')) + theme(legend.position='none')
p4 = ggplot(pcs, aes(PC1, PC4, color=Label))+geom_point() +
     scale_colour_manual(values=c('green','red')) + theme(legend.position='none')
grid.arrange(p1, p2, p3, p4, ncol=2)

Classification using PC1, PC2 and PC3

scatterplot3js(pcs[,c('PC1','PC2','PC3')], labels=dataBC$Label, 
               color = dataBC$Color, flip.y=TRUE,renderer="canvas")

This will show a 3D plot on your local R

# pca3d(pca, components = 1:3, col = dataBC$Color, biplot=T)