# devtools::install_github("vqv/ggbiplot")
# devtools::install_github("bwlewis/rthreejs")
library(RCurl)
library(knitr)
library(plyr)
library(ggbiplot)
library(rCharts)
library(qcc)
library(threejs)
library(rgl)
library(pca3d)
library(gridExtra)
We would like to look at the Breast Cancer data set from UCI Machine Learning dataset repository
link1 = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
names1 = c('ID','Clump.Thickness','Cell.Size','Cell.Shape','Adhesion','Epithelial',
'Bare.Nuclei','Chromatin','Nucleoli','Mitoses','Class')
# # Attribute Domain
# -- -----------------------------------------
# 1. Sample code number id number
# 2. Clump Thickness 1 - 10
# 3. Uniformity of Cell Size 1 - 10
# 4. Uniformity of Cell Shape 1 - 10
# 5. Marginal Adhesion 1 - 10
# 6. Single Epithelial Cell Size 1 - 10
# 7. Bare Nuclei 1 - 10
# 8. Bland Chromatin 1 - 10
# 9. Normal Nucleoli 1 - 10
# 10. Mitoses 1 - 10
# 11. Class: (2 for benign, 4 for malignant)
dataBC = read.csv(link1, header=F)
names(dataBC) = names1
str(dataBC)
## 'data.frame': 699 obs. of 11 variables:
## $ ID : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ Clump.Thickness: int 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.Size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.Shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ Adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ Epithelial : int 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.Nuclei : Factor w/ 11 levels "?","1","10","2",..: 2 3 4 6 2 3 3 2 2 2 ...
## $ Chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ Nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : int 2 2 2 2 2 4 2 2 2 2 ...
dataBC = subset(dataBC, Bare.Nuclei!='?')
dataBC$Bare.Nuclei = as.integer(dataBC$Bare.Nuclei)
dataBC = mutate(dataBC, Color=ifelse(Class==2, 'green','red'))
dataBC = mutate(dataBC, Label=ifelse(Class==2, 'Benign','Malignant'))
dataBC$Label = as.factor(dataBC$Label)
str(dataBC)
## 'data.frame': 683 obs. of 13 variables:
## $ ID : int 1000025 1002945 1015425 1016277 1017023 1017122 1018099 1018561 1033078 1033078 ...
## $ Clump.Thickness: int 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.Size : int 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.Shape : int 1 4 1 8 1 10 1 2 1 1 ...
## $ Adhesion : int 1 5 1 1 3 8 1 1 1 1 ...
## $ Epithelial : int 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.Nuclei : int 2 3 4 6 2 3 3 2 2 2 ...
## $ Chromatin : int 3 3 3 3 3 9 3 3 1 2 ...
## $ Nucleoli : int 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : int 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : int 2 2 2 2 2 4 2 2 2 2 ...
## $ Color : chr "green" "green" "green" "green" ...
## $ Label : Factor w/ 2 levels "Benign","Malignant": 1 1 1 1 1 2 1 1 1 1 ...
head(dataBC[,2:10])
## Clump.Thickness Cell.Size Cell.Shape Adhesion Epithelial Bare.Nuclei
## 1 5 1 1 1 2 2
## 2 5 4 4 5 7 3
## 3 3 1 1 1 2 4
## 4 6 8 8 1 3 6
## 5 4 1 1 3 2 2
## 6 8 10 10 8 7 3
## Chromatin Nucleoli Mitoses
## 1 3 1 1
## 2 3 2 1
## 3 3 1 1
## 4 3 7 1
## 5 3 1 1
## 6 9 7 1
dataMC = apply(dataBC[,2:10], 2, function(y) y - mean(y))
head(dataMC)
## Clump.Thickness Cell.Size Cell.Shape Adhesion Epithelial Bare.Nuclei
## 1 0.5578331 -2.1508053 -2.2152269 -1.8301611 -1.2342606 -1.2166911
## 2 0.5578331 0.8491947 0.7847731 2.1698389 3.7657394 -0.2166911
## 3 -1.4421669 -2.1508053 -2.2152269 -1.8301611 -1.2342606 0.7833089
## 4 1.5578331 4.8491947 4.7847731 -1.8301611 -0.2342606 2.7833089
## 5 -0.4421669 -2.1508053 -2.2152269 0.1698389 -1.2342606 -1.2166911
## 6 3.5578331 6.8491947 6.7847731 5.1698389 3.7657394 -0.2166911
## Chromatin Nucleoli Mitoses
## 1 -0.4450952 -1.8696925 -0.6032211
## 2 -0.4450952 -0.8696925 -0.6032211
## 3 -0.4450952 -1.8696925 -0.6032211
## 4 -0.4450952 4.1303075 -0.6032211
## 5 -0.4450952 -1.8696925 -0.6032211
## 6 5.5549048 4.1303075 -0.6032211
pca = prcomp(dataBC[,2:10], center=T, scale.=T)
str(pca)
## List of 5
## $ sdev : num [1:9] 2.351 0.896 0.847 0.729 0.635 ...
## $ rotation: num [1:9, 1:9] -0.311 -0.395 -0.39 -0.337 -0.35 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:9] "Clump.Thickness" "Cell.Size" "Cell.Shape" "Adhesion" ...
## .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:9] 4.44 3.15 3.22 2.83 3.23 ...
## ..- attr(*, "names")= chr [1:9] "Clump.Thickness" "Cell.Size" "Cell.Shape" "Adhesion" ...
## $ scale : Named num [1:9] 2.82 3.07 2.99 2.86 2.22 ...
## ..- attr(*, "names")= chr [1:9] "Clump.Thickness" "Cell.Size" "Cell.Shape" "Adhesion" ...
## $ x : num [1:683, 1:9] 1.407 -0.849 1.416 -1.79 1.282 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:683] "1" "2" "3" "4" ...
## .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
round(pca$sdev^2, 2)
## [1] 5.53 0.80 0.72 0.53 0.40 0.37 0.30 0.26 0.09
pcs = data.frame(pca$x)
str(pcs)
## 'data.frame': 683 obs. of 9 variables:
## $ PC1: num 1.407 -0.849 1.416 -1.79 1.282 ...
## $ PC2: num 0.071 0.227 -0.577 -1.479 0.25 ...
## $ PC3: num 0.207 0.417 -0.339 -0.128 0.331 ...
## $ PC4: num -0.6238 0.1466 0.1639 -0.3106 -0.0838 ...
## $ PC5: num 0.15 -0.882 0.151 -0.726 0.493 ...
## $ PC6: num 0.0825 -1.1726 -0.0347 0.8743 -0.17 ...
## $ PC7: num -0.27519 0.64359 -0.34249 0.27309 0.00218 ...
## $ PC8: num 0.354 0.574 0.248 -1.594 0.366 ...
## $ PC9: num -0.0212 0.08511 -0.00891 -0.19165 0.00122 ...
cov = round(pca$sdev^2/sum(pca$sdev^2)*100, 2)
cov = data.frame(c(1:9),cov)
names(cov)[1] = 'PCs'
names(cov)[2] = 'Variance'
cov
## PCs Variance
## 1 1 61.41
## 2 2 8.92
## 3 3 7.98
## 4 4 5.90
## 5 5 4.48
## 6 6 4.12
## 7 7 3.29
## 8 8 2.90
## 9 9 1.00
p1 = hPlot(Variance~PCs, data=cov, type=c('line'), radius=7)
p1$print(include_assets=T)
PCA Pareto Chart
PCA = pca$sdev^2
names(PCA) = paste0('PC', cov$PCs)
qcc::pareto.chart(PCA)
##
## Pareto chart analysis for PCA
## Frequency Cum.Freq. Percentage Cum.Percent.
## PC1 5.52692008 5.526920 61.410223 61.41022
## PC2 0.80235707 6.329277 8.915079 70.32530
## PC3 0.71804127 7.047318 7.978236 78.30354
## PC4 0.53138409 7.578703 5.904268 84.20781
## PC5 0.40352916 7.982232 4.483657 88.69146
## PC6 0.37046840 8.352700 4.116316 92.80778
## PC7 0.29654175 8.649242 3.294908 96.10269
## PC8 0.26069582 8.909938 2.896620 98.99931
## PC9 0.09006237 9.000000 1.000693 100.00000
sum(cov$Variance[1:2])
## [1] 70.33
sum(cov$Variance[1:3])
## [1] 78.31
** R base graphics plot **
biplot(pca,cex=0.8)
Based on ggplot2
g = ggbiplot(pca, choices = 1:2, obs.scale = 1, var.scale = 1, groups = dataBC[,13], ellipse = T, circle = T)
g+theme(legend.position='top') + scale_colour_manual(values=c('green','red'))
Visualizing relationship between different PCs
pcs$Label = dataBC$Label
p1 = ggplot(pcs, aes(PC1, PC2, color=Label))+ geom_point() +
scale_colour_manual(values=c('green','red')) + theme(legend.position='none')
p2 = ggplot(pcs, aes(PC2, PC3, color=Label))+ geom_point() +
scale_colour_manual(values=c('green','red')) + theme(legend.position='none')
p3 = ggplot(pcs, aes(PC1, PC3, color=Label))+ geom_point() +
scale_colour_manual(values=c('green','red')) + theme(legend.position='none')
p4 = ggplot(pcs, aes(PC1, PC4, color=Label))+geom_point() +
scale_colour_manual(values=c('green','red')) + theme(legend.position='none')
grid.arrange(p1, p2, p3, p4, ncol=2)
scatterplot3js(pcs[,c('PC1','PC2','PC3')], labels=dataBC$Label,
color = dataBC$Color, flip.y=TRUE,renderer="canvas")
This will show a 3D plot on your local R
# pca3d(pca, components = 1:3, col = dataBC$Color, biplot=T)