In this post I explore principal component analysis (PCA), k-means clustering, and the rgl 3D engine to display both.

First I load the required packages:

library(rgl, lib.loc = "C:/TFS/Rlib/")
library(SciViews, lib.loc = "C:/TFS/Rlib/")
library(plotrix, lib.loc = "C:/TFS/Rlib/")
library(corrplot, lib.loc = "C:/TFS/Rlib/")
library(ggplot2)
library(reshape, lib.loc = "C:/TFS/Rlib/")
library(gridExtra)

We will use the mtcars dataset:

head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

We run a PCA with pcomp() from SciViews:

cars.pca <- pcomp(~ mpg + cyl + disp + hp + drat + wt + qsec, data = mtcars)
mtcars_pca <- cbind(mtcars, cars.pca$scores, car = rownames(mtcars))
screeplot(cars.pca)

The first three principal components explain about 94% of the variance.
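
To see where that 94% comes from, the cumulative proportion of variance can be computed directly from the component standard deviations. A minimal sketch, assuming the pcomp object exposes sdev the way prcomp does:

# Proportion of variance per component, and the running total
prop_var <- cars.pca$sdev^2 / sum(cars.pca$sdev^2)
round(cumsum(prop_var), 3)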

Let's look at the loadings to get a better understanding:

summary(cars.pca)
## Importance of components (eigenvalues):
##                          PC1   PC2    PC3    PC4    PC5    PC6     PC7
## Variance               5.086 1.157 0.3448 0.1579 0.1295 0.0759 0.04920
## Proportion of Variance 0.727 0.165 0.0493 0.0226 0.0185 0.0108 0.00703
## Cumulative Proportion  0.727 0.892 0.9411 0.9636 0.9821 0.9930 1.00000
## 
## Loadings (eigenvectors, rotation matrix):
##      PC1    PC2    PC3    PC4    PC5    PC6    PC7   
## mpg  -0.413        -0.242 -0.767  0.213         0.351
## cyl   0.425        -0.188 -0.194 -0.238 -0.781  0.273
## disp  0.423         0.118 -0.588 -0.149  0.162 -0.638
## hp    0.388  0.337  0.203         0.831              
## drat -0.331  0.449  0.755 -0.117 -0.222 -0.233       
## wt    0.391 -0.322  0.441 -0.107 -0.167  0.366  0.613
## qsec -0.240 -0.749  0.294         0.328 -0.407 -0.131
# Heatmap of the loadings on the first three components
Loadings <- as.data.frame(cars.pca$loadings[, 1:3])
hm <- cbind(feature = rownames(Loadings), Loadings)
hmm <- melt(hm, id = "feature")
ggplot(hmm, aes(variable, feature)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(low = "red", high = "green")

Looking at it visually: strongly correlated variables show up as arrows close to each other, while arrows pointing away from each other indicate negative correlation. The correlation plot further down shows the same thing.

plot(cars.pca, which = "correlations") 

So cars scoring high on PC1 are heavy, have many cylinders and low mpg: typical US cars. PC2 says something about acceleration: qsec (the quarter-mile time) has a strong negative loading, so a high PC2 score means fast acceleration, and vice versa.
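
We can sanity-check this reading of PC1 by correlating its scores with the raw variables, using the mtcars_pca frame built earlier. A minimal sketch:

# Heavy, many-cylinder, low-mpg cars should score high on PC1
round(cor(mtcars_pca$PC1, mtcars_pca[, c("wt", "cyl", "mpg")]), 2)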

# upper left corner of the correlations plot: drat and mpg
cor(mtcars$drat, mtcars$mpg)
## [1] 0.6811719
# middle right: cyl and disp overlap
cor(mtcars$cyl, mtcars$disp)
## [1] 0.9020329
M <- cor(mtcars)
corrplot(M, method = "circle")

plot(cars.pca, which = "scores", cex = 0.8) # Similar to plot(scores(cars.pca)[, 1:3])
## Warning in isTRUE(!as.numeric(labels)): NAs introduced by coercion

The plot shows the data as expected. One example car from each of the four quadrants:

mtcars["Lincoln Continental",]
##                      mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Lincoln Continental 10.4   8  460 215    3 5.424 17.82  0  0    3    4
mtcars["Porsche 914-2",]
##               mpg cyl  disp hp drat   wt qsec vs am gear carb
## Porsche 914-2  26   4 120.3 91 4.43 2.14 16.7  0  1    5    2
mtcars["Maserati Bora",]
##               mpg cyl disp  hp drat   wt qsec vs am gear carb
## Maserati Bora  15   8  301 335 3.54 3.57 14.6  0  1    5    8
mtcars["Merc 230",]
##           mpg cyl  disp hp drat   wt qsec vs am gear carb
## Merc 230 22.8   4 140.8 95 3.92 3.15 22.9  1  0    4    2
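
Rather than picking these examples by hand, the most extreme car in each quadrant can be pulled out programmatically. A sketch over the mtcars_pca frame:

# Split the scores by quadrant sign and show the most extreme car in each
quadrant <- with(mtcars_pca, interaction(PC1 > 0, PC2 > 0))
sapply(split(mtcars_pca, quadrant), function(d)
  as.character(d$car[which.max(abs(d$PC1) + abs(d$PC2))]))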

Let's look at PC3:

plot(cars.pca, which = "correlations", choices = c(1,3))

plot(cars.pca, which = "correlations", choices = c(2,3))

PC3 has a high loading for drat (rear axle ratio). A higher axle ratio means functionally lower gearing, which makes hauling heavy loads easier. It also has a high loading on weight. We display both.

ds <- transform(mtcars_pca, car = reorder(car, drat))
ggplot(ds, aes(x = car, y = drat, label = car)) +
  geom_bar(stat = "identity") +
  geom_text(angle = 90, colour = "red")

ggplot(mtcars_pca, aes(PC1, PC3, label = rownames(mtcars_pca), colour = drat, size = wt)) +
  geom_point() +
  geom_text() +
  scale_colour_gradient(limits = c(3, 5), low = "red", high = "green")

Let's take a 3D view of the principal components. We will make the spheres larger for heavier cars.

# Empty 3D plot of the first three principal component scores
plot3d(scores(cars.pca)[, 1:3], size = 0)

# Make the spheres larger (and give them a distinct colour) when the car is heavier
size <- as.numeric(cut(mtcars_pca$wt, 5))
wlist <- split(mtcars_pca, size)
for (i in seq_along(wlist)) {
  with(wlist[[i]], spheres3d(PC1, PC2, PC3, radius = i / 40, col = rainbow(5)[i]))
}

# Add the car names as text labels
with(mtcars_pca, text3d(PC1, PC2, PC3, as.character(car)))
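
For a static capture of the interactive rgl window, something like snapshot3d can write the current scene to disk (the filename here is just illustrative):

# Save the current rgl scene to a PNG file
snapshot3d("pca_3d_view.png")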

The result looks like this:

We apply k-means clustering to see how the clusters plot on the principal components:

# Run k-means with 5 clusters; the seed makes the cluster assignments reproducible
set.seed(42)
k <- kmeans(mtcars, 5, nstart = 25, iter.max = 1000)
new <- cbind(mtcars_pca, cluster = k$cluster)
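
Note that k = 5 is a judgment call and k-means is sensitive to variable scales. A quick elbow plot over the standardized data can sanity-check the choice; a sketch, not part of the workflow above:

# Total within-cluster sum of squares for k = 1..10 on scaled data
wss <- sapply(1:10, function(kk)
  kmeans(scale(mtcars), centers = kk, nstart = 25, iter.max = 1000)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")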

Display it in 3D.

with(new, plot3d(PC1, PC2, PC3, col = cluster, size = 2, type = "s"))
# View with text labels
with(new, text3d(PC1, PC2, PC3, as.character(car)))

The results look like this:

The Pontiac Firebird is lighter than the other cars in its cluster. With this custom heatmap, sorted by cluster, it becomes visible which attributes drive the clustering.
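
That claim is easy to check numerically against the new frame; a minimal sketch:

# Compare the Firebird's weight to the mean weight of its cluster
fb <- new["Pontiac Firebird", ]
c(firebird = fb$wt, cluster_mean = mean(new$wt[new$cluster == fb$cluster]))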

# Order the cars by cluster and fix the factor levels so ggplot keeps that order
new <- new[order(new$cluster), ]
new$car <- factor(new$car, levels = as.character(new$car))
new$variable <- "cluster"

g = ggplot(new, aes(variable, car)) + geom_tile(aes(fill = cluster),
     colour = "white") + scale_fill_gradient(low = "red",
     high = "green") +  theme(legend.position="none")

# Helper that draws a one-column heatmap for a single variable
make_tile <- function(df, column) {
  df$variable <- column
  df$value <- df[[column]]
  ggplot(df, aes(variable, car)) +
    geom_tile(aes(fill = value), colour = "white") +
    scale_fill_gradient(low = "red", high = "green") +
    theme(axis.title.y = element_blank(), axis.text.y = element_blank(),
          legend.position = "none")
}

vars <- c("wt", "mpg", "qsec", "drat")
myplotslist2 <- c(list(g), lapply(vars, function(v) make_tile(new, v)))

grid.arrange(grobs = myplotslist2, ncol = length(vars) + 1, widths = rep(100, length(vars) + 1))

Let's compare the PC values to the clusters:

# The PC scores are already in new (carried over from mtcars_pca),
# so the same helper can be pointed at them directly
vars <- c("PC1", "PC2", "PC3")
myplotslist2 <- c(list(g), lapply(vars, function(v) make_tile(new, v)))
grid.arrange(grobs = myplotslist2, ncol = length(vars) + 1, widths = rep(100, length(vars) + 1))