In this post I will explore PCA, K-means clustering and the RGL engine to display both.
First I load the required components
require(rgl, lib.loc="C:/TFS/Rlib/")
require(SciViews, lib.loc="C:/TFS/Rlib/")
require(plotrix, lib.loc="C:/TFS/Rlib/")
library(corrplot, lib.loc="C:/TFS/Rlib/")
require(ggplot2)
require(reshape, lib.loc="C:/TFS/Rlib/")
require("gridExtra")
We wil use the cars dataset:
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
We run a PCA:
cars.pca <- pcomp(~mpg+cyl+disp+hp+drat+wt+qsec, data = mtcars)#, subset = -(8:14))
mtcars_pca = cbind(cbind(mtcars, cars.pca$scores), car = rownames(mtcars))
#cars.pca
screeplot(cars.pca)
The first three principle components explain a lot of the variance.
Lets look at the loadings to get a better understanding:
summary(cars.pca)
## Importance of components (eigenvalues):
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Variance 5.086 1.157 0.3448 0.1579 0.1295 0.0759 0.04920
## Proportion of Variance 0.727 0.165 0.0493 0.0226 0.0185 0.0108 0.00703
## Cumulative Proportion 0.727 0.892 0.9411 0.9636 0.9821 0.9930 1.00000
##
## Loadings (eigenvectors, rotation matrix):
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## mpg -0.413 -0.242 -0.767 0.213 0.351
## cyl 0.425 -0.188 -0.194 -0.238 -0.781 0.273
## disp 0.423 0.118 -0.588 -0.149 0.162 -0.638
## hp 0.388 0.337 0.203 0.831
## drat -0.331 0.449 0.755 -0.117 -0.222 -0.233
## wt 0.391 -0.322 0.441 -0.107 -0.167 0.366 0.613
## qsec -0.240 -0.749 0.294 0.328 -0.407 -0.131
Loadings <- as.data.frame(cars.pca$loadings[,1:3])
hm = cbind(car = rownames(Loadings), Loadings)
hmm = melt(hm, id = c("car"))
ggplot(hmm, aes(variable, car)) + geom_tile(aes(fill = value),
colour = "white") + scale_fill_gradient(low = "red",
high = "green")
Look at it visually, stongly correlated are arrows close to each other. The opposite is arrows pointing away from each other. The correlation plot will show the same thing.
plot(cars.pca, which = "correlations")
So high in PC1 are cars that are heavy, many cylinders, low mpg. Typical US cars.. PC2 says something about accelaration, high on PC2 is fast accelaration, and vice versa.
#upper left corner
cor(mtcars$drat, mtcars$mpg)
## [1] 0.6811719
#middle right
cor(mtcars$cyl, mtcars$disp)
## [1] 0.9020329
M<-cor(mtcars)
corrplot(M, method="circle")
plot(cars.pca, which = "scores", cex = 0.8) # Similar to plot(scores(cars.pca)[, 1:3])
## Warning in isTRUE(!as.numeric(labels)): NAs introduced by coercion
The plot shows the data as expected. Four quandrants;
data.frame(mtcars["Lincoln Continental",])
## mpg cyl disp hp drat wt qsec vs am gear carb
## Lincoln Continental 10.4 8 460 215 3 5.424 17.82 0 0 3 4
mtcars["Porsche 914-2",]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Porsche 914-2 26 4 120.3 91 4.43 2.14 16.7 0 1 5 2
mtcars["Maserati Bora",]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
mtcars["Merc 230",]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Merc 230 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
lets look at PC3:
plot(cars.pca, which = "correlations", choices = c(1,3))
plot(cars.pca, which = "correlations", choices = c(2,3))
PC3 has a high loading for drat (Rear axle ratio). Higher axle ratio means you have funtionally lower gearing which will make hauling big ass loads even easier. And a high loading on weight. We display both.
ds <- transform(mtcars_pca, car = reorder(car, drat))
ggplot(ds, aes(x = factor(car), y = drat, label=car)) + geom_bar(stat = "identity") + geom_text(angle=90, colour="red")
ggplot(mtcars_pca, aes(PC1, PC3, label = rownames(mtcars_pca), colour=drat, size=wt)) + geom_point() + geom_text() + scale_colour_gradient(limits=c(3, 5), low="red", high="green")
Lets take a 3D view of the principle components. We will make the spheres larger by weight.
plot3d(scores(cars.pca)[, 1:3], size=0, col=cols )
#Make the spheres larger when the car has more weight
size <- as.numeric(cut(mtcars_pca$wt, 5))
WList <- split(new, size)
for(i in seq_along(WList)) {
with(WList[[i]], spheres3d(PC1, PC2,PC3, radius = i/40, col=col))
}
#Add text labels
with(new,text3d(PC1,PC2,PC3,label))
The result looks like this:
We apply k-means clustering to see how the clusters plot on the principle components:
#view the results
k <- kmeans(mtcars, 5, nstart=25, iter.max=1000)
new = cbind(mtcars,cluster = k$cluster)
Display it in 3D.
with(new,plot3d(PC1,PC2,PC3, col=k$cluster, size=2, type='s'))
#View with text labels
with(new,text3d(PC1,PC2,PC3,label))
The results look like this:
The pontiac Firebird is lighter than the others in the cluster. With this custom heatmap sorted by cluster, it becomes visible on which attributes the clustering is done.
new = cbind(mtcars,cluster = k$cluster)
new = cbind(car = rownames(new),new)
new = new[order(new$cluster),]
#ordering of cars on cluster
new$car <- factor(new$car, levels=(new$car)[order(new$cluster)])
new = cbind(new, variable="cluster")
g = ggplot(new, aes(variable, car)) + geom_tile(aes(fill = cluster),
colour = "white") + scale_fill_gradient(low = "red",
high = "green") + theme(legend.position="none")
myplotslist2 <- list()
myplotslist2[[1]] = g
var = c("wt", "mpg", "qsec", "drat")
for (i in 1:length(var)){
t= paste("new[[\"variable\"]] = \"", var[[i]],"\"; a = ggplot(new, aes(variable, car)) + geom_tile(aes(fill = ", var[[i]], "),colour = \"white\") + scale_fill_gradient(low = \"red\", high = \"green\") + theme(axis.title.y=element_blank(), axis.text.y=element_blank(),legend.position=\"none\"); myplotslist2[[i+1]] = a")
eval(parse(text=t))
}
grid.arrange(grobs=myplotslist2, ncol=(length(var)+1), widths=rep(100,length(var)+1))
Lets look at the PC values compared to the cluster:
new = cbind(new, cars.pca$scores[,1:3])
var = c("PC1", "PC2", "PC3")
myplotslist2 <- list()
myplotslist2[[1]] = g
for (i in 1:length(var)){
t= paste("new[[\"variable\"]] = \"", var[[i]],"\"; a = ggplot(new, aes(variable, car)) + geom_tile(aes(fill = ", var[[i]], "),colour = \"white\") + scale_fill_gradient(low = \"red\", high = \"green\") + theme(axis.title.y=element_blank(), axis.text.y=element_blank(),legend.position=\"none\"); myplotslist2[[i+1]] = a")
eval(parse(text=t))
}
grid.arrange(grobs=myplotslist2, ncol=(length(var)+1), widths=rep(100,length(var)+1))