The following work is an application of Principal Components Analysis and Cluster Analysis on a multivariate data set.
We refer to the state data set containing data related to the 50 States of the USA (see R Documentation for details).
library(ggplot2)
colnames(state.x77) <- gsub(" ", "", colnames(state.x77))
statedf <- data.frame(state.abb, state.region,
state.division, state.x77)
colnames(statedf)[1:3] <- gsub("state.", "",
colnames(statedf)[1:3])
The first plot displays the relationship between each pair of variables, while the second one focuses on the relationship between 2 variables (LifeExp and Murder) and the third one is a boxplot of Income grouped by division.
# multivariate plot
pairs(statedf[, 5:10], panel = panel.smooth, main = "Multivariate Plot")
# bivariate plot
plot(statedf$LifeExp, statedf$Murder, main = "Bivariate Plot")
text(statedf$LifeExp, statedf$Murder, statedf$abb,
cex = 0.5, pos = 4, col = "blue")
# boxplot grouped by division
boxplot(statedf$Income ~ statedf$region, col = "green",
las = 2)
title(main = "Income Boxplot")
We calculate the correlation between the selected variables:
mydata <- statedf[, 5:9]
round(cor(mydata), 2)
## Income Illiteracy LifeExp Murder HSGrad
## Income 1.00 -0.44 0.34 -0.23 0.62
## Illiteracy -0.44 1.00 -0.59 0.70 -0.66
## LifeExp 0.34 -0.59 1.00 -0.78 0.58
## Murder -0.23 0.70 -0.78 1.00 -0.49
## HSGrad 0.62 -0.66 0.58 -0.49 1.00
We center and scale the original variables in order to make them uniform, and apply PCA:
mypca <- prcomp(mydata, center = TRUE, scale. = TRUE)
names(mypca)
## [1] "sdev" "rotation" "center" "scale" "x"
exVar <- mypca$sdev^2
exVar <- 100 * exVar / sum(exVar)
round(exVar[1:3], 2)
## [1] 64.03 18.76 7.98
barplot(height = exVar, main = "Explained Variance",
xlab = "Principal Components", ylim = c(0,80),
col = "cyan", border = "navyblue")
grid(nx = NA, ny = 4, lty = 2)
par(mfrow = c(1,3))
for(i in 1:3) {
barplot(mypca$rotation[, i], ylim = c(-0.75, 0.75),
col = "blue", las = 3)
abline(h = 0, col = "darkgray")
title(main = paste("PC",i), ylab = "variable loadings")
}
The following plot leads to the interpretation of pc1 as expressing High Education and Life Expectation versus Illiteracy and Murder, and pc2 as expressing Income level.
pccor <- round(cor(mydata, mypca$x), 3)
par(mfrow = c(1,1))
plot(pccor[, 1], pccor[, 2], type = "p", pch = 19,
xlim = c(-1, 1), ylim = c(-1, 1), col = "darkred",
main = "Correlations", xlab = "PC1", ylab = "PC2")
grid(nx = 9, ny = 9, lty = 2)
abline(h = 0, v = 0, col = "darkgray")
text(pccor[, 1], pccor[, 2], rownames(pccor),
cex = 0.7, pos = 4, col = "black")
plot(mypca$x[, 1], mypca$x[, 2], type = "p",
pch = 19, col = "steelblue",
main = "Plot of PC data (rotated data)",
xlab = "PC1 - Education and Life Expectation",
ylab = "PC2 - Income")
grid()
abline(h = 0, v = 0, col = "magenta")
text(mypca$x[, 1], mypca$x[, 2], rownames(mypca$x),
cex = 0.6, pos = 4, col = "black")
The final step consists of an application of Cluster Analysis on the data obtained from PCA.
kmx <- kmeans(mypca$x[,1:2], centers = 5, nstart = 3)
plot(mypca$x[, 1], mypca$x[, 2], type = "p",
pch = 23, bg = 1+kmx$cluster, col = "black",
main = "Partition of the points into 5 clusters",
xlab = "PC1 - Education and Life Expectation",
ylab = "PC2 - Income")
grid()
abline(h = 0, v = 0, col = "darkgray")
points(kmx$centers, pch = 3, cex = 3, col = "black")