Get Data

Today's dataset: protein consumption across European countries. More datasets here.

url <- 'http://www.biz.uiowa.edu/faculty/jledolter/DataMining/protein.csv'
data <- read.csv(url) 
data$X <- NULL # drop index column
head(data) # check results
##          Country RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts
## 1        Albania    10.1       1.4  0.5  8.9  0.2    42.3    0.6  5.5
## 2        Austria     8.9      14.0  4.3 19.9  2.1    28.0    3.6  1.3
## 3        Belgium    13.5       9.3  4.1 17.5  4.5    26.6    5.7  2.1
## 4       Bulgaria     7.8       6.0  1.6  8.3  1.2    56.7    1.1  3.7
## 5 Czechoslovakia     9.7      11.4  2.8 12.5  2.0    34.3    5.0  1.1
## 6        Denmark    10.6      10.8  3.7 25.0  9.9    21.9    4.8  0.7
##   Fr.Veg
## 1    1.7
## 2    4.3
## 3    4.0
## 4    4.2
## 5    4.0
## 6    2.4
labcol <- 1 # index of the label column
data.stand <- scale(subset(data, select = -c(labcol)))  # standardized variables, labels excluded
data.lab <- data[,labcol] # labels
row.names(data.stand) <- data.lab # name rows by labels for nicer plots
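
As a quick sanity check, every standardized column should now have mean zero and unit standard deviation:

# means should be ~0 up to floating point; standard deviations exactly 1
round(colMeans(data.stand), 10)
apply(data.stand, 2, sd)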

PCA

Create a PCA model

data.cpca <- princomp(data.stand)
plot(data.cpca) # scree plot: variance of each component
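
The plot shows the variance of each component. For the exact proportions of explained variance, use summary():

summary(data.cpca) # standard deviation and proportion of variance per component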

Apply the PCA transform to the data

data.pca <- predict(data.cpca, data.stand) # transform the standardized data, not the raw frame
# plot result
plot(data.pca, cex = 0, main="PCA")
text(data.pca, labels = data.lab, col = 'black', cex = 1)

Create a biplot

# plot the first two loading vectors, labeled by variable name
plot(data.cpca$loadings, cex = 0)
text(data.cpca$loadings, labels = rownames(data.cpca$loadings), cex = 1)

Or simply:

biplot(data.cpca, expand = 1, xlim = c(-0.5,0.5), ylim = c(-0.7,0.4))

A brief explanation of biplots can be found here.

For the best PCA visuals, try the ggfortify package.
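
A minimal sketch, assuming ggfortify is installed (autoplot() picks up the row names we set earlier):

library(ggfortify)
# scores with country labels, plus loading arrows, in one call
autoplot(prcomp(data.stand), label = TRUE, loadings = TRUE, loadings.label = TRUE)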

K-Means

The basic clustering algorithm. An interactive example is available here.

library(cluster) # for clusplot() and silhouette() below; kmeans() itself is in base R
#Set number of clusters
num_clust <- 4
# Create K-Means model
k.means.fit <- kmeans(data.stand, num_clust)
# Useful info about result
print(k.means.fit)
## K-means clustering with 4 clusters of sizes 10, 4, 5, 6
## 
## Cluster means:
##        RedMeat  WhiteMeat       Eggs        Milk       Fish    Cereals
## 1 -0.677605893 -0.7595936 -0.8643394 -0.87567008 -0.1775148  0.9150064
## 2  0.006572897 -0.2290150  0.1914789  1.34587480  1.1582546 -0.8722721
## 3  1.599006499  0.2988565  0.9341308  0.60911284 -0.1422470 -0.5948180
## 4 -0.207544192  1.1696189  0.5344707  0.05460623 -0.3577726 -0.4478143
##       Starch       Nuts     Fr.Veg
## 1 -0.5299602  1.0565639  0.3292860
## 2  0.1676780 -0.9553392 -1.1148048
## 3  0.3451473 -0.3484949  0.1020010
## 4  0.4838590 -0.8336346  0.1093924
## 
## Clustering vector:
##        Albania        Austria        Belgium       Bulgaria Czechoslovakia 
##              1              4              3              1              4 
##        Denmark      E Germany        Finland         France         Greece 
##              2              4              2              3              1 
##        Hungary        Ireland          Italy    Netherlands         Norway 
##              1              3              1              4              2 
##         Poland       Portugal        Romania          Spain         Sweden 
##              4              1              1              1              2 
##    Switzerland             UK           USSR      W Germany     Yugoslavia 
##              3              3              1              4              1 
## 
## Within cluster sum of squares by cluster:
## [1] 70.957748  5.900318 12.069794 12.692620
##  (between_SS / total_SS =  53.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
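
Note that kmeans() starts from random centroids, so the partition can vary between runs. A common remedy, not used above, is the nstart argument: run several random initializations and keep the one with the lowest total within-cluster sum of squares. A minimal sketch (stored in a new variable so the results above stay untouched):

set.seed(42) # illustrative seed
k.means.fit.best <- kmeans(data.stand, num_clust, nstart = 25)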

Inspect particular attributes of the result.

Centroids:

k.means.fit$centers
##        RedMeat  WhiteMeat       Eggs        Milk       Fish    Cereals
## 1 -0.677605893 -0.7595936 -0.8643394 -0.87567008 -0.1775148  0.9150064
## 2  0.006572897 -0.2290150  0.1914789  1.34587480  1.1582546 -0.8722721
## 3  1.599006499  0.2988565  0.9341308  0.60911284 -0.1422470 -0.5948180
## 4 -0.207544192  1.1696189  0.5344707  0.05460623 -0.3577726 -0.4478143
##       Starch       Nuts     Fr.Veg
## 1 -0.5299602  1.0565639  0.3292860
## 2  0.1676780 -0.9553392 -1.1148048
## 3  0.3451473 -0.3484949  0.1020010
## 4  0.4838590 -0.8336346  0.1093924

Partitioning:

k.means.fit$cluster
##        Albania        Austria        Belgium       Bulgaria Czechoslovakia 
##              1              4              3              1              4 
##        Denmark      E Germany        Finland         France         Greece 
##              2              4              2              3              1 
##        Hungary        Ireland          Italy    Netherlands         Norway 
##              1              3              1              4              2 
##         Poland       Portugal        Romania          Spain         Sweden 
##              4              1              1              1              2 
##    Switzerland             UK           USSR      W Germany     Yugoslavia 
##              3              3              1              4              1

Cluster sizes:

k.means.fit$size
## [1] 10  4  5  6

Show clusters in the PCA view

cols = rainbow(length(k.means.fit$size)) # colors
plot(data.pca, cex=0, main="PCA+KMeans")
text(data.pca, labels = data.lab, col = cols[factor(k.means.fit$cluster)])

Or simply:

clusplot(data.stand, k.means.fit$cluster, color = T, labels=2, lines = F)

Elbow rule

The basic rule for choosing the number of clusters: compute some metric (here, the within-cluster sum of squares) for a range of cluster counts and pick the point of inflection (the ‘elbow’).

wssplot <- function(data, nc=10, seed=1234){
  # one cluster: WSS equals the total sum of squares
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}

wssplot(data.stand) 

The plot is not conclusive for the Protein dataset, but 2 and 5 clusters look slightly better than the other options. More information and other approaches: see Wikipedia, and a great note on StackOverflow.

Hierarchical clustering

The simplest approach.

# choose distance metric and create distance matrix
data.dist = dist(data.stand, method = 'euclidean') # 'maximum', 'manhattan', etc.
# choose intercluster distance metric and perform clustering
data.hclust = hclust(data.dist, method = 'average') # 'complete', 'single', 'ward.D2'
# plot result
plot(data.hclust, labels = data.lab)
# plot clusters for chosen number of clusters
rect.hclust(data.hclust, num_clust)
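
To turn the dendrogram into a flat partition, cut the tree at the chosen number of clusters with cutree(); cross-tabulating it against the K-Means result makes a quick consistency check:

# flat partition from the tree, compared with the K-Means labels
groups.hc <- cutree(data.hclust, num_clust)
table(hclust = groups.hc, kmeans = k.means.fit$cluster)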

Also take a look at the dendextend package.
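
A minimal sketch of what it offers, assuming the package is installed:

library(dendextend)
# color the branches of the existing tree by cluster membership
dend <- color_branches(as.dendrogram(data.hclust), k = num_clust)
plot(dend)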

Silhouette analysis

Type ?silhouette in the console for detailed information.

groups <- cutree(data.hclust, num_clust) 
# also try any another partitioning
#groups <- k.means.fit$cluster
sil <- silhouette(groups, data.dist)
row.names(sil) <- data.lab
plot(sil, col = cols, border = NA, main = 'Silhouette')
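
The average silhouette width summarizes the quality of a whole partition in one number (closer to 1 is better), so it can also guide the choice of num_clust, complementing the elbow rule above. A minimal sketch:

# average silhouette width for k = 2..10 hierarchical clusters
avg.sil <- sapply(2:10, function(k) {
  groups.k <- cutree(data.hclust, k)
  mean(silhouette(groups.k, data.dist)[, 'sil_width'])
})
plot(2:10, avg.sil, type = "b", xlab = "Number of Clusters",
     ylab = "Average silhouette width")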


Further reading: