In preparation for, and in anticipation of, proper socio-economic data, we test-drive SOM algorithms on the IRIS dataset. Questions to love.borjeson at gmail.com
A Self-Organizing Map (SOM) is a neural-network approach to clustering and/or classification. Here we concentrate on the former. Wikipedia should get you pretty far in terms of a basic understanding of the statistical mechanisms and the machine learning steps and iterations involved. The IRIS dataset is not ideal: too few dimensions and observations to do the method justice. Running through the script should nonetheless be informative.
A lot of attention is given to the plots. That is not just for show: SOMs are decidedly graphical/visual in nature and should be interpreted mainly, albeit not exclusively, by visual inspection.
The visual presentation is hence very important.
prescript
#If you, like me, run your code in RStudio, this can save you 1.5 seconds :-):
library(rstudioapi)
setwd(dirname(rstudioapi::callFun("getActiveDocumentContext")$path))
script
(Now on with it)
library(kohonen) #fitting SOMs
library(ggplot2) #plots
library(GGally) #plots
library(RColorBrewer) #colors, using predefined palettes
#clean slate...:
rm(list = ls())
iris_complete <- iris[complete.cases(iris),] #only complete cases... the iris dataset floats around in the sky with diamonds.
iris_unique <- unique(iris_complete) # Remove duplicates
Palettes we can use later.
contrast <- c("#FA4925", "#22693E", "#D4D40F", "#2C4382", "#F0F0F0", "#3D3D3D") #my own, contrasting pairs
kindofpretty <- c("#B39B66", "#3B3828", "#FAE6B9", "#F2F2F2", "#86BA9F", "#135E1F", "#FFF70A", "#FFB10A", "#0498BD", "#FF780A") #my own
kindofpretty2 <- c("#B39B66", "#3B3828", "#FAE6B9", "#F2F2F2", "#F58B00", "#F5D800", "#7185A3", "#786187") #my own
#or define palettes using RColorBrewer:
display.brewer.all() # you get it...
#and then, e.g.:
cols <- brewer.pal(10, "Paired")
#Similarly, here is a color function that can be called from inside other functions. Easy to change to rainbow, etc.:
terraincolors <- function(n, alpha = 1) {
  terrain.colors(n, alpha = alpha)[n:1] #reverse the order of the terrain palette
}
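A quick way to eyeball the reversed palette (my own illustrative addition, not needed for the analysis):
barplot(rep(1, 8), col = terraincolors(8), border = NA, axes = FALSE, main = "terraincolors(8)")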
#oh, well, first of all:
summary(iris_unique)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.00 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.80 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.00 Median :4.300 Median :1.300
## Mean :5.844 Mean :3.06 Mean :3.749 Mean :1.195
## 3rd Qu.:6.400 3rd Qu.:3.30 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.40 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :49
#Set attributes to group variables (not observations) into sets that can then be investigated using GGally.
attr(iris_unique,'Sepal') <- c("Sepal.Length", "Sepal.Width")
attr(iris_unique,'Petal') <- c("Petal.Length", "Petal.Width")
str(attributes(iris_unique)) #check...
## List of 5
## $ names : chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" ...
## $ row.names: int [1:149] 1 2 3 4 5 6 7 8 9 10 ...
## $ class : chr "data.frame"
## $ Sepal : chr [1:2] "Sepal.Length" "Sepal.Width"
## $ Petal : chr [1:2] "Petal.Length" "Petal.Width"
(SepalVar <- attr(iris_unique, "Sepal")) #define variable groups based on attributes
## [1] "Sepal.Length" "Sepal.Width"
(PetalVar <- attr(iris_unique, "Petal"))
## [1] "Petal.Length" "Petal.Width"
# Next, look at the within correlation using ggpairs.
ggpairs(iris_unique, title = "the whole dataset")
ggpairs(iris_unique, SepalVar, title = "Within Sepal")
ggpairs(iris_unique, PetalVar, title = "Within Petal")
# Then, look at the between correlation using ggpairs.
ggduo(
iris_unique, SepalVar, PetalVar,
types = list(continuous = "smooth_lm"),
title = "Between Sepal and Petal Variable Correlation",
xlab = "Sepal",
ylab = "Petal"
)
#scale data
iris.sc = scale(iris_unique[, 1:4]) #Levels/factors cannot be scaled... but they can be used in predictive SOMs via xyf. Later.
#build grid
iris.grid = somgrid(xdim = 10, ydim=10, topo="hexagonal", toroidal = TRUE) #gridsize roughly 5-10 samples / node. Hexagonal is to be preferred (more neighbors).
#Hence, 10*10 is too big a grid when n=150. But we'll go along with it here for illustrative purposes.
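As a rough, back-of-the-envelope check on the 5-10 samples per node rule (my own sketch, not from the kohonen docs):
n <- nrow(iris.sc)
side <- round(sqrt(n / 7)) #aim for ~7 samples per node on a square grid
side #~5, so a 5x5 grid would fit the rule of thumb better than 10x10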
#In toroidal maps, boundaries at the edges will only be shown on the top and right sides to avoid double boundaries.
A toroidal map has the shape of a swimming ring and basically works in a Pac-Man-like fashion: if you go over the edge you end up at the opposite end of the map.
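To see the Pac-Man effect in numbers, compare the largest unit-to-unit distance on a flat versus a toroidal grid of the same size (a small sketch of my own):
g_flat <- somgrid(xdim = 10, ydim = 10, topo = "hexagonal", toroidal = FALSE)
max(unit.distances(g_flat)) #corner to corner on the flat sheet
max(unit.distances(iris.grid)) #with wrap-around, the largest possible distance shrinks considerably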
# build model
set.seed(33) #for reproducibility
iris.som <- som(iris.sc, grid=iris.grid, rlen=700, alpha=c(0.05,0.01), keep.data = TRUE)
summary(iris.som)
## SOM of size 10x10 with a hexagonal toroidal topology and a bubble neighbourhood function.
## Training data included of 149 objects
## The number of layers is 1
## Mean distance to the closest unit in the map: 0.04065537
#a shot at efficiency:
#relate the mean distance between observations and their closest units to the mean distance between units.
meanD <- mean(iris.som$distances) #mean distance between observations and the respective closest unit in the map.
meanunitD <- mean(unit.distances(iris.grid)) #distances between units give us a clue about the
#distances prior to the mapping.
AccountedD <- meanunitD - meanD #meanD is the distance that is "left over" when we use the map.
#The difference between the distance going in and what is left when we are done is the distance we have accounted for.
#If meanD were 0 (it will never happen), we would have a perfect match between units and codes.
EF <- AccountedD/meanunitD #divide the distance accounted for by the mean distance prior to mapping.
#The ratio should say something about the efficiency.
meanD
## [1] 0.04065537
meanunitD
## [1] 3.58932
AccountedD
## [1] 3.548664
EF #close to 1 = good.
## [1] 0.9886732
#Suspiciously good? Am I missing something, or is this an OK metric?
#It is certainly blind to overfitting (i.e. it will always improve as we increase the grid size).
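For a complementary check that does not automatically improve with grid size, one option is the topographic error: the fraction of observations whose two best-matching units are not grid neighbours. A sketch of my own (not a kohonen built-in), assuming Euclidean distances as used in training:
codes <- getCodes(iris.som)
#distances from every observation to every codebook vector (100 x 149):
d2codes <- apply(iris.sc, 1, function(x) sqrt(colSums((t(codes) - x)^2)))
bmu12 <- apply(d2codes, 2, function(x) order(x)[1:2]) #two best-matching units per observation
gridD <- as.matrix(unit.distances(iris.grid)) #unit-to-unit distances on the toroidal grid
TE <- mean(gridD[cbind(bmu12[1, ], bmu12[2, ])] > 1.05) #hex neighbours sit at distance 1
TE #close to 0 = good; rises when the map is "folded"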
par(mfrow=c(1,1)) #no of plots to combine
par(mar=c(5.1,4.1,4.1,2.1)) #par sets or adjusts plotting parameters. Not always necessary. "mar" = margin.
plot(iris.som, type="changes") # This one will indicate if you need to manipulate rlen.
#Shows the mean distance to the closest codebook vector during training. I see 3 plateaus:
#250-370, 430-550 and 620-700.
plot(iris.som, type="count") #Which nodes have many observations?
plot(iris.som, type="dist.neighbours", palette.name=grey.colors, shape = "straight") # #shows the sum of the distances to all immediate neighbours.
#This kind of visualisation is also known as a U-matrix plot. Should give you an idea about how to cluster,
#since Units near a class boundary can be expected to have higher average distances to their neighbours.
#I see 2-3 clusters/districts (keep in mind the grid is toroidal).
plot(iris.som, type="quality", shape = "straight") #close to 0 = good quality.
#shows the mean distance of objects mapped to a unit to the codebook vector of that unit.
#aka quantization error (qe).
#The smaller the distances, the better the objects are represented by the codebook vectors.
plot(iris.som, type="codes", codeRendering = "stars") #shows the codebook vectors.
plot(iris.som, type="codes", codeRendering = "segments") #default
plot(iris.som, type="codes", codeRendering = "lines")
plot(iris.som, type="mapping", border = "grey")#The lower number of empty cells, the better SOM.
#pretty gradient colors
colour1 <- tricolor(iris.som$grid)
plot(iris.som, "mapping", bg = rgb(colour1))
colour2 <- tricolor(iris.som$grid, phi = c(pi/6, 0, -pi/6))
plot(iris.som, "mapping", bg = rgb(colour2), shape = "straight")
colour3 <- tricolor(iris.som$grid, phi = c(pi/6, 0, -pi/6), offset = .5)
plot(iris.som, "mapping", bg = rgb(colour3), shape = "straight")
colour4 <- tricolor(iris.som$grid, phi = c(pi/8, 6, -pi/6), offset = 0.1)
plot(iris.som, "codes", bg = rgb(colour4), shape = "straight")
#We can go color-crazy here..
plot(iris.som, "mapping", bg = rgb(colour3), shape = "straight",col=brewer.pal(3,"Dark2")) #unclear what the brewer colors represent. Omit.
var <- 1 #define the variable to plot
plot(iris.som, type = "property", property = getCodes(iris.som)[,var], main=colnames(getCodes(iris.som))[var], palette.name=terrain.colors)
#...and so forth, for var 2-4.
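To save some copy-pasting, the same plot for all four variables in one go (a small loop of my own, using the reversed terrain palette defined earlier):
par(mfrow = c(2, 2))
for (v in 1:4) {
  plot(iris.som, type = "property", property = getCodes(iris.som)[, v],
       main = colnames(getCodes(iris.som))[v], palette.name = terraincolors)
}
par(mfrow = c(1, 1))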
#But more informative:
#Unscaled plots (i.e., probably easier to relate to)
var <- 1 #define the variable to plot
var_unscaled <- aggregate(as.numeric(iris_unique[,var]), by=list(iris.som$unit.classif), FUN=mean, simplify=TRUE)[,2]
plot(iris.som, type = "property", property=var_unscaled, main=names(iris_unique)[var], shape = "straight", palette.name = terrain.colors)
Now for the clustering. We cluster on the codes, not the units. The idea (as I understand it) is that the SOM reduces a highly complex material to something simple enough for more straightforward approaches such as hierarchical or k-means clustering. An option is to rely more heavily on the SOM output itself, guided initially by the "ridges" in the U-matrix (neighbour distances), but subsequently also by the property plots.
par(mfrow=c(1,1)) # back to one plot
clusterdata <- getCodes(iris.som)
wss <- (nrow(clusterdata)-1)*sum(apply(clusterdata, 2, var)) #wss for a single cluster
for (i in 1:15) wss[i] <- sum(kmeans(clusterdata, centers=i)$withinss) #i can range from 1 up to the no of nodes - 1...
par(mar=c(5.1,4.1,4.1,2.1)) #par sets or adjusts plotting parameters. Not always necessary. "mar" = margin.
plot(1:15, wss, type="b", xlab="Number of Clusters", #match i
ylab="Within groups sum of squares", main="Within cluster sum of squares (WCSS)")
set.seed(33) #for reproducibility
fit_kmeans <- kmeans(clusterdata[,1:4], 3) #3 clusters are used, as indicated by the wss development.
cl_assignmentk <- fit_kmeans$cluster[iris.som$unit.classif] #we have 100 (since grid=10*10) codes with assigned clusters,
#but 149 observations. The above assigns each observation to a cluster via its code-id (unit) in the SOM model.
iris_unique$clustersKm <- cl_assignmentk #back to original data.
#Create the distance matrix.
DistanceM <- dist(clusterdata[,1:4], method = "euclidean")
fit_hac <- hclust(DistanceM, method="ward.D2") #fit the distances. "D2", since "DistanceM" is not squared.
plot(fit_hac) #display dendrogram. To me, it suggests anywhere from 3 to 6 clusters.
HACgroups <- cutree(fit_hac, k=3) #cut tree into k clusters; we go with 3, in line with the k-means solution above.
#draw the dendrogram with grey borders around the 3 clusters:
rect.hclust(fit_hac, k=3, border="grey")
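Since the dendrogram is somewhat ambiguous, a quick look (my own addition) at how the group sizes change across the plausible cuts:
for (k in 3:6) print(table(cutree(fit_hac, k = k)))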
cl_assignmenth <- HACgroups[iris.som$unit.classif] #again, 100 codes with assigned clusters,
#but 149 observations. The above assigns each observation to a cluster via its code-id (unit) in the SOM model.
iris_unique$clustersHac <- cl_assignmenth #back to original data.
Compare the cluster solutions to true species.
iris_unique$trueN <- ifelse(iris_unique$Species == "setosa", 1, ifelse(iris_unique$Species == "versicolor", 2, ifelse(iris_unique$Species == "virginica", 3, NA)))
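Since Species is a factor with its levels in exactly this order, the nested ifelse can be replaced by a one-liner (equivalent here, included for reference):
iris_unique$trueN <- as.integer(iris_unique$Species) #setosa=1, versicolor=2, virginica=3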
kmeancolor <- iris_unique[,6] #set color indicator
haccolor <- iris_unique[,7] #set color indicator
trueccolor <- iris_unique[,8] #set color indicator
par(mfrow=c(1,1))
plot(iris.som, type="mapping", bg = rgb(colour4), shape = "straight", border = "grey", labels = iris_unique[,5],col=contrast[trueccolor])
add.cluster.boundaries(iris.som, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4]) #name your palette and choose a color...
par(mfrow=c(1,2))
plot(iris.som, type="mapping", bg = rgb(colour4), shape = "straight", border = "grey", labels = iris_unique[,8],col=contrast[kmeancolor], main = "Kmean, mapping, col=cluster, no=trueSp")
add.cluster.boundaries(iris.som, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4]) #name your palette and choose a color...
plot(iris.som, type="mapping", bg = rgb(colour4), shape = "straight", border = "grey", labels = iris_unique[,6],col=contrast[trueccolor], main = "Kmean, mapping, col=true, no=cluster")
add.cluster.boundaries(iris.som, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4]) #name your palette and choose a color...
#the cluster numbering is off, but that does not mean the clustering is. Inspect the dataset with cluster assignments.
summary(iris_unique$Species) # and with guidance from the above plots:
## setosa versicolor virginica
## 50 50 49
length(which(iris_unique$clustersKm[1:50] == 2)) #Ok. Zero misses.
## [1] 50
length(which(iris_unique$clustersKm[51:100] == 3)) #ok. 14 misses.
## [1] 36
length(which(iris_unique$clustersKm[101:149] == 1)) #Ok. 8 misses. In total 22.
## [1] 41
#Then:
iris_unique$testkm <- 1 #make everything "wrong" (i.e. 1).
#Cluster assignments can only be right in one way, but wrong in k-1 (no of clusters - 1) ways.
#Better to look for the "rights", rather than the "wrongs".
iris_unique$testkm[iris_unique$clustersKm == 2 & iris_unique$trueN == 1] <- 0 #make 2 right, i.e. 0.
iris_unique$testkm[iris_unique$clustersKm == 3 & iris_unique$trueN == 2] <- 0 #3 was right for true 2.
iris_unique$testkm[iris_unique$clustersKm == 1 & iris_unique$trueN == 3] <- 0 #1 was right for true 3.
testKM <- sum(iris_unique$testkm)
testKM #we already saw this in the previous section.
## [1] 22
Accuracykm <- 1-(testKM/149)
Accuracykm
## [1] 0.852349
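The cluster-to-species matching above was read off the plots by hand. A more mechanical alternative (my own sketch): try all 3! relabelings of the clusters and keep the best accuracy; it should reproduce Accuracykm.
perms <- list(c(1,2,3), c(1,3,2), c(2,1,3), c(2,3,1), c(3,1,2), c(3,2,1))
accs <- sapply(perms, function(p) mean(p[iris_unique$clustersKm] == iris_unique$trueN))
max(accs) #best achievable relabeling; matches Accuracykm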
par(mfrow=c(1,1))
plot(iris.som, type="mapping", bg = rgb(colour4), shape = "straight", border = "grey", labels = iris_unique[,5],col=contrast[haccolor])
add.cluster.boundaries(iris.som, HACgroups, lwd = 3, lty = 2, col=contrast[4]) #name your palette and choose a color...
par(mfrow=c(1,2))
plot(iris.som, type="mapping", bg = rgb(colour4), shape = "straight", border = "grey", labels = iris_unique[,8],col=contrast[haccolor], main = "HAC, mapping, col=cluster, no=trueSp")
add.cluster.boundaries(iris.som, HACgroups, lwd = 3, lty = 2, col=contrast[4]) #name your palette and choose a color...
plot(iris.som, type="mapping", bg = rgb(colour4), shape = "straight", border = "grey", labels = iris_unique[,7],col=contrast[trueccolor], main = "HAC, mapping, col=true, no=cluster")
add.cluster.boundaries(iris.som, HACgroups, lwd = 3, lty = 2, col=contrast[4]) #name your palette and choose a color...
#Inspect the dataset with cluster assignments.
length(which(iris_unique$clustersHac[1:50] == 1)) #Zero misses.
## [1] 50
length(which(iris_unique$clustersHac[51:100] == 3)) #15 misses.
## [1] 35
length(which(iris_unique$clustersHac[101:149] == 2)) #5 misses. 20 in total.
## [1] 44
#Then:
iris_unique$testhc <- 1 #make everything "wrong" (i.e. 1).
#Cluster assignments can only be right in one way, but wrong in k-1 (no of clusters - 1) ways. Therefore:
#Better to look for the "rights", rather than the "wrongs".
iris_unique$testhc[iris_unique$clustersHac == 1 & iris_unique$trueN == 1] <- 0 #1 was right all along, make them 0.
iris_unique$testhc[iris_unique$clustersHac == 3 & iris_unique$trueN == 2] <- 0 #3 was right for true 2.
iris_unique$testhc[iris_unique$clustersHac == 2 & iris_unique$trueN == 3] <- 0 #2 was right for true 3.
testHC <- sum(iris_unique$testhc)
testHC #we already saw this in the previous section.
## [1] 20
Accuracyhc <- 1-(testHC/nrow(iris.sc))
Accuracyhc
## [1] 0.8657718
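As a final sanity check on the hand-counted misses, cross-tabulations of true species against both cluster solutions (my own addition) show the same numbers at a glance:
table(true = iris_unique$Species, km = iris_unique$clustersKm)
table(true = iris_unique$Species, hac = iris_unique$clustersHac)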