1. Loading raw data and getting it ready

library(readr)
larva2 <- read_csv("~/GENER/SOMs/Ricardo/1. Raw Data/PSOMML_2Larvas.csv", 
    col_names = FALSE)
colnames(larva2) <- c("FILA", "COLUMNA", "RED", "GREEN", "BLUE")
larva2 <- larva2[,-1:-2]

The dataframe obtained is made of integer values:

str(larva2)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   262144 obs. of  3 variables:
 $ RED  : int  206 206 205 204 204 204 204 204 204 204 ...
 $ GREEN: int  207 207 208 208 208 208 208 208 208 208 ...
 $ BLUE : int  209 209 209 209 209 209 209 209 209 209 ...

This will be a problem down the line when feeding this data to the algorithm, values have to be changed to numeric:

larva2[,'RED'] <- as.numeric(as.factor(larva2$RED))
larva2[,'GREEN'] <- as.numeric(as.factor(larva2$GREEN))
larva2[,'BLUE'] <- as.numeric(as.factor(larva2$BLUE))

Whit this taken care of we have a database ready to be fed to the algorithmL

str(larva2)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   262144 obs. of  3 variables:
 $ RED  : num  166 166 165 164 164 164 164 164 164 164 ...
 $ GREEN: num  166 166 167 167 167 167 167 167 167 167 ...
 $ BLUE : num  170 170 170 170 170 170 170 170 170 170 ...

2 Initial exploration of data

A brief summary of the values of the variable can help with the initial understanding of things:

summary(larva2)
      RED            GREEN            BLUE      
 Min.   :  1.0   Min.   :  1.0   Min.   :  1.0  
 1st Qu.:164.0   1st Qu.:166.0   1st Qu.:169.0  
 Median :166.0   Median :167.0   Median :170.0  
 Mean   :163.3   Mean   :164.6   Mean   :165.5  
 3rd Qu.:167.0   3rd Qu.:168.0   3rd Qu.:171.0  
 Max.   :171.0   Max.   :172.0   Max.   :175.0  

3 Self Organizing Map

Loading required package

require(kohonen)

Changing dataframe to a matrix

#I removed the function scale() because it does not need to be scaled.
larva2.matrix <- as.matrix(larva2)
#TEST
larva2.matrixTest <- larva2.matrix[1:20000,]

Creating the SOM grid

som_grid <- somgrid(xdim = 30, ydim = 30, topo = "rectangular")

Training the SOM

Training progress evaluation

plot(som_model, type = "changes")

Node counts

source('coolBlueHotRed.R')
plot(som_model, type = "counts", palette.name=coolBlueHotRed)

SOM by RGB value

par(mfrow = c(2,2))
models <- as.data.frame(som_model$codes)
plot(som_model,
     type = "property",
     property = models[,1],
     main=names(models)[1],
     palette.name=coolBlueHotRed)
plot(som_model,
     type = "property",
     property = models[,2],
     main=names(models)[2],
     palette.name=coolBlueHotRed)
plot(som_model,
     type = "property",
     property = models[,3],
     main=names(models)[3],
     palette.name=coolBlueHotRed)

Neighbor Distance

plot(som_model, type = "dist.neighbours", palette.name = grey.colors)

Code Spread

plot(som_model, type = "codes", palette.name = coolBlueHotRed)

Clustering

mydata <- som_model$codes 
wss <- (nrow(mydata)-1)*sum(apply(data.frame(mydata),2,var)) 
for (i in 2:20) {
  wss[i] <- sum(kmeans(data.frame(mydata), centers=i)$withinss)
}
did not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterations
plot(1:20, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares", main="Within cluster sum of squares (WCSS)")

som_cluster <- cutree(hclust(dist(data.frame(som_model$codes))), 3)

In this graphic we can see that the ideal amount of clusters is not more than 5. Starting with 5 and going down we will choose the smallest number that gives us a smooth distribution in quantity of it’s members:

# 7 Clusters -- the first cluster has considerably less members than the other four ones.
table(cutree(hclust(dist(data.frame(som_model$codes))), 3))

  1   2   3 
717 117  66 
# 5 Clusters -- the first cluster has considerably less members than the other four ones.
table(cutree(hclust(dist(data.frame(som_model$codes))), 5))

  1   2   3   4   5 
717  63  54  50  16 
#4 clusters -- we have founde the good one.
table(cutree(hclust(dist(data.frame(som_model$codes))), 4))

  1   2   3   4 
717 117  50  16 
pretty_palette <- c("#1f77b4", '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2')
plot(som_model, type="mapping", bgcol = pretty_palette[som_cluster], main = "Clusters", keepMargins = TRUE) 
add.cluster.boundaries(som_model, som_cluster, lwd = 7)

LS0tDQp0aXRsZTogIlNPTSAyIExBUlZBUyINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMxLiBMb2FkaW5nIHJhdyBkYXRhIGFuZCBnZXR0aW5nIGl0IHJlYWR5DQoNCmBgYHtyLCBlY2hvPVRSVUUsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHJlYWRyKQ0KbGFydmEyIDwtIHJlYWRfY3N2KCJ+L0dFTkVSL1NPTXMvUmljYXJkby8xLiBSYXcgRGF0YS9QU09NTUxfMkxhcnZhcy5jc3YiLCANCiAgICBjb2xfbmFtZXMgPSBGQUxTRSkNCmNvbG5hbWVzKGxhcnZhMikgPC0gYygiRklMQSIsICJDT0xVTU5BIiwgIlJFRCIsICJHUkVFTiIsICJCTFVFIikNCg0KbGFydmEyIDwtIGxhcnZhMlssLTE6LTJdDQoNCg0KYGBgDQoNClRoZSBkYXRhZnJhbWUgb2J0YWluZWQgaXMgbWFkZSBvZiBgaW50ZWdlcmAgdmFsdWVzOg0KDQpgYGB7cn0NCnN0cihsYXJ2YTIpDQpgYGANCg0KDQpUaGlzIHdpbGwgYmUgYSBwcm9ibGVtIGRvd24gdGhlIGxpbmUgd2hlbiBmZWVkaW5nIHRoaXMgZGF0YSB0byB0aGUgYWxnb3JpdGhtLCB2YWx1ZXMgaGF2ZSB0byBiZSBjaGFuZ2VkIHRvICBgbnVtZXJpY2A6DQoNCmBgYHtyfQ0KbGFydmEyWywnUkVEJ10gPC0gYXMubnVtZXJpYyhhcy5mYWN0b3IobGFydmEyJFJFRCkpDQpsYXJ2YTJbLCdHUkVFTiddIDwtIGFzLm51bWVyaWMoYXMuZmFjdG9yKGxhcnZhMiRHUkVFTikpDQpsYXJ2YTJbLCdCTFVFJ10gPC0gYXMubnVtZXJpYyhhcy5mYWN0b3IobGFydmEyJEJMVUUpKQ0KYGBgDQoNCldoaXQgdGhpcyB0YWtlbiBjYXJlIG9mIHdlIGhhdmUgYSBkYXRhYmFzZSByZWFkeSB0byBiZSBmZWQgdG8gdGhlIGFsZ29yaXRobUwNCmBgYHtyfQ0Kc3RyKGxhcnZhMikNCmBgYA0KIzIgSW5pdGlhbCBleHBsb3JhdGlvbiBvZiBkYXRhDQoNCkEgYnJpZWYgc3VtbWFyeSBvZiB0aGUgdmFsdWVzIG9mIHRoZSB2YXJpYWJsZSBjYW4gaGVscCB3aXRoIHRoZSBpbml0aWFsIHVuZGVyc3RhbmRpbmcgb2YgdGhpbmdzOg0KYGBge3J9DQpzdW1tYXJ5KGxhcnZhMikNCmBgYA0KDQoNCiMzIFNlbGYgT3JnYW5pemluZyBNYXANCg0KIyNMb2FkaW5nIHJlcXVpcmVkIHBhY2thZ2UNCg0KYGBge3IsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpyZXF1aXJlKGtvaG9uZW4pDQpgYGANCg0KIyNDaGFuZ2luZyBkYXRhZnJhbWUgdG8gYSBgbWF0cml4YA0KYGBge3J9DQojSSByZW1vdmVkIHRoZSBmdW5jdGlvbiBzY2FsZSgpIGJlY2F1c2UgaXQgZG9lcyBub3QgbmVlZCB0byBiZSBzY2FsZWQuDQpsYXJ2YTIubWF0cml4IDwtIGFzLm1hdHJpeChsYXJ2YTIpDQoNCiNURVNUDQpsYXJ2YTIubWF0cml4VGVzdCA8LSBsYXJ2YTIubWF0cml4WzE6MjAwMDAsXQ0KYGBgDQoNCiMjQ3JlYXRpbmcgdGhlIFNPTSBncmlkDQpgYGB7cn0NCnNvbV9ncmlkIDwtIHNvbWdyaWQoeGRpbSA9IDMwLCB5ZGltID0gMzAsIHRvcG8gPSAicmVjdGFuZ3VsYXIiKQ0KYGBgDQoNCg0KIyNUcmFpbmluZyB0aGUgU09NDQpgYGB7ciwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRSwgaW5jbHVkZT1GQUxTRX0NCnNvbV9tb2RlbCA8LSBzb20obGFydmEyLm1hdHJpeCwNCiAgICAgICAgICAgICAgICAgZ3JpZCA9IHNvbV9ncmlkLA0KICAgICAgICAgICAgICAgICBybGVuID0gMTUwMCwNCiAgICAgICAgICAgICAgICAgYWxwaGEgPSBjKDAuMDUsMC4wMSksDQogICAgICAgICAgICAgICAgIGtlZXAuZGF0YSA9IFRSVUUpDQpgYGANCg0KIyNUcmFpbmluZyBwcm9ncmVzcyBldmFsdWF0aW9uDQpgYGB7ciwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0NCnBsb3Qoc29tX21vZGVsLCB0eXBlID0gImNoYW5nZXMiKQ0KYGBgDQoNCiMjTm9kZSBjb3VudHMNCmBgYHtyLCBmaWcuaGVpZ2h0PSA4fQ0Kc291cmNlKCdjb29sQmx1ZUhvdFJlZC5SJykNCnBsb3Qoc29tX21vZGVsLCB0eXBlID0gImNvdW50cyIsIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCmBgYA0KDQojI1NPTSBieSBSR0IgdmFsdWUNCg0KYGBge3IsIGZpZy5oZWlnaHQ9MTB9DQpwYXIobWZyb3cgPSBjKDIsMikpDQptb2RlbHMgPC0gYXMuZGF0YS5mcmFtZShzb21fbW9kZWwkY29kZXMpDQpwbG90KHNvbV9tb2RlbCwNCiAgICAgdHlwZSA9ICJwcm9wZXJ0eSIsDQogICAgIHByb3BlcnR5ID0gbW9kZWxzWywxXSwNCiAgICAgbWFpbj1uYW1lcyhtb2RlbHMpWzFdLA0KICAgICBwYWxldHRlLm5hbWU9Y29vbEJsdWVIb3RSZWQpDQpwbG90KHNvbV9tb2RlbCwNCiAgICAgdHlwZSA9ICJwcm9wZXJ0eSIsDQogICAgIHByb3BlcnR5ID0gbW9kZWxzWywyXSwNCiAgICAgbWFpbj1uYW1lcyhtb2RlbHMpWzJdLA0KICAgICBwYWxldHRlLm5hbWU9Y29vbEJsdWVIb3RSZWQpDQpwbG90KHNvbV9tb2RlbCwNCiAgICAgdHlwZSA9ICJwcm9wZXJ0eSIsDQogICAgIHByb3BlcnR5ID0gbW9kZWxzWywzXSwNCiAgICAgbWFpbj1uYW1lcyhtb2RlbHMpWzNdLA0KICAgICBwYWxldHRlLm5hbWU9Y29vbEJsdWVIb3RSZWQpDQoNCmBgYA0KDQoNCiMjTmVpZ2hib3IgRGlzdGFuY2UNCmBgYHtyLCBmaWcuYWxpZ249J2NlbnRlcid9DQpwbG90KHNvbV9tb2RlbCwgdHlwZSA9ICJkaXN0Lm5laWdoYm91cnMiLCBwYWxldHRlLm5hbWUgPSBncmV5LmNvbG9ycykNCmBgYA0KDQojI0NvZGUgU3ByZWFkDQpgYGB7ciwgZmlnLmhlaWdodD0gMTB9DQpwbG90KHNvbV9tb2RlbCwgdHlwZSA9ICJjb2RlcyIsIHBhbGV0dGUubmFtZSA9IGNvb2xCbHVlSG90UmVkKQ0KYGBgDQoNCiMjQ2x1c3RlcmluZw0KYGBge3J9DQpteWRhdGEgPC0gc29tX21vZGVsJGNvZGVzIA0Kd3NzIDwtIChucm93KG15ZGF0YSktMSkqc3VtKGFwcGx5KGRhdGEuZnJhbWUobXlkYXRhKSwyLHZhcikpIA0KZm9yIChpIGluIDI6MjApIHsNCiAgd3NzW2ldIDwtIHN1bShrbWVhbnMoZGF0YS5mcmFtZShteWRhdGEpLCBjZW50ZXJzPWkpJHdpdGhpbnNzKQ0KfQ0KcGxvdCgxOjIwLCB3c3MsIHR5cGU9ImIiLCB4bGFiPSJOdW1iZXIgb2YgQ2x1c3RlcnMiLA0KICAgICB5bGFiPSJXaXRoaW4gZ3JvdXBzIHN1bSBvZiBzcXVhcmVzIiwgbWFpbj0iV2l0aGluIGNsdXN0ZXIgc3VtIG9mIHNxdWFyZXMgKFdDU1MpIikNCg0Kc29tX2NsdXN0ZXIgPC0gY3V0cmVlKGhjbHVzdChkaXN0KGRhdGEuZnJhbWUoc29tX21vZGVsJGNvZGVzKSkpLCAzKQ0KDQoNCmBgYA0KDQpJbiB0aGlzIGdyYXBoaWMgd2UgY2FuIHNlZSB0aGF0IHRoZSBpZGVhbCBhbW91bnQgb2YgY2x1c3RlcnMgaXMgbm90IG1vcmUgdGhhbiA1LiBTdGFydGluZyB3aXRoIDUgYW5kIGdvaW5nIGRvd24gd2Ugd2lsbCBjaG9vc2UgdGhlIHNtYWxsZXN0IG51bWJlciB0aGF0IGdpdmVzIHVzIGEgc21vb3RoIGRpc3RyaWJ1dGlvbiBpbiBxdWFudGl0eSBvZiBpdCdzIG1lbWJlcnM6DQoNCmBgYHtyfQ0KIyA3IENsdXN0ZXJzIC0tIHRoZSBmaXJzdCBjbHVzdGVyIGhhcyBjb25zaWRlcmFibHkgbGVzcyBtZW1iZXJzIHRoYW4gdGhlIG90aGVyIGZvdXIgb25lcy4NCnRhYmxlKGN1dHJlZShoY2x1c3QoZGlzdChkYXRhLmZyYW1lKHNvbV9tb2RlbCRjb2RlcykpKSwgMykpDQoNCg0KIyA1IENsdXN0ZXJzIC0tIHRoZSBmaXJzdCBjbHVzdGVyIGhhcyBjb25zaWRlcmFibHkgbGVzcyBtZW1iZXJzIHRoYW4gdGhlIG90aGVyIGZvdXIgb25lcy4NCnRhYmxlKGN1dHJlZShoY2x1c3QoZGlzdChkYXRhLmZyYW1lKHNvbV9tb2RlbCRjb2RlcykpKSwgNSkpDQoNCiM0IGNsdXN0ZXJzIC0tIHdlIGhhdmUgZm91bmRlIHRoZSBnb29kIG9uZS4NCnRhYmxlKGN1dHJlZShoY2x1c3QoZGlzdChkYXRhLmZyYW1lKHNvbV9tb2RlbCRjb2RlcykpKSwgNCkpDQpgYGANCg0KDQpgYGB7ciwgZmlnLmhlaWdodD0gMTB9DQpwcmV0dHlfcGFsZXR0ZSA8LSBjKCIjMWY3N2I0IiwgJyNmZjdmMGUnLCAnIzJjYTAyYycsICcjZDYyNzI4JywgJyM5NDY3YmQnLCAnIzhjNTY0YicsICcjZTM3N2MyJykNCg0KcGxvdChzb21fbW9kZWwsIHR5cGU9Im1hcHBpbmciLCBiZ2NvbCA9IHByZXR0eV9wYWxldHRlW3NvbV9jbHVzdGVyXSwgbWFpbiA9ICJDbHVzdGVycyIsIGtlZXBNYXJnaW5zID0gVFJVRSkgDQphZGQuY2x1c3Rlci5ib3VuZGFyaWVzKHNvbV9tb2RlbCwgc29tX2NsdXN0ZXIsIGx3ZCA9IDcpDQpgYGANCg0K