1. Loading raw data and getting it ready

library(readr)
larva <- read_csv("~/GENER/SOMs/Ricardo/1. Raw Data/PSOMML_1Larva.csv")
larva <- larva[,-1:-2]

The dataframe obtained is made of integer values:

str(larva)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   262144 obs. of  3 variables:
 $ RED  : int  205 205 206 206 206 206 206 205 205 205 ...
 $ GREEN: int  209 209 210 210 210 210 210 209 209 209 ...
 $ BLUE : int  208 208 209 209 209 209 209 209 208 209 ...

This will be a problem down the line when feeding this data to the algorithm, values have to be changed to numeric:

#larva[,'FILA'] <- as.numeric(as.factor(larva$FILA))
#larva[,'COLUMNA'] <- as.numeric(as.factor(larva$COLUMNA))
larva[,'RED'] <- as.numeric(as.factor(larva$RED))
larva[,'GREEN'] <- as.numeric(as.factor(larva$GREEN))
larva[,'BLUE'] <- as.numeric(as.factor(larva$BLUE))

Whit this taken care of we have a database ready to be fed to the algorithmL

str(larva)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   262144 obs. of  3 variables:
 $ RED  : num  162 162 163 163 163 163 163 162 162 162 ...
 $ GREEN: num  169 169 170 170 170 170 170 169 169 169 ...
 $ BLUE : num  171 171 172 172 172 172 172 172 171 172 ...

2 Initial exploration of data

A brief summary of the values of the variable can help with the initial understanding of things:

summary(larva)
      RED            GREEN            BLUE      
 Min.   :  1.0   Min.   :  1.0   Min.   :  1.0  
 1st Qu.:158.0   1st Qu.:163.0   1st Qu.:167.0  
 Median :159.0   Median :165.0   Median :168.0  
 Mean   :157.8   Mean   :162.9   Mean   :165.3  
 3rd Qu.:161.0   3rd Qu.:166.0   3rd Qu.:170.0  
 Max.   :165.0   Max.   :171.0   Max.   :175.0  

3 Self Organizing Map

Loading required package

require(kohonen)

Changing dataframe to a matrix

#I removed the function scale() because it does not need to be scaled.
larva.matrix <- as.matrix(larva)
#TEST
larva.matrixTest <- larva.matrix[1:20000,]

Creating the SOM grid

som_grid60x60 <- somgrid(xdim = 60, ydim = 60, topo = "rectangular")

Training the SOM

Training progress evaluation

plot(som_model60, type = "changes")

Node counts

source('coolBlueHotRed.R')
plot(som_model60, type = "counts", palette.name=coolBlueHotRed)

par(mfrow = c(2,2))
models <- as.data.frame(som_model60$codes)
plot(som_model60,
     type = "property",
     property = models[,1],
     main=names(models)[1],
     palette.name=coolBlueHotRed)
plot(som_model60,
     type = "property",
     property = models[,2],
     main=names(models)[2],
     palette.name=coolBlueHotRed)
plot(som_model60,
     type = "property",
     property = models[,3],
     main=names(models)[3],
     palette.name=coolBlueHotRed)

#source('plotHeatMap.R')
#plotHeatMap(som_model60, data = models, variable=0)

Neighbor Distance

plot(som_model60, type = "dist.neighbours", palette.name = grey.colors)

Code Spread

plot(som_model60, type = "codes", palette.name = coolBlueHotRed)

Clustering

mydata <- som_model60$codes 
wss <- (nrow(mydata)-1)*sum(apply(data.frame(mydata),2,var)) 
for (i in 2:20) {
  wss[i] <- sum(kmeans(data.frame(mydata), centers=i)$withinss)
}
did not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterations
plot(1:20, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares", main="Within cluster sum of squares (WCSS)")

som_cluster <- cutree(hclust(dist(data.frame(som_model60$codes))), 3)

In this graphic we can see that the ideal amount of clusters is not more than 5. Starting with 5 and going down we will choose the smallest number that gives us a smooth distribution in quantity of it’s members:

# 7 Clusters -- the first cluster has considerably less members than the other four ones.
table(cutree(hclust(dist(data.frame(som_model60$codes))), 7))

   1    2    3    4    5    6    7 
  19   41  101   55  701 2556  127 
# 5 Clusters -- the first cluster has considerably less members than the other four ones.
table(cutree(hclust(dist(data.frame(som_model60$codes))), 5))

   1    2    3    4    5 
  60  101  182  701 2556 
#4 clusters -- we have founde the good one.
table(cutree(hclust(dist(data.frame(som_model60$codes))), 4))

   1    2    3    4 
  60  283  701 2556 
pretty_palette <- c("#1f77b4", '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2')
plot(som_model60, type="mapping", bgcol = pretty_palette[som_cluster], main = "Clusters", keepMargins = TRUE) 
add.cluster.boundaries(som_model60, som_cluster, lwd = 7)

LS0tDQp0aXRsZTogIlNPTSBMQVJWQSBlbiB1bmEgbWF0cml6IGRlIDYwIHggNjAgTmV1cm9uYXMiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQojMS4gTG9hZGluZyByYXcgZGF0YSBhbmQgZ2V0dGluZyBpdCByZWFkeQ0KDQpgYGB7ciwgZWNobz1UUlVFLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShyZWFkcikNCmxhcnZhIDwtIHJlYWRfY3N2KCJ+L0dFTkVSL1NPTXMvUmljYXJkby8xLiBSYXcgRGF0YS9QU09NTUxfMUxhcnZhLmNzdiIpDQoNCmxhcnZhIDwtIGxhcnZhWywtMTotMl0NCmBgYA0KDQpUaGUgZGF0YWZyYW1lIG9idGFpbmVkIGlzIG1hZGUgb2YgYGludGVnZXJgIHZhbHVlczoNCg0KYGBge3J9DQpzdHIobGFydmEpDQpgYGANCg0KDQpUaGlzIHdpbGwgYmUgYSBwcm9ibGVtIGRvd24gdGhlIGxpbmUgd2hlbiBmZWVkaW5nIHRoaXMgZGF0YSB0byB0aGUgYWxnb3JpdGhtLCB2YWx1ZXMgaGF2ZSB0byBiZSBjaGFuZ2VkIHRvICBgbnVtZXJpY2A6DQoNCmBgYHtyfQ0KI2xhcnZhWywnRklMQSddIDwtIGFzLm51bWVyaWMoYXMuZmFjdG9yKGxhcnZhJEZJTEEpKQ0KI2xhcnZhWywnQ09MVU1OQSddIDwtIGFzLm51bWVyaWMoYXMuZmFjdG9yKGxhcnZhJENPTFVNTkEpKQ0KbGFydmFbLCdSRUQnXSA8LSBhcy5udW1lcmljKGFzLmZhY3RvcihsYXJ2YSRSRUQpKQ0KbGFydmFbLCdHUkVFTiddIDwtIGFzLm51bWVyaWMoYXMuZmFjdG9yKGxhcnZhJEdSRUVOKSkNCmxhcnZhWywnQkxVRSddIDwtIGFzLm51bWVyaWMoYXMuZmFjdG9yKGxhcnZhJEJMVUUpKQ0KYGBgDQoNCldoaXQgdGhpcyB0YWtlbiBjYXJlIG9mIHdlIGhhdmUgYSBkYXRhYmFzZSByZWFkeSB0byBiZSBmZWQgdG8gdGhlIGFsZ29yaXRobUwNCmBgYHtyfQ0Kc3RyKGxhcnZhKQ0KYGBgDQojMiBJbml0aWFsIGV4cGxvcmF0aW9uIG9mIGRhdGENCg0KQSBicmllZiBzdW1tYXJ5IG9mIHRoZSB2YWx1ZXMgb2YgdGhlIHZhcmlhYmxlIGNhbiBoZWxwIHdpdGggdGhlIGluaXRpYWwgdW5kZXJzdGFuZGluZyBvZiB0aGluZ3M6DQpgYGB7cn0NCnN1bW1hcnkobGFydmEpDQpgYGANCg0KDQojMyBTZWxmIE9yZ2FuaXppbmcgTWFwDQoNCiMjTG9hZGluZyByZXF1aXJlZCBwYWNrYWdlDQoNCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KcmVxdWlyZShrb2hvbmVuKQ0KYGBgDQoNCiMjQ2hhbmdpbmcgZGF0YWZyYW1lIHRvIGEgYG1hdHJpeGANCmBgYHtyfQ0KI0kgcmVtb3ZlZCB0aGUgZnVuY3Rpb24gc2NhbGUoKSBiZWNhdXNlIGl0IGRvZXMgbm90IG5lZWQgdG8gYmUgc2NhbGVkLg0KbGFydmEubWF0cml4IDwtIGFzLm1hdHJpeChsYXJ2YSkNCg0KI1RFU1QNCmxhcnZhLm1hdHJpeFRlc3QgPC0gbGFydmEubWF0cml4WzE6MjAwMDAsXQ0KYGBgDQoNCiMjQ3JlYXRpbmcgdGhlIFNPTSBncmlkDQpgYGB7cn0NCnNvbV9ncmlkNjB4NjAgPC0gc29tZ3JpZCh4ZGltID0gNjAsIHlkaW0gPSA2MCwgdG9wbyA9ICJyZWN0YW5ndWxhciIpDQpgYGANCg0KDQojI1RyYWluaW5nIHRoZSBTT00NCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFLCBpbmNsdWRlPUZBTFNFfQ0Kc29tX21vZGVsNjAgPC0gc29tKGxhcnZhLm1hdHJpeCwNCiAgICAgICAgICAgICAgICAgZ3JpZCA9IHNvbV9ncmlkNjB4NjAsDQogICAgICAgICAgICAgICAgIHJsZW4gPSAxNTAwLA0KICAgICAgICAgICAgICAgICBhbHBoYSA9IGMoMC4wNSwwLjAxKSwNCiAgICAgICAgICAgICAgICAga2VlcC5kYXRhID0gVFJVRSkNCmBgYA0KDQojI1RyYWluaW5nIHByb2dyZXNzIGV2YWx1YXRpb24NCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KcGxvdChzb21fbW9kZWw2MCwgdHlwZSA9ICJjaGFuZ2VzIikNCmBgYA0KDQojI05vZGUgY291bnRzDQpgYGB7ciwgZmlnLmhlaWdodD0gOH0NCnNvdXJjZSgnY29vbEJsdWVIb3RSZWQuUicpDQpwbG90KHNvbV9tb2RlbDYwLCB0eXBlID0gImNvdW50cyIsIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCmBgYA0KDQpgYGB7ciwgZmlnLmhlaWdodD0xMH0NCnBhcihtZnJvdyA9IGMoMiwyKSkNCm1vZGVscyA8LSBhcy5kYXRhLmZyYW1lKHNvbV9tb2RlbDYwJGNvZGVzKQ0KcGxvdChzb21fbW9kZWw2MCwNCiAgICAgdHlwZSA9ICJwcm9wZXJ0eSIsDQogICAgIHByb3BlcnR5ID0gbW9kZWxzWywxXSwNCiAgICAgbWFpbj1uYW1lcyhtb2RlbHMpWzFdLA0KICAgICBwYWxldHRlLm5hbWU9Y29vbEJsdWVIb3RSZWQpDQoNCnBsb3Qoc29tX21vZGVsNjAsDQogICAgIHR5cGUgPSAicHJvcGVydHkiLA0KICAgICBwcm9wZXJ0eSA9IG1vZGVsc1ssMl0sDQogICAgIG1haW49bmFtZXMobW9kZWxzKVsyXSwNCiAgICAgcGFsZXR0ZS5uYW1lPWNvb2xCbHVlSG90UmVkKQ0KDQpwbG90KHNvbV9tb2RlbDYwLA0KICAgICB0eXBlID0gInByb3BlcnR5IiwNCiAgICAgcHJvcGVydHkgPSBtb2RlbHNbLDNdLA0KICAgICBtYWluPW5hbWVzKG1vZGVscylbM10sDQogICAgIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCg0KDQpgYGANCg0KYGBge3J9DQojc291cmNlKCdwbG90SGVhdE1hcC5SJykNCiNwbG90SGVhdE1hcChzb21fbW9kZWw2MCwgZGF0YSA9IG1vZGVscywgdmFyaWFibGU9MCkNCmBgYA0KDQoNCiMjTmVpZ2hib3IgRGlzdGFuY2UNCmBgYHtyLCBmaWcuYWxpZ249J2NlbnRlcid9DQpwbG90KHNvbV9tb2RlbDYwLCB0eXBlID0gImRpc3QubmVpZ2hib3VycyIsIHBhbGV0dGUubmFtZSA9IGdyZXkuY29sb3JzKQ0KYGBgDQoNCiMjQ29kZSBTcHJlYWQNCmBgYHtyLCBmaWcuaGVpZ2h0PSAxMH0NCnBsb3Qoc29tX21vZGVsNjAsIHR5cGUgPSAiY29kZXMiLCBwYWxldHRlLm5hbWUgPSBjb29sQmx1ZUhvdFJlZCkNCmBgYA0KDQojI0NsdXN0ZXJpbmcNCmBgYHtyfQ0KbXlkYXRhIDwtIHNvbV9tb2RlbDYwJGNvZGVzIA0Kd3NzIDwtIChucm93KG15ZGF0YSktMSkqc3VtKGFwcGx5KGRhdGEuZnJhbWUobXlkYXRhKSwyLHZhcikpIA0KZm9yIChpIGluIDI6MjApIHsNCiAgd3NzW2ldIDwtIHN1bShrbWVhbnMoZGF0YS5mcmFtZShteWRhdGEpLCBjZW50ZXJzPWkpJHdpdGhpbnNzKQ0KfQ0KcGxvdCgxOjIwLCB3c3MsIHR5cGU9ImIiLCB4bGFiPSJOdW1iZXIgb2YgQ2x1c3RlcnMiLA0KICAgICB5bGFiPSJXaXRoaW4gZ3JvdXBzIHN1bSBvZiBzcXVhcmVzIiwgbWFpbj0iV2l0aGluIGNsdXN0ZXIgc3VtIG9mIHNxdWFyZXMgKFdDU1MpIikNCg0Kc29tX2NsdXN0ZXIgPC0gY3V0cmVlKGhjbHVzdChkaXN0KGRhdGEuZnJhbWUoc29tX21vZGVsNjAkY29kZXMpKSksIDMpDQoNCg0KYGBgDQoNCkluIHRoaXMgZ3JhcGhpYyB3ZSBjYW4gc2VlIHRoYXQgdGhlIGlkZWFsIGFtb3VudCBvZiBjbHVzdGVycyBpcyBub3QgbW9yZSB0aGFuIDUuIFN0YXJ0aW5nIHdpdGggNSBhbmQgZ29pbmcgZG93biB3ZSB3aWxsIGNob29zZSB0aGUgc21hbGxlc3QgbnVtYmVyIHRoYXQgZ2l2ZXMgdXMgYSBzbW9vdGggZGlzdHJpYnV0aW9uIGluIHF1YW50aXR5IG9mIGl0J3MgbWVtYmVyczoNCg0KYGBge3J9DQojIDcgQ2x1c3RlcnMgLS0gdGhlIGZpcnN0IGNsdXN0ZXIgaGFzIGNvbnNpZGVyYWJseSBsZXNzIG1lbWJlcnMgdGhhbiB0aGUgb3RoZXIgZm91ciBvbmVzLg0KdGFibGUoY3V0cmVlKGhjbHVzdChkaXN0KGRhdGEuZnJhbWUoc29tX21vZGVsNjAkY29kZXMpKSksIDcpKQ0KDQoNCiMgNSBDbHVzdGVycyAtLSB0aGUgZmlyc3QgY2x1c3RlciBoYXMgY29uc2lkZXJhYmx5IGxlc3MgbWVtYmVycyB0aGFuIHRoZSBvdGhlciBmb3VyIG9uZXMuDQp0YWJsZShjdXRyZWUoaGNsdXN0KGRpc3QoZGF0YS5mcmFtZShzb21fbW9kZWw2MCRjb2RlcykpKSwgNSkpDQoNCiM0IGNsdXN0ZXJzIC0tIHdlIGhhdmUgZm91bmRlIHRoZSBnb29kIG9uZS4NCnRhYmxlKGN1dHJlZShoY2x1c3QoZGlzdChkYXRhLmZyYW1lKHNvbV9tb2RlbDYwJGNvZGVzKSkpLCA0KSkNCmBgYA0KDQoNCmBgYHtyLCBmaWcuaGVpZ2h0PSAxMH0NCnByZXR0eV9wYWxldHRlIDwtIGMoIiMxZjc3YjQiLCAnI2ZmN2YwZScsICcjMmNhMDJjJywgJyNkNjI3MjgnLCAnIzk0NjdiZCcsICcjOGM1NjRiJywgJyNlMzc3YzInKQ0KDQpwbG90KHNvbV9tb2RlbDYwLCB0eXBlPSJtYXBwaW5nIiwgYmdjb2wgPSBwcmV0dHlfcGFsZXR0ZVtzb21fY2x1c3Rlcl0sIG1haW4gPSAiQ2x1c3RlcnMiLCBrZWVwTWFyZ2lucyA9IFRSVUUpIA0KYWRkLmNsdXN0ZXIuYm91bmRhcmllcyhzb21fbW9kZWw2MCwgc29tX2NsdXN0ZXIsIGx3ZCA9IDcpDQpgYGANCg0K