1. Loading raw data and getting it ready

library(readr)
library(readr)
img <- read_csv("~/GENER/Gerardo/1. Raw Data/DatosDifsg.csv")
img <- img[,-1:-2]

The dataframe obtained is made of integer values:

str(img)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   250 obs. of  6 variables:
 $ sgBaseDiffb: num  -9.94e-06 -7.44e-06 -3.55e-06 9.08e-07 5.14e-06 ...
 $ sg102Diffb : num  5.05e-06 3.19e-06 3.08e-06 3.99e-06 5.16e-06 ...
 $ sg113Diffb : num  1.73e-05 9.19e-06 3.74e-06 4.58e-07 -1.18e-06 ...
 $ sg130Diffb : num  3.62e-05 2.66e-05 1.79e-05 1.02e-05 3.60e-06 ...
 $ sg103Diffb : num  -1.69e-05 -1.57e-05 -1.17e-05 -6.23e-06 -3.03e-07 ...
 $ sg91Diffb  : num  2.58e-06 2.42e-06 1.81e-06 1.04e-06 4.16e-07 ...

This will be a problem down the line when feeding this data to the algorithm, values have to be changed to numeric:

#img[,'FILA'] <- as.numeric(as.factor(img$FILA))
#img[,'COLUMNA'] <- as.numeric(as.factor(img$COLUMNA))
#img[,'RED'] <- as.numeric(as.factor(img$RED))
#img[,'GREEN'] <- as.numeric(as.factor(img$GREEN))
#img[,'BLUE'] <- as.numeric(as.factor(img$BLUE))

Whit this taken care of we have a database ready to be fed to the algorithmL

#str(img)

2 Initial exploration of data

A brief summary of the values of the variable can help with the initial understanding of things:

summary(img)
  sgBaseDiffb           sg102Diffb           sg113Diffb           sg130Diffb           sg103Diffb        
 Min.   :-1.950e-05   Min.   :-1.997e-05   Min.   :-3.320e-05   Min.   :-3.283e-05   Min.   :-3.985e-05  
 1st Qu.:-5.326e-06   1st Qu.:-5.057e-06   1st Qu.:-5.765e-06   1st Qu.:-5.985e-06   1st Qu.:-7.367e-06  
 Median :-6.496e-07   Median : 6.249e-09   Median : 1.152e-07   Median :-3.131e-07   Median :-8.982e-07  
 Mean   : 3.176e-08   Mean   : 1.960e-08   Mean   : 2.278e-08   Mean   : 1.672e-07   Mean   :-3.965e-07  
 3rd Qu.: 5.720e-06   3rd Qu.: 4.881e-06   3rd Qu.: 5.511e-06   3rd Qu.: 6.917e-06   3rd Qu.: 6.313e-06  
 Max.   : 1.887e-05   Max.   : 2.203e-05   Max.   : 2.353e-05   Max.   : 3.616e-05   Max.   : 3.413e-05  
   sg91Diffb         
 Min.   :-4.790e-05  
 1st Qu.:-6.094e-06  
 Median :-3.885e-07  
 Mean   : 9.732e-08  
 3rd Qu.: 5.872e-06  
 Max.   : 4.821e-05  

3 Self Organizing Map

Loading required package

require(kohonen)

Changing dataframe to a matrix and normalizing it’s values

img.matrix <- as.matrix(img)
#TEST
#img.matrixTest <- img.matrix[1:1000000,]

Creating the SOM grid

som_grid <- somgrid(xdim = 6, ydim = 6, topo = "hexagonal")

Training the SOM

Training progress evaluation

plot(som_model, type = "changes")

Node counts

source('coolBlueHotRed.R')
plot(som_model, type = "counts", palette.name=coolBlueHotRed)

SOM by RGB value

source("coolBlueHotRed.R")
models <- as.data.frame(som_model$codes)
par(mfrow = c(3,3))
plot(som_model,
     type = "property",
     property = models[,1],
     main=names(models)[1],
     palette.name=coolBlueHotRed)
plot(som_model,
     type = "property",
     property = models[,2],
     main=names(models)[2],
     palette.name=coolBlueHotRed)
plot(som_model,
     type = "property",
     property = models[,3],
     main=names(models)[3],
     palette.name=coolBlueHotRed)
plot(som_model,
     type = "property",
     property = models[,4],
     main=names(models)[4],
     palette.name=coolBlueHotRed)
plot(som_model,
     type = "property",
     property = models[,5],
     main=names(models)[5],
     palette.name=coolBlueHotRed)
plot(som_model,
     type = "property",
     property = models[,6],
     main=names(models)[6],
     palette.name=coolBlueHotRed)

Neighbor Distance

plot(som_model, type = "dist.neighbours", palette.name = grey.colors)

Code Spread

plot(som_model, type = "codes", palette.name = coolBlueHotRed)

Clustering

mydata <- som_model$codes 
wss <- (nrow(mydata)-1)*sum(apply(data.frame(mydata),2,var)) 
for (i in 2:20) {
  wss[i] <- sum(kmeans(data.frame(mydata), centers=i)$withinss)
}
plot(1:20, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares", main="Within cluster sum of squares (WCSS)")

som_cluster <- cutree(hclust(dist(data.frame(som_model$codes))), 3)

In this graphic we can see that the ideal amount of clusters is not more than 5. Starting with 5 and going down we will choose the smallest number that gives us a smooth distribution in quantity of it’s members:

# 7 Clusters -- the first cluster has considerably less members than the other four ones.
table(cutree(hclust(dist(data.frame(som_model$codes))), 3))


# 5 Clusters -- the first cluster has considerably less members than the other four ones.
table(cutree(hclust(dist(data.frame(som_model$codes))), 5))

#4 clusters -- we have founde the good one.
table(cutree(hclust(dist(data.frame(som_model$codes))), 4))
pretty_palette <- c("#1f77b4", '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2')
plot(som_model, type="mapping", bgcol = pretty_palette[som_cluster], main = "Clusters", keepMargins = TRUE) 
add.cluster.boundaries(som_model, som_cluster, lwd = 7)

LS0tDQp0aXRsZTogIlNPTSBmb3IgR2VyYXJkbydzIERhdGEiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQojMS4gTG9hZGluZyByYXcgZGF0YSBhbmQgZ2V0dGluZyBpdCByZWFkeQ0KDQpgYGB7ciwgZWNobz1UUlVFLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShyZWFkcikNCmxpYnJhcnkocmVhZHIpDQppbWcgPC0gcmVhZF9jc3YoIn4vR0VORVIvR2VyYXJkby8xLiBSYXcgRGF0YS9EYXRvc0RpZnNnLmNzdiIpDQppbWcgPC0gaW1nWywtMTotMl0NCmBgYA0KDQpUaGUgZGF0YWZyYW1lIG9idGFpbmVkIGlzIG1hZGUgb2YgYGludGVnZXJgIHZhbHVlczoNCg0KYGBge3J9DQpzdHIoaW1nKQ0KYGBgDQoNCg0KVGhpcyB3aWxsIGJlIGEgcHJvYmxlbSBkb3duIHRoZSBsaW5lIHdoZW4gZmVlZGluZyB0aGlzIGRhdGEgdG8gdGhlIGFsZ29yaXRobSwgdmFsdWVzIGhhdmUgdG8gYmUgY2hhbmdlZCB0byAgYG51bWVyaWNgOg0KDQpgYGB7cn0NCiNpbWdbLCdGSUxBJ10gPC0gYXMubnVtZXJpYyhhcy5mYWN0b3IoaW1nJEZJTEEpKQ0KI2ltZ1ssJ0NPTFVNTkEnXSA8LSBhcy5udW1lcmljKGFzLmZhY3RvcihpbWckQ09MVU1OQSkpDQojaW1nWywnUkVEJ10gPC0gYXMubnVtZXJpYyhhcy5mYWN0b3IoaW1nJFJFRCkpDQojaW1nWywnR1JFRU4nXSA8LSBhcy5udW1lcmljKGFzLmZhY3RvcihpbWckR1JFRU4pKQ0KI2ltZ1ssJ0JMVUUnXSA8LSBhcy5udW1lcmljKGFzLmZhY3RvcihpbWckQkxVRSkpDQpgYGANCg0KV2hpdCB0aGlzIHRha2VuIGNhcmUgb2Ygd2UgaGF2ZSBhIGRhdGFiYXNlIHJlYWR5IHRvIGJlIGZlZCB0byB0aGUgYWxnb3JpdGhtTA0KYGBge3J9DQojc3RyKGltZykNCmBgYA0KIzIgSW5pdGlhbCBleHBsb3JhdGlvbiBvZiBkYXRhDQoNCkEgYnJpZWYgc3VtbWFyeSBvZiB0aGUgdmFsdWVzIG9mIHRoZSB2YXJpYWJsZSBjYW4gaGVscCB3aXRoIHRoZSBpbml0aWFsIHVuZGVyc3RhbmRpbmcgb2YgdGhpbmdzOg0KYGBge3J9DQpzdW1tYXJ5KGltZykNCmBgYA0KDQoNCiMzIFNlbGYgT3JnYW5pemluZyBNYXANCg0KIyNMb2FkaW5nIHJlcXVpcmVkIHBhY2thZ2UNCg0KYGBge3IsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpyZXF1aXJlKGtvaG9uZW4pDQpgYGANCg0KIyNDaGFuZ2luZyBkYXRhZnJhbWUgdG8gYSBgbWF0cml4YCBhbmQgbm9ybWFsaXppbmcgaXQncyB2YWx1ZXMNCmBgYHtyfQ0KaW1nLm1hdHJpeCA8LSBhcy5tYXRyaXgoaW1nKQ0KDQojVEVTVA0KI2ltZy5tYXRyaXhUZXN0IDwtIGltZy5tYXRyaXhbMToxMDAwMDAwLF0NCmBgYA0KDQojI0NyZWF0aW5nIHRoZSBTT00gZ3JpZA0KYGBge3J9DQpzb21fZ3JpZCA8LSBzb21ncmlkKHhkaW0gPSA2LCB5ZGltID0gNiwgdG9wbyA9ICJoZXhhZ29uYWwiKQ0KYGBgDQoNCg0KIyNUcmFpbmluZyB0aGUgU09NDQpgYGB7ciwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRSwgaW5jbHVkZT1GQUxTRX0NCnNvbV9tb2RlbCA8LSBzb20oaW1nLm1hdHJpeCwNCiAgICAgICAgICAgICAgICAgZ3JpZCA9IHNvbV9ncmlkLA0KICAgICAgICAgICAgICAgICBybGVuID0gMTUwMCwNCiAgICAgICAgICAgICAgICAgYWxwaGEgPSBjKDAuMDUsMC4wMSksDQogICAgICAgICAgICAgICAgIGtlZXAuZGF0YSA9IFRSVUUpDQpgYGANCg0KIyNUcmFpbmluZyBwcm9ncmVzcyBldmFsdWF0aW9uDQpgYGB7cn0NCnBsb3Qoc29tX21vZGVsLCB0eXBlID0gImNoYW5nZXMiKQ0KYGBgDQoNCiMjTm9kZSBjb3VudHMNCmBgYHtyfQ0Kc291cmNlKCdjb29sQmx1ZUhvdFJlZC5SJykNCnBsb3Qoc29tX21vZGVsLCB0eXBlID0gImNvdW50cyIsIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCmBgYA0KDQoNCg0KDQojI1NPTSBieSBSR0IgdmFsdWUNCg0KYGBge3IsIGZpZy5oZWlnaHQ9MTB9DQpzb3VyY2UoImNvb2xCbHVlSG90UmVkLlIiKQ0KDQptb2RlbHMgPC0gYXMuZGF0YS5mcmFtZShzb21fbW9kZWwkY29kZXMpDQoNCnBhcihtZnJvdyA9IGMoMywzKSkNCnBsb3Qoc29tX21vZGVsLA0KICAgICB0eXBlID0gInByb3BlcnR5IiwNCiAgICAgcHJvcGVydHkgPSBtb2RlbHNbLDFdLA0KICAgICBtYWluPW5hbWVzKG1vZGVscylbMV0sDQogICAgIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCnBsb3Qoc29tX21vZGVsLA0KICAgICB0eXBlID0gInByb3BlcnR5IiwNCiAgICAgcHJvcGVydHkgPSBtb2RlbHNbLDJdLA0KICAgICBtYWluPW5hbWVzKG1vZGVscylbMl0sDQogICAgIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCnBsb3Qoc29tX21vZGVsLA0KICAgICB0eXBlID0gInByb3BlcnR5IiwNCiAgICAgcHJvcGVydHkgPSBtb2RlbHNbLDNdLA0KICAgICBtYWluPW5hbWVzKG1vZGVscylbM10sDQogICAgIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCnBsb3Qoc29tX21vZGVsLA0KICAgICB0eXBlID0gInByb3BlcnR5IiwNCiAgICAgcHJvcGVydHkgPSBtb2RlbHNbLDRdLA0KICAgICBtYWluPW5hbWVzKG1vZGVscylbNF0sDQogICAgIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCnBsb3Qoc29tX21vZGVsLA0KICAgICB0eXBlID0gInByb3BlcnR5IiwNCiAgICAgcHJvcGVydHkgPSBtb2RlbHNbLDVdLA0KICAgICBtYWluPW5hbWVzKG1vZGVscylbNV0sDQogICAgIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCnBsb3Qoc29tX21vZGVsLA0KICAgICB0eXBlID0gInByb3BlcnR5IiwNCiAgICAgcHJvcGVydHkgPSBtb2RlbHNbLDZdLA0KICAgICBtYWluPW5hbWVzKG1vZGVscylbNl0sDQogICAgIHBhbGV0dGUubmFtZT1jb29sQmx1ZUhvdFJlZCkNCg0KYGBgDQoNCg0KIyNOZWlnaGJvciBEaXN0YW5jZQ0KYGBge3IsIGZpZy5hbGlnbj0nY2VudGVyJ30NCnBsb3Qoc29tX21vZGVsLCB0eXBlID0gImRpc3QubmVpZ2hib3VycyIsIHBhbGV0dGUubmFtZSA9IGdyZXkuY29sb3JzKQ0KYGBgDQoNCiMjQ29kZSBTcHJlYWQNCmBgYHtyLCBmaWcuaGVpZ2h0PSAxMH0NCnBsb3Qoc29tX21vZGVsLCB0eXBlID0gImNvZGVzIiwgcGFsZXR0ZS5uYW1lID0gY29vbEJsdWVIb3RSZWQpDQpgYGANCg0KIyNDbHVzdGVyaW5nDQpgYGB7cn0NCm15ZGF0YSA8LSBzb21fbW9kZWwkY29kZXMgDQp3c3MgPC0gKG5yb3cobXlkYXRhKS0xKSpzdW0oYXBwbHkoZGF0YS5mcmFtZShteWRhdGEpLDIsdmFyKSkgDQpmb3IgKGkgaW4gMjoyMCkgew0KICB3c3NbaV0gPC0gc3VtKGttZWFucyhkYXRhLmZyYW1lKG15ZGF0YSksIGNlbnRlcnM9aSkkd2l0aGluc3MpDQp9DQpwbG90KDE6MjAsIHdzcywgdHlwZT0iYiIsIHhsYWI9Ik51bWJlciBvZiBDbHVzdGVycyIsDQogICAgIHlsYWI9IldpdGhpbiBncm91cHMgc3VtIG9mIHNxdWFyZXMiLCBtYWluPSJXaXRoaW4gY2x1c3RlciBzdW0gb2Ygc3F1YXJlcyAoV0NTUykiKQ0KDQpzb21fY2x1c3RlciA8LSBjdXRyZWUoaGNsdXN0KGRpc3QoZGF0YS5mcmFtZShzb21fbW9kZWwkY29kZXMpKSksIDMpDQoNCg0KYGBgDQoNCkluIHRoaXMgZ3JhcGhpYyB3ZSBjYW4gc2VlIHRoYXQgdGhlIGlkZWFsIGFtb3VudCBvZiBjbHVzdGVycyBpcyBub3QgbW9yZSB0aGFuIDUuIFN0YXJ0aW5nIHdpdGggNSBhbmQgZ29pbmcgZG93biB3ZSB3aWxsIGNob29zZSB0aGUgc21hbGxlc3QgbnVtYmVyIHRoYXQgZ2l2ZXMgdXMgYSBzbW9vdGggZGlzdHJpYnV0aW9uIGluIHF1YW50aXR5IG9mIGl0J3MgbWVtYmVyczoNCg0KYGBge3J9DQojIDcgQ2x1c3RlcnMgLS0gdGhlIGZpcnN0IGNsdXN0ZXIgaGFzIGNvbnNpZGVyYWJseSBsZXNzIG1lbWJlcnMgdGhhbiB0aGUgb3RoZXIgZm91ciBvbmVzLg0KdGFibGUoY3V0cmVlKGhjbHVzdChkaXN0KGRhdGEuZnJhbWUoc29tX21vZGVsJGNvZGVzKSkpLCAzKSkNCg0KDQojIDUgQ2x1c3RlcnMgLS0gdGhlIGZpcnN0IGNsdXN0ZXIgaGFzIGNvbnNpZGVyYWJseSBsZXNzIG1lbWJlcnMgdGhhbiB0aGUgb3RoZXIgZm91ciBvbmVzLg0KdGFibGUoY3V0cmVlKGhjbHVzdChkaXN0KGRhdGEuZnJhbWUoc29tX21vZGVsJGNvZGVzKSkpLCA1KSkNCg0KIzQgY2x1c3RlcnMgLS0gd2UgaGF2ZSBmb3VuZGUgdGhlIGdvb2Qgb25lLg0KdGFibGUoY3V0cmVlKGhjbHVzdChkaXN0KGRhdGEuZnJhbWUoc29tX21vZGVsJGNvZGVzKSkpLCA0KSkNCmBgYA0KDQpgYGB7ciwgZmlnLmhlaWdodD0gMTB9DQpwcmV0dHlfcGFsZXR0ZSA8LSBjKCIjMWY3N2I0IiwgJyNmZjdmMGUnLCAnIzJjYTAyYycsICcjZDYyNzI4JywgJyM5NDY3YmQnLCAnIzhjNTY0YicsICcjZTM3N2MyJykNCg0KcGxvdChzb21fbW9kZWwsIHR5cGU9Im1hcHBpbmciLCBiZ2NvbCA9IHByZXR0eV9wYWxldHRlW3NvbV9jbHVzdGVyXSwgbWFpbiA9ICJDbHVzdGVycyIsIGtlZXBNYXJnaW5zID0gVFJVRSkgDQphZGQuY2x1c3Rlci5ib3VuZGFyaWVzKHNvbV9tb2RlbCwgc29tX2NsdXN0ZXIsIGx3ZCA9IDcpDQpgYGA=