library(readr)
# Read the raw CSV; col_names = FALSE because the file has no header row.
larva2 <- read_csv(
  file = "~/GENER/SOMs/1. Raw Data/PSOMML_2Larvas.csv",
  col_names = FALSE
)

# Label the five columns in file order (FILA / COLUMNA are Spanish for
# row / column -- presumably the pixel position; confirm against the data).
names(larva2) <- c("FILA", "COLUMNA", "RED", "GREEN", "BLUE")

# larva2Esp keeps all five columns; larva2 keeps only the last three.
larva2Esp <- larva2
larva2 <- larva2[, -(1:2)]
The dataframe obtained is made of integer values:
# Print the column types and leading values of larva2 before any conversion.
str(object = larva2)
This will be a problem down the line when feeding the data to the algorithm, so the values have to be converted to numeric:
# Coerce every column to double while keeping the original values.
# Do NOT route the conversion through as.factor(): as.numeric() applied to a
# factor returns the internal level codes (1, 2, 3, ... = rank of each distinct
# value), not the values themselves. That collapses any gaps between observed
# values and shifts the range, which changes the distances the SOM is trained
# on.
rgb_cols <- c("RED", "GREEN", "BLUE")
larva2[rgb_cols] <- lapply(larva2[rgb_cols], as.numeric)

###larva2Esp
# Same conversion for the copy that also carries the FILA / COLUMNA columns.
esp_cols <- c("FILA", "COLUMNA", rgb_cols)
larva2Esp[esp_cols] <- lapply(larva2Esp[esp_cols], as.numeric)
With this taken care of, we have a dataset ready to be fed to the algorithm:
# Re-inspect larva2 after the conversion step to confirm the column types.
str(object = larva2)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 262144 obs. of 3 variables:
$ RED : num 166 166 165 164 164 164 164 164 164 164 ...
$ GREEN: num 166 166 167 167 167 167 167 167 167 167 ...
$ BLUE : num 170 170 170 170 170 170 170 170 170 170 ...
A brief summary of the values of the variables can help with an initial understanding of the data:
# Per-column summary statistics (min, quartiles, mean, max) of larva2.
summary(object = larva2)
RED GREEN BLUE
Min. : 1.0 Min. : 1.0 Min. : 1.0
1st Qu.:164.0 1st Qu.:166.0 1st Qu.:169.0
Median :166.0 Median :167.0 Median :170.0
Mean :163.3 Mean :164.6 Mean :165.5
3rd Qu.:167.0 3rd Qu.:168.0 3rd Qu.:171.0
Max. :171.0 Max. :172.0 Max. :175.0
# Attach kohonen with library() so that a missing package stops the script
# right here; require() would only return FALSE (with a warning) and let the
# script fail later at the first somgrid()/som() call.
library(kohonen)

# scale() was removed on purpose: the data does not need to be scaled.
# (The stray bare `matrix` symbol that preceded this comment has been dropped;
# it only evaluated -- and auto-printed -- the base matrix() function.)
larva2.matrix <- as.matrix(larva2)
larva2Esp.matrix <- as.matrix(larva2Esp)
# TEST: uncomment to experiment on the first 20000 rows only.
#larva2.matrixTest <- larva2.matrix[1:20000,]

# One 30 x 30 rectangular grid, reused by both models below.
som_grid <- somgrid(topo = "rectangular", xdim = 30, ydim = 30)

# Model 1: trained on larva2.matrix (built from larva2).
som_model <- som(
  larva2.matrix,
  keep.data = TRUE,
  alpha = c(0.05, 0.01),
  rlen = 1500,
  grid = som_grid
)

# Model 2: trained on larva2Esp.matrix (built from larva2Esp) with the same
# settings. It is fitted after model 1, so the order of random draws is kept.
som_modelEsp <- som(
  larva2Esp.matrix,
  keep.data = TRUE,
  alpha = c(0.05, 0.01),
  rlen = 1500,
  grid = som_grid
)
# "changes" plot of som_modelEsp (training progress over the iterations).
plot(x = som_modelEsp, type = "changes")

# coolBlueHotRed() is defined in a separate script next to this one; it must be
# sourced before the first plot that uses it as a palette.
source(file = "coolBlueHotRed.R")

# "counts" plot of som_modelEsp, coloured with the sourced palette.
plot(
  x = som_modelEsp,
  type = "counts",
  palette.name = coolBlueHotRed
)
# Switch the device to a 2 x 2 layout; it is not reset afterwards, so the
# plots below keep filling the grid.
par(mfrow = c(2, 2))

# Codebook vectors of som_modelEsp as a data frame.
models <- as.data.frame(som_modelEsp$codes)

# One "property" heatmap per codebook column 3, 4 and 5, each titled with its
# column name (presumably RED, GREEN and BLUE, given the column order of
# larva2Esp -- confirm).
for (channel_idx in 3:5) {
  plot(
    som_modelEsp,
    type = "property",
    property = models[, channel_idx],
    main = names(models)[channel_idx],
    palette.name = coolBlueHotRed
  )
}

# Neighbour-distance plot in greys, then the codes plot.
plot(som_modelEsp, palette.name = grey.colors, type = "dist.neighbours")
plot(som_modelEsp, palette.name = coolBlueHotRed, type = "codes")
# Elbow plot: within-cluster sum of squares (WSS) of the codebook vectors for
# k = 1..20 k-means clusters.
mydata <- som_modelEsp$codes

# Convert once and use the data frame everywhere. nrow() on the raw `$codes`
# is NULL when `$codes` is a list rather than a matrix, which silently left
# wss[1] as NA; nrow() on the data frame works for both shapes.
codes_df <- data.frame(mydata)

# Preallocate instead of growing the vector inside the loop.
wss <- numeric(20)

# k = 1: total sum of squares = (n - 1) * sum of the per-column variances.
wss[1] <- (nrow(codes_df) - 1) * sum(vapply(codes_df, var, numeric(1)))

# k = 2..20: kmeans() starts from random centres, so these values vary between
# runs unless a seed is set beforehand.
for (i in 2:20) {
  wss[i] <- sum(kmeans(codes_df, centers = i)$withinss)
}

plot(1:20, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares",
     main = "Within cluster sum of squares (WCSS)")
# Hierarchical clustering of the codebook vectors, cut into 6 groups.
# NOTE(review): the accompanying text settles on 4 clusters, yet the tree is
# cut at k = 6 here -- confirm which value is intended.
som_cluster <- cutree(
  tree = hclust(d = dist(x = data.frame(som_modelEsp$codes))),
  k = 6
)
In this plot we can see that the ideal number of clusters is no more than 5. Starting with 5 and going down, we will choose the smallest number that gives an even distribution of members across its clusters:
# 3 clusters -- in the recorded run the first cluster has considerably more
# members than the other two.
table(cutree(
  tree = hclust(d = dist(x = data.frame(som_modelEsp$codes))),
  k = 3
))
1 2 3
489 172 239
# 5 clusters -- sizes are uneven in the recorded run; the first cluster is the
# smallest.
table(cutree(
  tree = hclust(d = dist(x = data.frame(som_modelEsp$codes))),
  k = 5
))
1 2 3 4 5
132 172 134 239 223
# 4 clusters -- we have found the good one.
table(cutree(
  tree = hclust(d = dist(x = data.frame(som_modelEsp$codes))),
  k = 4
))
1 2 3 4
266 172 239 223
# Seven fixed colours; each map unit is coloured by its cluster id via
# pretty_palette[som_cluster].
pretty_palette <- c(
  "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
  "#9467bd", "#8c564b", "#e377c2"
)

# Mapping plot with one background colour per cluster.
plot(
  som_modelEsp,
  type = "mapping",
  main = "Clusters",
  bgcol = pretty_palette[som_cluster],
  keepMargins = TRUE
)

# Draw thick boundaries between neighbouring units of different clusters.
add.cluster.boundaries(som_modelEsp, som_cluster, lwd = 7)