library(readr)
# Load the raw pixel table (the file has no header row), label the five
# columns, and keep only the three colour channels.
larva2 <- read_csv(
  "~/GENER/SOMs/Ricardo/1. Raw Data/PSOMML_2Larvas.csv",
  col_names = FALSE
)
names(larva2) <- c("FILA", "COLUMNA", "RED", "GREEN", "BLUE")
larva2 <- larva2[, c("RED", "GREEN", "BLUE")]
The resulting data frame consists of integer values:
# Inspect the structure (column names and types) of the loaded data frame.
str(larva2)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 262144 obs. of 3 variables:
$ RED : int 206 206 205 204 204 204 204 204 204 204 ...
$ GREEN: int 207 207 208 208 208 208 208 208 208 208 ...
$ BLUE : int 209 209 209 209 209 209 209 209 209 209 ...
This will be a problem down the line when feeding the data to the algorithm, so the values have to be converted to numeric:
# Convert the integer colour channels to double while keeping the original
# intensities. Going through as.factor() first must be avoided:
# as.numeric() on a factor returns the index of each level, not the value it
# represents (e.g. 206 was being turned into 166), which silently replaces the
# intensities by their rank among the distinct values of the column.
channels <- c("RED", "GREEN", "BLUE")
larva2[channels] <- lapply(larva2[channels], as.numeric)
With this taken care of, we have a dataset ready to be fed to the algorithm:
# Inspect the structure again to confirm the channels are now numeric.
str(larva2)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 262144 obs. of 3 variables:
$ RED : num 166 166 165 164 164 164 164 164 164 164 ...
$ GREEN: num 166 166 167 167 167 167 167 167 167 167 ...
$ BLUE : num 170 170 170 170 170 170 170 170 170 170 ...
A brief summary of the values of each variable helps build an initial understanding of the data:
# Per-column summary statistics (min, quartiles, mean, max).
summary(larva2)
RED GREEN BLUE
Min. : 1.0 Min. : 1.0 Min. : 1.0
1st Qu.:164.0 1st Qu.:166.0 1st Qu.:169.0
Median :166.0 Median :167.0 Median :170.0
Mean :163.3 Mean :164.6 Mean :165.5
3rd Qu.:167.0 3rd Qu.:168.0 3rd Qu.:171.0
Max. :171.0 Max. :172.0 Max. :175.0
# library() fails loudly when the package is missing, whereas require() only
# returns FALSE and lets the script break later.
library(kohonen)

# scale() was removed on purpose: the data does not need to be scaled.
larva2.matrix <- as.matrix(larva2)

# TEST: work on the first 20000 rows only. head() gives the same rows as
# [1:20000, ] but does not fail when the matrix has fewer rows.
larva2.matrixTest <- head(larva2.matrix, 20000)

# 30 x 30 rectangular map, i.e. 900 units.
som_grid <- somgrid(xdim = 30, ydim = 30, topo = "rectangular")

# Train the map. Every later step reads `som_model`, but no training call was
# present in the script.
# NOTE(review): package-default hyper-parameters and the TEST subset are used
# here as an assumption -- confirm the intended rlen/alpha and whether the
# full matrix should be used instead.
som_model <- som(larva2.matrixTest, grid = som_grid)
# Diagnostic plots for the trained map.
plot(som_model, type = "changes")

# coolBlueHotRed is the palette function supplied by the sourced script.
source("coolBlueHotRed.R")
plot(som_model, type = "counts", palette.name = coolBlueHotRed)

# 2 x 2 panel layout for the plots that follow.
par(mfrow = c(2, 2))
models <- as.data.frame(som_model$codes)

# One property map per codebook column (columns 1 to 3).
for (channel in 1:3) {
  plot(
    som_model,
    type = "property",
    property = models[, channel],
    main = names(models)[channel],
    palette.name = coolBlueHotRed
  )
}

plot(som_model, type = "dist.neighbours", palette.name = grey.colors)
plot(som_model, type = "codes", palette.name = coolBlueHotRed)
# Elbow analysis: within-cluster sum of squares of the SOM codebook vectors
# for k = 1..20 clusters.
mydata <- som_model$codes

# Convert once, outside the loop. The row count is taken from the data frame
# because nrow() returns NULL when `$codes` is a list, which would leave the
# k = 1 entry empty.
codes_df <- data.frame(mydata)

wss <- numeric(20)
# k = 1: total sum of squares around the overall column means.
wss[1] <- (nrow(codes_df) - 1) * sum(vapply(codes_df, var, numeric(1)))
for (i in 2:20) {
  # The default iter.max = 10 produced "did not converge in 10 iterations"
  # warnings, so the iteration cap is raised. tot.withinss is the sum of the
  # per-cluster withinss values.
  wss[i] <- kmeans(codes_df, centers = i, iter.max = 100)$tot.withinss
}
did not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterations
# Elbow plot: within-cluster sum of squares against the number of clusters.
plot(
  x = 1:20,
  y = wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares",
  main = "Within cluster sum of squares (WCSS)"
)
# Hierarchical clustering of the codebook vectors, cut into 3 groups.
# NOTE(review): the commentary further down settles on 4 clusters while 3 is
# used here -- confirm which value is intended.
som_cluster <- cutree(
  hclust(dist(data.frame(som_model$codes))),
  k = 3
)
In this graphic we can see that the ideal number of clusters is not more than 5. Starting with 5 and going down, we will choose the smallest number that gives us a smooth distribution in the number of its members:
# 3 clusters -- the first cluster has considerably more members than the other two.
table(cutree(hclust(dist(data.frame(som_model$codes))), 3))
1 2 3
717 117 66
# 5 clusters -- the first cluster has considerably more members than the other four.
table(cutree(hclust(dist(data.frame(som_model$codes))), 5))
1 2 3 4 5
717 63 54 50 16
# 4 clusters -- we have found the good one.
table(cutree(hclust(dist(data.frame(som_model$codes))), 4))
1 2 3 4
717 117 50 16
# Colour each map unit by its cluster and outline the cluster borders.
pretty_palette <- c(
  "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
  "#9467bd", "#8c564b", "#e377c2"
)
plot(
  som_model,
  type = "mapping",
  bgcol = pretty_palette[som_cluster],
  main = "Clusters",
  keepMargins = TRUE
)
add.cluster.boundaries(som_model, som_cluster, lwd = 7)