library(readr)
# Read the raw pixel table from disk.
larva <- read_csv(
  file = "~/GENER/SOMs/Ricardo/1. Raw Data/PSOMML_1Larva.csv"
)
# Keep every column except the first two.
larva <- larva[, -c(1, 2)]
The dataframe obtained is made of integer values:
# Inspect the structure and column types of the loaded data.
str(larva)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 262144 obs. of 3 variables:
$ RED : int 205 205 206 206 206 206 206 205 205 205 ...
$ GREEN: int 209 209 210 210 210 210 210 209 209 209 ...
$ BLUE : int 208 208 209 209 209 209 209 209 208 209 ...
This will be a problem down the line when feeding this data to the algorithm, so the values have to be converted to numeric:
# Convert the colour channels from integer to double, keeping their values.
# The previous as.numeric(as.factor(x)) round trip returned the factor's
# internal level codes (the rank of each distinct value), not the original
# pixel intensities, so e.g. 205 was silently turned into 162.
# Console output recorded elsewhere in this file predates this fix.
#larva[["FILA"]] <- as.numeric(larva[["FILA"]])
#larva[["COLUMNA"]] <- as.numeric(larva[["COLUMNA"]])
rgb_channels <- c("RED", "GREEN", "BLUE")
larva[rgb_channels] <- lapply(larva[rgb_channels], as.numeric)
With this taken care of, we have a dataset ready to be fed to the algorithm:
# Inspect the structure again to confirm the columns are now numeric.
str(larva)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 262144 obs. of 3 variables:
$ RED : num 162 162 163 163 163 163 163 162 162 162 ...
$ GREEN: num 169 169 170 170 170 170 170 169 169 169 ...
$ BLUE : num 171 171 172 172 172 172 172 172 171 172 ...
A brief summary of the values of each variable can help with the initial understanding of the data:
# Per-column summary statistics (min, quartiles, mean, max).
summary(larva)
RED GREEN BLUE
Min. : 1.0 Min. : 1.0 Min. : 1.0
1st Qu.:158.0 1st Qu.:163.0 1st Qu.:167.0
Median :159.0 Median :165.0 Median :168.0
Mean :157.8 Mean :162.9 Mean :165.3
3rd Qu.:161.0 3rd Qu.:166.0 3rd Qu.:170.0
Max. :165.0 Max. :171.0 Max. :175.0
# library() stops immediately if kohonen is missing; require() would only
# return FALSE and let the script fail later at somgrid().
library(kohonen)
# matrix: scale() was removed on purpose because the data does not need to be
# scaled. (The bare `matrix` token that used to sit here was evaluated as an
# expression and printed the source of base::matrix.)
larva.matrix <- as.matrix(larva)
# TEST subset: the first 20000 rows, or all rows when fewer are available,
# instead of erroring with "subscript out of bounds".
test_rows <- seq_len(min(20000L, nrow(larva.matrix)))
larva.matrixTest <- larva.matrix[test_rows, , drop = FALSE]
# 60 x 60 rectangular grid for the self-organising map.
som_grid60x60 <- somgrid(xdim = 60, ydim = 60, topo = "rectangular")
# NOTE(review): som_model60 is not created anywhere in the visible source;
# confirm where the SOM is trained before these plots run.
# Training progress of the map.
plot(
  som_model60,
  type = "changes"
)
# Presumably defines the coolBlueHotRed palette function used below; verify.
source("coolBlueHotRed.R")
# Number of observations mapped to each unit.
plot(
  som_model60,
  type = "counts",
  palette.name = coolBlueHotRed
)
# Property heatmaps: one panel per codebook variable on a 2 x 2 layout.
# par() returns the previous settings, which are restored afterwards so that
# later plots are not squeezed into the remaining cell of this layout.
old_par <- par(mfrow = c(2, 2))
models <- as.data.frame(som_model60$codes)
# The first three columns of the codebook are plotted, one heatmap each.
for (j in seq_len(3)) {
  plot(
    som_model60,
    type = "property",
    property = models[, j],
    main = names(models)[j],
    palette.name = coolBlueHotRed
  )
}
par(old_par)
#source('plotHeatMap.R')
#plotHeatMap(som_model60, data = models, variable=0)
# Neighbour-distance plot of the map, drawn with a grey palette.
plot(
  som_model60,
  type = "dist.neighbours",
  palette.name = grey.colors
)
# Codebook vectors of the map units.
plot(
  som_model60,
  type = "codes",
  palette.name = coolBlueHotRed
)
# Elbow-curve data: within-cluster sum of squares of the SOM codebook vectors
# for k = 1..20 k-means clusters.
mydata <- som_model60$codes
# Work on the data-frame view throughout so the row count is defined even if
# `codes` is not a plain matrix (nrow() of a list is NULL).
codes_df <- data.frame(mydata)
max_k <- 20L
# Preallocate instead of growing the vector inside the loop.
wss <- numeric(max_k)
# k = 1: total sum of squares around the column means.
wss[1] <- (nrow(codes_df) - 1) * sum(vapply(codes_df, var, numeric(1)))
for (i in 2:max_k) {
  # iter.max is raised because kmeans repeatedly warned that it
  # "did not converge in 10 iterations", leaving unconverged WSS values.
  wss[i] <- sum(kmeans(codes_df, centers = i, iter.max = 100)$withinss)
}
did not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterationsdid not converge in 10 iterations
# Elbow plot: within-cluster sum of squares against the number of clusters.
plot(
  x = seq_len(20),
  y = wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares",
  main = "Within cluster sum of squares (WCSS)"
)
# Hierarchical clustering of the codebook vectors, cut into 3 groups.
# NOTE(review): the commentary in this file settles on 4 clusters while this
# cut uses 3 -- confirm which is intended.
som_cluster <- cutree(
  hclust(dist(data.frame(som_model60$codes))),
  k = 3
)
In this plot we can see that the ideal number of clusters is no more than 5. Starting with 5 and going down, we will choose the smallest number that gives us a smooth distribution in the number of members per cluster:
# 7 clusters -- sizes are very uneven: the smallest cluster has 19 members,
# the largest 2556.
table(
  cutree(
    hclust(dist(data.frame(som_model60$codes))),
    k = 7
  )
)
1 2 3 4 5 6 7
19 41 101 55 701 2556 127
# 5 clusters -- the first cluster has considerably fewer members than the
# other four.
table(
  cutree(
    hclust(dist(data.frame(som_model60$codes))),
    k = 5
  )
)
1 2 3 4 5
60 101 182 701 2556
# 4 clusters -- we have found the good one.
table(
  cutree(
    hclust(dist(data.frame(som_model60$codes))),
    k = 4
  )
)
1 2 3 4
60 283 701 2556
# Colours used to shade the map units, indexed by cluster id.
pretty_palette <- c(
  "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
  "#9467bd", "#8c564b", "#e377c2"
)
# Mapping plot with each unit's background coloured by its cluster.
plot(
  som_model60,
  type = "mapping",
  bgcol = pretty_palette[som_cluster],
  main = "Clusters",
  keepMargins = TRUE
)
# Overlay the cluster boundaries on the map with a thick line.
add.cluster.boundaries(
  som_model60,
  som_cluster,
  lwd = 7
)