The purpose of this essay is to understand how the kohonen maps works and what are the most important parameters that I should master to make well use of this library in R. I have selected a trivial example: Based on the hard work of students I will demonstrate that in the future they will have better hourly salaries. I will be using only a 2D space problem. So this document shows how a Clustering technique allows to do segmentation of students that have work hard to get a better salary. It is an hypothetic example.

For a detailed discussion of results or any consulting job please contact me at dromero@cpq-energy.com

The website of my company is http://www.cpq-energy.com

Loading Libraries

# Main libraries include readxl, imputeTS, and kohonen

library(readxl)
library(tidyverse)
library(stringr)
library(forecast) # Moving Average
library(imputeTS) # Time Series Missing Value Imputation
library(kohonen)
library(RcppRoll)
library(matrixStats)
library(msm)
library(DT)
library(plotly)
library(RColorBrewer)
library(fields)
library(latticeExtra)
library(deldir)

Reading Data

data <- read_excel("hourlysalary-vs-studyhours.xlsx", 
                   col_types = c("numeric", "numeric", "numeric"), 
                   na = c("NA")
                   )

# Basic scatter plot. Shape=1 is to have circles
ggplot(data = data, aes(x=studyhours, y=hourlysalary)) + 
       geom_point(shape=1, size=3)

Training Kohonen model

set.seed(123)
training <- sample(nrow(data), size = round(length(data$id) * .8, 0))

data[training, "settype"] <- "Train"
data[-training, "settype"] <- "Test"

Xtraining <- scale(data[training, c(2:3)])
Xtest <- scale(data[-training, c(2:3)],
               center = attr(Xtraining, "scaled:center"),
               scale = attr(Xtraining, "scaled:scale"))

system.time(som.model <- som(Xtraining, 
                             grid=somgrid(5, 5, "hexagonal"), 
                             rlen=500, 
                             alpha=c(0.05,0.01), 
                             n.hood = "circular",
                             keep.data = TRUE))
##    user  system elapsed 
##   0.016   0.000   0.037

Training Progress and WCSS

coolBlueHotRed <- function(n, alpha = 1) {
  rainbow(n, end=4/6, alpha=alpha)[n:1]
}

# Colour palette definition
pretty_palette <- c("#1f77b4", '#2ca02c', '#ff7f0e', '#d62728', '#9467bd', '#8c564b', '#e377c2')

mydata <- som.model$codes
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata,
                                     centers=i)$withinss)
par(mfrow=c(1,2), mar=c(2.1,2.1,2.1,1.1))
plot(som.model, type="changes", main="Training Progress")
plot(1:15, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares", main="Within cluster sum of squares (WCSS)")

Hierarchical Tree for selecting cluster number

nclusters <- 5
som.cluster <- cutree(hclust(dist(som.model$codes)), nclusters)

my.par <- par(mfrow=c(1,2))
par(mfrow=c(2,2), mar=c(2.1,2.1,1.0,2.1))
colsom_names <- c("Salary", "Study")

data[training, "cluster"] <- (data.frame(id=data[training, "id"], 
                                         cluster=som.cluster[som.model$unit.classif]))$cluster

Plotting Maps (SOM)

Predictive Mode

som.prediction <- predict(som.model, newdata = Xtest,
                          trainX = Xtraining, 
                          trainY = data[training, "cluster"])

data[-training, "cluster"] <- as.factor(round(som.prediction$prediction, 0))

Data table Results

# Basic scatter plot. Shape=1 is to have circles. Colour using Cluster #
p <- ggplot(data = data, aes(x=studyhours, y=hourlysalary)) + 
     geom_point(shape = 1, 
                color=pretty_palette[factor(round(data$cluster, 0))], 
                size=3)

ggplotly(p)
datatable(data = data, options = list(pageLength = 5, 
                               lengthMenu = c(5, 10, 25, 50, 100), 
                               searching = TRUE)
          )