The purpose of this essay is to understand how the kohonen maps works and what are the most important parameters that I should master to make well use of this library in R. I have selected a trivial example: Based on the hard work of students I will demonstrate that in the future they will have better hourly salaries. I will be using only a 2D space problem. So this document shows how a Clustering technique allows to do segmentation of students that have work hard to get a better salary. It is an hypothetic example.
For a detailed discussion of results or any consulting job please contact me at dromero@cpq-energy.com
The website of my company is http://www.cpq-energy.com
# Main libraries include readxl, imputeTS, and kohonen
library(readxl)
library(tidyverse)
library(stringr)
library(forecast) # Moving Average
library(imputeTS) # Time Series Missing Value Imputation
library(kohonen)
library(RcppRoll)
library(matrixStats)
library(msm)
library(DT)
library(plotly)
library(RColorBrewer)
library(fields)
library(latticeExtra)
library(deldir)data <- read_excel("hourlysalary-vs-studyhours.xlsx",
col_types = c("numeric", "numeric", "numeric"),
na = c("NA")
)
# Basic scatter plot. Shape=1 is to have circles
ggplot(data = data, aes(x=studyhours, y=hourlysalary)) +
geom_point(shape=1, size=3)set.seed(123)
training <- sample(nrow(data), size = round(length(data$id) * .8, 0))
data[training, "settype"] <- "Train"
data[-training, "settype"] <- "Test"
Xtraining <- scale(data[training, c(2:3)])
Xtest <- scale(data[-training, c(2:3)],
center = attr(Xtraining, "scaled:center"),
scale = attr(Xtraining, "scaled:scale"))
system.time(som.model <- som(Xtraining,
grid=somgrid(5, 5, "hexagonal"),
rlen=500,
alpha=c(0.05,0.01),
n.hood = "circular",
keep.data = TRUE))## user system elapsed
## 0.016 0.000 0.037
coolBlueHotRed <- function(n, alpha = 1) {
rainbow(n, end=4/6, alpha=alpha)[n:1]
}
# Colour palette definition
pretty_palette <- c("#1f77b4", '#2ca02c', '#ff7f0e', '#d62728', '#9467bd', '#8c564b', '#e377c2')
mydata <- som.model$codes
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata,
centers=i)$withinss)
par(mfrow=c(1,2), mar=c(2.1,2.1,2.1,1.1))
plot(som.model, type="changes", main="Training Progress")
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares", main="Within cluster sum of squares (WCSS)")nclusters <- 5
som.cluster <- cutree(hclust(dist(som.model$codes)), nclusters)
my.par <- par(mfrow=c(1,2))
par(mfrow=c(2,2), mar=c(2.1,2.1,1.0,2.1))
colsom_names <- c("Salary", "Study")
data[training, "cluster"] <- (data.frame(id=data[training, "id"],
cluster=som.cluster[som.model$unit.classif]))$clustersom.prediction <- predict(som.model, newdata = Xtest,
trainX = Xtraining,
trainY = data[training, "cluster"])
data[-training, "cluster"] <- as.factor(round(som.prediction$prediction, 0))# Basic scatter plot. Shape=1 is to have circles. Colour using Cluster #
p <- ggplot(data = data, aes(x=studyhours, y=hourlysalary)) +
geom_point(shape = 1,
color=pretty_palette[factor(round(data$cluster, 0))],
size=3)
ggplotly(p)datatable(data = data, options = list(pageLength = 5,
lengthMenu = c(5, 10, 25, 50, 100),
searching = TRUE)
)