Fun with R - Plotly / Dendrogram / k-mean

WORK IN PROGRESS

Introduction

A small exercise on how to handle a data set at work:
- Testing the package plotly. - Dendrogram
- k-mean clustering of the data
- a few variation of it

setwd("C:/Users/marco/OneDrive/062 R/Projects R/004 CWS")
require(dplyr)
require(ggplot2)
require(plotly)
require(lubridate)

files_loc       <- list.files("./010_data/", full.names = TRUE)
CWSdata <- read.csv2("./010_data/Z_PUR_ORDER_175150.csv",
                       header = TRUE, 
                       skip = 0)
head(CWSdata)
#>   UnitName UnitSize1  UnitSize2 Year       Date ProjectName ProjectNumber
#> 1   Unit 0      DN80 20-40 m3/h 2014 29.10.2014   Project A    C.1234/404
#> 2   Unit 1     DN100 41-65 m3/h 2011 26.08.2011   Project B    C.2345/401
#> 3   Unit 1     DN100 41-65 m3/h 2011 30.06.2011   Project C    C.3456/401
#> 4   Unit 1     DN100 41-65 m3/h 2012 08.02.2012   Project D    M.4567/402
#> 5   Unit 1     DN100 41-65 m3/h 2014 17.02.2014   Project E    C.5678/402
#> 6   Unit 1     DN100 41-65 m3/h 2014 17.01.2014   Project F    C.6789/401
#>   MEMS SEMS FEMS Demi Flow Manufacturer     Cost
#> 1    x                  24          Mey 55950.00
#> 2    x                  60          Cip 62195.00
#> 3    x                  60          Cip 66891.00
#> 4    x                  45          Cip 58215.00
#> 5    x                  48          Cip 56035.00
#> 6    x                  60          Cip 60544.00
dim(CWSdata)
#> [1] 28 14

CWSdata$Flow <- as.numeric(CWSdata$Flow)
CWSdata$Cost <- as.numeric(levels(CWSdata$Cost))[CWSdata$Cost]
CWSdata$Date <- dmy(CWSdata$Date)

str(CWSdata)
#> 'data.frame':    28 obs. of  14 variables:
#>  $ UnitName     : Factor w/ 5 levels "Unit 0","Unit 1",..: 1 2 2 2 2 2 2 2 2 3 ...
#>  $ UnitSize1    : Factor w/ 4 levels "DN100","DN125",..: 4 1 1 1 1 1 1 1 1 2 ...
#>  $ UnitSize2    : Factor w/ 5 levels "126-160 m3/h",..: 2 3 3 3 3 3 3 3 3 4 ...
#>  $ Year         : int  2014 2011 2011 2012 2014 2014 2014 2014 2015 2011 ...
#>  $ Date         : Date, format: "2014-10-29" "2011-08-26" ...
#>  $ ProjectName  : Factor w/ 28 levels "Project A","Project AA",..: 1 3 5 6 7 8 9 10 11 12 ...
#>  $ ProjectNumber: Factor w/ 28 levels "C.0123/401","C.0123/409",..: 4 6 10 28 15 18 21 23 25 1 ...
#>  $ MEMS         : Factor w/ 2 levels "","x": 2 2 2 2 2 2 1 2 2 2 ...
#>  $ SEMS         : Factor w/ 2 levels "","x": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ FEMS         : Factor w/ 3 levels "","(fu)","x": 1 1 1 1 1 1 3 1 1 1 ...
#>  $ Demi         : Factor w/ 2 levels "","x": 1 1 1 1 1 1 1 1 2 1 ...
#>  $ Flow         : num  24 60 60 45 48 60 60 60 60 80 ...
#>  $ Manufacturer : Factor w/ 2 levels "Cip","Mey": 2 1 1 1 1 1 2 2 1 2 ...
#>  $ Cost         : num  55950 62195 66891 58215 56035 ...

plot(CWSdata$Flow, CWSdata$Cost)

Definition of Units


# Unit 0
Unit0min <- 20; Unit0max <- 40
datUnit0 <- filter(CWSdata, UnitName == "Unit 0")
#> Warning: package 'bindrcpp' was built under R version 3.3.3
meanUnit0 <- mean(datUnit0$Cost)

# Unit 1 - Bsp.: 41 < flow < 65
Unit1min <- 41; Unit1max <- 65
datUnit1 <- filter(CWSdata, UnitName == "Unit 1")
meanUnit1 <- mean(datUnit1$Cost)

# Unit 2a - Bsp.: 66 < flow < 90
Unit2amin <- 66; Unit2amax <- 90
datUnit2a <- filter(CWSdata, UnitName == "Unit 2a")
meanUnit2a <- mean(datUnit2a$Cost)


# Unit 2b - Bsp.: 91 < flow < 125
Unit2bmin <- 91; Unit2bmax <- 125
datUnit2b <- filter(CWSdata, UnitName == "Unit 2b")
meanUnit2b <- mean(datUnit2b$Cost)


# Unit 3a - Bsp.: 126 < flow < 160
Unit3amin <- 126; Unit3amax <- 160
datUnit3a <- filter(CWSdata, UnitName == "Unit 3a")
meanUnit3a <- mean(datUnit3a$Cost)


p <- ggplot() + 
        
        # Average per Units
        geom_segment(aes(x = Unit0min, xend = Unit0max, y = meanUnit0, yend = meanUnit0)) +
        geom_segment(aes(x = Unit1min, xend = Unit1max, y = meanUnit1, yend = meanUnit1)) +
        geom_segment(aes(x = Unit2amin, xend = Unit2amax, y = meanUnit2a, yend = meanUnit2a)) +
        geom_segment(aes(x = Unit2bmin, xend = Unit2bmax, y = meanUnit2b, yend = meanUnit2b)) +
        geom_segment(aes(x = Unit3amin, xend = Unit3amax, y = meanUnit3a, yend = meanUnit3a)) +
        # Rectangles Ã¼er Units
        geom_rect(data = data.frame(xmin=Unit0min, xmax=Unit0max, ymin=0, ymax=meanUnit0), 
                  aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), 
                  fill="red", alpha=.2) +
        geom_rect(data = data.frame(xmin=Unit1min, xmax=Unit1max, ymin=0, ymax=meanUnit1), 
                  aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), 
                  fill="green", alpha=.2) + 
        geom_rect(data = data.frame(xmin=Unit2amin, xmax=Unit2amax, ymin=0, ymax=meanUnit2a), 
                  aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), 
                  fill="blue", alpha=.2) +
        geom_rect(data = data.frame(xmin=Unit2bmin, xmax=Unit2bmax, ymin=0, ymax=meanUnit2b), 
                  aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), 
                  fill="yellow", alpha=.2) +
        geom_rect(data = data.frame(xmin=Unit3amin, xmax=Unit3amax, ymin=0, ymax=meanUnit3a), 
                  aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), 
                  fill="orange", alpha=.2) +        
        # Points at the END here
        geom_point(data = CWSdata, aes(Flow, Cost, colour = factor(Manufacturer))) + 
        # Formating
        labs(title = "CWS Units - Average Prices",
             x=expression(Flow~m^{3}/h),
             y=expression(Price))

p

# p_ggplotly <- ggplotly(p)
# p_ggplotly

Clustering

If we cluster the data w.r.t. to the prices (and neglect the unit-definition), we should get a similar aggregation. If yes, the unit definition and the corresponding prices are make sense. If not, one might think about re-defining the units (change max. and min. flow of the units).

dclust <- CWSdata %>% dist %>% hclust # data.frame(x = CWSdata$Flow, y = CWSdata$Cost)
plot(dclust)

# We need the labesl as character
CWSdata$ProjectNumber <- as.character(CWSdata$ProjectNumber)
datframe <- data.frame(x = CWSdata$Flow, y = CWSdata$Cost)

require(ape)
distdat <- dist(datframe)
hcdat   <- hclust(distdat)
hcdat$labels <- CWSdata$ProjectNumber # That's how to get the labeling in. Took me a while.
phylodat <- as.phylo(hcdat)



p1 <- plot(phylodat, cex = 0.6, label.offset = 0.5)

plot(phylodat, type = "unrooted", cex = 0.6,
     no.margin = TRUE)

Coloring the cluster - with the pre-defined 5 Unit Types


require("RColorBrewer")
# display.brewer.all() # Display all available color schemes

colors <- brewer.pal(5,"Set1")

#colors = c("red", "blue", "green", "black")
clus5 = cutree(hcdat, 5)
plot(as.phylo(hcdat), type = "fan", tip.color = colors[clus5],
     label.offset = 1, cex = 0.7)

k-mean

kcl <- (kmeans(datframe,5)) # 5 Unit types

datframe$cluster <- factor(kcl$cluster)
centers <- as.data.frame(kcl$centers)

require(ggplot2)

ggplot(data=datframe, aes(x=x, y=y, color=cluster)) + 
        geom_point() + 
        geom_point(data=centers, aes(x=x,y=y, color=colors)) +
        geom_point(data=centers, aes(x=x,y=y, color=colors), size=52, alpha=.2) +
        theme(legend.position="none")

Conclusion

WORK IN PROGRESS
To be done is: to find out if the clustering corresponds to how the Units have been pre-defined.