WORK IN PROGRESS
A small exercise on how to handle a data set at work:
- Testing the package plotly. - Dendrogram
- k-mean clustering of the data
- a few variation of it
setwd("C:/Users/marco/OneDrive/062 R/Projects R/004 CWS")
require(dplyr)
require(ggplot2)
require(plotly)
require(lubridate)files_loc <- list.files("./010_data/", full.names = TRUE)
CWSdata <- read.csv2("./010_data/Z_PUR_ORDER_175150.csv",
header = TRUE,
skip = 0)
head(CWSdata)
#> UnitName UnitSize1 UnitSize2 Year Date ProjectName ProjectNumber
#> 1 Unit 0 DN80 20-40 m3/h 2014 29.10.2014 Project A C.1234/404
#> 2 Unit 1 DN100 41-65 m3/h 2011 26.08.2011 Project B C.2345/401
#> 3 Unit 1 DN100 41-65 m3/h 2011 30.06.2011 Project C C.3456/401
#> 4 Unit 1 DN100 41-65 m3/h 2012 08.02.2012 Project D M.4567/402
#> 5 Unit 1 DN100 41-65 m3/h 2014 17.02.2014 Project E C.5678/402
#> 6 Unit 1 DN100 41-65 m3/h 2014 17.01.2014 Project F C.6789/401
#> MEMS SEMS FEMS Demi Flow Manufacturer Cost
#> 1 x 24 Mey 55950.00
#> 2 x 60 Cip 62195.00
#> 3 x 60 Cip 66891.00
#> 4 x 45 Cip 58215.00
#> 5 x 48 Cip 56035.00
#> 6 x 60 Cip 60544.00
dim(CWSdata)
#> [1] 28 14CWSdata$Flow <- as.numeric(CWSdata$Flow)
CWSdata$Cost <- as.numeric(levels(CWSdata$Cost))[CWSdata$Cost]
CWSdata$Date <- dmy(CWSdata$Date)
str(CWSdata)
#> 'data.frame': 28 obs. of 14 variables:
#> $ UnitName : Factor w/ 5 levels "Unit 0","Unit 1",..: 1 2 2 2 2 2 2 2 2 3 ...
#> $ UnitSize1 : Factor w/ 4 levels "DN100","DN125",..: 4 1 1 1 1 1 1 1 1 2 ...
#> $ UnitSize2 : Factor w/ 5 levels "126-160 m3/h",..: 2 3 3 3 3 3 3 3 3 4 ...
#> $ Year : int 2014 2011 2011 2012 2014 2014 2014 2014 2015 2011 ...
#> $ Date : Date, format: "2014-10-29" "2011-08-26" ...
#> $ ProjectName : Factor w/ 28 levels "Project A","Project AA",..: 1 3 5 6 7 8 9 10 11 12 ...
#> $ ProjectNumber: Factor w/ 28 levels "C.0123/401","C.0123/409",..: 4 6 10 28 15 18 21 23 25 1 ...
#> $ MEMS : Factor w/ 2 levels "","x": 2 2 2 2 2 2 1 2 2 2 ...
#> $ SEMS : Factor w/ 2 levels "","x": 1 1 1 1 1 1 1 1 1 1 ...
#> $ FEMS : Factor w/ 3 levels "","(fu)","x": 1 1 1 1 1 1 3 1 1 1 ...
#> $ Demi : Factor w/ 2 levels "","x": 1 1 1 1 1 1 1 1 2 1 ...
#> $ Flow : num 24 60 60 45 48 60 60 60 60 80 ...
#> $ Manufacturer : Factor w/ 2 levels "Cip","Mey": 2 1 1 1 1 1 2 2 1 2 ...
#> $ Cost : num 55950 62195 66891 58215 56035 ...plot(CWSdata$Flow, CWSdata$Cost)
# Unit 0
Unit0min <- 20; Unit0max <- 40
datUnit0 <- filter(CWSdata, UnitName == "Unit 0")
#> Warning: package 'bindrcpp' was built under R version 3.3.3
meanUnit0 <- mean(datUnit0$Cost)
# Unit 1 - Bsp.: 41 < flow < 65
Unit1min <- 41; Unit1max <- 65
datUnit1 <- filter(CWSdata, UnitName == "Unit 1")
meanUnit1 <- mean(datUnit1$Cost)
# Unit 2a - Bsp.: 66 < flow < 90
Unit2amin <- 66; Unit2amax <- 90
datUnit2a <- filter(CWSdata, UnitName == "Unit 2a")
meanUnit2a <- mean(datUnit2a$Cost)
# Unit 2b - Bsp.: 91 < flow < 125
Unit2bmin <- 91; Unit2bmax <- 125
datUnit2b <- filter(CWSdata, UnitName == "Unit 2b")
meanUnit2b <- mean(datUnit2b$Cost)
# Unit 3a - Bsp.: 126 < flow < 160
Unit3amin <- 126; Unit3amax <- 160
datUnit3a <- filter(CWSdata, UnitName == "Unit 3a")
meanUnit3a <- mean(datUnit3a$Cost)
p <- ggplot() +
# Average per Units
geom_segment(aes(x = Unit0min, xend = Unit0max, y = meanUnit0, yend = meanUnit0)) +
geom_segment(aes(x = Unit1min, xend = Unit1max, y = meanUnit1, yend = meanUnit1)) +
geom_segment(aes(x = Unit2amin, xend = Unit2amax, y = meanUnit2a, yend = meanUnit2a)) +
geom_segment(aes(x = Unit2bmin, xend = Unit2bmax, y = meanUnit2b, yend = meanUnit2b)) +
geom_segment(aes(x = Unit3amin, xend = Unit3amax, y = meanUnit3a, yend = meanUnit3a)) +
# Rectangles üer Units
geom_rect(data = data.frame(xmin=Unit0min, xmax=Unit0max, ymin=0, ymax=meanUnit0),
aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
fill="red", alpha=.2) +
geom_rect(data = data.frame(xmin=Unit1min, xmax=Unit1max, ymin=0, ymax=meanUnit1),
aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
fill="green", alpha=.2) +
geom_rect(data = data.frame(xmin=Unit2amin, xmax=Unit2amax, ymin=0, ymax=meanUnit2a),
aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
fill="blue", alpha=.2) +
geom_rect(data = data.frame(xmin=Unit2bmin, xmax=Unit2bmax, ymin=0, ymax=meanUnit2b),
aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
fill="yellow", alpha=.2) +
geom_rect(data = data.frame(xmin=Unit3amin, xmax=Unit3amax, ymin=0, ymax=meanUnit3a),
aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
fill="orange", alpha=.2) +
# Points at the END here
geom_point(data = CWSdata, aes(Flow, Cost, colour = factor(Manufacturer))) +
# Formating
labs(title = "CWS Units - Average Prices",
x=expression(Flow~m^{3}/h),
y=expression(Price))
p# p_ggplotly <- ggplotly(p)
# p_ggplotlyIf we cluster the data w.r.t. to the prices (and neglect the unit-definition), we should get a similar aggregation. If yes, the unit definition and the corresponding prices are make sense. If not, one might think about re-defining the units (change max. and min. flow of the units).
dclust <- CWSdata %>% dist %>% hclust # data.frame(x = CWSdata$Flow, y = CWSdata$Cost)
plot(dclust)# We need the labesl as character
CWSdata$ProjectNumber <- as.character(CWSdata$ProjectNumber)
datframe <- data.frame(x = CWSdata$Flow, y = CWSdata$Cost)
require(ape)
distdat <- dist(datframe)
hcdat <- hclust(distdat)
hcdat$labels <- CWSdata$ProjectNumber # That's how to get the labeling in. Took me a while.
phylodat <- as.phylo(hcdat)
p1 <- plot(phylodat, cex = 0.6, label.offset = 0.5)plot(phylodat, type = "unrooted", cex = 0.6,
no.margin = TRUE)
require("RColorBrewer")
# display.brewer.all() # Display all available color schemes
colors <- brewer.pal(5,"Set1")
#colors = c("red", "blue", "green", "black")
clus5 = cutree(hcdat, 5)
plot(as.phylo(hcdat), type = "fan", tip.color = colors[clus5],
label.offset = 1, cex = 0.7)kcl <- (kmeans(datframe,5)) # 5 Unit types
datframe$cluster <- factor(kcl$cluster)
centers <- as.data.frame(kcl$centers)
require(ggplot2)
ggplot(data=datframe, aes(x=x, y=y, color=cluster)) +
geom_point() +
geom_point(data=centers, aes(x=x,y=y, color=colors)) +
geom_point(data=centers, aes(x=x,y=y, color=colors), size=52, alpha=.2) +
theme(legend.position="none")WORK IN PROGRESS
To be done is: to find out if the clustering corresponds to how the Units have been pre-defined.