Package

library(tidyverse)  # data manipulation
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1     v purrr   0.3.2
## v tibble  2.1.1     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(cluster)    # clustering algorithms
## Warning: package 'cluster' was built under R version 3.5.3
library(factoextra) # clustering visualization
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(dendextend) # for comparing two dendrograms
## Warning: package 'dendextend' was built under R version 3.5.3
## 
## ---------------------
## Welcome to dendextend version 1.12.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(corrplot)   # for correlation visualize  
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(plotly)     
## Warning: package 'plotly' was built under R version 3.5.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(DT)
## Warning: package 'DT' was built under R version 3.5.3

Import Data

# Load the player data set. The file is comma-separated, so read.csv() parses
# it the same way as read.delim(..., sep = ",").
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
df <- read.csv("D:\\Bahan\\Medium\\Cluster Hierarcy\\data.csv")

Select Top Players (Overall >= 85)

# Keep players whose Overall rating is at least 85 (subset() could be used
# instead of filter()), then retain Name, Position and columns 55 through 88.
# NOTE(review): 55:88 is a positional selection -- presumably the skill
# attributes; confirm against the column order of the input file.
status_data_top <- select(
  filter(df, Overall >= 85),
  Name, Position, 55:88
)

Korelasi antara variabel

# Correlation plot (lower triangle) over every column except Name and Position.
corrplot(
  cor(select(status_data_top, -Name, -Position)),
  type = "lower"
) # all

# Same correlation plot after also dropping every column whose name contains
# "GK" and the attributes listed below.
corrplot(
  cor(
    select(
      status_data_top,
      -Name, -Position, -contains("GK"), -Marking, -SlidingTackle,
      -StandingTackle, -Positioning, -Penalties, -Composure, -LongPassing,
      -LongShots, -ShotPower, -BallControl, -Volleys, -Crossing, -Finishing
    )
  ),
  type = "lower"
)

Clustering Hierarki

# Clustering input: the 16 selected attributes of every player with
# Overall >= 85.
data_for_dist <- df %>%
  filter(Overall >= 85) %>%
  select(HeadingAccuracy, ShortPassing, Dribbling, Curve, FKAccuracy,
         Acceleration, SprintSpeed, Agility, Reactions, Balance, Jumping,
         Stamina, Strength, Aggression, Interceptions, Vision)
# Label each row with the player's name. status_data_top is built from df with
# the same `Overall >= 85` filter, so its rows line up with data_for_dist.
# Using the whole Name column (instead of a hard-coded 1:110) keeps this
# working when the number of qualifying players changes; the check below makes
# any misalignment fail loudly.
stopifnot(nrow(data_for_dist) == nrow(status_data_top))
rownames(data_for_dist) <- status_data_top$Name
# Dissimilarity matrix (Euclidean distance between players).
# The result is stored under the name `dist`; the call dist(...) still resolves
# to the function because R skips non-function bindings when looking up a call.
dist <- dist(data_for_dist, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(dist, method = "complete")
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)

Alternatif fungsi menggunakan Agnes

# Agglomerative clustering with agnes(), complete linkage
hc2 <- agnes(x = data_for_dist, method = "complete")
# Print the agglomerative coefficient stored on the fit
hc2[["ac"]]
## [1] 0.8526465
# Linkage methods to assess; the vector is named by its own values so that the
# result of map_dbl() below is labelled by method.
m <- c("average", "single", "complete", "ward")
names(m) <- m
# Compute the agglomerative coefficient of an agnes() fit.
#   x    : linkage method name, passed to agnes() as `method`
#   data : data to cluster; defaults to the global data_for_dist so existing
#          one-argument calls keep working
# Returns the `ac` component of the fitted agnes object.
ac <- function(x, data = data_for_dist) {
  agnes(data, method = x)$ac
}
map_dbl(m, ac)
##   average    single  complete      ward 
## 0.8183192 0.7710234 0.8526465 0.9573419

Clustering Hierarki dengan Metode Terbaik (Ward)

# Refit with Ward's method and draw the resulting tree.
hc3 <- agnes(x = data_for_dist, method = "ward")
pltree(hc3, main = "Dendrogram of agnes", hang = -1, cex = 0.6)

Menentukan Cluster Optimal

# Within-cluster sum of squares ("wss") for a range of cluster counts
fviz_nbclust(data_for_dist, FUNcluster = hcut, method = "wss")

# Average silhouette method
fviz_nbclust(data_for_dist, FUNcluster = hcut, method = "silhouette")

# Gap statistic for k = 1..10 with 50 bootstrap samples.
# NOTE(review): nstart = 25 is forwarded to hcut through `...`; presumably it
# has no effect for hierarchical clustering -- confirm.
gap_stat <- clusGap(data_for_dist, FUNcluster = hcut, K.max = 10, B = 50,
                    nstart = 25)
fviz_gap_stat(gap_stat)

# Cut the Ward tree into three clusters. The result is stored under the name
# `cutree`; the call cutree(...) still resolves to the function.
cutree <- cutree(hc3, k = 3)
# One row per player: name (taken from the names of the membership vector) and
# assigned cluster.
clust_data <- data.frame(
  Name = names(cutree),
  cluster = cutree,
  row.names = NULL
)
datatable(clust_data)
# Attribute values side by side with the cluster assignment
only_status_data_top <- data.frame(data_for_dist, clust_data)
# Long format: one row per player/attribute pair, keeping cluster and Name
plot_data <- only_status_data_top %>%
  tidyr::gather(param, value, -cluster, -Name)
# Violin plot of every attribute, one panel per cluster
g <- ggplot(
  data = plot_data,
  aes(x = param, y = value, group = param, fill = param)
) +
  geom_violin() +
  facet_wrap(~cluster)
ggplotly(g)

Interpret Clusters

Cluster mana yang sesuai dengan posisi yang ada?

# Attach each player's position to the cluster assignment. data.frame() names
# the new column `status_data_top.Position`.
position_data <- data.frame(clust_data, status_data_top$Position)
datatable(position_data)
# Count players per (cluster, position) pair. summarise() drops only the last
# grouping level, so ungroup() is called to hand a plain, ungrouped table to
# the plotting code.
plot_data <- position_data %>%
  group_by(cluster, status_data_top.Position) %>%
  summarise(count = n()) %>%
  ungroup()
# Stacked bar chart: one bar per cluster, filled by position
g <- ggplot(
  data = plot_data,
  aes(x = cluster, y = count, fill = status_data_top.Position)
) +
  geom_bar(stat = "identity", position = "stack")
ggplotly(g)

Clustering Berdasarkan Posisi