Package
library(tidyverse) # data manipulation
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cluster) # clustering algorithms
## Warning: package 'cluster' was built under R version 3.5.3
library(factoextra) # clustering visualization
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(dendextend) # for comparing two dendrograms
## Warning: package 'dendextend' was built under R version 3.5.3
##
## ---------------------
## Welcome to dendextend version 1.12.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(corrplot) # for correlation visualize
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
## Warning: package 'DT' was built under R version 3.5.3
Import Data
# Load the raw player table from a comma-separated file (header row expected).
df <- read.csv("D:\\Bahan\\Medium\\Cluster Hierarcy\\data.csv")
Select Top Players (Overall >= 85)
# Keep only players rated 85 or higher overall, then retain the Name and
# Position columns plus the columns at positions 55 through 88 of `df`.
# (subset() could be used here instead of filter().)
status_data_top <- select(
  filter(df, Overall >= 85),
  Name, Position, 55:88
)
Korelasi antara variabel
# Correlation plot (lower triangle) over every column of status_data_top
# except the Name and Position columns.
corrplot(
  cor(select(status_data_top, -Name, -Position)),
  type = "lower"
)

# Second correlation plot on a reduced column set: the identifier columns,
# every column whose name contains "GK", and the attributes listed below
# are removed before computing the correlation matrix.
status_data_top %>%
  select(
    -c(
      Name, Position, Marking, SlidingTackle, StandingTackle, Positioning,
      Penalties, Composure, LongPassing, LongShots, ShotPower, BallControl,
      Volleys, Crossing, Finishing
    ),
    -contains("GK")
  ) %>%
  cor() %>%
  corrplot(type = "lower")

Clustering Hierarki
# Build the attribute table used for clustering: players with Overall >= 85
# and the sixteen attribute columns selected below.
data_for_dist <- df %>%
  filter(Overall >= 85) %>%
  select(
    HeadingAccuracy, ShortPassing, Dribbling, Curve, FKAccuracy,
    Acceleration, SprintSpeed, Agility, Reactions, Balance, Jumping,
    Stamina, Strength, Aggression, Interceptions, Vision
  )

# Label each row with the player's name. status_data_top is produced by the
# same `Overall >= 85` filter on `df`, so it has one name per row here; using
# the whole vector (instead of a hard-coded 1:110 slice) keeps this correct
# when the number of qualifying players changes.
rownames(data_for_dist) <- as.character(status_data_top$Name)

# Dissimilarity matrix (Euclidean). Stored under its own name so the result
# does not shadow the dist() function.
dist_mat <- dist(data_for_dist, method = "euclidean")

# Hierarchical clustering using complete linkage
hc1 <- hclust(dist_mat, method = "complete")

# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)

Alternatif fungsi menggunakan Agnes
# Agglomerative clustering with agnes(), complete linkage
hc2 <- agnes(data_for_dist, method = "complete")
# Print the agglomerative coefficient stored on the fit
hc2[["ac"]]
## [1] 0.8526465
# Linkage methods to compare; the vector is named after its own values so
# the mapped result below is labelled by method.
m <- c("average", "single", "complete", "ward")
names(m) <- m
# Fit agnes() on data_for_dist with linkage method `x` and return the
# agglomerative coefficient of that fit.
ac <- function(x) {
  fit <- agnes(data_for_dist, method = x)
  fit$ac
}
map_dbl(m, ac)
## average single complete ward
## 0.8183192 0.7710234 0.8526465 0.9573419
Clustering Hierarki dengan Metode Terbaik (Ward)
# Refit agnes() with the "ward" method and draw its dendrogram.
hc3 <- agnes(x = data_for_dist, method = "ward")
pltree(hc3, main = "Dendrogram of agnes", cex = 0.6, hang = -1)

Menentukan Cluster Optimal
# Number-of-clusters diagnostics using hcut as the clustering function.
# The clustering-function argument is spelled out as `FUNcluster` rather than
# relying on partial matching of `FUN`.

# Total within-cluster sum of squares (method = "wss")
fviz_nbclust(data_for_dist, FUNcluster = hcut, method = "wss")

# Average silhouette (method = "silhouette")
fviz_nbclust(data_for_dist, FUNcluster = hcut, method = "silhouette")

# Gap statistic for up to K.max = 10 clusters with B = 50 reference samples.
# NOTE(review): `nstart = 25` is forwarded to hcut() through `...`; it looks
# like a kmeans-style argument and presumably has no effect here -- confirm
# before removing it.
gap_stat <- clusGap(data_for_dist, FUNcluster = hcut, nstart = 25, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)

# Cut the Ward tree into 3 clusters. The result gets its own name so it does
# not shadow the cutree() function.
cluster_id <- cutree(hc3, k = 3)

# One row per player: the label carried by the cut result and its cluster.
clust_data <- data.frame(
  Name = names(cluster_id),
  cluster = cluster_id,
  row.names = NULL
)
datatable(clust_data)

# Attach the cluster assignment to the attribute table, then reshape to long
# form (one row per player/attribute pair) for plotting.
only_status_data_top <- data.frame(data_for_dist, clust_data)
plot_data <- tidyr::gather(only_status_data_top, param, value, -cluster, -Name)

# Violin plot of every attribute's values, one panel per cluster.
g <- ggplot(
  data = plot_data,
  aes(x = param, y = value, group = param, fill = param)
) +
  geom_violin() +
  facet_wrap(~cluster)
ggplotly(g)
Interpret Clusters
- Cluster 1: Keseluruhan skill tinggi
- Cluster 2: Cluster untuk para penjaga gawang
- Cluster 3: pemain yang hampir mempunyai skill tinggi namun mempunyai balance yang cukup rendah
Cluster mana yang sesuai dengan posisi yang ada
# Pair each player's cluster with their position. data.frame() names the
# unnamed second argument `status_data_top.Position`.
position_data <- data.frame(clust_data, status_data_top$Position)
datatable(position_data)

# Count players per (cluster, position) pair. ungroup() is applied so the
# summary table does not stay grouped by cluster.
plot_data <- position_data %>%
  group_by(cluster, status_data_top.Position) %>%
  summarise(count = n()) %>%
  ungroup()

# Stacked bar chart: one bar per cluster, segments coloured by position.
g <- ggplot(
  data = plot_data,
  aes(x = cluster, y = count, fill = status_data_top.Position)
) +
  geom_bar(stat = "identity", position = "stack")
ggplotly(g)
Clustering Berdasarkan Posisi
- Cluster 1: Keseluruhan skill tinggi, dengan posisi penyerang dan gelandang (FW dan MF)
- Cluster 2: Cluster untuk para penjaga gawang, dengan posisi GK
- Cluster 3: pemain yang hampir mempunyai skill tinggi namun mempunyai balance yang cukup rendah, dengan posisi bek (DF)