Clustering Hierarki Top Player

Package

library(tidyverse)  # data manipulation

## Warning: package 'tidyverse' was built under R version 3.5.3

## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.1     v purrr   0.3.2
## v tibble  2.1.1     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.5.3

## Warning: package 'tibble' was built under R version 3.5.3

## Warning: package 'tidyr' was built under R version 3.5.3

## Warning: package 'readr' was built under R version 3.5.3

## Warning: package 'purrr' was built under R version 3.5.3

## Warning: package 'dplyr' was built under R version 3.5.3

## Warning: package 'stringr' was built under R version 3.5.3

## Warning: package 'forcats' was built under R version 3.5.3

## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(cluster)    # clustering algorithms

## Warning: package 'cluster' was built under R version 3.5.3

library(factoextra) # clustering visualization

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

library(dendextend) # for comparing two dendrograms

## Warning: package 'dendextend' was built under R version 3.5.3

## 
## ---------------------
## Welcome to dendextend version 1.12.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(corrplot)   # for correlation visualize

## Warning: package 'corrplot' was built under R version 3.5.3

## corrplot 0.84 loaded

library(plotly)

## Warning: package 'plotly' was built under R version 3.5.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(DT)

## Warning: package 'DT' was built under R version 3.5.3

Import Data

# Load the player table from the local comma-separated file (header row expected).
df <- read.csv("D:\\Bahan\\Medium\\Cluster Hierarcy\\data.csv")

Select Top Players (Overall >= 85)

# Keep players whose Overall rating is at least 85, then retain Name, Position
# and the columns at positions 55 through 88.
# (base::subset() could be used for the row filter instead of filter().)
# NOTE(review): 55:88 presumably covers the skill-attribute columns; confirm
# against the CSV header, since positional selection breaks if columns move.
status_data_top <- select(
  filter(df, Overall >= 85),
  Name, Position, 55:88
)

Korelasi antara variabel

# Correlation plot (lower triangle only) over all remaining columns once the
# Name and Position identifier columns are dropped.
corrplot(
  cor(select(status_data_top, -Name, -Position)),
  type = "lower"
)

# Same lower-triangle correlation plot on a reduced variable set: identifier
# columns, every column whose name contains "GK", and a hand-picked list of
# attributes are removed first.
# NOTE(review): what remains is presumably the attribute set used for the
# clustering step; confirm against the column list used there.
reduced_attributes <- select(
  status_data_top,
  -Name, -Position, -contains("GK"),
  -Marking, -SlidingTackle, -StandingTackle,
  -Positioning, -Penalties, -Composure,
  -LongPassing, -LongShots, -ShotPower,
  -BallControl, -Volleys, -Crossing, -Finishing
)
corrplot(cor(reduced_attributes), type = "lower")

Clustering Hierarki

# Build the feature table used for clustering: the same top-player subset
# (Overall >= 85) restricted to 16 attribute columns.
data_for_dist <- df %>%
  filter(Overall >= 85) %>%
  select(HeadingAccuracy, ShortPassing, Dribbling, Curve, FKAccuracy,
         Acceleration, SprintSpeed, Agility, Reactions, Balance, Jumping,
         Stamina, Strength, Aggression, Interceptions, Vision)

# Label each row with the player's name so dendrogram leaves are readable.
# status_data_top is produced by the same Overall >= 85 filter, so its Name
# column lines up row-for-row; take the whole vector rather than a fixed-size
# slice so the script keeps working when the number of top players changes.
stopifnot(nrow(data_for_dist) == nrow(status_data_top))
rownames(data_for_dist) <- status_data_top$Name

# Pairwise Euclidean distances between the rows of the feature table.
# NOTE(review): the variable `dist` shares its name with stats::dist(); calls
# still resolve to the function, but a distinct name would be clearer.
dist <- stats::dist(x = data_for_dist, method = "euclidean")
# Agglomerative hierarchical clustering with complete linkage.
hc1 <- stats::hclust(d = dist, method = "complete")
# Draw the dendrogram; a negative hang makes the labels hang down from 0.
plot(x = hc1, hang = -1, cex = 0.6)

Alternatif fungsi menggunakan Agnes

# Complete-linkage agglomerative clustering via agnes(), which additionally
# stores the agglomerative coefficient in the fitted object.
hc2 <- agnes(x = data_for_dist, method = "complete")
# Show the agglomerative coefficient of this fit.
hc2[["ac"]]

## [1] 0.8526465

# Linkage methods to compare; each element is named after itself so the
# resulting coefficient vector is labelled by method.
m <- c(
  average = "average",
  single = "single",
  complete = "complete",
  ward = "ward"
)
# Agglomerative coefficient of agnes() on data_for_dist for linkage method `x`.
ac <- function(x) {
  fit <- agnes(data_for_dist, method = x)
  fit$ac
}
# One coefficient per method, returned as a named numeric vector.
map_dbl(m, ac)

##   average    single  complete      ward 
## 0.8183192 0.7710234 0.8526465 0.9573419

Clustering Hierarki dengan Metode Terbaik (Ward)

# Fit agnes() with Ward linkage and draw its dendrogram; a negative hang makes
# the leaf labels hang down from 0.
hc3 <- agnes(x = data_for_dist, method = "ward")
pltree(x = hc3, main = "Dendrogram of agnes", hang = -1, cex = 0.6)

Menentukan Cluster Optimal

# Three diagnostics for choosing the number of clusters, each using hcut as the
# clustering function. The FUNcluster argument is written in full rather than
# relying on partial matching of `FUN`.

# Elbow method: within-cluster sum of squares for each candidate k.
fviz_nbclust(data_for_dist, FUNcluster = hcut, method = "wss")

# Average silhouette width for each candidate k.
fviz_nbclust(data_for_dist, FUNcluster = hcut, method = "silhouette")

# Gap statistic for k = 1..10. clusGap() compares against B simulated reference
# data sets, so the RNG seed is fixed to make the figure reproducible.
# NOTE(review): nstart is forwarded to hcut through `...`; it looks like a
# kmeans() option and is presumably ignored here. Confirm and drop if so.
set.seed(123)
gap_stat <- clusGap(data_for_dist, FUNcluster = hcut, nstart = 25,
                    K.max = 10, B = 50)
fviz_gap_stat(gap_stat)

# Cut the Ward tree into three clusters; the result holds one cluster id per
# clustered row.
# NOTE(review): this variable shares its name with the cutree() function.
cutree <- cutree(hc3, k = 3)
# One row per player: the name carried by the cut result plus its cluster id.
clust_data <- data.frame(
  Name = names(cutree),
  cluster = cutree,
  row.names = NULL
)
# Interactive table of the assignments.
datatable(clust_data)

# Put the clustering features side by side with Name and cluster (the two
# tables are combined column-wise, so rows are matched by position).
only_status_data_top <- data.frame(data_for_dist, clust_data)
# Reshape to long format: one row per player/attribute pair, keeping cluster
# and Name as identifier columns.
plot_data <- tidyr::gather(
  only_status_data_top,
  key = "param", value = "value",
  -cluster, -Name
)
# Violin plot of every attribute's distribution, one facet per cluster,
# rendered as an interactive plotly widget.
g <- ggplot(
  plot_data,
  aes(x = param, y = value, group = param, fill = param)
) +
  geom_violin() +
  facet_wrap(~cluster)
ggplotly(g)

Interpret Clusters

Cluster 1: Keseluruhan skill tinggi
Cluster 2: Cluster untuk para penjaga gawang
Cluster 3: pemain yang hampir mempunyai skill tinggi namun mempunyai balance yang cukup rendah

Cluster mana yang sesuai dengan posisi yang ada

# Attach each player's Position to the cluster assignments. The column is named
# status_data_top.Position explicitly, matching the name data.frame() derives
# from the unnamed `status_data_top$Position` argument.
position_data <- data.frame(
  clust_data,
  status_data_top.Position = status_data_top$Position
)
# Interactive table of name, cluster and position.
datatable(position_data)

# Count players for every (cluster, position) combination. group_by() already
# restricts the summary to the grouping columns, so no prior select() is
# needed. summarise() drops only the last grouping level, so ungroup() is
# called to avoid leaving plot_data grouped by cluster.
plot_data <- position_data %>%
  group_by(cluster, status_data_top.Position) %>%
  summarise(count = n()) %>%
  ungroup()
# Stacked bars: one bar per cluster, segments coloured by position; geom_col()
# plots the precomputed counts as bar heights.
g <- ggplot(
  data = plot_data,
  aes(x = cluster, y = count, fill = status_data_top.Position)
) +
  geom_col(position = "stack")
ggplotly(g)

Clustering Berdasarkan Posisi

Cluster 1: Keseluruhan skill tinggi, dan posisinya adalah Penyerang dan Gelandang (FW dan MF)
Cluster 2: Cluster untuk para penjaga gawang, dan posisinya adalah GK
Cluster 3: pemain yang hampir mempunyai skill tinggi namun mempunyai balance yang cukup rendah, dan posisinya adalah Bek (DF)

Clustering Hierarki Top Player

Alfazrin Banapon

June 19, 2019

Package

Import Data

Select Top Players (Overall >= 85)

Korelasi antara variabel

Clustering Hierarki

Alternatif fungsi menggunakan Agnes

Clustering Hierarki dengan Metode Terbaik (Ward)

Menentukan Cluster Optimal

Interpret Clusters

Cluster mana yang sesuai dengan posisi yang ada

Clustering Berdasarkan Posisi