# Packages used throughout the analysis
library(dplyr); library(tidyr)         # %>% pipe, drop_na()
library(Hmisc)                         # rcorr()
library(factoextra); library(NbClust)  # get_dist(), fviz_*(), hkmeans(), NbClust()
data21 <- read.table("./players_21.csv", fill = TRUE, header = TRUE, sep = ",")
# Keep player information and the six skill ratings used later
data <- data21[, c(3, 5, 9, 13, 14, 15, 18, 34, 35, 36, 37, 38, 39)]
data[data == ''] <- NA  # recode empty strings as missing
# Dropping rows containing missing values
data <- data %>% drop_na()
#Convert character objects to integer objects / factors
data$short_name <- factor(data$short_name)
data$age <- as.integer(data$age)
data$overall <- as.integer(data$overall)
data$nationality <- factor(data$nationality)
data$potential <- as.integer(data$potential)
data$value_eur <- as.integer(data$value_eur)
data$preferred_foot <- factor(data$preferred_foot)
data$pace <- as.integer(data$pace)
data$shooting <- as.integer(data$shooting)
data$passing <- as.integer(data$passing)
data$dribbling <- as.integer(data$dribbling)
data$defending <- as.integer(data$defending)
data$physic <- as.integer(data$physic)
colnames(data) <- c("Name", "Age", "Nationality", "Overall", "Potential", "Value", "PreferredFoot", "Pace", "Shooting", "Passing", "Dribbling", "Defending", "Physic")
# set the seed for reproducibility
set.seed(2)
# randomly select 250 observations from data
mydata <- data[sample(nrow(data), 250), c(1:2,7:13)]
# view the first few rows of mydata
head(mydata)
## Name Age PreferredFoot Pace Shooting Passing Dribbling Defending
## 4806 M. Finotto 27 Left 82 62 57 65 35
## 5469 T. Lange 26 Left 67 49 52 59 58
## 3276 Aleix Coch 28 Right 64 31 44 44 68
## 3453 D. Davis 29 Right 74 65 66 68 68
## 690 Dyego Sousa 30 Right 57 77 61 74 30
## 6787 N. Bruneel 23 Right 66 30 47 50 59
## Physic
## 4806 67
## 5469 74
## 3276 69
## 3453 81
## 690 81
## 6787 54
Name: Name of a player
Age: Age of a player in years
PreferredFoot: The player's naturally preferred foot (left or right)
Pace: Pace rating (1-100)
Shooting: Shooting rating (1-100)
Passing: Passing rating (1-100)
Dribbling: Dribbling rating (1-100)
Defending: Defending rating (1-100)
Physic: Physicality rating (1-100)
# Standardization of skill ratings
mydata$Pace_z <- scale(mydata$Pace)
mydata$Shooting_z <- scale(mydata$Shooting)
mydata$Passing_z <- scale(mydata$Passing)
mydata$Dribbling_z <- scale(mydata$Dribbling)
mydata$Defending_z <- scale(mydata$Defending)
mydata$Physic_z <- scale(mydata$Physic)
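As a quick illustration of what scale() does here, each standardized rating should equal the manual z-score (value minus sample mean, divided by the sample standard deviation); a minimal check for one variable:
# Pace_z should match the manual z-score (x - mean(x)) / sd(x)
all.equal(as.numeric(mydata$Pace_z),
          (mydata$Pace - mean(mydata$Pace)) / sd(mydata$Pace))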
# Checking a few variances of the standardized cluster variables (in theory the variance should be exactly 1)
var(mydata$Pace_z)
## [,1]
## [1,] 1
var(mydata$Shooting_z)
## [,1]
## [1,] 1
# Checking correlation (we want low correlation between clustering variables)
rcorr(as.matrix(mydata[, c("Pace_z", "Shooting_z", "Passing_z", "Dribbling_z", "Defending_z", "Physic_z")]),
type = "pearson")
## Pace_z Shooting_z Passing_z Dribbling_z Defending_z Physic_z
## Pace_z 1.00 0.41 0.36 0.59 -0.31 -0.14
## Shooting_z 0.41 1.00 0.70 0.79 -0.39 0.05
## Passing_z 0.36 0.70 1.00 0.86 0.11 0.11
## Dribbling_z 0.59 0.79 0.86 1.00 -0.14 0.03
## Defending_z -0.31 -0.39 0.11 -0.14 1.00 0.47
## Physic_z -0.14 0.05 0.11 0.03 0.47 1.00
##
## n= 250
##
##
## P
## Pace_z Shooting_z Passing_z Dribbling_z Defending_z Physic_z
## Pace_z 0.0000 0.0000 0.0000 0.0000 0.0269
## Shooting_z 0.0000 0.0000 0.0000 0.0000 0.4181
## Passing_z 0.0000 0.0000 0.0000 0.0714 0.0787
## Dribbling_z 0.0000 0.0000 0.0000 0.0219 0.5936
## Defending_z 0.0000 0.0000 0.0714 0.0219 0.0000
## Physic_z 0.0269 0.4181 0.0787 0.5936 0.0000
rcorr(as.matrix(mydata[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")]),
type = "pearson")
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## Pace_z 1.00 0.41 0.36 -0.31 -0.14
## Shooting_z 0.41 1.00 0.70 -0.39 0.05
## Passing_z 0.36 0.70 1.00 0.11 0.11
## Defending_z -0.31 -0.39 0.11 1.00 0.47
## Physic_z -0.14 0.05 0.11 0.47 1.00
##
## n= 250
##
##
## P
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## Pace_z 0.0000 0.0000 0.0000 0.0269
## Shooting_z 0.0000 0.0000 0.0000 0.4181
## Passing_z 0.0000 0.0000 0.0714 0.0787
## Defending_z 0.0000 0.0000 0.0714 0.0000
## Physic_z 0.0269 0.4181 0.0787 0.0000
We can see that Dribbling correlates quite highly with the other cluster variables (r = 0.59-0.86 with Pace, Shooting and Passing). Therefore, the dribbling rating will not be taken into account when determining the clusters.
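To back this up numerically, one way (not part of the original output) is to look at each variable's average absolute correlation with the remaining cluster variables; Dribbling_z should come out highest:
# Average absolute correlation of each standardized rating with the others
z_all <- c("Pace_z", "Shooting_z", "Passing_z", "Dribbling_z", "Defending_z", "Physic_z")
cor_mat <- cor(as.matrix(mydata[, z_all]))
diag(cor_mat) <- NA
sort(rowMeans(abs(cor_mat), na.rm = TRUE), decreasing = TRUE)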
# Finding outliers: Euclidean distance of each player from the overall mean of the standardized Shooting, Passing, Defending and Physic ratings
mydata$Dissimilarity <- sqrt(mydata$Shooting_z^2 + mydata$Passing_z^2 + mydata$Defending_z^2 + mydata$Physic_z^2)
head(mydata[order(-mydata$Dissimilarity), c(1,16) ], 10)
## Name Dissimilarity
## 8347 Zang Yifeng 3.802735
## 8101 N. Lemoine 3.590697
## 2782 R. Insigne 3.563217
## 8399 Wang Zhenghao 3.544954
## 4523 Isi Ros 3.345089
## 7724 Y. Rodríguez 3.156435
## 8071 S. Mroczko 3.141745
## 8200 S. Olberkis 3.121189
## 8136 M. Hunt 3.047675
## 7063 K. Lewis-Potter 3.037110
hist(mydata$Dissimilarity,
breaks = 30,
xlab = "Dissimilarity",
main = "Histogram of Dissimilarities")
# Removing outliers
mydata <- mydata[mydata$Dissimilarity < 3.5,]
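Only the four players listed above exceed the 3.5 cutoff, so 246 observations remain for clustering (this matches the object counts reported further below):
# Four outliers removed from the 250 sampled players
nrow(mydata)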
#Calculating Euclidean distances (between all)
distance <- get_dist(mydata[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
method = "euclidean")
# Manually obtaining the squared Euclidean distances
distance2 <- distance^2
fviz_dist(distance2) #Showing dissimilarity matrix
# Hopkins statistic (clustering tendency is indicated if H > 0.5)
# H0: There are no natural groups of objects
# H1: There are natural groups of objects
get_clust_tendency(mydata[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
n = nrow(mydata) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.6924061
##
## $plot
## NULL
WARD <- mydata[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")] %>%
#scale() %>%
get_dist(method = "euclidean") %>% #Selecting distance
hclust(method = "ward.D2") #Selecting algorithm
# "ward.D2" square the distances and than cluster
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 246
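Relating this to the comment above and to the squared distances computed earlier: per the hclust documentation, "ward.D2" squares the dissimilarities during updating, so running "ward.D" on distance2 should yield the same group memberships (with heights on the squared scale). A quick sanity check, under that documented equivalence:
# ward.D on squared Euclidean distances vs. ward.D2 on Euclidean distances:
# the three-group memberships should coincide
WARD_alt <- hclust(distance2, method = "ward.D")
table(cutree(WARD, k = 3), cutree(WARD_alt, k = 3))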
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
# Finding the best number of clusters
set.seed(1)
OptNumber <- mydata[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")] %>%
#scale() %>%
NbClust(distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "ward.D2",
index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 2 proposed 2 as the best number of clusters
## * 16 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
fviz_dend(WARD,
k = 2,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
fviz_dend(WARD,
k = 3,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
Based on the dendrogram and the majority of the indices, we choose three clusters.
#Cutting the dendrogram
mydata$ClusterWard <- cutree(WARD,
k = 3)
head(mydata)
## Name Age PreferredFoot Pace Shooting Passing Dribbling Defending
## 4806 M. Finotto 27 Left 82 62 57 65 35
## 5469 T. Lange 26 Left 67 49 52 59 58
## 3276 Aleix Coch 28 Right 64 31 44 44 68
## 3453 D. Davis 29 Right 74 65 66 68 68
## 690 Dyego Sousa 30 Right 57 77 61 74 30
## 6787 N. Bruneel 23 Right 66 30 47 50 59
## Physic Pace_z Shooting_z Passing_z Dribbling_z Defending_z
## 4806 67 1.29038146 0.6509567 -0.05899326 0.1994188 -1.1704054
## 5469 74 -0.05488695 -0.2558625 -0.53783464 -0.3703492 0.3287773
## 3276 69 -0.32394063 -1.5114583 -1.30398084 -1.7947692 0.9805958
## 3453 81 0.57290498 0.8602226 0.80292122 0.4843028 0.9805958
## 690 81 -0.95173256 1.6972865 0.32407984 1.0540708 -1.4963147
## 6787 54 -0.14457151 -1.5812136 -1.01667601 -1.2250012 0.3939591
## Physic_z Dissimilarity ClusterWard
## 4806 0.2734583 1.368157 1
## 5469 1.0342724 1.237960 2
## 3276 0.4908337 2.277577 3
## 3453 1.7950864 2.359782 2
## 690 1.7950864 2.906390 1
## 6787 -1.1394821 2.233269 3
#Calculating positions of initial leaders
Initial_leaders <- aggregate(mydata[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
by = list(mydata$ClusterWard),
FUN = mean)
Initial_leaders
## Group.1 Pace_z Shooting_z Passing_z Defending_z Physic_z
## 1 1 0.5280627 0.8095670 0.1758670 -1.0959119 -0.38125582
## 2 2 0.1419816 0.2834652 0.7573729 0.7874350 0.56505951
## 3 3 -0.6826789 -1.1182127 -0.9209077 0.4135137 -0.05260486
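These group means could in principle be passed directly to kmeans() as starting centers; the hkmeans() call below automates exactly this two-step procedure (hierarchical clustering first, then k-means started from the resulting centroids). A manual sketch of the same idea:
# Manual variant: k-means started from the Ward group centroids computed above
z_vars <- c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")
manual_km <- kmeans(as.matrix(mydata[, z_vars]),
                    centers = as.matrix(Initial_leaders[, z_vars]))
manual_km$centers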
# Performing k-means clustering: the initial leaders are the centroids of the groups found with hierarchical clustering
K_MEANS <- hkmeans(mydata[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
k = 3,
hc.metric = "euclidean",
hc.method = "ward.D2")
K_MEANS
## Hierarchical K-means clustering with 3 clusters of sizes 88, 76, 82
##
## Cluster means:
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## 1 0.5036033 0.6818710 0.09010053 -1.0867060 -0.48118030
## 2 0.2070864 0.4701435 0.90372993 0.7430251 0.59809137
## 3 -0.7154903 -1.1456682 -0.89521381 0.5457852 0.08524298
##
## Clustering vector:
## 4806 5469 3276 3453 690 6787 4914 5695 6375 7017 683 3945 7790 936 8336 6189
## 1 3 3 2 1 3 2 1 1 3 3 3 3 2 3 1
## 2286 3081 7543 2941 6494 130 3959 466 963 1665 4055 2352 7469 1313 8200 676
## 2 2 1 1 3 2 1 2 1 1 1 2 3 2 3 1
## 7149 4322 1685 3958 2765 358 2495 7724 4764 6596 1642 6982 4832 5903 8366 6097
## 1 2 3 1 1 1 1 1 1 3 1 3 1 3 3 3
## 7557 5173 2424 6287 1616 8085 4875 4937 4175 8042 7246 4523 2417 473 4119 3612
## 1 1 1 1 2 3 1 1 3 3 1 1 1 2 1 1
## 5633 1496 4317 2199 4444 2323 3284 4041 2714 7259 1959 2239 6798 5195 2757 5923
## 2 2 1 1 1 2 2 3 1 1 2 1 1 2 3 1
## 2947 550 3073 5570 893 2548 6624 443 2903 6515 2884 4702 1983 7752 5709 5713
## 3 2 2 3 2 2 1 2 2 3 1 1 2 1 1 3
## 4272 5092 6887 200 1565 7124 6738 4530 1590 5006 3779 5685 827 7397 6819 5227
## 1 1 3 2 2 3 3 1 1 3 1 3 1 3 3 1
## 5879 1890 8149 568 3390 3619 342 5464 1094 7001 5270 103 3371 902 2776 2086
## 3 2 3 1 3 1 2 2 2 3 1 2 2 2 2 2
## 2669 318 3339 7020 3811 1435 2439 6487 2852 2241 4785 1961 2234 1458 7019 437
## 2 1 3 3 2 2 1 3 2 3 2 2 3 1 1 1
## 4355 5300 269 5600 978 3596 2998 5874 1833 3120 6924 768 6229 500 6201 4614
## 2 3 2 2 2 1 2 3 2 3 3 1 3 2 3 2
## 8071 1173 2920 7063 4608 5834 4262 7331 6762 3245 1294 1486 7440 469 6169 3901
## 3 2 2 1 2 1 2 1 3 3 2 1 3 3 1 3
## 2899 6874 3419 6783 830 3823 2003 1069 660 428 3320 6663 1062 5850 7305 2597
## 1 1 2 3 2 1 1 2 2 2 1 3 2 3 1 3
## 7315 158 539 5176 7395 7056 1116 7678 4527 2283 5609 5673 5841 3763 1603 5284
## 3 2 2 1 3 3 1 3 3 2 2 3 3 1 3 2
## 5013 4733 7404 1445 6591 3111 5994 3140 6398 44 4515 5818 313 4965 4427 3615
## 3 3 1 2 1 3 3 3 3 2 1 3 2 3 1 1
## 4038 3223 2558 838 4903 6074 5316 146 7779 5119 8136 1949 4509 843 3108 5822
## 3 2 1 2 3 1 3 1 1 2 3 3 2 2 2 3
## 4376 6411 3680 1841 6003 3642
## 1 3 1 2 1 3
##
## Within cluster sum of squares by cluster:
## [1] 258.3020 137.5163 210.8096
## (between_SS / total_SS = 48.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault" "data"
## [11] "hclust"
between_SS / total_SS = 48.8%: the ratio of between-group variability to the total variability of the clustering variables. Ideally this would be at least 50%, but there is no strict rule.
A higher proportion means the groups account for more of the total variability in the clustering variables (stronger separation), while a lower value indicates weaker separation.
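The reported ratio can be recomputed from the components listed in the output above:
# Between-group sum of squares as a share of the total sum of squares
round(100 * K_MEANS$betweenss / K_MEANS$totss, 1)  # should reproduce the 48.8 % above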
fviz_cluster(K_MEANS,
palette = "jama",
repel = FALSE,
ggtheme = theme_classic())
mydata$ClusterK_Means <- K_MEANS$cluster
head(mydata[c("Name", "ClusterWard", "ClusterK_Means")])
## Name ClusterWard ClusterK_Means
## 4806 M. Finotto 1 1
## 5469 T. Lange 2 3
## 3276 Aleix Coch 3 3
## 3453 D. Davis 2 2
## 690 Dyego Sousa 1 1
## 6787 N. Bruneel 3 3
#Checking for reclassifications
table(mydata$ClusterWard) #Ward clustering
##
## 1 2 3
## 84 82 80
table(mydata$ClusterK_Means) #K Means clustering
##
## 1 2 3
## 88 76 82
table(mydata$ClusterWard, mydata$ClusterK_Means) #Reclassification
##
## 1 2 3
## 1 78 6 0
## 2 3 70 9
## 3 7 0 73
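Since the labels of the two solutions happen to correspond (the large counts sit on the diagonal of the cross-table), the share of reclassified players can also be computed directly:
# Proportion of players whose k-means group differs from their Ward group
mean(mydata$ClusterWard != mydata$ClusterK_Means)  # 25 of 246 players, about 10 %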
Centroids <- K_MEANS$centers
Centroids
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## 1 0.5036033 0.6818710 0.09010053 -1.0867060 -0.48118030
## 2 0.2070864 0.4701435 0.90372993 0.7430251 0.59809137
## 3 -0.7154903 -1.1456682 -0.89521381 0.5457852 0.08524298
library(ggplot2)
library(tidyr)
Figure <- as.data.frame(Centroids)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c(Pace_z, Shooting_z, Passing_z, Defending_z, Physic_z))
Figure$Groups <- factor(Figure$id,
levels = c(1, 2, 3),
labels = c("1", "2", "3"))
Figure$nameFactor <- factor(Figure$name,
levels = c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z"),
labels = c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z"))
ggplot(Figure, aes(x = nameFactor, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Groups, col = Groups), size = 3) +
geom_line(aes(group = id), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables")+
ylim(-1.5, 1.5)
Here we graphically show each group's deviation from the overall mean of the cluster variables (which is 0 after standardization), for each group separately.
# ANOVA - checking whether all the cluster variables are successful at classifying units into groups.
# H0: All arithmetic means are equal
# H1: At least one arithmetic mean is different
fit <- aov(cbind(Pace_z, Shooting_z, Passing_z, Defending_z, Physic_z) ~ as.factor(ClusterK_Means), data = mydata)
summary(fit)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 67.548 33.774 46.749 < 2.2e-16 ***
## Residuals 243 175.554 0.722
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 165.331 82.665 253.56 < 2.2e-16 ***
## Residuals 243 79.221 0.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 128.46 64.230 142.13 < 2.2e-16 ***
## Residuals 243 109.82 0.452
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 170.179 85.090 296.11 < 2.2e-16 ***
## Residuals 243 69.829 0.287
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 47.742 23.8712 33.684 1.225e-13 ***
## Residuals 243 172.208 0.7087
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We have five clustering variables, so five separate ANOVA tests are reported. All of the variables used in the clustering are statistically significant (we can reject H0 at p < 0.001), which tells us that all of them contribute to separating the units into groups. We can also compare the F values. The highest F value is for Response 4 (Defending), meaning the differences between the clusters are largest for the Defending rating. On the other hand, Response 5 (Physic) has the lowest F value, indicating the smallest differences between clusters.
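The same F values can also be obtained by fitting the five one-way ANOVAs separately and collecting the F statistic for the cluster factor; a minimal sketch:
# One-way ANOVA per clustering variable; F statistic for the cluster factor
z_vars <- c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")
F_values <- sapply(z_vars, function(v) {
  one_fit <- aov(as.numeric(mydata[[v]]) ~ as.factor(mydata$ClusterK_Means))
  summary(one_fit)[[1]][["F value"]][1]
})
sort(F_values, decreasing = TRUE)  # Defending_z highest, Physic_z lowest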
# Criterion validity
# H0: All arithmetic means are equal.
# H1: At least one arithmetic mean is different.
aggregate(mydata$Age,
by = list(mydata$ClusterK_Means),
FUN = "mean")
## Group.1 x
## 1 1 24.54545
## 2 2 27.34211
## 3 3 24.96341
fit <- aov(Age ~ as.factor(ClusterK_Means),
data = mydata)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 361 180.6 9.922 7.21e-05 ***
## Residuals 243 4422 18.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can reject H0 at p < 0.001: the mean age differs between at least two clusters.
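As a follow-up that is not part of the original output, Tukey's HSD on the same fit could indicate which pairs of clusters differ in mean age (the group means above suggest cluster 2, with the oldest players, drives the difference):
# Post-hoc pairwise comparisons of mean age between the three clusters
TukeyHSD(fit)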
# Pearson Chi2 test - Criterion Validity
# H0: There is no association between categorical variables.
# H1: There is some association between categorical variables.
chisq_results <- chisq.test(mydata$PreferredFoot, as.factor(mydata$ClusterK_Means))
chisq_results
##
## Pearson's Chi-squared test
##
## data: mydata$PreferredFoot and as.factor(mydata$ClusterK_Means)
## X-squared = 10.033, df = 2, p-value = 0.006629
addmargins(chisq_results$observed)
##
## mydata$PreferredFoot 1 2 3 Sum
## Left 19 13 31 63
## Right 69 63 51 183
## Sum 88 76 82 246
round(chisq_results$expected, 2)
##
## mydata$PreferredFoot 1 2 3
## Left 22.54 19.46 21
## Right 65.46 56.54 61
round(chisq_results$res, 2)
##
## mydata$PreferredFoot 1 2 3
## Left -0.74 -1.47 2.18
## Right 0.44 0.86 -1.28
We can reject H0 at p = 0.007: there is an association between the preferred foot and cluster membership.
In group 3 there are more left-footed players than expected under independence, significant at the 5% level (residual 2.18 > 1.96).
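As a complementary check (not shown above), the adjusted standardized residuals stored in the test object can be compared against ±1.96 in the same way:
# Adjusted standardized residuals; |value| > 1.96 flags cells deviating from independence
round(chisq_results$stdres, 2)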
library(psych)
describeBy(mydata$Age, mydata$ClusterK_Means)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 88 24.55 3.8 24 24.38 4.45 17 35 18 0.39 -0.29 0.4
## ------------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 76 27.34 3.61 27 27.23 4.45 21 36 15 0.28 -0.82 0.41
## ------------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 82 24.96 5.2 23 24.58 5.93 17 37 20 0.53 -0.86 0.57
The classification of 246 football players was based on five standardized skill ratings.
For hierarchical clustering, Ward's algorithm was used, and based on the dendrogram and the indices measuring the increase in heterogeneity, the players were classified into three groups. The classification was then refined with the k-means algorithm, using the Ward group centroids as initial leaders.