Data set for Homework 4, was found on Kaggle. Link to data set is https://www.kaggle.com/datasets/stefanoleone992/fifa-21-complete-player-dataset?resource=download&select=players_21.csv.
mydata <- read.table("./players_21.csv", fill=TRUE, header=TRUE, sep=",")
mydata <- mydata[,c(3,5,9,13,14,15,18,34,35,36,37,38,39)]
mydata[mydata == ''] <- NA
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("tidyverse")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
mydata <- mydata %>% drop_na()
#Convert character objects to integer objects / factors
mydata$short_name <- factor(mydata$short_name)
mydata$age <- as.integer(mydata$age)
mydata$overall <- as.integer(mydata$overall)
mydata$nationality <- factor(mydata$nationality)
mydata$potential <- as.integer(mydata$potential)
mydata$value_eur <- as.integer(mydata$value_eur)
mydata$preferred_foot <- factor(mydata$preferred_foot)
mydata$pace <- as.integer(mydata$pace)
mydata$shooting <- as.integer(mydata$shooting)
mydata$passing <- as.integer(mydata$passing)
mydata$dribbling <- as.integer(mydata$dribbling)
mydata$defending <- as.integer(mydata$defending)
mydata$physic <- as.integer(mydata$physic)
colnames(mydata) <- c("Name", "Age", "Nationality", "Overall", "Potential", "Value", "PreferredFoot", "Pace", "Shooting", "Passing", "Dribbling", "Defending", "Physic")
head(mydata)
## Name Age Nationality Overall Potential Value PreferredFoot
## 1 L. Messi 33 Argentina 93 93 67500000 Left
## 2 Cristiano Ronaldo 35 Portugal 92 92 46000000 Right
## 3 R. Lewandowski 31 Poland 91 91 80000000 Right
## 4 Neymar Jr 28 Brazil 91 91 90000000 Right
## 5 K. De Bruyne 29 Belgium 91 91 87000000 Right
## 6 K. Mbappé 21 France 90 95 105500000 Right
## Pace Shooting Passing Dribbling Defending Physic
## 1 85 92 91 95 38 65
## 2 89 93 81 89 35 77
## 3 78 91 78 85 43 82
## 4 91 85 86 94 36 59
## 5 76 86 93 88 64 78
## 6 96 86 78 91 39 76
library(dplyr)
# Create a new ID column with unique identifiers
mydata <- mutate(mydata, ID = 1:nrow(mydata))
library(dplyr)
# Move the 'ID' column to the first position
mydata <- select(mydata, ID, everything())
head(mydata)
## ID Name Age Nationality Overall Potential Value
## 1 1 L. Messi 33 Argentina 93 93 67500000
## 2 2 Cristiano Ronaldo 35 Portugal 92 92 46000000
## 3 3 R. Lewandowski 31 Poland 91 91 80000000
## 4 4 Neymar Jr 28 Brazil 91 91 90000000
## 5 5 K. De Bruyne 29 Belgium 91 91 87000000
## 6 6 K. Mbappé 21 France 90 95 105500000
## PreferredFoot Pace Shooting Passing Dribbling Defending Physic
## 1 Left 85 92 91 95 38 65
## 2 Right 89 93 81 89 35 77
## 3 Right 78 91 78 85 43 82
## 4 Right 91 85 86 94 36 59
## 5 Right 76 86 93 88 64 78
## 6 Right 96 86 78 91 39 76
library(dplyr)
set.seed(1)
mydata1 <- mydata %>% # for the purpose of exercise, I will take a random sample of 300 units
sample_n(300)
mydata2 <- mydata1[,c(-4, -5, -6, -7)]# excluding variable, that I will not use
head(mydata2)
## ID Name Age PreferredFoot Pace Shooting Passing Dribbling Defending
## 1 1017 Mossoró 36 Right 69 71 75 75 45
## 2 8004 Y. Velásquez 20 Right 65 49 52 58 33
## 3 4775 G. Karlen 27 Right 68 65 51 60 32
## 4 4050 L. Kelly 30 Right 50 52 63 59 64
## 5 1301 Luisinho 35 Left 68 53 71 70 72
## 6 1799 L. Hejda 30 Right 62 35 53 57 73
## Physic
## 1 59
## 2 55
## 3 68
## 4 74
## 5 60
## 6 75
head(mydata2)
## ID Name Age PreferredFoot Pace Shooting Passing Dribbling Defending
## 1 1017 Mossoró 36 Right 69 71 75 75 45
## 2 8004 Y. Velásquez 20 Right 65 49 52 58 33
## 3 4775 G. Karlen 27 Right 68 65 51 60 32
## 4 4050 L. Kelly 30 Right 50 52 63 59 64
## 5 1301 Luisinho 35 Left 68 53 71 70 72
## 6 1799 L. Hejda 30 Right 62 35 53 57 73
## Physic
## 1 59
## 2 55
## 3 68
## 4 74
## 5 60
## 6 75
Explanation of data set
Unit of observation: Football Player in FIFA 21
Sample size: 300 units
summary(mydata2 [,c(-1,-2)])
## Age PreferredFoot Pace Shooting Passing
## Min. :17.00 Left : 74 Min. :31.00 Min. :24.00 Min. :29.00
## 1st Qu.:22.00 Right:226 1st Qu.:61.00 1st Qu.:42.00 1st Qu.:51.00
## Median :25.00 Median :68.00 Median :56.00 Median :58.00
## Mean :25.48 Mean :67.02 Mean :52.93 Mean :57.42
## 3rd Qu.:29.00 3rd Qu.:74.00 3rd Qu.:64.00 3rd Qu.:65.00
## Max. :39.00 Max. :91.00 Max. :82.00 Max. :80.00
## Dribbling Defending Physic
## Min. :32.00 Min. :17.00 Min. :36.00
## 1st Qu.:57.00 1st Qu.:36.00 1st Qu.:59.00
## Median :64.00 Median :56.00 Median :65.00
## Mean :62.45 Mean :51.29 Mean :64.39
## 3rd Qu.:69.00 3rd Qu.:65.00 3rd Qu.:71.00
## Max. :82.00 Max. :79.00 Max. :81.00
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describe(mydata2 [,c(-1, -2, -4)])
## vars n mean sd median trimmed mad min max range skew
## Age 1 300 25.48 4.67 25 25.26 4.45 17 39 22 0.34
## Pace 2 300 67.02 11.18 68 67.86 9.64 31 91 60 -0.82
## Shooting 3 300 52.93 13.77 56 53.42 14.83 24 82 58 -0.31
## Passing 4 300 57.42 10.03 58 57.76 10.38 29 80 51 -0.30
## Dribbling 5 300 62.45 9.74 64 63.07 8.90 32 82 50 -0.59
## Defending 6 300 51.29 16.44 56 52.13 17.05 17 79 62 -0.42
## Physic 7 300 64.39 8.69 65 64.94 8.90 36 81 45 -0.52
## kurtosis se
## Age -0.42 0.27
## Pace 1.07 0.65
## Shooting -0.82 0.80
## Passing -0.24 0.58
## Dribbling 0.10 0.56
## Defending -1.08 0.95
## Physic -0.15 0.50
# Standardization of skill ratings
mydata2$Pace_z <- scale(mydata2$Pace)
mydata2$Shooting_z <- scale(mydata2$Shooting)
mydata2$Passing_z <- scale(mydata2$Passing)
mydata2$Dribbling_z <- scale(mydata2$Dribbling)
mydata2$Defending_z <- scale(mydata2$Defending)
mydata2$Physic_z <- scale(mydata2$Physic)
#Checking for NA in the data
any(is.na(mydata2))
## [1] FALSE
No issues with NA, because I dropped all rows with NA values at the beginning.
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(as.matrix(mydata2[, c("Pace_z", "Shooting_z", "Passing_z", "Dribbling_z", "Defending_z", "Physic_z")]),
type = "pearson")
## Pace_z Shooting_z Passing_z Dribbling_z Defending_z Physic_z
## Pace_z 1.00 0.31 0.29 0.53 -0.18 -0.20
## Shooting_z 0.31 1.00 0.62 0.76 -0.39 0.06
## Passing_z 0.29 0.62 1.00 0.82 0.26 0.19
## Dribbling_z 0.53 0.76 0.82 1.00 -0.05 0.05
## Defending_z -0.18 -0.39 0.26 -0.05 1.00 0.45
## Physic_z -0.20 0.06 0.19 0.05 0.45 1.00
##
## n= 300
##
##
## P
## Pace_z Shooting_z Passing_z Dribbling_z Defending_z Physic_z
## Pace_z 0.0000 0.0000 0.0000 0.0013 0.0006
## Shooting_z 0.0000 0.0000 0.0000 0.0000 0.3250
## Passing_z 0.0000 0.0000 0.0000 0.0000 0.0010
## Dribbling_z 0.0000 0.0000 0.0000 0.4012 0.3866
## Defending_z 0.0013 0.0000 0.0000 0.4012 0.0000
## Physic_z 0.0006 0.3250 0.0010 0.3866 0.0000
rcorr(as.matrix(mydata2[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")]),
type = "pearson")
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## Pace_z 1.00 0.31 0.29 -0.18 -0.20
## Shooting_z 0.31 1.00 0.62 -0.39 0.06
## Passing_z 0.29 0.62 1.00 0.26 0.19
## Defending_z -0.18 -0.39 0.26 1.00 0.45
## Physic_z -0.20 0.06 0.19 0.45 1.00
##
## n= 300
##
##
## P
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## Pace_z 0.0000 0.0000 0.0013 0.0006
## Shooting_z 0.0000 0.0000 0.0000 0.3250
## Passing_z 0.0000 0.0000 0.0000 0.0010
## Defending_z 0.0013 0.0000 0.0000 0.0000
## Physic_z 0.0006 0.3250 0.0010 0.0000
rcorr(as.matrix(mydata2[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z")]),
type = "pearson")
## Pace_z Shooting_z Passing_z Defending_z
## Pace_z 1.00 0.31 0.29 -0.18
## Shooting_z 0.31 1.00 0.62 -0.39
## Passing_z 0.29 0.62 1.00 0.26
## Defending_z -0.18 -0.39 0.26 1.00
##
## n= 300
##
##
## P
## Pace_z Shooting_z Passing_z Defending_z
## Pace_z 0.0000 0.0000 0.0013
## Shooting_z 0.0000 0.0000 0.0000
## Passing_z 0.0000 0.0000 0.0000
## Defending_z 0.0013 0.0000 0.0000
#Checking dissimilarity
mydata2$Dissimilarity <- sqrt(mydata2$Pace_z^2 + mydata2$Shooting_z^2 + mydata2$Passing_z^2 + mydata2$Defending_z^2)
head(mydata2[order(-mydata2$Dissimilarity),], 20)
## ID Name Age PreferredFoot Pace Shooting Passing Dribbling
## 161 7452 J. Pérez 28 Left 33 30 39 40
## 246 1993 B. Poulain 32 Right 33 26 57 46
## 62 1706 S. Prödl 33 Right 31 36 50 39
## 164 1265 S. Dann 33 Right 32 40 48 47
## 203 3388 L. Guldan 37 Right 31 31 58 52
## 253 169 Jorge Molina 38 Right 36 82 67 70
## 274 2137 César 27 Right 35 37 44 51
## 171 7151 F. Arthur 20 Right 60 26 29 36
## 177 4664 A. Madu 26 Left 41 26 42 39
## 227 127 O. Giroud 33 Left 39 79 70 71
## 272 6525 J. Medić 21 Right 60 25 32 32
## 111 6816 T. Břečka 26 Left 58 28 31 38
## 147 5987 T. Keller 20 Right 58 25 33 39
## 293 7794 P. Flottmann 23 Right 56 25 33 39
## 144 8207 D. Dietze 22 Right 52 58 38 49
## 7 8229 Feng Boyuan 25 Right 51 51 35 49
## 300 7885 E. Navarro 18 Right 59 30 32 36
## 31 5522 Iñaki Astiz 36 Right 44 26 46 41
## 61 3992 D. Moor 36 Right 35 42 59 58
## 179 5474 S. Novothny 26 Right 45 64 41 55
## Defending Physic Pace_z Shooting_z Passing_z Dribbling_z Defending_z
## 161 63 62 -3.0421544 -1.6649606 -1.83618116 -2.3062896 0.7124214
## 246 75 73 -3.0421544 -1.9554029 -0.04154256 -1.6900008 1.4422782
## 62 74 71 -3.2209995 -1.2292971 -0.73945757 -2.4090044 1.3814568
## 164 76 64 -3.1315769 -0.9388548 -0.93886186 -1.5872860 1.5030996
## 203 67 75 -3.2209995 -1.5923500 0.05815958 -1.0737120 0.9557070
## 253 42 73 -2.7738868 2.1107896 0.95547888 0.7751543 -0.5648281
## 274 73 73 -2.8633093 -1.1566866 -1.33767044 -1.1764268 1.3206354
## 171 57 75 -0.6277461 -1.9554029 -2.83320260 -2.7171488 0.3474929
## 177 65 77 -2.3267742 -1.9554029 -1.53707473 -2.4090044 0.8340642
## 227 42 77 -2.5056192 1.8929578 1.25458532 0.8778691 -0.5648281
## 272 63 70 -0.6277461 -2.0280135 -2.53409617 -3.1280080 0.7124214
## 111 62 62 -0.8065912 -1.8101818 -2.63379831 -2.5117192 0.6516000
## 147 63 68 -0.8065912 -2.0280135 -2.43439402 -2.4090044 0.7124214
## 293 54 70 -0.9854363 -2.0280135 -2.43439402 -2.4090044 0.1650287
## 144 17 63 -1.3431264 0.3681356 -1.93588330 -1.3818564 -2.0853632
## 7 23 57 -1.4325489 -0.1401384 -2.23498974 -1.3818564 -1.7204347
## 300 56 62 -0.7171687 -1.6649606 -2.53409617 -2.7171488 0.2866715
## 31 62 58 -2.0585066 -1.9554029 -1.13826615 -2.2035748 0.6516000
## 61 67 69 -2.8633093 -0.7936337 0.15786173 -0.4574232 0.9557070
## 179 26 69 -1.9690841 0.8037991 -1.63677687 -0.7655676 -1.5379705
## Physic_z Dissimilarity
## 161 -0.2750406 3.988220
## 246 0.9908367 3.893610
## 62 0.7606772 3.786982
## 164 -0.0448811 3.718735
## 203 1.2209962 3.718491
## 253 0.9908367 3.658121
## 274 0.9908367 3.615232
## 171 1.2209962 3.516455
## 177 1.4511557 3.506528
## 227 1.4511557 3.428474
## 272 0.6455974 3.381729
## 111 -0.2750406 3.359884
## 147 0.4154379 3.346229
## 293 0.6455974 3.322263
## 144 -0.1599609 3.167948
## 7 -0.8504394 3.166530
## 300 -0.2750406 3.128937
## 31 -0.7353596 3.127504
## 61 0.5305177 3.125171
## 179 0.5305177 3.093182
mydata2 <- mydata2 %>%
filter(!ID %in% c(7452, 1993))
head(mydata2[order(-mydata2$Dissimilarity),], 5)
## ID Name Age PreferredFoot Pace Shooting Passing Dribbling
## 62 1706 S. Prödl 33 Right 31 36 50 39
## 163 1265 S. Dann 33 Right 32 40 48 47
## 202 3388 L. Guldan 37 Right 31 31 58 52
## 251 169 Jorge Molina 38 Right 36 82 67 70
## 272 2137 César 27 Right 35 37 44 51
## Defending Physic Pace_z Shooting_z Passing_z Dribbling_z Defending_z
## 62 74 71 -3.220999 -1.2292971 -0.73945757 -2.4090044 1.3814568
## 163 76 64 -3.131577 -0.9388548 -0.93886186 -1.5872860 1.5030996
## 202 67 75 -3.220999 -1.5923500 0.05815958 -1.0737120 0.9557070
## 251 42 73 -2.773887 2.1107896 0.95547888 0.7751543 -0.5648281
## 272 73 73 -2.863309 -1.1566866 -1.33767044 -1.1764268 1.3206354
## Physic_z Dissimilarity
## 62 0.7606772 3.786982
## 163 -0.0448811 3.718735
## 202 1.2209962 3.718491
## 251 0.9908367 3.658121
## 272 0.9908367 3.615232
#install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Calculating Euclidean distances
distance <- get_dist(mydata2[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z")],
method = "euclidian")
distance2 <- distance^2
fviz_dist(distance2) #Showing dissimilarity matrix
get_clust_tendency(mydata2[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z")],
n = nrow(mydata2) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.7236504
##
## $plot
## NULL
Hopkins statistics:
Hopkins statistics is higher than 0.5 and this means that data is good for clustering.
WARD <- mydata2[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z")] %>%
#scale() %>%
get_dist(method = "euclidean") %>% #Selecting distance
hclust(method = "ward.D2") #Selecting algorithm
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 298
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fviz_dend(WARD,
k = 2,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
## Warning in data.frame(xmin = unlist(xleft), ymin = unlist(ybottom), xmax =
## unlist(xright), : row names were found from a short variable and have been
## discarded
fviz_dend(WARD,
k = 3,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
fviz_dend(WARD,
k = 4,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
According to the dendrograms from above, I will create 3 clusters.
mydata2$ClusterWard <- cutree(WARD,
k = 3)
head(mydata2[, c("ID", "ClusterWard")])
## ID ClusterWard
## 1 1017 1
## 2 8004 1
## 3 4775 1
## 4 4050 2
## 5 1301 2
## 6 1799 3
ID 1301 is clustered in group 2.
#Calculating positions of initial leaders
Initial_leaders <- aggregate(mydata2[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z")],
by = list(mydata2$ClusterWard),
FUN = mean)
Initial_leaders
## Group.1 Pace_z Shooting_z Passing_z Defending_z
## 1 1 0.33668690 0.6897825 0.02684817 -1.0011337
## 2 2 0.03562302 0.2184689 0.71944932 0.8303404
## 3 3 -0.48285901 -1.2816871 -0.90983465 0.4760648
#Performing K-Means clustering - initial leaders are chosen as centroids of groups, found with hierarhical clustering
K_MEANS <- hkmeans(mydata2[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z")],
k = 3,
hc.metric = "euclidean",
hc.method = "ward.D2")
K_MEANS
## Hierarchical K-means clustering with 3 clusters of sizes 93, 117, 88
##
## Cluster means:
## Pace_z Shooting_z Passing_z Defending_z
## 1 0.3203250 0.5773790 -0.3095591 -1.2580613
## 2 0.2297930 0.4903945 0.8804892 0.5757032
## 3 -0.5749056 -1.2210459 -0.8221650 0.5396333
##
## Clustering vector:
## [1] 2 1 1 2 2 3 1 2 2 1 1 3 1 2 1 2 2 2 2 1 2 3 2 1 1 3 2 2 2 2 3 1 1 3 3 3 1
## [38] 1 2 2 3 1 3 2 1 2 3 3 1 1 1 2 2 1 1 3 3 1 1 1 3 3 2 3 2 1 3 1 2 2 2 2 1 2
## [75] 2 2 2 3 3 3 2 2 2 2 2 2 1 3 1 1 3 3 2 1 2 2 2 1 2 3 2 3 1 2 3 1 2 2 2 1 3
## [112] 1 3 2 1 2 1 3 1 3 1 3 2 2 1 2 3 1 2 2 2 1 2 1 3 1 2 1 2 2 1 1 3 1 3 1 3 3
## [149] 3 3 1 1 3 1 2 2 2 3 2 1 3 3 3 3 3 3 2 3 1 3 2 1 2 2 3 3 2 1 1 2 2 3 3 2 2
## [186] 2 3 1 1 2 2 2 3 2 2 3 1 3 2 2 3 3 1 3 2 1 1 1 2 1 3 3 2 2 1 3 2 2 3 1 1 1
## [223] 1 3 1 2 2 2 1 3 3 2 1 1 3 1 2 1 1 3 1 1 3 2 3 3 3 2 2 2 2 2 2 1 2 2 1 2 2
## [260] 3 1 1 2 2 2 2 3 1 3 3 2 3 1 2 1 1 1 2 2 3 2 3 2 3 1 3 1 3 3 2 3 1 2 1 2 3
## [297] 2 3
##
## Within cluster sum of squares by cluster:
## [1] 165.4623 211.5997 182.0237
## (between_SS / total_SS = 52.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault" "data"
## [11] "hclust"
fviz_cluster(K_MEANS,
palette = "jama",
repel = FALSE,
ggtheme = theme_classic())
mydata2$ClusterK_Means <- K_MEANS$cluster
head(mydata2[c("ID", "ClusterWard", "ClusterK_Means")])
## ID ClusterWard ClusterK_Means
## 1 1017 1 2
## 2 8004 1 1
## 3 4775 1 1
## 4 4050 2 2
## 5 1301 2 2
## 6 1799 3 3
Customer with ID 1017, was initially assigned to group 1, and later, after K-means clustering reclassified to group 2.
table(mydata2$ClusterWard) #Ward clustering
##
## 1 2 3
## 121 98 79
table(mydata2$ClusterK_Means)# K Means Clustering
##
## 1 2 3
## 93 117 88
#Reclasifications
table(mydata2$ClusterWard, mydata2$ClusterK_Means)
##
## 1 2 3
## 1 93 28 0
## 2 0 86 12
## 3 0 3 76
Centroids <- K_MEANS$centers
Centroids
## Pace_z Shooting_z Passing_z Defending_z
## 1 0.3203250 0.5773790 -0.3095591 -1.2580613
## 2 0.2297930 0.4903945 0.8804892 0.5757032
## 3 -0.5749056 -1.2210459 -0.8221650 0.5396333
library(ggplot2)
library(tidyr)
Figure <- as.data.frame(Centroids)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c(Pace_z, Shooting_z, Passing_z, Defending_z))
Figure$Groups <- factor(Figure$id,
levels = c(1, 2, 3),
labels = c("1", "2", "3"))
Figure$nameFactor <- factor(Figure$name,
levels = c("Pace_z", "Shooting_z", "Passing_z", "Defending_z"),
labels = c("Pace_z", "Shooting_z", "Passing_z", "Defending_z"))
ggplot(Figure, aes(x = nameFactor, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Groups, col = Groups), size = 3) +
geom_line(aes(group = id), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables")+
ylim(-1.5, 1.5)
#Are all the cluster variables successful at classifing units into groups? Performing ANOVAs.
fit <- aov(cbind(Pace_z, Shooting_z, Passing_z, Defending_z) ~ as.factor(ClusterK_Means), data = mydata2)
summary(fit)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 44.682 22.3410 27.964 7.572e-12 ***
## Residuals 295 235.684 0.7989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 190.30 95.150 275.03 < 2.2e-16 ***
## Residuals 295 102.06 0.346
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 159.09 79.545 171.88 < 2.2e-16 ***
## Residuals 295 136.53 0.463
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 211.581 105.790 367.95 < 2.2e-16 ***
## Residuals 295 84.816 0.288
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We reject all H0 at p<0.001. Choice of cluster variables is correct.
aggregate(mydata2$Age,
by = list(mydata2$ClusterK_Means),
FUN = "mean")
## Group.1 x
## 1 1 24.36559
## 2 2 26.89744
## 3 3 24.65909
fit <- aov(Age ~ as.factor(ClusterK_Means),
data = mydata2)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 410 204.76 9.964 6.49e-05 ***
## Residuals 295 6062 20.55
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can reject H0 (p<0.001), and conclude that at least one arithmetic mean is different. There are statistical differences in age between groups (p<0.001).
#Pearson Chi2 test
chisq_results <- chisq.test(mydata2$PreferredFoot, as.factor(mydata2$ClusterK_Means))
chisq_results
##
## Pearson's Chi-squared test
##
## data: mydata2$PreferredFoot and as.factor(mydata2$ClusterK_Means)
## X-squared = 1.9725, df = 2, p-value = 0.373
addmargins(chisq_results$observed)
##
## mydata2$PreferredFoot 1 2 3 Sum
## Left 18 32 23 73
## Right 75 85 65 225
## Sum 93 117 88 298
round(chisq_results$expected, 2)
##
## mydata2$PreferredFoot 1 2 3
## Left 22.78 28.66 21.56
## Right 70.22 88.34 66.44
round(chisq_results$res, 2)
##
## mydata2$PreferredFoot 1 2 3
## Left -1.00 0.62 0.31
## Right 0.57 -0.36 -0.18
We can not reject H0. There is no statistical differences in which is the preferred foot between groups.
Classification of 298 football players was based on four standardized variables. For hierarchical clustering, Ward’s algorithm was used, and based on the analysis of the dendrogram it was decided to classify the players into three groups. The classification was further optimized using the K-Means cluster. Group 1 contains 31.21% of observed players, characterized by a higher than average value in pace and shooting and below the average value of passing. Players in this group have the lowest rating for defending among all three groups. In this group there they are on average the youngest players. Group 2 contains the most (39.26%) of observed players, characterized by a higher than average value of all 4 cluster variables. In this group there they are on average the oldest players. Group 3 contains 29.53% of observed players, characterized by the lowest value in pace, shooting and passing among all three groups and above average value in defending.