data20 <- read.table("./players_20.csv", fill=TRUE, header=TRUE, sep=",")
data20 <- data20[,c(1,3,5,22,34,35,36,37,38,39)]
set.seed(100)
data20 <- data20[sample(nrow(data20), 300), ]
data20 <- data20[data20$sofifa_id != 251971, ] #I observed when looking through data manually that this ID unit had a mistake in work_rate: Value should be categorical, but it was numeric.
library(tidyr)
data20 <- drop_na(data20) #Drop missing values
data20$work_rate <- factor(data20$work_rate)
data20$short_name <- factor(data20$short_name)
data20$age <- as.integer(data20$age)
data20$pace <- as.integer(data20$pace)
data20$shooting <- as.integer(data20$shooting)
data20$passing <- as.integer(data20$passing)
data20$dribbling <- as.integer(data20$dribbling)
data20$defending <- as.integer(data20$defending)
data20$physic <- as.integer(data20$physic) #When I did describeBy later, it showed me values as characters - that is why I converted them.
colnames(data20) <- c("ID","name", "age","work_rate", "pace", "shooting", "passing", "dribbling", "defending", "physic") #Renamed the columns.
head(data20, 10) #Table with first 10 rows.
## ID name age work_rate pace shooting passing dribbling
## 1 252721 D. Takagi 23 Medium/Medium 74 63 63 70
## 2 176794 O. Toivonen 32 Medium/Low 50 77 75 73
## 3 237635 F. Pick 23 Medium/Medium 89 54 55 71
## 4 248100 Romário Pires 30 Medium/Medium 68 56 59 66
## 5 243481 M. Pedersen 19 Medium/Medium 82 48 52 64
## 6 227787 Kim Jong Woo 25 High/Medium 72 63 66 68
## 7 242966 N. Clayton-Phillips 19 Medium/Medium 65 59 56 63
## 8 251378 T. Hölscher 19 Medium/Medium 70 57 52 59
## 9 178253 Ivo 32 High/High 72 65 77 73
## 10 230517 Tiagildo Serra 31 Medium/Medium 59 60 55 66
## defending physic
## 1 33 51
## 2 41 73
## 3 27 50
## 4 55 64
## 5 38 47
## 6 59 50
## 7 29 38
## 8 45 52
## 9 48 66
## 10 28 50
DESCRIPTION OF VARIABLES
Name: Name of a player
Age: Age of a player in years
Pace: Pace rating (1-100)
Work_rate:
Shooting: Shooting rating (1-100)
Passing: Passing rating (1-100)
Dribbling: Dribbling rating (1-100)
Defending: Defending rating (1-100)
Physic: Physic rating (1-100)
Unit of observation: Football Player in FIFA
Sample size: 255 (after dropping units with missing values)
Source: Kaggle | FIFA 20 complete player dataset
RQ: Can you divide football players into homogeneous groups based on cluster variables (pace, shooting, passing, dribbling, defending, physic)?
summary(data20)
## ID name age work_rate
## Length:255 A. Al Saluli : 1 Min. :17.0 Medium/Medium:140
## Class :character A. Bolivar : 1 1st Qu.:21.0 High/Medium : 41
## Mode :character A. Cerri : 1 Median :24.0 Medium/High : 24
## A. Ciss : 1 Mean :24.7 High/High : 15
## A. Cruz : 1 3rd Qu.:28.0 High/Low : 12
## A. Fiordaliso: 1 Max. :37.0 Medium/Low : 9
## (Other) :249 (Other) : 14
## pace shooting passing dribbling defending
## Min. :33.00 Min. :20.00 Min. :25 Min. :31.00 Min. :18.00
## 1st Qu.:61.00 1st Qu.:40.00 1st Qu.:49 1st Qu.:56.00 1st Qu.:42.50
## Median :67.00 Median :51.00 Median :57 Median :63.00 Median :56.00
## Mean :66.97 Mean :49.87 Mean :56 Mean :60.83 Mean :52.62
## 3rd Qu.:74.00 3rd Qu.:59.50 3rd Qu.:63 3rd Qu.:67.50 3rd Qu.:63.00
## Max. :89.00 Max. :77.00 Max. :81 Max. :86.00 Max. :79.00
##
## physic
## Min. :38.00
## 1st Qu.:58.00
## Median :65.00
## Mean :63.58
## 3rd Qu.:71.00
## Max. :84.00
##
library(psych)
describeBy(data20)
## Warning in describeBy(data20): no grouping variable requested
## vars n mean sd median trimmed mad min max range skew
## ID* 1 255 128.00 73.76 128 128.00 94.89 1 255 254 0.00
## name* 2 255 128.00 73.76 128 128.00 94.89 1 255 254 0.00
## age 3 255 24.70 4.63 24 24.47 5.93 17 37 20 0.36
## work_rate* 4 255 7.07 2.48 9 7.42 0.00 1 9 8 -0.83
## pace 5 255 66.97 10.48 67 67.40 8.90 33 89 56 -0.44
## shooting 6 255 49.87 13.13 51 50.15 14.83 20 77 57 -0.18
## passing 7 255 56.00 10.22 57 56.38 10.38 25 81 56 -0.36
## dribbling 8 255 60.83 9.81 63 61.61 8.90 31 86 55 -0.71
## defending 9 255 52.62 14.32 56 53.67 13.34 18 79 61 -0.62
## physic 10 255 63.58 9.67 65 64.10 10.38 38 84 46 -0.45
## kurtosis se
## ID* -1.21 4.62
## name* -1.21 4.62
## age -0.80 0.29
## work_rate* -0.91 0.16
## pace 0.33 0.66
## shooting -0.81 0.82
## passing -0.05 0.64
## dribbling 0.41 0.61
## defending -0.60 0.90
## physic -0.38 0.61
Remove ID name from describeby flikn v cht tabelo
data20$Pace_z <- scale(data20$pace)
data20$Shooting_z <- scale(data20$shooting)
data20$Passing_z <- scale(data20$passing)
data20$Dribbling_z <- scale(data20$dribbling)
data20$Defending_z <- scale(data20$defending)
data20$Physic_z <- scale(data20$physic) #I standardized.
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(as.matrix(data20[, c("Pace_z", "Shooting_z", "Passing_z", "Dribbling_z", "Defending_z", "Physic_z")]),
type = "pearson")
## Pace_z Shooting_z Passing_z Dribbling_z Defending_z Physic_z
## Pace_z 1.00 0.33 0.31 0.52 -0.19 -0.08
## Shooting_z 0.33 1.00 0.69 0.74 -0.33 0.05
## Passing_z 0.31 0.69 1.00 0.82 0.13 0.12
## Dribbling_z 0.52 0.74 0.82 1.00 -0.17 -0.06
## Defending_z -0.19 -0.33 0.13 -0.17 1.00 0.55
## Physic_z -0.08 0.05 0.12 -0.06 0.55 1.00
##
## n= 255
##
##
## P
## Pace_z Shooting_z Passing_z Dribbling_z Defending_z Physic_z
## Pace_z 0.0000 0.0000 0.0000 0.0025 0.1948
## Shooting_z 0.0000 0.0000 0.0000 0.0000 0.4543
## Passing_z 0.0000 0.0000 0.0000 0.0362 0.0621
## Dribbling_z 0.0000 0.0000 0.0000 0.0066 0.3497
## Defending_z 0.0025 0.0000 0.0362 0.0066 0.0000
## Physic_z 0.1948 0.4543 0.0621 0.3497 0.0000
From the observed table, we see that dribbling is highly correlated to all of the cluster variables (more than 0.3 everywhere), that is why I decided to drop this variable (but this is not a requirement, just results will be better).
data20 <- data20[, c(-8, -14)]
rcorr(as.matrix(data20[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")]),
type = "pearson")
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## Pace_z 1.00 0.33 0.31 -0.19 -0.08
## Shooting_z 0.33 1.00 0.69 -0.33 0.05
## Passing_z 0.31 0.69 1.00 0.13 0.12
## Defending_z -0.19 -0.33 0.13 1.00 0.55
## Physic_z -0.08 0.05 0.12 0.55 1.00
##
## n= 255
##
##
## P
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## Pace_z 0.0000 0.0000 0.0025 0.1948
## Shooting_z 0.0000 0.0000 0.0000 0.4543
## Passing_z 0.0000 0.0000 0.0362 0.0621
## Defending_z 0.0025 0.0000 0.0362 0.0000
## Physic_z 0.1948 0.4543 0.0621 0.0000
data20$Dissimilarity <- sqrt(data20$Shooting_z^2 + data20$Passing_z^2 + data20$Defending_z^2 + data20$Physic_z^2)
head(data20[order(-data20$Dissimilarity), ], 10)
## ID name age work_rate pace shooting passing
## 248 224334 M. Acuña 27 High/High 76 74 81
## 100 244902 F. Castro 24 54 24 25
## 202 239853 M. Kryeziu 22 Medium/Medium 35 20 28
## 163 251124 F. Nevarez 18 Medium/Medium 56 26 29
## 64 239821 L. Matarese 21 Medium/Medium 68 56 55
## 7 242966 N. Clayton-Phillips 19 Medium/Medium 65 59 56
## 254 251823 T. Tattermusch 18 Medium/Medium 64 45 44
## 200 252013 R. Hauk 20 Medium/Medium 53 27 29
## 23 252557 J. Jiménez 24 Low/High 61 23 35
## 214 221740 E. Crivelli 24 High/High 49 72 62
## defending physic Pace_z Shooting_z Passing_z Defending_z Physic_z
## 248 76 84 0.86200664 1.8380377 2.445104384 1.63253642 2.1117395
## 100 49 77 -1.23780110 -1.9712895 -3.032788713 -0.25242304 1.3879578
## 202 53 75 -3.05127143 -2.2760357 -2.739330154 0.02683021 1.1811631
## 163 47 50 -1.04690949 -1.8189164 -2.641510635 -0.39204966 -1.4037716
## 64 25 39 0.09844018 0.4666799 -0.098203126 -1.92794255 -2.5411428
## 7 29 38 -0.18789723 0.6952396 -0.000383606 -1.64868930 -2.6445402
## 254 21 45 -0.28334304 -0.3713721 -1.174217841 -2.20719581 -1.9207585
## 200 53 65 -1.33324691 -1.7427299 -2.641510635 0.02683021 0.1471892
## 23 64 54 -0.56968046 -2.0474761 -2.054593517 0.79477666 -0.9901820
## 214 30 83 -1.71503014 1.6856647 0.586533512 -1.57887599 2.0083421
## Dissimilarity
## 248 4.059746
## 100 3.882516
## 202 3.752350
## 163 3.522827
## 64 3.225183
## 7 3.192981
## 254 3.174544
## 200 3.168134
## 23 3.166330
## 214 3.116372
From what is observed there is a significant jump from 3.22 to 3.52 - for practice I will remove the units with dissimilarity result of more than 3.5.
data20 <- data20[data20$Dissimilarity < 3.5,]
data20$Pace_z <- scale(data20$pace)
data20$Shooting_z <- scale(data20$shooting)
data20$Passing_z <- scale(data20$passing)
data20$Defending_z <- scale(data20$defending)
data20$Physic_z <- scale(data20$physic) #After removing the "outliers" I need to standardize variables again.
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(data20[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
method = "euclidian")
distance2 <- distance^2
fviz_dist(distance2)
From what I can observe, there seem to be 5 natural groups forming.
get_clust_tendency(data20[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
n = nrow(data20) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.674727
##
## $plot
## NULL
I want Hopkin`s result to be more than 0.5, to say my data is clusterable. Here I have more - 0.67.
WARD <- data20[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")] %>%
get_dist(method = "euclidean") %>%
hclust(method = "ward.D2") #For Ward`s method I need to use Euclidean squared distance
print(WARD)
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 251
I can see that there are 251 objects now after removing the units. Those 251 need to be classified into group - number of which I will determine in the next steps…
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fviz_dend(WARD,
k = 3,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
Here the last group seems to be too heterogeneous, so I will try to cut dendrogram into 5.
fviz_dend(WARD,
k = 5,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
I choose 5 clusters.
data20$ClusterWard <- cutree(WARD,
k = 5)
head(data20)
## ID name age work_rate pace shooting passing defending physic
## 1 252721 D. Takagi 23 Medium/Medium 74 63 63 33 51
## 2 176794 O. Toivonen 32 Medium/Low 50 77 75 41 73
## 3 237635 F. Pick 23 Medium/Medium 89 54 55 27 50
## 4 248100 Romário Pires 30 Medium/Medium 68 56 59 55 64
## 5 243481 M. Pedersen 19 Medium/Medium 82 48 52 38 47
## 6 227787 Kim Jong Woo 25 High/Medium 72 63 66 59 50
## Pace_z Shooting_z Passing_z Defending_z Physic_z Dissimilarity
## 1 0.66504762 1.0072490 0.6975470 -1.3623645 -1.30250089 2.2438000
## 2 -1.66687721 2.1000163 1.9370819 -0.8050967 0.99907028 3.0546170
## 3 2.12250063 0.3047558 -0.1288096 -1.7803154 -1.40711776 2.2971898
## 4 0.08206641 0.4608654 0.2843687 0.1701221 0.05751844 0.5773289
## 5 1.44235589 -0.1635730 -0.4386933 -1.0140721 -1.72096838 2.0377978
## 6 0.47072055 1.0072490 1.0074307 0.4487560 -1.40711776 2.0310881
## ClusterWard
## 1 1
## 2 2
## 3 3
## 4 4
## 5 3
## 6 1
Initial_leaders <- aggregate(data20[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
by = list(data20$ClusterWard),
FUN = mean)
Initial_leaders
## Group.1 Pace_z Shooting_z Passing_z Defending_z Physic_z
## 1 1 1.05792626 1.31607457 0.9580290 -0.9686427 -0.1744581
## 2 2 -0.02724256 0.83162574 1.0314098 0.4126829 0.6403839
## 3 3 -0.02843329 0.08436577 -0.3880587 -1.1820720 -1.2983983
## 4 4 0.45290723 -0.15446662 0.1190974 0.3698098 0.2161874
## 5 5 -0.79559109 -1.17828547 -1.1007946 0.6109119 0.3507887
K_MEANS <- hkmeans(data20[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
k = 5,
hc.metric = "euclidean",
hc.method = "ward.D2")
K_MEANS
## Hierarchical K-means clustering with 5 clusters of sizes 36, 48, 58, 56, 53
##
## Cluster means:
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## 1 0.81619089 1.26743172 0.7836258 -0.9115193 0.07786061
## 2 -0.32683013 0.73730951 1.0440142 0.5706584 0.38226664
## 3 0.07871594 -0.09359285 -0.5419879 -1.1317709 -1.25199619
## 4 0.59043990 -0.23465864 0.2308769 0.5719023 0.35829195
## 5 -0.96839972 -1.17828547 -1.1286231 0.7365902 0.59244622
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 2 3 4 3 2 3 3 1 3 5 2 1 1 3 2 1 3 3 4
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 3 5 5 5 5 4 3 3 5 4 1 2 2 5 5 4 1 3 5 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 5 1 2 5 3 4 5 1 2 1 3 2 3 1 4 4 5 3 2 4
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 2 5 2 3 2 4 5 2 5 4 3 4 3 2 5 1 4 4 2 5
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 101
## 2 4 4 2 3 4 1 4 5 2 2 5 3 1 3 1 3 2 3 1
## 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
## 5 5 1 2 5 4 3 3 4 2 4 5 4 1 5 5 5 3 2 2
## 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
## 5 1 4 3 4 5 2 3 4 4 2 2 2 4 3 2 4 4 5 2
## 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
## 5 3 5 4 3 1 5 1 4 5 5 3 4 4 2 5 1 2 4 1
## 162 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
## 2 3 4 2 5 2 1 3 1 3 5 3 1 4 1 3 3 4 5 3
## 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 203
## 3 5 2 5 5 4 3 4 5 3 4 5 5 4 1 5 3 5 4 2
## 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
## 2 4 3 3 5 4 1 5 4 1 2 5 4 4 3 2 3 3 4 5
## 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
## 1 3 1 3 2 1 2 2 2 2 4 1 1 2 4 3 3 4 3 4
## 244 245 246 247 249 250 251 252 253 254 255
## 1 5 5 4 4 2 4 3 3 3 4
##
## Within cluster sum of squares by cluster:
## [1] 72.79334 94.73413 157.13840 86.12904 126.60821
## (between_SS / total_SS = 57.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault" "data"
## [11] "hclust"
In the 1st group there are 36 players, 2nd 48 players, 3rd 58 players, 4th 56 players, 5th 53 players.
Write sth. about the cluster means ???
Clustering vector tells me e.g. that the 1st player is in the 1st group… 5th player in the 3rd group etc.
From what I see the clusters are not that homogeneous (57% tells us that ?? ).
fviz_cluster(K_MEANS,
palette = "jama",
repel = FALSE,
ggtheme = theme_classic())
data20$ClusterK_Means <- K_MEANS$cluster
head(data20[c("name", "ClusterWard", "ClusterK_Means")])
## name ClusterWard ClusterK_Means
## 1 D. Takagi 1 1
## 2 O. Toivonen 2 2
## 3 F. Pick 3 3
## 4 Romário Pires 4 4
## 5 M. Pedersen 3 3
## 6 Kim Jong Woo 1 2
As it is shown, some of the objects have been reclassified after using K-Means clustering - e.g. Kim jong woo was reclassified from 1st group to 2nd.
table(data20$ClusterWard)
##
## 1 2 3 4 5
## 23 56 51 60 61
This tells the number of players in certain group initially.
table(data20$ClusterK_Means)
##
## 1 2 3 4 5
## 36 48 58 56 53
This tells the number of players in certain group after K-Means clustering.
table(data20$ClusterWard, data20$ClusterK_Means)
##
## 1 2 3 4 5
## 1 22 1 0 0 0
## 2 8 39 0 9 0
## 3 0 3 46 2 0
## 4 6 4 3 45 2
## 5 0 1 9 0 51
As this table shows: initially after Ward`s in the 1st group there were 23 players. After K-Means clustering there were 36 players in this group. If we look at the structure of the initial 23 players in group 1, 1 went to group 2 and 8 players came from group 2 and 6 from group 4.
Centroids <- K_MEANS$centers
Centroids
## Pace_z Shooting_z Passing_z Defending_z Physic_z
## 1 0.81619089 1.26743172 0.7836258 -0.9115193 0.07786061
## 2 -0.32683013 0.73730951 1.0440142 0.5706584 0.38226664
## 3 0.07871594 -0.09359285 -0.5419879 -1.1317709 -1.25199619
## 4 0.59043990 -0.23465864 0.2308769 0.5719023 0.35829195
## 5 -0.96839972 -1.17828547 -1.1286231 0.7365902 0.59244622
kaj to točno pomen???
library(ggplot2)
library(tidyr)
Figure <- as.data.frame(Centroids)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c(Pace_z, Shooting_z, Passing_z, Defending_z, Physic_z))
Figure$Groups <- factor(Figure$id,
levels = c(1, 2, 3, 4, 5),
labels = c("1", "2", "3", "4", "5"))
Figure$nameFactor <- factor(Figure$name,
levels = c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z"),
labels = c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z"))
ggplot(Figure, aes(x = nameFactor, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Groups, col = Groups), size = 3) +
geom_line(aes(group = id), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables")+
ylim(-1.5, 1.5)
Here I graphically show the deviations from the mean values of the
cluster variables, for each group.
Players in group 1: Here there are above average ratings of defending and physic, which indicates that these players play defending positions.
Players in group 2:Here there are above average ratings of Pace and Shooting, which indicates that these players play attacking positions.
Players in group 3: Here there are above average ratings of shooting, passing, defending and physic which indicates that these players play midfield positions.
Players in group 4:
Players in group 5:
fit <- aov(cbind(Pace_z, Shooting_z, Passing_z, Defending_z, Physic_z) ~ as.factor(ClusterK_Means), data = data20) #Code for all one-way ANOVAs at once
summary(fit)
## Response 1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 4 98.695 24.6737 40.116 < 2.2e-16 ***
## Residuals 246 151.305 0.6151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 4 161.098 40.275 111.44 < 2.2e-16 ***
## Residuals 246 88.902 0.361
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 4 161.958 40.490 113.13 < 2.2e-16 ***
## Residuals 246 88.042 0.358
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 4 166.907 41.727 123.53 < 2.2e-16 ***
## Residuals 246 83.093 0.338
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response 5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 4 123.94 30.9846 60.464 < 2.2e-16 ***
## Residuals 246 126.06 0.5124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Because I have selected 5 variables i need to perform 5 one-way ANOVA tests.
H0: The averages of cluster variables are the same in all groups.
H1: At least one average is different.
Based on sample data, I can reject the null hypothesis (p>0.001). All of my selected variables are statistically significant. I conclude that all cluster variables are successful at classifying units into groups.
aggregate(data20$age,
by = list(data20$ClusterK_Means),
FUN = "mean")
## Group.1 x
## 1 1 27.77778
## 2 2 26.79167
## 3 3 20.94828
## 4 4 24.82143
## 5 5 24.83019
On average the players in Group 1 are the oldest ??? a je to prou??
fit <- aov(age ~ as.factor(ClusterK_Means),
data = data20)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 4 1369 342.2 20.98 6.75e-15 ***
## Residuals 246 4013 16.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I can reject the null hypothesis (p>0.001) (meaning at least one arithmetic mean is different).
chisq_results <- chisq.test(data20$work_rate, as.factor(data20$ClusterK_Means)) #For the categorical variable I need to preform Pearson`s chi-squared test
## Warning in chisq.test(data20$work_rate, as.factor(data20$ClusterK_Means)):
## Chi-squared approximation may be incorrect
chisq_results
##
## Pearson's Chi-squared test
##
## data: data20$work_rate and as.factor(data20$ClusterK_Means)
## X-squared = 83.398, df = 28, p-value = 2.059e-07
Ho?? H1???
Based on the sample data, I can reject the null hypothesis (p>0.001). That means that work rate can be ??
addmargins(chisq_results$observed)
##
## data20$work_rate 1 2 3 4 5 Sum
## High/High 4 2 0 7 1 14
## High/Low 6 2 4 0 0 12
## High/Medium 10 9 6 13 3 41
## Low/High 0 0 0 1 4 5
## Low/Medium 0 2 0 2 4 8
## Medium/High 0 7 2 7 8 24
## Medium/Low 4 1 4 0 0 9
## Medium/Medium 12 25 42 26 33 138
## Sum 36 48 58 56 53 251
round(chisq_results$expected, 2)
##
## data20$work_rate 1 2 3 4 5
## High/High 2.01 2.68 3.24 3.12 2.96
## High/Low 1.72 2.29 2.77 2.68 2.53
## High/Medium 5.88 7.84 9.47 9.15 8.66
## Low/High 0.72 0.96 1.16 1.12 1.06
## Low/Medium 1.15 1.53 1.85 1.78 1.69
## Medium/High 3.44 4.59 5.55 5.35 5.07
## Medium/Low 1.29 1.72 2.08 2.01 1.90
## Medium/Medium 19.79 26.39 31.89 30.79 29.14
round(chisq_results$res, 2)
##
## data20$work_rate 1 2 3 4 5
## High/High 1.41 -0.41 -1.80 2.19 -1.14
## High/Low 3.26 -0.19 0.74 -1.64 -1.59
## High/Medium 1.70 0.41 -1.13 1.27 -1.92
## Low/High -0.85 -0.98 -1.07 -0.11 2.87
## Low/Medium -1.07 0.38 -1.36 0.16 1.78
## Medium/High -1.86 1.13 -1.51 0.71 1.30
## Medium/Low 2.38 -0.55 1.33 -1.42 -1.38
## Medium/Medium -1.75 -0.27 1.79 -0.86 0.72