MVA HOMEWORK 4 (clustering)

Clustering - RQ: Can you divide football players into homogeneous groups based on cluster variables (pace, shooting, passing, dribbling, defending, physic)?

data20 <- read.table("./players_20.csv", fill=TRUE, header=TRUE, sep=",")
data20 <- data20[,c(1,3,5,22,34,35,36,37,38,39)]

set.seed(100)
data20 <- data20[sample(nrow(data20), 300), ]

data20 <- data20[data20$sofifa_id != 251971, ] #I observed when looking through data manually that this ID unit had a mistake in work_rate: Value should be categorical, but it was numeric.

library(tidyr)
data20 <- drop_na(data20) #Drop missing values

data20$work_rate <- factor(data20$work_rate)
data20$short_name <- factor(data20$short_name)
data20$age <- as.integer(data20$age)
data20$pace <- as.integer(data20$pace)
data20$shooting <- as.integer(data20$shooting) 
data20$passing <- as.integer(data20$passing)
data20$dribbling <- as.integer(data20$dribbling)
data20$defending <- as.integer(data20$defending)
data20$physic <- as.integer(data20$physic) #When I did describeBy later, it showed me values as characters - that is why I converted them.

colnames(data20) <- c("ID","name", "age","work_rate", "pace", "shooting", "passing", "dribbling", "defending", "physic") #Renamed the columns.

Descriptive statistics

head(data20, 10) #Table with first 10 rows.

##        ID                name age     work_rate pace shooting passing dribbling
## 1  252721           D. Takagi  23 Medium/Medium   74       63      63        70
## 2  176794         O. Toivonen  32    Medium/Low   50       77      75        73
## 3  237635             F. Pick  23 Medium/Medium   89       54      55        71
## 4  248100       Romário Pires  30 Medium/Medium   68       56      59        66
## 5  243481         M. Pedersen  19 Medium/Medium   82       48      52        64
## 6  227787        Kim Jong Woo  25   High/Medium   72       63      66        68
## 7  242966 N. Clayton-Phillips  19 Medium/Medium   65       59      56        63
## 8  251378         T. Hölscher  19 Medium/Medium   70       57      52        59
## 9  178253                 Ivo  32     High/High   72       65      77        73
## 10 230517      Tiagildo Serra  31 Medium/Medium   59       60      55        66
##    defending physic
## 1         33     51
## 2         41     73
## 3         27     50
## 4         55     64
## 5         38     47
## 6         59     50
## 7         29     38
## 8         45     52
## 9         48     66
## 10        28     50

DESCRIPTION OF VARIABLES

Name: Name of a player
Age: Age of a player in years
Pace: Pace rating (1-100)
Work_rate:
Shooting: Shooting rating (1-100)
Passing: Passing rating (1-100)
Dribbling: Dribbling rating (1-100)
Defending: Defending rating (1-100)
Physic: Physic rating (1-100)

Unit of observation: Football Player in FIFA

Sample size: 255 (after dropping units with missing values)

Source: Kaggle | FIFA 20 complete player dataset

RQ: Can you divide football players into homogeneous groups based on cluster variables (pace, shooting, passing, dribbling, defending, physic)?

summary(data20)

##       ID                       name          age               work_rate  
##  Length:255         A. Al Saluli :  1   Min.   :17.0   Medium/Medium:140  
##  Class :character   A. Bolivar   :  1   1st Qu.:21.0   High/Medium  : 41  
##  Mode  :character   A. Cerri     :  1   Median :24.0   Medium/High  : 24  
##                     A. Ciss      :  1   Mean   :24.7   High/High    : 15  
##                     A. Cruz      :  1   3rd Qu.:28.0   High/Low     : 12  
##                     A. Fiordaliso:  1   Max.   :37.0   Medium/Low   :  9  
##                     (Other)      :249                  (Other)      : 14  
##       pace          shooting        passing     dribbling       defending    
##  Min.   :33.00   Min.   :20.00   Min.   :25   Min.   :31.00   Min.   :18.00  
##  1st Qu.:61.00   1st Qu.:40.00   1st Qu.:49   1st Qu.:56.00   1st Qu.:42.50  
##  Median :67.00   Median :51.00   Median :57   Median :63.00   Median :56.00  
##  Mean   :66.97   Mean   :49.87   Mean   :56   Mean   :60.83   Mean   :52.62  
##  3rd Qu.:74.00   3rd Qu.:59.50   3rd Qu.:63   3rd Qu.:67.50   3rd Qu.:63.00  
##  Max.   :89.00   Max.   :77.00   Max.   :81   Max.   :86.00   Max.   :79.00  
##                                                                              
##      physic     
##  Min.   :38.00  
##  1st Qu.:58.00  
##  Median :65.00  
##  Mean   :63.58  
##  3rd Qu.:71.00  
##  Max.   :84.00  
##

library(psych)
describeBy(data20)

## Warning in describeBy(data20): no grouping variable requested

##            vars   n   mean    sd median trimmed   mad min max range  skew
## ID*           1 255 128.00 73.76    128  128.00 94.89   1 255   254  0.00
## name*         2 255 128.00 73.76    128  128.00 94.89   1 255   254  0.00
## age           3 255  24.70  4.63     24   24.47  5.93  17  37    20  0.36
## work_rate*    4 255   7.07  2.48      9    7.42  0.00   1   9     8 -0.83
## pace          5 255  66.97 10.48     67   67.40  8.90  33  89    56 -0.44
## shooting      6 255  49.87 13.13     51   50.15 14.83  20  77    57 -0.18
## passing       7 255  56.00 10.22     57   56.38 10.38  25  81    56 -0.36
## dribbling     8 255  60.83  9.81     63   61.61  8.90  31  86    55 -0.71
## defending     9 255  52.62 14.32     56   53.67 13.34  18  79    61 -0.62
## physic       10 255  63.58  9.67     65   64.10 10.38  38  84    46 -0.45
##            kurtosis   se
## ID*           -1.21 4.62
## name*         -1.21 4.62
## age           -0.80 0.29
## work_rate*    -0.91 0.16
## pace           0.33 0.66
## shooting      -0.81 0.82
## passing       -0.05 0.64
## dribbling      0.41 0.61
## defending     -0.60 0.90
## physic        -0.38 0.61

Remove ID name from describeby flikn v cht tabelo

data20$Pace_z <- scale(data20$pace)
data20$Shooting_z   <- scale(data20$shooting)
data20$Passing_z <- scale(data20$passing)
data20$Dribbling_z <- scale(data20$dribbling)
data20$Defending_z <- scale(data20$defending)
data20$Physic_z <- scale(data20$physic) #I standardized.

Choosing cluster variables

library(Hmisc)

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:psych':
## 
##     describe

## The following objects are masked from 'package:base':
## 
##     format.pval, units

rcorr(as.matrix(data20[, c("Pace_z", "Shooting_z", "Passing_z", "Dribbling_z", "Defending_z", "Physic_z")]), 
      type = "pearson")

##             Pace_z Shooting_z Passing_z Dribbling_z Defending_z Physic_z
## Pace_z        1.00       0.33      0.31        0.52       -0.19    -0.08
## Shooting_z    0.33       1.00      0.69        0.74       -0.33     0.05
## Passing_z     0.31       0.69      1.00        0.82        0.13     0.12
## Dribbling_z   0.52       0.74      0.82        1.00       -0.17    -0.06
## Defending_z  -0.19      -0.33      0.13       -0.17        1.00     0.55
## Physic_z     -0.08       0.05      0.12       -0.06        0.55     1.00
## 
## n= 255 
## 
## 
## P
##             Pace_z Shooting_z Passing_z Dribbling_z Defending_z Physic_z
## Pace_z             0.0000     0.0000    0.0000      0.0025      0.1948  
## Shooting_z  0.0000            0.0000    0.0000      0.0000      0.4543  
## Passing_z   0.0000 0.0000               0.0000      0.0362      0.0621  
## Dribbling_z 0.0000 0.0000     0.0000                0.0066      0.3497  
## Defending_z 0.0025 0.0000     0.0362    0.0066                  0.0000  
## Physic_z    0.1948 0.4543     0.0621    0.3497      0.0000

From the observed table, we see that dribbling is highly correlated to all of the cluster variables (more than 0.3 everywhere), that is why I decided to drop this variable (but this is not a requirement, just results will be better).

data20 <- data20[, c(-8, -14)]

rcorr(as.matrix(data20[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")]), 
      type = "pearson")

##             Pace_z Shooting_z Passing_z Defending_z Physic_z
## Pace_z        1.00       0.33      0.31       -0.19    -0.08
## Shooting_z    0.33       1.00      0.69       -0.33     0.05
## Passing_z     0.31       0.69      1.00        0.13     0.12
## Defending_z  -0.19      -0.33      0.13        1.00     0.55
## Physic_z     -0.08       0.05      0.12        0.55     1.00
## 
## n= 255 
## 
## 
## P
##             Pace_z Shooting_z Passing_z Defending_z Physic_z
## Pace_z             0.0000     0.0000    0.0025      0.1948  
## Shooting_z  0.0000            0.0000    0.0000      0.4543  
## Passing_z   0.0000 0.0000               0.0362      0.0621  
## Defending_z 0.0025 0.0000     0.0362                0.0000  
## Physic_z    0.1948 0.4543     0.0621    0.0000

Outliers

data20$Dissimilarity <- sqrt(data20$Shooting_z^2 + data20$Passing_z^2 + data20$Defending_z^2 + data20$Physic_z^2)

head(data20[order(-data20$Dissimilarity), ], 10)

##         ID                name age     work_rate pace shooting passing
## 248 224334            M. Acuña  27     High/High   76       74      81
## 100 244902           F. Castro  24                 54       24      25
## 202 239853          M. Kryeziu  22 Medium/Medium   35       20      28
## 163 251124          F. Nevarez  18 Medium/Medium   56       26      29
## 64  239821         L. Matarese  21 Medium/Medium   68       56      55
## 7   242966 N. Clayton-Phillips  19 Medium/Medium   65       59      56
## 254 251823      T. Tattermusch  18 Medium/Medium   64       45      44
## 200 252013             R. Hauk  20 Medium/Medium   53       27      29
## 23  252557          J. Jiménez  24      Low/High   61       23      35
## 214 221740         E. Crivelli  24     High/High   49       72      62
##     defending physic      Pace_z Shooting_z    Passing_z Defending_z   Physic_z
## 248        76     84  0.86200664  1.8380377  2.445104384  1.63253642  2.1117395
## 100        49     77 -1.23780110 -1.9712895 -3.032788713 -0.25242304  1.3879578
## 202        53     75 -3.05127143 -2.2760357 -2.739330154  0.02683021  1.1811631
## 163        47     50 -1.04690949 -1.8189164 -2.641510635 -0.39204966 -1.4037716
## 64         25     39  0.09844018  0.4666799 -0.098203126 -1.92794255 -2.5411428
## 7          29     38 -0.18789723  0.6952396 -0.000383606 -1.64868930 -2.6445402
## 254        21     45 -0.28334304 -0.3713721 -1.174217841 -2.20719581 -1.9207585
## 200        53     65 -1.33324691 -1.7427299 -2.641510635  0.02683021  0.1471892
## 23         64     54 -0.56968046 -2.0474761 -2.054593517  0.79477666 -0.9901820
## 214        30     83 -1.71503014  1.6856647  0.586533512 -1.57887599  2.0083421
##     Dissimilarity
## 248      4.059746
## 100      3.882516
## 202      3.752350
## 163      3.522827
## 64       3.225183
## 7        3.192981
## 254      3.174544
## 200      3.168134
## 23       3.166330
## 214      3.116372

From what is observed there is a significant jump from 3.22 to 3.52 - for practice I will remove the units with dissimilarity result of more than 3.5.

data20 <- data20[data20$Dissimilarity < 3.5,]

data20$Pace_z <- scale(data20$pace)
data20$Shooting_z   <- scale(data20$shooting)
data20$Passing_z <- scale(data20$passing)
data20$Defending_z <- scale(data20$defending)
data20$Physic_z <- scale(data20$physic) #After removing the "outliers" I need to standardize variables again.

Is data clusterable?

library(factoextra)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

distance <- get_dist(data20[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")], 
                     method = "euclidian")

distance2 <- distance^2

fviz_dist(distance2)

From what I can observe, there seem to be 5 natural groups forming.

get_clust_tendency(data20[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")],
                   n = nrow(data20) - 1, 
                   graph = FALSE)

## $hopkins_stat
## [1] 0.674727
## 
## $plot
## NULL

I want Hopkin`s result to be more than 0.5, to say my data is clusterable. Here I have more - 0.67.

Clustering

WARD <- data20[c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")] %>%
  get_dist(method = "euclidean") %>%  
  hclust(method = "ward.D2") #For Ward`s method I need to use Euclidean squared distance 

print(WARD)

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 251

I can see that there are 251 objects now after removing the units. Those 251 need to be classified into group - number of which I will determine in the next steps…

fviz_dend(WARD)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_dend(WARD, 
          k = 3,
          cex = 0.5, 
          palette = "jama",
          color_labels_by_k = TRUE, 
          rect = TRUE)

Here the last group seems to be too heterogeneous, so I will try to cut dendrogram into 5.

fviz_dend(WARD, 
          k = 5,
          cex = 0.5, 
          palette = "jama",
          color_labels_by_k = TRUE, 
          rect = TRUE)

I choose 5 clusters.

data20$ClusterWard <- cutree(WARD, 
                             k = 5)
head(data20)

##       ID          name age     work_rate pace shooting passing defending physic
## 1 252721     D. Takagi  23 Medium/Medium   74       63      63        33     51
## 2 176794   O. Toivonen  32    Medium/Low   50       77      75        41     73
## 3 237635       F. Pick  23 Medium/Medium   89       54      55        27     50
## 4 248100 Romário Pires  30 Medium/Medium   68       56      59        55     64
## 5 243481   M. Pedersen  19 Medium/Medium   82       48      52        38     47
## 6 227787  Kim Jong Woo  25   High/Medium   72       63      66        59     50
##        Pace_z Shooting_z  Passing_z Defending_z    Physic_z Dissimilarity
## 1  0.66504762  1.0072490  0.6975470  -1.3623645 -1.30250089     2.2438000
## 2 -1.66687721  2.1000163  1.9370819  -0.8050967  0.99907028     3.0546170
## 3  2.12250063  0.3047558 -0.1288096  -1.7803154 -1.40711776     2.2971898
## 4  0.08206641  0.4608654  0.2843687   0.1701221  0.05751844     0.5773289
## 5  1.44235589 -0.1635730 -0.4386933  -1.0140721 -1.72096838     2.0377978
## 6  0.47072055  1.0072490  1.0074307   0.4487560 -1.40711776     2.0310881
##   ClusterWard
## 1           1
## 2           2
## 3           3
## 4           4
## 5           3
## 6           1

Initial_leaders <- aggregate(data20[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")], 
                            by = list(data20$ClusterWard), 
                            FUN = mean)

Initial_leaders

##   Group.1      Pace_z  Shooting_z  Passing_z Defending_z   Physic_z
## 1       1  1.05792626  1.31607457  0.9580290  -0.9686427 -0.1744581
## 2       2 -0.02724256  0.83162574  1.0314098   0.4126829  0.6403839
## 3       3 -0.02843329  0.08436577 -0.3880587  -1.1820720 -1.2983983
## 4       4  0.45290723 -0.15446662  0.1190974   0.3698098  0.2161874
## 5       5 -0.79559109 -1.17828547 -1.1007946   0.6109119  0.3507887

K_MEANS <- hkmeans(data20[, c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z")], 
                   k = 5,
                   hc.metric = "euclidean",
                   hc.method = "ward.D2")

K_MEANS

## Hierarchical K-means clustering with 5 clusters of sizes 36, 48, 58, 56, 53
## 
## Cluster means:
##        Pace_z  Shooting_z  Passing_z Defending_z    Physic_z
## 1  0.81619089  1.26743172  0.7836258  -0.9115193  0.07786061
## 2 -0.32683013  0.73730951  1.0440142   0.5706584  0.38226664
## 3  0.07871594 -0.09359285 -0.5419879  -1.1317709 -1.25199619
## 4  0.59043990 -0.23465864  0.2308769   0.5719023  0.35829195
## 5 -0.96839972 -1.17828547 -1.1286231   0.7365902  0.59244622
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   2   3   4   3   2   3   3   1   3   5   2   1   1   3   2   1   3   3   4 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   3   5   5   5   5   4   3   3   5   4   1   2   2   5   5   4   1   3   5   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   5   1   2   5   3   4   5   1   2   1   3   2   3   1   4   4   5   3   2   4 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   2   5   2   3   2   4   5   2   5   4   3   4   3   2   5   1   4   4   2   5 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 101 
##   2   4   4   2   3   4   1   4   5   2   2   5   3   1   3   1   3   2   3   1 
## 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 
##   5   5   1   2   5   4   3   3   4   2   4   5   4   1   5   5   5   3   2   2 
## 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 
##   5   1   4   3   4   5   2   3   4   4   2   2   2   4   3   2   4   4   5   2 
## 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 
##   5   3   5   4   3   1   5   1   4   5   5   3   4   4   2   5   1   2   4   1 
## 162 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 
##   2   3   4   2   5   2   1   3   1   3   5   3   1   4   1   3   3   4   5   3 
## 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 203 
##   3   5   2   5   5   4   3   4   5   3   4   5   5   4   1   5   3   5   4   2 
## 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 
##   2   4   3   3   5   4   1   5   4   1   2   5   4   4   3   2   3   3   4   5 
## 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 
##   1   3   1   3   2   1   2   2   2   2   4   1   1   2   4   3   3   4   3   4 
## 244 245 246 247 249 250 251 252 253 254 255 
##   1   5   5   4   4   2   4   3   3   3   4 
## 
## Within cluster sum of squares by cluster:
## [1]  72.79334  94.73413 157.13840  86.12904 126.60821
##  (between_SS / total_SS =  57.0 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "data"        
## [11] "hclust"

In the 1st group there are 36 players, 2nd 48 players, 3rd 58 players, 4th 56 players, 5th 53 players.

Write sth. about the cluster means ???

Clustering vector tells me e.g. that the 1st player is in the 1st group… 5th player in the 3rd group etc.

From what I see the clusters are not that homogeneous (57% tells us that ?? ).

fviz_cluster(K_MEANS, 
             palette = "jama", 
             repel = FALSE,
             ggtheme = theme_classic())

data20$ClusterK_Means <- K_MEANS$cluster
head(data20[c("name", "ClusterWard", "ClusterK_Means")])

##            name ClusterWard ClusterK_Means
## 1     D. Takagi           1              1
## 2   O. Toivonen           2              2
## 3       F. Pick           3              3
## 4 Romário Pires           4              4
## 5   M. Pedersen           3              3
## 6  Kim Jong Woo           1              2

As it is shown, some of the objects have been reclassified after using K-Means clustering - e.g. Kim jong woo was reclassified from 1st group to 2nd.

table(data20$ClusterWard)

## 
##  1  2  3  4  5 
## 23 56 51 60 61

This tells the number of players in certain group initially.

table(data20$ClusterK_Means)

## 
##  1  2  3  4  5 
## 36 48 58 56 53

This tells the number of players in certain group after K-Means clustering.

table(data20$ClusterWard, data20$ClusterK_Means)

##    
##      1  2  3  4  5
##   1 22  1  0  0  0
##   2  8 39  0  9  0
##   3  0  3 46  2  0
##   4  6  4  3 45  2
##   5  0  1  9  0 51

As this table shows: initially after Ward`s in the 1st group there were 23 players. After K-Means clustering there were 36 players in this group. If we look at the structure of the initial 23 players in group 1, 1 went to group 2 and 8 players came from group 2 and 6 from group 4.

Centroids <- K_MEANS$centers
Centroids

##        Pace_z  Shooting_z  Passing_z Defending_z    Physic_z
## 1  0.81619089  1.26743172  0.7836258  -0.9115193  0.07786061
## 2 -0.32683013  0.73730951  1.0440142   0.5706584  0.38226664
## 3  0.07871594 -0.09359285 -0.5419879  -1.1317709 -1.25199619
## 4  0.59043990 -0.23465864  0.2308769   0.5719023  0.35829195
## 5 -0.96839972 -1.17828547 -1.1286231   0.7365902  0.59244622

kaj to točno pomen???

library(ggplot2)
library(tidyr)

Figure <- as.data.frame(Centroids)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c(Pace_z, Shooting_z, Passing_z, Defending_z, Physic_z))

Figure$Groups <- factor(Figure$id, 
                        levels = c(1, 2, 3, 4, 5), 
                        labels = c("1", "2", "3", "4", "5"))

Figure$nameFactor <- factor(Figure$name, 
                            levels = c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z"), 
                            labels = c("Pace_z", "Shooting_z", "Passing_z", "Defending_z", "Physic_z"))

ggplot(Figure, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Groups, col = Groups), size = 3) +
  geom_line(aes(group = id), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables")+
  ylim(-1.5, 1.5)

Here I graphically show the deviations from the mean values of the cluster variables, for each group.

Players in group 1: Here there are above average ratings of defending and physic, which indicates that these players play defending positions.
Players in group 2:Here there are above average ratings of Pace and Shooting, which indicates that these players play attacking positions.
Players in group 3: Here there are above average ratings of shooting, passing, defending and physic which indicates that these players play midfield positions.
Players in group 4:
Players in group 5:

fit <- aov(cbind(Pace_z, Shooting_z, Passing_z, Defending_z, Physic_z) ~ as.factor(ClusterK_Means), data = data20) #Code for all one-way ANOVAs at once

summary(fit)

##  Response 1 :
##                            Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   4  98.695 24.6737  40.116 < 2.2e-16 ***
## Residuals                 246 151.305  0.6151                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##                            Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   4 161.098  40.275  111.44 < 2.2e-16 ***
## Residuals                 246  88.902   0.361                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##                            Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   4 161.958  40.490  113.13 < 2.2e-16 ***
## Residuals                 246  88.042   0.358                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##                            Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   4 166.907  41.727  123.53 < 2.2e-16 ***
## Residuals                 246  83.093   0.338                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 5 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   4 123.94 30.9846  60.464 < 2.2e-16 ***
## Residuals                 246 126.06  0.5124                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Because I have selected 5 variables i need to perform 5 one-way ANOVA tests.

H0: The averages of cluster variables are the same in all groups.
H1: At least one average is different.

Based on sample data, I can reject the null hypothesis (p>0.001). All of my selected variables are statistically significant. I conclude that all cluster variables are successful at classifying units into groups.

Validation

aggregate(data20$age, 
          by = list(data20$ClusterK_Means), 
          FUN = "mean")

##   Group.1        x
## 1       1 27.77778
## 2       2 26.79167
## 3       3 20.94828
## 4       4 24.82143
## 5       5 24.83019

On average the players in Group 1 are the oldest ??? a je to prou??

fit <- aov(age ~ as.factor(ClusterK_Means), 
           data = data20)

summary(fit)

##                            Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(ClusterK_Means)   4   1369   342.2   20.98 6.75e-15 ***
## Residuals                 246   4013    16.3                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

I can reject the null hypothesis (p>0.001) (meaning at least one arithmetic mean is different).

chisq_results <- chisq.test(data20$work_rate, as.factor(data20$ClusterK_Means)) #For the categorical variable I need to preform Pearson`s chi-squared test

## Warning in chisq.test(data20$work_rate, as.factor(data20$ClusterK_Means)):
## Chi-squared approximation may be incorrect

chisq_results

## 
##  Pearson's Chi-squared test
## 
## data:  data20$work_rate and as.factor(data20$ClusterK_Means)
## X-squared = 83.398, df = 28, p-value = 2.059e-07

Ho?? H1???

Based on the sample data, I can reject the null hypothesis (p>0.001). That means that work rate can be ??

addmargins(chisq_results$observed)

##                 
## data20$work_rate   1   2   3   4   5 Sum
##    High/High       4   2   0   7   1  14
##    High/Low        6   2   4   0   0  12
##    High/Medium    10   9   6  13   3  41
##    Low/High        0   0   0   1   4   5
##    Low/Medium      0   2   0   2   4   8
##    Medium/High     0   7   2   7   8  24
##    Medium/Low      4   1   4   0   0   9
##    Medium/Medium  12  25  42  26  33 138
##    Sum            36  48  58  56  53 251

round(chisq_results$expected, 2)

##                 
## data20$work_rate     1     2     3     4     5
##    High/High      2.01  2.68  3.24  3.12  2.96
##    High/Low       1.72  2.29  2.77  2.68  2.53
##    High/Medium    5.88  7.84  9.47  9.15  8.66
##    Low/High       0.72  0.96  1.16  1.12  1.06
##    Low/Medium     1.15  1.53  1.85  1.78  1.69
##    Medium/High    3.44  4.59  5.55  5.35  5.07
##    Medium/Low     1.29  1.72  2.08  2.01  1.90
##    Medium/Medium 19.79 26.39 31.89 30.79 29.14

round(chisq_results$res, 2)

##                 
## data20$work_rate     1     2     3     4     5
##    High/High      1.41 -0.41 -1.80  2.19 -1.14
##    High/Low       3.26 -0.19  0.74 -1.64 -1.59
##    High/Medium    1.70  0.41 -1.13  1.27 -1.92
##    Low/High      -0.85 -0.98 -1.07 -0.11  2.87
##    Low/Medium    -1.07  0.38 -1.36  0.16  1.78
##    Medium/High   -1.86  1.13 -1.51  0.71  1.30
##    Medium/Low     2.38 -0.55  1.33 -1.42 -1.38
##    Medium/Medium -1.75 -0.27  1.79 -0.86  0.72