# Global knitr chunk options: show the code of every chunk, but keep
# messages and warnings out of the rendered output.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)

Introduction

# The data used for this project was obtained from the EA Sports video game UFC 2025 https://www.ea.com/games/ufc/ufc-5. I have chosen to work with the top 5 rated fighters in each weight division. The data includes both men and women fighters. The women have 3 weight classes and the men have 8 weight classes. For the purpose of this project I have chosen to work with the following character attributes: "punch power", "kick power", "punch speed", "kick speed", "body strength", "leg strength" and "takedowns".
library(dplyr)
library(cluster)
library(factoextra)
library(mclust)
library(cluster)
library(ggplot2)
library(readxl)
# Read the fighter ratings workbook into `ufc` and show its column types.
# NOTE(review): the path is absolute and machine-specific -- confirm it
# exists on the machine that renders this report.
ufc <- read_excel(
  path = "C:/Users/mzing/Desktop/USL/UFC 2025/UFC.xlsx"
)
str(object = ufc)
## tibble [55 × 13] (S3: tbl_df/tbl/data.frame)
##  $ Fighter Name : chr [1:55] "Amanda Nunes" "Valentina Shevchenko" "Holly Holm" "Julianna Pena" ...
##  $ Gender       : chr [1:55] "F" "F" "F" "F" ...
##  $ Weight Class : chr [1:55] "Bantamweight" "Bantamweight" "Bantamweight" "Bantamweight" ...
##  $ Ranking      : num [1:55] 1 2 3 4 5 1 2 3 4 5 ...
##  $ Nationality  : chr [1:55] "Brazil" "Kyrgyzstani" "USA" "USA" ...
##  $ Stars        : num [1:55] 5 4.5 4.5 4.5 4 4.5 4.5 4.5 4.5 4.5 ...
##  $ Punch Power  : num [1:55] 98 92 90 90 87 95 88 92 95 90 ...
##  $ Kick Power   : num [1:55] 94 93 96 96 85 93 91 93 95 88 ...
##  $ Punch Speed  : num [1:55] 96 94 92 92 90 98 93 94 93 92 ...
##  $ Kick Speed   : num [1:55] 94 95 96 96 87 97 95 94 93 92 ...
##  $ Body Strength: num [1:55] 95 92 96 96 90 92 92 93 90 94 ...
##  $ Leg Strength : num [1:55] 95 89 97 97 89 89 91 91 95 89 ...
##  $ Takedowns    : num [1:55] 92 91 90 90 92 86 92 92 88 97 ...
# Preview the first six rows of the raw data.
head(x = ufc, n = 6L)
## # A tibble: 6 × 13
##   `Fighter Name`   Gender `Weight Class` Ranking Nationality Stars `Punch Power`
##   <chr>            <chr>  <chr>            <dbl> <chr>       <dbl>         <dbl>
## 1 Amanda Nunes     F      Bantamweight         1 Brazil        5              98
## 2 Valentina Shevc… F      Bantamweight         2 Kyrgyzstani   4.5            92
## 3 Holly Holm       F      Bantamweight         3 USA           4.5            90
## 4 Julianna Pena    F      Bantamweight         4 USA           4.5            90
## 5 Meisha Tate      F      Bantamweight         5 USA           4              87
## 6 Sean O'malley    M      Bantamweight         1 USA           4.5            95
## # ℹ 6 more variables: `Kick Power` <dbl>, `Punch Speed` <dbl>,
## #   `Kick Speed` <dbl>, `Body Strength` <dbl>, `Leg Strength` <dbl>,
## #   Takedowns <dbl>

Data Preparation

# Separating the male from the female fighters

# Rows whose Gender is "M". Passing the tibble subset through data.frame()
# converts the spaced column names to dotted ones (e.g. Fighter.Name).
ufc_men <- data.frame(
  ufc[ufc[["Gender"]] == "M", ]
)
ufc_men
##             Fighter.Name Gender      Weight.Class Ranking    Nationality Stars
## 1          Sean O'malley      M      Bantamweight       1            USA   4.5
## 2      Aljamain Sterling      M      Bantamweight       2            USA   4.5
## 3         Cory Sandhagen      M      Bantamweight       3            USA   4.5
## 4            Marlon Vera      M      Bantamweight       4        Ecuador   4.5
## 5      Merab Dvalishvili      M      Bantamweight       5        Georgia   4.5
## 6  Alexander Volkanovski      M     Featherweight       1      Australia   5.0
## 7           Max Holloway      M     Featherweight       2            USA   4.5
## 8         Conor McGregor      M     Featherweight       3        Ireland   4.5
## 9           Ilia Torupia      M     Featherweight       4        Georgia   4.5
## 10             Jose Aldo      M     Featherweight       5         Brazil   4.5
## 11     Alexandre Pantoja      M         Flyweight       1         Brazil   4.5
## 12        Brandon Moreno      M         Flyweight       2         Mexico   4.5
## 13     Demetrius Johnson      M         Flyweight       3            USA   4.5
## 14          Henry Cejudo      M         Flyweight       4            USA   4.5
## 15    Deiveson Figueredo      M         Flyweight       5         Brazil   4.5
## 16             Jon Jones      M       Heavyweight       1            USA   5.0
## 17     Fedor Emelianenko      M       Heavyweight       2         Russia   5.0
## 18        Daniel Cormier      M       Heavyweight       3            USA   4.5
## 19          Stipe Miocic      M       Heavyweight       4            USA   4.5
## 20          Tom Aspinall      M       Heavyweight       5        England   4.5
## 21             Jon Jones      M Light Heavyweight       1            USA   5.0
## 22          Alex Pereira      M Light Heavyweight       2         Brazil   4.5
## 23        Daniel Cormier      M Light Heavyweight       3            USA   4.5
## 24        Jiri Prochazka      M Light Heavyweight       4 Czech Republic   4.5
## 25          Jamahal Hill      M Light Heavyweight       5            USA   4.5
## 26   Khabib Nurmagomedov      M       Lightweight       1         Russia   5.0
## 27       Islam Makhachev      M       Lightweight       2         Russia   5.0
## 28        Justin Gaethje      M       Lightweight       3            USA   4.5
## 29      Charles Oliveira      M       Lightweight       4         Brazil   4.5
## 30        Dustin Poirier      M       Lightweight       5            USA   4.5
## 31        Anderson Silva      M      Middleweight       1         Brazil   5.0
## 32       Israel Adesanya      M      Middleweight       2        Nigeria   5.0
## 33       Sean Strickland      M      Middleweight       3            USA   4.5
## 34       Michael Bisping      M      Middleweight       4        England   4.5
## 35     Dricus Du Plessis      M      Middleweight       5   South Africa   4.5
## 36     Georges St Pierre      M      Welterweight       1         Canada   5.0
## 37          Leon Edwards      M      Welterweight       2        England   4.5
## 38          Kamura Usman      M      Welterweight       3            USA   4.5
## 39       Colby Covington      M      Welterweight       4            USA   4.5
## 40     Shavkat Rakhmonov      M      Welterweight       5     Kazakhstan   4.5
##    Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1           95         93          98         97            92           89
## 2           88         91          93         95            92           91
## 3           92         93          94         94            93           91
## 4           95         95          93         93            90           95
## 5           90         88          92         92            94           89
## 6           94         93          97         95            95           95
## 7           89         88          96         92            93           87
## 8           97         91          98         92            94           94
## 9           95         90          95         91            92           90
## 10          95         97          97         96            91           93
## 11          95         90          94         92            93           94
## 12          91         92          97         93            94           89
## 13          88         89          98         96            91           94
## 14          95         93          94         93            92           89
## 15          98         93          95         94            93           95
## 16          86         88          89         95            92           88
## 17          95         87          97         90            94           93
## 18          92         89          92         89            89           92
## 19          93         90          95         89            95           95
## 20          94         91          95         92            90           92
## 21          86         88          89         95            92           90
## 22          97         96          96         95            92           95
## 23          93         89          92         89            89           92
## 24          96         92          95         92            91           90
## 25          98         94          94         93            90           89
## 26          91         87          90         87            92           89
## 27          94         90          94         92            92           90
## 28          95         98          94         95            93           96
## 29          95         93          94         94            91           93
## 30          95         90          96         92            91           92
## 31          94         96          98         94            95           93
## 32          95         95          96         97            95           94
## 33          89         88          95         90            94           94
## 34          92         88          95         90            92           92
## 35          95         91          93         93            88           90
## 36          92         92          93         94            95           95
## 37          91         95          94         95            95           94
## 38          94         88          92         89            92           93
## 39          89         89          91         90            94           94
## 40          94         94          94         94            90           90
##    Takedowns
## 1         86
## 2         92
## 3         92
## 4         88
## 5         97
## 6         93
## 7         87
## 8         86
## 9         93
## 10        87
## 11        92
## 12        90
## 13        95
## 14        95
## 15        88
## 16        93
## 17        89
## 18        96
## 19        90
## 20        93
## 21        93
## 22        84
## 23        97
## 24        84
## 25        88
## 26        98
## 27        99
## 28        86
## 29        93
## 30        90
## 31        94
## 32        86
## 33        91
## 34        87
## 35        92
## 36        99
## 37        92
## 38        97
## 39        96
## 40        91
# The seven rating columns used as clustering features for the male fighters.
male_fighters <- ufc_men[c(
  "Punch.Power", "Kick.Power",
  "Punch.Speed", "Kick.Speed",
  "Body.Strength", "Leg.Strength",
  "Takedowns"
)]

male_fighters
##    Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1           95         93          98         97            92           89
## 2           88         91          93         95            92           91
## 3           92         93          94         94            93           91
## 4           95         95          93         93            90           95
## 5           90         88          92         92            94           89
## 6           94         93          97         95            95           95
## 7           89         88          96         92            93           87
## 8           97         91          98         92            94           94
## 9           95         90          95         91            92           90
## 10          95         97          97         96            91           93
## 11          95         90          94         92            93           94
## 12          91         92          97         93            94           89
## 13          88         89          98         96            91           94
## 14          95         93          94         93            92           89
## 15          98         93          95         94            93           95
## 16          86         88          89         95            92           88
## 17          95         87          97         90            94           93
## 18          92         89          92         89            89           92
## 19          93         90          95         89            95           95
## 20          94         91          95         92            90           92
## 21          86         88          89         95            92           90
## 22          97         96          96         95            92           95
## 23          93         89          92         89            89           92
## 24          96         92          95         92            91           90
## 25          98         94          94         93            90           89
## 26          91         87          90         87            92           89
## 27          94         90          94         92            92           90
## 28          95         98          94         95            93           96
## 29          95         93          94         94            91           93
## 30          95         90          96         92            91           92
## 31          94         96          98         94            95           93
## 32          95         95          96         97            95           94
## 33          89         88          95         90            94           94
## 34          92         88          95         90            92           92
## 35          95         91          93         93            88           90
## 36          92         92          93         94            95           95
## 37          91         95          94         95            95           94
## 38          94         88          92         89            92           93
## 39          89         89          91         90            94           94
## 40          94         94          94         94            90           90
##    Takedowns
## 1         86
## 2         92
## 3         92
## 4         88
## 5         97
## 6         93
## 7         87
## 8         86
## 9         93
## 10        87
## 11        92
## 12        90
## 13        95
## 14        95
## 15        88
## 16        93
## 17        89
## 18        96
## 19        90
## 20        93
## 21        93
## 22        84
## 23        97
## 24        84
## 25        88
## 26        98
## 27        99
## 28        86
## 29        93
## 30        90
## 31        94
## 32        86
## 33        91
## 34        87
## 35        92
## 36        99
## 37        92
## 38        97
## 39        96
## 40        91
# Female Fighters

# Rows whose Gender is "F". Passing the tibble subset through data.frame()
# converts the spaced column names to dotted ones (e.g. Fighter.Name).
ufc_women <- data.frame(
  ufc[ufc[["Gender"]] == "F", ]
)
ufc_women
##            Fighter.Name Gender Weight.Class Ranking Nationality Stars
## 1          Amanda Nunes      F Bantamweight       1      Brazil   5.0
## 2  Valentina Shevchenko      F Bantamweight       2 Kyrgyzstani   4.5
## 3            Holly Holm      F Bantamweight       3         USA   4.5
## 4         Julianna Pena      F Bantamweight       4         USA   4.5
## 5           Meisha Tate      F Bantamweight       5         USA   4.0
## 6  Valentina Shevchenko      F    Flyweight       1 Kyrgyzstani   5.0
## 7    Joanna Jedrzejczyk      F    Flyweight       2      Poland   4.5
## 8          Alexa Grasso      F    Flyweight       3      Mexico   4.5
## 9      Erin Blanchfield      F    Flyweight       4         USA   4.5
## 10         Taila Santos      F    Flyweight       5      Brazil   4.5
## 11   Joanna Jedrzejczyk      F  Strawweight       1      Poland   5.0
## 12          Zhang Weili      F  Strawweight       2       China   4.5
## 13       Rose Namajunas      F  Strawweight       3         USA   4.5
## 14       Mackenzie Dern      F  Strawweight       4         USA   4.5
## 15         Amanda Lemos      F  Strawweight       5      Brazil   4.5
##    Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1           98         94          96         94            95           95
## 2           92         93          94         95            92           89
## 3           90         96          92         96            96           97
## 4           90         96          92         96            96           97
## 5           87         85          90         87            90           89
## 6           92         94          95         96            92           93
## 7           88         89          96         95            91           92
## 8           93         89          96         92            91           90
## 9           89         88          91         90            92           89
## 10          94         92          93         91            90           92
## 11          92         92          96         95            91           92
## 12          96         92          96         94            95           94
## 13          92         93          95         94            91           94
## 14          93         90          94         89            90           90
## 15          97         95          91         93            90           90
##    Takedowns
## 1         92
## 2         91
## 3         90
## 4         90
## 5         92
## 6         95
## 7         84
## 8         89
## 9         91
## 10        95
## 11        84
## 12        93
## 13        93
## 14        90
## 15        88
# The seven rating columns used as clustering features for the female fighters.
female_fighters <- ufc_women[c(
  "Punch.Power", "Kick.Power",
  "Punch.Speed", "Kick.Speed",
  "Body.Strength", "Leg.Strength",
  "Takedowns"
)]

female_fighters
##    Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1           98         94          96         94            95           95
## 2           92         93          94         95            92           89
## 3           90         96          92         96            96           97
## 4           90         96          92         96            96           97
## 5           87         85          90         87            90           89
## 6           92         94          95         96            92           93
## 7           88         89          96         95            91           92
## 8           93         89          96         92            91           90
## 9           89         88          91         90            92           89
## 10          94         92          93         91            90           92
## 11          92         92          96         95            91           92
## 12          96         92          96         94            95           94
## 13          92         93          95         94            91           94
## 14          93         90          94         89            90           90
## 15          97         95          91         93            90           90
##    Takedowns
## 1         92
## 2         91
## 3         90
## 4         90
## 5         92
## 6         95
## 7         84
## 8         89
## 9         91
## 10        95
## 11        84
## 12        93
## 13        93
## 14        90
## 15        88
# All UFC fighters combined


# Feature table for all fighters (men and women together): pick the seven
# rating columns from the raw tibble and coerce each one to numeric.
# data.frame() turns the spaced column names into dotted ones.
ufc_fighters <- data.frame(
  lapply(
    ufc[c(
      "Punch Power", "Kick Power",
      "Punch Speed", "Kick Speed",
      "Body Strength", "Leg Strength",
      "Takedowns"
    )],
    as.numeric
  )
)

print(ufc_fighters)
##    Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1           98         94          96         94            95           95
## 2           92         93          94         95            92           89
## 3           90         96          92         96            96           97
## 4           90         96          92         96            96           97
## 5           87         85          90         87            90           89
## 6           95         93          98         97            92           89
## 7           88         91          93         95            92           91
## 8           92         93          94         94            93           91
## 9           95         95          93         93            90           95
## 10          90         88          92         92            94           89
## 11          94         93          97         95            95           95
## 12          89         88          96         92            93           87
## 13          97         91          98         92            94           94
## 14          95         90          95         91            92           90
## 15          95         97          97         96            91           93
## 16          92         94          95         96            92           93
## 17          88         89          96         95            91           92
## 18          93         89          96         92            91           90
## 19          89         88          91         90            92           89
## 20          94         92          93         91            90           92
## 21          95         90          94         92            93           94
## 22          91         92          97         93            94           89
## 23          88         89          98         96            91           94
## 24          95         93          94         93            92           89
## 25          98         93          95         94            93           95
## 26          86         88          89         95            92           88
## 27          95         87          97         90            94           93
## 28          92         89          92         89            89           92
## 29          93         90          95         89            95           95
## 30          94         91          95         92            90           92
## 31          86         88          89         95            92           90
## 32          97         96          96         95            92           95
## 33          93         89          92         89            89           92
## 34          96         92          95         92            91           90
## 35          98         94          94         93            90           89
## 36          91         87          90         87            92           89
## 37          94         90          94         92            92           90
## 38          95         98          94         95            93           96
## 39          95         93          94         94            91           93
## 40          95         90          96         92            91           92
## 41          94         96          98         94            95           93
## 42          95         95          96         97            95           94
## 43          89         88          95         90            94           94
## 44          92         88          95         90            92           92
## 45          95         91          93         93            88           90
## 46          92         92          96         95            91           92
## 47          96         92          96         94            95           94
## 48          92         93          95         94            91           94
## 49          93         90          94         89            90           90
## 50          97         95          91         93            90           90
## 51          92         92          93         94            95           95
## 52          91         95          94         95            95           94
## 53          94         88          92         89            92           93
## 54          89         89          91         90            94           94
## 55          94         94          94         94            90           90
##    Takedowns
## 1         92
## 2         91
## 3         90
## 4         90
## 5         92
## 6         86
## 7         92
## 8         92
## 9         88
## 10        97
## 11        93
## 12        87
## 13        86
## 14        93
## 15        87
## 16        95
## 17        84
## 18        89
## 19        91
## 20        95
## 21        92
## 22        90
## 23        95
## 24        95
## 25        88
## 26        93
## 27        89
## 28        96
## 29        90
## 30        93
## 31        93
## 32        84
## 33        97
## 34        84
## 35        88
## 36        98
## 37        99
## 38        86
## 39        93
## 40        90
## 41        94
## 42        86
## 43        91
## 44        87
## 45        92
## 46        84
## 47        93
## 48        93
## 49        90
## 50        88
## 51        99
## 52        92
## 53        97
## 54        96
## 55        91
# Confirm that every feature column is numeric.
str(object = ufc_fighters)
## 'data.frame':    55 obs. of  7 variables:
##  $ Punch.Power  : num  98 92 90 90 87 95 88 92 95 90 ...
##  $ Kick.Power   : num  94 93 96 96 85 93 91 93 95 88 ...
##  $ Punch.Speed  : num  96 94 92 92 90 98 93 94 93 92 ...
##  $ Kick.Speed   : num  94 95 96 96 87 97 95 94 93 92 ...
##  $ Body.Strength: num  95 92 96 96 90 92 92 93 90 94 ...
##  $ Leg.Strength : num  95 89 97 97 89 89 91 91 95 89 ...
##  $ Takedowns    : num  92 91 90 90 92 86 92 92 88 97 ...

Clustering

Kmeans

#Kmeans for the male fighters

# K-means with k = 2 on the seven male-fighter attributes, keeping the best
# of 20 random starts.
# kmeans() picks its starting centres at random, so the seed is fixed here:
# without it the cluster numbering (and hence the colours in the plots that
# the write-up refers to) can differ from one run to the next.
set.seed(123)
km.male_fighter <- kmeans(male_fighters, centers = 2, nstart = 20)

print(km.male_fighter)
## K-means clustering with 2 clusters of sizes 27, 13
## 
## Cluster means:
##   Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1    94.44444   92.51852    95.33333   93.22222      92.37037     92.18519
## 2    90.15385   88.92308    92.30769   91.76923      92.15385     91.61538
##   Takedowns
## 1  89.48148
## 2  95.61538
## 
## Clustering vector:
##  [1] 1 2 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 2 1 2
## [39] 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 995.1852 479.5385
##  (between_SS / total_SS =  32.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Cluster label assigned to each male fighter, in row order.
km.male_fighter[["cluster"]]
##  [1] 1 2 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 2 1 2
## [39] 2 1
#Kmeans for the female fighters

# K-means with k = 2 on the seven female-fighter attributes, keeping the best
# of 20 random starts.
# kmeans() picks its starting centres at random, so the seed is fixed here:
# without it the cluster numbering (and hence the colours in the plots that
# the write-up refers to) can differ from one run to the next.
set.seed(123)
km.female_fighter <- kmeans(female_fighters, centers = 2, nstart = 20)

print(km.female_fighter)
## K-means clustering with 2 clusters of sizes 9, 6
## 
## Cluster means:
##   Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1    93.44444   93.88889    93.77778   94.33333      93.00000     93.44444
## 2    90.33333   88.83333    93.83333   91.33333      90.83333     90.33333
##   Takedowns
## 1  91.88889
## 2  88.33333
## 
## Clustering vector:
##  [1] 1 1 1 1 2 1 2 2 2 1 2 1 1 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 295.7778 225.8333
##  (between_SS / total_SS =  33.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Cluster label assigned to each female fighter, in row order.
km.female_fighter[["cluster"]]
##  [1] 1 1 1 1 2 1 2 2 2 1 2 1 1 2 1

Visualizing & Interpreting results of kmeans

#

# Punch power vs. kick power for the male fighters, coloured by k-means
# cluster label. Only two of the seven attributes the clustering was fitted
# on are shown here.
plot(
  x = male_fighters$Punch.Power,
  y = male_fighters$Kick.Power,
  col = km.male_fighter$cluster,
  pch = 2,
  cex = 1.5,
  main = "Male K-means Clustering with 2 Clusters",
  xlab = "Punch Power",
  ylab = "Kick Power"
)

# Punch power vs. kick power for the female fighters, coloured by k-means
# cluster label. Only two of the seven attributes the clustering was fitted
# on are shown here.
plot(
  x = female_fighters$Punch.Power,
  y = female_fighters$Kick.Power,
  col = km.female_fighter$cluster,
  pch = 2,
  cex = 1.5,
  main = "Female K-means Clustering with 2 Clusters",
  xlab = "Punch Power",
  ylab = "Kick Power"
)

#From the results of the male K-means clustering we can see that there are two groups. The fighters in one group have a high punch power, ranging from 94 to 98, and they also appear to have a similar kick power. These can be regarded as the most dominant fighters based on the analysis of these two categories. The cluster in black shows a group of males who typically have weaker attributes, with punch power ranging from 86 to 94 and kick power from 88 to 92. This cluster plot gives us a picture of who the more dominant fighters are, but in a real-world scenario it takes more than punch and kick power to determine who the better fighters are.

#The results from the female clustering lead to similar conclusions, although with a smaller sample size.


# K-means with k = 2 on the seven attributes of all fighters combined,
# keeping the best of 20 random starts.
# kmeans() picks its starting centres at random, so the seed is fixed here:
# without it the cluster numbering (and hence the colours in the plots that
# the write-up refers to) can differ from one run to the next.
set.seed(123)
km.ufc_fighters <- kmeans(ufc_fighters, centers = 2, nstart = 20)

print(km.ufc_fighters)
## K-means clustering with 2 clusters of sizes 33, 22
## 
## Cluster means:
##   Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1    94.00000   92.96970    95.15152   93.81818      92.69697     92.84848
## 2    91.04545   89.27273    92.77273   91.40909      91.59091     90.86364
##   Takedowns
## 1  89.24242
## 2  94.13636
## 
## Clustering vector:
##  [1] 1 1 1 1 2 1 2 1 1 2 1 2 1 2 1 1 1 1 2 2 1 1 2 2 1 2 1 2 1 2 2 1 2 1 1 2 2 1
## [39] 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 1263.394  893.000
##  (between_SS / total_SS =  27.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Cluster label assigned to each fighter, in row order.
km.ufc_fighters[["cluster"]]
##  [1] 1 1 1 1 2 1 2 1 1 2 1 2 1 2 1 1 1 1 2 2 1 1 2 2 1 2 1 2 1 2 2 1 2 1 1 2 2 1
## [39] 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 2 1
# Punch power vs. kick power for all fighters, coloured by k-means cluster
# label. Only two of the seven attributes the clustering was fitted on are
# shown here.
plot(
  x = ufc_fighters$Punch.Power,
  y = ufc_fighters$Kick.Power,
  col = km.ufc_fighters$cluster,
  pch = 2,
  cex = 1.5,
  main = "K-means Clustering with 2 Clusters",
  xlab = "Punch Power",
  ylab = "Kick Power"
)

#Combining the two groups reveals slight changes in the punch power ranges of the dominant and weaker fighters, as the highest rating is now 95; meanwhile, the kick power remains 92 and below for the "weaker" fighters.

Selecting number of Clusters

#Using WSS Method

# Elbow method: total within-cluster sum of squares (WSS) for k = 1..15
# clusters on the combined fighter table.
#
# Each fit lives only inside the anonymous function, so the 2-cluster model
# stored in `km.ufc_fighters` is no longer overwritten by the k = 15 fit.
# The seed is fixed because kmeans() uses random starting centres.
set.seed(123)
max_k <- 15L

wss <- vapply(
  seq_len(max_k),
  function(k) {
    kmeans(ufc_fighters, centers = k, nstart = 20)$tot.withinss
  },
  numeric(1)
)

print(wss)
##  [1] 2987.6364 2156.3939 1804.9167 1506.8539 1323.4083 1164.2669 1057.3123
##  [8]  978.1817  890.7671  804.8679  744.4345  703.8826  637.7202  598.8095
## [15]  554.1167
# Elbow plot: WSS against the number of clusters (one point per entry of
# `wss`).
plot(
  x = seq_along(wss),
  y = wss,
  type = "b",
  pch = 1,
  col = "blue",
  main = "Elbow Method for Optimal Clusters",
  xlab = "Number of Clusters",
  ylab = "Within-Cluster Sum of Squares (WSS)"
)

# The line plot shows the WSS decreasing as the number of clusters increases. The elbow of the curve is positioned at 2 clusters, which supports the decision to use two clusters for the k-means algorithm above.

Hierarchical Clustering

# I will be using the ufc_fighters dataframe to compare the results between Hierarchical Clustering and kmeans Clustering.


# Pairwise distances between fighters, computed once; `d` is a second name
# for the same distance object and is used by the linkage comparison below.
dist_ufc_fighters <- dist(ufc_fighters)
d <- dist_ufc_fighters

# Hierarchical clustering with the default settings; printed, not stored.
print(hclust(d = dist_ufc_fighters))
## 
## Call:
## hclust(d = dist_ufc_fighters)
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 55
# Hclust model: hierarchical clustering with hclust()'s default linkage on
# the distances between the unscaled fighters. The fitted tree is reused
# below for the dendrogram and for cutree().

hclust_ufc_fighters <- hclust(dist(ufc_fighters))

# summary() of the fit only lists the components stored in the object.
summary(hclust_ufc_fighters)
##             Length Class  Mode     
## merge       108    -none- numeric  
## height       54    -none- numeric  
## order        55    -none- numeric  
## labels        0    -none- NULL     
## method        1    -none- character
## call          2    -none- call     
## dist.method   1    -none- character
# Dendrogram of the fitted tree, with a red horizontal guide at height 15
# marking where the tree is cut.
plot(x = hclust_ufc_fighters)
abline(col = "red", h = 15)

# I chose a height of 15 in the dendrogram, which gives 4 clusters. The next step is to "cut" the tree. Below I demonstrate two ways of doing this: one using the height and one using the number of clusters k.

# Cut the tree at height 15 and print each fighter's cluster label.
print(cutree(tree = hclust_ufc_fighters, h = 15))
##  [1] 1 1 1 1 2 3 2 1 3 2 1 4 3 1 3 1 4 4 2 1 4 4 1 1 3 2 4 2 4 1 2 3 2 3 3 2 2 3
## [39] 1 4 1 3 2 4 1 4 1 1 4 3 1 1 2 2 1
# Cut the same tree by requesting exactly four clusters.
print(cutree(tree = hclust_ufc_fighters, k = 4))
##  [1] 1 1 1 1 2 3 2 1 3 2 1 4 3 1 3 1 4 4 2 1 4 4 1 1 3 2 4 2 4 1 2 3 2 3 3 2 2 3
## [39] 1 4 1 3 2 4 1 4 1 1 4 3 1 1 2 2 1

Plotting Dendrogram of Hclust

# Fit and draw one dendrogram per linkage method, all from the same
# distance object `d`.

# Complete linkage, with a red guide line at height 15.
hclust.complete <- hclust(d, method = "complete")
plot(hclust.complete, main = "complete")
abline(h = 15, col = "red")

# Average linkage.
hclust.average <- hclust(d, method = "average")
plot(hclust.average, main = "average")

# Single linkage.
hclust.single <- hclust(d, method = "single")
plot(hclust.single, main = "single")

# The dendrograms suggest that the complete method is the best option, as it gives 4 clusters.

# Now I want to find out if scaling is necessary for this data

# Mean of each attribute on the original (unscaled) ratings.
print(colMeans(x = ufc_fighters))
##   Punch.Power    Kick.Power   Punch.Speed    Kick.Speed Body.Strength 
##      92.81818      91.49091      94.20000      92.85455      92.25455 
##  Leg.Strength     Takedowns 
##      92.05455      91.20000
# Observation: the variables have similar means, ranging from 91.2 to 94.2.

# Standard deviation of each attribute (MARGIN = 2 applies sd() per column).
print(apply(X = ufc_fighters, MARGIN = 2, FUN = sd))
##   Punch.Power    Kick.Power   Punch.Speed    Kick.Speed Body.Strength 
##      3.085755      2.980633      2.280351      2.504945      1.945728 
##  Leg.Strength     Takedowns 
##      2.475197      3.941235
# Observation: most of the standard deviations (sd) are between 2 and 3; however, "Takedowns" has a higher sd of 3.94. Based on these results, scaling does not look strictly necessary. However, because k-means and hierarchical clustering with Euclidean distance are highly sensitive to differences in variable scale, I have decided to go ahead with scaling.

Scaling

# Standardise every attribute with scale()'s defaults, then print the
# column means of the result as a check that they are approximately zero.
scaled_ufc_fighters <- scale(x = ufc_fighters)

print(colMeans(x = scaled_ufc_fighters))
##   Punch.Power    Kick.Power   Punch.Speed    Kick.Speed Body.Strength 
##  1.672904e-15  1.997392e-15 -1.241179e-15 -1.764245e-15  2.128600e-15 
##  Leg.Strength     Takedowns 
##  2.843685e-15 -7.145799e-16
# Standard deviation of each scaled attribute; each should equal 1.
print(apply(X = scaled_ufc_fighters, MARGIN = 2, FUN = sd))
##   Punch.Power    Kick.Power   Punch.Speed    Kick.Speed Body.Strength 
##             1             1             1             1             1 
##  Leg.Strength     Takedowns 
##             1             1
# The results I have from the mean and sd are satisfactory as the mean is approx = 0 and the sd is = 1

# Complete-linkage tree on the distances between the scaled fighters.
hclust.fighters <- hclust(dist(scaled_ufc_fighters), method = "complete")

# Four-cluster membership from the scaled tree; the dendrogram drawn next
# suggests that 4 clusters is best. These labels are what k-means is
# compared against further down.
cut.fighters <- cutree(tree = hclust.fighters, k = 4)

# Dendrogram of the scaled tree, for comparing kmeans() with hclust().
plot(
  x = hclust.fighters,
  cex = 0.8,
  sub = "",
  xlab = "Fighters",
  main = "Hierarchical Clustering Dendrogram (Scaled)"
)


# K-means with four centres on the scaled attributes, keeping the best of
# 20 random starts. The outer parentheses print the fit as it is assigned.
(km_scaled_fighters <- kmeans(
  x = scaled_ufc_fighters,
  centers = 4,
  nstart = 20
))
## K-means clustering with 4 clusters of sizes 14, 10, 14, 17
## 
## Cluster means:
##   Punch.Power Kick.Power Punch.Speed  Kick.Speed Body.Strength Leg.Strength
## 1  -0.1494088 -0.5481272   0.9146462 -0.14153824    0.08944002   -0.1663255
## 2  -1.0105085 -1.2047470  -1.4909986 -1.01980118   -0.33640126   -0.6280492
## 3   0.4061394  1.1054044   0.3821467  0.88500285    0.97049094    1.0745571
## 4   0.3829915  0.2497406  -0.1908891 -0.01238193   -0.67500123   -0.3785147
##    Takedowns
## 1 -0.8119282
## 2  0.9641648
## 3 -0.1413625
## 4  0.2179072
## 
## Clustering vector:
##  [1] 3 4 3 3 2 1 4 4 4 2 3 1 1 4 3 3 1 1 2 4 4 1 1 4 3 2 1 2 1 4 2 3 2 1 4 2 4 3
## [39] 4 1 3 3 1 1 4 1 3 4 4 4 3 3 2 2 4
## 
## Within cluster sum of squares by cluster:
## [1] 61.02464 40.83950 52.87905 48.06717
##  (between_SS / total_SS =  46.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Hierarchical result on the scaled data: the first two columns of the
# scaled matrix, with each point coloured by its cluster from the
# four-cluster cut of the scaled tree. Previously every point was drawn in
# one fixed colour, so the plot showed no clusters despite its title.
plot(scaled_ufc_fighters, main = "Hierarchical Clustering (Scaled)",
     col = cut.fighters)

# K-means result: punch power against kick power on the original rating
# scale, coloured by the four-cluster k-means fit on the scaled data.
plot(ufc_fighters[, 1], ufc_fighters[, 2],
     col = km_scaled_fighters$cluster,
     main = "K-means Clustering with 4 Clusters",
     xlab = "Punch Power",
     ylab = "Kick Power",
     pch = 2, cex = 1.5)

# Cross-tabulate the two four-cluster solutions fitted on the scaled data.
# The table used to read `km.ufc_fighters`, which is not the 4-cluster
# scaled fit (the previously rendered table has 15 rows).
# NOTE: re-knit the document so the printed table reflects this comparison.
table(KMeans = km_scaled_fighters$cluster, Hierarchical = cut.fighters)
##     cut.fighters
##      1 2 3 4
##   1  0 0 2 0
##   2  0 3 0 0
##   3  2 0 0 1
##   4  0 0 0 4
##   5  0 0 0 5
##   6  3 0 0 1
##   7  0 0 2 0
##   8  1 1 2 0
##   9  3 0 0 1
##   10 4 0 0 0
##   11 6 0 0 0
##   12 6 0 0 0
##   13 0 0 0 2
##   14 0 0 4 0
##   15 2 0 0 0
# Comparing k-means against hierarchical clustering, I have concluded that 4 clusters are too many. The counts in the table are spread across rows and columns, which shows that k-means and hierarchical clustering disagree. This is acceptable, as the two methods use different algorithms to group the data.

Dimensionality Reduction

library(cluster)
library(ggplot2)

# Principal component analysis of the seven attributes; scale. = TRUE
# rescales each column to unit variance before the decomposition.
ufc_pca <- prcomp(x = ufc_fighters, scale. = TRUE)
print(ufc_pca)
## Standard deviations (1, .., p=7):
## [1] 1.6702150 1.1663917 0.9726150 0.9142879 0.7415688 0.6238773 0.3589741
## 
## Rotation (n x k) = (7 x 7):
##                      PC1          PC2         PC3         PC4          PC5
## Punch.Power   -0.3331537 -0.418299422  0.59688765  0.22916694 -0.095353315
## Kick.Power    -0.5019811  0.002175869 -0.09286234  0.51112131  0.002467875
## Punch.Speed   -0.3969549 -0.243993459  0.14051578 -0.56626098 -0.487148931
## Kick.Speed    -0.4262803  0.122667102 -0.59339940  0.21964500 -0.315451490
## Body.Strength -0.2506796  0.625738705  0.08625097 -0.40870857 -0.084768693
## Leg.Strength  -0.3659803  0.431628699  0.36638819  0.06101042  0.471733938
## Takedowns      0.3177322  0.415437808  0.34864690  0.38285393 -0.651428655
##                      PC6         PC7
## Punch.Power    0.3293232  0.43323880
## Kick.Power     0.1848500 -0.66630793
## Punch.Speed   -0.3706840 -0.26034040
## Kick.Speed    -0.1436089  0.53173110
## Body.Strength  0.6024185  0.03217230
## Leg.Strength  -0.5518059  0.12153133
## Takedowns     -0.1787421 -0.04492079
#Visualizing the first two Principal Components

# Scores on the first two principal components, paired with each fighter's
# k-means label stored as a factor so it is treated as a category.
# NOTE(review): the data printed below carries cluster labels up to 15, so
# `km.ufc_fighters` is not the 2-cluster fit at this point; confirm which
# model is intended.
pca_data <- data.frame(
  PC1 = ufc_pca[["x"]][, 1],
  PC2 = ufc_pca[["x"]][, 2],
  Cluster = factor(km.ufc_fighters[["cluster"]])
)

print(pca_data)
##            PC1         PC2 Cluster
## 1  -2.21501858  0.64377680      10
## 2  -0.02779265 -0.39712345       9
## 3  -1.91796178  2.71517842       2
## 4  -1.91796178  2.71517842       2
## 5   4.25534474 -0.22672377       7
## 6  -1.79143081 -1.66089105       6
## 7   0.69987144  0.70481871       9
## 8  -0.20155370  0.32967386       9
## 9  -1.04542381 -0.70640676      11
## 10  2.11492040  1.21306984       8
## 11 -1.87837860  1.23265939      10
## 12  1.14496036 -0.80380435      13
## 13 -1.81662179 -0.66321727       6
## 14  0.67355663 -0.72366858       5
## 15 -2.50281898 -1.11860466      11
## 16 -0.80942528  0.66473671       3
## 17 -0.14949130 -0.60811472      13
## 18  0.52001537 -1.25453866       5
## 19  2.51128000  0.08204584       7
## 20  0.91603408 -0.45626705      12
## 21 -0.12343222  0.34601399       5
## 22 -0.27158017 -0.14344560       9
## 23 -0.07674592  0.73511028       3
## 24  0.31113299 -0.48010590      12
## 25 -1.93732603 -0.31477938      11
## 26  2.64154377  0.98400127       1
## 27 -0.02289619 -0.24412013       4
## 28  2.36108743 -0.39447053      14
## 29 -0.13781742  0.96998001       4
## 30  0.40488548 -0.83283954      12
## 31  2.34582563  1.33276440       1
## 32 -2.87248926 -0.97828765      11
## 33  2.33373981 -0.42462070      14
## 34 -0.53813324 -2.07906517       6
## 35 -0.48782648 -2.26709896      15
## 36  3.71268891  0.50814388      14
## 37  1.26912684  0.20030598       8
## 38 -2.76069531  0.21507893      11
## 39 -0.48287778 -0.26602185      12
## 40 -0.07942965 -1.07075389       5
## 41 -2.01118645  0.83552610      10
## 42 -2.90590975  0.39126162      11
## 43  0.81800797  1.16915310       4
## 44  0.72503311 -0.65110901       4
## 45  0.94766985 -1.80279477      12
## 46 -1.08659412 -1.14815750       6
## 47 -1.43378358  0.84445965      10
## 48 -0.48091801  0.20803610       3
## 49  1.41973349 -1.40291031       5
## 50 -0.17390605 -1.63543457      15
## 51 -0.14384935  2.51451681       8
## 52 -1.30184162  1.68199903       2
## 53  1.85982236  0.57826108      14
## 54  1.74898575  2.12491608       8
## 55 -0.13214874 -1.18529054      12
# Share of the total variance carried by each principal component: the
# squared standard deviations, normalised to sum to 1.
variance <- prop.table(ufc_pca[["sdev"]]^2)

# Scree plot drawn as a bar chart of the seven components.
barplot(
  height = variance[seq_len(7)],
  names.arg = paste("PC", seq_len(7)),
  col = "blue",
  xlab = "Principal Components",
  ylab = "Proportion of Variance",
  main = "PCA Scree Plot"
)

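# PC1 and PC2 together explain about 59% of the variance (1.67^2 + 1.17^2 out of a total of 7), so a two-dimensional plot captures most, but not all, of the structure.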


# Compare cluster membership between k-means and hierarchical clustering.
# NOTE(review): the hierarchical labels come from the tree fitted on the
# unscaled data, and the table printed below has 15 k-means columns;
# confirm these are the intended models.
k_clusters <- km.ufc_fighters[["cluster"]]

h_clusters <- cutree(tree = hclust_ufc_fighters, k = 4)

print(table(Hierarchical = h_clusters, KMeans = k_clusters))
##             KMeans
## Hierarchical 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##            1 0 3 3 0 1 0 0 1 2  4  0  6  0  0  0
##            2 2 0 0 1 0 0 2 3 1  0  0  0  0  4  0
##            3 0 0 0 0 0 3 0 0 0  0  6  0  0  0  2
##            4 0 0 0 3 4 1 0 0 1  0  0  0  2  0  0
# Mean of every attribute within each hierarchical cluster.

aggregate(
  x = ufc_fighters,
  by = list(Hierarchical = h_clusters),
  FUN = mean
)
##   Hierarchical Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength
## 1            1    93.15000   93.00000    94.60000   94.05000      92.70000
## 2            2    89.84615   88.30769    91.53846   90.76923      91.84615
## 3            3    96.18182   94.45455    95.18182   94.27273      91.90909
## 4            4    92.36364   89.54545    95.63636   91.72727      92.27273
##   Leg.Strength Takedowns
## 1     92.85000  93.05000
## 2     90.76923  94.76923
## 3     92.72727  86.45455
## 4     91.45455  88.36364
# Mean of every attribute within each k-means cluster.
aggregate(
  x = ufc_fighters,
  by = list(KMeans = k_clusters),
  FUN = mean
)
##    KMeans Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength
## 1       1    86.00000   88.00000    89.00000   95.00000      92.00000
## 2       2    90.33333   95.66667    92.66667   95.66667      95.66667
## 3       3    90.66667   92.00000    96.00000   95.33333      91.33333
## 4       4    92.25000   88.25000    95.50000   89.75000      93.75000
## 5       5    94.20000   89.80000    95.00000   91.20000      91.40000
## 6       6    95.00000   92.00000    96.75000   94.00000      92.00000
## 7       7    88.00000   86.50000    90.50000   88.50000      91.00000
## 8       8    91.25000   89.75000    92.50000   92.00000      93.75000
## 9       9    90.75000   92.25000    94.50000   94.25000      92.75000
## 10     10    95.50000   93.75000    96.75000   94.25000      95.00000
## 11     11    95.83333   95.66667    95.16667   95.00000      92.33333
## 12     12    94.50000   92.33333    93.83333   92.83333      90.16667
## 13     13    88.50000   88.50000    96.00000   93.50000      92.00000
## 14     14    92.50000   88.25000    91.50000   88.50000      90.50000
## 15     15    97.50000   94.50000    92.50000   93.00000      90.00000
##    Leg.Strength Takedowns
## 1      89.00000  93.00000
## 2      96.00000  90.66667
## 3      93.66667  94.33333
## 4      93.50000  89.25000
## 5      91.20000  90.80000
## 6      91.25000  85.00000
## 7      89.00000  91.50000
## 8      92.00000  97.75000
## 9      90.00000  91.25000
## 10     94.25000  93.00000
## 11     94.66667  86.50000
## 12     91.00000  93.16667
## 13     89.50000  85.50000
## 14     91.50000  97.00000
## 15     89.50000  88.00000

Scatter Plot

# Punch power against kick power, coloured by the k-means labels.
plot(
  x = ufc_fighters[, 1],
  y = ufc_fighters[, 2],
  col = k_clusters,
  xlab = "Punch Power",
  ylab = "Kick Power",
  main = "K-Means Clustering"
)

# The same two attributes, coloured by the hierarchical labels.
plot(
  x = ufc_fighters[, 1],
  y = ufc_fighters[, 2],
  col = h_clusters,
  xlab = "Punch Power",
  ylab = "Kick Power",
  main = "Hierarchical Clustering"
)

# Fighters in the plane of the first two principal components, one point
# per fighter, coloured by the cluster label stored in `pca_data`.
ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, colour = Cluster)) +
  geom_point(alpha = 0.8, size = 1) +
  scale_color_hue() +
  labs(
    title = "PCA Scatter Plot with Clusters",
    x = "Principal Component 1",
    y = "Principal Component 2",
    color = "Cluster"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

Silhouette Scores

# Set knitr chunk options: echo off, messages and warnings suppressed.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
# K-means: silhouette of each fighter under the k-means labels, using the
# distances between the unscaled attribute ratings.

library(cluster)
silhouette_kmeans <- silhouette(k_clusters, dist(ufc_fighters))
plot(silhouette_kmeans)

# Hierarchical clustering: same distances, labels from the cut tree.
silhouette_hclust <- silhouette(h_clusters, dist(ufc_fighters))
plot(silhouette_hclust)

# Silhouette values range from -1 to 1.
# When a value is close to 1, the data point is well matched to its cluster. When it is close to 0, the point lies on the boundary between clusters, and when it is negative the point is probably assigned to the wrong cluster.

# The k-means clustering has a higher average silhouette width, which tells us that the fighters are better grouped by their skill attributes than in the hierarchical clustering, which has a width of 0.19.

# The silhouette scores for the hierarchical clustering are uniformly low, which indicates poorly defined clusters compared with k-means.