Introduction
# The data used for this project was optained from the EA Sports video game UFC 2025 https://www.ea.com/games/ufc/ufc-5. I have chosen to work with the top 5 rated fighters in each weight division. The data includes both men and women fighters. The women have 3 weight classes and the men have 8 weight classes. For the purpose of this project I have chosen to use the following character attributes to work with "punch power", "kick power", "punch speed","kick speed","body strenth", "leg strenght" and "takedowns".
library(dplyr)
library(cluster)
library(factoextra)
library(mclust)
library(cluster)
library(ggplot2)
library(readxl)
ufc <- read_excel("C:/Users/mzing/Desktop/USL/UFC 2025/UFC.xlsx")
str(ufc)
## tibble [55 × 13] (S3: tbl_df/tbl/data.frame)
## $ Fighter Name : chr [1:55] "Amanda Nunes" "Valentina Shevchenko" "Holly Holm" "Julianna Pena" ...
## $ Gender : chr [1:55] "F" "F" "F" "F" ...
## $ Weight Class : chr [1:55] "Bantamweight" "Bantamweight" "Bantamweight" "Bantamweight" ...
## $ Ranking : num [1:55] 1 2 3 4 5 1 2 3 4 5 ...
## $ Nationality : chr [1:55] "Brazil" "Kyrgyzstani" "USA" "USA" ...
## $ Stars : num [1:55] 5 4.5 4.5 4.5 4 4.5 4.5 4.5 4.5 4.5 ...
## $ Punch Power : num [1:55] 98 92 90 90 87 95 88 92 95 90 ...
## $ Kick Power : num [1:55] 94 93 96 96 85 93 91 93 95 88 ...
## $ Punch Speed : num [1:55] 96 94 92 92 90 98 93 94 93 92 ...
## $ Kick Speed : num [1:55] 94 95 96 96 87 97 95 94 93 92 ...
## $ Body Strength: num [1:55] 95 92 96 96 90 92 92 93 90 94 ...
## $ Leg Strength : num [1:55] 95 89 97 97 89 89 91 91 95 89 ...
## $ Takedowns : num [1:55] 92 91 90 90 92 86 92 92 88 97 ...
head(ufc)
## # A tibble: 6 × 13
## `Fighter Name` Gender `Weight Class` Ranking Nationality Stars `Punch Power`
## <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 Amanda Nunes F Bantamweight 1 Brazil 5 98
## 2 Valentina Shevc… F Bantamweight 2 Kyrgyzstani 4.5 92
## 3 Holly Holm F Bantamweight 3 USA 4.5 90
## 4 Julianna Pena F Bantamweight 4 USA 4.5 90
## 5 Meisha Tate F Bantamweight 5 USA 4 87
## 6 Sean O'malley M Bantamweight 1 USA 4.5 95
## # ℹ 6 more variables: `Kick Power` <dbl>, `Punch Speed` <dbl>,
## # `Kick Speed` <dbl>, `Body Strength` <dbl>, `Leg Strength` <dbl>,
## # Takedowns <dbl>
Data Preparation
# Separating the male from the female fighters
ufc_men <- data.frame(ufc[ufc$Gender == 'M',])
ufc_men
## Fighter.Name Gender Weight.Class Ranking Nationality Stars
## 1 Sean O'malley M Bantamweight 1 USA 4.5
## 2 Aljamain Sterling M Bantamweight 2 USA 4.5
## 3 Cory Sandhagen M Bantamweight 3 USA 4.5
## 4 Marlon Vera M Bantamweight 4 Ecuador 4.5
## 5 Merab Dvalishvili M Bantamweight 5 Georgia 4.5
## 6 Alexander Volkanovski M Featherweight 1 Australia 5.0
## 7 Max Holloway M Featherweight 2 USA 4.5
## 8 Conor McGregor M Featherweight 3 Ireland 4.5
## 9 Ilia Torupia M Featherweight 4 Georgia 4.5
## 10 Jose Aldo M Featherweight 5 Brazil 4.5
## 11 Alexandre Pantoja M Flyweight 1 Brazil 4.5
## 12 Brandon Moreno M Flyweight 2 Mexico 4.5
## 13 Demetrius Johnson M Flyweight 3 USA 4.5
## 14 Henry Cejudo M Flyweight 4 USA 4.5
## 15 Deiveson Figueredo M Flyweight 5 Brazil 4.5
## 16 Jon Jones M Heavyweight 1 USA 5.0
## 17 Fedor Emelianenko M Heavyweight 2 Russia 5.0
## 18 Daniel Cormier M Heavyweight 3 USA 4.5
## 19 Stipe Miocic M Heavyweight 4 USA 4.5
## 20 Tom Aspinall M Heavyweight 5 England 4.5
## 21 Jon Jones M Light Heavyweight 1 USA 5.0
## 22 Alex Pereira M Light Heavyweight 2 Brazil 4.5
## 23 Daniel Cormier M Light Heavyweight 3 USA 4.5
## 24 Jiri Prochazka M Light Heavyweight 4 Czech Republic 4.5
## 25 Jamahal Hill M Light Heavyweight 5 USA 4.5
## 26 Khabib Nurmagomedov M Lightweight 1 Russia 5.0
## 27 Islam Makhachev M Lightweight 2 Russia 5.0
## 28 Justin Gaethje M Lightweight 3 USA 4.5
## 29 Charles Oliveira M Lightweight 4 Brazil 4.5
## 30 Dustin Poirier M Lightweight 5 USA 4.5
## 31 Anderson Silva M Middleweight 1 Brazil 5.0
## 32 Israel Adesanya M Middleweight 2 Nigeria 5.0
## 33 Sean Strickland M Middleweight 3 USA 4.5
## 34 Michael Bisping M Middleweight 4 England 4.5
## 35 Dricus Du Plessis M Middleweight 5 South Africa 4.5
## 36 Georges St Pierre M Welterweight 1 Canada 5.0
## 37 Leon Edwards M Welterweight 2 England 4.5
## 38 Kamura Usman M Welterweight 3 USA 4.5
## 39 Colby Covington M Welterweight 4 USA 4.5
## 40 Shavkat Rakhmonov M Welterweight 5 Kazakhstan 4.5
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 95 93 98 97 92 89
## 2 88 91 93 95 92 91
## 3 92 93 94 94 93 91
## 4 95 95 93 93 90 95
## 5 90 88 92 92 94 89
## 6 94 93 97 95 95 95
## 7 89 88 96 92 93 87
## 8 97 91 98 92 94 94
## 9 95 90 95 91 92 90
## 10 95 97 97 96 91 93
## 11 95 90 94 92 93 94
## 12 91 92 97 93 94 89
## 13 88 89 98 96 91 94
## 14 95 93 94 93 92 89
## 15 98 93 95 94 93 95
## 16 86 88 89 95 92 88
## 17 95 87 97 90 94 93
## 18 92 89 92 89 89 92
## 19 93 90 95 89 95 95
## 20 94 91 95 92 90 92
## 21 86 88 89 95 92 90
## 22 97 96 96 95 92 95
## 23 93 89 92 89 89 92
## 24 96 92 95 92 91 90
## 25 98 94 94 93 90 89
## 26 91 87 90 87 92 89
## 27 94 90 94 92 92 90
## 28 95 98 94 95 93 96
## 29 95 93 94 94 91 93
## 30 95 90 96 92 91 92
## 31 94 96 98 94 95 93
## 32 95 95 96 97 95 94
## 33 89 88 95 90 94 94
## 34 92 88 95 90 92 92
## 35 95 91 93 93 88 90
## 36 92 92 93 94 95 95
## 37 91 95 94 95 95 94
## 38 94 88 92 89 92 93
## 39 89 89 91 90 94 94
## 40 94 94 94 94 90 90
## Takedowns
## 1 86
## 2 92
## 3 92
## 4 88
## 5 97
## 6 93
## 7 87
## 8 86
## 9 93
## 10 87
## 11 92
## 12 90
## 13 95
## 14 95
## 15 88
## 16 93
## 17 89
## 18 96
## 19 90
## 20 93
## 21 93
## 22 84
## 23 97
## 24 84
## 25 88
## 26 98
## 27 99
## 28 86
## 29 93
## 30 90
## 31 94
## 32 86
## 33 91
## 34 87
## 35 92
## 36 99
## 37 92
## 38 97
## 39 96
## 40 91
male_fighters <- ufc_men[, c("Punch.Power", "Kick.Power", "Punch.Speed", "Kick.Speed",
"Body.Strength", "Leg.Strength", "Takedowns")]
male_fighters
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 95 93 98 97 92 89
## 2 88 91 93 95 92 91
## 3 92 93 94 94 93 91
## 4 95 95 93 93 90 95
## 5 90 88 92 92 94 89
## 6 94 93 97 95 95 95
## 7 89 88 96 92 93 87
## 8 97 91 98 92 94 94
## 9 95 90 95 91 92 90
## 10 95 97 97 96 91 93
## 11 95 90 94 92 93 94
## 12 91 92 97 93 94 89
## 13 88 89 98 96 91 94
## 14 95 93 94 93 92 89
## 15 98 93 95 94 93 95
## 16 86 88 89 95 92 88
## 17 95 87 97 90 94 93
## 18 92 89 92 89 89 92
## 19 93 90 95 89 95 95
## 20 94 91 95 92 90 92
## 21 86 88 89 95 92 90
## 22 97 96 96 95 92 95
## 23 93 89 92 89 89 92
## 24 96 92 95 92 91 90
## 25 98 94 94 93 90 89
## 26 91 87 90 87 92 89
## 27 94 90 94 92 92 90
## 28 95 98 94 95 93 96
## 29 95 93 94 94 91 93
## 30 95 90 96 92 91 92
## 31 94 96 98 94 95 93
## 32 95 95 96 97 95 94
## 33 89 88 95 90 94 94
## 34 92 88 95 90 92 92
## 35 95 91 93 93 88 90
## 36 92 92 93 94 95 95
## 37 91 95 94 95 95 94
## 38 94 88 92 89 92 93
## 39 89 89 91 90 94 94
## 40 94 94 94 94 90 90
## Takedowns
## 1 86
## 2 92
## 3 92
## 4 88
## 5 97
## 6 93
## 7 87
## 8 86
## 9 93
## 10 87
## 11 92
## 12 90
## 13 95
## 14 95
## 15 88
## 16 93
## 17 89
## 18 96
## 19 90
## 20 93
## 21 93
## 22 84
## 23 97
## 24 84
## 25 88
## 26 98
## 27 99
## 28 86
## 29 93
## 30 90
## 31 94
## 32 86
## 33 91
## 34 87
## 35 92
## 36 99
## 37 92
## 38 97
## 39 96
## 40 91
# Female Fighters
ufc_women <- data.frame(ufc[ufc$Gender == 'F',])
ufc_women
## Fighter.Name Gender Weight.Class Ranking Nationality Stars
## 1 Amanda Nunes F Bantamweight 1 Brazil 5.0
## 2 Valentina Shevchenko F Bantamweight 2 Kyrgyzstani 4.5
## 3 Holly Holm F Bantamweight 3 USA 4.5
## 4 Julianna Pena F Bantamweight 4 USA 4.5
## 5 Meisha Tate F Bantamweight 5 USA 4.0
## 6 Valentina Shevchenko F Flyweight 1 Kyrgyzstani 5.0
## 7 Joanna Jedrzejczyk F Flyweight 2 Poland 4.5
## 8 Alexa Grasso F Flyweight 3 Mexico 4.5
## 9 Erin Blanchfield F Flyweight 4 USA 4.5
## 10 Taila Santos F Flyweight 5 Brazil 4.5
## 11 Joanna Jedrzejczyk F Strawweight 1 Poland 5.0
## 12 Zhang Weili F Strawweight 2 China 4.5
## 13 Rose Namajunas F Strawweight 3 USA 4.5
## 14 Mackenzie Dern F Strawweight 4 USA 4.5
## 15 Amanda Lemos F Strawweight 5 Brazil 4.5
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 98 94 96 94 95 95
## 2 92 93 94 95 92 89
## 3 90 96 92 96 96 97
## 4 90 96 92 96 96 97
## 5 87 85 90 87 90 89
## 6 92 94 95 96 92 93
## 7 88 89 96 95 91 92
## 8 93 89 96 92 91 90
## 9 89 88 91 90 92 89
## 10 94 92 93 91 90 92
## 11 92 92 96 95 91 92
## 12 96 92 96 94 95 94
## 13 92 93 95 94 91 94
## 14 93 90 94 89 90 90
## 15 97 95 91 93 90 90
## Takedowns
## 1 92
## 2 91
## 3 90
## 4 90
## 5 92
## 6 95
## 7 84
## 8 89
## 9 91
## 10 95
## 11 84
## 12 93
## 13 93
## 14 90
## 15 88
female_fighters <- ufc_women[, c("Punch.Power", "Kick.Power", "Punch.Speed", "Kick.Speed",
"Body.Strength", "Leg.Strength", "Takedowns")]
female_fighters
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 98 94 96 94 95 95
## 2 92 93 94 95 92 89
## 3 90 96 92 96 96 97
## 4 90 96 92 96 96 97
## 5 87 85 90 87 90 89
## 6 92 94 95 96 92 93
## 7 88 89 96 95 91 92
## 8 93 89 96 92 91 90
## 9 89 88 91 90 92 89
## 10 94 92 93 91 90 92
## 11 92 92 96 95 91 92
## 12 96 92 96 94 95 94
## 13 92 93 95 94 91 94
## 14 93 90 94 89 90 90
## 15 97 95 91 93 90 90
## Takedowns
## 1 92
## 2 91
## 3 90
## 4 90
## 5 92
## 6 95
## 7 84
## 8 89
## 9 91
## 10 95
## 11 84
## 12 93
## 13 93
## 14 90
## 15 88
# All UFC fighters combined
ufc_fighters <- data.frame(lapply(ufc[, c("Punch Power", "Kick Power", "Punch Speed",
"Kick Speed", "Body Strength",
"Leg Strength", "Takedowns")], as.numeric))
print(ufc_fighters)
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 98 94 96 94 95 95
## 2 92 93 94 95 92 89
## 3 90 96 92 96 96 97
## 4 90 96 92 96 96 97
## 5 87 85 90 87 90 89
## 6 95 93 98 97 92 89
## 7 88 91 93 95 92 91
## 8 92 93 94 94 93 91
## 9 95 95 93 93 90 95
## 10 90 88 92 92 94 89
## 11 94 93 97 95 95 95
## 12 89 88 96 92 93 87
## 13 97 91 98 92 94 94
## 14 95 90 95 91 92 90
## 15 95 97 97 96 91 93
## 16 92 94 95 96 92 93
## 17 88 89 96 95 91 92
## 18 93 89 96 92 91 90
## 19 89 88 91 90 92 89
## 20 94 92 93 91 90 92
## 21 95 90 94 92 93 94
## 22 91 92 97 93 94 89
## 23 88 89 98 96 91 94
## 24 95 93 94 93 92 89
## 25 98 93 95 94 93 95
## 26 86 88 89 95 92 88
## 27 95 87 97 90 94 93
## 28 92 89 92 89 89 92
## 29 93 90 95 89 95 95
## 30 94 91 95 92 90 92
## 31 86 88 89 95 92 90
## 32 97 96 96 95 92 95
## 33 93 89 92 89 89 92
## 34 96 92 95 92 91 90
## 35 98 94 94 93 90 89
## 36 91 87 90 87 92 89
## 37 94 90 94 92 92 90
## 38 95 98 94 95 93 96
## 39 95 93 94 94 91 93
## 40 95 90 96 92 91 92
## 41 94 96 98 94 95 93
## 42 95 95 96 97 95 94
## 43 89 88 95 90 94 94
## 44 92 88 95 90 92 92
## 45 95 91 93 93 88 90
## 46 92 92 96 95 91 92
## 47 96 92 96 94 95 94
## 48 92 93 95 94 91 94
## 49 93 90 94 89 90 90
## 50 97 95 91 93 90 90
## 51 92 92 93 94 95 95
## 52 91 95 94 95 95 94
## 53 94 88 92 89 92 93
## 54 89 89 91 90 94 94
## 55 94 94 94 94 90 90
## Takedowns
## 1 92
## 2 91
## 3 90
## 4 90
## 5 92
## 6 86
## 7 92
## 8 92
## 9 88
## 10 97
## 11 93
## 12 87
## 13 86
## 14 93
## 15 87
## 16 95
## 17 84
## 18 89
## 19 91
## 20 95
## 21 92
## 22 90
## 23 95
## 24 95
## 25 88
## 26 93
## 27 89
## 28 96
## 29 90
## 30 93
## 31 93
## 32 84
## 33 97
## 34 84
## 35 88
## 36 98
## 37 99
## 38 86
## 39 93
## 40 90
## 41 94
## 42 86
## 43 91
## 44 87
## 45 92
## 46 84
## 47 93
## 48 93
## 49 90
## 50 88
## 51 99
## 52 92
## 53 97
## 54 96
## 55 91
str(ufc_fighters)
## 'data.frame': 55 obs. of 7 variables:
## $ Punch.Power : num 98 92 90 90 87 95 88 92 95 90 ...
## $ Kick.Power : num 94 93 96 96 85 93 91 93 95 88 ...
## $ Punch.Speed : num 96 94 92 92 90 98 93 94 93 92 ...
## $ Kick.Speed : num 94 95 96 96 87 97 95 94 93 92 ...
## $ Body.Strength: num 95 92 96 96 90 92 92 93 90 94 ...
## $ Leg.Strength : num 95 89 97 97 89 89 91 91 95 89 ...
## $ Takedowns : num 92 91 90 90 92 86 92 92 88 97 ...
Clustering
Kmeans
#Kmeans for the male fighters
km.male_fighter <- kmeans(male_fighters, centers = 2, nstart = 20)
print(km.male_fighter)
## K-means clustering with 2 clusters of sizes 27, 13
##
## Cluster means:
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 94.44444 92.51852 95.33333 93.22222 92.37037 92.18519
## 2 90.15385 88.92308 92.30769 91.76923 92.15385 91.61538
## Takedowns
## 1 89.48148
## 2 95.61538
##
## Clustering vector:
## [1] 1 2 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 2 1 2
## [39] 2 1
##
## Within cluster sum of squares by cluster:
## [1] 995.1852 479.5385
## (between_SS / total_SS = 32.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
km.male_fighter$cluster
## [1] 1 2 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 2 1 2
## [39] 2 1
#Kmeans for the female fighters
km.female_fighter <- kmeans(female_fighters, centers = 2, nstart = 20)
print(km.female_fighter)
## K-means clustering with 2 clusters of sizes 9, 6
##
## Cluster means:
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 93.44444 93.88889 93.77778 94.33333 93.00000 93.44444
## 2 90.33333 88.83333 93.83333 91.33333 90.83333 90.33333
## Takedowns
## 1 91.88889
## 2 88.33333
##
## Clustering vector:
## [1] 1 1 1 1 2 1 2 2 2 1 2 1 1 2 1
##
## Within cluster sum of squares by cluster:
## [1] 295.7778 225.8333
## (between_SS / total_SS = 33.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
km.female_fighter$cluster
## [1] 1 1 1 1 2 1 2 2 2 1 2 1 1 2 1
Visualizing & Interpreting results of kmeans
#
plot(male_fighters[, 1], male_fighters[, 2],
col = km.male_fighter$cluster,
main = "Male K-means Clustering with 2 Clusters",
xlab = "Punch Power",
ylab = "Kick Power",
pch = 2, cex = 1.5)

plot(female_fighters[, 1], female_fighters[, 2],
col = km.female_fighter$cluster,
main = "Female K-means Clustering with 2 Clusters",
xlab = "Punch Power",
ylab = "Kick Power",
pch = 2, cex = 1.5)

#From the results of the Male K means clustering we can see that there a two groups the male fighters who both have a high punch power ranging from 94 - 98 and they also appear to have a similar kicking power. This can be concluded as being the most dominant fighters based on analysis of these two categories. While the cluster in black may show a group of males that typical have weaker attributes with punch power ranging from 86-94 and the kick power from 88-92. Using this cluster plot we see a picture of who the more dominant fighters are but in the real world scenario if takes more than Punch and Kick power to determine who the better fighters are.
#The results from the Female cluster draw similar conclusions, with a small sample size.
km.ufc_fighters <- kmeans(ufc_fighters, centers = 2, nstart = 20)
print(km.ufc_fighters)
## K-means clustering with 2 clusters of sizes 33, 22
##
## Cluster means:
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 94.00000 92.96970 95.15152 93.81818 92.69697 92.84848
## 2 91.04545 89.27273 92.77273 91.40909 91.59091 90.86364
## Takedowns
## 1 89.24242
## 2 94.13636
##
## Clustering vector:
## [1] 1 1 1 1 2 1 2 1 1 2 1 2 1 2 1 1 1 1 2 2 1 1 2 2 1 2 1 2 1 2 2 1 2 1 1 2 2 1
## [39] 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 1263.394 893.000
## (between_SS / total_SS = 27.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
km.ufc_fighters$cluster
## [1] 1 1 1 1 2 1 2 1 1 2 1 2 1 2 1 1 1 1 2 2 1 1 2 2 1 2 1 2 1 2 2 1 2 1 1 2 2 1
## [39] 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 2 1
plot(ufc_fighters[, 1], ufc_fighters[, 2],
col = km.ufc_fighters$cluster,
main = "K-means Clustering with 2 Clusters",
xlab = "Punch Power",
ylab = "Kick Power",
pch = 2, cex = 1.5)

#A combination of the two groups reveals slight changes in the ranges of the Dominant and weaker fighters in punch power as the highest rating is now 95, mean while the kick power remains 92 and below for the "weaker" fighters.
Selecting number of Clusters
#Using WSS Method
wss <- numeric(15)
#number of iterations
for(i in 1:15) {
km.ufc_fighters <- kmeans(ufc_fighters, centers = i, nstart = 20)
wss[i] <- km.ufc_fighters$tot.withinss
}
print(wss)
## [1] 2987.6364 2156.3939 1804.9167 1506.8539 1323.4083 1164.2669 1057.3123
## [8] 978.1817 890.7671 804.8679 744.4345 703.8826 637.7202 598.8095
## [15] 554.1167
plot(1:15, wss, type = "b", pch = 1, col = "blue",
main = "Elbow Method for Optimal Clusters",
xlab = "Number of Clusters",
ylab = "Within-Cluster Sum of Squares (WSS)")

#the line plot shows a decreasing WSS curve as the number of clusters increases. The elbow of the curve is positioned at 2 clusters. This supports the decision to use to clusters for the Kmeans algorithm above.
Hierarchical Clustering
# I will be using the ufc_fighters dataframe to compare the results between Hierarchical Clustering and kmeans Clustering.
dist_ufc_fighters <- dist(ufc_fighters)
d <- dist(ufc_fighters)
hclust(d = dist_ufc_fighters)
##
## Call:
## hclust(d = dist_ufc_fighters)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 55
# Hclust model
hclust_ufc_fighters <- hclust(dist(ufc_fighters))
summary(hclust_ufc_fighters)
## Length Class Mode
## merge 108 -none- numeric
## height 54 -none- numeric
## order 55 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
plot(hclust_ufc_fighters)
abline(h = 15, col = "red")

# I chose to use the height 15 in the Dendogram and it gave a result of 4 Clusters.The next step was to perform Tree "cutting". Below I will demonstrate two methods using Height and another using number of clusters K.
cutree(hclust_ufc_fighters, h = 15)
## [1] 1 1 1 1 2 3 2 1 3 2 1 4 3 1 3 1 4 4 2 1 4 4 1 1 3 2 4 2 4 1 2 3 2 3 3 2 2 3
## [39] 1 4 1 3 2 4 1 4 1 1 4 3 1 1 2 2 1
cutree(hclust_ufc_fighters, k = 4)
## [1] 1 1 1 1 2 3 2 1 3 2 1 4 3 1 3 1 4 4 2 1 4 4 1 1 3 2 4 2 4 1 2 3 2 3 3 2 2 3
## [39] 1 4 1 3 2 4 1 4 1 1 4 3 1 1 2 2 1
Plotting Dendrogram of Hclust
hclust.complete <- hclust(d, method = "complete")
hclust.average <- hclust(d, method = "average")
hclust.single <-hclust(d, method = "single")
# Plotting dendgram of hclust
plot(hclust.complete, main = "complete")
abline(h = 15, col = "red")

plot(hclust.average, main = "average")

plot(hclust.single, main = "single")

# The dendrogram determines using the complete method is the best option as it accounts for 4 clusters.
# Now I want to find out if scaling is necessary for this data
colMeans(ufc_fighters)
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength
## 92.81818 91.49091 94.20000 92.85455 92.25455
## Leg.Strength Takedowns
## 92.05455 91.20000
# Observation: the variables have similar means, which range from 91.2 and 94.2
apply(ufc_fighters, 2, sd)
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength
## 3.085755 2.980633 2.280351 2.504945 1.945728
## Leg.Strength Takedowns
## 2.475197 3.941235
# Observation: A majority of the sd(standard deviations) are between 2 and 3. However "Takedowns" has a higher sd of 3.94. Based on the results you would most likely think scaling is not necessary, however because I am using algorithms that are highly sensitive like k-means and hierarchical clustering with Euclidean distance I have decided to go ahead with scaling.
Scaling
scaled_ufc_fighters <- scale(ufc_fighters)
colMeans(scaled_ufc_fighters)
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength
## 1.672904e-15 1.997392e-15 -1.241179e-15 -1.764245e-15 2.128600e-15
## Leg.Strength Takedowns
## 2.843685e-15 -7.145799e-16
apply(scaled_ufc_fighters, 2, sd)
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength
## 1 1 1 1 1
## Leg.Strength Takedowns
## 1 1
# The results I have from the mean and sd are satisfactory as the mean is approx = 0 and the sd is = 1
hclust.fighters <- hclust(dist(scaled_ufc_fighters), method = "complete")
# Now I want to Compare the kmeans{} and the hclust()
plot(hclust.fighters, main = "Hierarchical Clustering Dendrogram (Scaled)",
xlab = "Fighters", sub = "", cex = 0.8)

# The dendrogram dectates 4 clusters is best.
cut.fighters <- cutree(hclust.fighters, k = 4)
km_scaled_fighters <- kmeans(scaled_ufc_fighters, centers = 4, nstart = 20)
print(km_scaled_fighters)
## K-means clustering with 4 clusters of sizes 14, 10, 14, 17
##
## Cluster means:
## Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength Leg.Strength
## 1 -0.1494088 -0.5481272 0.9146462 -0.14153824 0.08944002 -0.1663255
## 2 -1.0105085 -1.2047470 -1.4909986 -1.01980118 -0.33640126 -0.6280492
## 3 0.4061394 1.1054044 0.3821467 0.88500285 0.97049094 1.0745571
## 4 0.3829915 0.2497406 -0.1908891 -0.01238193 -0.67500123 -0.3785147
## Takedowns
## 1 -0.8119282
## 2 0.9641648
## 3 -0.1413625
## 4 0.2179072
##
## Clustering vector:
## [1] 3 4 3 3 2 1 4 4 4 2 3 1 1 4 3 3 1 1 2 4 4 1 1 4 3 2 1 2 1 4 2 3 2 1 4 2 4 3
## [39] 4 1 3 3 1 1 4 1 3 4 4 4 3 3 2 2 4
##
## Within cluster sum of squares by cluster:
## [1] 61.02464 40.83950 52.87905 48.06717
## (between_SS / total_SS = 46.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
plot(scaled_ufc_fighters, main = "Hierarchical Clustering (Scaled)", col = 2)

plot(ufc_fighters[, 1], ufc_fighters[, 2],
col = km_scaled_fighters$cluster,
main = "K-means Clustering with 4 Clusters",
xlab = "Punch Power",
ylab = "Kick Power",
pch = 2, cex = 1.5)

table(km.ufc_fighters$cluster, cut.fighters)
## cut.fighters
## 1 2 3 4
## 1 0 0 2 0
## 2 0 3 0 0
## 3 2 0 0 1
## 4 0 0 0 4
## 5 0 0 0 5
## 6 3 0 0 1
## 7 0 0 2 0
## 8 1 1 2 0
## 9 3 0 0 1
## 10 4 0 0 0
## 11 6 0 0 0
## 12 6 0 0 0
## 13 0 0 0 2
## 14 0 0 4 0
## 15 2 0 0 0
# Comparing the Kmeans against the Hierarchical Clustering, I have concluded that 4 clusters are too much. The table shows values that are spread across rows and columns. This shows that there is a disagreement between k-means and hierarchical clustering. This is okay as both methods use different algorithms to analyse data.
Dimensional Reduction
library(cluster)
library(ggplot2)
#Principal Component Analysis
ufc_pca <- prcomp(ufc_fighters, scale. = TRUE)
ufc_pca
## Standard deviations (1, .., p=7):
## [1] 1.6702150 1.1663917 0.9726150 0.9142879 0.7415688 0.6238773 0.3589741
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4 PC5
## Punch.Power -0.3331537 -0.418299422 0.59688765 0.22916694 -0.095353315
## Kick.Power -0.5019811 0.002175869 -0.09286234 0.51112131 0.002467875
## Punch.Speed -0.3969549 -0.243993459 0.14051578 -0.56626098 -0.487148931
## Kick.Speed -0.4262803 0.122667102 -0.59339940 0.21964500 -0.315451490
## Body.Strength -0.2506796 0.625738705 0.08625097 -0.40870857 -0.084768693
## Leg.Strength -0.3659803 0.431628699 0.36638819 0.06101042 0.471733938
## Takedowns 0.3177322 0.415437808 0.34864690 0.38285393 -0.651428655
## PC6 PC7
## Punch.Power 0.3293232 0.43323880
## Kick.Power 0.1848500 -0.66630793
## Punch.Speed -0.3706840 -0.26034040
## Kick.Speed -0.1436089 0.53173110
## Body.Strength 0.6024185 0.03217230
## Leg.Strength -0.5518059 0.12153133
## Takedowns -0.1787421 -0.04492079
#Visualizing the first two Principal Components
pca_data <- data.frame(PC1 = ufc_pca$x[, 1],
PC2 = ufc_pca$x[, 2],
Cluster = factor(km.ufc_fighters$cluster))
pca_data
## PC1 PC2 Cluster
## 1 -2.21501858 0.64377680 10
## 2 -0.02779265 -0.39712345 9
## 3 -1.91796178 2.71517842 2
## 4 -1.91796178 2.71517842 2
## 5 4.25534474 -0.22672377 7
## 6 -1.79143081 -1.66089105 6
## 7 0.69987144 0.70481871 9
## 8 -0.20155370 0.32967386 9
## 9 -1.04542381 -0.70640676 11
## 10 2.11492040 1.21306984 8
## 11 -1.87837860 1.23265939 10
## 12 1.14496036 -0.80380435 13
## 13 -1.81662179 -0.66321727 6
## 14 0.67355663 -0.72366858 5
## 15 -2.50281898 -1.11860466 11
## 16 -0.80942528 0.66473671 3
## 17 -0.14949130 -0.60811472 13
## 18 0.52001537 -1.25453866 5
## 19 2.51128000 0.08204584 7
## 20 0.91603408 -0.45626705 12
## 21 -0.12343222 0.34601399 5
## 22 -0.27158017 -0.14344560 9
## 23 -0.07674592 0.73511028 3
## 24 0.31113299 -0.48010590 12
## 25 -1.93732603 -0.31477938 11
## 26 2.64154377 0.98400127 1
## 27 -0.02289619 -0.24412013 4
## 28 2.36108743 -0.39447053 14
## 29 -0.13781742 0.96998001 4
## 30 0.40488548 -0.83283954 12
## 31 2.34582563 1.33276440 1
## 32 -2.87248926 -0.97828765 11
## 33 2.33373981 -0.42462070 14
## 34 -0.53813324 -2.07906517 6
## 35 -0.48782648 -2.26709896 15
## 36 3.71268891 0.50814388 14
## 37 1.26912684 0.20030598 8
## 38 -2.76069531 0.21507893 11
## 39 -0.48287778 -0.26602185 12
## 40 -0.07942965 -1.07075389 5
## 41 -2.01118645 0.83552610 10
## 42 -2.90590975 0.39126162 11
## 43 0.81800797 1.16915310 4
## 44 0.72503311 -0.65110901 4
## 45 0.94766985 -1.80279477 12
## 46 -1.08659412 -1.14815750 6
## 47 -1.43378358 0.84445965 10
## 48 -0.48091801 0.20803610 3
## 49 1.41973349 -1.40291031 5
## 50 -0.17390605 -1.63543457 15
## 51 -0.14384935 2.51451681 8
## 52 -1.30184162 1.68199903 2
## 53 1.85982236 0.57826108 14
## 54 1.74898575 2.12491608 8
## 55 -0.13214874 -1.18529054 12
variance <- ufc_pca$sdev^2 / sum(ufc_pca$sdev^2)
barplot(variance[1:7],
names.arg = paste("PC", 1:7),
main = "PCA Scree Plot",
xlab = "Principal Components",
ylab = "Proportion of Variance",
col = "blue")

#PC1 and PC2 are enough to explain a large majority of the variance
#Comparing membership between Kmeans and Hclust
h_clusters <- cutree(hclust_ufc_fighters, k = 4)
k_clusters <- km.ufc_fighters$cluster
table(Hierarchical = h_clusters, KMeans = k_clusters)
## KMeans
## Hierarchical 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1 0 3 3 0 1 0 0 1 2 4 0 6 0 0 0
## 2 2 0 0 1 0 0 2 3 1 0 0 0 0 4 0
## 3 0 0 0 0 0 3 0 0 0 0 6 0 0 0 2
## 4 0 0 0 3 4 1 0 0 1 0 0 0 2 0 0
#Finding the mean for each cluster
aggregate(ufc_fighters, by = list(Hierarchical = h_clusters), mean)
## Hierarchical Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength
## 1 1 93.15000 93.00000 94.60000 94.05000 92.70000
## 2 2 89.84615 88.30769 91.53846 90.76923 91.84615
## 3 3 96.18182 94.45455 95.18182 94.27273 91.90909
## 4 4 92.36364 89.54545 95.63636 91.72727 92.27273
## Leg.Strength Takedowns
## 1 92.85000 93.05000
## 2 90.76923 94.76923
## 3 92.72727 86.45455
## 4 91.45455 88.36364
aggregate(ufc_fighters, by = list(KMeans = k_clusters), mean)
## KMeans Punch.Power Kick.Power Punch.Speed Kick.Speed Body.Strength
## 1 1 86.00000 88.00000 89.00000 95.00000 92.00000
## 2 2 90.33333 95.66667 92.66667 95.66667 95.66667
## 3 3 90.66667 92.00000 96.00000 95.33333 91.33333
## 4 4 92.25000 88.25000 95.50000 89.75000 93.75000
## 5 5 94.20000 89.80000 95.00000 91.20000 91.40000
## 6 6 95.00000 92.00000 96.75000 94.00000 92.00000
## 7 7 88.00000 86.50000 90.50000 88.50000 91.00000
## 8 8 91.25000 89.75000 92.50000 92.00000 93.75000
## 9 9 90.75000 92.25000 94.50000 94.25000 92.75000
## 10 10 95.50000 93.75000 96.75000 94.25000 95.00000
## 11 11 95.83333 95.66667 95.16667 95.00000 92.33333
## 12 12 94.50000 92.33333 93.83333 92.83333 90.16667
## 13 13 88.50000 88.50000 96.00000 93.50000 92.00000
## 14 14 92.50000 88.25000 91.50000 88.50000 90.50000
## 15 15 97.50000 94.50000 92.50000 93.00000 90.00000
## Leg.Strength Takedowns
## 1 89.00000 93.00000
## 2 96.00000 90.66667
## 3 93.66667 94.33333
## 4 93.50000 89.25000
## 5 91.20000 90.80000
## 6 91.25000 85.00000
## 7 89.00000 91.50000
## 8 92.00000 97.75000
## 9 90.00000 91.25000
## 10 94.25000 93.00000
## 11 94.66667 86.50000
## 12 91.00000 93.16667
## 13 89.50000 85.50000
## 14 91.50000 97.00000
## 15 89.50000 88.00000
Scatter Plot
#Scatter plot for k-means
plot(ufc_fighters[, 1], ufc_fighters[, 2], col = k_clusters,
main = "K-Means Clustering", xlab = "Punch Power", ylab = "Kick Power")

#Scatter plot for hierarchical clustering
plot(ufc_fighters[, 1], ufc_fighters[, 2], col = h_clusters,
main = "Hierarchical Clustering", xlab = "Punch Power", ylab = "Kick Power")

ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(size = 1, alpha = 0.8) +
scale_color_hue() +
theme_minimal() +
labs(title = "PCA Scatter Plot with Clusters",
x = "Principal Component 1",
y = "Principal Component 2",
color = "Cluster") +
theme(legend.position = "right")

Silhoutte Scores
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
#Kmeans
library(cluster)
silhouette_kmeans <- silhouette(k_clusters, dist(ufc_fighters))
plot(silhouette_kmeans)

#Hclust
silhouette_hclust <- silhouette(h_clusters, dist(ufc_fighters))
plot(silhouette_hclust)

#Silhoutte values range from -1 to 1.
#when it is close to 1, the data is well matched to its cluster. when it is close to 0, the data points are on a boundary between clusters and when it is negative it is misclassified.
#Kmeans cluster has a higher silhoutte width, which tells us that the fighters are better grouped based on their skill attributes than in the hclust which has a width of 0.19.
#The silhoutte scores for Hclust are uniformely low, which shows a poorly defined cluster as compared to the kmeans.