The first step is to load the libraries needed for the data preparation and the clustering analysis.

library(knitr)
library(readr)
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(grid)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.3

Loading the data

# Read the raw dataset from disk into a data frame.
# NOTE(review): the path is absolute and machine-specific; adjust it (or make
# it project-relative) before running this on another machine.
df_full_clean <- read.csv("D:/UPV/2º/Proyecto II/lol_dataset.csv")

# View() opens a spreadsheet-style data viewer, which only makes sense when a
# person is at the console; skip it in non-interactive runs (Rscript, knitting).
if (interactive()) {
  View(df_full_clean)
}

We select the variables that are of interest for the clustering analysis.

# The nine variables used as clustering features, in the order they should
# appear in the feature table.
cluster_vars <- c(
  "kills", "deaths", "assists", "gold_earned",
  "total_damage_dealt_to_champions", "total_damage_taken",
  "vision_score", "time_ccing_others", "champion_mastery_points"
)

# Keep only those columns of the full dataset.
df_cluster <- select(df_full_clean, all_of(cluster_vars))

We use the silhouette method, together with the within-cluster sum of squares (elbow) plot, to find the optimal number of clusters.

# Standardise the clustering variables: scale() centres and scales each column.
df_scaled <- scale(df_cluster)

# Both diagnostic plots share the same hierarchical clustering set-up
# (Ward.D2 linkage, Euclidean distance, at most 10 clusters); only the
# criterion passed as `method` differs.
plot_nbclust <- function(criterion) {
  fviz_nbclust(
    x = df_scaled, FUNcluster = hcut, method = criterion,
    hc_method = "ward.D2", k.max = 10, verbose = FALSE,
    hc_metric = "euclidean"
  ) +
    labs(title = "Optimal Num. clusters")
}

p1 <- plot_nbclust("silhouette") # average silhouette criterion
p2 <- plot_nbclust("wss")        # within-cluster sum of squares criterion

# Show the two diagnostics side by side.
grid.arrange(p1, p2, nrow = 1)

We run k-means with four clusters and draw a PCA plot to visualize the resulting clusters.

# Fix the random seed for reproducibility.
set.seed(123)

# k-means on the standardised variables: 4 clusters, 25 random starts.
km_res <- kmeans(x = df_scaled, centers = 4, nstart = 25)

# Add each row's cluster label (as a factor) to the feature data frame.
df_cluster[["cluster"]] <- factor(km_res[["cluster"]])

# PCA-based visualisation of the clusters. `stand = FALSE` because the data
# passed in has already been standardised with scale().
fviz_cluster(
  object = km_res,
  data = df_scaled,
  geom = "point",
  ellipse.type = "norm",
  stand = FALSE,
  main = "Clustering of the players (k-means + PCA)"
)

We calculate the mean of each variable within each cluster so that we can describe the cluster’s characteristics.

# Work on a copy of the full dataset and attach each row's k-means cluster
# label to it as a factor.
df2 <- df_full_clean
df2[["cluster"]] <- factor(km_res[["cluster"]])

# Variables whose per-cluster means are reported.
profile_vars <- c(
  "kills", "deaths", "assists", "gold_earned",
  "total_damage_dealt_to_champions", "total_damage_taken",
  "vision_score", "time_ccing_others", "champion_mastery_points"
)

# Show the mean of each variable within each cluster.
df2 %>%
  group_by(cluster) %>%
  summarise(across(all_of(profile_vars), mean))

Cluster Description

1 Passive or beginner player

2 Utility support

3 Balanced fighter - Balance between kills and assists - Good gold and damage - Acceptable vision, average participation → Versatile player, possibly top or jungler

4 Aggressive carry - Very high in kills, gold, damage and damage taken - Maximum offensive impact - More kills too → Very aggressive carry player, as mid or ADC

# Playstyle label for each cluster id. The ids are the ones assigned by the
# k-means fit stored in `km_res`.
playstyle_labels <- c(
  "1" = "Passive or beginner player",
  "2" = "Utility support",
  "3" = "Balanced fighter",
  "4" = "Aggressive carry"
)

# Look the label up by cluster id; an id without an entry yields NA.
# unname() keeps the new column a plain character vector.
df2 <- df2 %>%
  mutate(
    estilo_jugador = unname(playstyle_labels[as.character(cluster)])
  )

We calculate the number of games played by each playstyle, the number of victories and the percentage of victories.

# Summarise games, wins and win rate for each playstyle.
resumen_estilos <- df2 %>%
  # Compute the "game was won" flag once and reuse it below.
  mutate(partida_ganada = win == TRUE) %>%
  group_by(estilo_jugador) %>%
  summarise(
    total_partidas = n(),
    victorias = sum(partida_ganada),
    porcentaje_victorias = round(mean(partida_ganada) * 100, 2),
    # Drop the grouping so later operations are not applied per playstyle.
    .groups = "drop"
  ) %>%
  # Highest win rate first.
  arrange(desc(porcentaje_victorias))

# Show the final table.
print(resumen_estilos)
## # A tibble: 4 × 4
##   estilo_jugador             total_partidas victorias porcentaje_victorias
##   <chr>                               <int>     <int>                <dbl>
## 1 Aggressive carry                     1045       646                 61.8
## 2 Utility support                       831       458                 55.1
## 3 Balanced fighter                     2584      1263                 48.9
## 4 Passive or beginner player           1479       636                 43

Finally, we plot the results in a bar chart.

# Games played, wins and win rate (%) for each playstyle, sorted from the
# highest to the lowest win rate.
victorias_por_estilo <- df2 %>%
  group_by(estilo_jugador) %>%
  summarise(
    total_partidas = n(),
    victorias = sum(win == TRUE),
    porcentaje_victorias = round(mean(win == TRUE) * 100, 2),
    # Return an ungrouped table.
    .groups = "drop"
  ) %>%
  arrange(desc(porcentaje_victorias))

# Horizontal bar chart of the win rate per playstyle, with bars ordered by
# win rate and each bar labelled with its percentage.
ggplot(
  victorias_por_estilo,
  aes(
    x = reorder(estilo_jugador, porcentaje_victorias),
    y = porcentaje_victorias,
    fill = estilo_jugador
  )
) +
  # geom_col() draws bars whose height is the y value itself.
  geom_col(width = 0.6) +
  # hjust = -0.1 places the label just past the end of its bar.
  geom_text(
    aes(label = paste0(porcentaje_victorias, "%")),
    hjust = -0.1, size = 4
  ) +
  # Leave extra room beyond the longest bar so its label is not cut off at
  # the panel edge.
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  coord_flip() +
  labs(
    title = "Percentage of victories by player's playstyle",
    x = "Playstyle of the player",
    y = "Percentage of victories (%)"
  ) +
  theme_minimal() +
  # The axis already names each playstyle, so the fill legend is redundant.
  theme(legend.position = "none")