The first step we are going to perform is to load the libraries for the data preparation.
library(knitr)
library(readr)
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(grid)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
##
## Adjuntando el paquete: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.3
Loading the data
# Read the dataset from CSV into a base data.frame.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the document can be rendered on other machines.
df_full_clean <- read.csv("D:/UPV/2º/Proyecto II/lol_dataset.csv")

# View() opens a GUI data viewer, which is only meaningful in an interactive
# session; skip it when the document is rendered non-interactively.
if (interactive()) {
  View(df_full_clean)
}
We select the variables that are of interest for the clustering analysis.
# Subset the full dataset to the nine columns used as clustering features.
df_cluster <- dplyr::select(
  df_full_clean,
  kills,
  deaths,
  assists,
  gold_earned,
  total_damage_dealt_to_champions,
  total_damage_taken,
  vision_score,
  time_ccing_others,
  champion_mastery_points
)
We use the silhouette method, together with the within-cluster sum of squares (elbow) method, to find the optimal number of clusters.
# Standardise the selected features with scale() before clustering.
df_scaled <- scale(df_cluster)

# Both diagnostics evaluate up to k.max = 10 clusters using hcut() with Ward
# linkage ("ward.D2") on Euclidean distances.
# NOTE(review): the number of clusters is assessed with hierarchical
# clustering here, while the later model is fitted with kmeans() - confirm
# this is intended.

# Left panel: average silhouette width per number of clusters.
p1 <- fviz_nbclust(
  x = df_scaled,
  FUNcluster = hcut,
  method = "silhouette",
  k.max = 10,
  verbose = FALSE,
  hc_method = "ward.D2",
  hc_metric = "euclidean"
) +
  labs(title = "Optimal Num. clusters")

# Right panel: total within-cluster sum of squares per number of clusters.
p2 <- fviz_nbclust(
  x = df_scaled,
  FUNcluster = hcut,
  method = "wss",
  k.max = 10,
  verbose = FALSE,
  hc_method = "ward.D2",
  hc_metric = "euclidean"
) +
  labs(title = "Optimal Num. clusters")

# Show the two diagnostics side by side.
grid.arrange(p1, p2, nrow = 1)
We draw a PCA plot of the clusters to visualize them.
# Fix the RNG seed so the random k-means starts are reproducible.
set.seed(123)

# Fit k-means with 4 centres on the standardised features, keeping the best
# of 25 random starts.
km_res <- kmeans(x = df_scaled, centers = 4, nstart = 25)

# Attach each row's cluster assignment to the feature data frame as a factor.
df_cluster[["cluster"]] <- factor(km_res[["cluster"]])

# Plot the clusters as points with "norm" ellipses; stand = FALSE because the
# data passed in has already been standardised above.
fviz_cluster(
  object = km_res,
  data = df_scaled,
  geom = "point",
  ellipse.type = "norm",
  stand = FALSE,
  main = "Clustering of the players (k-means + PCA)"
)
We calculate the mean of each variable within each cluster so that we can describe the clusters' characteristics.
# Work on a copy of the full dataset and attach the k-means cluster
# assignment to it as a factor column.
df2 <- df_full_clean
df2[["cluster"]] <- factor(km_res[["cluster"]])

# Print the per-cluster mean of every clustering feature.
summarise(
  group_by(df2, cluster),
  across(
    c(
      kills,
      deaths,
      assists,
      gold_earned,
      total_damage_dealt_to_champions,
      total_damage_taken,
      vision_score,
      time_ccing_others,
      champion_mastery_points
    ),
    mean
  )
)
Cluster Description
1 Passive or beginner player
2 Utility support
3 Balanced fighter - Balance between kills and assists - Good gold and damage - Acceptable vision, average participation → Versatile player, possibly top or jungler
4 Aggressive carry - Very high in kills, gold, damage and damage taken - Maximum offensive impact - More kills too → Very aggressive carry player, as mid or ADC
# Translate each cluster id into a human-readable playstyle label.
# Rows whose cluster matches none of the four ids get NA.
# NOTE(review): k-means cluster numbering is arbitrary, so this mapping is
# only valid for the fitted model above - re-check it if the data or seed
# changes.
df2[["estilo_jugador"]] <- case_when(
  df2[["cluster"]] == 1 ~ "Passive or beginner player",
  df2[["cluster"]] == 2 ~ "Utility support",
  df2[["cluster"]] == 3 ~ "Balanced fighter",
  df2[["cluster"]] == 4 ~ "Aggressive carry"
)
We calculate the number of games played with each playstyle, the number of victories, and the percentage of victories.
# Per-playstyle summary: games played, wins, and win rate (in percent,
# rounded to two decimals), sorted from highest to lowest win rate.
resumen_estilos <- df2 %>%
  group_by(estilo_jugador) %>%
  summarise(
    total_partidas = n(),
    victorias = sum(win == TRUE),
    porcentaje_victorias = round(mean(win == TRUE) * 100, 2),
    # Drop the grouping so later operations are not applied per group.
    .groups = "drop"
  ) %>%
  arrange(desc(porcentaje_victorias))

# Show the final table.
print(resumen_estilos)
## # A tibble: 4 × 4
## estilo_jugador total_partidas victorias porcentaje_victorias
## <chr> <int> <int> <dbl>
## 1 Aggressive carry 1045 646 61.8
## 2 Utility support 831 458 55.1
## 3 Balanced fighter 2584 1263 48.9
## 4 Passive or beginner player 1479 636 43
Finally, we plot the results in a bar chart.
# Per-playstyle summary used by the chart: games played, wins, and win rate
# (in percent, rounded to two decimals), sorted by descending win rate.
victorias_por_estilo <- df2 %>%
  group_by(estilo_jugador) %>%
  summarise(
    total_partidas = n(),
    victorias = sum(win == TRUE),
    porcentaje_victorias = round(mean(win == TRUE) * 100, 2),
    .groups = "drop"
  ) %>%
  arrange(desc(porcentaje_victorias))

# Horizontal bar chart of win rate per playstyle, bars ordered by win rate.
ggplot(
  victorias_por_estilo,
  aes(
    x = reorder(estilo_jugador, porcentaje_victorias),
    y = porcentaje_victorias,
    fill = estilo_jugador
  )
) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(width = 0.6) +
  # Percentage label placed just past the end of each bar.
  geom_text(
    aes(label = paste0(porcentaje_victorias, "%")),
    hjust = -0.1,
    size = 4
  ) +
  # The labels sit outside the bars, so widen the upper end of the value axis;
  # with the default 5% expansion the longest bar's label gets clipped.
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.15))) +
  coord_flip() +
  labs(
    title = "Percentage of victories by player's playstyle",
    x = "Playstyle of the player",
    y = "Percentage of victories (%)"
  ) +
  theme_minimal() +
  # Bars are already labelled on the axis, so the fill legend is redundant.
  theme(legend.position = "none")