Evidencia 2: Clustering en el deporte

Abraham Castañon - A01747966 Angie Zerón - A00834060

En este Markdown se buscará hacer un análisis de clasificación de todos los pilotos que han competido en la Formula 1 desde sus inicios hasta la primera carrera de la temporada 2023 con el fin de averiguar que piloto tiene las mejores estadísticas, agrupándolos en clusters y después analizando el cluster con mejor resultado

La base de datos que se utilizó fue sacada de Kaggle y todos los derechos de la misma son del propietario Link: https://www.kaggle.com/datasets/petalme/f1-drivers-dataset

Variables:

Librerías utilizadas

library(tidyverse)
library(factoextra)
library(cowplot)
library(ggpubr)
library(cluster)
library(purrr)
library(dplyr)
library(readxl)
library(openxlsx)
library(tree)
library(ggplot2)
library(grt)
F1 <- read.xlsx("F1Drivers.xlsx", rowNames = TRUE)
head(F1)
##                                          Seasons Championships Race_Entries
## Carlo Abate                         [1962, 1963]             0            3
## George Abecassis                    [1951, 1952]             0            2
## Kenny Acheson                       [1983, 1985]             0           10
## Andrea de Adamich [1968, 1970, 1971, 1972, 1973]             0           36
## Philippe Adams                            [1994]             0            2
## Walt Ader                                 [1950]             0            1
##                   Race_Starts Pole_Positions Race_Wins Podiums Fastest_Laps
## Carlo Abate                 0              0         0       0            0
## George Abecassis            2              0         0       0            0
## Kenny Acheson               3              0         0       0            0
## Andrea de Adamich          30              0         0       0            0
## Philippe Adams              2              0         0       0            0
## Walt Ader                   1              0         0       0            0
##                   Points Pole_Rate Start_Rate Win_Rate Podium_Rate FastLap_Rate
## Carlo Abate            0         0  0.0000000        0           0            0
## George Abecassis       0         0  1.0000000        0           0            0
## Kenny Acheson          0         0  0.3000000        0           0            0
## Andrea de Adamich      6         0  0.8333333        0           0            0
## Philippe Adams         0         0  1.0000000        0           0            0
## Walt Ader              0         0  1.0000000        0           0            0
##                   Points_Per_Entry Years_Active
## Carlo Abate              0.0000000            2
## George Abecassis         0.0000000            2
## Kenny Acheson            0.0000000            2
## Andrea de Adamich        0.1666667            5
## Philippe Adams           0.0000000            1
## Walt Ader                0.0000000            1
F1$Seasons <- sapply(F1$Seasons, function(season) {
  years <- regmatches(season, gregexpr("\\d{4}", season))[[1]]
  cantidad <- length(years)
  return(cantidad)})
F1$Seasons = as.numeric(F1$Seasons)
head(F1)
##                   Seasons Championships Race_Entries Race_Starts Pole_Positions
## Carlo Abate             2             0            3           0              0
## George Abecassis        2             0            2           2              0
## Kenny Acheson           2             0           10           3              0
## Andrea de Adamich       5             0           36          30              0
## Philippe Adams          1             0            2           2              0
## Walt Ader               1             0            1           1              0
##                   Race_Wins Podiums Fastest_Laps Points Pole_Rate Start_Rate
## Carlo Abate               0       0            0      0         0  0.0000000
## George Abecassis          0       0            0      0         0  1.0000000
## Kenny Acheson             0       0            0      0         0  0.3000000
## Andrea de Adamich         0       0            0      6         0  0.8333333
## Philippe Adams            0       0            0      0         0  1.0000000
## Walt Ader                 0       0            0      0         0  1.0000000
##                   Win_Rate Podium_Rate FastLap_Rate Points_Per_Entry
## Carlo Abate              0           0            0        0.0000000
## George Abecassis         0           0            0        0.0000000
## Kenny Acheson            0           0            0        0.0000000
## Andrea de Adamich        0           0            0        0.1666667
## Philippe Adams           0           0            0        0.0000000
## Walt Ader                0           0            0        0.0000000
##                   Years_Active
## Carlo Abate                  2
## George Abecassis             2
## Kenny Acheson                2
## Andrea de Adamich            5
## Philippe Adams               1
## Walt Ader                    1
colSums(is.na(F1))
##          Seasons    Championships     Race_Entries      Race_Starts 
##                0                0                0                0 
##   Pole_Positions        Race_Wins          Podiums     Fastest_Laps 
##                0                0                0                0 
##           Points        Pole_Rate       Start_Rate         Win_Rate 
##                0                0                0                0 
##      Podium_Rate     FastLap_Rate Points_Per_Entry     Years_Active 
##                0                0                0                0
sum(is.na(F1))
## [1] 0
F1C = scale(F1, center = TRUE, scale = TRUE)

F1C = as.data.frame(F1C)
Pilotos=rownames(F1C)
total_within = function(n_clusters, data, iter.max=1000, nstart=50){
  cluster_means = kmeans(data,centers = n_clusters,
                         iter.max = iter.max,
                         nstart = nstart)
  return(cluster_means$tot.withinss)}
total_withinss <- map_dbl(.x = 1:15,
                          .f = total_within,
                          data = F1C)
total_withinss
##  [1] 13872.000  8159.325  6130.684  5128.550  4417.059  3826.805  3347.241
##  [8]  2984.388  2756.369  2549.032  2478.268  2341.364  2210.958  2045.382
## [15]  1952.213
data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
  ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:15) +
  labs(title = "Suma total de cuadrados intra-cluster") +
  theme_bw()

set.seed(123)
kmeans_F1 = kmeans(F1C,centers=3,nstart = 50)
F1C = F1C %>% mutate(cluster = kmeans_F1$cluster)

(g1=ggplot(F1C, aes(x = Race_Entries, y = Race_Wins)) +
    geom_point(aes(color=as.factor(cluster)), size=10)+
    geom_text(aes(label = cluster), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=3"))

Analizando el cluster con mejores pilotos

cluster_2_data <- (F1C[kmeans_F1$cluster == 2, ])
print(cluster_2_data)
##                      Seasons Championships Race_Entries Race_Starts
## Fernando Alonso    4.3792974     3.6501425   6.11904112  6.20891040
## Alberto Ascari     0.6668797     3.6501425   0.05732504  0.08142183
## Valtteri Bottas    1.8091621    -0.1602287   3.19974841  3.27755013
## Jim Clark          1.5235915     3.6501425   0.80109389  0.83790190
## Juan Manuel Fangio 1.2380209     9.3656993   0.41061524  0.44074987
## Mika H√§kkinen     2.0947326     3.6501425   2.51176223  2.52107006
## Lewis Hamilton     3.5225856    13.1760705   5.22651851  5.35787032
## Niki Lauda         2.6658738     5.5553281   2.73489288  2.71019007
## Nigel Mansell      3.2370150     1.7449569   2.99521198  3.01278210
## Nelson Piquet      2.9514444     5.5553281   3.29271952  3.33428613
## Alain Prost        2.6658738     7.4605137   3.19974841  3.23972612
## Kimi Räikkönen   4.3792974     1.7449569   6.00747580  6.07652639
## Nico Rosberg       2.0947326     1.7449569   3.27412529  3.37211014
## Michael Schumacher 4.3792974    13.1760705   5.17073585  5.26331031
## Ayrton Senna       2.0947326     5.5553281   2.45597956  2.52107006
## Jackie Stewart     1.5235915     5.5553281   1.30313786  1.34852595
## Max Verstappen     1.2380209     3.6501425   2.49316801  2.57780606
## Sebastian Vettel   3.5225856     7.4605137   5.02198208  5.13092630
##                    Pole_Positions Race_Wins    Podiums Fastest_Laps     Points
## Fernando Alonso          3.269905  4.737012  6.5990602     4.015499  7.5954471
## Alberto Ascari           2.009568  1.810297  0.9175672     1.983596  0.1947243
## Valtteri Bottas          2.954821  1.348184  4.3818922     3.276625  6.5238921
## Jim Clark                5.002867  3.658748  1.9568647     4.939091  0.7487746
## Juan Manuel Fangio       4.372699  3.504711  2.1647242     4.015499  0.7111762
## Mika H√§kkinen           3.900073  2.888560  3.2733082     4.384936  1.3691485
## Lewis Hamilton          16.030809 15.673682 12.9734181    11.034801 16.3915978
## Niki Lauda               3.584989  3.658748  3.4811677     4.200217  1.3710285
## Nigel Mansell            4.845325  4.582974  3.8276002     5.308528  1.5947391
## Nelson Piquet            3.584989  3.350673  3.8968867     4.015499  1.6003788
## Alain Prost              5.002867  7.663726  7.0840657     7.340431  2.6794535
## Kimi Räikkönen         2.639737  3.042598  6.8762062     8.264024  6.8321992
## Nico Rosberg             4.530241  3.350673  3.6890272     3.461343  5.7850831
## Michael Schumacher      10.516838 13.825231 10.4791041    13.990296  5.6779276
## Ayrton Senna            10.044212  6.123350  5.2826167     3.276625  2.0835186
## Jackie Stewart           2.482195  3.966824  2.7190162     2.537751  1.1397982
## Max Verstappen           3.112363  5.353162  5.1440437     3.646062  7.4469334
## Sebastian Vettel         8.783876  7.971802  8.1926496     6.786276 11.4380058
##                     Pole_Rate Start_Rate   Win_Rate Podium_Rate FastLap_Rate
## Fernando Alonso     1.0701851  0.6531520  1.7918238    2.201608     1.199008
## Alberto Ascari      8.8684119  0.5854792  8.7860346    4.450260     8.083526
## Valtteri Bottas     1.8807841  0.6636546  0.8824121    2.726862     1.888278
## Jim Clark           9.4659613  0.6366794  7.6048861    3.728888     8.541436
## Juan Manuel Fangio 11.7355856  0.6196209 10.3372079    5.933722     9.891501
## Mika H√§kkinen      3.1390728  0.6041673  2.5278527    2.514643     3.208690
## Lewis Hamilton      6.8691724  0.6789197  7.3461385    5.380176     4.234264
## Niki Lauda          2.6667775  0.5743930  2.9874931    2.477012     2.842782
## Nigel Mansell       3.3531390  0.6143430  3.4707666    2.512855     3.336303
## Nelson Piquet       2.2445712  0.6342307  2.2960681    2.333953     2.280149
## Alain Prost         3.2634835  0.6331246  5.5399039    4.540446     4.391200
## Kimi Räikkönen    0.8491078  0.6439787  1.1115377    2.352079     2.721403
## Nico Rosberg        2.8824409  0.6789197  2.3084450    2.210372     1.957865
## Michael Schumacher  4.4970006  0.6588967  6.5261356    4.338434     5.472007
## Ayrton Senna        8.3740876  0.6598855  5.5539283    4.249951     2.422010
## Jackie Stewart      3.4060079  0.6480843  5.9420386    3.650395     3.173869
## Max Verstappen      2.5046851  0.6789197  4.7835117    4.078827     2.669400
## Sebastian Vettel    3.8357084  0.6686412  3.8003497    3.431215     2.637637
##                    Points_Per_Entry Years_Active cluster
## Fernando Alonso            4.369303    4.3792974       2
## Alberto Ascari             2.292471    0.6668797       2
## Valtteri Bottas            6.909707    1.8091621       2
## Jim Clark                  2.483053    1.5235915       2
## Juan Manuel Fangio         3.486811    1.2380209       2
## Mika H√§kkinen             1.702301    2.0947326       2
## Lewis Hamilton            11.301947    3.5225856       2
## Niki Lauda                 1.562456    2.6658738       2
## Nigel Mansell              1.675637    3.2370150       2
## Nelson Piquet              1.521577    2.9514444       2
## Alain Prost                2.739519    2.6658738       2
## Kimi Räikkönen           3.976511    4.3792974       2
## Nico Rosberg               5.982024    2.0947326       2
## Michael Schumacher         3.794003    4.3792974       2
## Ayrton Senna               2.707370    2.0947326       2
## Jackie Stewart             2.562841    1.5235915       2
## Max Verstappen             9.835454    1.2380209       2
## Sebastian Vettel           8.112788    3.5225856       2
# Obtener los nombres de los pilotos como un vector desde los rownames
pilotos <- rownames(cluster_2_data)

# Crear un dataframe con los nombres de los pilotos y las variables del cluster
plot <- data.frame(
  Pilotos = pilotos,
  Race_Entries = cluster_2_data$Race_Entries,
  Race_Wins = cluster_2_data$Race_Wins)

# Visualizar los datos del segundo cluster
print(plot)
##               Pilotos Race_Entries Race_Wins
## 1     Fernando Alonso   6.11904112  4.737012
## 2      Alberto Ascari   0.05732504  1.810297
## 3     Valtteri Bottas   3.19974841  1.348184
## 4           Jim Clark   0.80109389  3.658748
## 5  Juan Manuel Fangio   0.41061524  3.504711
## 6      Mika H√§kkinen   2.51176223  2.888560
## 7      Lewis Hamilton   5.22651851 15.673682
## 8          Niki Lauda   2.73489288  3.658748
## 9       Nigel Mansell   2.99521198  4.582974
## 10      Nelson Piquet   3.29271952  3.350673
## 11        Alain Prost   3.19974841  7.663726
## 12   Kimi Räikkönen   6.00747580  3.042598
## 13       Nico Rosberg   3.27412529  3.350673
## 14 Michael Schumacher   5.17073585 13.825231
## 15       Ayrton Senna   2.45597956  6.123350
## 16     Jackie Stewart   1.30313786  3.966824
## 17     Max Verstappen   2.49316801  5.353162
## 18   Sebastian Vettel   5.02198208  7.971802
ggplot(plot, aes(x = Race_Entries, y = Race_Wins, color = Pilotos)) +
  geom_point() +
  geom_text(aes(label = Pilotos), hjust = 0.5, vjust = -0.5, size = 3) +
  labs(title = "Gráfica de dispersión de los mejores pilotos por sus carreras disputadas y carreras ganadas", x = "Carreras disputadas", y = "Carreras ganadas") +
  guides(color = FALSE)