Introduction

The dataset used here is E-Sports Player Performance & Tournament Analytics from Kaggle (https://www.kaggle.com/datasets/jayjoshi37/e-sports-player-performance-and-tournament-analytics). According to its Kaggle description, it contains synthetic competitive e-sports tournament data designed to analyze player performance, match statistics, and tournament-level outcomes across multiple gaming events; each record represents a player’s performance in a specific tournament match. The columns in this dataset are listed below, and the target variable is ‘performance_score’.

  1. record_id
  2. player_id
  3. team_name
  4. player_role
  5. map_played
  6. match_type
  7. kills
  8. assists
  9. deaths
  10. accuracy_percent
  11. reaction_time_ms
  12. fatigue_index
  13. performance_score
  14. win_probability
  15. match_outcome
  16. mvp_award
library(ggplot2)
library(dplyr)
library(tidyr)
library(patchwork)
library(nnet)
library(randomForest)
# Read the raw tournament records; the relative path is resolved against the
# current working directory.
esportdf <- read.csv(file = "esports_player_performance_tournament_analytics.csv")
## Warning: package 'patchwork' was built under R version 4.5.3
## Warning: package 'nnet' was built under R version 4.5.3
## Warning: package 'randomForest' was built under R version 4.5.3

Loading and Cleaning

# Preview the first rows of the freshly loaded data (all 16 columns).
head(esportdf)
##   record_id player_id     team_name player_role   map_played match_type kills
## 1         1       160 Titan Esports        Flex   Dust Arena    Playoff    18
## 2         2         2 Titan Esports     Support    Neon City      Final    16
## 3         3       468    Team Alpha     Support    Neon City Semi-Final    14
## 4         4       447     Team Nova         IGL Desert Storm      Final    12
## 5         5       196 Phantom Squad      Sniper      Skyline    Playoff    13
## 6         6       173    Team Alpha      Sniper  Frozen Base  Qualifier     9
##   assists deaths accuracy_percent reaction_time_ms fatigue_index
## 1       6      7            40.43           192.28          0.52
## 2       7     11            40.50           201.98          0.68
## 3       7     15            36.93           172.50          0.27
## 4       6     13            53.46           246.83          0.07
## 5       9     13            44.71           266.94          0.52
## 6       8      8            44.51           247.46          0.80
##   performance_score win_probability match_outcome mvp_award
## 1             50.02            0.83           Win        No
## 2             41.45            0.89           Win        No
## 3             35.77            0.81           Win        No
## 4             43.03            1.00           Win        No
## 5             39.16            0.85           Win        No
## 6             32.25            0.86           Win        No
# Total number of missing cells, summed over the per-column NA counts.
sum(colSums(is.na(esportdf)))
## [1] 0

Dropping unnecessary columns

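Learning from last time, our features will be the numeric in-match statistics that come before performance_score (kills through fatigue_index); the identifier columns, the categorical columns, and the outcome columns that follow performance_score are dropped.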

# Keep only kills..fatigue_index plus the target performance_score: remove the
# id columns, the text columns, and the three columns that follow the target.
esportdf <- esportdf %>%
  select(
    -c(
      record_id, player_id,
      team_name, player_role, map_played, match_type,
      win_probability, match_outcome, mvp_award
    )
  )
head(esportdf)
##   kills assists deaths accuracy_percent reaction_time_ms fatigue_index
## 1    18       6      7            40.43           192.28          0.52
## 2    16       7     11            40.50           201.98          0.68
## 3    14       7     15            36.93           172.50          0.27
## 4    12       6     13            53.46           246.83          0.07
## 5    13       9     13            44.71           266.94          0.52
## 6     9       8      8            44.51           247.46          0.80
##   performance_score
## 1             50.02
## 2             41.45
## 3             35.77
## 4             43.03
## 5             39.16
## 6             32.25

Exploratory Data Analysis

# Distribution of the target. The histogram is drawn on the density scale so
# the kernel density curve overlaid on it shares the same y-axis.
ggplot(data = esportdf, mapping = aes(x = performance_score)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 20,
    fill = "skyblue",
    color = "black"
  ) +
  geom_density(linewidth = 1, color = "tomato") +
  labs(title = "Performance Score Distribution")

# Fatigue index vs. performance score, with each point shaded by death count.
# The title now names the variable actually mapped to x (fatigue_index); it
# previously said "Accuracy", which this plot does not show.
ggplot(esportdf, aes(x = fatigue_index, y = performance_score, color = deaths)) +
  geom_point() +
  # A constant colour here overrides the inherited `deaths` colour mapping, so
  # the linear trend is drawn as a single red line.
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  scale_color_gradient(low = "#b88aea", high = "#490a64") +
  labs(
    title = "Relationship between Fatigue Index and Performance Score With Deaths",
    x = "Fatigue Index",
    y = "Performance Score",
    color = "Deaths"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Score/fatigue pair for the highest- and lowest-scoring records.
# slice_max()/slice_min() keep ties by default, so more than one row may be
# returned for either extreme.
max_row <- esportdf %>%
  select(performance_score, fatigue_index) %>%
  slice_max(order_by = performance_score, n = 1)

min_row <- esportdf %>%
  select(performance_score, fatigue_index) %>%
  slice_min(order_by = performance_score, n = 1)

print(max_row)
##   performance_score fatigue_index
## 1             87.36          0.31
# Show the lowest-scoring record(s) with the matching fatigue index.
print(min_row)
##   performance_score fatigue_index
## 1             16.58          0.88
# Build one "feature vs. performance_score" panel. Only the fitted linear
# trend and its confidence band are drawn (no raw points).
#   x_col   - name of the esportdf column to place on the x-axis (string)
#   x_label - axis label for that column
#   title   - panel title
# Returns a ggplot object.
plot_trend_vs_performance <- function(x_col, x_label, title) {
  # .data[[x_col]] lets the column be chosen by name at call time.
  ggplot(esportdf, aes(x = .data[[x_col]], y = performance_score)) +
    geom_smooth(method = "lm", color = "blue", se = TRUE) +
    labs(title = title, x = x_label, y = "Performance Score") +
    theme_minimal()
}

p1 <- plot_trend_vs_performance("kills", "Kills", "Kills vs Performance")
p2 <- plot_trend_vs_performance("deaths", "Deaths", "Deaths vs Performance")
p3 <- plot_trend_vs_performance(
  "accuracy_percent", "Accuracy", "Accuracy vs Performance"
)
# The x label spells out the quantity and unit instead of the bare "MS".
p4 <- plot_trend_vs_performance(
  "reaction_time_ms", "Reaction Time (ms)", "Reaction Time vs Performance"
)

# patchwork layout: p1 and p2 on the top row, p3 and p4 on the bottom row.
(p1 | p2) /
  (p3 | p4)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Unsupervised Learning

# Clustering inputs: the six in-match statistics, standardised column by column
# so they are on a comparable scale before distances are computed.
cluster_features <- c(
  "kills", "assists", "deaths",
  "accuracy_percent", "reaction_time_ms", "fatigue_index"
)
num_data <- esportdf[, cluster_features]
scaled_data <- scale(num_data)

# First fit: three clusters, 25 random starts, seeded for repeatability.
set.seed(123)
kmeans_model <- kmeans(x = scaled_data, centers = 3, nstart = 25)
esportdf$cluster <- kmeans_model$cluster

# PCA is used only to get a two-dimensional view of the clusters.
pca_model <- prcomp(scaled_data)
pca_df <- as.data.frame(pca_model$x[, c("PC1", "PC2")])
pca_df$cluster <- factor(esportdf$cluster)

cluster_levels <- levels(pca_df$cluster)
plot(
  x = pca_df$PC1,
  y = pca_df$PC2,
  col = pca_df$cluster,
  pch = 19,
  xlab = "PC 1",
  ylab = "PC 2",
  main = "K-Means Clusters"
)
legend(
  "topright",
  legend = cluster_levels,
  col = seq_along(cluster_levels),
  pch = 19
)

# Per-cluster means of the unscaled features, for interpretation.
aggregate(num_data, by = list(cluster = esportdf$cluster), FUN = mean)
##   cluster    kills  assists    deaths accuracy_percent reaction_time_ms
## 1       1 15.00864 7.727979  9.930052         44.06630         216.3739
## 2       2 17.21114 9.867876 11.387306         45.96184         210.8068
## 3       3 12.61609 6.794253  8.942529         45.69576         233.4019
##   fatigue_index
## 1     0.7772712
## 2     0.2909067
## 3     0.3128621
# Elbow curve: total within-cluster sum of squares for k = 1..10.
# Seeded here, like the other kmeans() calls in this script, so the curve does
# not depend on how much randomness earlier code happened to consume.
set.seed(123)
k_values <- 1:10
wss <- vapply(
  k_values,
  function(k) {
    # The fit stays local so the script-level `kmeans_model` is not overwritten
    # by a throwaway model.
    fit <- kmeans(
      scaled_data,
      centers = k,
      nstart = 25,      # more random starts
      iter.max = 100    # allow convergence
    )
    fit$tot.withinss
  },
  numeric(1)
)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 140000)
plot(k_values, wss, type = "b",
     xlab = "Number of Clusters (k)",
     ylab = "Within-cluster Sum of Squares",
     main = "Elbow Method")

# Refit with two clusters and redraw the PCA view and the cluster means.
set.seed(123)
kmeans_model <- kmeans(x = scaled_data, centers = 2, nstart = 25)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 140000)
esportdf$cluster <- kmeans_model$cluster

# Same projection as before; only the cluster labels attached to it change.
pca_model <- prcomp(scaled_data)
pca_df <- as.data.frame(pca_model$x[, 1:2])
pca_df$cluster <- factor(esportdf$cluster)

with(
  pca_df,
  plot(
    PC1, PC2,
    col = cluster,
    pch = 19,
    xlab = "PC 1",
    ylab = "PC 2",
    main = "K-Means Clusters"
  )
)
legend(
  "topright",
  legend = levels(pca_df$cluster),
  col = seq_along(levels(pca_df$cluster)),
  pch = 19
)

# Per-cluster means of the unscaled features.
aggregate(num_data, by = list(cluster = esportdf$cluster), FUN = mean)
##   cluster    kills  assists    deaths accuracy_percent reaction_time_ms
## 1       1 14.69884 7.824093  9.924709         44.19983         221.2422
## 2       2 15.06199 8.250187 10.134429         46.07220         218.9161
##   fatigue_index
## 1     0.7393361
## 2     0.2365049

Supervised Learning

Neural Network

# 75/25 train/test split on the modelling columns only.
# `cluster` is left out: it was added to esportdf by the k-means step above,
# which was fitted on every row (test rows included) and is only an integer
# label, so passing it to the models would leak test-set information and add a
# feature that is not among the columns leading up to performance_score.
# NOTE: the console output recorded below was produced with `cluster` included
# as a seventh input; re-run the script to refresh it.
set.seed(123)
model_df <- esportdf[, setdiff(names(esportdf), "cluster"), drop = FALSE]
n_obs <- nrow(model_df)
train_idx <- sample(seq_len(n_obs), floor(0.75 * n_obs))

train <- model_df[train_idx, ]
test  <- model_df[-train_idx, ]

# Standardise the predictors with the training centre/scale, then apply those
# same training statistics to the test predictors.
feature_cols <- setdiff(names(train), "performance_score")
x_train <- scale(train[, feature_cols])
x_test  <- scale(test[, feature_cols],
                 center = attr(x_train, "scaled:center"),
                 scale  = attr(x_train, "scaled:scale"))

# The target is attached unscaled, so errors stay in performance_score units.
train_scaled <- data.frame(x_train, performance_score = train$performance_score)
test_scaled  <- data.frame(x_test,  performance_score = test$performance_score)

# Single-hidden-layer network regressing performance_score on every other
# column of train_scaled.
nn_model <- nnet(
  performance_score ~ .,
  data = train_scaled,
  # five units in the hidden layer
  size = 5,
  # linear (not logistic) output unit, as required for a numeric target
  linout = TRUE,
  # iteration cap; the recorded run below stopped at this cap
  maxit = 500
)
## # weights:  46
## initial  value 4479024.792373 
## iter  10 value 27250.982260
## iter  20 value 14415.205409
## iter  30 value 9487.741795
## iter  40 value 7575.538370
## iter  50 value 2411.738143
## iter  60 value 192.311751
## iter  70 value 46.973770
## iter  80 value 13.392192
## iter  90 value 4.242143
## iter 100 value 3.172629
## iter 110 value 3.074976
## iter 120 value 1.925874
## iter 130 value 0.856114
## iter 140 value 0.746905
## iter 150 value 0.589038
## iter 160 value 0.442615
## iter 170 value 0.362017
## iter 180 value 0.314732
## iter 190 value 0.289579
## iter 200 value 0.287404
## iter 210 value 0.260179
## iter 220 value 0.188404
## iter 230 value 0.143843
## iter 240 value 0.112729
## iter 250 value 0.079883
## iter 260 value 0.061017
## iter 270 value 0.052925
## iter 280 value 0.051267
## iter 290 value 0.051145
## iter 300 value 0.051067
## iter 310 value 0.050447
## iter 320 value 0.048668
## iter 330 value 0.046075
## iter 340 value 0.043485
## iter 350 value 0.040892
## iter 360 value 0.038926
## iter 370 value 0.035912
## iter 380 value 0.034886
## iter 390 value 0.034874
## iter 400 value 0.034676
## iter 410 value 0.033447
## iter 420 value 0.032357
## iter 430 value 0.032269
## iter 440 value 0.031821
## iter 450 value 0.031439
## iter 460 value 0.031159
## iter 470 value 0.031059
## iter 480 value 0.031045
## iter 490 value 0.030988
## iter 500 value 0.030742
## final  value 0.030742 
## stopped after 500 iterations
# Score the network on the held-out rows.
pred_nn <- predict(nn_model, newdata = test_scaled)

nn_actual <- test_scaled$performance_score
nn_error <- pred_nn - nn_actual

# Root mean squared error.
rmse_nn <- sqrt(mean(nn_error^2))
rmse_nn
## [1] 0.003988278
# Mean absolute error.
mae_nn <- mean(abs(nn_error))
mae_nn
## [1] 0.003143445
# R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the test-set mean.
ss_res <- sum(nn_error^2)
ss_tot <- sum((nn_actual - mean(nn_actual))^2)
r2_nn <- 1 - ss_res / ss_tot
r2_nn
## [1] 0.9999998

Random Forest

# Random forest with 100 trees, fitted on the unscaled `train` rows.
rf_model <- randomForest(performance_score ~ ., data = train, ntree = 100)

pred_rf <- predict(rf_model, newdata = test)

rf_actual <- test$performance_score
rf_error <- pred_rf - rf_actual

# Root mean squared error.
rmse_rf <- sqrt(mean(rf_error^2))
rmse_rf
## [1] 2.601682
# Mean absolute error.
mae_rf <- mean(abs(rf_error))
mae_rf
## [1] 1.884136
# R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the test-set mean.
ss_res_rf <- sum(rf_error^2)
ss_tot_rf <- sum((rf_actual - mean(rf_actual))^2)
r2_rf <- 1 - ss_res_rf / ss_tot_rf
r2_rf
## [1] 0.9361206
# Scatter of predicted against actual values with the y = x line in red;
# points on the line are exact predictions.
#   actual    - observed performance scores
#   predicted - model predictions for the same rows
#   title     - panel title
plot_actual_vs_predicted <- function(actual, predicted, title) {
  ggplot(
    data = data.frame(actual = actual, predicted = predicted),
    aes(x = actual, y = predicted)
  ) +
    geom_point(alpha = 0.6) +
    geom_abline(slope = 1, intercept = 0, color = "red") +
    labs(
      title = title,
      x = "Actual Performance Score",
      y = "Predicted Performance Score"
    ) +
    theme_minimal()
}

end1 <- plot_actual_vs_predicted(
  test_scaled$performance_score, pred_nn, "NN: Actual vs Predicted"
)
end2 <- plot_actual_vs_predicted(
  test$performance_score, pred_rf, "RF: Actual vs Predicted"
)

# First column of the random forest importance matrix, one row per feature.
rf_importance <- importance(rf_model)
importance_df <- data.frame(
  Feature = rownames(rf_importance),
  Importance = rf_importance[, 1]
)

# Horizontal bars ordered by importance.
end3 <- ggplot(
  importance_df,
  aes(x = reorder(Feature, Importance), y = Importance)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Random Forest Feature Importance",
    x = "Feature",
    y = "Importance"
  ) +
  theme_minimal()

# patchwork layout: the two scatter plots on top, importance chart below.
(end1 | end2) / end3

Conclusion

Having learned from last time which variables were skewed, we found the analysis and modeling of performance_score much more productive this time. The K-means clusters were more difficult to interpret, since not every feature has a direct relationship with performance score. On the test set the neural network is very accurate (RMSE ≈ 0.004, MAE ≈ 0.003, R² ≈ 1.00), while the random forest is noticeably less accurate (RMSE ≈ 2.60, MAE ≈ 1.88, R² ≈ 0.94); the random forest could potentially be improved with hyperparameter tuning in the future.