The dataset discussed here is E-Sports Player Performance & Tournament Analytics from Kaggle (https://www.kaggle.com/datasets/jayjoshi37/e-sports-player-performance-and-tournament-analytics). According to its Kaggle description, it contains synthetic competitive e-sports tournament data designed to analyze player performance, match statistics, and tournament-level outcomes across multiple gaming events; each record represents a player’s performance in a specific tournament match. The features in this dataset are shown below, and the target variable is ‘performance_score’.
library(ggplot2)
library(dplyr)
library(tidyr)
library(patchwork)
library(nnet)
library(randomForest)
# Load the tournament records from a CSV file; the path is relative to the
# current working directory.
esportdf <- read.csv("esports_player_performance_tournament_analytics.csv")
## Warning: package 'patchwork' was built under R version 4.5.3
## Warning: package 'nnet' was built under R version 4.5.3
## Warning: package 'randomForest' was built under R version 4.5.3
# Preview the first rows of the raw data.
head(esportdf)
## record_id player_id team_name player_role map_played match_type kills
## 1 1 160 Titan Esports Flex Dust Arena Playoff 18
## 2 2 2 Titan Esports Support Neon City Final 16
## 3 3 468 Team Alpha Support Neon City Semi-Final 14
## 4 4 447 Team Nova IGL Desert Storm Final 12
## 5 5 196 Phantom Squad Sniper Skyline Playoff 13
## 6 6 173 Team Alpha Sniper Frozen Base Qualifier 9
## assists deaths accuracy_percent reaction_time_ms fatigue_index
## 1 6 7 40.43 192.28 0.52
## 2 7 11 40.50 201.98 0.68
## 3 7 15 36.93 172.50 0.27
## 4 6 13 53.46 246.83 0.07
## 5 9 13 44.71 266.94 0.52
## 6 8 8 44.51 247.46 0.80
## performance_score win_probability match_outcome mvp_award
## 1 50.02 0.83 Win No
## 2 41.45 0.89 Win No
## 3 35.77 0.81 Win No
## 4 43.03 1.00 Win No
## 5 39.16 0.85 Win No
## 6 32.25 0.86 Win No
# Count missing values across every cell of the data frame.
sum(is.na(esportdf))
## [1] 0
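Learning from last time, our features will be the numeric match statistics that precede performance_score (kills through fatigue_index); the identifier and categorical columns, as well as the columns that follow performance_score, are dropped.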
# Drop the identifier columns, the categorical match descriptors, and
# win_probability / match_outcome / mvp_award, then preview what remains.
esportdf <- select(
  esportdf,
  -all_of(c(
    "record_id", "player_id", "team_name", "player_role", "map_played",
    "match_type", "win_probability", "match_outcome", "mvp_award"
  ))
)
head(esportdf)
## kills assists deaths accuracy_percent reaction_time_ms fatigue_index
## 1 18 6 7 40.43 192.28 0.52
## 2 16 7 11 40.50 201.98 0.68
## 3 14 7 15 36.93 172.50 0.27
## 4 12 6 13 53.46 246.83 0.07
## 5 13 9 13 44.71 266.94 0.52
## 6 9 8 8 44.51 247.46 0.80
## performance_score
## 1 50.02
## 2 41.45
## 3 35.77
## 4 43.03
## 5 39.16
## 6 32.25
# Density-scaled histogram of the target with a kernel density curve on top.
ggplot(data = esportdf, mapping = aes(x = performance_score)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 20,
    colour = "black",
    fill = "skyblue"
  ) +
  geom_density(colour = "tomato", linewidth = 1) +
  labs(title = "Performance Score Distribution")
# Fatigue index against performance score, with points shaded by deaths and a
# single linear trend line drawn over all points.
ggplot(esportdf, aes(x = fatigue_index, y = performance_score, color = deaths)) +
  geom_point() +
  labs(
    # The title names the variable that is actually on the x-axis
    # (fatigue_index), matching the axis label below.
    title = "Relationship between Fatigue Index and Performance Score With Deaths",
    x = "Fatigue Index",
    y = "Performance Score",
    color = "Deaths"
  ) +
  # A fixed colour keeps the trend line out of the deaths colour scale.
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  scale_color_gradient(low = "#b88aea", high = "#490a64") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Rows holding the highest and the lowest performance score, reduced to the
# score itself and the fatigue index.
max_row <- select(
  slice_max(esportdf, performance_score, n = 1),
  performance_score, fatigue_index
)
min_row <- select(
  slice_min(esportdf, performance_score, n = 1),
  performance_score, fatigue_index
)
print(max_row)
## performance_score fatigue_index
## 1 87.36 0.31
# Show the lowest-scoring row together with its fatigue index.
print(min_row)
## performance_score fatigue_index
## 1 16.58 0.88
# Build one panel showing the linear trend (with confidence band) of
# performance_score against a single predictor from esportdf.
#   x_var   - name of the esportdf column to place on the x-axis (string)
#   title   - panel title
#   x_label - x-axis label
# Returns a ggplot object.
trend_panel <- function(x_var, title, x_label) {
  ggplot(esportdf, aes(x = .data[[x_var]], y = performance_score)) +
    geom_smooth(method = "lm", color = "blue", se = TRUE) +
    labs(title = title, x = x_label, y = "Performance Score") +
    theme_minimal()
}

p1 <- trend_panel("kills", "Kills vs Performance", "Kills")
p2 <- trend_panel("deaths", "Deaths vs Performance", "Deaths")
p3 <- trend_panel("accuracy_percent", "Accuracy vs Performance", "Accuracy")
# The axis label names the variable and its unit, like the other panels.
p4 <- trend_panel(
  "reaction_time_ms", "Reaction Time vs Performance", "Reaction Time (ms)"
)

# patchwork layout: two panels per row, two rows.
(p1 | p2) /
  (p3 | p4)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Cluster the rows on the six standardised match statistics (k = 3) and view
# the result in the plane of the first two principal components.
num_data <- esportdf[c(
  "kills", "assists", "deaths",
  "accuracy_percent", "reaction_time_ms", "fatigue_index"
)]
scaled_data <- scale(num_data)

set.seed(123)
kmeans_model <- kmeans(x = scaled_data, centers = 3, nstart = 25)
esportdf[["cluster"]] <- kmeans_model[["cluster"]]

# PCA is used only to obtain two plotting axes for the clustered rows.
pca_model <- prcomp(scaled_data)
pca_df <- data.frame(pca_model$x[, c("PC1", "PC2")])
pca_df$cluster <- factor(esportdf$cluster)

plot(
  x = pca_df$PC1, y = pca_df$PC2,
  col = pca_df$cluster,
  pch = 19,
  main = "K-Means Clusters",
  xlab = "PC 1",
  ylab = "PC 2"
)
# Legend colours follow the integer codes of the cluster factor, which is what
# `col = pca_df$cluster` uses for the points.
legend(
  "topright",
  legend = levels(pca_df$cluster),
  col = seq_along(levels(pca_df$cluster)),
  pch = 19
)

# Per-cluster means of the unscaled statistics.
aggregate(num_data, by = list(cluster = esportdf$cluster), FUN = mean)
## cluster kills assists deaths accuracy_percent reaction_time_ms
## 1 1 15.00864 7.727979 9.930052 44.06630 216.3739
## 2 2 17.21114 9.867876 11.387306 45.96184 210.8068
## 3 3 12.61609 6.794253 8.942529 45.69576 233.4019
## fatigue_index
## 1 0.7772712
## 2 0.2909067
## 3 0.3128621
# Elbow method: total within-cluster sum of squares for k = 1..10.
# The seed is set here so the curve is reproducible when this step is run on
# its own. Each fit stays local to the anonymous function, so the previously
# fitted `kmeans_model` is no longer overwritten by the k = 10 model.
set.seed(123)
wss <- vapply(
  seq_len(10),
  function(k) {
    kmeans(
      scaled_data,
      centers = k,
      nstart = 25,    # more random starts
      iter.max = 100  # allow convergence
    )$tot.withinss
  },
  numeric(1)
)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 140000)
# Elbow curve: within-cluster sum of squares against the number of clusters.
plot(
  x = seq_along(wss), y = wss,
  type = "b",
  main = "Elbow Method",
  xlab = "Number of Clusters (k)",
  ylab = "Within-cluster Sum of Squares"
)
# Refit k-means with two clusters (presumably chosen from the elbow plot); the
# seed is reset so the 25 random starts are reproducible.
# NOTE(review): this call emitted "Quick-TRANSfer stage steps exceeded maximum"
# when it was run, so the fit may not have fully converged. Consider confirming
# the result with a different `algorithm` setting.
set.seed(123)
kmeans_model <- kmeans(scaled_data, centers = 2, nstart = 25)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 140000)
# Store the current cluster labels and redraw the PCA view of the clusters.
esportdf[["cluster"]] <- kmeans_model[["cluster"]]

pca_model <- prcomp(scaled_data)
pca_df <- data.frame(pca_model$x[, c("PC1", "PC2")])
pca_df$cluster <- factor(esportdf$cluster)

plot(
  x = pca_df$PC1, y = pca_df$PC2,
  col = pca_df$cluster,
  pch = 19,
  main = "K-Means Clusters",
  xlab = "PC 1",
  ylab = "PC 2"
)
# Legend colours follow the integer codes of the cluster factor.
legend(
  "topright",
  legend = levels(pca_df$cluster),
  col = seq_along(levels(pca_df$cluster)),
  pch = 19
)

# Per-cluster means of the unscaled statistics.
aggregate(num_data, by = list(cluster = esportdf$cluster), FUN = mean)
## cluster kills assists deaths accuracy_percent reaction_time_ms
## 1 1 14.69884 7.824093 9.924709 44.19983 221.2422
## 2 2 15.06199 8.250187 10.134429 46.07220 218.9161
## fatigue_index
## 1 0.7393361
## 2 0.2365049
# 75/25 train/test split, followed by standardisation of the predictors.
set.seed(123)

# `esportdf` also carries the k-means `cluster` label, which is an arbitrary
# integer id and not one of the intended match-statistic features. It is left
# out here so that neither model picks it up through `performance_score ~ .`.
model_df <- esportdf[, setdiff(names(esportdf), "cluster")]

n_rows <- nrow(model_df)
train_idx <- sample(seq_len(n_rows), floor(0.75 * n_rows))
train <- model_df[train_idx, ]
test <- model_df[-train_idx, ]

predictor_cols <- setdiff(names(train), "performance_score")

# Centre and scale using training-set statistics only, and apply those same
# statistics to the test set so no test information is used in the scaling.
x_train <- scale(train[, predictor_cols])
x_test <- scale(
  test[, predictor_cols],
  center = attr(x_train, "scaled:center"),
  scale = attr(x_train, "scaled:scale")
)

# The target is kept on its original scale.
train_scaled <- data.frame(x_train, performance_score = train$performance_score)
test_scaled <- data.frame(x_test, performance_score = test$performance_score)
# Fit a neural network with one hidden layer of 5 units on the standardised
# training data, with at most 500 optimisation iterations.
nn_model <- nnet(
  performance_score ~ .,
  data   = train_scaled,
  linout = TRUE,  # linear output unit, since the target is continuous
  size   = 5,
  maxit  = 500
)
## # weights: 46
## initial value 4479024.792373
## iter 10 value 27250.982260
## iter 20 value 14415.205409
## iter 30 value 9487.741795
## iter 40 value 7575.538370
## iter 50 value 2411.738143
## iter 60 value 192.311751
## iter 70 value 46.973770
## iter 80 value 13.392192
## iter 90 value 4.242143
## iter 100 value 3.172629
## iter 110 value 3.074976
## iter 120 value 1.925874
## iter 130 value 0.856114
## iter 140 value 0.746905
## iter 150 value 0.589038
## iter 160 value 0.442615
## iter 170 value 0.362017
## iter 180 value 0.314732
## iter 190 value 0.289579
## iter 200 value 0.287404
## iter 210 value 0.260179
## iter 220 value 0.188404
## iter 230 value 0.143843
## iter 240 value 0.112729
## iter 250 value 0.079883
## iter 260 value 0.061017
## iter 270 value 0.052925
## iter 280 value 0.051267
## iter 290 value 0.051145
## iter 300 value 0.051067
## iter 310 value 0.050447
## iter 320 value 0.048668
## iter 330 value 0.046075
## iter 340 value 0.043485
## iter 350 value 0.040892
## iter 360 value 0.038926
## iter 370 value 0.035912
## iter 380 value 0.034886
## iter 390 value 0.034874
## iter 400 value 0.034676
## iter 410 value 0.033447
## iter 420 value 0.032357
## iter 430 value 0.032269
## iter 440 value 0.031821
## iter 450 value 0.031439
## iter 460 value 0.031159
## iter 470 value 0.031059
## iter 480 value 0.031045
## iter 490 value 0.030988
## iter 500 value 0.030742
## final value 0.030742
## stopped after 500 iterations
# Score the held-out rows with the fitted network, then report RMSE, MAE
# and R-squared on the test set.
pred_nn <- predict(nn_model, newdata = test_scaled)

rmse_nn <- sqrt(mean((test_scaled$performance_score - pred_nn)^2))
print(rmse_nn)
## [1] 0.003988278
mae_nn <- mean(abs(test_scaled$performance_score - pred_nn))
print(mae_nn)
## [1] 0.003143445
# R-squared = 1 - (residual sum of squares / total sum of squares).
ss_res <- sum((pred_nn - test_scaled$performance_score)^2)
ss_tot <- sum(
  (test_scaled$performance_score - mean(test_scaled$performance_score))^2
)
r2_nn <- 1 - ss_res / ss_tot
print(r2_nn)
## [1] 0.9999998
# Fit a 100-tree random forest on the unscaled training data, then report
# RMSE, MAE and R-squared on the test set.
rf_model <- randomForest(performance_score ~ ., data = train, ntree = 100)
pred_rf <- predict(rf_model, newdata = test)

rmse_rf <- sqrt(mean((test$performance_score - pred_rf)^2))
print(rmse_rf)
## [1] 2.601682
mae_rf <- mean(abs(test$performance_score - pred_rf))
print(mae_rf)
## [1] 1.884136
# R-squared = 1 - (residual sum of squares / total sum of squares).
ss_res_rf <- sum((pred_rf - test$performance_score)^2)
ss_tot_rf <- sum((test$performance_score - mean(test$performance_score))^2)
r2_rf <- 1 - ss_res_rf / ss_tot_rf
print(r2_rf)
## [1] 0.9361206
# Scatter of predicted against actual scores with the y = x reference line;
# points on the red line are predicted exactly.
scatter_actual_predicted <- function(actual, predicted, title) {
  ggplot(
    data = data.frame(actual = actual, predicted = predicted),
    mapping = aes(x = actual, y = predicted)
  ) +
    geom_point(alpha = 0.6) +
    geom_abline(slope = 1, intercept = 0, color = "red") +
    labs(
      title = title,
      x = "Actual Performance Score",
      y = "Predicted Performance Score"
    ) +
    theme_minimal()
}

end1 <- scatter_actual_predicted(
  test_scaled$performance_score, pred_nn, "NN: Actual vs Predicted"
)
end2 <- scatter_actual_predicted(
  test$performance_score, pred_rf, "RF: Actual vs Predicted"
)

# First column of the forest's importance matrix, one row per predictor.
rf_importance <- importance(rf_model)
importance_df <- data.frame(
  Feature = rownames(rf_importance),
  Importance = rf_importance[, 1]
)

# Horizontal bar chart of the importances, ordered by value.
end3 <- ggplot(
  importance_df,
  aes(x = reorder(Feature, Importance), y = Importance)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Random Forest Feature Importance",
    x = "Feature",
    y = "Importance"
  ) +
  theme_minimal()

# patchwork layout: the two scatter plots side by side above the bar chart.
(end1 | end2) / end3
Having learned from last time which variables were skewed, this analysis and the modeling of performance_score were much more productive. The K-means clustering was more difficult to interpret, since not every variable has a direct relationship with performance score. The neural network is very accurate on the test set, with very low error (RMSE and MAE) and an R² close to 1. The random forest model had higher error and a lower R², but this could potentially be improved with hyperparameter tuning in the future.