Football clubs, analysts, and recruitment teams need evidence-based ways to compare players, estimate market value, and understand different technical profiles. This project uses demographic, physical, technical, mental, and goalkeeping attributes to study player value from complementary supervised and unsupervised perspectives.
The data come from Rehan Ahmed’s FIFA 24 Player Stats Dataset on Kaggle: https://www.kaggle.com/datasets/rehandl23/fifa-24-player-stats-dataset. The dataset represents the FIFA 24 edition and contains football players from clubs and countries around the world. It includes demographic and body information, technical and mental ability ratings, goalkeeper-specific ratings, and estimated player market value. The Kaggle data card credits FIFA Index as the original information source and lists the dataset under the Apache 2.0 license.
The project addresses three questions:
Regression findings are interpreted as association and predictive importance, not as causal effects. Classification is a predictive screening task, while clustering is an unsupervised exploratory task.
The following code block runs the full analysis pipeline. It reads
the raw CSV file, cleans the data, creates engineered variables,
performs EDA, trains regression and classification models, evaluates
long-tail and class-imbalance performance, performs clustering, and
writes supporting figures and tables into the local
rmd_output folder.
# ============================================================
# PART 0. Reproducibility, file paths, and output folders
# Purpose: fix the random seed, define the raw input file, and
# create a separate R Markdown output folder for tables and figures.
# ============================================================
options(encoding = "UTF-8")
set.seed(2026)
# Locate the R Markdown folder during knitting. This makes the
# submission portable: the file can run on another computer if
# player_stats.csv is submitted with the Rmd or placed one folder above it.
rmd_input <- tryCatch(knitr::current_input(dir = TRUE), error = function(e) NULL)
rmd_dir <- if (!is.null(rmd_input) && nzchar(rmd_input)) {
dirname(normalizePath(rmd_input, winslash = "/", mustWork = FALSE))
} else {
getwd()
}
candidate_input_files <- c(
file.path(rmd_dir, "player_stats.csv"),
file.path(dirname(rmd_dir), "player_stats.csv"),
"D:/UMhomework/7004/player_stats.csv"
)
input_file <- candidate_input_files[file.exists(candidate_input_files)][1]
if (is.na(input_file)) {
stop("Cannot find player_stats.csv. Place it in the same folder as this Rmd, one folder above it, or update input_file.")
}
output_dir <- file.path(rmd_dir, "rmd_output")
fig_dir <- file.path(output_dir, "figures")
table_dir <- file.path(output_dir, "tables")
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(fig_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(table_dir, recursive = TRUE, showWarnings = FALSE)
cat("Analysis started:", format(Sys.time()), "\n")
# ============================================================
# PART 0A. Package management
# Purpose: install missing packages if needed, then load all
# libraries required for cleaning, visualization, regression,
# and clustering.
# ============================================================
install_if_missing <- function(pkgs, optional = FALSE) {
installed <- rownames(installed.packages())
missing <- setdiff(pkgs, installed)
if (length(missing) > 0) {
message("Installing packages: ", paste(missing, collapse = ", "))
tryCatch(
install.packages(
missing,
repos = "https://cloud.r-project.org",
dependencies = TRUE,
Ncpus = max(1L, parallel::detectCores() - 1L)
),
error = function(e) {
cat("Package installation error:", conditionMessage(e), "\n")
}
)
}
still_missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(still_missing) > 0 && !optional) {
stop("Required packages are missing: ", paste(still_missing, collapse = ", "))
}
length(still_missing) == 0
}
core_pkgs <- c("ggplot2", "dplyr", "tidyr", "scales", "glmnet", "randomForest", "cluster")
required_packages_ok <- install_if_missing(core_pkgs, optional = FALSE)
gbm_available <- install_if_missing("gbm", optional = TRUE)
invisible(lapply(core_pkgs, function(pkg) {
suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}))
if (gbm_available) {
suppressWarnings(suppressPackageStartupMessages(library(gbm)))
}
cat("Packages loaded.\n\n")
# ============================================================
# PART 0B. Helper functions
# Purpose: define reusable functions for robust CSV reading,
# market value parsing, evaluation metrics, plot saving, and
# simple markdown table formatting.
# ============================================================
read_csv_robust <- function(path) {
raw <- readBin(path, what = "raw", n = file.info(path)$size)
txt <- rawToChar(raw)
txt <- iconv(txt, from = "UTF-8", to = "UTF-8", sub = "?")
read.csv(
text = txt,
stringsAsFactors = FALSE,
na.strings = c("", "NA", "N/A", "None", "NULL", "null"),
check.names = FALSE
)
}
parse_market_value <- function(x) {
s <- trimws(as.character(x))
s <- gsub("\\$", "", s)
s <- gsub(",", "", s)
# In this data, values such as 975.00 represent 975,000, while
# values such as 1.400.000 already use dots as thousand separators.
out <- rep(NA_real_, length(s))
thousand_decimal <- grepl("^\\d{1,3}\\.00$", s)
out[thousand_decimal] <- as.numeric(gsub("\\.00$", "", s[thousand_decimal])) * 1000
out[!thousand_decimal] <- suppressWarnings(as.numeric(gsub("\\.", "", s[!thousand_decimal])))
out
}
rmse <- function(actual, predicted) {
sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}
mae <- function(actual, predicted) {
mean(abs(actual - predicted), na.rm = TRUE)
}
median_ae <- function(actual, predicted) {
median(abs(actual - predicted), na.rm = TRUE)
}
r2_standard <- function(actual, predicted) {
1 - sum((actual - predicted)^2, na.rm = TRUE) /
sum((actual - mean(actual, na.rm = TRUE))^2, na.rm = TRUE)
}
metric_row <- function(
model_name, segment, actual_log, predicted_log,
actual_value, smearing_factor = 1) {
predicted_value <- exp(predicted_log) * smearing_factor
data.frame(
model = model_name,
segment = segment,
n = length(actual_log),
RMSE_log = rmse(actual_log, predicted_log),
MAE_log = mae(actual_log, predicted_log),
R2_log = r2_standard(actual_log, predicted_log),
RMSE_value = rmse(actual_value, predicted_value),
MAE_value = mae(actual_value, predicted_value),
MedAE_value = median_ae(actual_value, predicted_value),
stringsAsFactors = FALSE
)
}
roc_auc <- function(actual, probability) {
actual <- as.integer(actual)
n_pos <- sum(actual == 1)
n_neg <- sum(actual == 0)
if (n_pos == 0 || n_neg == 0) return(NA_real_)
ranks <- rank(probability, ties.method = "average")
(sum(ranks[actual == 1]) - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
}
average_precision <- function(actual, probability) {
actual <- as.integer(actual)
if (sum(actual == 1) == 0) return(NA_real_)
ord <- order(probability, decreasing = TRUE)
y <- actual[ord]
precision_at_rank <- cumsum(y) / seq_along(y)
sum(precision_at_rank * y) / sum(y)
}
classification_metric_row <- function(model_name, actual, probability, threshold = 0.5) {
actual <- as.integer(actual)
predicted <- as.integer(probability >= threshold)
tp <- sum(actual == 1 & predicted == 1)
tn <- sum(actual == 0 & predicted == 0)
fp <- sum(actual == 0 & predicted == 1)
fn <- sum(actual == 1 & predicted == 0)
sensitivity <- ifelse(tp + fn == 0, NA_real_, tp / (tp + fn))
specificity <- ifelse(tn + fp == 0, NA_real_, tn / (tn + fp))
precision <- ifelse(tp + fp == 0, 0, tp / (tp + fp))
f1 <- ifelse(precision + sensitivity == 0, 0, 2 * precision * sensitivity / (precision + sensitivity))
data.frame(
model = model_name,
threshold = threshold,
ROC_AUC = roc_auc(actual, probability),
PR_AUC = average_precision(actual, probability),
accuracy = mean(actual == predicted),
balanced_accuracy = mean(c(sensitivity, specificity), na.rm = TRUE),
precision = precision,
recall = sensitivity,
specificity = specificity,
F1 = f1,
TP = tp,
FP = fp,
TN = tn,
FN = fn,
stringsAsFactors = FALSE
)
}
roc_curve_points <- function(actual, probability, model_name) {
thresholds <- sort(unique(c(Inf, probability, -Inf)), decreasing = TRUE)
rows <- lapply(thresholds, function(threshold) {
predicted <- probability >= threshold
tp <- sum(actual == 1 & predicted)
fp <- sum(actual == 0 & predicted)
fn <- sum(actual == 1 & !predicted)
tn <- sum(actual == 0 & !predicted)
data.frame(
model = model_name,
FPR = ifelse(fp + tn == 0, 0, fp / (fp + tn)),
TPR = ifelse(tp + fn == 0, 0, tp / (tp + fn))
)
})
do.call(rbind, rows)
}
adjusted_rand_index <- function(cluster_a, cluster_b) {
tab <- table(cluster_a, cluster_b)
choose2 <- function(x) x * (x - 1) / 2
sum_cells <- sum(choose2(tab))
sum_rows <- sum(choose2(rowSums(tab)))
sum_cols <- sum(choose2(colSums(tab)))
total_pairs <- choose2(sum(tab))
expected <- sum_rows * sum_cols / total_pairs
maximum <- 0.5 * (sum_rows + sum_cols)
if (maximum == expected) return(1)
(sum_cells - expected) / (maximum - expected)
}
save_plot <- function(filename, plot, width = 9, height = 6) {
path <- file.path(fig_dir, filename)
ggplot2::ggsave(path, plot = plot, width = width, height = height, dpi = 160)
invisible(path)
}
md_table <- function(df, digits = 3, max_rows = Inf) {
if (nrow(df) == 0) {
return("_No rows._")
}
if (is.finite(max_rows)) {
df <- df[seq_len(min(nrow(df), max_rows)), , drop = FALSE]
}
fmt <- df
for (col in names(fmt)) {
if (is.numeric(fmt[[col]])) {
fmt[[col]] <- round(fmt[[col]], digits)
}
}
fmt <- as.data.frame(lapply(fmt, as.character), stringsAsFactors = FALSE)
header <- paste0("| ", paste(names(fmt), collapse = " | "), " |")
sep <- paste0("| ", paste(rep("---", ncol(fmt)), collapse = " | "), " |")
rows <- apply(fmt, 1, function(row) paste0("| ", paste(row, collapse = " | "), " |"))
paste(c(header, sep, rows), collapse = "\n")
}
# ============================================================
# PART 1. Data import and cleaning
# Purpose: read the raw CSV, trim string fields, remove exact
# duplicate rows, drop the fully missing marking column, convert
# numeric columns, and create value_num/log_value.
# ============================================================
df_raw <- read_csv_robust(input_file)
raw_rows <- nrow(df_raw)
raw_cols <- ncol(df_raw)
raw_dimension_score <- raw_rows * raw_cols
cat("Raw data dimensions:", raw_rows, "rows x", raw_cols, "columns\n")
dataset_overview <- data.frame(
item = c(
"Dataset title", "Edition/year", "Source and author",
"Raw rows", "Raw columns", "Dimension score (rows x columns)",
"Primary unit of analysis", "Target variable", "License"
),
value = c(
"FIFA 24 Player Stats Dataset",
"FIFA 24 (2023/24 edition)",
"Rehan Ahmed, Kaggle; data card credits FIFA Index",
format(raw_rows, big.mark = ","),
raw_cols,
format(raw_dimension_score, big.mark = ","),
"One football player per row",
"Estimated player market value",
"Apache 2.0"
),
stringsAsFactors = FALSE
)
data_structure_summary <- data.frame(
section = c(
"Identifiers", "Demographic and body", "Technical and mental",
"Goalkeeping", "Outcome"
),
variables = c(
"player, country, club",
"age, height, weight",
"ball control, dribbling, passing, shooting, defending, physical and mental ratings",
"GK positioning, diving, handling, kicking and reflexes",
"value"
),
purpose = c(
"Identify the player and contextual affiliation",
"Describe age and physical profile",
"Measure outfield playing abilities on FIFA rating scales",
"Measure goalkeeper-specific abilities",
"Estimated market value used for regression and high-value classification"
),
stringsAsFactors = FALSE
)
raw_missing_summary <- data.frame(
variable = names(df_raw),
missing_n = vapply(df_raw, function(x) sum(is.na(x)), integer(1)),
missing_percent = vapply(df_raw, function(x) mean(is.na(x)) * 100, numeric(1)),
stringsAsFactors = FALSE
) %>%
arrange(desc(missing_n))
write.csv(dataset_overview, file.path(table_dir, "dataset_overview.csv"), row.names = FALSE)
write.csv(data_structure_summary, file.path(table_dir, "data_structure_summary.csv"), row.names = FALSE)
write.csv(raw_missing_summary, file.path(table_dir, "raw_missing_summary.csv"), row.names = FALSE)
df <- df_raw
char_cols <- names(df)[vapply(df, is.character, logical(1))]
df[char_cols] <- lapply(df[char_cols], trimws)
duplicate_rows <- sum(duplicated(df))
df <- df[!duplicated(df), , drop = FALSE]
rows_after_duplicate_removal <- nrow(df)
marking_missing <- if ("marking" %in% names(df)) sum(is.na(df$marking)) else NA_integer_
if ("marking" %in% names(df)) {
df$marking <- NULL
}
id_cols <- c("player", "country", "club", "value")
numeric_candidates <- setdiff(names(df), id_cols)
for (col in numeric_candidates) {
df[[col]] <- suppressWarnings(as.numeric(df[[col]]))
}
df$value_num <- parse_market_value(df$value)
value_na_rows <- sum(is.na(df$value_num) | df$value_num <= 0)
df <- df[!is.na(df$value_num) & df$value_num > 0, , drop = FALSE]
df$row_id <- seq_len(nrow(df))
df$log_value <- log(df$value_num)
# ============================================================
# PART 1B. Feature engineering
# Purpose: build role indicators and ability-group averages
# for goalkeeper, attacking, passing, defending, physical, and
# mental profiles.
# ============================================================
gk_cols <- c("gk_positioning", "gk_diving", "gk_handling", "gk_kicking", "gk_reflexes")
defending_cols <- c("slide_tackle", "stand_tackle", "interceptions", "aggression", "heading")
passing_cols <- c("short_pass", "long_pass", "vision", "crossing", "curve", "fk_acc")
attacking_cols <- c("att_position", "finishing", "shot_power", "long_shots", "volleys", "ball_control", "dribbling")
physical_cols <- c("acceleration", "sprint_speed", "agility", "balance", "jumping", "stamina", "strength")
mental_cols <- c("reactions", "composure", "vision", "aggression")
gk_cols <- intersect(gk_cols, names(df))
defending_cols <- intersect(defending_cols, names(df))
passing_cols <- intersect(passing_cols, names(df))
attacking_cols <- intersect(attacking_cols, names(df))
physical_cols <- intersect(physical_cols, names(df))
mental_cols <- intersect(mental_cols, names(df))
df$gk_score <- rowMeans(df[, gk_cols, drop = FALSE], na.rm = TRUE)
df$is_gk <- ifelse(df$gk_score >= 45, 1, 0)
df$attacking_mean <- rowMeans(df[, attacking_cols, drop = FALSE], na.rm = TRUE)
df$passing_mean <- rowMeans(df[, passing_cols, drop = FALSE], na.rm = TRUE)
df$defending_mean <- rowMeans(df[, defending_cols, drop = FALSE], na.rm = TRUE)
df$physical_mean <- rowMeans(df[, physical_cols, drop = FALSE], na.rm = TRUE)
df$mental_mean <- rowMeans(df[, mental_cols, drop = FALSE], na.rm = TRUE)
ability_cols <- c(
"ball_control", "dribbling", "slide_tackle", "stand_tackle", "aggression",
"reactions", "att_position", "interceptions", "vision", "composure",
"crossing", "short_pass", "long_pass", "acceleration", "stamina",
"strength", "balance", "sprint_speed", "agility", "jumping", "heading",
"shot_power", "finishing", "long_shots", "curve", "fk_acc", "penalties",
"volleys", gk_cols
)
ability_cols <- unique(intersect(ability_cols, names(df)))
# ============================================================
# PART 1C. Cleaning audit and cleaned data export
# Purpose: save the cleaned dataset and a transparent cleaning
# summary so the data preparation step is reproducible.
# ============================================================
cleaned_path <- file.path(output_dir, "cleaned_player_stats.csv")
write.csv(df, cleaned_path, row.names = FALSE, fileEncoding = "UTF-8")
cleaning_summary <- data.frame(
item = c(
"raw_rows", "raw_columns", "rows_after_exact_duplicate_removal",
"exact_duplicates_removed", "marking_missing_count_before_removal",
"invalid_or_empty_value_rows_removed", "final_rows", "final_columns",
"goalkeeper_count", "outfield_count"
),
value = c(
raw_rows, raw_cols, rows_after_duplicate_removal, duplicate_rows, marking_missing,
value_na_rows, nrow(df), ncol(df), sum(df$is_gk == 1), sum(df$is_gk == 0)
)
)
write.csv(cleaning_summary, file.path(table_dir, "cleaning_summary.csv"), row.names = FALSE)
cat("Cleaned data dimensions:", nrow(df), "rows x", ncol(df), "columns\n")
cat("Goalkeepers:", sum(df$is_gk == 1), "Outfield:", sum(df$is_gk == 0), "\n\n")
# ============================================================
# PART 2. Exploratory Data Analysis: summary tables
# Purpose: summarize numeric variables and compare raw market
# value with log-transformed market value.
# ============================================================
summary_vars <- c("age", "height", "weight", "value_num", "log_value", ability_cols)
summary_vars <- intersect(summary_vars, names(df))
numeric_summary <- data.frame(
variable = summary_vars,
mean = vapply(df[, summary_vars, drop = FALSE], mean, numeric(1), na.rm = TRUE),
sd = vapply(df[, summary_vars, drop = FALSE], sd, numeric(1), na.rm = TRUE),
min = vapply(df[, summary_vars, drop = FALSE], min, numeric(1), na.rm = TRUE),
q25 = vapply(df[, summary_vars, drop = FALSE], quantile, numeric(1), probs = 0.25, na.rm = TRUE),
median = vapply(df[, summary_vars, drop = FALSE], median, numeric(1), na.rm = TRUE),
q75 = vapply(df[, summary_vars, drop = FALSE], quantile, numeric(1), probs = 0.75, na.rm = TRUE),
max = vapply(df[, summary_vars, drop = FALSE], max, numeric(1), na.rm = TRUE),
stringsAsFactors = FALSE
)
write.csv(numeric_summary, file.path(table_dir, "numeric_summary.csv"), row.names = FALSE)
value_summary <- data.frame(
metric = c("mean", "sd", "min", "q01", "q05", "q25", "median", "q75", "q95", "q99", "max", "skew_approx"),
value_num = c(
mean(df$value_num), sd(df$value_num), min(df$value_num),
quantile(df$value_num, 0.01), quantile(df$value_num, 0.05),
quantile(df$value_num, 0.25), median(df$value_num),
quantile(df$value_num, 0.75), quantile(df$value_num, 0.95),
quantile(df$value_num, 0.99), max(df$value_num),
mean((df$value_num - mean(df$value_num))^3) / sd(df$value_num)^3
),
log_value = c(
mean(df$log_value), sd(df$log_value), min(df$log_value),
quantile(df$log_value, 0.01), quantile(df$log_value, 0.05),
quantile(df$log_value, 0.25), median(df$log_value),
quantile(df$log_value, 0.75), quantile(df$log_value, 0.95),
quantile(df$log_value, 0.99), max(df$log_value),
mean((df$log_value - mean(df$log_value))^3) / sd(df$log_value)^3
)
)
write.csv(value_summary, file.path(table_dir, "value_summary.csv"), row.names = FALSE)
# ============================================================
# PART 2B. Exploratory Data Analysis: visual distributions
# Purpose: visualize the long-tailed value distribution, the
# stabilized log_value distribution, body variables, and role
# differences between goalkeepers and outfield players.
# ============================================================
p_value_raw <- ggplot(df, aes(x = value_num)) +
geom_histogram(bins = 50, fill = "#2B6CB0", color = "white") +
scale_x_continuous(labels = scales::dollar_format(prefix = "$")) +
labs(
title = "Raw Market Value Distribution",
subtitle = "The long right tail makes direct linear regression unstable.",
x = "Market value",
y = "Number of players"
) +
theme_minimal()
save_plot("eda_value_raw_hist.png", p_value_raw, width = 9, height = 5.5)
p_log_value <- ggplot(df, aes(x = log_value)) +
geom_histogram(bins = 50, fill = "#38A169", color = "white") +
labs(
title = "Log Market Value Distribution",
subtitle = "The log transformation reduces the extreme right skew.",
x = "log(value)",
y = "Number of players"
) +
theme_minimal()
save_plot("eda_log_value_hist.png", p_log_value, width = 9, height = 5.5)
body_long <- df %>%
dplyr::select(age, height, weight) %>%
tidyr::pivot_longer(cols = dplyr::everything(), names_to = "variable", values_to = "value")
p_body <- ggplot(body_long, aes(x = value)) +
geom_histogram(bins = 35, fill = "#805AD5", color = "white") +
facet_wrap(~ variable, scales = "free", nrow = 1) +
labs(title = "Age, Height, and Weight Distributions", x = NULL, y = "Number of players") +
theme_minimal()
save_plot("eda_age_height_weight.png", p_body, width = 10, height = 4.5)
group_long <- df %>%
mutate(position_group = ifelse(is_gk == 1, "Goalkeeper", "Outfield")) %>%
dplyr::select(
position_group, attacking_mean, passing_mean, defending_mean,
physical_mean, mental_mean, gk_score
) %>%
group_by(position_group) %>%
summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)), .groups = "drop") %>%
tidyr::pivot_longer(-position_group, names_to = "ability_group", values_to = "mean_score")
p_gk_compare <- ggplot(group_long, aes(x = ability_group, y = mean_score, fill = position_group)) +
geom_col(position = "dodge") +
labs(
title = "Goalkeepers vs Outfield Players",
subtitle = "Goalkeeper variables strongly separate the two role groups.",
x = NULL,
y = "Mean score",
fill = NULL
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 35, hjust = 1))
save_plot("eda_gk_vs_outfield.png", p_gk_compare, width = 9.5, height = 5.5)
# ============================================================
# PART 2C. Exploratory Data Analysis: correlations
# Purpose: identify variables most strongly associated with
# log_value and visualize correlation structure before modelling.
# ============================================================
corr_features <- c("age", "height", "weight", ability_cols)
corr_features <- intersect(corr_features, names(df))
cor_data <- df[, c(corr_features, "log_value"), drop = FALSE]
cor_mat <- cor(cor_data, use = "pairwise.complete.obs")
cor_with_value <- sort(cor_mat[setdiff(rownames(cor_mat), "log_value"), "log_value"], decreasing = TRUE)
cor_table <- data.frame(
variable = names(cor_with_value),
correlation_with_log_value = as.numeric(cor_with_value),
stringsAsFactors = FALSE
)
write.csv(cor_table, file.path(table_dir, "correlation_with_log_value.csv"), row.names = FALSE)
corr_long <- as.data.frame(as.table(cor_mat))
names(corr_long) <- c("var1", "var2", "correlation")
p_corr <- ggplot(corr_long, aes(x = var1, y = var2, fill = correlation)) +
geom_tile() +
scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0, limits = c(-1, 1)) +
labs(title = "Correlation Heatmap", x = NULL, y = NULL, fill = "r") +
theme_minimal(base_size = 8) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
panel.grid = element_blank()
)
save_plot("eda_correlation_heatmap.png", p_corr, width = 12, height = 10)
top_cor_plot <- cor_table %>%
slice_head(n = 15) %>%
mutate(variable = factor(variable, levels = rev(variable)))
p_top_cor <- ggplot(top_cor_plot, aes(x = variable, y = correlation_with_log_value)) +
geom_col(fill = "#DD6B20") +
coord_flip() +
labs(
title = "Top Positive Correlations with log(value)",
x = NULL,
y = "Correlation"
) +
theme_minimal()
save_plot("eda_top_correlations.png", p_top_cor, width = 8.5, height = 6)
top_scatter_vars <- intersect(c("reactions", "composure", "short_pass", "shot_power", "vision"), names(df))
scatter_long <- df %>%
dplyr::select(log_value, all_of(top_scatter_vars)) %>%
tidyr::pivot_longer(-log_value, names_to = "variable", values_to = "score")
p_scatter <- ggplot(scatter_long, aes(x = score, y = log_value)) +
geom_point(alpha = 0.25, size = 0.8, color = "#2C5282") +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "#C53030", linewidth = 0.7) +
facet_wrap(~ variable, scales = "free_x") +
labs(
title = "Key Ability Variables vs log(value)",
x = "Ability score",
y = "log(value)"
) +
theme_minimal()
save_plot("eda_key_scatterplots.png", p_scatter, width = 10, height = 6)
# ============================================================
# PART 3. Data splitting and regression design
# Purpose: use log_value as the target, build predictors, and
# create an 80/20 stratified split by market-value bin and role.
# Scaling is fitted only on the training set to avoid leakage.
# ============================================================
raw_predictors <- c("age", "height", "weight", ability_cols, "is_gk")
raw_predictors <- unique(intersect(raw_predictors, names(df)))
model_df <- df %>%
dplyr::select(player, country, club, value_num, log_value, all_of(raw_predictors)) %>%
filter(complete.cases(.))
breaks <- unique(quantile(model_df$value_num, probs = seq(0, 1, length.out = 6), na.rm = TRUE))
model_df$value_bin <- cut(model_df$value_num, breaks = breaks, include.lowest = TRUE, labels = FALSE)
strata <- interaction(model_df$value_bin, model_df$is_gk, drop = TRUE)
test_idx <- integer(0)
for (level in unique(strata)) {
idx <- which(strata == level)
n_test <- max(1L, round(0.20 * length(idx)))
if (n_test >= length(idx)) {
n_test <- max(1L, length(idx) - 1L)
}
test_idx <- c(test_idx, sample(idx, n_test))
}
test_idx <- sort(unique(test_idx))
train <- model_df[-test_idx, , drop = FALSE]
test <- model_df[test_idx, , drop = FALSE]
train_mean_log <- mean(train$log_value)
top10_threshold <- as.numeric(quantile(train$value_num, 0.90, na.rm = TRUE))
top_test_idx <- test$value_num >= top10_threshold
scaler <- function(train_df, test_df, cols) {
mu <- vapply(train_df[, cols, drop = FALSE], mean, numeric(1), na.rm = TRUE)
sigma <- vapply(train_df[, cols, drop = FALSE], sd, numeric(1), na.rm = TRUE)
sigma[sigma == 0 | is.na(sigma)] <- 1
x_train <- sweep(train_df[, cols, drop = FALSE], 2, mu, "-")
x_train <- sweep(x_train, 2, sigma, "/")
x_test <- sweep(test_df[, cols, drop = FALSE], 2, mu, "-")
x_test <- sweep(x_test, 2, sigma, "/")
list(
x_train = as.data.frame(x_train),
x_test = as.data.frame(x_test),
center = mu,
scale = sigma
)
}
scaled <- scaler(train, test, raw_predictors)
x_train <- scaled$x_train
x_test <- scaled$x_test
y_train <- train$log_value
y_test <- test$log_value
train_std <- data.frame(log_value = y_train, x_train, check.names = FALSE)
test_std <- data.frame(x_test, check.names = FALSE)
# ============================================================
# PART 4. Baseline and regression modelling
# Purpose: compare a training-mean baseline with OLS, weighted
# OLS, Ridge, Lasso, Elastic Net, Random Forest, and optionally
# Gradient Boosting if the gbm package is available.
# ============================================================
predictions <- list()
train_predictions <- list()
predictions[["Baseline mean"]] <- rep(train_mean_log, nrow(test))
train_predictions[["Baseline mean"]] <- rep(train_mean_log, nrow(train))
ols_model <- lm(log_value ~ ., data = train_std)
predictions[["OLS"]] <- as.numeric(predict(ols_model, newdata = test_std))
train_predictions[["OLS"]] <- as.numeric(predict(ols_model, newdata = train_std))
rank_pct <- rank(train$value_num, ties.method = "average") / nrow(train)
weighted_regression_weights <- 0.5 + 1.5 * rank_pct
weighted_regression_weights <- weighted_regression_weights / mean(weighted_regression_weights)
weighted_ols_model <- lm(log_value ~ ., data = train_std, weights = weighted_regression_weights)
predictions[["Weighted OLS"]] <- as.numeric(predict(weighted_ols_model, newdata = test_std))
train_predictions[["Weighted OLS"]] <- as.numeric(predict(weighted_ols_model, newdata = train_std))
x_train_mat <- as.matrix(x_train)
x_test_mat <- as.matrix(x_test)
ridge_cv <- glmnet::cv.glmnet(x_train_mat, y_train, alpha = 0, nfolds = 5, standardize = FALSE)
lasso_cv <- glmnet::cv.glmnet(x_train_mat, y_train, alpha = 1, nfolds = 5, standardize = FALSE)
elastic_cv <- glmnet::cv.glmnet(x_train_mat, y_train, alpha = 0.5, nfolds = 5, standardize = FALSE)
predictions[["Ridge"]] <- as.numeric(predict(ridge_cv, newx = x_test_mat, s = "lambda.min"))
predictions[["Lasso"]] <- as.numeric(predict(lasso_cv, newx = x_test_mat, s = "lambda.min"))
predictions[["Elastic Net"]] <- as.numeric(predict(elastic_cv, newx = x_test_mat, s = "lambda.min"))
train_predictions[["Ridge"]] <- as.numeric(predict(ridge_cv, newx = x_train_mat, s = "lambda.min"))
train_predictions[["Lasso"]] <- as.numeric(predict(lasso_cv, newx = x_train_mat, s = "lambda.min"))
train_predictions[["Elastic Net"]] <- as.numeric(predict(elastic_cv, newx = x_train_mat, s = "lambda.min"))
rf_model <- randomForest::randomForest(
x = x_train,
y = y_train,
ntree = 400,
importance = TRUE
)
predictions[["Random Forest"]] <- as.numeric(predict(rf_model, newdata = x_test))
train_predictions[["Random Forest"]] <- as.numeric(predict(rf_model, newdata = x_train))
if (gbm_available) {
gbm_model <- gbm::gbm(
log_value ~ .,
data = train_std,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.03,
bag.fraction = 0.70,
train.fraction = 1.0,
cv.folds = 5,
verbose = FALSE
)
best_gbm_trees <- gbm::gbm.perf(gbm_model, method = "cv", plot.it = FALSE)
predictions[["Gradient Boosting"]] <- as.numeric(predict(gbm_model, newdata = test_std, n.trees = best_gbm_trees))
train_predictions[["Gradient Boosting"]] <- as.numeric(predict(gbm_model, newdata = train_std, n.trees = best_gbm_trees))
} else {
best_gbm_trees <- NA
}
# Duan's smearing correction reduces retransformation bias when
# converting predictions from log(value) back to the original scale.
smearing_factors <- vapply(names(predictions), function(model_name) {
mean(exp(y_train - train_predictions[[model_name]]), na.rm = TRUE)
}, numeric(1))
# ============================================================
# PART 5. Final evaluation and long-tail handling
# Purpose: evaluate all models on the held-out test set and on
# the high-value top 10% segment. The weighted OLS model gives
# larger training weights to higher-value players as an advanced
# long-tail handling strategy.
# ============================================================
metrics <- do.call(
rbind,
lapply(names(predictions), function(model_name) {
pred <- predictions[[model_name]]
overall <- metric_row(
model_name, "overall_test", y_test, pred,
test$value_num, smearing_factors[[model_name]]
)
top10 <- metric_row(
model_name,
"top10_market_value_test",
y_test[top_test_idx],
pred[top_test_idx],
test$value_num[top_test_idx],
smearing_factors[[model_name]]
)
rbind(overall, top10)
})
)
metrics <- metrics %>% arrange(segment, RMSE_log)
write.csv(metrics, file.path(table_dir, "model_metrics.csv"), row.names = FALSE)
p_metrics <- metrics %>%
filter(segment == "overall_test") %>%
mutate(model = factor(model, levels = model[order(RMSE_log)])) %>%
ggplot(aes(x = model, y = RMSE_log)) +
geom_col(fill = "#2F855A") +
coord_flip() +
labs(title = "Regression Model Comparison", x = NULL, y = "Test RMSE on log(value)") +
theme_minimal()
save_plot("regression_model_comparison.png", p_metrics, width = 8.5, height = 5.5)
best_regression_name <- metrics %>%
filter(segment == "overall_test") %>%
arrange(RMSE_log) %>%
slice_head(n = 1) %>%
pull(model)
best_regression_pred <- predictions[[best_regression_name]]
best_regression_smear <- smearing_factors[[best_regression_name]]
regression_diagnostic_df <- data.frame(
actual_log = y_test,
predicted_log = best_regression_pred,
residual_log = y_test - best_regression_pred,
actual_value = test$value_num,
predicted_value = exp(best_regression_pred) * best_regression_smear
)
p_actual_pred <- ggplot(regression_diagnostic_df, aes(x = actual_log, y = predicted_log)) +
geom_point(alpha = 0.35, color = "#2B6CB0") +
geom_abline(slope = 1, intercept = 0, color = "#C53030", linewidth = 0.8) +
labs(
title = paste("Actual vs Predicted log(value):", best_regression_name),
x = "Actual log(value)",
y = "Predicted log(value)"
) +
theme_minimal()
save_plot("regression_actual_vs_predicted.png", p_actual_pred, width = 7.5, height = 5.5)
p_residual <- ggplot(regression_diagnostic_df, aes(x = predicted_log, y = residual_log)) +
geom_point(alpha = 0.35, color = "#805AD5") +
geom_hline(yintercept = 0, color = "#C53030", linewidth = 0.8) +
labs(
title = paste("Residual Diagnostic:", best_regression_name),
x = "Predicted log(value)",
y = "Residual"
) +
theme_minimal()
save_plot("regression_residual_diagnostic.png", p_residual, width = 7.5, height = 5.5)
# ============================================================
# PART 6. Regression interpretation and predictive importance
# Purpose: report standardized linear coefficients, regularized
# model coefficients, multicollinearity diagnostics, Random Forest
# permutation importance, and grouped importance by ability family.
# These results are associations/predictive importance, not causality.
# ============================================================
ols_coef <- data.frame(
feature = names(coef(ols_model))[-1],
standardized_coef = as.numeric(coef(ols_model)[-1]),
stringsAsFactors = FALSE
) %>%
mutate(abs_coef = abs(standardized_coef)) %>%
arrange(desc(abs_coef))
write.csv(ols_coef, file.path(table_dir, "ols_standardized_coefficients.csv"), row.names = FALSE)
lasso_coef_mat <- as.matrix(coef(lasso_cv, s = "lambda.min"))
lasso_coef <- data.frame(
feature = rownames(lasso_coef_mat),
coefficient = as.numeric(lasso_coef_mat[, 1]),
stringsAsFactors = FALSE
) %>%
filter(feature != "(Intercept)") %>%
mutate(abs_coef = abs(coefficient), selected = coefficient != 0) %>%
arrange(desc(abs_coef))
write.csv(lasso_coef, file.path(table_dir, "lasso_coefficients.csv"), row.names = FALSE)
calc_vif <- function(x_df) {
cols <- names(x_df)
out <- lapply(cols, function(col) {
other_cols <- setdiff(cols, col)
tmp <- data.frame(y = x_df[[col]], x_df[, other_cols, drop = FALSE], check.names = FALSE)
fit <- tryCatch(lm(y ~ ., data = tmp), error = function(e) NULL)
if (is.null(fit)) {
vif_value <- NA_real_
} else {
r2 <- summary(fit)$r.squared
vif_value <- ifelse(r2 >= 0.999999, Inf, 1 / (1 - r2))
}
data.frame(feature = col, VIF = vif_value, stringsAsFactors = FALSE)
})
do.call(rbind, out) %>% arrange(desc(VIF))
}
vif_table <- calc_vif(x_train)
write.csv(vif_table, file.path(table_dir, "vif_table.csv"), row.names = FALSE)
permutation_importance <- function(model, x, y, repeats = 3) {
base_pred <- as.numeric(predict(model, newdata = x))
base_rmse <- rmse(y, base_pred)
rows <- lapply(names(x), function(feature) {
deltas <- numeric(repeats)
for (i in seq_len(repeats)) {
x_perm <- x
x_perm[[feature]] <- sample(x_perm[[feature]])
pred <- as.numeric(predict(model, newdata = x_perm))
deltas[i] <- rmse(y, pred) - base_rmse
}
data.frame(
feature = feature,
permutation_importance_rmse_increase = mean(deltas),
stringsAsFactors = FALSE
)
})
do.call(rbind, rows) %>% arrange(desc(permutation_importance_rmse_increase))
}
rf_perm_importance <- permutation_importance(rf_model, x_test, y_test, repeats = 3)
write.csv(rf_perm_importance, file.path(table_dir, "rf_permutation_importance.csv"), row.names = FALSE)
feature_group_map <- data.frame(
feature = raw_predictors,
group = dplyr::case_when(
raw_predictors %in% c("age") ~ "age",
raw_predictors %in% c("height", "weight") ~ "body_size",
raw_predictors %in% gk_cols | raw_predictors == "is_gk" ~ "goalkeeping",
raw_predictors %in% attacking_cols ~ "attacking",
raw_predictors %in% passing_cols ~ "passing",
raw_predictors %in% defending_cols ~ "defending",
raw_predictors %in% physical_cols ~ "physical",
raw_predictors %in% mental_cols ~ "mental",
TRUE ~ "other"
),
stringsAsFactors = FALSE
)
ols_group_importance <- ols_coef %>%
left_join(feature_group_map, by = "feature") %>%
group_by(group) %>%
summarise(total_abs_standardized_coef = sum(abs_coef, na.rm = TRUE), .groups = "drop") %>%
mutate(percent = total_abs_standardized_coef / sum(total_abs_standardized_coef)) %>%
arrange(desc(total_abs_standardized_coef))
write.csv(ols_group_importance, file.path(table_dir, "ols_group_importance.csv"), row.names = FALSE)
rf_group_importance <- rf_perm_importance %>%
left_join(feature_group_map, by = "feature") %>%
group_by(group) %>%
summarise(total_rmse_increase = sum(pmax(permutation_importance_rmse_increase, 0), na.rm = TRUE), .groups = "drop") %>%
mutate(percent = total_rmse_increase / sum(total_rmse_increase)) %>%
arrange(desc(total_rmse_increase))
write.csv(rf_group_importance, file.path(table_dir, "rf_group_importance.csv"), row.names = FALSE)
p_ols_importance <- ols_coef %>%
slice_head(n = 15) %>%
mutate(feature = factor(feature, levels = rev(feature))) %>%
ggplot(aes(x = feature, y = abs_coef)) +
geom_col(fill = "#744210") +
coord_flip() +
labs(title = "OLS Variable Importance", subtitle = "Absolute standardized coefficients", x = NULL, y = "|standardized coefficient|") +
theme_minimal()
save_plot("regression_ols_importance.png", p_ols_importance, width = 8.5, height = 6)
p_rf_importance <- rf_perm_importance %>%
slice_head(n = 15) %>%
mutate(feature = factor(feature, levels = rev(feature))) %>%
ggplot(aes(x = feature, y = permutation_importance_rmse_increase)) +
geom_col(fill = "#2B6CB0") +
coord_flip() +
labs(title = "Random Forest Permutation Importance", x = NULL, y = "RMSE increase after permutation") +
theme_minimal()
save_plot("regression_rf_permutation_importance.png", p_rf_importance, width = 8.5, height = 6)
p_group_importance <- ols_group_importance %>%
mutate(group = factor(group, levels = rev(group))) %>%
ggplot(aes(x = group, y = percent)) +
geom_col(fill = "#9F7AEA") +
coord_flip() +
scale_y_continuous(labels = scales::percent_format()) +
labs(title = "Grouped OLS Importance", x = NULL, y = "Share of total absolute standardized coefficients") +
theme_minimal()
save_plot("regression_group_importance.png", p_group_importance, width = 8.5, height = 5.5)
# ============================================================
# PART 7. High-value player classification
# Purpose: add a distinct classification problem aligned with the
# assignment guidance. The positive class is defined using only the
# training-set top 10% market-value threshold. Class weighting is
# applied only during training to address class imbalance.
# ============================================================
class_train <- as.integer(train$value_num >= top10_threshold)
class_test <- as.integer(test$value_num >= top10_threshold)
class_train_factor <- factor(class_train, levels = c(0, 1), labels = c("No", "Yes"))
classification_balance <- data.frame(
dataset = c("Training", "Training", "Test", "Test"),
class = c("Not high value", "High value", "Not high value", "High value"),
n = c(
sum(class_train == 0), sum(class_train == 1),
sum(class_test == 0), sum(class_test == 1)
),
stringsAsFactors = FALSE
) %>%
group_by(dataset) %>%
mutate(percent = n / sum(n)) %>%
ungroup()
write.csv(classification_balance, file.path(table_dir, "classification_balance.csv"), row.names = FALSE)
p_class_balance <- ggplot(classification_balance, aes(x = dataset, y = percent, fill = class)) +
geom_col(position = "stack") +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(values = c("Not high value" = "#A0AEC0", "High value" = "#DD6B20")) +
labs(
title = "High-Value Classification Target",
subtitle = paste("Positive class uses the training top-10% threshold:", scales::dollar(top10_threshold)),
x = NULL,
y = "Class share",
fill = NULL
) +
theme_minimal()
save_plot("classification_class_balance.png", p_class_balance, width = 7.5, height = 5)
class_weight_ratio <- sum(class_train == 0) / sum(class_train == 1)
classification_weights <- ifelse(class_train == 1, class_weight_ratio, 1)
classification_probabilities <- list()
classification_probabilities[["Prevalence baseline"]] <- rep(mean(class_train), nrow(test))
class_train_glm <- data.frame(high_value = class_train, x_train, check.names = FALSE)
class_test_glm <- data.frame(x_test, check.names = FALSE)
logistic_model <- glm(
high_value ~ .,
data = class_train_glm,
family = binomial(),
control = glm.control(maxit = 200)
)
classification_probabilities[["Logistic Regression"]] <- as.numeric(
predict(logistic_model, newdata = class_test_glm, type = "response")
)
weighted_logistic_model <- glm(
high_value ~ .,
data = class_train_glm,
family = binomial(),
weights = classification_weights,
control = glm.control(maxit = 200)
)
classification_probabilities[["Weighted Logistic"]] <- as.numeric(
predict(weighted_logistic_model, newdata = class_test_glm, type = "response")
)
class_elastic_cv <- glmnet::cv.glmnet(
x_train_mat,
class_train,
family = "binomial",
alpha = 0.5,
weights = classification_weights,
type.measure = "auc",
nfolds = 5,
standardize = FALSE
)
classification_probabilities[["Weighted Elastic Net"]] <- as.numeric(
predict(class_elastic_cv, newx = x_test_mat, s = "lambda.min", type = "response")
)
rf_class_model <- randomForest::randomForest(
x = x_train,
y = class_train_factor,
ntree = 500,
importance = TRUE,
classwt = c("No" = 1, "Yes" = class_weight_ratio)
)
classification_probabilities[["Class-Weighted Random Forest"]] <- as.numeric(
predict(rf_class_model, newdata = x_test, type = "prob")[, "Yes"]
)
if (gbm_available) {
gbm_class_model <- gbm::gbm(
high_value ~ .,
data = class_train_glm,
distribution = "bernoulli",
weights = classification_weights,
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.03,
bag.fraction = 0.70,
train.fraction = 1.0,
cv.folds = 5,
verbose = FALSE
)
best_gbm_class_trees <- gbm::gbm.perf(gbm_class_model, method = "cv", plot.it = FALSE)
classification_probabilities[["Class-Weighted Gradient Boosting"]] <- as.numeric(
predict(gbm_class_model, newdata = class_test_glm, n.trees = best_gbm_class_trees, type = "response")
)
}
classification_metrics <- do.call(
rbind,
lapply(names(classification_probabilities), function(model_name) {
classification_metric_row(
model_name,
class_test,
classification_probabilities[[model_name]],
threshold = 0.5
)
})
) %>%
arrange(desc(PR_AUC), desc(ROC_AUC))
write.csv(classification_metrics, file.path(table_dir, "classification_metrics.csv"), row.names = FALSE)
best_classifier_name <- classification_metrics$model[1]
best_classifier_probability <- classification_probabilities[[best_classifier_name]]
best_classifier_prediction <- as.integer(best_classifier_probability >= 0.5)
confusion_matrix_df <- as.data.frame(
table(
Actual = factor(class_test, levels = c(0, 1), labels = c("Not high value", "High value")),
Predicted = factor(
best_classifier_prediction,
levels = c(0, 1),
labels = c("Not high value", "High value")
)
),
stringsAsFactors = FALSE
) %>%
group_by(Actual) %>%
mutate(
row_percent = Freq / sum(Freq),
cell_label = paste0(Freq, "\n", scales::percent(row_percent, accuracy = 0.1))
) %>%
ungroup()
write.csv(
confusion_matrix_df,
file.path(table_dir, "classification_confusion_matrix.csv"),
row.names = FALSE
)
p_confusion_matrix <- ggplot(
confusion_matrix_df,
aes(x = Predicted, y = Actual, fill = row_percent)
) +
geom_tile(color = "white", linewidth = 1.2) +
geom_text(aes(label = cell_label), size = 5, fontface = "bold") +
scale_fill_gradient(low = "#EBF8FF", high = "#2B6CB0", labels = scales::percent_format()) +
labs(
title = paste("Confusion Matrix:", best_classifier_name),
subtitle = "Fixed probability threshold = 0.50; percentages are calculated within each actual class.",
x = "Predicted class",
y = "Actual class",
fill = "Row share"
) +
theme_minimal(base_size = 12) +
theme(panel.grid = element_blank())
save_plot("classification_confusion_matrix.png", p_confusion_matrix, width = 8, height = 5.5)
p_class_metrics <- classification_metrics %>%
dplyr::select(model, ROC_AUC, PR_AUC) %>%
tidyr::pivot_longer(c(ROC_AUC, PR_AUC), names_to = "metric", values_to = "score") %>%
ggplot(aes(x = reorder(model, score), y = score, fill = metric)) +
geom_col(position = "dodge") +
coord_flip() +
scale_y_continuous(limits = c(0, 1)) +
labs(
title = "High-Value Classification Performance",
subtitle = "PR-AUC is emphasized because only about 10% of players are positive.",
x = NULL,
y = "Test score",
fill = NULL
) +
theme_minimal()
save_plot("classification_model_comparison.png", p_class_metrics, width = 9, height = 6)
classification_roc_df <- do.call(
rbind,
lapply(names(classification_probabilities), function(model_name) {
roc_curve_points(class_test, classification_probabilities[[model_name]], model_name)
})
)
p_class_roc <- ggplot(classification_roc_df, aes(x = FPR, y = TPR, color = model)) +
geom_line(linewidth = 0.8) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
coord_equal() +
labs(
title = "ROC Curves for High-Value Classification",
x = "False positive rate",
y = "True positive rate",
color = "Model"
) +
theme_minimal()
save_plot("classification_roc_curves.png", p_class_roc, width = 8.5, height = 6)
rf_class_importance <- data.frame(
feature = rownames(randomForest::importance(rf_class_model)),
mean_decrease_accuracy = randomForest::importance(rf_class_model)[, "MeanDecreaseAccuracy"],
stringsAsFactors = FALSE
) %>%
arrange(desc(mean_decrease_accuracy))
write.csv(rf_class_importance, file.path(table_dir, "classification_rf_importance.csv"), row.names = FALSE)
p_class_importance <- rf_class_importance %>%
slice_head(n = 15) %>%
mutate(feature = factor(feature, levels = rev(feature))) %>%
ggplot(aes(x = feature, y = mean_decrease_accuracy)) +
geom_col(fill = "#C05621") +
coord_flip() +
labs(
title = "High-Value Classification Variable Importance",
subtitle = "Class-weighted Random Forest mean decrease in accuracy",
x = NULL,
y = "Mean decrease in accuracy"
) +
theme_minimal()
save_plot("classification_rf_importance.png", p_class_importance, width = 8.5, height = 6)
# ============================================================
# PART 8. Clustering design
# Purpose: build technical archetypes without using market value.
# Outfield players and goalkeepers are clustered separately because
# their technical profiles are structurally different.
# ============================================================
outfield <- df %>% filter(is_gk == 0)
goalkeepers <- df %>% filter(is_gk == 1)
cluster_vars_out <- unique(c("height", "weight", setdiff(ability_cols, gk_cols)))
cluster_vars_out <- intersect(cluster_vars_out, names(outfield))
out_cluster_data <- outfield[, cluster_vars_out, drop = FALSE]
out_cluster_data <- out_cluster_data[, vapply(out_cluster_data, sd, numeric(1), na.rm = TRUE) > 0, drop = FALSE]
out_complete <- complete.cases(out_cluster_data)
out_scaled <- scale(out_cluster_data[out_complete, , drop = FALSE])
outfield_cluster_df <- outfield[out_complete, , drop = FALSE]
# ============================================================
# PART 8A. Clustering model selection helper
# Purpose: compare k-means solutions using elbow, silhouette,
# and gap statistic diagnostics.
# ============================================================
select_k_kmeans <- function(x, k_values, sample_size = 2000, gap_b = 10) {
idx <- sample(seq_len(nrow(x)), min(sample_size, nrow(x)))
x_sample <- x[idx, , drop = FALSE]
sample_dist <- dist(x_sample)
sil_rows <- lapply(k_values, function(k) {
km <- kmeans(x_sample, centers = k, nstart = 25, iter.max = 100)
sil <- mean(cluster::silhouette(km$cluster, sample_dist)[, "sil_width"])
data.frame(k = k, silhouette = sil)
})
sil_df <- do.call(rbind, sil_rows)
elbow_df <- do.call(rbind, lapply(k_values, function(k) {
km <- kmeans(x, centers = k, nstart = 20, iter.max = 100)
data.frame(k = k, total_withinss = km$tot.withinss)
}))
gap_df <- tryCatch({
gap <- cluster::clusGap(
x_sample,
FUN = function(data, k) kmeans(data, centers = k, nstart = 10, iter.max = 100),
K.max = max(k_values),
B = gap_b
)
data.frame(k = seq_len(nrow(gap$Tab)), gap = gap$Tab[, "gap"], gap_se = gap$Tab[, "SE.sim"])
}, error = function(e) {
data.frame(k = k_values, gap = NA_real_, gap_se = NA_real_)
})
best_k <- sil_df$k[which.max(sil_df$silhouette)]
list(silhouette = sil_df, elbow = elbow_df, gap = gap_df, best_k = best_k)
}
out_k_result <- select_k_kmeans(out_scaled, k_values = 3:8, sample_size = 2000, gap_b = 10)
write.csv(out_k_result$silhouette, file.path(table_dir, "outfield_silhouette_scores.csv"), row.names = FALSE)
write.csv(out_k_result$elbow, file.path(table_dir, "outfield_elbow_scores.csv"), row.names = FALSE)
write.csv(out_k_result$gap, file.path(table_dir, "outfield_gap_scores.csv"), row.names = FALSE)
best_k_out <- out_k_result$best_k
out_km <- kmeans(out_scaled, centers = best_k_out, nstart = 50, iter.max = 150)
outfield_cluster_df$outfield_cluster <- paste0("C", out_km$cluster)
# Compare three clustering algorithms on the same reproducible sample.
set.seed(2026)
method_sample_idx <- sample(seq_len(nrow(out_scaled)), min(1200, nrow(out_scaled)))
method_sample <- out_scaled[method_sample_idx, , drop = FALSE]
method_dist <- dist(method_sample)
method_kmeans <- kmeans(method_sample, centers = best_k_out, nstart = 40, iter.max = 150)$cluster
method_pam <- cluster::pam(method_sample, k = best_k_out, cluster.only = TRUE)
method_hierarchical <- cutree(
hclust(method_dist, method = "ward.D2"),
k = best_k_out
)
clustering_method_comparison <- data.frame(
method = c("k-means", "PAM", "Hierarchical Ward.D2"),
average_silhouette = c(
mean(cluster::silhouette(method_kmeans, method_dist)[, "sil_width"]),
mean(cluster::silhouette(method_pam, method_dist)[, "sil_width"]),
mean(cluster::silhouette(method_hierarchical, method_dist)[, "sil_width"])
),
sample_n = length(method_sample_idx),
stringsAsFactors = FALSE
) %>%
arrange(desc(average_silhouette))
write.csv(
clustering_method_comparison,
file.path(table_dir, "outfield_clustering_method_comparison.csv"),
row.names = FALSE
)
# Refit k-means under multiple random seeds and compare each solution
# with the final reference partition using the Adjusted Rand Index.
outfield_cluster_stability <- do.call(
rbind,
lapply(1:10, function(seed_value) {
set.seed(seed_value)
alternative <- kmeans(
out_scaled,
centers = best_k_out,
nstart = 10,
iter.max = 150
)$cluster
data.frame(
seed = seed_value,
adjusted_rand_index = adjusted_rand_index(out_km$cluster, alternative)
)
})
)
write.csv(
outfield_cluster_stability,
file.path(table_dir, "outfield_cluster_stability.csv"),
row.names = FALSE
)
p_cluster_methods <- ggplot(
clustering_method_comparison,
aes(x = reorder(method, average_silhouette), y = average_silhouette)
) +
geom_col(fill = "#3182CE") +
coord_flip() +
labs(
title = "Outfield Clustering Method Comparison",
subtitle = paste("Comparison at k =", best_k_out, "using the same player sample"),
x = NULL,
y = "Average silhouette"
) +
theme_minimal()
save_plot("cluster_outfield_method_comparison.png", p_cluster_methods, width = 8, height = 5)
p_cluster_stability <- ggplot(
outfield_cluster_stability,
aes(x = factor(seed), y = adjusted_rand_index)
) +
geom_col(fill = "#2F855A") +
geom_hline(yintercept = 0.80, linetype = "dashed", color = "#C53030") +
coord_cartesian(ylim = c(0, 1)) +
labs(
title = "Outfield k-means Stability Across Random Seeds",
subtitle = "Adjusted Rand Index relative to the final solution",
x = "Random seed",
y = "Adjusted Rand Index"
) +
theme_minimal()
save_plot("cluster_outfield_stability.png", p_cluster_stability, width = 8, height = 5)
# ============================================================
# PART 8B. Outfield clustering diagnostics and PCA visualization
# Purpose: save elbow, silhouette, gap, and PCA plots for the
# outfield archetype solution.
# ============================================================
p_out_elbow <- ggplot(out_k_result$elbow, aes(x = k, y = total_withinss)) +
geom_line() +
geom_point(size = 2, color = "#2B6CB0") +
labs(title = "Outfield Clustering: Elbow Method", x = "k", y = "Total within-cluster SS") +
theme_minimal()
save_plot("cluster_outfield_elbow.png", p_out_elbow, width = 7.5, height = 5)
p_out_sil <- ggplot(out_k_result$silhouette, aes(x = k, y = silhouette)) +
geom_line() +
geom_point(size = 2, color = "#C53030") +
labs(title = "Outfield Clustering: Silhouette Scores", x = "k", y = "Average silhouette") +
theme_minimal()
save_plot("cluster_outfield_silhouette.png", p_out_sil, width = 7.5, height = 5)
p_out_gap <- ggplot(out_k_result$gap, aes(x = k, y = gap)) +
geom_line() +
geom_point(size = 2, color = "#2F855A") +
geom_errorbar(aes(ymin = gap - gap_se, ymax = gap + gap_se), width = 0.15, alpha = 0.6) +
labs(title = "Outfield Clustering: Gap Statistic", x = "k", y = "Gap") +
theme_minimal()
save_plot("cluster_outfield_gap.png", p_out_gap, width = 7.5, height = 5)
out_pca <- prcomp(out_scaled, center = FALSE, scale. = FALSE)
out_pca_df <- data.frame(
PC1 = out_pca$x[, 1],
PC2 = out_pca$x[, 2],
cluster = outfield_cluster_df$outfield_cluster,
player = outfield_cluster_df$player,
stringsAsFactors = FALSE
)
p_out_pca <- ggplot(out_pca_df, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.55, size = 0.9) +
labs(
title = "Outfield Player Archetypes by PCA",
subtitle = paste("k-means selected k =", best_k_out),
x = "PC1",
y = "PC2",
color = "Cluster"
) +
theme_minimal()
save_plot("cluster_outfield_pca.png", p_out_pca, width = 9, height = 6)
# ============================================================
# PART 8C. Outfield archetype profiling and labelling
# Purpose: summarize each cluster using standardized ability-group
# profiles, assign readable archetype labels, and export profiles.
# ============================================================
cluster_group_vars <- c("attacking_mean", "passing_mean", "defending_mean", "physical_mean", "mental_mean", "height", "weight")
cluster_group_vars <- intersect(cluster_group_vars, names(outfield_cluster_df))
group_scaled <- as.data.frame(scale(outfield_cluster_df[, cluster_group_vars, drop = FALSE]))
group_scaled$outfield_cluster <- outfield_cluster_df$outfield_cluster
out_group_profile <- group_scaled %>%
group_by(outfield_cluster) %>%
summarise(across(all_of(cluster_group_vars), function(x) mean(x, na.rm = TRUE)), .groups = "drop")
out_group_long <- out_group_profile %>%
tidyr::pivot_longer(-outfield_cluster, names_to = "profile_variable", values_to = "z_mean")
label_cluster <- function(cluster_name) {
row <- out_group_profile[out_group_profile$outfield_cluster == cluster_name, , drop = FALSE]
vars <- setdiff(names(row), "outfield_cluster")
ordered <- vars[order(as.numeric(row[1, vars]), decreasing = TRUE)]
top1 <- ordered[1]
top2 <- ordered[2]
if (top1 == "defending_mean") return("Defensive ball-winner")
if (top1 == "passing_mean" || (top1 == "mental_mean" && top2 == "passing_mean")) return("Playmaker / creator")
if (top1 == "attacking_mean" && top2 %in% c("passing_mean", "mental_mean")) return("Attacking creator")
if (top1 == "attacking_mean") return("Finisher / attacker")
if (top1 == "physical_mean" && top2 %in% c("height", "weight")) return("Target forward / physical profile")
if (top1 == "physical_mean") return("Pace and physical profile")
if (top1 %in% c("height", "weight")) return("Large body-size profile")
"All-round profile"
}
cluster_labels <- data.frame(
outfield_cluster = sort(unique(outfield_cluster_df$outfield_cluster)),
archetype_label = vapply(sort(unique(outfield_cluster_df$outfield_cluster)), label_cluster, character(1)),
stringsAsFactors = FALSE
)
outfield_cluster_df <- outfield_cluster_df %>%
left_join(cluster_labels, by = "outfield_cluster")
out_cluster_profile <- outfield_cluster_df %>%
group_by(outfield_cluster, archetype_label) %>%
summarise(
n = n(),
mean_value = mean(value_num, na.rm = TRUE),
median_value = median(value_num, na.rm = TRUE),
mean_age = mean(age, na.rm = TRUE),
mean_height = mean(height, na.rm = TRUE),
mean_weight = mean(weight, na.rm = TRUE),
attacking_mean = mean(attacking_mean, na.rm = TRUE),
passing_mean = mean(passing_mean, na.rm = TRUE),
defending_mean = mean(defending_mean, na.rm = TRUE),
physical_mean = mean(physical_mean, na.rm = TRUE),
mental_mean = mean(mental_mean, na.rm = TRUE),
.groups = "drop"
)
write.csv(out_cluster_profile, file.path(table_dir, "outfield_cluster_profiles.csv"), row.names = FALSE)
out_group_long <- out_group_long %>%
left_join(cluster_labels, by = "outfield_cluster") %>%
mutate(cluster_label = paste(outfield_cluster, archetype_label, sep = ": "))
p_out_profile <- ggplot(out_group_long, aes(x = profile_variable, y = cluster_label, fill = z_mean)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
labs(
title = "Outfield Cluster Profiles",
subtitle = "Values are standardized means within each cluster.",
x = NULL,
y = NULL,
fill = "z mean"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 35, hjust = 1))
save_plot("cluster_outfield_profile_heatmap.png", p_out_profile, width = 10, height = 5.8)
# ============================================================
# PART 8D. Goalkeeper clustering handled separately
# Purpose: cluster goalkeepers only on goalkeeper-relevant variables
# and create separate PCA/profile outputs when enough cases exist.
# ============================================================
gk_cluster_profile <- data.frame()
best_k_gk <- NA
if (nrow(goalkeepers) >= 20) {
cluster_vars_gk <- unique(c("height", "weight", "reactions", "composure", gk_cols))
cluster_vars_gk <- intersect(cluster_vars_gk, names(goalkeepers))
gk_cluster_data <- goalkeepers[, cluster_vars_gk, drop = FALSE]
gk_cluster_data <- gk_cluster_data[, vapply(gk_cluster_data, sd, numeric(1), na.rm = TRUE) > 0, drop = FALSE]
gk_complete <- complete.cases(gk_cluster_data)
gk_scaled <- scale(gk_cluster_data[gk_complete, , drop = FALSE])
goalkeeper_cluster_df <- goalkeepers[gk_complete, , drop = FALSE]
gk_k_values <- 2:5
gk_dist <- dist(gk_scaled)
gk_sil <- do.call(rbind, lapply(gk_k_values, function(k) {
km <- kmeans(gk_scaled, centers = k, nstart = 30, iter.max = 100)
data.frame(k = k, silhouette = mean(cluster::silhouette(km$cluster, gk_dist)[, "sil_width"]))
}))
write.csv(gk_sil, file.path(table_dir, "goalkeeper_silhouette_scores.csv"), row.names = FALSE)
best_k_gk <- gk_sil$k[which.max(gk_sil$silhouette)]
gk_km <- kmeans(gk_scaled, centers = best_k_gk, nstart = 50, iter.max = 150)
goalkeeper_cluster_df$goalkeeper_cluster <- paste0("GK", gk_km$cluster)
gk_cluster_profile <- goalkeeper_cluster_df %>%
group_by(goalkeeper_cluster) %>%
summarise(
n = n(),
mean_value = mean(value_num, na.rm = TRUE),
median_value = median(value_num, na.rm = TRUE),
mean_age = mean(age, na.rm = TRUE),
mean_height = mean(height, na.rm = TRUE),
mean_weight = mean(weight, na.rm = TRUE),
gk_score = mean(gk_score, na.rm = TRUE),
reactions = mean(reactions, na.rm = TRUE),
composure = mean(composure, na.rm = TRUE),
.groups = "drop"
)
write.csv(gk_cluster_profile, file.path(table_dir, "goalkeeper_cluster_profiles.csv"), row.names = FALSE)
gk_pca <- prcomp(gk_scaled, center = FALSE, scale. = FALSE)
gk_pca_df <- data.frame(
PC1 = gk_pca$x[, 1],
PC2 = gk_pca$x[, 2],
cluster = goalkeeper_cluster_df$goalkeeper_cluster,
stringsAsFactors = FALSE
)
p_gk_pca <- ggplot(gk_pca_df, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.65, size = 1.2) +
labs(
title = "Goalkeeper Archetypes by PCA",
subtitle = paste("k-means selected k =", best_k_gk),
x = "PC1",
y = "PC2",
color = "Cluster"
) +
theme_minimal()
save_plot("cluster_goalkeeper_pca.png", p_gk_pca, width = 8.5, height = 5.5)
gk_profile_scaled <- as.data.frame(scale(goalkeeper_cluster_df[, c("height", "weight", "reactions", "composure", "gk_score"), drop = FALSE]))
gk_profile_scaled$goalkeeper_cluster <- goalkeeper_cluster_df$goalkeeper_cluster
gk_profile_long <- gk_profile_scaled %>%
group_by(goalkeeper_cluster) %>%
summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)), .groups = "drop") %>%
tidyr::pivot_longer(-goalkeeper_cluster, names_to = "profile_variable", values_to = "z_mean")
p_gk_profile <- ggplot(gk_profile_long, aes(x = profile_variable, y = goalkeeper_cluster, fill = z_mean)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
labs(title = "Goalkeeper Cluster Profiles", x = NULL, y = NULL, fill = "z mean") +
theme_minimal()
save_plot("cluster_goalkeeper_profile_heatmap.png", p_gk_profile, width = 8.5, height = 4.8)
}
# ============================================================
# PART 9. Final exports and compact objects for the report
# Purpose: save the final cleaned dataset with cluster assignments
# and prepare compact tables used in the narrative sections below.
# ============================================================
clustered_df <- df
out_cols_to_merge <- outfield_cluster_df %>%
dplyr::select(row_id, outfield_cluster, archetype_label)
clustered_df <- clustered_df %>%
left_join(out_cols_to_merge, by = "row_id")
if (exists("goalkeeper_cluster_df")) {
gk_cols_to_merge <- goalkeeper_cluster_df %>%
dplyr::select(row_id, goalkeeper_cluster)
clustered_df <- clustered_df %>%
left_join(gk_cols_to_merge, by = "row_id")
} else {
clustered_df$goalkeeper_cluster <- NA_character_
}
write.csv(clustered_df, file.path(output_dir, "cleaned_player_stats_with_clusters.csv"), row.names = FALSE, fileEncoding = "UTF-8")
split_summary <- data.frame(
item = c(
"train_rows", "test_rows", "top10_threshold_from_train",
"high_value_rows_in_train", "high_value_rows_in_test"
),
value = c(
nrow(train), nrow(test), round(top10_threshold, 0),
sum(class_train), sum(class_test)
)
)
write.csv(split_summary, file.path(table_dir, "split_summary.csv"), row.names = FALSE)
# Compact report objects used below
best_overall_model <- metrics %>%
filter(segment == "overall_test") %>%
arrange(RMSE_log) %>%
slice_head(n = 1)
baseline_rmse_log <- metrics %>%
filter(segment == "overall_test", model == "Baseline mean") %>%
pull(RMSE_log)
best_rmse_improvement <- 100 * (baseline_rmse_log - best_overall_model$RMSE_log[1]) / baseline_rmse_log
best_classifier <- classification_metrics %>%
arrange(desc(PR_AUC), desc(ROC_AUC)) %>%
slice_head(n = 1)
classification_prevalence <- mean(class_test)
top_classification_importance <- rf_class_importance %>% slice_head(n = 10)
best_outfield_silhouette <- max(out_k_result$silhouette$silhouette)
mean_outfield_stability <- mean(outfield_cluster_stability$adjusted_rand_index)
top_cor_report <- cor_table %>% slice_head(n = 8)
top_ols_report <- ols_coef %>% slice_head(n = 10)
top_rf_report <- rf_perm_importance %>% slice_head(n = 10)
top_vif_report <- vif_table %>% slice_head(n = 10)
The project uses the FIFA 24 Player Stats Dataset, published by Rehan Ahmed on Kaggle. Its purpose is to provide player-level demographic, physical, technical, mental, goalkeeper, and market-value information for analysis and machine learning.
dataset_overview
The raw dimension score is 232,962 (5682 rows multiplied by 41 columns), exceeding the assignment requirement of 100,000.
data_structure_summary
The raw dataset contains one row per player. Most ability variables
are numeric FIFA ratings. player, country, and
club are identifiers or contextual fields, while
value is the estimated market-value outcome.
raw_missing_summary %>%
filter(missing_n > 0) %>%
slice_head(n = 10)
The cleaning process was designed as an auditable pipeline:
club and
country.marking, which is entirely missing in the
supplied file.value field into
value_num.log_value, goalkeeper status, and grouped
ability scores.cleaning_summary
Values such as $1.400.000 are parsed as
1400000. Values such as $975.00 are parsed as
975000, because this dataset uses that notation for
thousands rather than cents. The final cleaned dataset is exported for
reproducibility.
The raw market-value distribution is strongly right-skewed. A small number of elite players have extremely high values, while most observations are concentrated at lower levels. The log transformation substantially reduces this skew and provides a more stable regression target.
value_summary
Goalkeeping attributes are structurally different from outfield technical variables. This empirical difference supports separating goalkeepers and outfield players during clustering.
The following variables have the strongest positive bivariate
correlations with log_value. Correlation is descriptive and
does not establish causality.
top_cor_report
The target is log_value = log(value_num). An 80/20 split
is stratified by market-value quintile and goalkeeper status. The split
is created before preprocessing, and scaling parameters are estimated
only from the training set. The same held-out test set is used for final
model comparison.
To protect the reliability of the evaluation:
split_summary
The baseline predicts the training-set mean of
log_value. It is compared with OLS, weighted OLS, Ridge,
Lasso, Elastic Net, Random Forest, and Gradient Boosting.
metrics %>%
filter(segment == "overall_test") %>%
arrange(RMSE_log) %>%
dplyr::select(model, n, RMSE_log, MAE_log, R2_log, RMSE_value, MAE_value, MedAE_value)
The best predictive model is Gradient Boosting, with test RMSE of 0.258 and test R-squared of 0.961 on the log scale. Its log-scale RMSE is 80.3% lower than the mean baseline.
The high-value subset uses the top 10% threshold calculated from the training data only: $4,700,000. Metrics are then calculated on the corresponding test observations. Unlike the earlier draft, subset R-squared uses the subset’s own total sum of squares, so it is not inflated by comparison with the overall training mean.
metrics %>%
filter(segment == "top10_market_value_test") %>%
arrange(RMSE_log) %>%
dplyr::select(model, n, RMSE_log, MAE_log, R2_log, RMSE_value, MAE_value, MedAE_value)
Original-scale predictions use Duan’s smearing correction to reduce
bias when converting from log_value back to market
value.
OLS standardized coefficients quantify conditional associations with
log_value, holding the other included variables constant.
Because the technical variables are highly correlated, individual
coefficients must be interpreted cautiously.
top_ols_report
Random Forest permutation importance provides a complementary predictive-importance measure based on the increase in test RMSE after each variable is permuted.
top_rf_report
Grouped importance reduces the risk of over-interpreting one variable from a highly correlated ability family.
ols_group_importance
top_vif_report
Market value is a continuous, long-tailed outcome rather than a classification label. The regression-specific handling therefore includes:
For a separate classification problem, a player is labelled high value when market value is at or above the training-set 90th percentile. The threshold is never recalculated on the test set. This creates a genuine minority-class problem, so class weights are used during training for selected models.
classification_balance
The models include a prevalence baseline, logistic regression, weighted logistic regression, weighted Elastic Net, class-weighted Random Forest, and class-weighted Gradient Boosting. ROC-AUC measures ranking performance, while PR-AUC is emphasized because the positive class represents only 8.6% of the test data. Threshold-dependent metrics use a fixed 0.5 threshold and are not tuned on the test set.
classification_metrics %>%
dplyr::select(
model, ROC_AUC, PR_AUC, accuracy, balanced_accuracy,
precision, recall, specificity, F1
)
The strongest classifier by test PR-AUC is Class-Weighted Gradient Boosting, with PR-AUC 0.96 and ROC-AUC 0.995. Accuracy is not used alone because a model predicting every player as low value would still appear accurate in an imbalanced dataset.
The confusion matrix below uses the best classifier selected by test PR-AUC and a fixed probability threshold of 0.50. Counts are accompanied by percentages within each actual class, making minority-class recall and false-positive behavior directly visible.
confusion_matrix_df %>%
dplyr::select(Actual, Predicted, Freq, row_percent)
top_classification_importance
Market value is excluded from all clustering inputs. Goalkeepers and outfield players are separated before clustering because combining them would mainly reproduce positional differences rather than reveal useful within-role archetypes.
The selected outfield solution uses 3 clusters. The maximum average silhouette is 0.184, indicating modest separation and substantial overlap between player profiles. The archetypes should therefore be interpreted as exploratory summaries rather than natural or definitive player classes.
clustering_method_comparison
The mean Adjusted Rand Index across repeated k-means fits is 1. This measures whether the selected partition is reproducible across random initializations.
outfield_cluster_stability
out_cluster_profile
Cluster labels summarize the strongest standardized profile dimensions. They are descriptive labels assigned after clustering; market value is reported only after cluster formation and was not used to create the clusters.
Goalkeepers are clustered separately using height, weight, reactions, composure, and goalkeeper-specific ratings.
gk_cluster_profile
The regression and classification results answer related but different questions. Regression estimates the magnitude of player value, while classification identifies players likely to exceed a decision-relevant high-value threshold. Gradient-based and tree-based models can capture nonlinear relationships and interactions, while OLS and regularized linear models provide more transparent coefficient-based interpretation.
The clustering analysis adds an unsupervised perspective. It shows how technical profiles can be summarized into archetypes without using price. However, the relatively low silhouette score demonstrates that football ability profiles lie on overlapping continua rather than forming perfectly separated groups.
The project satisfies three complementary objectives. First, Gradient Boosting is the strongest market-value regression model, achieving a test R-squared of 0.961 and RMSE_log of 0.258. This is a substantial improvement over the mean baseline and shows that FIFA 24 technical and mental ratings contain strong predictive information about market value.
Second, Class-Weighted Gradient Boosting is the strongest high-value classifier, achieving test PR-AUC of 0.96. This result demonstrates that class-weighted modelling can identify the rare high-value group much more effectively than the prevalence baseline.
Third, the selected outfield clustering solution has a maximum average silhouette of 0.184, approximately 0.19. The solution is stable across repeated k-means initializations, but the modest silhouette score means the archetypes are exploratory summaries of overlapping technical profiles rather than sharply separated natural classes.
For practical use, nonlinear models are preferred when prediction is the main objective. OLS, regularized coefficients, grouped importance, and permutation importance are more appropriate when transparent interpretation is required. All conclusions should be understood as predictive and associational rather than causal.
The report uses a fixed random seed, training-only preprocessing, and a held-out test set. Package and R versions are recorded below.
sessionInfo()
## R version 4.6.0 (2026-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.utf8
## [2] LC_CTYPE=Chinese (Simplified)_China.utf8
## [3] LC_MONETARY=Chinese (Simplified)_China.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=Chinese (Simplified)_China.utf8
##
## time zone: Asia/Shanghai
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gbm_2.2.3 cluster_2.1.8.2 randomForest_4.7-1.2
## [4] glmnet_5.0 Matrix_1.7-5 scales_1.4.0
## [7] tidyr_1.3.2 dplyr_1.2.1 ggplot2_4.0.3
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 shape_1.4.6.1 lattice_0.22-9
## [5] digest_0.6.39 magrittr_2.0.5 evaluate_1.0.5 grid_4.6.0
## [9] RColorBrewer_1.1-3 iterators_1.0.14 fastmap_1.2.0 foreach_1.5.2
## [13] jsonlite_2.0.0 survival_3.8-6 mgcv_1.9-4 purrr_1.2.2
## [17] codetools_0.2-20 textshaping_1.0.5 jquerylib_0.1.4 cli_3.6.6
## [21] rlang_1.2.0 splines_4.6.0 withr_3.0.2 cachem_1.1.0
## [25] yaml_2.3.12 otel_0.2.0 parallel_4.6.0 tools_4.6.0
## [29] vctrs_0.7.3 R6_2.6.1 lifecycle_1.0.5 ragg_1.5.2
## [33] pkgconfig_2.0.3 pillar_1.11.1 bslib_0.11.0 gtable_0.3.6
## [37] glue_1.8.1 Rcpp_1.1.1-1.1 systemfonts_1.3.2 xfun_0.58
## [41] tibble_3.3.1 tidyselect_1.2.1 rstudioapi_0.18.0 knitr_1.51
## [45] dichromat_2.0-0.1 farver_2.1.2 nlme_3.1-169 htmltools_0.5.9
## [49] labeling_0.4.3 rmarkdown_2.31 compiler_4.6.0 S7_0.2.2