# Load necessary libraries (加载必要的库)
library(dplyr)
library(ggplot2)
library(caret)
library(pROC)
# 1. Read the dataset (读取数据集)
df <- read.csv("./data/players_quarters_final.csv")
# 2. Data Preprocessing & Optimization (数据预处理与优化)
# Optimization: Using 'sprints' instead of 'distance' as it's more sensitive to fatigue
# (优化:使用“冲刺次数”代替“总距离”,因为其对疲劳更敏感)
df_clean <- df %>%
filter(position != "G") %>%
mutate(
position = as.factor(position),
# Scale sprint variables (对冲刺相关变量进行标准化)
sc_last15_spr = (last15_sprints - mean(last15_sprints, na.rm = TRUE)) / sd(last15_sprints, na.rm = TRUE),
sc_cumul_spr = (cumul_sprints - mean(cumul_sprints, na.rm = TRUE)) / sd(cumul_sprints, na.rm = TRUE)
)
# 3. Model Definition (模型定义)
# Formula: Inclusion of quadratic term and interaction (包含二次项与交互项)
model_formula <- scored_after ~ sc_last15_spr +
sc_cumul_spr +
I(sc_cumul_spr^2) +
sc_last15_spr:sc_cumul_spr +
checkpoint_min +
position
# 4. Group K-Fold Cross-Validation (分组K折交叉验证)
# Preventing data leakage by grouping by player_appearance_id
# (通过按 player_appearance_id 分组来防止数据泄露)
set.seed(2026)
folds <- groupKFold(df_clean$player_appearance_id, k = 5)
cv_results <- lapply(seq_along(folds), function(i) {
# Split data based on groups (基于分组拆分数据)
train_indices <- folds[[i]]
train_data <- df_clean[train_indices, ]
test_data <- df_clean[-train_indices, ]
# Fit optimized model (拟合优化后的模型)
fit <- glm(model_formula, data = train_data, family = binomial(link = "logit"))
# Predict and evaluate (预测与评估)
probs <- predict(fit, newdata = test_data, type = "response")
roc_obj <- roc(test_data$scored_after, probs, quiet = TRUE)
return(as.numeric(auc(roc_obj)))
})
# Print CV Results (打印交叉验证结果)
mean_auc <- mean(unlist(cv_results))
print(paste("Average Group K-Fold AUC:", round(mean_auc, 4)))
## [1] "Average Group K-Fold AUC: 0.6469"
# 5. Final Model Fit on Full Clean Data (在完整清洗后的数据上拟合最终模型)
final_fatigue_model <- glm(model_formula, data = df_clean, family = binomial(link = "logit"))
summary(final_fatigue_model)
##
## Call:
## glm(formula = model_formula, family = binomial(link = "logit"),
## data = df_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.752476 0.255512 -6.859 6.95e-12 ***
## sc_last15_spr 0.081434 0.096163 0.847 0.3971
## sc_cumul_spr -0.295292 0.127091 -2.323 0.0202 *
## I(sc_cumul_spr^2) -0.063366 0.104529 -0.606 0.5444
## checkpoint_min -0.001980 0.007096 -0.279 0.7802
## positionD -1.590724 0.208569 -7.627 2.41e-14 ***
## positionM -0.756555 0.166970 -4.531 5.87e-06 ***
## sc_last15_spr:sc_cumul_spr -0.167442 0.114986 -1.456 0.1453
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1508.3 on 3167 degrees of freedom
## Residual deviance: 1425.6 on 3160 degrees of freedom
## AIC: 1441.6
##
## Number of Fisher Scoring iterations: 6
# 6. Visualization: The Fatigue Interaction Effect (可视化:疲劳交互效应)
# Generate grid to show how cumulative sprints dampen the effect of last15 effort
# (生成网格展示累计冲刺如何削弱最近15分钟努力的效果)
spr_seq <- seq(min(df_clean$sc_cumul_spr), max(df_clean$sc_cumul_spr), length.out = 100)
predict_data <- data.frame(
sc_cumul_spr = spr_seq,
sc_last15_spr = 1, # High intensity in last 15m (最近15分钟高强度)
checkpoint_min = 60,
position = factor("A", levels = levels(df_clean$position))
)
predict_data$prob <- predict(final_fatigue_model, newdata = predict_data, type = "response")
ggplot(predict_data, aes(x = sc_cumul_spr, y = prob)) +
geom_line(colour = "firebrick", size = 1.2) +
labs(
title = "Scoring Probability vs Cumulative Fatigue (Sprints)",
subtitle = "Holding last 15m sprints at +1 SD (High Intensity)",
x = "Cumulative Sprints (Standardized)",
y = "Predicted Probability of Scoring"
) +
theme_minimal()

# 7. Coefficient Interpretation Table (系数解释表)
model_summary <- summary(final_fatigue_model)$coefficients
coef_data <- data.frame(
Variable = rownames(model_summary),
Estimate = model_summary[, "Estimate"],
P_Value = model_summary[, "Pr(>|z|)"]
) %>%
mutate(
Odds_Ratio = exp(Estimate),
Significance = ifelse(P_Value < 0.05, "Significant", "Non-Significant")
)
print("--- Final Optimized Model Coefficients ---")
## [1] "--- Final Optimized Model Coefficients ---"
## Variable Estimate P_Value
## (Intercept) (Intercept) -1.752475908 6.949583e-12
## sc_last15_spr sc_last15_spr 0.081433975 3.970907e-01
## sc_cumul_spr sc_cumul_spr -0.295291800 2.015426e-02
## I(sc_cumul_spr^2) I(sc_cumul_spr^2) -0.063366140 5.443763e-01
## checkpoint_min checkpoint_min -0.001980237 7.801987e-01
## positionD positionD -1.590723832 2.405574e-14
## positionM positionM -0.756554616 5.868319e-06
## sc_last15_spr:sc_cumul_spr sc_last15_spr:sc_cumul_spr -0.167441735 1.453378e-01
## Odds_Ratio Significance
## (Intercept) 0.1733442 Significant
## sc_last15_spr 1.0848416 Non-Significant
## sc_cumul_spr 0.7443144 Significant
## I(sc_cumul_spr^2) 0.9385998 Non-Significant
## checkpoint_min 0.9980217 Non-Significant
## positionD 0.2037781 Significant
## positionM 0.4692805 Significant
## sc_last15_spr:sc_cumul_spr 0.8458259 Non-Significant