The Fatigue Curve: Analyzing the Impact of Cumulative Distance on Scoring Probability in Football

# Load necessary libraries (加载必要的库)
library(dplyr)
library(ggplot2)
library(caret)
library(pROC)

# 1. Read the dataset (读取数据集)
df <- read.csv("./data/players_quarters_final.csv")

# 2. Data Preprocessing & Optimization (数据预处理与优化)
# Optimization: Using 'sprints' instead of 'distance' as it's more sensitive to fatigue
# (优化：使用“冲刺次数”代替“总距离”，因为其对疲劳更敏感)
df_clean <- df %>%
  filter(position != "G") %>%
  mutate(
    position = as.factor(position),
    # Scale sprint variables (对冲刺相关变量进行标准化)
    sc_last15_spr = (last15_sprints - mean(last15_sprints, na.rm = TRUE)) / sd(last15_sprints, na.rm = TRUE),
    sc_cumul_spr = (cumul_sprints - mean(cumul_sprints, na.rm = TRUE)) / sd(cumul_sprints, na.rm = TRUE)
  )

# 3. Model Definition (模型定义)
# Formula: Inclusion of quadratic term and interaction (包含二次项与交互项)
model_formula <- scored_after ~ sc_last15_spr + 
                               sc_cumul_spr + 
                               I(sc_cumul_spr^2) + 
                               sc_last15_spr:sc_cumul_spr + 
                               checkpoint_min + 
                               position

# 4. Group K-Fold Cross-Validation (分组K折交叉验证)
# Preventing data leakage by grouping by player_appearance_id
# (通过按 player_appearance_id 分组来防止数据泄露)
set.seed(2026)
folds <- groupKFold(df_clean$player_appearance_id, k = 5)

cv_results <- lapply(seq_along(folds), function(i) {
  # Split data based on groups (基于分组拆分数据)
  train_indices <- folds[[i]]
  train_data <- df_clean[train_indices, ]
  test_data  <- df_clean[-train_indices, ]
  
  # Fit optimized model (拟合优化后的模型)
  fit <- glm(model_formula, data = train_data, family = binomial(link = "logit"))
  
  # Predict and evaluate (预测与评估)
  probs <- predict(fit, newdata = test_data, type = "response")
  roc_obj <- roc(test_data$scored_after, probs, quiet = TRUE)
  
  return(as.numeric(auc(roc_obj)))
})

# Print CV Results (打印交叉验证结果)
mean_auc <- mean(unlist(cv_results))
print(paste("Average Group K-Fold AUC:", round(mean_auc, 4)))

## [1] "Average Group K-Fold AUC: 0.6469"

# 5. Final Model Fit on Full Clean Data (在完整清洗后的数据上拟合最终模型)
final_fatigue_model <- glm(model_formula, data = df_clean, family = binomial(link = "logit"))
summary(final_fatigue_model)

## 
## Call:
## glm(formula = model_formula, family = binomial(link = "logit"), 
##     data = df_clean)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.752476   0.255512  -6.859 6.95e-12 ***
## sc_last15_spr               0.081434   0.096163   0.847   0.3971    
## sc_cumul_spr               -0.295292   0.127091  -2.323   0.0202 *  
## I(sc_cumul_spr^2)          -0.063366   0.104529  -0.606   0.5444    
## checkpoint_min             -0.001980   0.007096  -0.279   0.7802    
## positionD                  -1.590724   0.208569  -7.627 2.41e-14 ***
## positionM                  -0.756555   0.166970  -4.531 5.87e-06 ***
## sc_last15_spr:sc_cumul_spr -0.167442   0.114986  -1.456   0.1453    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1508.3  on 3167  degrees of freedom
## Residual deviance: 1425.6  on 3160  degrees of freedom
## AIC: 1441.6
## 
## Number of Fisher Scoring iterations: 6

# 6. Visualization: The Fatigue Interaction Effect (可视化：疲劳交互效应)
# Generate grid to show how cumulative sprints dampen the effect of last15 effort
# (生成网格展示累计冲刺如何削弱最近15分钟努力的效果)
spr_seq <- seq(min(df_clean$sc_cumul_spr), max(df_clean$sc_cumul_spr), length.out = 100)
predict_data <- data.frame(
  sc_cumul_spr = spr_seq,
  sc_last15_spr = 1,      # High intensity in last 15m (最近15分钟高强度)
  checkpoint_min = 60,
  position = factor("A", levels = levels(df_clean$position))
)
predict_data$prob <- predict(final_fatigue_model, newdata = predict_data, type = "response")

ggplot(predict_data, aes(x = sc_cumul_spr, y = prob)) +
  geom_line(colour = "firebrick", size = 1.2) +
  labs(
    title = "Scoring Probability vs Cumulative Fatigue (Sprints)",
    subtitle = "Holding last 15m sprints at +1 SD (High Intensity)",
    x = "Cumulative Sprints (Standardized)",
    y = "Predicted Probability of Scoring"
  ) +
  theme_minimal()

# 7. Coefficient Interpretation Table (系数解释表)
model_summary <- summary(final_fatigue_model)$coefficients
coef_data <- data.frame(
  Variable = rownames(model_summary),
  Estimate = model_summary[, "Estimate"],
  P_Value  = model_summary[, "Pr(>|z|)"]
) %>%
  mutate(
    Odds_Ratio = exp(Estimate),
    Significance = ifelse(P_Value < 0.05, "Significant", "Non-Significant")
  )

print("--- Final Optimized Model Coefficients ---")

## [1] "--- Final Optimized Model Coefficients ---"

print(coef_data)

##                                              Variable     Estimate      P_Value
## (Intercept)                               (Intercept) -1.752475908 6.949583e-12
## sc_last15_spr                           sc_last15_spr  0.081433975 3.970907e-01
## sc_cumul_spr                             sc_cumul_spr -0.295291800 2.015426e-02
## I(sc_cumul_spr^2)                   I(sc_cumul_spr^2) -0.063366140 5.443763e-01
## checkpoint_min                         checkpoint_min -0.001980237 7.801987e-01
## positionD                                   positionD -1.590723832 2.405574e-14
## positionM                                   positionM -0.756554616 5.868319e-06
## sc_last15_spr:sc_cumul_spr sc_last15_spr:sc_cumul_spr -0.167441735 1.453378e-01
##                            Odds_Ratio    Significance
## (Intercept)                 0.1733442     Significant
## sc_last15_spr               1.0848416 Non-Significant
## sc_cumul_spr                0.7443144     Significant
## I(sc_cumul_spr^2)           0.9385998 Non-Significant
## checkpoint_min              0.9980217 Non-Significant
## positionD                   0.2037781     Significant
## positionM                   0.4692805     Significant
## sc_last15_spr:sc_cumul_spr  0.8458259 Non-Significant