This is an attempt to build an expected touchdowns (xTD) model and improve upon what was already built. Based on past iterations of various models, XGBoost achieved the best performance and will be used here.
library(tidyverse)
library(nflfastR)
# Define the range of seasons to load (1999 to 2024)
years <- 1999:2024
# Initialize an empty list to store the play-by-play data for each year
pbp_list <- list()
# Loop through each year and load the play-by-play data
for (year in years) {
# Load the data and add the year column
pbp_list[[as.character(year)]] <- load_pbp(year) %>%
mutate(year = year)
}
Note: nflreadr caches (i.e., stores a saved version) data by default.
If you expect different output try one of the following:
ℹ Restart your R Session or
ℹ Run `nflreadr::.clear_cache()`.
To disable this warning, run `options(nflreadr.verbose = FALSE)` or add it to your .Rprofile
# Combine all the yearly play-by-play data into one data frame using bind_rows
pbp_data <- bind_rows(pbp_list)
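As an aside, load_pbp() also accepts a vector of seasons, so the loop above can likely be collapsed into a single call; the returned data already carries a season column that can stand in for year (a minimal sketch, not verified against every nflfastR version):
# Alternative: load all seasons in one call and reuse the built-in season column
pbp_data_alt <- nflfastR::load_pbp(1999:2024) %>%
mutate(year = season)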
First, let’s look at all the different play types.
play_type_counts <- pbp_data %>%
group_by(play_type) %>%
summarise(count = n()) %>%
arrange(desc(count))
play_type_counts
Let’s look at the play types by year:
# Summarize the number of plays by play_type and year
play_type_year_summary <- pbp_data %>%
group_by(play_type, year) %>%
summarise(count = n(), .groups = 'drop')
# Reshape the data to have years as columns
play_type_year_pivot <- play_type_year_summary %>%
pivot_wider(names_from = year, values_from = count, values_fill = 0)
print(play_type_year_pivot)
Let’s quickly look at a visual to see if there are any evident trends
library(ggplot2)
library(ggrepel)
# Pivot the data to long format so that we can plot it easily with ggplot2
play_type_long <- play_type_year_pivot %>%
pivot_longer(cols = -play_type, names_to = "year", values_to = "count") %>%
mutate(year = as.numeric(year)) # Convert year to numeric for the plot
# Create the line plot
ggplot(play_type_long, aes(x = year, y = count, color = play_type)) +
geom_line(linewidth = 1) + # Line width for better visibility
labs(title = "Number of Plays by Play Type Over the Years",
x = "Year",
y = "Number of Plays",
color = "Play Type") + # Labels and title
theme_minimal() + # Clean theme
theme(
legend.position = "right", # Place legend on the right
plot.title = element_text(hjust = 0.5) # Center the title
)
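If the single chart gets crowded, a small-multiples version of the same data can make per-type trends easier to read (a sketch using facet_wrap with free y-scales, since play types differ widely in volume):
# Optional: one panel per play type so low-volume types are still visible
ggplot(play_type_long, aes(x = year, y = count)) +
geom_line(linewidth = 1, color = "steelblue") +
facet_wrap(~ play_type, scales = "free_y") +
labs(title = "Plays by Play Type Over the Years", x = "Year", y = "Number of Plays") +
theme_minimal()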
Now let’s summarize touchdowns by field position.
# Summarize the number of plays by yardline and touchdown outcome
touchdown_year_summary <- pbp_data %>%
filter(play_type %in% c("pass", "run", "punt", "kickoff")) %>%
group_by(yardline_100, touchdown) %>%
summarise(count = n(), .groups = 'drop')
# Reshape the data to have yardlines as columns
play_touchdown_year_pivot <- touchdown_year_summary %>%
pivot_wider(names_from = yardline_100, values_from = count, values_fill = 0)
play_touchdown_year_pivot <- play_touchdown_year_pivot %>%
filter(!is.na(touchdown))
print(play_touchdown_year_pivot)
Let’s calculate probabilities
# Calculate the total number of plays and the number of touchdowns for each yardline
probability_data <- play_touchdown_year_pivot %>%
pivot_longer(cols = -touchdown, names_to = "yardline_100", values_to = "count") %>%
mutate(yardline_100 = as.numeric(gsub("X", "", yardline_100))) %>%
group_by(yardline_100) %>%
summarise(
total_plays = sum(count),
touchdowns = sum(count[touchdown == 1]),
.groups = 'drop'
) %>%
mutate(probability_td = touchdowns / total_plays) %>%
select(yardline_100, probability_td)
# Create a new dataframe with probabilities for scoring (1) and not scoring (0)
probability_scores <- probability_data %>%
mutate(probability_no_td = 1 - probability_td) %>%
select(yardline_100, probability_td, probability_no_td)
# View the resulting probability scores
print(probability_scores)
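A quick plot of these empirical rates is a useful sanity check; touchdown probability should fall off roughly monotonically as distance from the end zone grows (a minimal sketch using the probability_scores table built above):
# Sanity check: empirical TD probability by field position
ggplot(probability_scores, aes(x = yardline_100, y = probability_td)) +
geom_line(color = "darkred") +
labs(title = "Empirical Touchdown Probability by Field Position",
x = "Yards from opponent end zone (yardline_100)",
y = "P(touchdown)") +
theme_minimal()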
Let’s join in the probability data
combined_data <- pbp_data %>%
left_join(probability_scores, by = "yardline_100") %>%
mutate(unique_play_id = paste(game_id, play_id, sep = "_"))
# --- 1. Load libraries ---
library(dplyr)
library(xgboost)
library(caret)
library(pROC)
library(Matrix)
model_data <- combined_data %>%
filter(play_type %in% c("run", "pass")) %>%
mutate(
# Basic engineered features
half_seconds_remaining = ifelse(game_half == "Half1", game_seconds_remaining - 1800, game_seconds_remaining),
distance = ydstogo,
score_differential = posteam_score - defteam_score,
# Enhanced pre-snap features
is_red_zone = as.numeric(yardline_100 <= 20),
is_two_minute_drill = as.numeric(game_seconds_remaining <= 120),
is_hurry_up = as.numeric(no_huddle == 1),
yards_from_1st = yardline_100 - ydstogo,
seconds_elapsed = 3600 - game_seconds_remaining,
yardline_log = log1p(yardline_100), # log(yardline + 1)
score_diff_squared = score_differential^2,
# Clean categorical variables (pre-snap known intent, not result)
run_location = ifelse(is.na(run_location), "None", run_location),
pass_length = ifelse(is.na(pass_length), "none", pass_length),
pass_location = ifelse(is.na(pass_location), "none", pass_location),
run_location = as.factor(run_location),
pass_length = as.factor(pass_length),
pass_location = as.factor(pass_location),
# Categorical conversions
play_type = factor(play_type),
posteam_type = factor(posteam_type),
goal_to_go = as.numeric(goal_to_go),
down = as.factor(down),
qtr = as.factor(qtr)
) %>%
select(
touchdown, yardline_100, yardline_log, down, distance, goal_to_go, qtr,
half_seconds_remaining, seconds_elapsed, score_differential, score_diff_squared,
is_red_zone, is_two_minute_drill, is_hurry_up, yards_from_1st,
posteam_type, play_type, unique_play_id, year, no_huddle,
run_location, pass_length, pass_location
) %>%
na.omit()
train_data <- model_data %>% filter(year < 2024)
test_data <- model_data %>% filter(year == 2024)
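Before building the design matrices, a quick sanity check (a minimal sketch) confirms the split sizes and the heavy class imbalance that scale_pos_weight addresses below:
# Check row counts and touchdown rate in each split
nrow(train_data); nrow(test_data)
prop.table(table(train_data$touchdown))
prop.table(table(test_data$touchdown))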
dummies <- dummyVars(touchdown ~ . - year - unique_play_id, data = train_data) # Exclude year and unique_play_id from the dummy encoding
x_train <- predict(dummies, newdata = train_data)
x_test <- predict(dummies, newdata = test_data)
# Create matrices
dtrain <- xgb.DMatrix(data = as.matrix(x_train), label = train_data$touchdown)
dtest <- xgb.DMatrix(data = as.matrix(x_test), label = test_data$touchdown)
ratio <- sum(train_data$touchdown == 0) / sum(train_data$touchdown == 1)
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
booster = "gbtree",
eta = 0.1,
max_depth = 6,
scale_pos_weight = ratio,
subsample = 0.8,
colsample_bytree = 0.8,
base_score = mean(train_data$touchdown)
)
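Note that early stopping below is evaluated against the 2024 holdout, which leaks a little tuning information into the test set. A stricter alternative (a sketch, with the fold count as an assumption) is to pick the number of rounds by cross-validation on the training data only:
# Choose nrounds via 5-fold CV on the training data, then refit with xgb.train()
set.seed(123)
cv <- xgb.cv(
params = params,
data = dtrain,
nrounds = 200,
nfold = 5,
early_stopping_rounds = 10,
verbose = 0
)
cv$best_iteration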
set.seed(123)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 100,
watchlist = list(train = dtrain, eval = dtest),
print_every_n = 10,
early_stopping_rounds = 10,
maximize = TRUE
)
[1] train-auc:0.892262 eval-auc:0.903865
Multiple eval metrics are present. Will use eval_auc for early stopping.
Will train until eval_auc hasn't improved in 10 rounds.
[11] train-auc:0.897399 eval-auc:0.905856
Stopping. Best iteration:
[10] train-auc:0.897314 eval-auc:0.905992
An eval AUC of about 0.906 at the best iteration indicates the model discriminates touchdown plays well.
xTD <- predict(xgb_model, dtest)
pred_touchdowns <- ifelse(xTD > 0.5, 1, 0)
conf_matrix <- confusionMatrix(as.factor(pred_touchdowns), as.factor(test_data$touchdown))
print(conf_matrix)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 30306 314
1 3156 1127
Accuracy : 0.9006
95% CI : (0.8974, 0.9037)
No Information Rate : 0.9587
P-Value [Acc > NIR] : 1
Kappa : 0.3539
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.9057
Specificity : 0.7821
Pos Pred Value : 0.9897
Neg Pred Value : 0.2631
Prevalence : 0.9587
Detection Rate : 0.8683
Detection Prevalence : 0.8773
Balanced Accuracy : 0.8439
'Positive' Class : 0
# ROC / AUC
roc_obj <- roc(test_data$touchdown, xTD)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
auc_val <- auc(roc_obj)
print(paste("AUC:", round(auc_val, 4)))
[1] "AUC: 0.906"
plot(roc_obj, main = "ROC Curve - xTD Model", col = "blue")
Again, we see an AUC of 0.906 on the 2024 holdout, which points to good discrimination. The 90% accuracy, however, sits below the no-information rate of about 95.9% (touchdowns are rare), so AUC and probability calibration are the more meaningful checks here.
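Because caret treated '0' as the positive class above, the reported sensitivity and specificity describe non-touchdowns. Re-running the confusion matrix with the touchdown class marked positive (a small sketch) gives the TD-centric view, where recall on actual touchdowns is the number to watch:
# Treat touchdowns (1) as the positive class so Sensitivity = TD recall
conf_matrix_td <- confusionMatrix(as.factor(pred_touchdowns),
as.factor(test_data$touchdown),
positive = "1")
print(conf_matrix_td)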
Let’s take a look at what features are good predictors of Touchdowns
importance <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance, top_n = 15)
yardline_100
Gain: 0.7174676 → This is the most important feature. It contributes over 71% of the model’s gain.
Interpretation: How close the offense is to the end zone is the strongest predictor of a touchdown. Makes perfect sense.
goal_to_go
Gain: 0.1477172 → Also very influential. Goal-to-go situations increase the chance of scoring.
Interpretation: Whether or not the play is goal-to-go is a major indicator.
play_type.pass
Gain: 0.0603022 → Passing plays add additional information.
Interpretation: The model finds some distinction between pass vs. run when predicting touchdowns.
distance
Gain: 0.0375925 → The distance to a first down also matters (more than the raw down itself).
Interpretation: Shorter distances to convert probably raise TD chances.
play_type.run
Gain: 0.0098352
Interpretation: This adds less than play_type.pass, perhaps because TDs are more likely on passes (or more varied).
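Gain-based importance is a global summary; for a view of how individual feature values push predictions up or down, xgboost’s built-in SHAP dependence plot can be used (a sketch, assuming x_test is the dummy-encoded matrix created earlier):
# SHAP dependence plots for the three highest-impact features
xgb.plot.shap(data = as.matrix(x_test), model = xgb_model, top_n = 3, n_col = 3)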
model_results <- test_data %>%
mutate(
xTDs = xTD,
predicted_touchdown = ifelse(xTD > 0.5, 1, 0)
) %>%
select(unique_play_id, xTDs, predicted_touchdown) %>%
distinct(unique_play_id, .keep_all = TRUE)
# Combine the predictions back into the full dataset
combined_data <- combined_data %>%
left_join(model_results, by = "unique_play_id")
Let’s start summarizing our results and take a look at under- and over-performers for fantasy purposes.
combined_data %>%
filter(!is.na(xTDs), !is.na(touchdown), !is.na(fantasy_player_name)) %>%
filter(play_type %in% c("run", "pass")) %>%
filter(fantasy_player_name == rusher_player_name | fantasy_player_name == receiver_player_name)
xTD_summary <- combined_data %>%
filter(year == 2024) %>%
filter(fantasy_player_name == rusher_player_name | fantasy_player_name == receiver_player_name) %>%
group_by(fantasy_player_name, year) %>%
summarise(
total_touchdowns = sum(touchdown, na.rm = TRUE),
xTDs = sum(xTDs, na.rm = TRUE),
touchdowns_over_expected = total_touchdowns - xTDs,
.groups = "drop"
)
It looks like there’s an issue with our probabilities: the summed xTDs are way off even though the binary TD predictions are largely correct. Let’s use a calibration plot to check probability reliability.
calibration_df <- data.frame(
predicted_prob = xTD,
actual = test_data$touchdown
)
# Bin the predicted probabilities into deciles
calibration_df <- calibration_df %>%
mutate(bin = ntile(predicted_prob, 10)) %>%
group_by(bin) %>%
summarise(
mean_pred = mean(predicted_prob),
mean_actual = mean(actual),
count = n()
)
# Plot calibration curve
ggplot(calibration_df, aes(x = mean_pred, y = mean_actual)) +
geom_line(color = "blue") +
geom_abline(linetype = "dashed", color = "gray") +
labs(title = "Calibration Curve", x = "Mean Predicted Probability", y = "Actual TD Rate") +
theme_minimal()
It looks like even though the model separates TDs well, the predicted probabilities are not reliable (poor calibration). It is systematically overestimating TD likelihood across the board, especially at mid-range probabilities.
Let’s re-calibrate with Platt Scaling (logistic regression on predicted probs)
# Step 1: Fit Platt scaling model
platt_model <- glm(test_data$touchdown ~ xTD, family = binomial)
# Step 2: Predict calibrated probabilities
calibrated_probs <- predict(platt_model, type = "response")
# Step 3: Add to your test data
test_data <- test_data %>%
mutate(calibrated_xTD = calibrated_probs)
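One caveat worth flagging: the Platt model above is fit on the same 2024 plays it then calibrates, so a stricter workflow would fit it on a separate validation season. Either way, the Brier score gives a single-number check that calibration actually improved (a minimal base-R sketch):
# Brier score = mean squared error of the predicted probabilities (lower is better)
brier_raw <- mean((xTD - test_data$touchdown)^2)
brier_calibrated <- mean((test_data$calibrated_xTD - test_data$touchdown)^2)
print(c(raw = brier_raw, calibrated = brier_calibrated))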
Let’s look at the updated calibration curve
calibration_df <- data.frame(
predicted_prob = test_data$calibrated_xTD,
actual = test_data$touchdown
)
# Bin the predicted probabilities into deciles
calibration_df <- calibration_df %>%
mutate(bin = ntile(predicted_prob, 10)) %>%
group_by(bin) %>%
summarise(
mean_pred = mean(predicted_prob),
mean_actual = mean(actual),
count = n()
)
# Plot calibration curve
ggplot(calibration_df, aes(x = mean_pred, y = mean_actual)) +
geom_line(color = "blue") +
geom_abline(linetype = "dashed", color = "gray") +
labs(title = "Calibration Curve", x = "Mean Predicted Probability", y = "Actual TD Rate") +
theme_minimal()
X-axis: the model’s predicted probabilities of a touchdown (xTD).
Y-axis: the actual observed frequency of touchdowns in those prediction bins.
Blue line: the calibration curve, showing how the model’s predictions match reality.
Dashed diagonal line: perfect calibration, where predicted probability equals actual probability.
Key Takeaways:
The blue line closely follows the diagonal, especially from 0.05 to 0.25, which is the typical range for play-level touchdown probabilities.
This suggests the Platt scaling worked well: the model’s predicted probabilities are now trustworthy as actual likelihoods.
The slight deviation at very low probabilities (under 0.05) is common due to sample sparsity there and is not a concern unless decisions hinge on extremely unlikely events.
Now let’s take a look at key under- and over-performers in 2024.
# First, ensure you retain player and year info before splitting
player_info <- combined_data %>%
select(unique_play_id, fantasy_player_name)
# Add the info to the test_data
predicted_data <- test_data %>%
left_join(player_info, by = "unique_play_id")
# Summarize expected and actual touchdowns
xTD_summary <- predicted_data %>%
mutate(xTDs = calibrated_xTD) %>%
group_by(fantasy_player_name, year) %>%
summarise(
total_touchdowns = sum(touchdown, na.rm = TRUE),
xTDs = sum(xTDs, na.rm = TRUE),
touchdowns_over_expected = total_touchdowns - xTDs,
.groups = "drop"
)
xTD_summary <- xTD_summary %>%
filter(!is.na(fantasy_player_name))
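Since ggrepel was loaded earlier, a labeled scatter of actual versus expected touchdowns is a handy way to spot outliers before listing them (a sketch; the 15-player labeling cutoff is arbitrary):
# Plot actual vs. expected TDs and label the largest over/under performers
extremes <- xTD_summary %>%
arrange(desc(abs(touchdowns_over_expected))) %>%
slice_head(n = 15)
ggplot(xTD_summary, aes(x = xTDs, y = total_touchdowns)) +
geom_point(alpha = 0.4) +
geom_abline(linetype = "dashed", color = "gray") +
geom_text_repel(data = extremes, aes(label = fantasy_player_name), size = 3) +
labs(title = "Actual vs. Expected Touchdowns (2024)",
x = "Expected TDs (xTD)", y = "Actual TDs") +
theme_minimal()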
Top 5 Over Performers:
# Top 5 overperformers (actual TDs >> expected TDs)
top_overperformers <- xTD_summary %>%
arrange(desc(touchdowns_over_expected)) %>%
slice_head(n = 5)
# Print results
print("Top 5 Overperformers:")
[1] "Top 5 Overperformers:"
print(top_overperformers)
Top 5 Underperformers
# Top 5 underperformers (actual TDs << expected TDs)
top_underperformers <- xTD_summary %>%
arrange(touchdowns_over_expected) %>%
slice_head(n = 5)
print("Top 5 Underperformers:")
[1] "Top 5 Underperformers:"
print(top_underperformers)
write.csv(xTD_summary, file = "C:\\Users\\james\\R_Data\\NFL\\NFL_expectedTDdata.csv")