Does greater ball possession significantly increase a team’s probability of winning (vs not winning = loss or draw) in professional soccer?
library(tidyverse)
library(readr)
#' Return the first available column among several candidate names.
#'
#' Finds which of `candidates` exist as columns of `df` (keeping the order
#' of `candidates`), starts from the first match, and fills its missing
#' values from the remaining matches in order.
#'
#' @param df A data frame.
#' @param candidates Character vector of column names, in priority order.
#' @param default Value repeated `nrow(df)` times when no candidate matches.
#' @return A vector with one element per row of `df`.
pick_first <- function(df, candidates, default = NA) {
  found <- intersect(candidates, names(df))
  if (length(found) == 0) {
    return(rep(default, nrow(df)))
  }
  result <- df[[found[1]]]
  # Later candidates only fill gaps left by earlier ones; with a single
  # match `found[-1]` is empty and the loop body never runs.
  for (nm in found[-1]) {
    result <- dplyr::coalesce(result, df[[nm]])
  }
  result
}
# Convert possession values such as "55%" into numbers
parse_poss <- function(x) {
  # A NULL input yields a single NA
  if (is.null(x)) {
    return(NA_real_)
  }
  stripped <- gsub("%", "", x, fixed = TRUE)
  # Anything that is still not numeric becomes NA, without a coercion warning
  suppressWarnings(as.numeric(stripped))
}
# Read the match data from a local CSV file (the file is too big to bundle).
# NOTE(review): `params` is not defined in this section; presumably it is the
# R Markdown parameter list from the document's YAML header - confirm.
local_path <- params$local_csv_path
message("Attempting to read CSV from: ", local_path)
## Attempting to read CSV from: full_data.csv
# show_col_types = FALSE suppresses readr's column-specification message.
matches_raw <- read_csv(local_path, show_col_types = FALSE)
# Show available columns for mapping
names(matches_raw)
## [1] "League" "Home" "Away"
## [4] "INC" "Round" "Date"
## [7] "Time" "H_Score" "A_Score"
## [10] "HT_H_Score" "HT_A_Score" "WIN"
## [13] "H_BET" "X_BET" "A_BET"
## [16] "WIN_BET" "OVER_2.5" "OVER_3.5"
## [19] "H_15" "A_15" "H_45_50"
## [22] "A_45_50" "H_90" "A_90"
## [25] "H_Missing_Players" "A_Missing_Players" "Missing_Players"
## [28] "H_Ball_Possession" "A_Ball_Possession" "H_Goal_Attempts"
## [31] "A_Goal_Attempts" "H_Shots_on_Goal" "A_Shots_on_Goal"
## [34] "H_Attacks" "A_Attacks" "H_Dangerous_Attacks"
## [37] "A_Dangerous_Attacks" "H_Shots_off_Goal" "A_Shots_off_Goal"
## [40] "H_Blocked_Shots" "A_Blocked_Shots" "H_Free_Kicks"
## [43] "A_Free_Kicks" "H_Corner_Kicks" "A_Corner_Kicks"
## [46] "H_Offsides" "A_Offsides" "H_Throw_in"
## [49] "A_Throw_in" "H_Goalkeeper_Saves" "A_Goalkeeper_Saves"
## [52] "H_Fouls" "A_Fouls" "H_Yellow_Cards"
## [55] "A_Yellow_Cards" "Game Link"
# Preview every column's type and first values to see which fields are populated
glimpse(matches_raw, width = 120)
## Rows: 96,337
## Columns: 56
## $ League <chr> "championship", "championship", "championship", "championship", "championship", "champions…
## $ Home <chr> "Swansea", "Cardiff", "Swansea", "Reading", "Nottingham", "Barnsley", "Bristol City", "Bur…
## $ Away <chr> "Reading", "Reading", "Nottingham", "Cardiff", "Swansea", "Millwall", "Hull", "Cardiff", "…
## $ INC <chr> "[\"08' Yellow_Away - Griffin A.\", \"12' Yellow_Away - Khizanishvili Z.\", \"12' Yellow_H…
## $ Round <chr> "Play-off", "Play-off", "Play-off", "Play-off", "Play-off", "46", "46", "46", "46", "46", …
## $ Date <chr> "30.05.2011", "17.05.2011", "16.05.2011", "13.05.2011", "12.05.2011", "07.05.2011", "07.05…
## $ Time <time> 16:00:00, 20:45:00, 20:45:00, 20:45:00, 20:45:00, 13:45:00, 13:45:00, 13:45:00, 13:45:00,…
## $ H_Score <dbl> 4, 0, 3, 0, 0, 1, 3, 1, 0, 4, 3, 2, 3, 1, 2, 1, 4, 0, 0, 0, 0, 1, 1, 2, 0, 5, 2, 0, 1, 2, …
## $ A_Score <dbl> 2, 3, 1, 0, 0, 0, 0, 1, 3, 2, 0, 2, 1, 2, 1, 1, 0, 1, 3, 0, 2, 1, 1, 1, 2, 1, 2, 2, 0, 2, …
## $ HT_H_Score <dbl> 3, 0, 2, 0, 0, 0, 2, 1, 0, 3, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 1, 0, 1, 0, …
## $ HT_A_Score <dbl> 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 3, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ WIN <chr> "Home", "Away", "Home", "Draw", "Draw", "Home", "Home", "Draw", "Away", "Home", "Home", "D…
## $ H_BET <dbl> 2.15, 2.10, 1.70, 1.91, 2.00, 2.15, 2.00, 2.15, 2.95, 1.58, 1.52, 1.38, 2.35, 1.66, 1.35, …
## $ X_BET <dbl> 2.95, 3.00, 3.10, 3.10, 2.90, 2.90, 2.90, 2.95, 2.90, 3.05, 2.95, 3.40, 2.90, 3.15, 3.40, …
## $ A_BET <dbl> 2.40, 2.95, 4.20, 3.30, 2.70, 2.45, 2.70, 2.40, 1.90, 3.85, 4.30, 5.25, 2.25, 3.35, 5.40, …
## $ WIN_BET <dbl> 2.15, 2.95, 1.70, 3.10, 2.90, 2.15, 2.00, 2.95, 1.90, 1.58, 1.52, 3.40, 2.35, 3.35, 1.35, …
## $ OVER_2.5 <lgl> TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
## $ OVER_3.5 <lgl> TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALS…
## $ H_15 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, T…
## $ A_15 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, …
## $ H_45_50 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, …
## $ A_45_50 <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ H_90 <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, F…
## $ A_90 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ H_Missing_Players <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Missing_Players <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Missing_Players <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Ball_Possession <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Ball_Possession <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Goal_Attempts <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Goal_Attempts <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Shots_on_Goal <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Shots_on_Goal <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Attacks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Attacks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Dangerous_Attacks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Dangerous_Attacks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Shots_off_Goal <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Shots_off_Goal <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Blocked_Shots <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Blocked_Shots <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Free_Kicks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Free_Kicks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Corner_Kicks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Corner_Kicks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Offsides <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Offsides <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Throw_in <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Throw_in <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Goalkeeper_Saves <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Goalkeeper_Saves <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Fouls <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Fouls <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Yellow_Cards <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Yellow_Cards <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ `Game Link` <chr> "https://www.flashscore.com/match/feOBWmKD/#/match-summary/match-summary", "https://www.fl…
# Standardize the raw columns into a fixed set of names and types.
# pick_first() accepts alternative column spellings; parse_poss() turns
# possession strings such as "55%" into numbers.
std <- matches_raw %>%
  mutate(
    # Teams
    home_team = pick_first(
      .,
      c("Home", "home_team", "HomeTeam", "home", "HomeTeamName"),
      default = NA_character_
    ),
    away_team = pick_first(
      .,
      c("Away", "away_team", "AwayTeam", "away", "AwayTeamName"),
      default = NA_character_
    ),
    # Goals (numeric); values that cannot be coerced become NA silently
    home_goals = suppressWarnings(as.numeric(pick_first(
      .,
      c("H_Score", "home_goals", "FTHG", "home_score", "HomeGoals",
        "home_team_goals"),
      default = NA_real_
    ))),
    away_goals = suppressWarnings(as.numeric(pick_first(
      .,
      c("A_Score", "away_goals", "FTAG", "away_score", "AwayGoals",
        "away_team_goals"),
      default = NA_real_
    ))),
    # Possession (parse % -> numeric)
    home_poss_chr = pick_first(
      .,
      c("H_Ball_Possession", "home_possession", "HomePossession"),
      default = NA_character_
    ),
    away_poss_chr = pick_first(
      .,
      c("A_Ball_Possession", "away_possession", "AwayPossession"),
      default = NA_character_
    ),
    home_poss = parse_poss(home_poss_chr),
    away_poss = parse_poss(away_poss_chr),
    # Outcome label, used later when goals are missing
    win_label = pick_first(., c("WIN", "Result"), default = NA_character_)
  ) %>%
  mutate(
    # If only one side's possession is recorded, infer the other as its
    # complement to 100. coalesce() keeps the result a double in every
    # case, whereas nested ifelse() returns a logical vector on empty input.
    home_poss = dplyr::coalesce(home_poss, 100 - away_poss),
    away_poss = dplyr::coalesce(away_poss, 100 - home_poss)
  )
# Build team-match rows: each match yields one row from the home side's
# point of view and one from the away side's.
home_rows <- std %>%
  transmute(
    team = home_team,
    opponent = away_team,
    team_goals = home_goals,
    opp_goals = away_goals,
    team_poss = home_poss,
    opp_poss = away_poss,
    win_label,
    is_home = 1L
  )
away_rows <- std %>%
  transmute(
    team = away_team,
    opponent = home_team,
    team_goals = away_goals,
    opp_goals = home_goals,
    team_poss = away_poss,
    opp_poss = home_poss,
    win_label,
    is_home = 0L
  )
teams_df <- bind_rows(home_rows, away_rows) %>%
  mutate(
    # Primary outcome from the score: 1 = win, 0 = draw or loss. The
    # comparison is NA whenever either score is missing.
    team_win_bin_goals = as.integer(team_goals > opp_goals),
    # Fallback from the WIN label, filled only where goals gave no outcome
    team_win_bin_fallback = dplyr::case_when(
      !is.na(team_win_bin_goals) ~ NA_integer_,
      (win_label == "Home" & is_home == 1L) |
        (win_label == "Away" & is_home == 0L) ~ 1L,
      win_label %in% c("Home", "Away", "Draw") ~ 0L,
      TRUE ~ NA_integer_
    ),
    team_win_bin = dplyr::coalesce(team_win_bin_goals, team_win_bin_fallback),
    poss_diff = team_poss - opp_poss
  )
# Count how many team-match rows remain usable after the column mapping
diag_counts <- teams_df %>%
  summarise(
    n = n(),
    n_win_na = sum(is.na(team_win_bin)),
    n_poss_na = sum(is.na(team_poss)),
    # A row is complete when neither the outcome nor possession is missing
    complete_cases = sum(!(is.na(team_win_bin) | is.na(team_poss)))
  )
diag_counts
## # A tibble: 1 × 4
## n n_win_na n_poss_na complete_cases
## <int> <int> <int> <int>
## 1 192674 64 88900 103738
# Preview the stacked team-match table (one home row and one away row per match)
glimpse(teams_df, width = 120)
## Rows: 192,674
## Columns: 12
## $ team <chr> "Swansea", "Cardiff", "Swansea", "Reading", "Nottingham", "Barnsley", "Bristol City", "B…
## $ opponent <chr> "Reading", "Reading", "Nottingham", "Cardiff", "Swansea", "Millwall", "Hull", "Cardiff",…
## $ team_goals <dbl> 4, 0, 3, 0, 0, 1, 3, 1, 0, 4, 3, 2, 3, 1, 2, 1, 4, 0, 0, 0, 0, 1, 1, 2, 0, 5, 2, 0, 1, 2…
## $ opp_goals <dbl> 2, 3, 1, 0, 0, 0, 0, 1, 3, 2, 0, 2, 1, 2, 1, 1, 0, 1, 3, 0, 2, 1, 1, 1, 2, 1, 2, 2, 0, 2…
## $ team_poss <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ opp_poss <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ win_label <chr> "Home", "Away", "Home", "Draw", "Draw", "Home", "Home", "Draw", "Away", "Home", "Home", …
## $ is_home <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ team_win_bin_goals <int> 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0…
## $ team_win_bin_fallback <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ team_win_bin <int> 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0…
## $ poss_diff <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
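What are the cases, and how many are there? Each row of the raw dataset represents a single soccer match. The dataset includes over 96,000 matches (96,337 rows) from 18 European soccer leagues (10 countries) from 2011 to 2021. For the analysis, each match is split into two team-level rows (one for the home team and one for the away team), giving 192,674 team-match observations, of which 103,738 have both a recorded outcome and a possession value.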
Describe the method of data collection
The dataset was collected by Kaggle user Sebastian Gębala. The data were scraped from a live-score statistics website. They include match statistics such as goals, shots, attacks, possession, corners, etc.
What type of study is this (observational/experiment)? This is an observational study, since no experimental manipulation was conducted.
If you collected the data, state self-collected. If not, provide a citation/link. Kaggle - Football DataSet +96k matches (18 leagues): https://www.kaggle.com/datasets/bastekforever/complete-football-data-89000-matches-18-leagues
Data was directly downloaded from Kaggle.
Gębala, S. (2023). Football DataSet +96k matches (18 leagues). Kaggle. Retrieved from the dataset page.
The response variable is whether the team won, coded as a binary variable where 1 represents winning a game and 0 represents not winning (a loss or a draw).
Team ball possession is a numeric variable that will be the key predictor. Other variables, such as shots, pass accuracy, home vs away, will be used as optional controls.
# Frequency of the binary outcome; useNA = "ifany" also counts missing outcomes
table(teams_df$team_win_bin, useNA = "ifany")
##
## 0 1 <NA>
## 122878 69732 64
# Summary statistics of team possession, including the number of NA values
summary(teams_df$team_poss)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10 44 50 50 56 90 88900
# Boxplot comparing team possession between wins and non-wins
teams_df %>%
  filter(!(is.na(team_poss) | is.na(team_win_bin))) %>%
  mutate(outcome = ifelse(team_win_bin == 1, "Win", "Not-Win")) %>%
  ggplot(mapping = aes(x = outcome, y = team_poss, fill = outcome)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
  labs(
    title = "Team Possession by Outcome",
    x = "Outcome",
    y = "Team Possession (%)"
  ) +
  theme_minimal()
# Cross-tabulate possession bands against the match outcome.
# cut() builds right-closed intervals by default, so the bands are
# (-Inf, 40], (40, 50], (50, 60] and (60, Inf). The labels state which side
# of each boundary is included: exactly 40% belongs to the lowest band and
# exactly 50% to the second, so "<40%" would misdescribe the first band.
teams_binned <- teams_df %>%
  filter(!is.na(team_poss), !is.na(team_win_bin)) %>%
  mutate(
    poss_bin = cut(
      team_poss,
      breaks = c(-Inf, 40, 50, 60, Inf),
      labels = c("≤40%", ">40–50%", ">50–60%", ">60%")
    ),
    outcome = ifelse(team_win_bin == 1, "Win", "Not-Win")
  )
tab <- table(teams_binned$poss_bin, teams_binned$outcome, useNA = "ifany")
tab
##
## Not-Win Win
## ≤40% 9999 5169
## >40–50% 24996 14069
## >50–60% 22902 13730
## >60% 7756 5117
# Rows usable for modelling: both possession and outcome are known
complete_df <- teams_df %>%
  filter(!(is.na(team_poss) | is.na(team_win_bin))) %>%
  mutate(team_win_bin = as.integer(team_win_bin))
# Diagnostics that decide whether a model can be fitted
n_obs <- nrow(complete_df)
n_classes <- dplyr::n_distinct(complete_df$team_win_bin)
sd_poss <- sd(complete_df$team_poss, na.rm = TRUE)
cat(
  "Obs available for glm:", n_obs,
  "| Classes:", n_classes,
  "| sd(team_poss):", sd_poss,
  "\n"
)
## Obs available for glm: 103738 | Classes: 2 | sd(team_poss): 9.201833
# Fit the logistic regression of winning on standardized possession, but only
# when the data can support it: at least 50 rows, both outcome classes
# present, and possession with finite, non-zero spread.
# NOTE(review): teams_df stacks a home row and an away row for every match, so
# the two rows of a match are not independent; the reported standard errors
# may be understated - confirm whether clustering by match is needed.
if (n_obs < 50 || n_classes <= 1 || !is.finite(sd_poss) || sd_poss <= 0) {
  cat("Skipping glm: not enough usable data yet (need >=50 rows, >1 class, and nonzero variance in possession).\n")
  cat("If many values are still NA, verify that the possession columns are populated in your CSV.\n")
} else {
  mdl <- glm(
    team_win_bin ~ scale(team_poss),
    data = complete_df,
    family = binomial()
  )
  print(summary(mdl))
}
##
## Call:
## glm(formula = team_win_bin ~ scale(team_poss), family = binomial(),
## data = complete_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.545441 0.006448 -84.6 <2e-16 ***
## scale(team_poss) 0.081342 0.006456 12.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 136397 on 103737 degrees of freedom
## Residual deviance: 136237 on 103736 degrees of freedom
## AIC: 136241
##
## Number of Fisher Scoring iterations: 4