Research question

Does greater ball possession significantly increase a team’s probability of winning (vs not winning = loss or draw) in professional soccer?

Data Preparation

library(tidyverse)
library(readr)

# Return the first available column among `candidates`, filling gaps from
# later candidates.
#
# df:         data frame to search.
# candidates: column names in priority order; names absent from `df` are
#             ignored.
# default:    value repeated once per row when none of the candidates exist.
#
# Returns a vector with one element per row of `df`.
pick_first <- function(df, candidates, default = NA) {
  present <- intersect(candidates, names(df))
  if (length(present) == 0) {
    return(rep(default, nrow(df)))
  }
  # Fold left to right so earlier candidates win and later ones only fill NAs.
  Reduce(
    function(acc, col_name) dplyr::coalesce(acc, df[[col_name]]),
    present[-1],
    df[[present[1]]]
  )
}

# Convert possession values such as "55%" to numbers.
# Percent signs are stripped before conversion; entries that still cannot be
# read as a number become NA (conversion warnings are suppressed).
# A NULL input yields a single NA_real_.
parse_poss <- function(x) {
  if (!is.null(x)) {
    without_pct <- gsub("%", "", x)
    suppressWarnings(as.numeric(without_pct))
  } else {
    NA_real_
  }
}
# The CSV is too large to ship with the document, so it is read from the
# local path supplied through the document parameters.
local_path <- params$local_csv_path
message("Attempting to read CSV from: ", local_path)
## Attempting to read CSV from: full_data.csv
matches_raw <- read_csv(file = local_path, show_col_types = FALSE)

# List the raw column names so they can be mapped to standard names below
colnames(matches_raw)
##  [1] "League"              "Home"                "Away"               
##  [4] "INC"                 "Round"               "Date"               
##  [7] "Time"                "H_Score"             "A_Score"            
## [10] "HT_H_Score"          "HT_A_Score"          "WIN"                
## [13] "H_BET"               "X_BET"               "A_BET"              
## [16] "WIN_BET"             "OVER_2.5"            "OVER_3.5"           
## [19] "H_15"                "A_15"                "H_45_50"            
## [22] "A_45_50"             "H_90"                "A_90"               
## [25] "H_Missing_Players"   "A_Missing_Players"   "Missing_Players"    
## [28] "H_Ball_Possession"   "A_Ball_Possession"   "H_Goal_Attempts"    
## [31] "A_Goal_Attempts"     "H_Shots_on_Goal"     "A_Shots_on_Goal"    
## [34] "H_Attacks"           "A_Attacks"           "H_Dangerous_Attacks"
## [37] "A_Dangerous_Attacks" "H_Shots_off_Goal"    "A_Shots_off_Goal"   
## [40] "H_Blocked_Shots"     "A_Blocked_Shots"     "H_Free_Kicks"       
## [43] "A_Free_Kicks"        "H_Corner_Kicks"      "A_Corner_Kicks"     
## [46] "H_Offsides"          "A_Offsides"          "H_Throw_in"         
## [49] "A_Throw_in"          "H_Goalkeeper_Saves"  "A_Goalkeeper_Saves" 
## [52] "H_Fouls"             "A_Fouls"             "H_Yellow_Cards"     
## [55] "A_Yellow_Cards"      "Game Link"
# Preview each raw column's type and its leading values
glimpse(matches_raw, width = 120)
## Rows: 96,337
## Columns: 56
## $ League              <chr> "championship", "championship", "championship", "championship", "championship", "champions…
## $ Home                <chr> "Swansea", "Cardiff", "Swansea", "Reading", "Nottingham", "Barnsley", "Bristol City", "Bur…
## $ Away                <chr> "Reading", "Reading", "Nottingham", "Cardiff", "Swansea", "Millwall", "Hull", "Cardiff", "…
## $ INC                 <chr> "[\"08' Yellow_Away - Griffin A.\", \"12' Yellow_Away - Khizanishvili Z.\", \"12' Yellow_H…
## $ Round               <chr> "Play-off", "Play-off", "Play-off", "Play-off", "Play-off", "46", "46", "46", "46", "46", …
## $ Date                <chr> "30.05.2011", "17.05.2011", "16.05.2011", "13.05.2011", "12.05.2011", "07.05.2011", "07.05…
## $ Time                <time> 16:00:00, 20:45:00, 20:45:00, 20:45:00, 20:45:00, 13:45:00, 13:45:00, 13:45:00, 13:45:00,…
## $ H_Score             <dbl> 4, 0, 3, 0, 0, 1, 3, 1, 0, 4, 3, 2, 3, 1, 2, 1, 4, 0, 0, 0, 0, 1, 1, 2, 0, 5, 2, 0, 1, 2, …
## $ A_Score             <dbl> 2, 3, 1, 0, 0, 0, 0, 1, 3, 2, 0, 2, 1, 2, 1, 1, 0, 1, 3, 0, 2, 1, 1, 1, 2, 1, 2, 2, 0, 2, …
## $ HT_H_Score          <dbl> 3, 0, 2, 0, 0, 0, 2, 1, 0, 3, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 1, 0, 1, 0, …
## $ HT_A_Score          <dbl> 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 3, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ WIN                 <chr> "Home", "Away", "Home", "Draw", "Draw", "Home", "Home", "Draw", "Away", "Home", "Home", "D…
## $ H_BET               <dbl> 2.15, 2.10, 1.70, 1.91, 2.00, 2.15, 2.00, 2.15, 2.95, 1.58, 1.52, 1.38, 2.35, 1.66, 1.35, …
## $ X_BET               <dbl> 2.95, 3.00, 3.10, 3.10, 2.90, 2.90, 2.90, 2.95, 2.90, 3.05, 2.95, 3.40, 2.90, 3.15, 3.40, …
## $ A_BET               <dbl> 2.40, 2.95, 4.20, 3.30, 2.70, 2.45, 2.70, 2.40, 1.90, 3.85, 4.30, 5.25, 2.25, 3.35, 5.40, …
## $ WIN_BET             <dbl> 2.15, 2.95, 1.70, 3.10, 2.90, 2.15, 2.00, 2.95, 1.90, 1.58, 1.52, 3.40, 2.35, 3.35, 1.35, …
## $ OVER_2.5            <lgl> TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
## $ OVER_3.5            <lgl> TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALS…
## $ H_15                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, T…
## $ A_15                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, …
## $ H_45_50             <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, …
## $ A_45_50             <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ H_90                <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, F…
## $ A_90                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ H_Missing_Players   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Missing_Players   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Missing_Players     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Ball_Possession   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Ball_Possession   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Goal_Attempts     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Goal_Attempts     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Shots_on_Goal     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Shots_on_Goal     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Attacks           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Attacks           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Dangerous_Attacks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Dangerous_Attacks <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Shots_off_Goal    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Shots_off_Goal    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Blocked_Shots     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Blocked_Shots     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Free_Kicks        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Free_Kicks        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Corner_Kicks      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Corner_Kicks      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Offsides          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Offsides          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Throw_in          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Throw_in          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Goalkeeper_Saves  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Goalkeeper_Saves  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Fouls             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Fouls             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ H_Yellow_Cards      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ A_Yellow_Cards      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ `Game Link`         <chr> "https://www.flashscore.com/match/feOBWmKD/#/match-summary/match-summary", "https://www.fl…
# Map the raw columns onto standardized names. Each field is looked up under
# several candidate column names, in priority order, via pick_first().
std <- mutate(
  matches_raw,
  # Teams
  home_team = pick_first(
    matches_raw,
    c("Home", "home_team", "HomeTeam", "home", "HomeTeamName"),
    default = NA_character_
  ),
  away_team = pick_first(
    matches_raw,
    c("Away", "away_team", "AwayTeam", "away", "AwayTeamName"),
    default = NA_character_
  ),
  # Goals, coerced to numeric (values that cannot be converted become NA)
  home_goals = suppressWarnings(as.numeric(pick_first(
    matches_raw,
    c("H_Score", "home_goals", "FTHG", "home_score", "HomeGoals",
      "home_team_goals"),
    default = NA_real_
  ))),
  away_goals = suppressWarnings(as.numeric(pick_first(
    matches_raw,
    c("A_Score", "away_goals", "FTAG", "away_score", "AwayGoals",
      "away_team_goals"),
    default = NA_real_
  ))),
  # Possession: keep the raw text, then parse "55%" -> 55
  home_poss_chr = pick_first(
    matches_raw,
    c("H_Ball_Possession", "home_possession", "HomePossession"),
    default = NA_character_
  ),
  away_poss_chr = pick_first(
    matches_raw,
    c("A_Ball_Possession", "away_possession", "AwayPossession"),
    default = NA_character_
  ),
  home_poss = parse_poss(home_poss_chr),
  away_poss = parse_poss(away_poss_chr),
  # Result label, used later when the score is missing
  win_label = pick_first(matches_raw, c("WIN", "Result"),
                         default = NA_character_)
) %>%
  mutate(
    # When only one side's possession is known, derive the other as
    # 100 minus it. home_poss is filled first, so the away_poss step sees
    # the already-filled home value.
    home_poss = case_when(
      !is.na(home_poss) ~ home_poss,
      !is.na(away_poss) ~ 100 - away_poss,
      TRUE ~ NA_real_
    ),
    away_poss = case_when(
      !is.na(away_poss) ~ away_poss,
      !is.na(home_poss) ~ 100 - home_poss,
      TRUE ~ NA_real_
    )
  )

# Reshape to one row per team per match: each match is viewed once from the
# home side (is_home = 1) and once from the away side (is_home = 0).
home_rows <- std %>%
  select(
    team = home_team,
    opponent = away_team,
    team_goals = home_goals,
    opp_goals = away_goals,
    team_poss = home_poss,
    opp_poss = away_poss,
    win_label
  ) %>%
  mutate(is_home = 1L)

away_rows <- std %>%
  select(
    team = away_team,
    opponent = home_team,
    team_goals = away_goals,
    opp_goals = home_goals,
    team_poss = away_poss,
    opp_poss = home_poss,
    win_label
  ) %>%
  mutate(is_home = 0L)

# Stack the home and away perspectives and derive the binary outcome
teams_df <- bind_rows(home_rows, away_rows) %>%
  mutate(
    # Outcome from the score: 1 = win, 0 = draw or loss. The comparison is NA
    # whenever either score is missing, and as.integer() keeps that NA.
    team_win_bin_goals = as.integer(team_goals > opp_goals),
    # Fallback from the WIN label, only for rows without a score-based outcome
    team_win_bin_fallback = dplyr::case_when(
      !is.na(team_win_bin_goals) ~ NA_integer_,
      win_label == "Home" & is_home == 1L ~ 1L,
      win_label == "Away" & is_home == 0L ~ 1L,
      win_label %in% c("Home", "Away", "Draw") ~ 0L,
      TRUE ~ NA_integer_
    ),
    # Prefer the score-based outcome; use the label-based one otherwise
    team_win_bin = dplyr::coalesce(team_win_bin_goals, team_win_bin_fallback),
    # Possession margin over the opponent, in percentage points
    poss_diff = team_poss - opp_poss
  )

# Count the rows that remain usable once the columns have been mapped:
# total rows, rows missing the outcome, rows missing possession, and rows
# with both present.
diag_counts <- teams_df %>%
  summarise(
    n = n(),
    n_win_na = sum(is.na(team_win_bin)),
    n_poss_na = sum(is.na(team_poss)),
    complete_cases = sum(!(is.na(team_win_bin) | is.na(team_poss)))
  )
diag_counts
## # A tibble: 1 × 4
##        n n_win_na n_poss_na complete_cases
##    <int>    <int>     <int>          <int>
## 1 192674       64     88900         103738
glimpse(teams_df, width = 120)
## Rows: 192,674
## Columns: 12
## $ team                  <chr> "Swansea", "Cardiff", "Swansea", "Reading", "Nottingham", "Barnsley", "Bristol City", "B…
## $ opponent              <chr> "Reading", "Reading", "Nottingham", "Cardiff", "Swansea", "Millwall", "Hull", "Cardiff",…
## $ team_goals            <dbl> 4, 0, 3, 0, 0, 1, 3, 1, 0, 4, 3, 2, 3, 1, 2, 1, 4, 0, 0, 0, 0, 1, 1, 2, 0, 5, 2, 0, 1, 2…
## $ opp_goals             <dbl> 2, 3, 1, 0, 0, 0, 0, 1, 3, 2, 0, 2, 1, 2, 1, 1, 0, 1, 3, 0, 2, 1, 1, 1, 2, 1, 2, 2, 0, 2…
## $ team_poss             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ opp_poss              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ win_label             <chr> "Home", "Away", "Home", "Draw", "Draw", "Home", "Home", "Draw", "Away", "Home", "Home", …
## $ is_home               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ team_win_bin_goals    <int> 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0…
## $ team_win_bin_fallback <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ team_win_bin          <int> 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0…
## $ poss_diff             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

Cases

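What are the cases, and how many are there? In the raw data, each case is a single soccer match: 96,337 matches from 18 European soccer leagues (10 countries) played between 2011 and 2021. For the analysis, each match is reshaped into two team-match rows (one for the home team and one for the away team), giving 192,674 rows, of which 103,738 have both an outcome and a possession value.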

Data Collection

Describe the method of data collection

The dataset was collected by Kaggle user Sebastian Gębala. The data were scraped from a livescore statistics website. They include match statistics such as goals, shots, attacks, possession, and corners.

Type of study

What type of study is this (observational/experiment)? This is an observational study since no experimental manipulation has been conducted.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link. Kaggle - Football DataSet +96k matches (18 leagues): https://www.kaggle.com/datasets/bastekforever/complete-football-data-89000-matches-18-leagues

Data was directly downloaded from Kaggle.

Gębala, S. (2023). Football DataSet +96k matches (18 leagues). Kaggle. Retrieved from the dataset page.

Response Variable

The response variable is whether the team won the match: a binary variable where 1 represents a win and 0 represents not winning (a loss or a draw).

Explanatory

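Team ball possession (in percent) is a numeric variable and will be the key predictor. Other variables, such as shots and home vs. away, may be used as optional controls. Pass accuracy was also considered as a control, but no pass-accuracy column appears in the dataset's column list, so it would have to come from another source.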

Relevant summary statistics

# Outcome counts (0 = not win, 1 = win), including rows with no outcome
table(teams_df[["team_win_bin"]], useNA = "ifany")
## 
##      0      1   <NA> 
## 122878  69732     64
# Distribution of team possession (%)
summary(teams_df[["team_poss"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      10      44      50      50      56      90   88900
# Boxplot of possession by outcome, using only rows where both are known
teams_df %>%
  filter(!(is.na(team_poss) | is.na(team_win_bin))) %>%
  mutate(outcome = if_else(team_win_bin == 1, "Win", "Not-Win")) %>%
  ggplot(aes(x = outcome, y = team_poss, fill = outcome)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
  labs(
    title = "Team Possession by Outcome",
    x = "Outcome",
    y = "Team Possession (%)"
  ) +
  theme_minimal()

# Bin team possession and cross-tabulate the bins against the outcome.
# Only rows with both possession and outcome are used.
#
# cut() closes intervals on the right by default, so the bins are
# (-Inf, 40], (40, 50], (50, 60] and (60, Inf): each bin includes its upper
# bound and excludes its lower bound. The first bin therefore contains
# exactly 40% and is labelled "≤40%" rather than "<40%".
teams_binned <- teams_df %>%
  filter(!is.na(team_poss), !is.na(team_win_bin)) %>%
  mutate(
    poss_bin = cut(
      team_poss,
      breaks = c(-Inf, 40, 50, 60, Inf),
      labels = c("≤40%", "40–50%", "50–60%", ">60%")
    ),
    outcome = ifelse(team_win_bin == 1, "Win", "Not-Win")
  )

tab <- table(teams_binned$poss_bin, teams_binned$outcome, useNA = "ifany")
tab
##         
##          Not-Win   Win
##   ≤40%      9999  5169
##   40–50%   24996 14069
##   50–60%   22902 13730
##   >60%      7756  5117
# Keep only rows where both the outcome and possession are known
complete_df <- teams_df %>%
  filter(!(is.na(team_poss) | is.na(team_win_bin))) %>%
  mutate(team_win_bin = as.integer(team_win_bin))

# Sanity figures checked before fitting: sample size, number of distinct
# outcome values, and spread of the predictor.
n_obs <- nrow(complete_df)
n_classes <- length(unique(complete_df$team_win_bin))
sd_poss <- sd(complete_df$team_poss, na.rm = TRUE)

cat("Obs available for glm:", n_obs, "| Classes:", n_classes, "| sd(team_poss):", sd_poss, "\n")
## Obs available for glm: 103738 | Classes: 2 | sd(team_poss): 9.201833
# Fit only when there are at least 50 rows, more than one outcome class, and
# a finite, non-zero spread in possession; otherwise explain why it is skipped.
# NOTE(review): teams_df stacks a home row and an away row for every match,
# so the two rows of a match are presumably not independent and the standard
# errors below may be understated -- confirm, and consider one row per match
# or cluster-robust standard errors.
if (n_obs < 50 || n_classes <= 1 || !is.finite(sd_poss) || sd_poss <= 0) {
  cat("Skipping glm: not enough usable data yet (need >=50 rows, >1 class, and nonzero variance in possession).\n")
  cat("If many values are still NA, verify that the possession columns are populated in your CSV.\n")
} else {
  # Logistic regression of winning on standardized possession
  mdl <- glm(team_win_bin ~ scale(team_poss),
             data = complete_df,
             family = binomial())
  print(summary(mdl))
}
## 
## Call:
## glm(formula = team_win_bin ~ scale(team_poss), family = binomial(), 
##     data = complete_df)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.545441   0.006448   -84.6   <2e-16 ***
## scale(team_poss)  0.081342   0.006456    12.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 136397  on 103737  degrees of freedom
## Residual deviance: 136237  on 103736  degrees of freedom
## AIC: 136241
## 
## Number of Fisher Scoring iterations: 4