library(dplyr)
library(tidyr)
library(readr)
library(stringr)
library(arules)
library(arulesViz)
library(ggplot2)

Introduction

This project applies association rule mining to English Premier League match data. The goal is to discover interpretable patterns in match statistics that are strongly associated with a home win. We treat each match as a “transaction” and convert numerical match features (e.g., shots, corners, cards) into categorical items (high/low/advantage). Rules are evaluated using support, confidence, and lift.

Data

The dataset comes from football-data.co.uk (Premier League) and is provided as a CSV file. We use basic full-time match statistics and the full-time result. We keep the following columns:

FTR (full time result: H/D/A)

FTHG, FTAG (goals)

HS, AS (shots)

HC, AC (corners)

HY, AY (yellow cards)

# Read the season's match file (column-type message suppressed) and preview
# the structure of every column.
df <- read_csv(file = "E0.csv", show_col_types = FALSE)
df %>% glimpse()
## Rows: 380
## Columns: 120
## $ Div         <chr> "E0", "E0", "E0", "E0", "E0", "E0", "E0", "E0", "E0", "E0"…
## $ Date        <chr> "16/08/2024", "17/08/2024", "17/08/2024", "17/08/2024", "1…
## $ Time        <time> 20:00:00, 12:30:00, 15:00:00, 15:00:00, 15:00:00, 15:00:0…
## $ HomeTeam    <chr> "Man United", "Ipswich", "Arsenal", "Everton", "Newcastle"…
## $ AwayTeam    <chr> "Fulham", "Liverpool", "Wolves", "Brighton", "Southampton"…
## $ FTHG        <dbl> 1, 0, 2, 0, 1, 1, 1, 2, 0, 1, 2, 0, 2, 4, 0, 4, 0, 1, 2, 2…
## $ FTAG        <dbl> 0, 2, 0, 3, 0, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 0, 2, 1, 6, 0…
## $ FTR         <chr> "H", "A", "H", "A", "H", "D", "A", "H", "A", "D", "H", "A"…
## $ HTHG        <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 3, 0, 2, 0, 1, 2, 1…
## $ HTAG        <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0…
## $ HTR         <chr> "D", "D", "H", "A", "H", "H", "D", "H", "A", "A", "H", "D"…
## $ Referee     <chr> "R Jones", "T Robinson", "J Gillett", "S Hooper", "C Pawso…
## $ HS          <dbl> 14, 7, 18, 9, 3, 14, 14, 9, 10, 7, 14, 14, 18, 14, 5, 13, …
## $ AS          <dbl> 10, 18, 9, 10, 19, 13, 15, 14, 11, 15, 11, 18, 10, 1, 23, …
## $ HST         <dbl> 5, 2, 6, 1, 1, 8, 3, 5, 3, 3, 5, 2, 6, 5, 1, 7, 3, 4, 4, 8…
## $ AST         <dbl> 2, 5, 3, 5, 4, 4, 3, 6, 5, 7, 4, 3, 4, 1, 8, 1, 4, 5, 8, 2…
## $ HF          <dbl> 12, 9, 17, 8, 15, 17, 18, 6, 12, 11, 9, 9, 14, 4, 14, 11, …
## $ AF          <dbl> 10, 18, 14, 8, 16, 8, 11, 15, 9, 12, 13, 17, 13, 15, 14, 1…
## $ HC          <dbl> 7, 2, 8, 1, 3, 2, 5, 4, 4, 2, 4, 3, 7, 10, 4, 12, 4, 8, 5,…
## $ AC          <dbl> 8, 10, 2, 5, 12, 6, 3, 7, 3, 13, 4, 3, 5, 1, 10, 5, 1, 9, …
## $ HY          <dbl> 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 0, 1, 2, 2, 2…
## $ AY          <dbl> 3, 1, 2, 1, 4, 3, 2, 5, 1, 1, 2, 1, 2, 3, 3, 0, 3, 2, 3, 3…
## $ HR          <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AR          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ B365H       <dbl> 1.60, 8.50, 1.18, 2.63, 1.36, 2.45, 2.45, 2.40, 4.10, 5.00…
## $ B365D       <dbl> 4.20, 5.50, 7.50, 3.30, 5.25, 3.50, 3.60, 3.40, 3.90, 4.33…
## $ B365A       <dbl> 5.25, 1.33, 13.00, 2.63, 8.00, 2.80, 2.75, 2.90, 1.80, 1.6…
## $ BWH         <dbl> 1.60, 7.50, 1.20, 2.65, 1.35, 2.45, 2.45, 2.40, 3.90, 5.00…
## $ BWD         <dbl> 4.40, 5.50, 7.50, 3.40, 5.50, 3.50, 3.60, 3.40, 4.10, 4.33…
## $ BWA         <dbl> 5.25, 1.36, 13.50, 2.60, 7.75, 2.80, 2.70, 2.87, 1.82, 1.6…
## $ BFH         <dbl> 1.60, 8.50, 1.15, 2.70, 1.33, 2.45, 2.40, 2.45, 4.10, 5.00…
## $ BFD         <dbl> 4.33, 5.50, 8.00, 3.40, 5.50, 3.50, 3.60, 3.40, 3.90, 4.33…
## $ BFA         <dbl> 5.00, 1.33, 16.00, 2.63, 8.50, 2.88, 2.80, 2.88, 1.80, 1.5…
## $ PSH         <dbl> 1.63, 8.18, 1.16, 2.73, 1.35, 2.47, 2.49, 2.50, 4.19, 5.09…
## $ PSD         <dbl> 4.38, 5.84, 8.56, 3.36, 5.70, 3.42, 3.65, 3.40, 3.93, 4.39…
## $ PSA         <dbl> 5.30, 1.34, 16.22, 2.71, 8.25, 2.97, 2.80, 2.95, 1.84, 1.6…
## $ WHH         <dbl> 1.65, 8.50, 1.18, 2.60, 1.35, 2.50, 2.40, 2.45, 4.00, 5.00…
## $ WHD         <dbl> 4.20, 5.50, 7.00, 3.50, 5.50, 3.50, 3.75, 3.50, 4.00, 4.33…
## $ WHA         <dbl> 5.00, 1.33, 17.00, 2.70, 8.00, 2.80, 2.75, 2.88, 1.83, 1.6…
## $ `1XBH`      <dbl> 1.68, 8.60, 1.20, 2.68, 1.37, 2.46, 2.42, 2.47, 4.02, 4.98…
## $ `1XBD`      <dbl> 4.32, 5.85, 7.65, 3.66, 5.74, 3.70, 3.91, 3.66, 4.22, 4.52…
## $ `1XBA`      <dbl> 5.03, 1.35, 16.00, 2.63, 8.10, 2.87, 2.81, 2.88, 1.85, 1.6…
## $ MaxH        <dbl> 1.68, 9.00, 1.20, 2.76, 1.37, 2.51, 2.49, 2.52, 4.30, 5.30…
## $ MaxD        <dbl> 4.50, 6.10, 9.10, 3.66, 5.90, 3.70, 3.91, 3.66, 4.22, 4.52…
## $ MaxA        <dbl> 5.50, 1.37, 18.00, 2.78, 8.60, 3.00, 2.87, 2.99, 1.88, 1.6…
## $ AvgH        <dbl> 1.62, 8.28, 1.18, 2.67, 1.35, 2.45, 2.44, 2.46, 4.05, 5.01…
## $ AvgD        <dbl> 4.36, 5.76, 7.86, 3.41, 5.62, 3.49, 3.69, 3.44, 4.00, 4.37…
## $ AvgA        <dbl> 5.15, 1.34, 15.87, 2.68, 8.10, 2.89, 2.79, 2.90, 1.83, 1.6…
## $ BFEH        <dbl> 1.66, 9.40, 1.19, 2.78, 1.37, 2.54, 2.50, 2.54, 4.30, 5.40…
## $ BFED        <dbl> 4.50, 6.20, 9.00, 3.50, 6.00, 3.60, 3.75, 3.55, 4.10, 4.60…
## $ BFEA        <dbl> 5.60, 1.36, 18.00, 2.78, 9.20, 3.00, 2.94, 3.00, 1.87, 1.6…
## $ `B365>2.5`  <dbl> 1.53, 1.40, 1.44, 1.80, 1.40, 1.73, 1.57, 1.80, 1.50, 1.53…
## $ `B365<2.5`  <dbl> 2.50, 3.00, 2.75, 2.00, 3.00, 2.10, 2.38, 2.00, 2.63, 2.50…
## $ `P>2.5`     <dbl> 1.56, 1.41, 1.46, 1.83, 1.40, 1.79, 1.59, 1.83, 1.52, 1.53…
## $ `P<2.5`     <dbl> 2.56, 3.00, 2.79, 2.05, 3.09, 2.11, 2.46, 2.05, 2.62, 2.61…
## $ `Max>2.5`   <dbl> 1.57, 1.43, 1.50, 1.85, 1.42, 1.81, 1.59, 1.85, 1.53, 1.55…
## $ `Max<2.5`   <dbl> 2.60, 3.07, 2.82, 2.08, 3.12, 2.14, 2.52, 2.08, 2.69, 2.63…
## $ `Avg>2.5`   <dbl> 1.53, 1.41, 1.46, 1.81, 1.40, 1.77, 1.57, 1.81, 1.50, 1.52…
## $ `Avg<2.5`   <dbl> 2.52, 2.94, 2.70, 2.04, 3.01, 2.08, 2.43, 2.03, 2.59, 2.54…
## $ `BFE>2.5`   <dbl> 1.59, 1.45, 1.53, 1.88, 1.43, 1.83, 1.63, 1.84, 1.56, 1.56…
## $ `BFE<2.5`   <dbl> 2.64, 3.15, 2.84, 2.08, 3.15, 2.16, 2.52, 2.14, 2.68, 2.70…
## $ AHh         <dbl> -1.00, 1.50, -2.00, 0.00, -1.50, 0.00, -0.25, -0.25, 0.50,…
## $ B365AHH     <dbl> 2.05, 2.02, 1.93, 1.96, 1.98, 1.82, 2.08, 2.11, 2.07, 1.86…
## $ B365AHA     <dbl> 1.88, 1.91, 2.00, 1.97, 1.95, 2.11, 1.73, 1.82, 1.86, 2.07…
## $ PAHH        <dbl> 2.07, 1.99, 1.88, 1.96, 1.94, 1.78, 2.16, 2.15, 2.06, 1.86…
## $ PAHA        <dbl> 1.86, 1.92, 2.00, 1.94, 1.96, 2.14, 1.76, 1.78, 1.85, 2.05…
## $ MaxAHH      <dbl> 2.07, 2.02, 1.97, 1.97, 1.99, 1.83, 2.16, 2.15, 2.08, 1.86…
## $ MaxAHA      <dbl> 1.89, 1.95, 2.00, 1.97, 1.98, 2.14, 1.82, 1.82, 1.86, 2.09…
## $ AvgAHH      <dbl> 2.03, 1.97, 1.90, 1.94, 1.93, 1.80, 2.10, 2.10, 2.03, 1.83…
## $ AvgAHA      <dbl> 1.85, 1.90, 1.96, 1.94, 1.93, 2.09, 1.78, 1.78, 1.83, 2.03…
## $ BFEAHH      <dbl> 2.10, 2.04, 1.94, 1.99, 1.94, 1.83, 2.17, 2.18, 2.12, 1.91…
## $ BFEAHA      <dbl> 1.88, 1.93, 2.00, 1.99, 2.01, 2.17, 1.84, 1.83, 1.87, 2.06…
## $ B365CH      <dbl> 1.67, 8.00, 1.14, 3.10, 1.40, 2.20, 2.40, 2.88, 3.60, 4.50…
## $ B365CD      <dbl> 4.10, 5.75, 8.50, 3.40, 5.00, 3.40, 3.50, 3.20, 3.90, 4.20…
## $ B365CA      <dbl> 5.00, 1.33, 15.00, 2.30, 7.00, 3.25, 2.80, 2.55, 1.95, 1.7…
## $ BWCH        <dbl> 1.65, 8.00, 1.16, 3.00, 1.39, 2.20, 2.50, 2.80, 3.60, 4.33…
## $ BWCD        <dbl> 4.20, 5.75, 8.50, 3.40, 5.25, 3.40, 3.50, 3.20, 3.75, 4.20…
## $ BWCA        <dbl> 4.80, 1.34, 18.00, 2.37, 7.25, 3.25, 2.75, 2.60, 1.98, 1.7…
## $ BFCH        <dbl> 1.62, 7.50, 1.13, 3.00, 1.36, 2.20, 2.38, 2.80, 3.60, 4.33…
## $ BFCD        <dbl> 4.00, 5.50, 8.50, 3.30, 5.00, 3.30, 3.40, 3.25, 3.80, 4.10…
## $ BFCA        <dbl> 5.00, 1.33, 17.00, 2.30, 7.50, 3.25, 2.80, 2.63, 2.00, 1.7…
## $ PSCH        <dbl> 1.65, 8.14, 1.15, 3.15, 1.42, 2.24, 2.54, 2.92, 3.86, 4.64…
## $ PSCD        <dbl> 4.23, 6.09, 9.05, 3.41, 5.30, 3.50, 3.51, 3.24, 3.91, 4.33…
## $ PSCA        <dbl> 5.28, 1.34, 18.76, 2.40, 7.26, 3.37, 2.86, 2.66, 1.97, 1.7…
## $ WHCH        <dbl> 1.60, 8.00, 1.15, 3.10, 1.40, 2.25, 2.50, 3.00, 3.50, 4.60…
## $ WHCD        <dbl> 4.20, 5.50, 8.00, 3.30, 5.00, 3.40, 3.50, 3.20, 3.90, 4.20…
## $ WHCA        <dbl> 5.50, 1.35, 19.00, 2.40, 7.50, 3.25, 2.80, 2.50, 2.00, 1.7…
## $ `1XBCH`     <dbl> 1.66, 8.57, 1.16, 3.16, 1.39, 2.26, 2.56, 2.98, 3.89, 4.79…
## $ `1XBCD`     <dbl> 4.15, 5.85, 9.39, 3.47, 5.34, 3.49, 3.60, 3.30, 3.96, 4.27…
## $ `1XBCA`     <dbl> 5.33, 1.34, 16.60, 2.34, 7.90, 3.33, 2.77, 2.56, 1.92, 1.7…
## $ MaxCH       <dbl> 1.70, 8.57, 1.17, 3.16, 1.44, 2.38, 2.56, 3.00, 3.89, 4.90…
## $ MaxCD       <dbl> 4.33, 6.25, 9.40, 3.50, 5.75, 3.55, 3.60, 3.40, 3.96, 4.40…
## $ MaxCA       <dbl> 5.50, 1.39, 21.00, 2.45, 8.00, 3.37, 2.91, 2.72, 2.05, 1.7…
## $ AvgCH       <dbl> 1.66, 7.87, 1.15, 3.06, 1.39, 2.25, 2.47, 2.85, 3.64, 4.57…
## $ AvgCD       <dbl> 4.20, 5.81, 8.62, 3.40, 5.27, 3.44, 3.51, 3.28, 3.83, 4.23…
## $ AvgCA       <dbl> 5.02, 1.35, 18.11, 2.38, 7.33, 3.23, 2.83, 2.62, 1.98, 1.7…
## $ BFECH       <dbl> 1.72, 8.60, 1.17, 3.15, 1.43, 2.30, 2.54, 2.94, 3.75, 4.70…
## $ BFECD       <dbl> 4.20, 6.20, 9.40, 3.55, 5.50, 3.50, 3.65, 3.35, 3.90, 4.40…
## $ BFECA       <dbl> 5.40, 1.37, 21.00, 2.46, 8.20, 3.50, 2.98, 2.74, 2.06, 1.7…
## $ `B365C>2.5` <dbl> 1.62, 1.36, 1.40, 1.93, 1.44, 1.89, 1.73, 2.07, 1.57, 1.57…
## $ `B365C<2.5` <dbl> 2.30, 3.20, 3.00, 1.97, 2.75, 2.01, 2.10, 1.83, 2.38, 2.38…
## $ `PC>2.5`    <dbl> 1.63, 1.37, 1.41, 1.93, 1.46, 1.89, 1.72, 2.09, 1.58, 1.54…
## $ `PC<2.5`    <dbl> 2.38, 3.30, 2.98, 1.97, 2.85, 2.02, 2.21, 1.83, 2.51, 2.61…
## $ `MaxC>2.5`  <dbl> 1.66, 1.40, 1.45, 1.95, 1.46, 1.93, 1.73, 2.09, 1.61, 1.57…
## $ `MaxC<2.5`  <dbl> 2.45, 3.38, 3.00, 2.00, 3.05, 2.04, 2.32, 1.91, 2.52, 2.62…
## $ `AvgC>2.5`  <dbl> 1.61, 1.37, 1.42, 1.89, 1.43, 1.87, 1.68, 2.00, 1.56, 1.54…
## $ `AvgC<2.5`  <dbl> 2.37, 3.18, 2.93, 1.96, 2.84, 1.96, 2.22, 1.85, 2.45, 2.52…
## $ `BFEC>2.5`  <dbl> 1.68, 1.40, 1.44, 1.94, 1.49, 1.91, 1.74, 2.10, 1.62, 1.58…
## $ `BFEC<2.5`  <dbl> 2.46, 3.40, 3.20, 2.04, 2.98, 2.08, 2.32, 1.89, 2.60, 2.70…
## $ AHCh        <dbl> -0.75, 1.50, -2.25, 0.25, -1.25, -0.25, 0.00, 0.00, 0.50, …
## $ B365CAHH    <dbl> 1.86, 2.05, 2.02, 1.87, 1.87, 1.94, 1.83, 2.07, 1.94, 2.03…
## $ B365CAHA    <dbl> 2.07, 1.88, 1.91, 2.06, 2.06, 1.99, 2.10, 1.86, 1.99, 1.90…
## $ PCAHH       <dbl> 1.83, 2.04, 2.00, 1.86, 1.88, 1.94, 1.85, 2.06, 1.96, 2.02…
## $ PCAHA       <dbl> 2.11, 1.90, 1.90, 2.07, 2.06, 1.98, 2.09, 1.88, 1.97, 1.91…
## $ MaxCAHH     <dbl> 1.88, 2.20, 2.05, 1.92, 1.89, 2.05, 1.86, 2.11, 1.96, 2.06…
## $ MaxCAHA     <dbl> 2.11, 2.00, 1.93, 2.10, 2.10, 2.00, 2.14, 1.88, 2.00, 1.92…
## $ AvgCAHH     <dbl> 1.82, 1.99, 1.99, 1.83, 1.82, 1.93, 1.82, 2.05, 1.90, 1.99…
## $ AvgCAHA     <dbl> 2.05, 1.88, 1.87, 2.04, 2.05, 1.93, 2.08, 1.85, 1.96, 1.87…
## $ BFECAHH     <dbl> 1.90, 2.04, 2.02, 1.88, 1.89, 1.96, 1.84, 2.07, 1.93, 2.03…
## $ BFECAHA     <dbl> 2.08, 1.93, 1.96, 2.11, 2.10, 2.02, 2.18, 1.92, 2.07, 1.96…
# Columns carried into the analysis: match identifiers, the full-time result,
# goals, shots, corners and yellow cards.
vars <- c(
  "Date", "HomeTeam", "AwayTeam",
  "FTR", "FTHG", "FTAG",
  "HS", "AS", "HC", "AC", "HY", "AY"
)

# any_of() silently skips columns that are absent from the file, so verify up
# front that the columns the item-creation step relies on are present. This
# fails with a clear message instead of an obscure error further down.
required_vars <- c("FTR", "FTHG", "FTAG", "HS", "AS", "HC", "AC", "HY")
missing_vars <- setdiff(required_vars, names(df))
if (length(missing_vars) > 0) {
  stop(
    "Required column(s) missing from the match data: ",
    paste(missing_vars, collapse = ", "),
    call. = FALSE
  )
}

df2 <- df %>% select(any_of(vars))

# Structural check of the outcome column (a character vector).
summary(df2$FTR)
##    Length     Class      Mode 
##       380 character character

Preprocessing and item creation

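Association rules require categorical items, so we discretize the numerical variables. Volume items (shots, corners, yellow cards) use data-driven thresholds (quartiles of the observed distribution), while the advantage and goal items use fixed cut-offs (+5 shots, +3 corners, 3+ total goals). The aim is to obtain items that are neither too rare nor too frequent.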

# Quartile cut-offs used to discretise the match statistics:
# upper quartile (75%) for the "high" items, lower quartile (25%) for the
# "low" items. Missing values are ignored.
q_HS_hi <- quantile(df2[["HS"]], probs = 0.75, na.rm = TRUE)
q_HC_hi <- quantile(df2[["HC"]], probs = 0.75, na.rm = TRUE)
q_AS_lo <- quantile(df2[["AS"]], probs = 0.25, na.rm = TRUE)
q_HY_lo <- quantile(df2[["HY"]], probs = 0.25, na.rm = TRUE)

# Show the four thresholds side by side.
c(q_HS_hi = q_HS_hi, q_AS_lo = q_AS_lo, q_HC_hi = q_HC_hi, q_HY_lo = q_HY_lo)
## q_HS_hi.75% q_AS_lo.25% q_HC_hi.75% q_HY_lo.25% 
##          17           9           7           1

Match-level items

HOME_SHOTS_HIGH (home team shots in the top quartile),

AWAY_SHOTS_LOW (away team shots in the bottom quartile),

SHOT_ADV_HOME_5PLUS (home has at least +5 shot advantage),

HOME_CORNERS_HIGH (home corners in the top quartile),

CORNER_ADV_HOME_3PLUS (home has at least +3 corner advantage),

HOME_CARDS_LOW (home yellow cards in the bottom quartile),

GOALS_3PLUS (at least 3 total goals),

and the outcome label RESULT_HOME_WIN / RESULT_DRAW / RESULT_AWAY_WIN.

# Turn each match's statistics into categorical items.
# Every item column holds either the item label or NA (item absent), so the
# NA entries can simply be dropped when the data is reshaped into baskets.
df_items <- df2 %>%
  mutate(
    # Outcome item. Draws are matched explicitly: a missing or unexpected FTR
    # value yields NA instead of being silently counted as a draw.
    RESULT = case_when(
      FTR == "H" ~ "RESULT_HOME_WIN",
      FTR == "A" ~ "RESULT_AWAY_WIN",
      FTR == "D" ~ "RESULT_DRAW"
    ),
    # if_else() with NA_character_ keeps every item column character-typed,
    # even when no match satisfies the condition.
    # At least three goals in total; NA when either goal count is missing.
    GOALS_3PLUS = if_else((FTHG + FTAG) >= 3, "GOALS_3PLUS", NA_character_),
    # Quartile-based volume items (thresholds computed beforehand).
    HOME_SHOTS_HIGH = if_else(
      !is.na(HS) & HS >= q_HS_hi, "HOME_SHOTS_HIGH", NA_character_
    ),
    AWAY_SHOTS_LOW = if_else(
      !is.na(AS) & AS <= q_AS_lo, "AWAY_SHOTS_LOW", NA_character_
    ),
    HOME_CORNERS_HIGH = if_else(
      !is.na(HC) & HC >= q_HC_hi, "HOME_CORNERS_HIGH", NA_character_
    ),
    HOME_CARDS_LOW = if_else(
      !is.na(HY) & HY <= q_HY_lo, "HOME_CARDS_LOW", NA_character_
    ),
    # Fixed-margin advantage items (home minus away).
    SHOT_ADV_HOME_5PLUS = if_else(
      !is.na(HS) & !is.na(AS) & (HS - AS) >= 5,
      "SHOT_ADV_HOME_5PLUS", NA_character_
    ),
    CORNER_ADV_HOME_3PLUS = if_else(
      !is.na(HC) & !is.na(AC) & (HC - AC) >= 3,
      "CORNER_ADV_HOME_3PLUS", NA_character_
    )
  )

Transactions

Each match is treated as one transaction containing the created items. This produces a transaction dataset suitable for Apriori.

# Reshape to one row per (match, item): number the matches, keep only the
# item columns, stack them while discarding absent (NA) items, and keep each
# match/item pair once.
df_long <- df_items %>%
  transmute(
    match_id = row_number(),
    RESULT,
    GOALS_3PLUS,
    HOME_SHOTS_HIGH,
    AWAY_SHOTS_LOW,
    HOME_CORNERS_HIGH,
    HOME_CARDS_LOW,
    SHOT_ADV_HOME_5PLUS,
    CORNER_ADV_HOME_3PLUS
  ) %>%
  pivot_longer(
    cols = !match_id,
    values_to = "item",
    values_drop_na = TRUE
  ) %>%
  distinct(match_id, item)

# Group the items by match so that each list element is one basket, then
# coerce the list to an arules transactions object and summarise it.
basket_list <- with(df_long, split(item, match_id))
trans <- as(basket_list, "transactions")
summary(trans)
## transactions as itemMatrix in sparse format with
##  380 rows (elements/itemsets/transactions) and
##  10 columns (items) and a density of 0.3642105 
## 
## most frequent items:
##         GOALS_3PLUS      HOME_CARDS_LOW     RESULT_HOME_WIN SHOT_ADV_HOME_5PLUS 
##                 215                 166                 155                 137 
##     RESULT_AWAY_WIN             (Other) 
##                 132                 579 
## 
## element (itemset/transaction) length distribution:
## sizes
##  1  2  3  4  5  6  7  8 
## 45 96 64 46 49 41 32  7 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.642   5.000   8.000 
## 
## includes extended item information - examples:
##                  labels
## 1        AWAY_SHOTS_LOW
## 2 CORNER_ADV_HOME_3PLUS
## 3           GOALS_3PLUS
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3
# List every distinct item present in the transactions.
trans %>% itemLabels()
##  [1] "AWAY_SHOTS_LOW"        "CORNER_ADV_HOME_3PLUS" "GOALS_3PLUS"          
##  [4] "HOME_CARDS_LOW"        "HOME_CORNERS_HIGH"     "HOME_SHOTS_HIGH"      
##  [7] "RESULT_AWAY_WIN"       "RESULT_DRAW"           "RESULT_HOME_WIN"      
## [10] "SHOT_ADV_HOME_5PLUS"
# Look for item labels containing "match"; an empty result means none exist
# (presumably a check that match_id did not end up as an item).
str_subset(itemLabels(trans), "match")
## character(0)

Association rule mining (Apriori)

We mine association rules whose right-hand side (RHS) is restricted to RESULT_HOME_WIN. We keep rules that are sufficiently frequent (minimum support 0.02) and reliable (minimum confidence 0.35), and we rank them by lift.

# Mine rules with Apriori on the match transactions.
# - parameter: minimum support 0.02, minimum confidence 0.35, and a rule
#   length (LHS + RHS items) between 2 and 4.
# - appearance: only RESULT_HOME_WIN may appear on the RHS; every other item
#   defaults to the LHS.
# - control: verbose = FALSE suppresses the progress output.
rules_win <- apriori(
  trans,
  parameter = list(supp = 0.02, conf = 0.35, minlen = 2, maxlen = 4),
  appearance = list(default = "lhs", rhs = "RESULT_HOME_WIN"),
  control = list(verbose = FALSE)
)

# Number of rules found.
length(rules_win)
## [1] 63
# Show the 15 rules with the highest lift.
rules_win %>%
  sort(by = "lift") %>%
  head(15) %>%
  inspect()
##      lhs                         rhs                  support confidence   coverage     lift count
## [1]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.07105263  0.7714286 0.09210526 1.891244    27
## [2]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10000000  0.7450980 0.13421053 1.826692    38
## [3]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       HOME_CARDS_LOW}         => {RESULT_HOME_WIN} 0.05526316  0.7241379 0.07631579 1.775306    21
## [4]  {AWAY_SHOTS_LOW,                                                                             
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.10263158  0.7090909 0.14473684 1.738416    39
## [5]  {AWAY_SHOTS_LOW,                                                                             
##       HOME_SHOTS_HIGH,                                                                            
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10263158  0.7090909 0.14473684 1.738416    39
## [6]  {AWAY_SHOTS_LOW,                                                                             
##       CORNER_ADV_HOME_3PLUS,                                                                      
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.07368421  0.6829268 0.10789474 1.674272    28
## [7]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS}            => {RESULT_HOME_WIN} 0.11315789  0.6825397 0.16578947 1.673323    43
## [8]  {AWAY_SHOTS_LOW,                                                                             
##       HOME_CORNERS_HIGH,                                                                          
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.06578947  0.6756757 0.09736842 1.656495    25
## [9]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.05789474  0.6666667 0.08684211 1.634409    22
## [10] {GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH,                                                                            
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10789474  0.6612903 0.16315789 1.621228    41
## [11] {GOALS_3PLUS,                                                                                
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.13947368  0.6463415 0.21578947 1.584579    53
## [12] {AWAY_SHOTS_LOW,                                                                             
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.15263158  0.6444444 0.23684211 1.579928    58
## [13] {AWAY_SHOTS_LOW,                                                                             
##       HOME_CARDS_LOW,                                                                             
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.04736842  0.6428571 0.07368421 1.576037    18
## [14] {GOALS_3PLUS,                                                                                
##       HOME_CARDS_LOW,                                                                             
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.06052632  0.6388889 0.09473684 1.566308    23
## [15] {GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.12368421  0.6351351 0.19473684 1.557105    47

Baseline probability of a home win

# Relative frequency of the RESULT_HOME_WIN item across all transactions,
# i.e. the unconditional share of matches won by the home side.
base_homewin <- itemFrequency(x = trans, type = "relative")["RESULT_HOME_WIN"]
print(base_homewin)
## RESULT_HOME_WIN 
##       0.4078947

The baseline probability of a home win in the dataset is approximately 0.408. Because lift = confidence / baseline, rules with lift values between 1.6 and 1.9 correspond to home-win rates of roughly 65–77%, a substantial increase over the unconditional probability of a home win.

Rule evaluation metrics

Support: fraction of matches where both the LHS conditions and the RHS outcome occur. Higher support means the pattern appears more often in the dataset.

Confidence: estimated probability of the RHS given the LHS (i.e., P(HOME_WIN∣LHS)).

Lift: how much more likely the RHS becomes when the LHS occurs compared to the baseline probability of the RHS. Lift > 1 indicates a positive association; the larger the lift, the stronger the association.

Rules

Rule 1 (highest lift ~1.89)

{AWAY_SHOTS_LOW, GOALS_3PLUS, HOME_SHOTS_HIGH} ⇒ {RESULT_HOME_WIN}

Support ≈ 0.071 (27 matches): the pattern occurs in about 7% of all matches.

Confidence ≈ 0.77: when the away team has low shots, the home team has high shots, and the match has 3+ total goals, the home team wins ~77% of the time.

Lift ≈ 1.89: this is almost twice as likely as a home win in a random match. Football interpretation: a combination of strong home attacking volume + weak away attacking output, in a more open match, strongly favors a home win.

Rule 2 (support 10%, confidence 0.75)

{AWAY_SHOTS_LOW, GOALS_3PLUS, SHOT_ADV_HOME_5PLUS} ⇒ {RESULT_HOME_WIN}

The LHS conditions occur in 51 matches (coverage ≈ 13.4%); 38 of them (10% of all matches) end in a home win, giving a confidence of ~75%. Interpretation: when home shot advantage is large and away shooting is low, home wins are very frequent—especially in matches with 3+ goals.

Rule 4 (simple, interpretable)

{AWAY_SHOTS_LOW, HOME_SHOTS_HIGH} ⇒ {RESULT_HOME_WIN}

Support ≈ 0.103 (39 matches) and confidence ≈ 0.71. Interpretation: even without adding goals/corners/cards, the “high home shots + low away shots” combination alone is a strong signal of a home win.

Rules 11–12 (dominance signal)

{GOALS_3PLUS, SHOT_ADV_HOME_5PLUS} ⇒ {RESULT_HOME_WIN} (lift ~1.58)

{AWAY_SHOTS_LOW, SHOT_ADV_HOME_5PLUS} ⇒ {RESULT_HOME_WIN} (lift ~1.58)

Interpretation: shot dominance is a recurring driver across many high-lift rules, suggesting that shot-based items capture a core “control of the match” dimension associated with home wins.

Rule pruning (removing redundancy)

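Raw rule sets can contain many redundant rules. A rule is redundant when a more general rule (same RHS, a subset of its LHS items) has the same or a higher confidence, so the extra conditions add no information. For example, {AWAY_SHOTS_LOW, HOME_SHOTS_HIGH, SHOT_ADV_HOME_5PLUS} has the same confidence (≈ 0.709) as {AWAY_SHOTS_LOW, HOME_SHOTS_HIGH} and is therefore dropped. We remove such rules and keep the more informative patterns.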

# Keep only the rules that is.redundant() does not flag, ordered from the
# highest to the lowest lift, then summarise the pruned rule set.
rules_clean <- rules_win[!is.redundant(rules_win)] %>%
  sort(by = "lift", decreasing = TRUE)

rules_clean %>% summary()
## set of 28 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4 
##  7 13  8 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.750   3.000   3.036   4.000   4.000 
## 
## summary of quality measures:
##     support          confidence        coverage            lift      
##  Min.   :0.05000   Min.   :0.4157   Min.   :0.07632   Min.   :1.019  
##  1st Qu.:0.09145   1st Qu.:0.5228   1st Qu.:0.15197   1st Qu.:1.282  
##  Median :0.10658   Median :0.5809   Median :0.17895   Median :1.424  
##  Mean   :0.12190   Mean   :0.5870   Mean   :0.21833   Mean   :1.439  
##  3rd Qu.:0.15263   3rd Qu.:0.6449   3rd Qu.:0.27632   3rd Qu.:1.581  
##  Max.   :0.27632   Max.   :0.7714   Max.   :0.56579   Max.   :1.891  
##      count       
##  Min.   : 19.00  
##  1st Qu.: 34.75  
##  Median : 40.50  
##  Mean   : 46.32  
##  3rd Qu.: 58.00  
##  Max.   :105.00  
## 
## mining info:
##   data ntransactions support confidence
##  trans           380    0.02       0.35
##                                                                                                                                                                                     call
##  apriori(data = trans, parameter = list(supp = 0.02, conf = 0.35, minlen = 2, maxlen = 4), appearance = list(default = "lhs", rhs = "RESULT_HOME_WIN"), control = list(verbose = FALSE))
# Show the first 15 pruned rules (already ordered by lift).
rules_clean %>%
  head(15) %>%
  inspect()
##      lhs                         rhs                  support confidence   coverage     lift count
## [1]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.07105263  0.7714286 0.09210526 1.891244    27
## [2]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10000000  0.7450980 0.13421053 1.826692    38
## [3]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       HOME_CARDS_LOW}         => {RESULT_HOME_WIN} 0.05526316  0.7241379 0.07631579 1.775306    21
## [4]  {AWAY_SHOTS_LOW,                                                                             
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.10263158  0.7090909 0.14473684 1.738416    39
## [5]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS}            => {RESULT_HOME_WIN} 0.11315789  0.6825397 0.16578947 1.673323    43
## [6]  {GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH,                                                                            
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10789474  0.6612903 0.16315789 1.621228    41
## [7]  {GOALS_3PLUS,                                                                                
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.13947368  0.6463415 0.21578947 1.584579    53
## [8]  {AWAY_SHOTS_LOW,                                                                             
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.15263158  0.6444444 0.23684211 1.579928    58
## [9]  {GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.12368421  0.6351351 0.19473684 1.557105    47
## [10] {AWAY_SHOTS_LOW,                                                                             
##       HOME_CARDS_LOW,                                                                             
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.05000000  0.6129032 0.08157895 1.502601    19
## [11] {AWAY_SHOTS_LOW,                                                                             
##       HOME_CARDS_LOW}         => {RESULT_HOME_WIN} 0.10263158  0.6093750 0.16842105 1.493952    39
## [12] {CORNER_ADV_HOME_3PLUS,                                                                      
##       GOALS_3PLUS,                                                                                
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.09210526  0.6034483 0.15263158 1.479422    35
## [13] {AWAY_SHOTS_LOW,                                                                             
##       CORNER_ADV_HOME_3PLUS,                                                                      
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.08947368  0.5862069 0.15263158 1.437152    34
## [14] {HOME_CARDS_LOW,                                                                             
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10263158  0.5820896 0.17631579 1.427058    39
## [15] {GOALS_3PLUS,                                                                                
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.10526316  0.5797101 0.18157895 1.421225    40

Visualization

We visualize rules using standard arulesViz plots. These help assess the trade-off between support, confidence, and lift.

# Scatter plot of the pruned rules: support against lift, shaded by confidence.
plot(rules_clean, measure = c("support", "lift"), shading = "confidence")

# Grouped view of the pruned rules.
plot(rules_clean, method = "grouped")

# Graph view of (at most) the first 30 rules. No `control` list is passed:
# `type` is not a control parameter of the ggplot2 graph engine, so supplying
# it only triggered an "unknown control parameter" warning and a dump of the
# available parameters, while the plot fell back to the defaults anyway.
plot(head(rules_clean, 30), method = "graph")

The scatter and grouped visualizations confirm that high-lift rules tend to combine multiple indicators of home dominance, while single-item rules usually achieve lower lift values.

Interpretation

We interpret the strongest rules (highest lift) in football terms. For example, patterns involving high home shot volume and strong shot/corner advantages should be positively associated with winning at home. High-lift rules typically combine multiple indicators of home dominance, suggesting that no single statistic alone explains home wins, but rather their joint occurrence.

# Take the first 20 pruned rules (ordered by lift) and print them.
top_rules <- rules_clean %>% head(n = 20)
top_rules %>% inspect()
##      lhs                         rhs                  support confidence   coverage     lift count
## [1]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.07105263  0.7714286 0.09210526 1.891244    27
## [2]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10000000  0.7450980 0.13421053 1.826692    38
## [3]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS,                                                                                
##       HOME_CARDS_LOW}         => {RESULT_HOME_WIN} 0.05526316  0.7241379 0.07631579 1.775306    21
## [4]  {AWAY_SHOTS_LOW,                                                                             
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.10263158  0.7090909 0.14473684 1.738416    39
## [5]  {AWAY_SHOTS_LOW,                                                                             
##       GOALS_3PLUS}            => {RESULT_HOME_WIN} 0.11315789  0.6825397 0.16578947 1.673323    43
## [6]  {GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH,                                                                            
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10789474  0.6612903 0.16315789 1.621228    41
## [7]  {GOALS_3PLUS,                                                                                
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.13947368  0.6463415 0.21578947 1.584579    53
## [8]  {AWAY_SHOTS_LOW,                                                                             
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.15263158  0.6444444 0.23684211 1.579928    58
## [9]  {GOALS_3PLUS,                                                                                
##       HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.12368421  0.6351351 0.19473684 1.557105    47
## [10] {AWAY_SHOTS_LOW,                                                                             
##       HOME_CARDS_LOW,                                                                             
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.05000000  0.6129032 0.08157895 1.502601    19
## [11] {AWAY_SHOTS_LOW,                                                                             
##       HOME_CARDS_LOW}         => {RESULT_HOME_WIN} 0.10263158  0.6093750 0.16842105 1.493952    39
## [12] {CORNER_ADV_HOME_3PLUS,                                                                      
##       GOALS_3PLUS,                                                                                
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.09210526  0.6034483 0.15263158 1.479422    35
## [13] {AWAY_SHOTS_LOW,                                                                             
##       CORNER_ADV_HOME_3PLUS,                                                                      
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.08947368  0.5862069 0.15263158 1.437152    34
## [14] {HOME_CARDS_LOW,                                                                             
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.10263158  0.5820896 0.17631579 1.427058    39
## [15] {GOALS_3PLUS,                                                                                
##       HOME_CORNERS_HIGH}      => {RESULT_HOME_WIN} 0.10526316  0.5797101 0.18157895 1.421225    40
## [16] {HOME_SHOTS_HIGH,                                                                            
##       SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.14473684  0.5729167 0.25263158 1.404570    55
## [17] {AWAY_SHOTS_LOW}         => {RESULT_HOME_WIN} 0.17894737  0.5714286 0.31315789 1.400922    68
## [18] {CORNER_ADV_HOME_3PLUS,                                                                      
##       GOALS_3PLUS}            => {RESULT_HOME_WIN} 0.10526316  0.5633803 0.18684211 1.381190    40
## [19] {SHOT_ADV_HOME_5PLUS}    => {RESULT_HOME_WIN} 0.20000000  0.5547445 0.36052632 1.360019    76
## [20] {HOME_SHOTS_HIGH}        => {RESULT_HOME_WIN} 0.16578947  0.5526316 0.30000000 1.354839    63

Conclusion

Association rule mining provides an interpretable way to discover match-statistics combinations that are strongly associated with a home win. The strongest rules typically involve indicators of home dominance, such as high shooting volume or large shot/corner advantages. This approach can be extended by adding more features (e.g., odds, red cards) or by mining rules for other outcomes such as draws or away wins.

References

Data source: football-data.co.uk (Premier League match statistics CSV).