The AFLW Season finals is about to kick off, finishing up their home and away season. What better way to see which team is predicted to win the grand final based off this model. Using the fitzRoy data package, we focused on the last 4 years, from 2022 as that is when all 18 AFL teams began competeting.
Predictions are based on the AFLW ladder and Elo ratings as of the end of Round 11. The final round results may change the finals matchups and thus affect the predicted probabilities.
library(fitzRoy)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(corrplot)
library(elo)
library(PlayerRatings)
We will be using data from 2022 - 2025 Season
seasons <- 2022:2025
AFLW <- purrr::map_df(seasons, ~fetch_results(comp = "AFLW", season = .x))
names(AFLW)
## [1] "k2kSponsored" "match.name"
## [3] "match.date" "match.status"
## [5] "match.matchId" "match.venue"
## [7] "match.utcStartTime" "match.homeTeamId"
## [9] "match.awayTeamId" "match.round"
## [11] "match.venueLocalStartTime" "match.abbr"
## [13] "match.twitterHashTag" "match.homeTeam.name"
## [15] "match.homeTeam.timeZone" "match.homeTeam.teamId"
## [17] "match.homeTeam.abbr" "match.homeTeam.nickname"
## [19] "match.awayTeam.name" "match.awayTeam.timeZone"
## [21] "match.awayTeam.teamId" "match.awayTeam.abbr"
## [23] "match.awayTeam.nickname" "venue.address"
## [25] "venue.name" "venue.state"
## [27] "venue.timeZone" "venue.venueId"
## [29] "venue.abbreviation" "venue.capacity"
## [31] "venue.groundDimension" "venue.latitude"
## [33] "venue.longitude" "venue.landOwner"
## [35] "round.name" "round.year"
## [37] "round.roundId" "round.abbreviation"
## [39] "round.competitionId" "round.roundNumber"
## [41] "status" "matchId"
## [43] "scoreWorm" "scoreMap"
## [45] "lastUpdated" "homeTeamScore.periodScore"
## [47] "homeTeamScore.rushedBehinds" "homeTeamScore.minutesInFront"
## [49] "homeTeamScore.matchScore.totalScore" "homeTeamScore.matchScore.goals"
## [51] "homeTeamScore.matchScore.behinds" "homeTeamScore.matchScore.superGoals"
## [53] "awayTeamScore.periodScore" "awayTeamScore.rushedBehinds"
## [55] "awayTeamScore.minutesInFront" "awayTeamScore.matchScore.totalScore"
## [57] "awayTeamScore.matchScore.goals" "awayTeamScore.matchScore.behinds"
## [59] "awayTeamScore.matchScore.superGoals" "matchClock.periods"
## [61] "weather.description" "weather.tempInCelsius"
## [63] "weather.weatherType" "homeTeamScoreChart.goals"
## [65] "homeTeamScoreChart.leftBehinds" "homeTeamScoreChart.rightBehinds"
## [67] "homeTeamScoreChart.leftPosters" "homeTeamScoreChart.rightPosters"
## [69] "homeTeamScoreChart.rushedBehinds" "homeTeamScoreChart.touchedBehinds"
## [71] "awayTeamScoreChart.goals" "awayTeamScoreChart.leftBehinds"
## [73] "awayTeamScoreChart.rightBehinds" "awayTeamScoreChart.leftPosters"
## [75] "awayTeamScoreChart.rightPosters" "awayTeamScoreChart.rushedBehinds"
## [77] "awayTeamScoreChart.touchedBehinds"
As the dataset contains 77 variables, we need to clean the data and choose which variables are relevant to use and which are not.
#Choose Variables
aflw_clean <- AFLW %>%
select(match.date,
match.matchId,
home_team = match.homeTeam.name,
away_team = match.awayTeam.name,
home_score = homeTeamScore.matchScore.totalScore,
away_score = awayTeamScore.matchScore.totalScore,
venue.name,
round.name,
round.year,
home_mins_front = homeTeamScore.minutesInFront,
away_mins_front = awayTeamScore.minutesInFront,
weather.tempInCelsius,
weather.weatherType)
#Rename some variables
aflw_clean <- aflw_clean %>%
rename(match_id = match.matchId,
venue_name = venue.name,
round_name = round.name,
round_year = round.year,
weather_celsius = weather.tempInCelsius,
weather_type = weather.weatherType)
#Create a new variable Margin & Home win (if the home team won)
aflw_clean <- aflw_clean %>%
mutate(margin = home_score - away_score,
home_win = ifelse(home_score > away_score, 1, 0))
aflw_clean <- aflw_clean %>%
mutate(match_id = str_remove(match_id, "^CD_M"))
#Change all character variables to factors
aflw_clean$match_number <- seq_len(nrow(aflw_clean))
aflw_clean <- aflw_clean %>%
mutate(
round_name = str_replace(round_name, "^Week", "Round"),
round_name = str_replace(round_name, "Finals Week 1", "Qualifying Final")
)
We can now have a look at the data to learn what we are working with, identify any issues like missing data and help us decide what modelling procedure to take.
str(aflw_clean)
## tibble [486 × 16] (S3: tbl_df/tbl/data.frame)
## $ match.date : POSIXct[1:486], format: "2022-01-07 08:15:00" "2022-01-08 06:10:00" ...
## $ match_id : chr [1:486] "20222640101" "20222640102" "20222640103" "20222640104" ...
## $ home_team : chr [1:486] "St Kilda" "Kangaroos" "Western Bulldogs" "Fremantle" ...
## $ away_team : chr [1:486] "Richmond" "Geelong Cats" "Melbourne" "West Coast Eagles" ...
## $ home_score : int [1:486] 23 26 22 43 39 25 21 38 41 17 ...
## $ away_score : int [1:486] 61 18 46 15 9 44 36 54 14 31 ...
## $ venue_name : chr [1:486] "Kinetic Stadium" "Arden Street Oval" "Mission Whitten Oval" "Fremantle Oval" ...
## $ round_name : chr [1:486] "Round 1" "Round 1" "Round 1" "Round 1" ...
## $ round_year : chr [1:486] "2022" "2022" "2022" "2022" ...
## $ home_mins_front: int [1:486] 8 42 0 44 45 22 20 19 56 2 ...
## $ away_mins_front: int [1:486] 49 16 53 12 4 45 36 45 0 49 ...
## $ weather_celsius: num [1:486] 29 21 21 18 18 23 18 32 25 27 ...
## $ weather_type : chr [1:486] "THUNDERSTORMS" "RAIN" "RAIN" "MOSTLY_SUNNY" ...
## $ margin : int [1:486] -38 8 -24 28 30 -19 -15 -16 27 -14 ...
## $ home_win : num [1:486] 0 1 0 1 1 0 0 0 1 0 ...
## $ match_number : int [1:486] 1 2 3 4 5 6 7 8 9 10 ...
summary(aflw_clean)
## match.date match_id home_team
## Min. :2022-01-07 08:15:00 Length:486 Length:486
## 1st Qu.:2022-09-30 14:02:30 Class :character Class :character
## Median :2023-10-21 17:10:00 Mode :character Mode :character
## Mean :2023-12-13 23:39:47
## 3rd Qu.:2024-11-01 21:37:30
## Max. :2025-11-01 08:15:00
## away_team home_score away_score venue_name
## Length:486 Min. : 1.00 Min. : 1.00 Length:486
## Class :character 1st Qu.: 25.00 1st Qu.: 22.00 Class :character
## Mode :character Median : 35.50 Median : 34.00 Mode :character
## Mean : 38.20 Mean : 36.61
## 3rd Qu.: 47.75 3rd Qu.: 46.00
## Max. :108.00 Max. :114.00
## round_name round_year home_mins_front away_mins_front
## Length:486 Length:486 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 5.25 1st Qu.: 3.00
## Mode :character Mode :character Median : 37.00 Median :24.00
## Mean : 35.67 Mean :31.62
## 3rd Qu.: 60.00 3rd Qu.:58.00
## Max. :118.00 Max. :87.00
## weather_celsius weather_type margin home_win
## Min. :12.00 Length:486 Min. :-100.000 Min. :0.0000
## 1st Qu.:18.00 Class :character 1st Qu.: -18.000 1st Qu.:0.0000
## Median :19.00 Mode :character Median : 2.000 Median :1.0000
## Mean :20.74 Mean : 1.588 Mean :0.5185
## 3rd Qu.:23.00 3rd Qu.: 22.000 3rd Qu.:1.0000
## Max. :37.00 Max. : 96.000 Max. :1.0000
## match_number
## Min. : 1.0
## 1st Qu.:122.2
## Median :243.5
## Mean :243.5
## 3rd Qu.:364.8
## Max. :486.0
head(aflw_clean)
## # A tibble: 6 × 16
## match.date match_id home_team away_team home_score away_score
## <dttm> <chr> <chr> <chr> <int> <int>
## 1 2022-01-07 08:15:00 20222640101 St Kilda Richmond 23 61
## 2 2022-01-08 06:10:00 20222640102 Kangaroos Geelong … 26 18
## 3 2022-01-08 08:00:00 20222640103 Western Bulld… Melbourne 22 46
## 4 2022-01-08 09:50:00 20222640104 Fremantle West Coa… 43 15
## 5 2022-01-09 03:10:00 20222640107 Adelaide Crows Brisbane… 39 9
## 6 2022-01-09 05:10:00 20222640105 Carlton Collingw… 25 44
## # ℹ 10 more variables: venue_name <chr>, round_name <chr>, round_year <chr>,
## # home_mins_front <int>, away_mins_front <int>, weather_celsius <dbl>,
## # weather_type <chr>, margin <int>, home_win <dbl>, match_number <int>
#Any missing values?
colSums(is.na(aflw_clean))
## match.date match_id home_team away_team home_score
## 0 0 0 0 0
## away_score venue_name round_name round_year home_mins_front
## 0 0 0 0 0
## away_mins_front weather_celsius weather_type margin home_win
## 0 0 0 0 0
## match_number
## 0
#Exploration plots
aflw_clean %>%
group_by(home_team, round_year) %>%
summarise(avg_score = mean(home_score)) %>%
ggplot(aes(x = home_team, y = avg_score, fill = factor(round_year))) +
geom_col(position = "dodge") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(aflw_clean, aes(x = factor(home_win), y = margin)) +
geom_boxplot(fill = "lightgreen") +
labs(x = "Home Win", y = "Margin", title = "Margin vs Match Outcome")
# Select numeric columns and calculate correlation matrix
num_cols <- c("home_score", "away_score", "home_mins_front", "away_mins_front", "weather_celsius")
cor_matrix <- cor(aflw_clean[, num_cols], use = "complete.obs")
# Plot correlation matrix
corrplot(cor_matrix, method = "color", type = "upper",
addCoef.col = "black", tl.col = "black", tl.srt = 45)
The correlation matrix is telling us that the variables home team time in the front and away team time in front are useful for refining an ELO model and can be used as extra predictors. However weather is a weakly correlated value and therefore not useful.
Next we want to split the data up.To train and text the accuracy of the model we will use seasons 22 - 24 and then the 2024 finals. Once tested and modified we can use the 2022 - 2025 seasons to predict the 2025 Grand Final Winner
#Filter Data to only bee regular games not finals games
train_data <- aflw_clean %>%
filter(round_year %in% 2022:2024) %>%
filter(grepl("Round", round_name))
test_data <- aflw_clean %>%
filter(round_year %in% 2024) %>%
filter(!grepl("Round", round_name))
predict_data <- aflw_clean %>%
filter(round_year %in% 2022:2025) %>%
filter(grepl("Round", round_name))
Next we can train the elo model
elo_model <- elo(train_data[c("match_number", "home_team", "away_team", "home_win")],
init = 1500,
kfac = 27,
history = TRUE)
plot(elo_model)
We can now save these finals ratings in a dataframe
final_ratings <- as.data.frame(elo_model$ratings) %>%
rename(team = Player) %>%
arrange(desc(Rating))
# Convert your data frame to a named vector
team_ratings <- setNames(final_ratings$Rating, final_ratings$team)
print(final_ratings)
## team Rating Games Win Draw Loss Lag
## 1 Brisbane Lions 1688.935 41 33 0 8 3
## 2 Adelaide Crows 1688.032 41 34 0 7 8
## 3 Kangaroos 1671.920 41 30 0 11 5
## 4 Melbourne 1649.351 41 32 0 9 7
## 5 Hawthorn 1555.182 31 16 0 15 2
## 6 Richmond 1548.481 41 23 0 18 2
## 7 Fremantle 1529.101 41 22 0 19 0
## 8 Geelong Cats 1517.474 41 20 0 21 8
## 9 Essendon 1513.939 31 16 0 15 4
## 10 Port Adelaide 1447.118 31 11 0 20 1
## 11 St Kilda 1437.911 41 15 0 26 3
## 12 Gold Coast SUNS 1434.777 41 17 0 24 5
## 13 Collingwood 1424.089 41 19 0 22 7
## 14 Western Bulldogs 1419.027 41 17 0 24 0
## 15 Carlton 1412.205 41 15 0 26 4
## 16 Sydney Swans 1395.919 31 9 0 22 6
## 17 West Coast Eagles 1337.610 41 9 0 32 6
## 18 GWS GIANTS 1328.929 41 11 0 30 1
Now we can calculate the probability of each team winning, with added predictors home advantage and time in front.
elo_prob <- function(home, away, ratings, home_adv = 0, front_factor = 0) {
# Adjusted ratings including home advantage and front factor
Rh <- ratings[home] + home_adv + front_factor
Ra <- ratings[away]
# Elo probability formula
1 / (1 + 10^((Ra - Rh)/400))
}
Then we check the accuracy of the model against the actual results to see if we need to change anything or make it better.
#Set scaling variables for elo prediction
home_advantage <- 50 # home ground advantage
front_scale <- 2 # weight for minutes in front
weather_scale <- 1 # weight for weather effect
margin_scale <- 1 # weight for margin effect
test_data <- test_data %>%
mutate(
#Home/Away ratings
home_rating = team_ratings[home_team],
away_rating = team_ratings[away_team],
#Time-in-front factor
front_factor = (home_mins_front - away_mins_front) * front_scale,
#Weather factor
weather_factor = case_when(
weather_type %in% c("RAIN","THUNDERSTORMS") ~ -20 * weather_scale,
weather_celsius < 10 ~ -10 * weather_scale,
TRUE ~ 0
),
#Margin factor
margin_factor = margin * margin_scale,
#Combine all factors into Elo probability
home_win_prob = elo_prob(
home_team,
away_team,
team_ratings,
home_adv = home_advantage,
front_factor = front_factor + weather_factor + margin_factor
)
)
#Predict
test_data <- test_data %>%
mutate(
pred_home_win = ifelse(home_win_prob > 0.5, 1, 0),
correct = ifelse(pred_home_win == home_win, 1, 0)
)
accuracy <- mean(test_data$correct, na.rm = TRUE)
brier <- mean((test_data$home_win_prob - test_data$home_win)^2, na.rm = TRUE)
cat("Elo Model Accuracy:", round(accuracy, 3), "\n",
"Brier Score:", round(brier, 3), "\n")
## Elo Model Accuracy: 0.889
## Brier Score: 0.167
The model has an accuracy of 0.889, which is saying that our model is 89% accurate, predicting every 9 out of 10 finals games correctly in 2024. The Brier score also shows us how close our predicted probabilities are to the actual results, with 0 being close to perfect and 1 showing the worst. A score of 0.167 demonstrates it is very close to actual predictions and the probabilities are well-calibrated.
As of Round 11, the current ladder is as follows; 1st - North Melbourne 2nd - Hawthorn 3rd - Melbourne 4th - Brisbane Lions 5th - Carlton 6th - St Kilda 7th - West Coast Eagles 8th - Adelaide
With round 12 playing this weekend, a few positions may be changed and some teams may be kicked out. Therefore for this sake, we will be using up until round 11 and assuming the final ladder position is as above. Once round 12 has finished, this should be updated to the correct ladder.
QF <- data.frame(
home_team = c("Kangaroos", "Hawthorn", "Melbourne", "Brisbane Lions"),
away_team = c("Carlton", "St Kilda", "Adelaide Crows", "West Coast Eagles"),
stringsAsFactors = FALSE
)
Now we know the model is fairly accurate we need to include the 2025 data to predict for the 2025 finals.
elo_model_final <- elo(predict_data[c("match_number", "home_team", "away_team", "home_win")],
init = 1500,
kfac = 27,
history = TRUE)
final_ratings <- as.data.frame(elo_model_final$ratings) %>%
rename(team = Player) %>%
arrange(desc(Rating))
team_ratings <- setNames(final_ratings$Rating, final_ratings$team)
print(final_ratings)
## team Rating Games Win Draw Loss Lag
## 1 Kangaroos 1755.478 53 42 0 11 5
## 2 Brisbane Lions 1698.314 52 41 0 11 10
## 3 Melbourne 1651.582 53 41 0 12 0
## 4 Adelaide Crows 1635.179 52 40 0 12 14
## 5 Hawthorn 1600.290 43 25 0 18 5
## 6 Fremantle 1539.772 52 28 0 24 11
## 7 St Kilda 1497.843 52 22 0 30 8
## 8 Geelong Cats 1491.096 53 25 0 28 0
## 9 Port Adelaide 1480.369 43 17 0 26 3
## 10 Carlton 1479.642 53 23 0 30 4
## 11 Essendon 1463.691 43 20 0 23 1
## 12 Richmond 1452.923 53 25 0 28 2
## 13 Sydney Swans 1428.078 43 15 0 28 1
## 14 Western Bulldogs 1408.219 52 21 0 31 9
## 15 West Coast Eagles 1384.326 53 15 0 38 4
## 16 Collingwood 1382.234 52 22 0 30 6
## 17 Gold Coast SUNS 1364.403 53 19 0 34 2
## 18 GWS GIANTS 1286.560 53 13 0 40 3
Now using these rating we can simulate a single game, and how two teams match up against each other and who will win.
simulate_match <- function(home, away, ratings, home_adv = 50, weather_factor = 0) {
rating_home <- ratings[home] + home_adv + weather_factor
rating_away <- ratings[away] + weather_factor
p_home <- 1 / (1 + 10^((rating_away - rating_home)/400))
winner <- ifelse(runif(1) < p_home, home, away)
loser <- ifelse(winner == home, away, home)
list(Winner = winner, Loser = loser)
}
Using these match simulations we can now simulate the finals series to see who is predicted to win and who is predicted to loose. Therefore making it to the grand final.
simulate_finals <- function(ratings, home_adv = 50, weather_factor = 0) {
results <- list()
# Week 1: Quarter Finals
results$QF1 <- simulate_match(QF$home_team[1], QF$away_team[1], ratings, home_adv, weather_factor)
results$QF2 <- simulate_match(QF$home_team[2], QF$away_team[2], ratings, home_adv, weather_factor)
results$QF3 <- simulate_match(QF$home_team[3], QF$away_team[3], ratings, home_adv, weather_factor)
results$QF4 <- simulate_match(QF$home_team[4], QF$away_team[4], ratings, home_adv, weather_factor)
# Week 2: Semi Finals (adjust to AFLW finals structure)
results$SF1 <- simulate_match(results$QF1$Winner, results$QF2$Winner, ratings, home_adv, weather_factor)
results$SF2 <- simulate_match(results$QF3$Winner, results$QF4$Winner, ratings, home_adv, weather_factor)
# Week 3: Preliminary Final
results$PF <- simulate_match(results$SF1$Winner, results$SF2$Winner, ratings, home_adv, weather_factor)
# Week 4: Grand Final
results$GF <- simulate_match(results$PF$Winner, results$SF2$Loser, ratings, home_adv, weather_factor)
results
}
Now we can simulate the finals series 10,000 times. This can give us a probabilistic view of outcomes.
Once simulated, we can get the probability of each team being eliminated during each round of the finals. This will provide us with the probability of each time winning the grand final.
# Run multiple simulations
set.seed(123)
n_sims <- 10000
elo_all_sims <- replicate(n_sims, simulate_finals(team_ratings), simplify = FALSE)
# Summarize finishing probabilities
get_positions <- function(sim) {
teams <- c(QF$home_team, QF$away_team)
pos <- setNames(rep(NA, length(teams)), teams)
# QF losers
pos[c(sim$QF1$Loser, sim$QF2$Loser, sim$QF3$Loser, sim$QF4$Loser)] <- "Eliminated Week 1"
# SF losers
pos[c(sim$SF1$Loser, sim$SF2$Loser)] <- "Eliminated Week 2"
# PF loser
pos[sim$PF$Loser] <- "Lose Preliminary Final"
# Grand Final
pos[sim$GF$Loser] <- "Lose Grand Final"
pos[sim$GF$Winner] <- "Premiership"
pos
}
# Run 10k simulations
all_sims <- replicate(n_sims, simulate_finals(team_ratings), simplify = FALSE)
We can put together the probabilities of each team winning. The team with the highest probability shows they are the most likely team to win the 2025 AFLW Grand final.
# Extract Grand Final winners
gf_winners <- sapply(all_sims, function(x) unname(x$GF$Winner))
# Make sure it's plain text
gf_winners <- as.character(gf_winners)
# Summarise win probabilities
gf_winner_probs <- as.data.frame(table(gf_winners), stringsAsFactors = FALSE) %>%
rename(Team = gf_winners, Wins = Freq) %>%
mutate(Probability = Wins / n_sims) %>%
arrange(desc(Probability))
# Print clear text output for the most likely winner
cat("Predicted Grand Final Winner:",
gf_winner_probs$Team[1],
"(",
round(gf_winner_probs$Probability[1] * 100, 1),
"% chance)\n",
"Followed by", gf_winner_probs$Team[2],
"(",
round(gf_winner_probs$Probability[2] * 100, 1),"% chance)")
## Predicted Grand Final Winner: Kangaroos ( 32.9 % chance)
## Followed by Brisbane Lions ( 32.6 % chance)