library(tidyverse)
library(fitzRoy)
library(GGally)
library(rsample)
library(performance)
library(sjPlot)
afl_raw <- fitzRoy::fetch_player_stats_fryzigg(season = 2025)
The raw data set contains over 80 variables, so it is worth trimming it down for readability. We keep only the variables required to build our model.
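A quick check of the raw table's width confirms this (the exact column count can vary between fitzRoy versions):
# The fryzigg feed returns one row per player per match with 80+ columns
ncol(afl_raw)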
# Keep only the identifiers and the statistics used in the model.
# select() with new_name = old_name renames as it selects, avoiding the
# group_by()/summarise() pattern, which warns when a summary returns
# more than one row per group.
player_season_2025 <- afl_raw %>%
  select(
    match_round, player_first_name, player_last_name,
    Fantasy_Score = afl_fantasy_score,
    Goals = goals,
    Behinds = behinds,
    Tackles = tackles,
    Hitouts = hitouts,
    Kicks = kicks,
    Handballs = handballs,
    Marks = marks,
    FF = free_kicks_for,
    FA = free_kicks_against
  )
ggplot(player_season_2025, aes(x = Fantasy_Score)) +
  geom_histogram(fill = "red", bins = 20, color = "white") +
  labs(title = "Distribution of AFL Fantasy Scores", x = "Fantasy Score", y = "Frequency")
The distribution of fantasy scores is roughly symmetric, with no obvious skew.
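A simple numeric check backs this up: for a symmetric distribution, the mean and median should sit close together.
# Mean and median land close together when the distribution is symmetric
player_season_2025 %>%
  summarise(mean_score = mean(Fantasy_Score, na.rm = TRUE),
            median_score = median(Fantasy_Score, na.rm = TRUE))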
ggpairs(player_season_2025,
columns = c("Fantasy_Score", "Goals", "Behinds", "Tackles","Hitouts", "Kicks", "Handballs","Marks","FF","FA"),
title = "Variables that correlate to Fantasy Score")
As seen in the pairs plot, every variable correlates with Fantasy_Score. We will keep the variables flagged '***', where the asterisks indicate the significance level of each correlation (more asterisks mean a smaller p-value).
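The asterisks can be backed up with the raw correlation coefficients; one way to pull each predictor's correlation with Fantasy_Score:
# Correlation of every modelling variable with Fantasy_Score
cors <- player_season_2025 %>%
  select(Fantasy_Score, Goals, Behinds, Tackles, Hitouts,
         Kicks, Handballs, Marks, FF, FA) %>%
  cor(use = "complete.obs")
round(cors[, "Fantasy_Score"], 2)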
Creating splits allows us to test the model's performance: we train the model on a training set, then evaluate it on a held-out test set.
# Fix the random seed so the train/test split is reproducible
set.seed(2025)
split_players <- initial_split(player_season_2025)
train_split <- training(split_players)
test_split <- testing(split_players)
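As a quick check, initial_split() defaults to holding out a quarter of the rows for testing:
# Default split is 3/4 train, 1/4 test
nrow(train_split)
nrow(test_split)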
lm1 <- lm(Fantasy_Score ~ Goals + Behinds + Tackles + Hitouts + Kicks + Handballs + Marks + FF, data = train_split)
lm2 <- lm(Fantasy_Score ~ Goals + Behinds + Tackles*FF + Hitouts + Kicks + Handballs + Marks, data = train_split)
tab_model(lm1, lm2)
Dependent variable in both models: Fantasy Score.

| Predictors | lm1: Estimates | lm1: CI | lm1: p | lm2: Estimates | lm2: CI | lm2: p |
|---|---|---|---|---|---|---|
| (Intercept) | -1.73 | -1.90 – -1.56 | <0.001 | -1.71 | -1.89 – -1.52 | <0.001 |
| Goals | 5.95 | 5.87 – 6.02 | <0.001 | 5.95 | 5.87 – 6.02 | <0.001 |
| Behinds | 0.98 | 0.88 – 1.07 | <0.001 | 0.98 | 0.88 – 1.08 | <0.001 |
| Tackles | 3.86 | 3.83 – 3.89 | <0.001 | 3.85 | 3.81 – 3.89 | <0.001 |
| Hitouts | 0.94 | 0.93 – 0.95 | <0.001 | 0.94 | 0.93 – 0.95 | <0.001 |
| Kicks | 2.99 | 2.97 – 3.01 | <0.001 | 2.99 | 2.97 – 3.01 | <0.001 |
| Handballs | 1.98 | 1.96 – 2.00 | <0.001 | 1.98 | 1.96 – 2.00 | <0.001 |
| Marks | 3.03 | 2.99 – 3.06 | <0.001 | 3.03 | 2.99 – 3.06 | <0.001 |
| FF | 0.82 | 0.75 – 0.89 | <0.001 | 0.79 | 0.69 – 0.90 | <0.001 |
| Tackles × FF | | | | 0.01 | -0.01 – 0.03 | 0.447 |
| Observations | 7452 | | | 7452 | | |
| R² / R² adjusted | 0.990 / 0.990 | | | 0.990 / 0.990 | | |
compare_performance(lm1, lm2)
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | R2
## --------------------------------------------------------------------------
## lm1 | lm | 36878.4 (0.671) | 36878.4 (0.671) | 36947.5 (0.985) | 0.990
## lm2 | lm | 36879.8 (0.329) | 36879.8 (0.329) | 36955.9 (0.015) | 0.990
##
## Name | R2 (adj.) | RMSE | Sigma
## --------------------------------
## lm1 | 0.990 | 2.869 | 2.871
## lm2 | 0.990 | 2.869 | 2.871
tab_model shows that every predictor in lm1 is statistically significant. Adding an interaction between Tackles and free kicks for (FF) in lm2 produces a term that is not statistically significant (p = 0.447) and adds no value. The estimates also sit close to the official AFL Fantasy scoring weights (roughly 6 points per goal, 4 per tackle, 3 per kick and mark, 2 per handball, 1 per behind, hitout and free kick for), which helps explain the near-perfect fit.
compare_performance shows which model performs best across several indices. The key statistics to look at are AIC, R², and RMSE. As R² and RMSE are identical for both models, we turn to AIC: lm1 scores lower, meaning it achieves the same fit with less complexity, so lm1 is the better model.
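The AIC comparison can also be reproduced directly in base R, without the performance package:
# Lower AIC indicates a better trade-off between fit and complexity
AIC(lm1, lm2)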
predictions <- predict(lm1, newdata = test_split)
test_results <- test_split %>%
mutate(Predicted_Fantasy_Score = predictions)
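Before plotting, it is worth putting a single number on out-of-sample accuracy. A minimal sketch of the test-set RMSE (yardstick::rmse() from tidymodels would give the same figure):
# Root mean squared error on the held-out test set
test_results %>%
  summarise(RMSE = sqrt(mean((Fantasy_Score - Predicted_Fantasy_Score)^2)))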
# Let's plot the actual vs predicted scores
ggplot(test_results, aes(x = Fantasy_Score, y = Predicted_Fantasy_Score)) +
geom_jitter(width = 0.2, height = 0.2, color = "black", alpha = 0.6) +
geom_smooth(method = "lm") +
labs(
title = "Predicted vs. Actual Fantasy Scores",
x = "Actual Fantasy Scores",
y = "Predicted Fantasy Scores"
) +
theme_minimal()
Now we will compare a single player's real fantasy scores to those predicted by the linear model to see how well the model performs. The player chosen is the 2025 Brownlow Medal winner, Matt Rowell.
# Search for the player within the data set and attach the model's
# predictions (player_season_2025 itself has no Predicted_Fantasy_Score
# column; that was only added to test_results)
Matt_Rowell <- player_season_2025 %>%
  filter(player_first_name == "Matt", player_last_name == "Rowell")
Matt_Rowell <- Matt_Rowell %>%
  mutate(Predicted_Fantasy_Score = predict(lm1, newdata = Matt_Rowell))
# Convert to long format
Matt_Rowell <- Matt_Rowell %>%
select(match_round, Predicted_Fantasy_Score, Fantasy_Score) %>%
pivot_longer(cols = c(Predicted_Fantasy_Score, Fantasy_Score),
names_to = "type",
values_to = "score") %>%
mutate(match_round = case_when(match_round == "Finals Week 1" ~ "25",
match_round == "Semi Finals" ~ "26",
TRUE ~ match_round)) %>%
mutate(match_round = sprintf("%02d", as.numeric(match_round)))
ggplot(Matt_Rowell, aes(x = match_round, y = score, color = type, group = type)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(title = "Matt Rowell 2025 Fantasy Real vs Predicted scores",
x = "Round",
y = "Score") +
theme_minimal() +
scale_color_manual(values = c("Predicted_Fantasy_Score" = "gold", "Fantasy_Score" = "red"),
name = "Score Type") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
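To put a single number on the gap between the two lines, we can compute Rowell's mean absolute error across rounds (a small sketch using the long-format data built above):
# Average absolute gap between real and predicted scores per round
Matt_Rowell %>%
  pivot_wider(names_from = type, values_from = score) %>%
  summarise(MAE = mean(abs(Fantasy_Score - Predicted_Fantasy_Score)))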