#Load in required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fitzRoy)
library(dplyr)
library(purrr)
# Download the 2023 season's player statistics from AFL Tables via fitzRoy
afl2023 <- fetch_player_stats_afltables(season = 2023)
## ℹ Looking for data from 2023-01-01 to 2023-10-21
##
ℹ fetching cached data from <github.com>
✔ fetching cached data from <github.com> ... done
## Finished getting afltables data
# Build one row per team per match by totalling the player-level statistics.
# The stat columns are addressed by name (Kicks:One.Percenters) instead of by
# position, so the result no longer depends on the column order that fitzRoy
# happens to return. The match identifiers act as grouping keys; they are
# constant within a team's match, so they pass through unchanged.
afl_df <- afl2023 %>%
  summarise(
    across(Kicks:One.Percenters, sum),
    .by = c(Round, Home.team, Home.score, Away.team, Away.score, Playing.for)
  ) %>%
  # Store any text columns as factors
  mutate(across(where(is.character), as.factor))
# Rows where the team being described is the home side of the match
home_afl_df <- filter(afl_df, Home.team == Playing.for)
# Rows where the team being described is the away side of the match
away_afl_df <- filter(afl_df, Away.team == Playing.for)
Combine the home and away data frames so that both teams’ stats from the same game sit on the same row.
# Put each match on a single row: home-side rows on the left, away-side rows
# on the right, matched on the columns that both sides of a match share
combined_afl_df <- home_afl_df %>%
  left_join(
    away_afl_df,
    by = join_by(Round, Home.team, Away.team, Home.score, Away.score)
  )
# Tidy the join suffixes (.x came from the home-side table, .y from the
# away-side table) and add the margin as home score minus away score
combined_afl_df <- combined_afl_df %>%
  rename_with(function(nm) sub("\\.y$", ".a", sub("\\.x$", ".h", nm))) %>%
  mutate(margin = Home.score - Away.score)
Now that all the data pre-processing is done, we can start on the loop.
# Response variable for the regressions
outcome_var <- "margin"
# Candidate predictors: every column except the response and the columns
# listed below that should not be used as predictors
predictors <- setdiff(
  names(combined_afl_df),
  c(
    outcome_var,
    "Round", "Home.team", "Home.score", "Away.team", "Away.score",
    "Playing.for.h", "Playing.for.a",
    "Goals.a", "Goals.h", "Behinds.a", "Behinds.h",
    "Inside.50s.h", "Inside.50s.a",
    "Marks.Inside.50.h", "Marks.Inside.50.a",
    "Brownlow.Votes.h", "Brownlow.Votes.a"
  )
)
# Largest number of predictors allowed in a single model; run time grows as
# this is raised
max_predictors <- 3
# Fit a linear model for every combination of 1 to max_predictors predictors
# and collect the fit statistics, one row per model.
# The range of model sizes is capped at the number of available predictors
# because combn() errors when asked for more items than exist, and seq_len()
# gives an empty range (instead of c(1, 0)) when there is nothing to fit.
results <- seq_len(min(max_predictors, length(predictors))) %>%
  map(function(n_terms) {
    # Every possible set of predictors of this size
    predictor_sets <- combn(predictors, n_terms, simplify = FALSE)
    predictor_sets %>%
      map(function(terms) {
        # Build the formula text, e.g. "margin ~ a+b", and fit the model
        formula_str <- paste(outcome_var, "~", paste(terms, collapse = "+"))
        fit <- lm(as.formula(formula_str), data = combined_afl_df)
        # In-sample RMSE taken from the residuals, so rows that lm() dropped
        # for missing values do not turn the result into NA
        fit_rmse <- sqrt(mean(residuals(fit)^2))
        fit_r2 <- summary(fit)$r.squared
        fit_aic <- AIC(fit)
        fit_bic <- BIC(fit)
        # The statistics are computed above, under names that differ from the
        # output columns, because tibble() lets later arguments see earlier
        # columns (e.g. `model`) instead of the local variables
        tibble(
          model = formula_str,
          predictors = paste(terms, collapse = ", "),
          rmse = fit_rmse,
          r2 = fit_r2,
          AIC = fit_aic,
          BIC = fit_bic
        )
      }) %>%
      # Stack the one-row tibbles for this model size
      list_rbind()
  }) %>%
  # Stack the per-size tables into a single data frame
  list_rbind()
Choose your model based on the r2, rmse, BIC, and AIC.
rmse - the lower the better
r2 - the higher the better (it ranges from 0 to 1)
AIC - the lower the better
BIC - the lower the better (it penalizes model complexity more than AIC)
I chose the model below based on these factors and, just as importantly, because it seemed reasonable and logical (remember to think about logic when choosing your predictors).
# Final model: margin explained by home kicks, home contested marks and
# away kicks
model_margin <- lm(
  formula = margin ~ Kicks.h + Contested.Marks.h + Kicks.a,
  data = combined_afl_df
)
# Inspect the coefficient estimates; a predictor with a p-value below 0.05 is
# treated as significant
summary(model_margin)
##
## Call:
## lm(formula = margin ~ Kicks.h + Contested.Marks.h + Kicks.a,
## data = combined_afl_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77.705 -17.285 -1.464 17.802 85.104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.0802 36.5728 0.905 0.366755
## Kicks.h 0.7225 0.1068 6.764 1.29e-10 ***
## Contested.Marks.h 1.8421 0.5236 3.518 0.000531 ***
## Kicks.a -0.9592 0.1075 -8.926 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.89 on 212 degrees of freedom
## Multiple R-squared: 0.4879, Adjusted R-squared: 0.4806
## F-statistic: 67.32 on 3 and 212 DF, p-value: < 2.2e-16
As we can see, as both home kicks and home contested marks increased, the margin increased. Both predictors were significant.
The away kicks show us that as this variable increased, the margin decreased. This is also significant.