These are the packages required for this analysis. Note: they will need to be installed first if this has not already been done.
# install packages, remove hashtag to run install
# install.packages('tidyverse')
# install.packages('fitzRoy')
# load packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.3 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.3
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(fitzRoy)
Data is taken from the fitzRoy package, which provides datasets for AFL competitions. In this analysis we take player statistics from `fetch_player_stats_afltables`, restricting the request to the 2023 season through its `season` argument.
# Download the player-level AFL Tables records, restricted to the 2023 season.
afl2023 <- fitzRoy::fetch_player_stats_afltables(
  season = 2023
)
## ℹ Looking for data from 2023-01-01 to 2023-11-03
##
ℹ fetching cached data from <github.com>
✔ fetching cached data from <github.com> ... done
## Finished getting afltables data
# Preview the first rows to check which columns were returned.
afl2023 %>%
  head()
## # A tibble: 6 × 59
## Season Round Date Local.start.time Venue Attendance Home.team HQ1G
## <dbl> <chr> <date> <int> <chr> <dbl> <chr> <int>
## 1 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 2 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 3 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 4 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 5 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 6 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## # ℹ 51 more variables: HQ1B <int>, HQ2G <int>, HQ2B <int>, HQ3G <int>,
## # HQ3B <int>, HQ4G <int>, HQ4B <int>, Home.score <int>, Away.team <chr>,
## # AQ1G <int>, AQ1B <int>, AQ2G <int>, AQ2B <int>, AQ3G <int>, AQ3B <int>,
## # AQ4G <int>, AQ4B <int>, Away.score <int>, First.name <chr>, Surname <chr>,
## # ID <dbl>, Jumper.No. <chr>, Playing.for <chr>, Kicks <dbl>, Marks <dbl>,
## # Handballs <dbl>, Goals <dbl>, Behinds <dbl>, Hit.Outs <dbl>, Tackles <dbl>,
## # Rebounds <dbl>, Inside.50s <dbl>, Clearances <dbl>, Clangers <dbl>, …
The dataset contains individual player stats, but we want match averages for each team. This is done using the group_by function. However, since we want stats for both the home and away games of each team, a separate dataset is built for each and the two are then merged together. We also create variables for scoring shots and match outcome (i.e. score).
# Get dataset for home team stats, with relevant stats.
#
# Rows belonging to the away side's players are dropped before summarising,
# so every (Round, Home.team, Playing.for) group describes the home side of
# one match and collapses to exactly one row.
# Opponent and the two scores repeat on every player row of a match, so
# first() reads them once. Passing the whole column instead makes summarise()
# return one row per player, which is deprecated since dplyr 1.1.0.
# Playing.for stays in the grouping so the column is still present for the
# later clean-up steps that filter on it and then drop it.
afl23_team_stats <- afl2023 %>%
  filter(Playing.for != Away.team) %>%
  group_by(Round, Home.team, Playing.for) %>%
  summarise(
    Opponent = first(Away.team),
    Ave_Kicks = mean(Kicks),
    Ave_Marks = mean(Marks),
    Ave_Handballs = mean(Handballs),
    Ave_Goals = mean(Goals),
    Ave_Behinds = mean(Behinds),
    Ave_Scoring_shots = mean(Goals) + mean(Behinds),
    Ave_Hit_Outs = mean(Hit.Outs),
    Ave_Tackles = mean(Tackles),
    Ave_Rebounds = mean(Rebounds),
    Ave_Inside_50s = mean(Inside.50s),
    Ave_Clearances = mean(Clearances),
    Ave_Clangers = mean(Clangers),
    Ave_Frees_For = mean(Frees.For),
    Ave_Frees_Against = mean(Frees.Against),
    Ave_Contested_Possessions = mean(Contested.Possessions),
    Ave_Uncontested_Possessions = mean(Uncontested.Possessions),
    Ave_Contested_Marks = mean(Contested.Marks),
    Ave_Marks_Inside_50 = mean(Marks.Inside.50),
    Ave_One_Percenters = mean(One.Percenters),
    Ave_Bounces = mean(Bounces),
    # Margin from the home side's point of view
    Margin = first(Home.score) - first(Away.score),
    # Match outcome: 1 = win, 0 = loss, 0.5 = draw
    Score = ifelse(Margin > 0, 1,
                   ifelse(Margin < 0, 0, 0.5)),
    .groups = "drop"
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Round', 'Home.team', 'Playing.for'. You
## can override using the `.groups` argument.
# Get dataset for away team stats, with relevant stats.
#
# Rows belonging to the home side's players are dropped before summarising,
# so every (Round, Away.team, Playing.for) group describes the away side of
# one match and collapses to exactly one row.
# Opponent and the two scores repeat on every player row of a match, so
# first() reads them once. Passing the whole column instead makes summarise()
# return one row per player, which is deprecated since dplyr 1.1.0.
# Playing.for stays in the grouping so the column is still present for the
# later clean-up steps that filter on it and then drop it.
afl23_team_stats_away <- afl2023 %>%
  filter(Playing.for != Home.team) %>%
  group_by(Round, Away.team, Playing.for) %>%
  summarise(
    Opponent = first(Home.team),
    Ave_Kicks = mean(Kicks),
    Ave_Marks = mean(Marks),
    Ave_Handballs = mean(Handballs),
    Ave_Goals = mean(Goals),
    Ave_Behinds = mean(Behinds),
    Ave_Scoring_shots = mean(Goals) + mean(Behinds),
    Ave_Hit_Outs = mean(Hit.Outs),
    Ave_Tackles = mean(Tackles),
    Ave_Rebounds = mean(Rebounds),
    Ave_Inside_50s = mean(Inside.50s),
    Ave_Clearances = mean(Clearances),
    Ave_Clangers = mean(Clangers),
    Ave_Frees_For = mean(Frees.For),
    Ave_Frees_Against = mean(Frees.Against),
    Ave_Contested_Possessions = mean(Contested.Possessions),
    Ave_Uncontested_Possessions = mean(Uncontested.Possessions),
    Ave_Contested_Marks = mean(Contested.Marks),
    Ave_Marks_Inside_50 = mean(Marks.Inside.50),
    Ave_One_Percenters = mean(One.Percenters),
    Ave_Bounces = mean(Bounces),
    # Margin from the away side's point of view
    Margin = first(Away.score) - first(Home.score),
    # Match outcome: 1 = win, 0 = loss, 0.5 = draw
    Score = ifelse(Margin > 0, 1,
                   ifelse(Margin < 0, 0, 0.5)),
    .groups = "drop"
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Round', 'Away.team', 'Playing.for'. You
## can override using the `.groups` argument.
# Home table: discard rows describing the opponent's players, label the
# focus team column "Team", then drop the helper column Playing.for.
afl23_team_stats <- afl23_team_stats %>%
  filter(Playing.for != Opponent) %>%
  rename(any_of(c(Team = "Home.team"))) %>%
  select(-Playing.for)

# Away table: the same three steps, with Away.team becoming "Team".
afl23_team_stats_away <- afl23_team_stats_away %>%
  filter(Playing.for != Opponent) %>%
  rename(any_of(c(Team = "Away.team"))) %>%
  select(-Playing.for)

# Stack the home rows on top of the away rows to get one row per team per match.
afl23_team_stats <- rbind(afl23_team_stats, afl23_team_stats_away)
We use group_by again, but this time we do not group by round. This provides each team's overall stats. We also create variables for goal kicking accuracy and total wins.
# Collapse the per-match table to one row per team.
afl23_overall_stats <- afl23_team_stats %>%
  group_by(Team) %>%
  summarise(
    # season mean of every per-match "Ave_" column
    across(starts_with("Ave_"), mean),
    # Score is 1 / 0.5 / 0 per match, so its sum is the win tally
    Wins = sum(Score)
  ) %>%
  mutate(
    # scoring shots are rebuilt from the season means of goals and behinds
    Ave_Scoring_shots = Ave_Goals + Ave_Behinds,
    # share of scoring shots that were goals, as a percentage
    Goal_Kick_Accuracy = (Ave_Goals / Ave_Scoring_shots) * 100
  )
Here we plot each team's goal kicking accuracy, using ggplot and geom_point.
# Basic scatter: one point per team showing its goal kicking accuracy.
afl23_overall_stats %>%
  ggplot(mapping = aes(x = Team, y = Goal_Kick_Accuracy)) +
  geom_point()
Now make the graph look nicer by adding a title and axis labels, and by angling the team names so they are readable on the x axis.
# Same scatter with a title, axis labels and angled team names on the x axis.
afl23_overall_stats %>%
  ggplot(mapping = aes(x = Team, y = Goal_Kick_Accuracy)) +
  geom_point() +
  labs(
    title = "AFL Teams Goal Kicking Accuracy",
    x = "Team",
    y = "Accuracy (%)"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )
Now we assess how goal kicking accuracy relates to winning, using another plot.
# Accuracy against wins, one coloured and labelled point per team.
afl23_overall_stats %>%
  ggplot(mapping = aes(x = Goal_Kick_Accuracy, y = Wins, col = Team)) +
  geom_point() +
  # hjust/vjust = 1 place each team name below and to the left of its point
  geom_text(mapping = aes(label = Team), hjust = 1, vjust = 1) +
  labs(
    title = "AFL Teams Goal Kicking Accuracy, Wins relation",
    x = "Accuracy (%)",
    y = "Wins",
    col = "Team"
  )