These are the required packages for this analysis. Note: the packages must be installed first if this has not already been done.
# install packages, remove hashtag to run install
# install.packages('tidyverse')
# install.packages('fitzRoy')
# load packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.3 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.3
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(fitzRoy)
Data is taken from the fitzRoy package, which provides access to datasets for AFL competitions. In this analysis we fetch player statistics with ‘fetch_player_stats_afltables’, restricting the request to the 2023 season through its season argument.
# Fetch player statistics for the 2023 season using fitzRoy's afltables fetcher
afl2023 <- fitzRoy::fetch_player_stats_afltables(season = 2023)
## ℹ Looking for data from 2023-01-01 to 2023-11-03
##
ℹ fetching cached data from <github.com>
✔ fetching cached data from <github.com> ... done
## Finished getting afltables data
# Preview the first rows of the fetched data
head(afl2023)
## # A tibble: 6 × 59
## Season Round Date Local.start.time Venue Attendance Home.team HQ1G
## <dbl> <chr> <date> <int> <chr> <dbl> <chr> <int>
## 1 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 2 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 3 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 4 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 5 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## 6 2023 1 2023-03-16 1920 M.C.G. 88084 Richmond 1
## # ℹ 51 more variables: HQ1B <int>, HQ2G <int>, HQ2B <int>, HQ3G <int>,
## # HQ3B <int>, HQ4G <int>, HQ4B <int>, Home.score <int>, Away.team <chr>,
## # AQ1G <int>, AQ1B <int>, AQ2G <int>, AQ2B <int>, AQ3G <int>, AQ3B <int>,
## # AQ4G <int>, AQ4B <int>, Away.score <int>, First.name <chr>, Surname <chr>,
## # ID <dbl>, Jumper.No. <chr>, Playing.for <chr>, Kicks <dbl>, Marks <dbl>,
## # Handballs <dbl>, Goals <dbl>, Behinds <dbl>, Hit.Outs <dbl>, Tackles <dbl>,
## # Rebounds <dbl>, Inside.50s <dbl>, Clearances <dbl>, Clangers <dbl>, …
The dataset contains individual player stats, but we want match averages for each team, which we get by grouping with the group_by function. Because we want each team's stats for both its home and away games, a separate dataset is built for each and the two are then merged together. We also create variables for scoring shots and match outcome (i.e. score).
# Get dataset for home team stats, with relevant stats.
# Result: one row per (Round, Home.team) holding the home side's per-player
# match averages, the opponent, the margin and the match outcome score.
afl23_team_stats <- afl2023 %>%
  # Restrict to the home side's own players before aggregating. Summarising
  # both sides and de-duplicating on (Round, Home.team) afterwards keeps
  # whichever team sorts first alphabetically, which is not always the home
  # team, and that match is then lost by the `Playing.for != Opponent` filter.
  # NOTE(review): assumes Playing.for uses the same team names as Home.team;
  # confirm against the fitzRoy data.
  filter(Playing.for == Home.team) %>%
  group_by(Round, Home.team, Playing.for) %>%
  summarise(
    # Match-level columns are constant within a group, so take the first value
    # to keep summarise() at exactly one row per group.
    Opponent = first(Away.team),
    Ave_Kicks = mean(Kicks),
    Ave_Marks = mean(Marks),
    Ave_Handballs = mean(Handballs),
    Ave_Goals = mean(Goals),
    Ave_Behinds = mean(Behinds),
    Ave_Scoring_shots = mean(Goals) + mean(Behinds),
    Ave_Hit_Outs = mean(Hit.Outs),
    Ave_Tackles = mean(Tackles),
    Ave_Rebounds = mean(Rebounds),
    Ave_Inside_50s = mean(Inside.50s),
    Ave_Clearances = mean(Clearances),
    Ave_Clangers = mean(Clangers),
    Ave_Frees_For = mean(Frees.For),
    Ave_Frees_Against = mean(Frees.Against),
    Ave_Brownlow_votes = mean(Brownlow.Votes),
    Ave_Contested_Possessions = mean(Contested.Possessions),
    Ave_Uncontested_Possessions = mean(Uncontested.Possessions),
    Ave_Contested_Marks = mean(Contested.Marks),
    Ave_Marks_Inside_50 = mean(Marks.Inside.50),
    Ave_One_Percenters = mean(One.Percenters),
    Ave_Bounces = mean(Bounces),
    # Margin from the home side's point of view
    Margin = first(Home.score) - first(Away.score),
    # Outcome score: 1 = win, 0 = loss, 0.5 = draw
    Score = ifelse(Margin > 0, 1,
                   ifelse(Margin < 0, 0, 0.5)),
    .groups = "drop"
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Round', 'Home.team', 'Playing.for'. You
## can override using the `.groups` argument.
# Get dataset for away team stats, with relevant stats.
# Result: one row per (Round, Away.team) holding the away side's per-player
# match averages, the opponent, the margin and the match outcome score.
afl23_team_stats_away <- afl2023 %>%
  # Restrict to the away side's own players before aggregating. Summarising
  # both sides and de-duplicating on (Round, Away.team) afterwards keeps
  # whichever team sorts first alphabetically, which is not always the away
  # team, and that match is then lost by the `Playing.for != Opponent` filter.
  # NOTE(review): assumes Playing.for uses the same team names as Away.team;
  # confirm against the fitzRoy data.
  filter(Playing.for == Away.team) %>%
  group_by(Round, Away.team, Playing.for) %>%
  summarise(
    # Match-level columns are constant within a group, so take the first value
    # to keep summarise() at exactly one row per group.
    Opponent = first(Home.team),
    Ave_Kicks = mean(Kicks),
    Ave_Marks = mean(Marks),
    Ave_Handballs = mean(Handballs),
    Ave_Goals = mean(Goals),
    Ave_Behinds = mean(Behinds),
    Ave_Scoring_shots = mean(Goals) + mean(Behinds),
    Ave_Hit_Outs = mean(Hit.Outs),
    Ave_Tackles = mean(Tackles),
    Ave_Rebounds = mean(Rebounds),
    Ave_Inside_50s = mean(Inside.50s),
    Ave_Clearances = mean(Clearances),
    Ave_Clangers = mean(Clangers),
    Ave_Frees_For = mean(Frees.For),
    Ave_Frees_Against = mean(Frees.Against),
    Ave_Brownlow_votes = mean(Brownlow.Votes),
    Ave_Contested_Possessions = mean(Contested.Possessions),
    Ave_Uncontested_Possessions = mean(Uncontested.Possessions),
    Ave_Contested_Marks = mean(Contested.Marks),
    Ave_Marks_Inside_50 = mean(Marks.Inside.50),
    Ave_One_Percenters = mean(One.Percenters),
    Ave_Bounces = mean(Bounces),
    # Margin from the away side's point of view
    Margin = first(Away.score) - first(Home.score),
    # Outcome score: 1 = win, 0 = loss, 0.5 = draw
    Score = ifelse(Margin > 0, 1,
                   ifelse(Margin < 0, 0, 0.5)),
    .groups = "drop"
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Round', 'Away.team', 'Playing.for'. You
## can override using the `.groups` argument.
# Home dataset: keep only rows whose stats belong to the focus team itself
# (the team played for is not the listed opponent), rename the focus team
# column to Team and drop the now-redundant Playing.for column.
afl23_team_stats <- afl23_team_stats %>%
  filter(Playing.for != Opponent) %>%
  rename(Team = Home.team) %>%
  select(-Playing.for)

# Away dataset: same treatment, with Away.team as the focus team column.
afl23_team_stats_away <- afl23_team_stats_away %>%
  filter(Playing.for != Opponent) %>%
  rename(Team = Away.team) %>%
  select(-Playing.for)

# Stack the home rows on top of the away rows to get one combined dataset.
afl23_team_stats <- bind_rows(afl23_team_stats, afl23_team_stats_away)
We use group_by again, but this time we do not group by round, which gives each team's overall stats for the season. We also create variables for goal kicking accuracy and total wins.
# Collapse the per-match rows to one row per team: every Ave_* column becomes
# the mean of that team's match values, and Wins is the sum of the outcome
# scores (a draw contributes 0.5).
afl23_overall_stats <- afl23_team_stats %>%
  group_by(Team) %>%
  summarise(across(starts_with("Ave_"), mean),
            Wins = sum(Score)) %>%
  mutate(
    # Scoring shots are the team's average goals plus average behinds
    Ave_Scoring_shots = Ave_Goals + Ave_Behinds,
    # Goal kicking accuracy: goals as a percentage of scoring shots
    Goal_Kick_Accuracy = (Ave_Goals / Ave_Scoring_shots) * 100
  )
This dataset provides a ranking for each team on each variable (1-18 when all 18 teams are present). This is done using the base R rank function. Each variable is negated before ranking so that the highest value gets rank 1 (the top rating) and the lowest value gets the worst rank.
# Rank the teams on every averaged statistic and on total wins.
# Values are negated before ranking so the largest value gets rank 1; tied
# teams share the smallest rank of the tie (ties.method = "min").
# mutate() is used rather than summarise() because the result has one row per
# team, and summarise() returning more than one row per group is deprecated.
# NOTE(review): Clangers and Frees_Against are also ranked largest-first, so
# rank 1 there means "most", which may not mean "best" - confirm the intent.
afl23_rankings <- afl23_overall_stats %>%
  # Goal_Kick_Accuracy is deliberately left out: it is not ranked
  select(Team, starts_with("Ave_"), Wins) %>%
  mutate(across(-Team, ~ rank(-.x, ties.method = "min"))) %>%
  # Ave_Kicks -> Rank_Kicks, Ave_Marks -> Rank_Marks, ...
  rename_with(~ sub("^Ave_", "Rank_", .x), .cols = starts_with("Ave_")) %>%
  rename(Rank_Wins = Wins)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
To plot the rankings data we first filter Collingwood's rankings into their own dataset. The dataset also needs to be put into long format in order to plot it; this is done with pivot_longer.
# Pull out Collingwood's row and reshape it to long format: one row per
# Rank_* column, with the column name in Variable and its value in Rank.
Collingwood_ranks <- pivot_longer(
  filter(afl23_rankings, Team == "Collingwood"),
  cols = starts_with("Rank_"),
  names_to = "Variable",
  values_to = "Rank"
)

# Quick first look: one point per ranked variable.
ggplot(data = Collingwood_ranks, mapping = aes(x = Variable, y = Rank)) +
  geom_point()
Now make the plot look nice. Invert the y axis with scale_y_reverse so that rank 1 is at the top, rotate the x axis labels with theme(axis.text.x) so that they are legible, and add a title and axis labels.
# Polished version of the rankings plot.
ggplot(data = Collingwood_ranks, mapping = aes(x = Variable, y = Rank)) +
  geom_point() +
  # Flip the y axis so that rank 1 sits at the top
  scale_y_reverse() +
  labs(
    title = "Collingwood Rankings for Each Variable",
    x = "Variable",
    y = "Rank"
  ) +
  theme_minimal() +
  # Angle the x axis labels so the long variable names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))