In RStudio, we have access to a large pool of tools and packages that make coding quicker, cleaner, and easier to understand.
One of the most used resources is the dplyr package, a
part of the tidyverse, which was created for data
manipulation and transformation.
# Install the dplyr package
#install.packages("dplyr") #remove the hashtag before install for a one time install
# Activate the package before use
library(dplyr)
Note for Windows users: If you see a message saying “Rtools is required to build R packages but is not currently installed”, you’ll need to install Rtools first.
Go to https://cran.r-project.org/bin/windows/Rtools/
Download and install the version that matches your R version (e.g., Rtools44 for R 4.4).
Restart RStudio and try installing dplyr again.
Using the mtcars dataset which is preinstalled in
RStudio, the select function can be used to choose what columns that
need to be kept
# Using the mtcars
head(mtcars) # View the first 6 rows of the dataset to see headings
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
horsepower <- mtcars %>% select(hp) # Use %>% as a pipeline function
head(horsepower)
## hp
## Mazda RX4 110
## Mazda RX4 Wag 110
## Datsun 710 93
## Hornet 4 Drive 110
## Hornet Sportabout 175
## Valiant 105
hp_and_weight <- mtcars %>% select(hp, wt) # select multiple columns
head(hp_and_weight)
## hp wt
## Mazda RX4 110 2.620
## Mazda RX4 Wag 110 2.875
## Datsun 710 93 2.320
## Hornet 4 Drive 110 3.215
## Hornet Sportabout 175 3.440
## Valiant 105 3.460
not_hp <- mtcars %>% select(-hp) #select everything but hp
head(not_hp)
## mpg cyl disp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 2.76 3.460 20.22 1 0 3 1
Filter rows by a certain condition
fast_cars <- mtcars %>% filter(qsec > 20) # qsec is seconds to complete a quarter of a mile
fast_cars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
mtcars %>% filter(mpg > 19, cyl == 6) # Filtering both 2 condition at once
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
Sort columns in ascending or descending values
head(mtcars %>% arrange(drat)) #ascending
## mpg cyl disp hp drat wt qsec vs am gear carb
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
head(mtcars %>% arrange(desc(drat))) #descending
## mpg cyl disp hp drat wt qsec vs am gear carb
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Add or modify columns
mtcars_new <- mtcars %>% mutate(
kmpl = mpg * 0.425144, #km per litre from miles per gallon
)
head(mtcars_new)
## mpg cyl disp hp drat wt qsec vs am gear carb kmpl
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 8.928024
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 8.928024
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 9.693283
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 9.098082
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 7.950193
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 7.695106
Rename column names
mtcars_new <- mtcars_new %>% rename(weight = wt) #rename weight
head(mtcars_new)
## mpg cyl disp hp drat weight qsec vs am gear carb kmpl
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 8.928024
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 8.928024
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 9.693283
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 9.098082
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 7.950193
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 7.695106
Temporarily create new datasets only containing rows with matching variable values, then summarise to get extra metrics
mtcars_new %>%
group_by(gear) %>% # Will get 3 separate groups for gears = 3,4 & 5
summarise(
avg_mpg = mean(mpg),
avg_hp = mean(hp),
total = n() # Number of cars in each group
)
## # A tibble: 3 × 4
## gear avg_mpg avg_hp total
## <dbl> <dbl> <dbl> <int>
## 1 3 16.1 176. 15
## 2 4 24.5 89.5 12
## 3 5 21.4 196. 5
mtcars_new %>%
filter(hp > 150) %>%
select(hp, weight, kmpl, cyl) %>%
group_by(cyl) %>%
summarise(avg_hp = mean(hp),
avg_weight = mean(weight),
avg_kmpl = mean(kmpl)
) %>%
arrange(desc(avg_weight))
## # A tibble: 2 × 4
## cyl avg_hp avg_weight avg_kmpl
## <dbl> <dbl> <dbl> <dbl>
## 1 8 219. 4.09 6.40
## 2 6 175 2.77 8.38
Load in AFL data set, use dplyrto summarise stats and
plot using ggplot2.
#install.packages("fitzRoy")
library(fitzRoy)
afl_2025 <- fetch_results_afltables(season = 2025)
head(afl_2025)
## # A tibble: 6 × 16
## Game Date Round Home.Team Home.Goals Home.Behinds Home.Points Away.Team
## <dbl> <date> <chr> <chr> <int> <int> <int> <chr>
## 1 16623 2025-03-07 R1 Sydney 11 10 76 Hawthorn
## 2 16624 2025-03-09 R1 GWS 15 14 104 Collingw…
## 3 16625 2025-03-13 R2 Richmond 13 4 82 Carlton
## 4 16626 2025-03-14 R2 Hawthorn 17 9 111 Essendon
## 5 16627 2025-03-15 R2 Geelong 23 9 147 Fremantle
## 6 16628 2025-03-15 R2 Sydney 12 10 82 Brisbane…
## # ℹ 8 more variables: Away.Goals <int>, Away.Behinds <int>, Away.Points <int>,
## # Venue <chr>, Margin <int>, Season <dbl>, Round.Type <chr>,
## # Round.Number <int>
#Want points playing at Home and Away
afl_2025_points <- afl_2025 %>%
select(Home.Team, Home.Points, Away.Team, Away.Points)
head(afl_2025_points)
## # A tibble: 6 × 4
## Home.Team Home.Points Away.Team Away.Points
## <chr> <int> <chr> <int>
## 1 Sydney 76 Hawthorn 96
## 2 GWS 104 Collingwood 52
## 3 Richmond 82 Carlton 69
## 4 Hawthorn 111 Essendon 85
## 5 Geelong 147 Fremantle 69
## 6 Sydney 82 Brisbane Lions 86
# Calculate average home and away points per team
avg_home <- afl_2025_points %>%
group_by(Home.Team) %>%
summarise(avg_home_p = mean(Home.Points))
avg_away <- afl_2025_points %>%
group_by(Away.Team) %>%
summarise(avg_away_p = mean(Away.Points))
# Combine home and away averages
afl_2025_points_avg <- full_join(avg_home, avg_away,
by = c("Home.Team" = "Away.Team")) #Make Home and Away points in the same row
afl_2025_points_avg
## # A tibble: 18 × 3
## Home.Team avg_home_p avg_away_p
## <chr> <dbl> <dbl>
## 1 Adelaide 94.2 98.3
## 2 Brisbane Lions 86.8 94.3
## 3 Carlton 81.5 74.6
## 4 Collingwood 90.7 80.2
## 5 Essendon 68.7 64.9
## 6 Footscray 109. 107.
## 7 Fremantle 86.2 85.2
## 8 GWS 96 87.5
## 9 Geelong 104. 106.
## 10 Gold Coast 102. 84.2
## 11 Hawthorn 95.5 85.8
## 12 Melbourne 88.4 76.5
## 13 North Melbourne 80.8 76
## 14 Port Adelaide 87.7 59.4
## 15 Richmond 63.6 62.4
## 16 St Kilda 83.3 76.3
## 17 Sydney 77.4 82.8
## 18 West Coast 67.1 60.7
#install.packages("ggplot2")
library(ggplot2)
#install.packages("ggrepel")
library(ggrepel)
ggplot(data = afl_2025_points_avg, aes(x = avg_home_p, y = avg_away_p, label = Home.Team)) +
geom_point() +
geom_text_repel() + # Ensures not team labals are overlapping
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") + # Home = Away Points
# Labels
labs(
title = "AFL 2025: Average Home vs Away Points",
subtitle = "Each point represents a team — red line shows equal home & away scoring",
x = "Average Home Points",
y = "Average Away Points")
All teams plotted below the red line score better at home, above the red line score better away
This time we will compare player’s average kicks with their average handballs and plot separating by team.
afl_2025_player_stats <- fetch_player_stats_afltables(season = 2025)
head(afl_2025_player_stats)
## # A tibble: 6 × 81
## Season Round Date Local.start.time Venue Attendance First.name Surname
## <int> <chr> <date> <int> <chr> <int> <chr> <chr>
## 1 2025 1 2025-03-07 1940 S.C.G. 40310 Taylor Adams
## 2 2025 1 2025-03-07 1940 S.C.G. 40310 Joel Amartey
## 3 2025 1 2025-03-07 1940 S.C.G. 40310 Nick Blakey
## 4 2025 1 2025-03-07 1940 S.C.G. 40310 Braeden Campbell
## 5 2025 1 2025-03-07 1940 S.C.G. 40310 Oliver Florent
## 6 2025 1 2025-03-07 1940 S.C.G. 40310 Brodie Grundy
## # ℹ 73 more variables: ID <int>, Jumper.No. <chr>, Playing.for <chr>,
## # Kicks <int>, Marks <int>, Handballs <int>, Disposals <int>, Goals <int>,
## # Behinds <int>, Hit.Outs <int>, Tackles <int>, Rebounds <int>,
## # Inside.50s <int>, Clearances <int>, Clangers <int>, Frees.For <int>,
## # Frees.Against <int>, Brownlow.Votes <int>, Contested.Possessions <int>,
## # Uncontested.Possessions <int>, Contested.Marks <int>,
## # Marks.Inside.50 <int>, One.Percenters <int>, Bounces <int>, …
# Select required stats
afl_2025_player_stats <- afl_2025_player_stats %>%
select(Round, First.name, Surname, Playing.for, Kicks, Handballs)
head(afl_2025_player_stats)
## # A tibble: 6 × 6
## Round First.name Surname Playing.for Kicks Handballs
## <chr> <chr> <chr> <chr> <int> <int>
## 1 1 Taylor Adams Sydney 4 4
## 2 1 Joel Amartey Sydney 7 4
## 3 1 Nick Blakey Sydney 21 5
## 4 1 Braeden Campbell Sydney 6 2
## 5 1 Oliver Florent Sydney 9 7
## 6 1 Brodie Grundy Sydney 3 8
#Get average for each player
avg_stats <- afl_2025_player_stats %>%
group_by(First.name, Surname, Playing.for) %>%
summarise(avg_kicks = mean(Kicks),
avg_hb = mean(Handballs))
head(avg_stats)
## # A tibble: 6 × 5
## # Groups: First.name, Surname [6]
## First.name Surname Playing.for avg_kicks avg_hb
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Aaron Cadman Greater Western Sydney 7.21 2.54
## 2 Aaron Francis Sydney 5.5 3.3
## 3 Aaron Naughton Western Bulldogs 8.22 3.57
## 4 Adam Cerra Carlton 13.6 11
## 5 Adam Saad Carlton 12.6 5.43
## 6 Adam Treloar Western Bulldogs 11 10.2
ggplot(data = avg_stats, aes(x = avg_kicks, y = avg_hb)) +
geom_point() +
geom_text_repel(
aes(label = ifelse(avg_kicks + avg_hb >= 28, Surname, "")), # Only label players averaging 28+ disposals
size = 2
) +
facet_wrap(~Playing.for) + # Separate by team
labs(
title = "AFL 2025: Player Kicks vs Handballs by Team",
subtitle = "Players averaging 28+ disposals per game are labelled",
x = "Average Kicks per Game",
y = "Average Handballs per Game"
)