Introduciton

In RStudio, we have access to a large pool of tools and packages that make coding quicker, cleaner, and easier to understand.

One of the most used resources is the dplyr package, a part of the tidyverse, which was created for data manipulation and transformation.

Setup

# Install the dplyr package
#install.packages("dplyr") #remove the hashtag before install for a one time install

# Activate the package before use
library(dplyr)

Note for Windows users: If you see a message saying “Rtools is required to build R packages but is not currently installed”, you’ll need to install Rtools first.

Go to https://cran.r-project.org/bin/windows/Rtools/

Download and install the version that matches your R version (e.g., Rtools44 for R 4.4).

Restart RStudio and try installing dplyr again.

Select

Using the mtcars dataset which is preinstalled in RStudio, the select function can be used to choose what columns that need to be kept

# Using the mtcars
head(mtcars) # View the first 6 rows of the dataset to see headings
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
horsepower <- mtcars %>% select(hp) # Use %>% as a pipeline function
head(horsepower)
##                    hp
## Mazda RX4         110
## Mazda RX4 Wag     110
## Datsun 710         93
## Hornet 4 Drive    110
## Hornet Sportabout 175
## Valiant           105
hp_and_weight <- mtcars %>% select(hp, wt) # select multiple columns
head(hp_and_weight)
##                    hp    wt
## Mazda RX4         110 2.620
## Mazda RX4 Wag     110 2.875
## Datsun 710         93 2.320
## Hornet 4 Drive    110 3.215
## Hornet Sportabout 175 3.440
## Valiant           105 3.460
not_hp <- mtcars %>% select(-hp) #select everything but hp
head(not_hp)
##                    mpg cyl disp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 2.76 3.460 20.22  1  0    3    1

Filter

Filter rows by a certain condition

fast_cars <- mtcars %>% filter(qsec > 20) # qsec is seconds to complete a quarter of a mile
fast_cars
##                mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Valiant       18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 230      22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Toyota Corona 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
mtcars %>% filter(mpg > 19, cyl == 6) # Filtering both 2 condition at once
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6

Arrange

Sort columns in ascending or descending values

head(mtcars %>% arrange(drat)) #ascending
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
head(mtcars %>% arrange(desc(drat))) #descending
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1

Mutate

Add or modify columns

mtcars_new <- mtcars %>% mutate(
  kmpl = mpg * 0.425144, #km per litre from miles per gallon
)

head(mtcars_new)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb     kmpl
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4 8.928024
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4 8.928024
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1 9.693283
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1 9.098082
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2 7.950193
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1 7.695106

Rename

Rename column names

mtcars_new <- mtcars_new %>% rename(weight = wt) #rename weight
head(mtcars_new)
##                    mpg cyl disp  hp drat weight  qsec vs am gear carb     kmpl
## Mazda RX4         21.0   6  160 110 3.90  2.620 16.46  0  1    4    4 8.928024
## Mazda RX4 Wag     21.0   6  160 110 3.90  2.875 17.02  0  1    4    4 8.928024
## Datsun 710        22.8   4  108  93 3.85  2.320 18.61  1  1    4    1 9.693283
## Hornet 4 Drive    21.4   6  258 110 3.08  3.215 19.44  1  0    3    1 9.098082
## Hornet Sportabout 18.7   8  360 175 3.15  3.440 17.02  0  0    3    2 7.950193
## Valiant           18.1   6  225 105 2.76  3.460 20.22  1  0    3    1 7.695106

Group_by and Summarise

Temporarily create new datasets only containing rows with matching variable values, then summarise to get extra metrics

mtcars_new %>%
  group_by(gear) %>% # Will get 3 separate groups for gears = 3,4 & 5
  summarise(
    avg_mpg = mean(mpg),
    avg_hp = mean(hp),
    total = n() # Number of cars in each group
  )
## # A tibble: 3 × 4
##    gear avg_mpg avg_hp total
##   <dbl>   <dbl>  <dbl> <int>
## 1     3    16.1  176.     15
## 2     4    24.5   89.5    12
## 3     5    21.4  196.      5

Use everything we have learned

mtcars_new %>% 
  filter(hp > 150) %>% 
  select(hp, weight, kmpl, cyl) %>% 
  group_by(cyl) %>% 
  summarise(avg_hp = mean(hp),
            avg_weight = mean(weight),
            avg_kmpl = mean(kmpl)
           ) %>% 
  arrange(desc(avg_weight))
## # A tibble: 2 × 4
##     cyl avg_hp avg_weight avg_kmpl
##   <dbl>  <dbl>      <dbl>    <dbl>
## 1     8   219.       4.09     6.40
## 2     6   175        2.77     8.38

Visualisation

Load in AFL data set, use dplyrto summarise stats and plot using ggplot2.

#install.packages("fitzRoy")
library(fitzRoy)

afl_2025 <- fetch_results_afltables(season = 2025)
head(afl_2025)
## # A tibble: 6 × 16
##    Game Date       Round Home.Team Home.Goals Home.Behinds Home.Points Away.Team
##   <dbl> <date>     <chr> <chr>          <int>        <int>       <int> <chr>    
## 1 16623 2025-03-07 R1    Sydney            11           10          76 Hawthorn 
## 2 16624 2025-03-09 R1    GWS               15           14         104 Collingw…
## 3 16625 2025-03-13 R2    Richmond          13            4          82 Carlton  
## 4 16626 2025-03-14 R2    Hawthorn          17            9         111 Essendon 
## 5 16627 2025-03-15 R2    Geelong           23            9         147 Fremantle
## 6 16628 2025-03-15 R2    Sydney            12           10          82 Brisbane…
## # ℹ 8 more variables: Away.Goals <int>, Away.Behinds <int>, Away.Points <int>,
## #   Venue <chr>, Margin <int>, Season <dbl>, Round.Type <chr>,
## #   Round.Number <int>
#Want points playing at Home and Away
afl_2025_points <- afl_2025 %>% 
  select(Home.Team, Home.Points, Away.Team, Away.Points)
head(afl_2025_points)
## # A tibble: 6 × 4
##   Home.Team Home.Points Away.Team      Away.Points
##   <chr>           <int> <chr>                <int>
## 1 Sydney             76 Hawthorn                96
## 2 GWS               104 Collingwood             52
## 3 Richmond           82 Carlton                 69
## 4 Hawthorn          111 Essendon                85
## 5 Geelong           147 Fremantle               69
## 6 Sydney             82 Brisbane Lions          86
# Calculate average home and away points per team
avg_home <- afl_2025_points %>%
  group_by(Home.Team) %>%
  summarise(avg_home_p = mean(Home.Points))

avg_away <- afl_2025_points %>%
  group_by(Away.Team) %>%
  summarise(avg_away_p = mean(Away.Points))

# Combine home and away averages
afl_2025_points_avg <- full_join(avg_home, avg_away, 
                                 by = c("Home.Team" = "Away.Team")) #Make Home and Away points in the same row
afl_2025_points_avg
## # A tibble: 18 × 3
##    Home.Team       avg_home_p avg_away_p
##    <chr>                <dbl>      <dbl>
##  1 Adelaide              94.2       98.3
##  2 Brisbane Lions        86.8       94.3
##  3 Carlton               81.5       74.6
##  4 Collingwood           90.7       80.2
##  5 Essendon              68.7       64.9
##  6 Footscray            109.       107. 
##  7 Fremantle             86.2       85.2
##  8 GWS                   96         87.5
##  9 Geelong              104.       106. 
## 10 Gold Coast           102.        84.2
## 11 Hawthorn              95.5       85.8
## 12 Melbourne             88.4       76.5
## 13 North Melbourne       80.8       76  
## 14 Port Adelaide         87.7       59.4
## 15 Richmond              63.6       62.4
## 16 St Kilda              83.3       76.3
## 17 Sydney                77.4       82.8
## 18 West Coast            67.1       60.7
#install.packages("ggplot2")
library(ggplot2)

#install.packages("ggrepel")
library(ggrepel)

ggplot(data = afl_2025_points_avg, aes(x = avg_home_p, y = avg_away_p, label = Home.Team)) +
  geom_point() +
  geom_text_repel() + # Ensures not team labals are overlapping
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") + # Home = Away Points
  
  # Labels
  labs(
    title = "AFL 2025: Average Home vs Away Points",
    subtitle = "Each point represents a team — red line shows equal home & away scoring",
    x = "Average Home Points",
    y = "Average Away Points")

All teams plotted below the red line score better at home, above the red line score better away

New AFL dataset

This time we will compare player’s average kicks with their average handballs and plot separating by team.

afl_2025_player_stats <- fetch_player_stats_afltables(season = 2025)
head(afl_2025_player_stats)
## # A tibble: 6 × 81
##   Season Round Date       Local.start.time Venue  Attendance First.name Surname 
##    <int> <chr> <date>                <int> <chr>       <int> <chr>      <chr>   
## 1   2025 1     2025-03-07             1940 S.C.G.      40310 Taylor     Adams   
## 2   2025 1     2025-03-07             1940 S.C.G.      40310 Joel       Amartey 
## 3   2025 1     2025-03-07             1940 S.C.G.      40310 Nick       Blakey  
## 4   2025 1     2025-03-07             1940 S.C.G.      40310 Braeden    Campbell
## 5   2025 1     2025-03-07             1940 S.C.G.      40310 Oliver     Florent 
## 6   2025 1     2025-03-07             1940 S.C.G.      40310 Brodie     Grundy  
## # ℹ 73 more variables: ID <int>, Jumper.No. <chr>, Playing.for <chr>,
## #   Kicks <int>, Marks <int>, Handballs <int>, Disposals <int>, Goals <int>,
## #   Behinds <int>, Hit.Outs <int>, Tackles <int>, Rebounds <int>,
## #   Inside.50s <int>, Clearances <int>, Clangers <int>, Frees.For <int>,
## #   Frees.Against <int>, Brownlow.Votes <int>, Contested.Possessions <int>,
## #   Uncontested.Possessions <int>, Contested.Marks <int>,
## #   Marks.Inside.50 <int>, One.Percenters <int>, Bounces <int>, …
# Select required stats
afl_2025_player_stats <- afl_2025_player_stats %>% 
  select(Round, First.name, Surname, Playing.for, Kicks, Handballs)
head(afl_2025_player_stats)
## # A tibble: 6 × 6
##   Round First.name Surname  Playing.for Kicks Handballs
##   <chr> <chr>      <chr>    <chr>       <int>     <int>
## 1 1     Taylor     Adams    Sydney          4         4
## 2 1     Joel       Amartey  Sydney          7         4
## 3 1     Nick       Blakey   Sydney         21         5
## 4 1     Braeden    Campbell Sydney          6         2
## 5 1     Oliver     Florent  Sydney          9         7
## 6 1     Brodie     Grundy   Sydney          3         8
#Get average for each player
avg_stats <- afl_2025_player_stats %>% 
  group_by(First.name, Surname, Playing.for) %>% 
  summarise(avg_kicks = mean(Kicks),
            avg_hb = mean(Handballs))
head(avg_stats)
## # A tibble: 6 × 5
## # Groups:   First.name, Surname [6]
##   First.name Surname  Playing.for            avg_kicks avg_hb
##   <chr>      <chr>    <chr>                      <dbl>  <dbl>
## 1 Aaron      Cadman   Greater Western Sydney      7.21   2.54
## 2 Aaron      Francis  Sydney                      5.5    3.3 
## 3 Aaron      Naughton Western Bulldogs            8.22   3.57
## 4 Adam       Cerra    Carlton                    13.6   11   
## 5 Adam       Saad     Carlton                    12.6    5.43
## 6 Adam       Treloar  Western Bulldogs           11     10.2
ggplot(data = avg_stats, aes(x = avg_kicks, y = avg_hb)) +
  geom_point() +
   geom_text_repel(
    aes(label = ifelse(avg_kicks + avg_hb >= 28, Surname, "")), # Only label players averaging 28+ disposals
    size = 2
  ) +
  facet_wrap(~Playing.for) + # Separate by team
  labs(
    title = "AFL 2025: Player Kicks vs Handballs by Team",
    subtitle = "Players averaging 28+ disposals per game are labelled",
    x = "Average Kicks per Game",
    y = "Average Handballs per Game"
  )