This document will present some data visualisation techniques in R using one of the most popular packages, ggplot2.
Load the required packages.
If any of the following packages have not been installed, install them first with the install.packages() function, then load them using the library() function.
library(dplyr)
library(ggplot2)
library(tidyverse)
To complete the following data visualisation exercises, netball player stats data from the 2017 Suncorp Super Netball season was scraped using the superNetballR package. Installing superNetballR requires the devtools package; if devtools is not already installed, please install it using the install.packages() function. The following code shows how to gather this data.
# devtools supplies install_github(), used to fetch superNetballR from GitHub.
library(devtools)
# Install superNetballR only when it is missing, so that re-running this
# document does not reinstall the package from GitHub every time.
if (!requireNamespace("superNetballR", quietly = TRUE)) {
  devtools::install_github("stevelane/superNetballR")
}
library(superNetballR)
# Load the packaged data set; it becomes available as `players_2017`.
data("players_2017")
# View() opens a spreadsheet-style viewer, so only call it when a person is
# at the console; head() below still prints the first rows in every case.
if (interactive()) {
  View(players_2017)
}
head(players_2017)
## # A tibble: 6 x 9
## playerId period shortDisplayName firstname surname stat value round game
## <int> <int> <chr> <chr> <chr> <chr> <chr> <int> <int>
## 1 80439 1 Proud, M Maddy Proud rebounds 0 1 1
## 2 80439 2 Proud, M Maddy Proud rebounds 0 1 1
## 3 80439 3 Proud, M Maddy Proud rebounds 0 1 1
## 4 80439 4 Proud, M Maddy Proud rebounds 0 1 1
## 5 80574 1 Hadley, P Paige Hadley rebounds 0 1 1
## 6 80574 2 Hadley, P Paige Hadley rebounds 0 1 1
Run a check on the variable names and types:
# Per-column summary of the long-format data: quartiles for the numeric
# columns, length/class/mode for the character columns.
players_2017 %>% summary()
## playerId period shortDisplayName firstname
## Min. : 0 Min. :1.00 Length:158532 Length:158532
## 1st Qu.: 80299 1st Qu.:1.75 Class :character Class :character
## Median : 80577 Median :2.50 Mode :character Mode :character
## Mean : 504645 Mean :2.50
## 3rd Qu.: 998401 3rd Qu.:3.25
## Max. :1009013 Max. :4.00
## surname stat value round
## Length:158532 Length:158532 Length:158532 Min. : 1.000
## Class :character Class :character Class :character 1st Qu.: 4.000
## Mode :character Mode :character Mode :character Median : 8.000
## Mean : 8.045
## 3rd Qu.:12.000
## Max. :17.000
## game
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.418
## 3rd Qu.:3.000
## Max. :4.000
From viewing the data as well as the summary, we can see that the data is in long format. In order to create the following data visualisations, the data needs to be transformed into wide format, so that each stat has its own column containing the corresponding values. This is done by using the spread() function (in current versions of tidyr, pivot_wider() is the recommended replacement and performs the same reshaping). We also want to subset the data to only include the regular season (14 rounds).
# Reshape from long to wide: one column per stat, filled from `value`.
# pivot_wider() is tidyr's current replacement for the superseded spread().
# names_sort = TRUE puts the new stat columns in alphabetical order, which is
# the column layout spread() produced.
# Then keep only the regular season (rounds 1 to 14).
player17stats <- players_2017 %>%
  pivot_wider(names_from = stat, values_from = value, names_sort = TRUE) %>%
  filter(round <= 14)
head(player17stats)
## # A tibble: 6 x 40
## playerId period shortDisplayName firstname surname round game badHands
## <int> <int> <chr> <chr> <chr> <int> <int> <chr>
## 1 80439 1 Proud, M Maddy Proud 1 1 0
## 2 80439 2 Proud, M Maddy Proud 1 1 0
## 3 80439 3 Proud, M Maddy Proud 1 1 0
## 4 80439 4 Proud, M Maddy Proud 1 1 0
## 5 80574 1 Hadley, P Paige Hadley 1 1 0
## 6 80574 2 Hadley, P Paige Hadley 1 1 0
## # ... with 32 more variables: badPasses <chr>, blocked <chr>, blocks <chr>,
## # breaks <chr>, centrePassReceives <chr>, contactPenalties <chr>,
## # currentPositionCode <chr>, defensiveRebounds <chr>, deflections <chr>,
## # disposals <chr>, feeds <chr>, gain <chr>, goalAssists <chr>,
## # goalAttempts <chr>, goalMisses <chr>, goals <chr>, intercepts <chr>,
## # minutesPlayed <chr>, missedGoalTurnover <chr>, obstructionPenalties <chr>,
## # offensiveRebounds <chr>, offsides <chr>, passes <chr>, penalties <chr>, ...
Check the data type of each variable using the glimpse() function.
# Print every column of the wide data with its type and first few values.
player17stats %>% glimpse()
## Rows: 4,484
## Columns: 40
## $ playerId <int> 80439, 80439, 80439, 80439, 80574, 80574, 80574, ~
## $ period <int> 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1~
## $ shortDisplayName <chr> "Proud, M", "Proud, M", "Proud, M", "Proud, M", "~
## $ firstname <chr> "Maddy", "Maddy", "Maddy", "Maddy", "Paige", "Pai~
## $ surname <chr> "Proud", "Proud", "Proud", "Proud", "Hadley", "Ha~
## $ round <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ game <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ badHands <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ badPasses <chr> "0", "0", "0", "0", "0", "0", "0", "1", "0", "0",~
## $ blocked <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ blocks <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ breaks <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ centrePassReceives <chr> "0", "0", "0", "0", "4", "11", "8", "7", "2", "0"~
## $ contactPenalties <chr> "2", "1", "3", "4", "1", "0", "2", "0", "1", "2",~
## $ currentPositionCode <chr> "C", "C", "C", "C", "WA", "WA", "WA", "WA", "WD",~
## $ defensiveRebounds <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ deflections <chr> "0", "1", "1", "0", "0", "0", "0", "1", "0", "0",~
## $ disposals <chr> "22", "19", "26", "19", "9", "12", "9", "6", "0",~
## $ feeds <chr> "10", "5", "13", "5", "7", "12", "9", "6", "0", "~
## $ gain <chr> "1", "0", "1", "0", "0", "0", "1", "1", "1", "0",~
## $ goalAssists <chr> "5", "1", "9", "2", "4", "7", "3", "3", "0", "0",~
## $ goalAttempts <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ goalMisses <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ goals <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ intercepts <chr> "1", "0", "1", "0", "0", "0", "1", "1", "1", "0",~
## $ minutesPlayed <chr> "15", "15", "15", "15", "15", "15", "15", "15", "~
## $ missedGoalTurnover <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ obstructionPenalties <chr> "1", "0", "0", "0", "0", "0", "0", "1", "0", "0",~
## $ offensiveRebounds <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ offsides <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ passes <chr> "22", "19", "25", "19", "7", "12", "9", "6", "0",~
## $ penalties <chr> "3", "1", "3", "4", "1", "0", "2", "1", "1", "2",~
## $ pickups <chr> "0", "0", "1", "1", "0", "0", "0", "0", "0", "1",~
## $ possessions <chr> "13", "14", "15", "16", "6", "11", "9", "8", "3",~
## $ quartersPlayed <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",~
## $ rebounds <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ squadId <chr> "806", "806", "806", "806", "806", "806", "806", ~
## $ startingPositionCode <chr> "C", "C", "C", "C", "WA", "WA", "WA", "WA", "WD",~
## $ tossUpWin <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",~
## $ turnovers <chr> "0", "0", "0", "2", "0", "0", "1", "1", "0", "0",~
From the results of the glimpse() function we can see that multiple variables that hold numeric values have been classified as character variables. Therefore, these variables need to be converted to numeric using the following code. We will also make sure that the squadId and round variables are stored as character variables.
# Every stat column is stored as text because the long-format `value` column
# is character. Convert them to numeric by name rather than by position, so
# the step keeps working if the column order ever changes.
# Excluded from the conversion: the identifier columns, the two position-code
# columns (genuinely text) and squadId (an identifier kept as character).
# `round` is turned into character so it is treated as a category.
player17stats <- player17stats %>%
  mutate(
    across(
      -c(
        playerId, period, shortDisplayName, firstname, surname, round, game,
        currentPositionCode, startingPositionCode, squadId
      ),
      as.numeric
    ),
    squadId = as.character(squadId),
    round = as.character(round)
  )
The last step to get the data prepared for visualisations is to summarise each stat for each player in each round of the regular season. This can be completed using the code below.
# Total every stat for each player in each round (the source rows are one per
# player per quarter).
# across() sums all numeric columns except `period` and `game`, which are
# numeric identifiers rather than stats; the grouping columns are skipped
# automatically.
# .groups = "drop_last" states the default explicitly: the result stays
# grouped by shortDisplayName, playerId and squadId, and the summarise()
# grouping message is not printed.
total_player_stats_per_round <- player17stats %>%
  group_by(shortDisplayName, playerId, squadId, round) %>%
  summarise(
    across(where(is.numeric) & !c(period, game), sum),
    .groups = "drop_last"
  )
head(total_player_stats_per_round)
## # A tibble: 6 x 34
## # Groups: shortDisplayName, playerId, squadId [1]
## shortDisplayName playerId squadId round badHands badPasses blocked blocks
## <chr> <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Abernethy 995700 801 1 0 0 0 0
## 2 Abernethy 995700 801 10 0 0 0 0
## 3 Abernethy 995700 801 11 0 0 0 0
## 4 Abernethy 995700 801 12 0 0 0 0
## 5 Abernethy 995700 801 13 1 0 0 0
## 6 Abernethy 995700 801 14 0 0 0 0
## # ... with 26 more variables: breaks <dbl>, centrePassReceives <dbl>,
## # contactPenalties <dbl>, defensiveRebounds <dbl>, deflections <dbl>,
## # disposals <dbl>, feeds <dbl>, gain <dbl>, goalAssists <dbl>,
## # goalAttempts <dbl>, goalMisses <dbl>, goals <dbl>, intercepts <dbl>,
## # minutesPlayed <dbl>, missedGoalTurnover <dbl>, obstructionPenalties <dbl>,
## # offensiveRebounds <dbl>, offsides <dbl>, passes <dbl>, penalties <dbl>,
## # pickups <dbl>, possessions <dbl>, quartersPlayed <dbl>, rebounds <dbl>, ...
Firstly, we will start by plotting two variables that we might expect to be associated with each other. In netball this could be feeds and goal assists.
# Scatterplot of goal assists against feeds; each point is one player's
# total for one round.
total_player_stats_per_round %>%
  ggplot(mapping = aes(x = feeds, y = goalAssists)) +
  geom_point()
We can add an extra variable by including the round of the season.
# Same scatterplot with colour mapped to the round of the season.
# `round` is stored as text, so by default the legend would be ordered
# alphabetically ("1", "10", "11", ..., "2"). Passing the rounds in numeric
# order as the scale limits puts the legend and colours in season order.
ggplot(data = total_player_stats_per_round, aes(x = feeds, y = goalAssists, color = round)) +
  geom_point() +
  scale_color_discrete(
    limits = as.character(sort(unique(as.integer(total_player_stats_per_round$round))))
  )
This section will be about introducing colour to the plots. We will start by breaking down each part of the code.
# Step 1: supplying only the data produces an empty plot.
ggplot(data = total_player_stats_per_round)
# Step 2: aes() maps minutesPlayed to the x axis and disposals to the y axis.
# This is the same mapping used by the finished scatterplot, so the axes
# appear, but no points are drawn until a geom is added.
ggplot(data = total_player_stats_per_round, aes(x = minutesPlayed, y = disposals))
The aes() function gathers together layer mapping properties and passes them to the ggplot chart. Any aesthetics you wish to use from the dataset need to be included within aes().
The code above is followed by adding a component, using a +, which indicates the type of plot you wish to create (e.g. scatterplot, histogram, bar graph, etc.). Here we will use geom_point() to make it a scatterplot.
# Plain scatterplot of disposals against minutes played.
total_player_stats_per_round %>%
  ggplot(aes(x = minutesPlayed, y = disposals)) +
  geom_point()

# Colour mapped to squadId inside aes(): the colour varies with the data.
total_player_stats_per_round %>%
  ggplot(aes(x = minutesPlayed, y = disposals, colour = squadId)) +
  geom_point()

# Colour set outside aes(): a fixed value, so every point is drawn in red.
total_player_stats_per_round %>%
  ggplot(aes(x = minutesPlayed, y = disposals)) +
  geom_point(colour = "red")
We will now limit the data set to only include the players in squadId = 801. We will look at the two types of penalties: obstruction and contact. The plot will also add a size aesthetic to the visualisation.
# Keep only the per-round rows that belong to squad 801.
squad801 <- filter(total_player_stats_per_round, squadId == "801")

# Contact penalties against obstruction penalties: colour identifies the
# player and point size reflects minutes played.
squad801 %>%
  ggplot(
    aes(
      x = obstructionPenalties,
      y = contactPenalties,
      colour = shortDisplayName,
      size = minutesPlayed
    )
  ) +
  geom_point()
Despite this plot displaying lots of information it is not effective as the circles are overlapping each other.
To better differentiate each player on the team, we can include a shape aesthetic that represents each athlete. A filter will also be applied to include only five of the players in the squad.
# Narrow squad 801 down to five named players. %in% tests membership in the
# set of names in one expression instead of a chain of == joined by |.
squad801 <- squad801 %>%
  filter(
    shortDisplayName %in%
      c("Agbeze, A", "Bell, E", "Clarke, J", "Ingles, R", "Pitman, C")
  )
# Same penalties plot as before, with point shape also mapped to the player
# so that overlapping points can be told apart.
ggplot(
  squad801,
  aes(
    x = obstructionPenalties,
    y = contactPenalties,
    color = shortDisplayName,
    size = minutesPlayed,
    shape = shortDisplayName
  )
) +
  geom_point()
An additional way to include more information in a plot is by using facets, which are especially useful for categorical data where we might want to add a small plot for each group. This is a great way to introduce an extra dimension of data without overcrowding one plot. The following plot will look at deflections and gains.
# Gains against deflections for the selected players, drawn as one small
# panel per player (two rows of panels) with the black-and-white theme.
squad801 %>%
  ggplot(
    aes(
      x = deflections,
      y = gain,
      colour = shortDisplayName,
      size = minutesPlayed
    )
  ) +
  geom_point() +
  facet_wrap(facets = ~shortDisplayName, nrow = 2) +
  theme_bw()
Highlighting particular subsets of data within a visualisation is a good way of conveying key information and directing the attention of viewers to where you want.
Let's say we want to plot two variables, such as centrePassReceives and goalAssists, and highlight three specific players (Stephanie Wood, Laura Langman and Chelsea Pitman).
To be able to distinguish which colour represents which player, a legend can be easily created in ggplot.
# Background layer: every player-round as a small, faint point.
# Highlight layers: one per player, each drawn from that player's rows only.
# Mapping colour to a constant label inside aes() is what creates the legend
# entry for that player.
total_player_stats_per_round %>%
  ggplot(aes(x = centrePassReceives, y = goalAssists)) +
  geom_point(alpha = 0.2, size = 0.1) +
  geom_point(
    data = filter(total_player_stats_per_round, shortDisplayName == "Wood, S"),
    mapping = aes(colour = "Steph Wood")
  ) +
  geom_point(
    data = filter(total_player_stats_per_round, shortDisplayName == "Langman, L"),
    mapping = aes(colour = "Laura Langman")
  ) +
  geom_point(
    data = filter(total_player_stats_per_round, shortDisplayName == "Pitman, C"),
    mapping = aes(colour = "Chelsea Pitman")
  )
Additional information can also be added to the above plot by differentiating between each round of the season while still highlighting each of the three players.
# Colour now encodes the round for every layer, and the three highlighted
# players are told apart by point shape (a constant label inside aes()
# creates the shape legend entry for each player).
# `round` is stored as text, so by default the colour legend would be ordered
# alphabetically ("1", "10", "11", ..., "2"). Passing the rounds in numeric
# order as the scale limits puts the legend and colours in season order.
ggplot(data = total_player_stats_per_round, aes(x = centrePassReceives, y = goalAssists, color = round)) +
  geom_point(alpha = 0.2, size = 0.1) +
  geom_point(data = subset(total_player_stats_per_round, shortDisplayName == "Wood, S"), aes(shape = 'Steph Wood')) +
  geom_point(data = subset(total_player_stats_per_round, shortDisplayName == "Langman, L"), aes(shape = 'Laura Langman')) +
  geom_point(data = subset(total_player_stats_per_round, shortDisplayName == "Pitman, C"), aes(shape = 'Chelsea Pitman')) +
  scale_color_discrete(
    limits = as.character(sort(unique(as.integer(total_player_stats_per_round$round))))
  )
Summary
The plots shown in this document are only a few of the different types of data visualisation that can be produced in R using the ggplot2 package.
These plots can be applied to many different sports, not just netball.