# This is the R chunk for the required packages
knitr::opts_knit$set(root.dir = "/Users/scott/OneDrive/RMIT/2020s2/MATH2349 Data Wrangling/Assignment 2")
library(readr)
library(dplyr)
library(readxl)
library(gdata)
library(rvest)
library(tidyr)
library(knitr)
library(deductive)
library(validate)
library(Hmisc)
library(stringr)
library(lubridate)
library(outliers)
Data was gathered from Parkrun Australia on the results of all events at the Parkville course. This data was combined with three datasets from the Australian Bureau of Meteorologyto provide the Rainfall and Minimum and Maximum Temperatures.
The data as read from the Parkrun site was messy, and required several conversions from text to numbers, excess text removed, duplicate columns removed, and conversation to time. Bands were also added so that temperatures could be grouped into cold, cool, mild, warm, hot, etc.
The results contained race results for male and female competitors in columns. The function pivot_longer was applied to create a gender column into which the values Male and Female were placed.
Missing values were removed from the weather tables prior to joining on the events data. It was also discovered that three duplicate columns contained no data, and there columns were removed.
Numeric data were scanned for outliers using boxplots, histograms and analysing Z-Scores. These outliers were removed from the data. Admittedly this was not the best way to deal with the outliers, but there was no time for further action.
Finally, the Num_Finishers column was transformed from a right-skewed distribution into a normal distribution using the log function.
We are loading and combining three datasets in order to analyse the impact of weather on turnout and performance at running events.
The first data set is taken from the Parkrun Australia website, listing the 200 events to date at Parkville, showing the winning male and female athletes and their times.
The next three data sets are from the Bureau of Meteorology (BOM), and list the daily rainfall, and minimum and maximum temperatures for Melbourne since June 2013.
The Parkrun Dataset did not load cleanly from HTML, and required considerable processing. Headers were in the wrong position, duplicated, and multiple data items were concatenated in single columns and duplicated across columns.
As all three BOM datasets have very similar structure, the function read_BOM_Daily_Obs was written to avoid code duplication.
The function takes two parameters filename and observation_title.
Function: read_BOM_Daily_Obs(filename, observation_title)
filename (chr): The filename to read
observation_title (chr): The name of the observation variable in the file.
The weather observation in column 6 will be renamed to this name.
The function will:
The BOM datasets were joined with the Parkrun dataset on the Date column.
Normally I would have tided the events dataset into a longer form in this step, creating a Gender column, and reducing the male and female results to only Winner and Time. For the purposes of the structure of this assignement template, this will be done in Tidy & Manipulate Data I, as well as cleaning up the Num_Finishers and Num_Volunteers variables.
# This is the R chunk for the Data Section
##################
# EVENT DATA
##################
# Taken from https://www.parkrun.com.au/parkville/results/eventhistory/
# Note: scraping of the website is forbidden, so the html was saved down manually.
# Read the results page HTML
events_html <- read_html("data/results _ Parkville parkrun.html")
# Obtain the list of tables from the page
all_tables <- html_nodes(events_html, "table")
# Extract the results from the first table.
# Use fill = TRUE because some rows are missing columns which we will fill with NA
events_raw <- html_table(all_tables[[1]], fill = TRUE) #Error: Table has inconsistent number of columns. Do you want fill = TRUE?
# Display the top values
head(events_raw)
##################
# WEATHER DATA
##################
# Read Historical Temperatures and Rainfall from BOM for Melbourne.
# Rainfall: IDCJAC0009_086338_1800_Data.csv
# Maximum Temperatures: IDCJAC0010_086338_1800_Data.csv
# Minimum Temperatures: IDCJAC0011_086338_1800_Data.csv
# Data sourced from
# rain: http://www.bom.gov.au/jsp/ncc/cdio/wData/wdata?p_nccObsCode=136&p_display_type=dailyDataFile&p_stn_num=086338&p_startYear=
# max: http://www.bom.gov.au/jsp/ncc/cdio/wData/wdata?p_nccObsCode=122&p_display_type=dailyDataFile&p_stn_num=086338&p_startYear=
# min: http://www.bom.gov.au/jsp/ncc/cdio/wData/wdata?p_nccObsCode=123&p_display_type=dailyDataFile&p_stn_num=086338&p_startYear=
################
# Function for reading BOM Daily observation files (temperature, rainfall)
# Read CSV file in filename, remove missing values, and rename the observation variable to observation_title
#
# Function: read_BOM_Daily_Obs
# filename (chr): name of csv file to read
# observation_title (chr): name of the observation variable in the file
###############
read_BOM_Daily_Obs <- function(filename, observation_title) {
# Read the weather file
weather <- read_csv(filename)
# Rename Columns
names(weather) <- c("Product",
"Station",
"Year",
"Month",
"Day",
"this_obs",
"Days_Accum",
"Quality")
# Remove missing observations
weather <- weather[!is.na(weather$this_obs),]
# Extract Date from Year, Month, Day, and rename our observation
weather %>%
mutate(Date = make_date(Year, Month, Day)) %>%
select(Date, this_obs) %>%
rename(!!observation_title := this_obs)
}
rainfall <- read_BOM_Daily_Obs("data/IDCJAC0009_086338_1800_Data.csv", "Rainfall_mm")
## Parsed with column specification:
## cols(
## `Product code` = col_character(),
## `Bureau of Meteorology station number` = col_character(),
## Year = col_double(),
## Month = col_character(),
## Day = col_character(),
## `Rainfall amount (millimetres)` = col_double(),
## `Period over which rainfall was measured (days)` = col_double(),
## Quality = col_character()
## )
head(rainfall)
max_temps <- read_BOM_Daily_Obs("data/IDCJAC0010_086338_1800_Data.csv", "Max_Temp")
## Parsed with column specification:
## cols(
## `Product code` = col_character(),
## `Bureau of Meteorology station number` = col_character(),
## Year = col_double(),
## Month = col_character(),
## Day = col_character(),
## `Maximum temperature (Degree C)` = col_double(),
## `Days of accumulation of maximum temperature` = col_double(),
## Quality = col_character()
## )
head(max_temps)
min_temps <- read_BOM_Daily_Obs("data/IDCJAC0011_086338_1800_Data.csv", "Min_Temp")
## Parsed with column specification:
## cols(
## `Product code` = col_character(),
## `Bureau of Meteorology station number` = col_character(),
## Year = col_double(),
## Month = col_character(),
## Day = col_character(),
## `Minimum temperature (Degree C)` = col_double(),
## `Days of accumulation of minimum temperature` = col_double(),
## Quality = col_character()
## )
head(min_temps)
The data loaded from the HTML is corrupted with incorrect column names and duplicate data. This is cleaned.
Before joining the data, the key Date field needs to be created in the events table.
The Data violates the following Tidy Data Principles:
Column Headers contain Values. The variable Gender is contained in the headers for: Winner_Male Time_Male Winner_Female Time_Female Gender should be a Variable for each observation.
Multiple Variables are stored in one Column. More an issue with the HTML extraction process for this particular data, we found that multiple variables had been mixed into the columns.
For example, the “Date Column” contained the entire observation: 07/03/2020Benjamin KRONENBERG(M) 00:17:22Lauren COCKERELL(F) 00:19:32
and the Male_Winner and Female_Winner columns had the Winning Time appended. Sophie RYAN00:18:59
It turned out that this data was duplicated, so it was simply removed rather than split into columns. This was done in the Data and Understand sections.
# This is the R chunk for the Tidy & Manipulate Data I
#
# TIDY EVENT DATA
#
# Rename columns
events <- events_raw
names(events) <- c("Event_Number",
"Date",
"Num_Finishers",
"Num_Volunteers",
"Winner_Male",
"Time_Male",
"Winner_Female",
"Time_Female",
"Female_Duplicate_Column_1",
"Male_Duplicate_Column_1",
"Female_Duplicate_Column_2")
# To join with Weather data on date, we must extract the date from the text
# Extract the leftmost 10 characters and convert to Date
events$Date <- events$Date %>%
substr(1, 10) %>%
dmy()
# Extract Winners' names by stripping off the time at the end of the string
events$Winner_Male <- str_replace_all(events$Winner_Male, "[0-9:]", "")
events$Winner_Female <- str_replace_all(events$Winner_Female, "[0-9:]", "")
# Tidy Male and Female Finishers to long format, creating the Gender column
events <- events %>%
pivot_longer(
cols = Winner_Male:Time_Female,
names_to = c(".value", "Gender"),
names_sep = "_"
)
##################
# COMBINED DATA
##################
# Join the weather datasets with the events to get a picture of how weather patterns affect attendance and performance
events <- events %>%
left_join(min_temps, by = "Date") %>%
left_join(max_temps, by = "Date") %>%
left_join(rainfall, by = "Date")
head(events)
For later analysis, out of scope in this assignment, I want to apply names to the temperature bands.
| band_floor | band_ceiling | temperature_band |
|---|---|---|
| -5 | 0 | Freezing |
| 0 | 5 | Cold |
| 5 | 10 | Cool |
| 10 | 15 | Mild |
| 15 | 20 | Warm |
| 20 | 25 | Very Warm |
| 25 | 30 | Hot |
| 30 | Inf | Very Hot |
# This is the R chunk for the Tidy & Manipulate Data II
# Temerature Bands
#
# As the events are held at 8am, the daily Minimum temperature is more representative of the temperate at the time of the event.
# Classify the minimum temperatures
temperature_bands <- data.frame(band_floor = c(-5, 0, 5, 10, 15, 20, 25, 30),
band_ceiling = c(0, 5, 10, 15, 20, 25, 30, Inf),
temperature_band = c("Freezing", "Cold", "Cool", "Mild", "Warm", "Very Warm", "Hot", "Very Hot"))
temperature_bands
# Calculate the corresponding band_floor for each minimum temperature to join on temperature bands
events <- events %>% mutate(band_floor = floor(Min_Temp / 5) * 5) %>%
left_join(temperature_bands, by = "band_floor") %>%
select(-c(band_floor, band_ceiling))
# Create the temperature range column to show
events <- events %>% mutate(temperature_range = Max_Temp - Min_Temp)
# Display the data with the new columns
head(events)
The following variables are present in the dataset so far.
| Variable | From Dataset | Description | Datatype Loaded | Expected Datatype |
|---|---|---|---|---|
| Event_Number | Parkrun | Event Number | Integer | Integer |
| Date | Parkrun | Event Date | Date1 | Date |
| Num_Finishers | Parkrun | Number of Competitors | Char | Integer |
| Num_Volunteers | Parkrun | Number of Volunteers | Char | Integer |
| Winner_Male | Parkrun | Name of winner (male athlete) | Char | Char |
| Time_Male | Parkrun | Winning time (male athlete) | Char | POSIXct |
| Winner_Female | Parkrun | Name of winner (female athlete) | Char | Char |
| Time_Female | Parkrun | Winning time (female athlete) | Char | POSIXct |
| Female_Duplicate_Column_1 | Parkrun | Duplicate titled “Female First Finisher” | logical | tbc2 |
| Male_Duplicate_Column_1 | Parkrun | Duplicate titled “Male First Finisher” | logical | tbc2 |
| Female_Duplicate_Column_2 | Parkrun | Duplicate titled “Female First Finisher” | logical | tbc2 |
| Min_Temp | BOM Minimum Temperatures Melbourne | Minimum Temperature on the day (BOM) | Numeric | Numeric |
| Max_Temp | BOM Maxiimum Temperatures Melbourne | Maximum Temperature on the day (BOM) | Numeric | Numeric |
| Rainfall_mm | BOM Rainfall Melbourne | Rainfall (mm) on the day (BOM) | Numeric | Numeric |
1 This was already converted to a data in all four datasets as this was required for joining.
2 will be checked in the Scan I step when checking for missing values.
The following steps were applied to the data:
# This is the R chunk for the Understand Section
#Display the structure of the dataset
str(events)
## tibble [400 × 15] (S3: tbl_df/tbl/data.frame)
## $ Event_Number : int [1:400] 200 200 199 199 198 198 197 197 196 196 ...
## $ Date : Date[1:400], format: "2020-03-07" "2020-03-07" ...
## $ Num_Finishers : chr [1:400] "369finishers" "369finishers" "403finishers" "403finishers" ...
## $ Num_Volunteers : chr [1:400] "14volunteers" "14volunteers" "15volunteers" "15volunteers" ...
## $ Female_Duplicate_Column_1: logi [1:400] NA NA NA NA NA NA ...
## $ Male_Duplicate_Column_1 : logi [1:400] NA NA NA NA NA NA ...
## $ Female_Duplicate_Column_2: logi [1:400] NA NA NA NA NA NA ...
## $ Gender : chr [1:400] "Male" "Female" "Male" "Female" ...
## $ Winner : chr [1:400] "Benjamin KRONENBERG" "Lauren COCKERELL" "Frédéric TRANCHAND" "Sophie RYAN" ...
## $ Time : chr [1:400] "00:17:22" "00:19:32" "00:15:40" "00:18:59" ...
## $ Min_Temp : num [1:400] 15.4 15.4 14.7 14.7 11.4 11.4 18.9 18.9 17.4 17.4 ...
## $ Max_Temp : num [1:400] 20 20 22.4 22.4 21.1 21.1 19.3 19.3 29.4 29.4 ...
## $ Rainfall_mm : num [1:400] 0.2 0.2 0 0 0 0 5.4 5.4 0 0 ...
## $ temperature_band : chr [1:400] "Warm" "Warm" "Mild" "Mild" ...
## $ temperature_range : num [1:400] 4.6 4.6 7.7 7.7 9.7 ...
# Convert the race time text to POSIXct
events$Time <- parse_date_time(events$Time, "%H:%M:%S")
# Create a function to strip the characters from the Num_Finishers and Num_Volunteers, convert their data types to integer and apply the function to conver
stripChars <- function(x) {
x %>%
str_replace_all("[^0-9]", "") %>%
as.integer()
}
# Apply the stripChars function
events[c("Num_Finishers", "Num_Volunteers")] <- lapply(events[c("Num_Finishers", "Num_Volunteers")], stripChars)
# Convert Gender to Factor
events$Gender <- factor(events$Gender,
levels = c("Female", "Male")
)
The colSums function was used to count missing values in each column.
Missing observations were already scanned for and removed in the earlier Data loading section. What remained were only duplicate columns Female_Duplicate_Column_1, Male_Duplicate_Column_1, Female_Duplicate_Column_2 that actually contain no data - each value in these columns is NA and they were removed.
** Note that missing observations were already removed earlier in the process from the weather data prior to joining. This was performed in the function read_BOM_Daily_Obs in the Data section above.
The colSums function was used to count NaN and infinite values in each numeric column.
Each column in the dateset was checked as per below.
Summary function used to summarize the column.brave Min Value: 1 Max Value: 200
We would expect that each number between 1 and 200 is in this column. Checked by comparing the set of all integers between the minimum and max with the Event_Number column
Check that each data only appears twice (once each for Male and Female)
# This is the R chunk for the Scan I
# Check for Missing Values
###########################
# Scan for NA.
if(sum(is.na(events))>0){
naCountALL <- colSums(is.na(events))
colsWithNA <- colnames(events)[naCountALL > 0]
naCountCols <- naCountALL[colsWithNA]
# Deal with columns that have NA
"There are columns with NA"
} else {
"No columns with NA found"
}
## [1] "There are columns with NA"
# Display columns containing NA
naCountCols
## Female_Duplicate_Column_1 Male_Duplicate_Column_1 Female_Duplicate_Column_2
## 400 400 400
# Identify columns in which ALL values are NA. I.e. they contain no data.
allNA <- naCountCols == nrow(events)
# Extra columns with no data are removed from the dataset.
if(sum(allNA) > 0) {
# Remove the columns which are all NA
events <- events %>%
select(-names(allNA))
"Columns with not data (all NA) have been removed"
} else {
"No columns found with ALL values missing"
}
## [1] "Columns with not data (all NA) have been removed"
# Check remaining columns for NA values
ifelse(sum(is.na(events))>0,
"There are still columns with NA",
"No remaining columns found with NA"
)
## [1] "No remaining columns found with NA"
# Check Numeric columns
# Scan for other types of special values in numeric columns
is.special <- function(x) {
if (is.numeric(x)) (is.infinite(x) | is.nan(x))
}
# Scan numeric columns
special <- sapply(select_if(events, is.numeric), is.special)
ifelse(sum(special) > 0,
"There are numeric columns with special values",
"No columns with special values found")
## [1] "No columns with special values found"
colSums(special)
## Event_Number Num_Finishers Num_Volunteers Min_Temp
## 0 0 0 0
## Max_Temp Rainfall_mm temperature_range
## 0 0 0
# Check Column Event_Number
###########################
# Summarise
summary(events$Event_Number)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 50.75 100.50 100.50 150.25 200.00
# Check all values are present between the minimum and maximum event numbers - i.e. all events are present in the data
all_events <- c(min(events$Event_Number):max(events$Event_Number))
missing_events <- setdiff(all_events, events$Event_Number)
ifelse(length(missing_events) == 0, "No missing Events", paste0("Missing event count:", length(missing_events)))
## [1] "No missing Events"
# Check that male and female results only appears once for each event
entries <- events %>% group_by(Event_Number, Gender) %>% summarise(num = n())
## `summarise()` regrouping output by 'Event_Number' (override with `.groups` argument)
ifelse(max(entries$num) == 1 & min(entries$num) == 1,
"Correct. One observations per event",
"Incorrect number of observations per event.")
## [1] "Correct. One observations per event"
# Check that each event has two entries (one for male, one for female)
entries <- events %>% group_by(Event_Number) %>% summarise(num = n())
## `summarise()` ungrouping output (override with `.groups` argument)
ifelse(max(entries$num) == 2 & min(entries$num) == 2,
"Correct. Two observations per event",
"Incorrect number of observations per event.")
## [1] "Correct. Two observations per event"
# Check Column Date
###########################
# Each date should only be present twice - one event per day, 1 result per gender.
entries <- events %>% group_by(Date) %>% summarise(num = n())
## `summarise()` ungrouping output (override with `.groups` argument)
ifelse(max(entries$num) == 2 & min(entries$num) == 2,
"Correct. Two observations per Date",
"Incorrect number of observations per Date")
## [1] "Correct. Two observations per Date"
# Check Gender Column
###########################
#Show earliest and latest Dates
levels(events$Gender)
## [1] "Female" "Male"
Scan the numeric data for outliers.
Boxplots, histograms, and Z-Scores were produced for each numeric variable.
| Column | Outliers zScore > 3 | Skew |
|---|---|---|
| Event_Number | 0 | - |
| Num_Finishers | 4 | Right Skewed |
| Num_Volunteers | 2 | Right Skewed |
| Min_Temp | 0 | Normal |
| Max_Temp | 2 | Right Skewed |
| Rainfall_mm | 8 | Right Skewed |
| temperature_range | 2 | Right Skewed |
| Time | 0 | |
| FinishersPerVolunteer | 0 | Normal |
# This is the R chunk for the Scan II
# Select only numeric colums
events_numeric <- events %>% select_if(is.numeric)
# Calculate Z scores for each variable
zScores <- events_numeric %>% scores(type = "z")
# Function to display a boxplot, histogram and Z Score summary for a named column
# Analyse Event Number
col.name <- "Event_Number"
# Failed to get this to work as a function. Different results were produced.
# hist.bp <- function(col.name) {
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
hist(my.col,
main = col.name )
summary(zScores[,col.name])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.7213 -0.8606 0.0000 0.0000 0.8606 1.7213
length(which( abs(zScores[,col.name]) > 3))
## [1] 0
# Append to all Outliers
all_outliers <- which( abs(zScores[,col.name]) > 3)
# }
# Scan the Event Number
# hist.bp("Event_Number")
# Analyse Num_Finishers
col.name <- "Num_Finishers"
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
hist(my.col,
main = col.name )
summary(zScores[,col.name])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4924 -0.7492 -0.1989 0.0000 0.5940 3.4951
length(which( abs(zScores[,col.name]) > 3))
## [1] 4
# Append to all Outliers
all_outliers <- c(all_outliers, which( abs(zScores[,col.name]) > 3))
# Analyse Num_Volunteers
col.name <- "Num_Volunteers"
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
hist(my.col,
main = col.name )
summary(zScores[,col.name])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.3520 -0.9214 -0.2756 0.0000 0.8009 3.3843
length(which( abs(zScores[,col.name]) > 3))
## [1] 2
# Append to all Outliers
all_outliers <- c(all_outliers, which( abs(zScores[,col.name]) > 3))
# Analyse Min_Temp
col.name <- "Min_Temp"
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
hist(my.col,
main = col.name )
summary(zScores[,col.name])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.3816 -0.7458 -0.1778 0.0000 0.7140 2.7076
length(which( abs(zScores[,col.name]) > 3))
## [1] 0
# Append to all Outliers
all_outliers <- c(all_outliers, which( abs(zScores[,col.name]) > 3))
# Analyse Max_Temp
col.name <- "Max_Temp"
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
hist(my.col,
main = col.name )
summary(zScores[,col.name])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4143 -0.7860 -0.1821 0.0000 0.6163 3.4656
length(which( abs(zScores[,col.name]) > 3))
## [1] 2
# Append to all Outliers
all_outliers <- c(all_outliers, which( abs(zScores[,col.name]) > 3))
# Analyse Rainfall_mm
col.name <- "Rainfall_mm"
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
hist(my.col,
main = col.name )
summary(zScores[,col.name])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.39738 -0.39738 -0.39738 0.00000 -0.01181 9.53115
length(which( abs(zScores[,col.name]) > 3))
## [1] 8
# Append to all Outliers
all_outliers <- c(all_outliers, which( abs(zScores[,col.name]) > 3))
# Analyse temperature_range
col.name <- "temperature_range"
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
hist(my.col,
main = col.name )
summary(zScores[,col.name])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.9311 -0.7040 -0.2642 0.0000 0.5924 3.2780
length(which( abs(zScores[,col.name]) > 3))
## [1] 2
# Append to all Outliers
all_outliers <- c(all_outliers, which( abs(zScores[,col.name]) > 3))
# Analyse Time
col.name <- "Time"
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
# hist(my.col,
# # "minutes",
# format = "%k",
# main = col.name
# )
zScores$Time <- scores(events$Time, type = "z")
summary(zScores[,col.name])
## Length Class Mode
## 400 difftime numeric
length(which( abs(zScores[,col.name]) > 3))
## [1] 0
# Append to all Outliers
all_outliers <- c(all_outliers, which( abs(zScores[,col.name]) > 3))
# Analyse the relationship between the number of finishers and the number of volunteers
# Produce a scatter plot
plot(events$Num_Finishers, events$Num_Volunteers)
# Create a variable to see number of finishers per volunteer
events$FinishersPerVolunteer <- events$Num_Finishers / events$Num_Volunteers
# Make a boxplot and histogram of the new variable
col.name <- "FinishersPerVolunteer"
my.col <- events[,col.name]
boxplot(my.col,
main = col.name)
hist(my.col,
main = col.name
)
zScores$FinishersPerVolunteer <- scores(events$FinishersPerVolunteer, type = "z")
summary(zScores[,col.name])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.11057 -0.70184 -0.05025 0.00000 0.66638 2.96286
length(which( abs(zScores[,col.name]) > 3))
## [1] 0
# Remove all rows with outliers
events <- events[-all_outliers,]
We apply the log function to the Number of Finishers variable, and successfully convert the Right-Skewed data to a normal distribusion.
# This is the R chunk for the Transform Section
ln_Num_Finishers <- log(events$Num_Finishers)
hist(ln_Num_Finishers,
main = "Log of Num_Finishers")