library(readr)
library(tidyr)
library(dplyr)
library(Hmisc)
library(outliers)
library(lubridate)
library(stringr)
The purpose of this work is to preprocess a dataset on the matches played in the Women’s Tennis Association (WTA) for further exploration or analysis. The steps I have taken are as follows: * Join two raw tables together based on relational keys
Assess data structures and make any appropriate conversions
Reshape the data into tidy format
Scan the data for missing data and outliers
Transform a variable, converting to a normal distribution
I have found this data to be weighted more towards qualitative data than quantitative.
Now that the dataset is preprocessed, the researcher/analyst may want to explore it using descriptive statistics and build visualisations, or build analytic models and look at predictive factors.
The data I have selected are 2 relational datasets which contain data from the WTA on tennis matches and players.
The original source of the data was the public WTA website, which has been scraped and made available on kaggle https://www.kaggle.com/joaoevangelista/wta-matches-and-rankings/home
There is no metadata available for this data however we find most of the variable headings & observations self explanatory.
#Save csv files to working directory and set working directory
setwd("D:/Google Drive/RMIT/MATH2349 Data Preprocessing/Assignment 3")
#Read in files
players_raw <- read.csv("players.csv", stringsAsFactors = FALSE)
matches_raw <- read.csv("matches.csv", stringsAsFactors = FALSE)
Note the matches data has the player ID for both the winner and loser. We need to merge the player variables to the matches table. Because there is an ID for each player (winner & loser), we need to define and rename the variables accordingly.
#Join loser player names
WTA_v1 <- matches_raw %>% left_join((players_raw %>% select (player_id, loser_first_name=first_name, loser_last_name=last_name, loser_hand=hand, loser_dob=birth_date, loser_country=country_code)), by = c("loser_id" = "player_id"))
#Join winner player names
WTA_v2 <- WTA_v1 %>% left_join((players_raw %>% select (player_id, winner_first_name=first_name, winner_last_name=last_name, winner_hand=hand, winner_dob=birth_date, winner_country=country_code)), by = c("winner_id" = "player_id"))
For this assignment, I will also subset the data to look at only those matches which were part of a 3-set elimination tournament. Given all matches will be best of 3, we can also remove the best_of column.
#Create subset
WTA_v2.1 <- WTA_v2 %>% filter(round == "R128" | round == "R64" | round == "R32" | round == "R16" | round == "QF" | round == "SF" | round =="F") %>% filter(best_of == 3) %>% select(-best_of)
#View merged subset
head(WTA_v2.1)
## draw_size loser_id loser_rank loser_rank_points loser_seed match_num
## 1 128 200002 49 640 1
## 2 128 200004 133 199 2
## 3 128 200005 118 243 3
## 4 128 200007 23 1112 4
## 5 128 200010 60 516 5
## 6 128 200011 66 484 6
## minutes round score surface tourney_date tourney_id
## 1 NA R128 6-1 6-2 Hard 20000117 2000-W-SL-AUS-01A-2000
## 2 NA R128 6-4 6-2 Hard 20000117 2000-W-SL-AUS-01A-2000
## 3 NA R128 6-0 6-1 Hard 20000117 2000-W-SL-AUS-01A-2000
## 4 NA R128 6-1 5-7 7-5 Hard 20000117 2000-W-SL-AUS-01A-2000
## 5 NA R128 6-3 6-4 Hard 20000117 2000-W-SL-AUS-01A-2000
## 6 NA R128 6-4 1-6 6-4 Hard 20000117 2000-W-SL-AUS-01A-2000
## tourney_level tourney_name winner_id winner_rank winner_rank_points
## 1 G Australian Open 200001 1 6003
## 2 G Australian Open 200003 63 510
## 3 G Australian Open 200006 53 574
## 4 G Australian Open 200008 116 245
## 5 G Australian Open 200009 72 439
## 6 G Australian Open 200012 84 347
## winner_seed year loser_first_name loser_last_name loser_hand loser_dob
## 1 1 2000 Mirjana Lucic R 19820309
## 2 2000 Kerry Anne Guse R 19721204
## 3 2000 Jolene Watanabe Giltz R 19680831
## 4 2000 Silvija Talaja R 19780114
## 5 2000 Rita Grande R 19750323
## 6 2000 Katarina Srebotnik R 19810312
## loser_country winner_first_name winner_last_name winner_hand winner_dob
## 1 CRO Martina Hingis R 19800930
## 2 AUS Justine Henin R 19820601
## 3 USA Karina Habsudova R 19730802
## 4 CRO Alicia Molik R 19810127
## 5 ITA Tamarine Tanasugarn R 19770524
## 6 SLO Jana Nejedly R 19740609
## winner_country
## 1 SUI
## 2 BEL
## 3 SVK
## 4 AUS
## 5 THA
## 6 CAN
#Check data structure & attributes
str(WTA_v2.1)
## 'data.frame': 45045 obs. of 29 variables:
## $ draw_size : int 128 128 128 128 128 128 128 128 128 128 ...
## $ loser_id : int 200002 200004 200005 200007 200010 200011 200014 200015 200018 200019 ...
## $ loser_rank : chr "49" "133" "118" "23" ...
## $ loser_rank_points : int 640 199 243 1112 516 484 97 271 255 132 ...
## $ loser_seed : chr "" "" "" "" ...
## $ match_num : int 1 2 3 4 5 6 7 8 9 10 ...
## $ minutes : int NA NA NA NA NA NA NA NA NA NA ...
## $ round : chr "R128" "R128" "R128" "R128" ...
## $ score : chr "6-1 6-2" "6-4 6-2" "6-0 6-1" "6-1 5-7 7-5" ...
## $ surface : chr "Hard" "Hard" "Hard" "Hard" ...
## $ tourney_date : chr "20000117" "20000117" "20000117" "20000117" ...
## $ tourney_id : chr "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" ...
## $ tourney_level : chr "G" "G" "G" "G" ...
## $ tourney_name : chr "Australian Open" "Australian Open" "Australian Open" "Australian Open" ...
## $ winner_id : int 200001 200003 200006 200008 200009 200012 200013 200016 200017 200020 ...
## $ winner_rank : chr "1" "63" "53" "116" ...
## $ winner_rank_points: int 6003 510 574 245 439 347 52 1635 1558 926 ...
## $ winner_seed : chr "1" "" "" "" ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ loser_first_name : chr "Mirjana" "Kerry Anne" "Jolene" "Silvija" ...
## $ loser_last_name : chr "Lucic" "Guse" "Watanabe Giltz" "Talaja" ...
## $ loser_hand : chr "R" "R" "R" "R" ...
## $ loser_dob : int 19820309 19721204 19680831 19780114 19750323 19810312 19730419 19780117 19760521 19731122 ...
## $ loser_country : chr "CRO" "AUS" "USA" "CRO" ...
## $ winner_first_name : chr "Martina" "Justine" "Karina" "Alicia" ...
## $ winner_last_name : chr "Hingis" "Henin" "Habsudova" "Molik" ...
## $ winner_hand : chr "R" "R" "R" "R" ...
## $ winner_dob : int 19800930 19820601 19730802 19810127 19770524 19740609 19731003 19720403 19711218 19730810 ...
## $ winner_country : chr "SUI" "BEL" "SVK" "AUS" ...
As we can see from the above step we have a data frame with two different variable types; integer and character. While these data types are suitable for most variables, some of them require conversion to factor, date, or integer.
WTA_v3 <- WTA_v2.1
#Convert dates
WTA_v3$winner_dob <- ymd(WTA_v3$winner_dob)
WTA_v3$loser_dob <- ymd(WTA_v3$loser_dob)
WTA_v3$tourney_date <- ymd(WTA_v3$tourney_date)
#Convert integers
WTA_v3$loser_rank <- as.integer(WTA_v3$loser_rank)
WTA_v3$loser_seed <- as.integer(WTA_v3$loser_seed)
WTA_v3$winner_rank <- as.integer(WTA_v3$winner_rank)
WTA_v3$winner_seed <- as.integer(WTA_v3$winner_seed)
#Convert factors
WTA_v3$tourney_level <- as.factor(WTA_v3$tourney_level)
WTA_v3$winner_hand <- as.factor(WTA_v3$winner_hand)
WTA_v3$loser_hand <- as.factor(WTA_v3$loser_hand)
WTA_v3$surface <- as.factor(WTA_v3$surface)
Lastly one of our variables ‘round’ should be converted to a factor. It represents the matches progression level within the tournament and also needs to be ordered.
#Convert & order factor
WTA_v3 <- WTA_v3 %>% mutate(round = as.ordered(factor(round, levels = c("R128", "R64", "R32", "R16", "QF", "SF", "F"),
labels = c("R128", "R64", "R32", "R16", "Quarter Final", "Semi Final", "Final"))))
#Check levels
levels(WTA_v3$round)
## [1] "R128" "R64" "R32" "R16"
## [5] "Quarter Final" "Semi Final" "Final"
Now we view the structure a second time after the conversions and see that all our data types are appropriate.
#Check data structure & attributes
str(WTA_v3)
## 'data.frame': 45045 obs. of 29 variables:
## $ draw_size : int 128 128 128 128 128 128 128 128 128 128 ...
## $ loser_id : int 200002 200004 200005 200007 200010 200011 200014 200015 200018 200019 ...
## $ loser_rank : int 49 133 118 23 60 66 214 109 113 177 ...
## $ loser_rank_points : int 640 199 243 1112 516 484 97 271 255 132 ...
## $ loser_seed : int NA NA NA NA NA NA NA NA NA NA ...
## $ match_num : int 1 2 3 4 5 6 7 8 9 10 ...
## $ minutes : int NA NA NA NA NA NA NA NA NA NA ...
## $ round : Ord.factor w/ 7 levels "R128"<"R64"<"R32"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ score : chr "6-1 6-2" "6-4 6-2" "6-0 6-1" "6-1 5-7 7-5" ...
## $ surface : Factor w/ 4 levels "Carpet","Clay",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ tourney_date : Date, format: "2000-01-17" "2000-01-17" ...
## $ tourney_id : chr "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" ...
## $ tourney_level : Factor w/ 13 levels "CC","CH","G",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ tourney_name : chr "Australian Open" "Australian Open" "Australian Open" "Australian Open" ...
## $ winner_id : int 200001 200003 200006 200008 200009 200012 200013 200016 200017 200020 ...
## $ winner_rank : int 1 63 53 116 72 84 320 14 15 28 ...
## $ winner_rank_points: int 6003 510 574 245 439 347 52 1635 1558 926 ...
## $ winner_seed : int 1 NA NA NA NA NA NA 12 13 NA ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ loser_first_name : chr "Mirjana" "Kerry Anne" "Jolene" "Silvija" ...
## $ loser_last_name : chr "Lucic" "Guse" "Watanabe Giltz" "Talaja" ...
## $ loser_hand : Factor w/ 3 levels "L","R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ loser_dob : Date, format: "1982-03-09" "1972-12-04" ...
## $ loser_country : chr "CRO" "AUS" "USA" "CRO" ...
## $ winner_first_name : chr "Martina" "Justine" "Karina" "Alicia" ...
## $ winner_last_name : chr "Hingis" "Henin" "Habsudova" "Molik" ...
## $ winner_hand : Factor w/ 3 levels "L","R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ winner_dob : Date, format: "1980-09-30" "1982-06-01" ...
## $ winner_country : chr "SUI" "BEL" "SVK" "AUS" ...
The data set is not completely tidy. We have variables forming columns and observations in rows however there is a duplicate variable where there is both tournament date (YYYY-MM-DD) and year (YYYY). To put the data in a tidy format we can remove the year variable.
#Deselect year
WTA_v4 <- WTA_v3 %>% select(-year)
There is one more reshaping task which could further tidy the dataset. The data in the ‘score’ column could be arguably be considered storing multiple variables in a single column, as there are ‘set’ values within the score which can be separated.
#Separate score into sets
WTA_v5 <- WTA_v4 %>% separate(score, into = c("Set_1", "Set_2", "Set_3"), sep = " ")
#Check format correct in new columns
head(WTA_v5 %>% select(starts_with('Set')))
## Set_1 Set_2 Set_3
## 1 6-1 6-2 <NA>
## 2 6-4 6-2 <NA>
## 3 6-0 6-1 <NA>
## 4 6-1 5-7 7-5
## 5 6-3 6-4 <NA>
## 6 6-4 1-6 6-4
And I am now confident that the dataset is in a tidy format.
There are some calculations which may be useful for exploration/analysis after preprocessing.
We may want to do some comparisons of player age, and we have the required variables to make this calculation. Given inability to create year outputs with difftime(), I have manually divided by average days in year and rounded to nearest year.
#Calculate age of players
WTA_v6 <- WTA_v5 %>% mutate(winner_age = as.numeric(round(difftime(tourney_date, winner_dob, units="days")/365.25,0))) %>% mutate(loser_age = as.numeric(round(difftime(tourney_date, loser_dob, units="days")/365.25,0)))
#Check age format
head(WTA_v6 %>% select(loser_age, winner_age))
## loser_age winner_age
## 1 18 19
## 2 27 18
## 3 31 26
## 4 22 19
## 5 25 23
## 6 19 26
Let’s firstly sum the number of observations with missing values.
#Sum the number of missing values by column
colSums(is.na(WTA_v6))
## draw_size loser_id loser_rank
## 0 0 549
## loser_rank_points loser_seed match_num
## 549 33746 0
## minutes round Set_1
## 40218 0 0
## Set_2 Set_3 surface
## 267 30118 0
## tourney_date tourney_id tourney_level
## 0 0 0
## tourney_name winner_id winner_rank
## 0 0 336
## winner_rank_points winner_seed loser_first_name
## 336 23651 0
## loser_last_name loser_hand loser_dob
## 0 0 0
## loser_country winner_first_name winner_last_name
## 0 0 0
## winner_hand winner_dob winner_country
## 0 0 0
## winner_age loser_age
## 0 0
We see that the variable ‘minutes’ has ~40k NAs, while the data set has ~45k observations. Given that close to 90% is missing, we can assume that data for this variable wasn’t consistently recorded across all matches, and exclude this column completely.
#Remove minutes column
WTA_v7 <- WTA_v6 %>% select(-minutes)
There are significant number of missing values in three other variables: Set 3, winner_seed, and loser_seed. This is however in line with logical expectations i.e. most games won’t reach a third set, and many players will be unseeded in a tournament.
During earlier steps of checking data types and attributes I noticed some obvious errors, which have been naturally filtered out by the particular subsetting of the dataset. Checking for unexpected data types we can identify an example error in the earlier version of the dataset (WTA_v2) against expected non-numeric elements (WTA_v7).
#Check for numeric elements within character vector
WTA_v2 %>% filter(str_detect(surface, "\\d")) %>% select(surface)
## surface
## 1 6-2 6-3
#Expected elements
levels(WTA_v7$surface)
## [1] "Carpet" "Clay" "Grass" "Hard"
It is clear that there was a data error and that the score had been input into the surface column.
There are a limited number of numeric variables in this dataset. Some, such as rank and seed, are ordinal and would be inappropriate to assess outliers in. The quantitative variables we can scan are age and rank points.
#Perform union on winner & loser age
WTA_winner_age <- WTA_v7 %>% select(age = winner_age)
WTA_loser_age <- WTA_v7 %>% select(age = loser_age)
WTA_age <- bind_rows(WTA_winner_age, WTA_loser_age)
#Create boxplot of player age
WTA_age %>% boxplot(main="Box Plot of Player Age", ylab="Age (yrs)", col = "grey")
We can see there are a number of outliers up to age 50, however in this case we do not take any action to exclude these observations as it is very reasonable that the data is real and relevant, knowing that professional tennis players have played at close to 50 years of age.
#Perform union on winner & loser rank points
WTA_winner_rank_points <- WTA_v7 %>% select(rank_points = winner_rank_points)
WTA_loser_rank_points <- WTA_v7 %>% select(rank_points = loser_rank_points)
WTA_rank_points <- bind_rows(WTA_winner_rank_points, WTA_loser_rank_points)
#Create boxplot of Ranking Points
WTA_rank_points %>% boxplot(main="Box Plot of Rank Points", ylab="Ranking Points", col = "grey")
From the above plot we detect a large number of outliers. However again we know that these values fall into a reasonable range for this variable and being an outlier in this case does indicate data inaccuracy and so the outliers are not dealt with.
If we create histogram of the age variable, we can see that it is distributed with a slight right skew.
#Histogram of age
hist(WTA_age)
For further statistical analysis we may require that this variable is distributed normally. Therefore I will apply a transformation in order to reduce its skewness towards a normal distribution using the appropriate log transformation.
#Perform log transformation and create histogram
WTA_log_age <- log10(WTA_age)
hist(WTA_log_age)
As we can observe above, the distribution of player age has reduced in right skewness and is normally distributed.
#Create log of age variable in the final dataset
WTA_final <- WTA_v7 %>% mutate(winner_age_log = log10(winner_age), loser_age_log = log10(loser_age))
#Overview final data set
head(WTA_final)
## draw_size loser_id loser_rank loser_rank_points loser_seed match_num
## 1 128 200002 49 640 NA 1
## 2 128 200004 133 199 NA 2
## 3 128 200005 118 243 NA 3
## 4 128 200007 23 1112 NA 4
## 5 128 200010 60 516 NA 5
## 6 128 200011 66 484 NA 6
## round Set_1 Set_2 Set_3 surface tourney_date tourney_id
## 1 R128 6-1 6-2 <NA> Hard 2000-01-17 2000-W-SL-AUS-01A-2000
## 2 R128 6-4 6-2 <NA> Hard 2000-01-17 2000-W-SL-AUS-01A-2000
## 3 R128 6-0 6-1 <NA> Hard 2000-01-17 2000-W-SL-AUS-01A-2000
## 4 R128 6-1 5-7 7-5 Hard 2000-01-17 2000-W-SL-AUS-01A-2000
## 5 R128 6-3 6-4 <NA> Hard 2000-01-17 2000-W-SL-AUS-01A-2000
## 6 R128 6-4 1-6 6-4 Hard 2000-01-17 2000-W-SL-AUS-01A-2000
## tourney_level tourney_name winner_id winner_rank winner_rank_points
## 1 G Australian Open 200001 1 6003
## 2 G Australian Open 200003 63 510
## 3 G Australian Open 200006 53 574
## 4 G Australian Open 200008 116 245
## 5 G Australian Open 200009 72 439
## 6 G Australian Open 200012 84 347
## winner_seed loser_first_name loser_last_name loser_hand loser_dob
## 1 1 Mirjana Lucic R 1982-03-09
## 2 NA Kerry Anne Guse R 1972-12-04
## 3 NA Jolene Watanabe Giltz R 1968-08-31
## 4 NA Silvija Talaja R 1978-01-14
## 5 NA Rita Grande R 1975-03-23
## 6 NA Katarina Srebotnik R 1981-03-12
## loser_country winner_first_name winner_last_name winner_hand winner_dob
## 1 CRO Martina Hingis R 1980-09-30
## 2 AUS Justine Henin R 1982-06-01
## 3 USA Karina Habsudova R 1973-08-02
## 4 CRO Alicia Molik R 1981-01-27
## 5 ITA Tamarine Tanasugarn R 1977-05-24
## 6 SLO Jana Nejedly R 1974-06-09
## winner_country winner_age loser_age winner_age_log loser_age_log
## 1 SUI 19 18 1.278754 1.255273
## 2 BEL 18 27 1.255273 1.431364
## 3 SVK 26 31 1.414973 1.491362
## 4 AUS 19 22 1.278754 1.342423
## 5 THA 23 25 1.361728 1.397940
## 6 CAN 26 19 1.414973 1.278754