title: “Women Tennis Exploratory Analysis”
author: “calin”
output:
html_document:
number_sections: true
toc: true
theme: cosmo
highlight: tango

libraries & loading data

#WTA Exploratory analysis

rm(list=ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 479308 25.6     940480 50.3   750400 40.1
## Vcells 860607  6.6    1650153 12.6  1114162  8.6
#libs
library(ggplot2) 
library(data.table)
library(dplyr)
library(treemapify)
library(wordcloud)
library(ggrepel)

players           <-fread("../input/players.csv", stringsAsFactors = F, showProgress = T)
matches           <-fread("../input/matches.csv", stringsAsFactors = F, showProgress = T)
qualifying_matches<-fread("../input/qualifying_matches.csv", stringsAsFactors = F, showProgress = T)
rankings          <-fread("../input/rankings.csv", stringsAsFactors = F, showProgress = T)

brief data exploration

# summary(players)
# summary(matches)
# summary(qualifying_matches)
# summary(rankings)

dim(players)
## [1] 20662     6
dim(matches)
## [1] 50577    33
dim(qualifying_matches)
## [1] 305498     50
dim(rankings)
## [1] 1597902       5
str(players)
## Classes 'data.table' and 'data.frame':   20662 obs. of  6 variables:
##  $ player_id   : int  200001 200002 200003 200004 200005 200006 200007 200008 200009 200010 ...
##  $ first_name  : chr  "Martina" "Mirjana" "Justine" "Kerry Anne" ...
##  $ last_name   : chr  "Hingis" "Lucic" "Henin" "Guse" ...
##  $ hand        : chr  "R" "R" "R" "R" ...
##  $ birth_date  : int  19800930 19820309 19820601 19721204 19680831 19730802 19780114 19810127 19770524 19750323 ...
##  $ country_code: chr  "SUI" "CRO" "BEL" "AUS" ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(matches)
## Classes 'data.table' and 'data.frame':   50577 obs. of  33 variables:
##  $ best_of           : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ draw_size         : int  128 128 128 128 128 128 128 128 128 128 ...
##  $ loser_age         : num  17.9 27.1 31.4 22 24.8 ...
##  $ loser_entry       : chr  "" "Q" "" "" ...
##  $ loser_hand        : chr  "R" "R" "R" "R" ...
##  $ loser_ht          : num  NA NA NA NA NA 180 NA NA NA NA ...
##  $ loser_id          : int  200002 200004 200005 200007 200010 200011 200014 200015 200018 200019 ...
##  $ loser_ioc         : chr  "CRO" "AUS" "USA" "CRO" ...
##  $ loser_name        : chr  "Mirjana Lucic" "Kerry Anne Guse" "Jolene Watanabe Giltz" "Silvija Talaja" ...
##  $ loser_rank        : chr  "49" "133" "118" "23" ...
##  $ loser_rank_points : num  640 199 243 1112 516 ...
##  $ loser_seed        : chr  "" "" "" "" ...
##  $ match_num         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ minutes           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ round             : chr  "R128" "R128" "R128" "R128" ...
##  $ score             : chr  "6-1 6-2" "6-4 6-2" "6-0 6-1" "6-1 5-7 7-5" ...
##  $ surface           : chr  "Hard" "Hard" "Hard" "Hard" ...
##  $ tourney_date      : chr  "20000117" "20000117" "20000117" "20000117" ...
##  $ tourney_id        : chr  "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" ...
##  $ tourney_level     : chr  "G" "G" "G" "G" ...
##  $ tourney_name      : chr  "Australian Open" "Australian Open" "Australian Open" "Australian Open" ...
##  $ winner_age        : num  19.3 17.6 26.5 19 22.7 ...
##  $ winner_entry      : chr  "" "" "" "" ...
##  $ winner_hand       : chr  "R" "R" "R" "R" ...
##  $ winner_ht         : chr  "170" "167" "" "182" ...
##  $ winner_id         : num  2e+05 2e+05 2e+05 2e+05 2e+05 ...
##  $ winner_ioc        : chr  "SUI" "BEL" "SVK" "AUS" ...
##  $ winner_name       : chr  "Martina Hingis" "Justine Henin" "Karina Habsudova" "Alicia Molik" ...
##  $ winner_rank       : chr  "1" "63" "53" "116" ...
##  $ winner_rank_points: num  6003 510 574 245 439 ...
##  $ winner_seed       : chr  "1" "" "" "" ...
##  $ year              : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ V33               : int  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>

Let’s meet them first

top 200 first names weighted by #

# first names wordcloud
first_names_freq <- players %>%
  group_by(first_name) %>%
  summarise(Players = n()) %>%
  top_n(n = 200, wt = Players)
pal <- brewer.pal(8, "Dark2")
wordcloud(
  words = first_names_freq$first_name, freq = first_names_freq$Players, 
  random.order=T, colors=pal, vfont=c("sans serif","plain"))

Countries weighted by number of WTA players (lifetime)

players %>%
  group_by(country_code) %>%
  summarise(Players = n()) %>%
    ggplot(aes(area = Players, fill = Players, label = country_code)) +
    geom_treemap() +
    geom_treemap_text(fontface = "italic", colour = "white", place = "centre",
                      grow = TRUE) +
    ggtitle("Countries by # WTA players")

Countries by decade

interesting trends emerge here: - beware USA relative share shrinking - RUS gaining share especially with the very young players - australia dissapearing from the top 20

players %>%
  filter(!is.na(birth_date)) %>%
  mutate(birth_year = as.integer(substr(birth_date, 1, 4))) %>%
  mutate(birth_decade = round((birth_year-5)/10,0)*10 )  %>%
  group_by(birth_decade, country_code) %>%
  filter(birth_decade>=1950) %>%
  filter(!is.na(birth_decade)) %>%
  summarise(Players = n()) %>%
  top_n(n = 20, wt = Players) %>%
  ggplot(aes(area = Players, fill = country_code, label = country_code, subgroup = birth_decade)) +
    geom_treemap() +
    geom_treemap_text(fontface = "italic", colour = "white", place = "centre",
                      grow = TRUE) +
    facet_wrap( ~ birth_decade) + 
    theme(legend.position = "none") + 
    ggtitle("Top 20 Countries by player's birth decade")

WTA players by birth year and right/left hand

LEft players are still a minority, same with the youngest players

players %>%
  mutate(birth_year=substr(birth_date, 1, 4))  %>%
  group_by(birth_year, hand) %>%
  filter(birth_year>=1950) %>%
  filter(!is.na(birth_year)) %>%
  filter(nchar(hand)>0) %>%
  summarise(Players = n()) %>%
  ggplot(aes(x = birth_year, y = Players, group=hand)) +
    geom_line(aes(col = hand)) + 
    theme_bw() + 
    theme(text = element_text(size=14), axis.text.x = element_text(angle = 45, hjust = 1)) + 
    ggtitle("Left/Right handed players by birth year")

Top players of all time

for those players that reached #1, we looked at the # of times they reached the very top vs the last year that they reached #1

players %>%
  inner_join(rankings) %>%
  filter(ranking==1) %>%
  mutate(name=paste(first_name, last_name)) %>%
  group_by(name) %>%
  summarise(
      Rankings_1=n(), 
      country=min(country_code), 
      last_time_reached = max(ranking_date)) %>%
  mutate(first_place_yr = substr(last_time_reached, 1, 4)) %>%
  ggplot(aes(y = first_place_yr, x = Rankings_1, label=name)) + 
    geom_point(position = "jitter") + 
  # geom_label() +
    geom_text_repel(aes(color = first_place_yr)) +
  theme(text = element_text(size=14)) + 
  ggtitle("WTA top players") + labs(x = "# of #1 Rankings", y = "last year when ranked #1") + 
  theme_bw(base_size = 15)

Top 3 players ranking distribution

players %>%
  inner_join(rankings) %>%
  filter(ranking<=3, ranking_points>1000) %>%
  mutate(name=paste(first_name, last_name)) %>%
  arrange(desc(ranking_date)) %>%
  ggplot(aes(y = ranking_points, x = name, label=name)) + 
    geom_boxplot() + 
    theme(text = element_text(size=14)) +
    theme_bw(base_size = 15) + 
    theme(text = element_text(size=14), axis.text.x = element_text(angle = 45, hjust = 1)) + 
    ggtitle("Rankings for top 3 playes") +
    labs(x = "", y = "WTA rating")

stay tuned.