| title: “Women Tennis Exploratory Analysis” |
| author: “calin” |
| output: |
| html_document: |
| number_sections: true |
| toc: true |
| theme: cosmo |
| highlight: tango |
#WTA Exploratory analysis
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 479308 25.6 940480 50.3 750400 40.1
## Vcells 860607 6.6 1650153 12.6 1114162 8.6
#libs
library(ggplot2)
library(data.table)
library(dplyr)
library(treemapify)
library(wordcloud)
library(ggrepel)
players <-fread("../input/players.csv", stringsAsFactors = F, showProgress = T)
matches <-fread("../input/matches.csv", stringsAsFactors = F, showProgress = T)
qualifying_matches<-fread("../input/qualifying_matches.csv", stringsAsFactors = F, showProgress = T)
rankings <-fread("../input/rankings.csv", stringsAsFactors = F, showProgress = T)
# summary(players)
# summary(matches)
# summary(qualifying_matches)
# summary(rankings)
dim(players)
## [1] 20662 6
dim(matches)
## [1] 50577 33
dim(qualifying_matches)
## [1] 305498 50
dim(rankings)
## [1] 1597902 5
str(players)
## Classes 'data.table' and 'data.frame': 20662 obs. of 6 variables:
## $ player_id : int 200001 200002 200003 200004 200005 200006 200007 200008 200009 200010 ...
## $ first_name : chr "Martina" "Mirjana" "Justine" "Kerry Anne" ...
## $ last_name : chr "Hingis" "Lucic" "Henin" "Guse" ...
## $ hand : chr "R" "R" "R" "R" ...
## $ birth_date : int 19800930 19820309 19820601 19721204 19680831 19730802 19780114 19810127 19770524 19750323 ...
## $ country_code: chr "SUI" "CRO" "BEL" "AUS" ...
## - attr(*, ".internal.selfref")=<externalptr>
str(matches)
## Classes 'data.table' and 'data.frame': 50577 obs. of 33 variables:
## $ best_of : int 3 3 3 3 3 3 3 3 3 3 ...
## $ draw_size : int 128 128 128 128 128 128 128 128 128 128 ...
## $ loser_age : num 17.9 27.1 31.4 22 24.8 ...
## $ loser_entry : chr "" "Q" "" "" ...
## $ loser_hand : chr "R" "R" "R" "R" ...
## $ loser_ht : num NA NA NA NA NA 180 NA NA NA NA ...
## $ loser_id : int 200002 200004 200005 200007 200010 200011 200014 200015 200018 200019 ...
## $ loser_ioc : chr "CRO" "AUS" "USA" "CRO" ...
## $ loser_name : chr "Mirjana Lucic" "Kerry Anne Guse" "Jolene Watanabe Giltz" "Silvija Talaja" ...
## $ loser_rank : chr "49" "133" "118" "23" ...
## $ loser_rank_points : num 640 199 243 1112 516 ...
## $ loser_seed : chr "" "" "" "" ...
## $ match_num : int 1 2 3 4 5 6 7 8 9 10 ...
## $ minutes : num NA NA NA NA NA NA NA NA NA NA ...
## $ round : chr "R128" "R128" "R128" "R128" ...
## $ score : chr "6-1 6-2" "6-4 6-2" "6-0 6-1" "6-1 5-7 7-5" ...
## $ surface : chr "Hard" "Hard" "Hard" "Hard" ...
## $ tourney_date : chr "20000117" "20000117" "20000117" "20000117" ...
## $ tourney_id : chr "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" "2000-W-SL-AUS-01A-2000" ...
## $ tourney_level : chr "G" "G" "G" "G" ...
## $ tourney_name : chr "Australian Open" "Australian Open" "Australian Open" "Australian Open" ...
## $ winner_age : num 19.3 17.6 26.5 19 22.7 ...
## $ winner_entry : chr "" "" "" "" ...
## $ winner_hand : chr "R" "R" "R" "R" ...
## $ winner_ht : chr "170" "167" "" "182" ...
## $ winner_id : num 2e+05 2e+05 2e+05 2e+05 2e+05 ...
## $ winner_ioc : chr "SUI" "BEL" "SVK" "AUS" ...
## $ winner_name : chr "Martina Hingis" "Justine Henin" "Karina Habsudova" "Alicia Molik" ...
## $ winner_rank : chr "1" "63" "53" "116" ...
## $ winner_rank_points: num 6003 510 574 245 439 ...
## $ winner_seed : chr "1" "" "" "" ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ V33 : int NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, ".internal.selfref")=<externalptr>
top 200 first names weighted by #
# first names wordcloud
first_names_freq <- players %>%
group_by(first_name) %>%
summarise(Players = n()) %>%
top_n(n = 200, wt = Players)
pal <- brewer.pal(8, "Dark2")
wordcloud(
words = first_names_freq$first_name, freq = first_names_freq$Players,
random.order=T, colors=pal, vfont=c("sans serif","plain"))
players %>%
group_by(country_code) %>%
summarise(Players = n()) %>%
ggplot(aes(area = Players, fill = Players, label = country_code)) +
geom_treemap() +
geom_treemap_text(fontface = "italic", colour = "white", place = "centre",
grow = TRUE) +
ggtitle("Countries by # WTA players")
interesting trends emerge here: - beware USA relative share shrinking - RUS gaining share especially with the very young players - australia dissapearing from the top 20
players %>%
filter(!is.na(birth_date)) %>%
mutate(birth_year = as.integer(substr(birth_date, 1, 4))) %>%
mutate(birth_decade = round((birth_year-5)/10,0)*10 ) %>%
group_by(birth_decade, country_code) %>%
filter(birth_decade>=1950) %>%
filter(!is.na(birth_decade)) %>%
summarise(Players = n()) %>%
top_n(n = 20, wt = Players) %>%
ggplot(aes(area = Players, fill = country_code, label = country_code, subgroup = birth_decade)) +
geom_treemap() +
geom_treemap_text(fontface = "italic", colour = "white", place = "centre",
grow = TRUE) +
facet_wrap( ~ birth_decade) +
theme(legend.position = "none") +
ggtitle("Top 20 Countries by player's birth decade")
LEft players are still a minority, same with the youngest players
players %>%
mutate(birth_year=substr(birth_date, 1, 4)) %>%
group_by(birth_year, hand) %>%
filter(birth_year>=1950) %>%
filter(!is.na(birth_year)) %>%
filter(nchar(hand)>0) %>%
summarise(Players = n()) %>%
ggplot(aes(x = birth_year, y = Players, group=hand)) +
geom_line(aes(col = hand)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle("Left/Right handed players by birth year")
for those players that reached #1, we looked at the # of times they reached the very top vs the last year that they reached #1
players %>%
inner_join(rankings) %>%
filter(ranking==1) %>%
mutate(name=paste(first_name, last_name)) %>%
group_by(name) %>%
summarise(
Rankings_1=n(),
country=min(country_code),
last_time_reached = max(ranking_date)) %>%
mutate(first_place_yr = substr(last_time_reached, 1, 4)) %>%
ggplot(aes(y = first_place_yr, x = Rankings_1, label=name)) +
geom_point(position = "jitter") +
# geom_label() +
geom_text_repel(aes(color = first_place_yr)) +
theme(text = element_text(size=14)) +
ggtitle("WTA top players") + labs(x = "# of #1 Rankings", y = "last year when ranked #1") +
theme_bw(base_size = 15)
players %>%
inner_join(rankings) %>%
filter(ranking<=3, ranking_points>1000) %>%
mutate(name=paste(first_name, last_name)) %>%
arrange(desc(ranking_date)) %>%
ggplot(aes(y = ranking_points, x = name, label=name)) +
geom_boxplot() +
theme(text = element_text(size=14)) +
theme_bw(base_size = 15) +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle("Rankings for top 3 playes") +
labs(x = "", y = "WTA rating")
stay tuned.