library(tidyverse)
#data wrangle
  if (! "tidyverse" %in% installed.packages()) install.packages("tidyverse")
  if (! "skimr" %in% installed.packages()) install.packages("skimr")
  if (! "janitor" %in% installed.packages()) install.packages("janitor")
  if (! "scales" %in% installed.packages()) install.packages("scales")
#communication
  if (! "DT" %in% installed.packages()) install.packages("DT")
  if (! "kableExtra" %in% installed.packages()) install.packages("kableExtra")

Intro

Dados extraídos do kaggle https://www.kaggle.com/rio2016/olympic-games?select=athletes.csv

This dataset consists of the official statistics on the 11,538 athletes and 306 events at the 2016 Olympic Games in Rio de Janeiro. The athletes file includes the name, nationality (as a three letter IOC country code), gender, age (as date of birth), height in meters, weight in kilograms, sport, and quantity of gold, silver, and/or bronze medals won for every Olympic athlete at Rio. The events file lists the name, sport, discipline (if available), gender of competitors, and venue(s) for every Olympic event at Rio 2016.

CREDITS: Source Data: Rio 2016 website ; Data Files: GitHub user flother

  • alternativy

https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results

This is a historical dataset on the modern Olympic Games, including all the Games from Athens 1896 to Rio 2016. I scraped this data from www.sports-reference.com in May 2018. The R code I used to scrape and wrangle the data is on GitHub. I recommend checking my kernel before starting your own analysis.

Note that the Winter and Summer Games were held in the same year up until 1992. After that, they staggered them such that Winter Games occur on a four year cycle starting with 1994, then Summer in 1996, then Winter in 1998, and so on. A common mistake people make when analyzing this data is to assume that the Summer and Winter Games have always been staggered.

Content

The file contains 271116 rows and 15 columns. Each row corresponds to an individual athlete competing in an individual Olympic event (athlete-events). The columns are:

ID - Unique number for each athlete Name - Athlete’s name Sex - M or F Age - Integer Height - In centimeters Weight - In kilograms Team - Team name NOC - National Olympic Committee 3-letter code Games - Year and season Year - Integer Season - Summer or Winter City - Host city Sport - Sport Event - Event Medal - Gold, Silver, Bronze, or NA

Read - separate databases

Athletes

athletes_raw <- readr::read_csv("data/athletes.csv") %>% 
  dplyr::glimpse()
## Rows: 11,538
## Columns: 11
## $ id          <dbl> 736041664, 532037425, 435962603, 521041435, 33922579, 1730…
## $ name        <chr> "A Jesus Garcia", "A Lam Shin", "Aaron Brown", "Aaron Cook…
## $ nationality <chr> "ESP", "KOR", "CAN", "MDA", "NZL", "AUS", "USA", "AUS", "E…
## $ sex         <chr> "male", "female", "male", "male", "male", "male", "male", …
## $ dob         <chr> "10/17/69", "9/23/86", "5/27/92", "1/2/91", "11/26/90", "1…
## $ height      <dbl> 1.72, 1.68, 1.98, 1.83, 1.81, 1.80, 2.05, 1.93, 1.80, 1.65…
## $ weight      <dbl> 64, 56, 79, 80, 71, 67, 98, 100, 62, 54, 63, 66, NA, 49, 6…
## $ sport       <chr> "athletics", "fencing", "athletics", "taekwondo", "cycling…
## $ gold        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ silver      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ bronze      <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Countries

countriess_raw <- readr::read_csv("data/countries.csv") %>% 
  dplyr::glimpse()
## Rows: 201
## Columns: 4
## $ country        <chr> "Afghanistan", "Albania", "Algeria", "American Samoa*",…
## $ code           <chr> "AFG", "ALB", "ALG", "ASA", "AND", "ANG", "ANT", "ARG",…
## $ population     <dbl> 32526562, 2889167, 39666519, 55538, 70473, 25021974, 91…
## $ gdp_per_capita <dbl> 594.3231, 3945.2176, 4206.0312, NA, NA, 4101.4722, 1371…

Events

events_raw <- readr::read_csv("data/events.csv") %>% 
  dplyr::glimpse()
## Rows: 306
## Columns: 6
## $ id         <dbl> 701492, 305278, 708010, 729643, 567019, 607924, 519818, 778…
## $ sport      <chr> "aquatics", "aquatics", "aquatics", "aquatics", "aquatics",…
## $ discipline <chr> "backstroke", "backstroke", "backstroke", "backstroke", "br…
## $ name       <chr> "Women's 100m Backstroke", "Women's 200m Backstroke", "Men'…
## $ sex        <chr> "female", "female", "male", "male", "female", "female", "ma…
## $ venues     <chr> "Olympic Aquatics Stadium", "Olympic Aquatics Stadium", "Ol…

Data : Athletes

DT::datatable(athletes_raw)

Adjusts

f_transform_yy_in_yyyy <- function(x){
  ifelse(as.numeric(str_sub(x, start = -2, end = -1)) > 20,
         paste0(str_sub(x, start = 1, end = -3), "19", str_sub(x, start = -2, end = -1)),
         paste0(str_sub(x, start = 1, end = -3), "20", str_sub(x, start = -2, end = -1))
         )
  } 
athletes <- athletes_raw %>% 
  dplyr::mutate(id = as.character(id)) %>% 
  #IMC
    dplyr::mutate(IMC = weight/(height^2)) %>% 
    dplyr::relocate(IMC, .after = weight) %>%
  #Age
    dplyr::mutate(dob = f_transform_yy_in_yyyy(dob)) %>%
    dplyr::mutate(dob = lubridate::mdy(dob)) %>%
    dplyr::mutate(age = 2016 - lubridate::year(dob)) %>% 
    dplyr::relocate(age, .after = dob) %>% 
  #Medals Total
    dplyr::mutate(medal_total = gold + silver + bronze) %>% 
    dplyr::mutate(medal_total_nonZero = ifelse(medal_total == 0, NA, medal_total)) %>% 
  #Medals Rank
    dplyr::mutate(medal_rank = 
                    dplyr::case_when(
                      gold > 0 ~ "gold",
                      silver > 0 ~ "silver",
                      bronze > 0 ~ "bronze",
                      TRUE ~ "none"
                    )) %>% 
    dplyr::mutate(medal_rank = as.ordered(medal_rank)) %>%
    dplyr::mutate(medal_rank = forcats::fct_relevel(medal_rank, c("none", "bronze", "silver", "gold"))) %>%
  dplyr::glimpse()
## Rows: 11,538
## Columns: 16
## $ id                  <chr> "736041664", "532037425", "435962603", "521041435"…
## $ name                <chr> "A Jesus Garcia", "A Lam Shin", "Aaron Brown", "Aa…
## $ nationality         <chr> "ESP", "KOR", "CAN", "MDA", "NZL", "AUS", "USA", "…
## $ sex                 <chr> "male", "female", "male", "male", "male", "male", …
## $ dob                 <date> 1969-10-17, 1986-09-23, 1992-05-27, 1991-01-02, 1…
## $ age                 <dbl> 47, 30, 24, 25, 26, 26, 23, 25, 28, 25, 19, 20, 24…
## $ height              <dbl> 1.72, 1.68, 1.98, 1.83, 1.81, 1.80, 2.05, 1.93, 1.…
## $ weight              <dbl> 64, 56, 79, 80, 71, 67, 98, 100, 62, 54, 63, 66, N…
## $ IMC                 <dbl> 21.63332, 19.84127, 20.15100, 23.88844, 21.67211, …
## $ sport               <chr> "athletics", "fencing", "athletics", "taekwondo", …
## $ gold                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ silver              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ bronze              <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ medal_total         <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,…
## $ medal_total_nonZero <dbl> NA, NA, 1, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, …
## $ medal_rank          <ord> none, none, bronze, none, none, none, bronze, none…

EDA - All athletes

skimr::skim(athletes)
Data summary
Name athletes
Number of rows 11538
Number of columns 16
_______________________
Column type frequency:
character 5
Date 1
factor 1
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1 5 9 0 11538 0
name 0 1 4 40 0 11517 0
nationality 0 1 3 3 0 207 0
sex 0 1 4 6 0 2 0
sport 0 1 4 17 0 28 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
dob 1 1 1954-05-20 2002-11-26 1990-05-21 5595

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
medal_rank 0 1 TRUE 4 non: 9681, bro: 642, gol: 619, sil: 596

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 1 1.00 26.70 5.38 14.00 23.00 26.00 30.00 62.00 ▃▇▁▁▁
height 330 0.97 1.77 0.11 1.21 1.69 1.76 1.84 2.21 ▁▁▇▅▁
weight 659 0.94 72.07 16.18 31.00 60.00 70.00 81.00 170.00 ▂▇▂▁▁
IMC 680 0.94 22.87 3.41 11.40 20.72 22.42 24.43 53.40 ▂▇▁▁▁
gold 0 1.00 0.06 0.26 0.00 0.00 0.00 0.00 5.00 ▇▁▁▁▁
silver 0 1.00 0.06 0.24 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
bronze 0 1.00 0.06 0.24 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
medal_total 0 1.00 0.18 0.43 0.00 0.00 0.00 0.00 6.00 ▇▁▁▁▁
medal_total_nonZero 9681 0.16 1.09 0.38 1.00 1.00 1.00 1.00 6.00 ▇▁▁▁▁

EDA - By sex

athletes %>% 
  dplyr::group_by(sex) %>% 
  skimr::skim()  
Data summary
Name Piped data
Number of rows 11538
Number of columns 16
_______________________
Column type frequency:
character 4
Date 1
factor 1
numeric 9
________________________
Group variables sex

Variable type: character

skim_variable sex n_missing complete_rate min max empty n_unique whitespace
id female 0 1 5 9 0 5205 0
id male 0 1 5 9 0 6333 0
name female 0 1 4 40 0 5197 0
name male 0 1 4 37 0 6320 0
nationality female 0 1 3 3 0 202 0
nationality male 0 1 3 3 0 206 0
sport female 0 1 4 17 0 28 0
sport male 0 1 4 17 0 28 0

Variable type: Date

skim_variable sex n_missing complete_rate min max median n_unique
dob female 0 1 1954-05-20 2002-11-26 1990-09-30 3593
dob male 1 1 1955-08-05 2002-04-14 1990-02-02 4045

Variable type: factor

skim_variable sex n_missing complete_rate ordered n_unique top_counts
medal_rank female 0 1 TRUE 4 non: 4332, bro: 301, gol: 294, sil: 278
medal_rank male 0 1 TRUE 4 non: 5349, bro: 341, gol: 325, sil: 318

Variable type: numeric

skim_variable sex n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age female 0 1.00 26.20 5.26 14.00 23.00 26.00 29.00 62.00 ▅▇▁▁▁
age male 1 1.00 27.11 5.44 14.00 23.00 26.00 30.00 61.00 ▃▇▂▁▁
height female 139 0.97 1.70 0.09 1.21 1.64 1.70 1.75 2.03 ▁▁▇▆▁
height male 191 0.97 1.82 0.10 1.45 1.75 1.82 1.89 2.21 ▁▃▇▂▁
weight female 199 0.96 62.64 11.25 31.00 55.00 61.00 68.00 150.00 ▂▇▁▁▁
weight male 460 0.93 80.10 15.38 39.00 70.00 78.00 88.00 170.00 ▂▇▂▁▁
IMC female 209 0.96 21.58 2.93 13.26 19.84 21.23 22.77 48.98 ▅▇▁▁▁
IMC male 471 0.93 23.96 3.41 11.40 21.91 23.53 25.34 53.40 ▁▇▁▁▁
gold female 0 1.00 0.06 0.26 0.00 0.00 0.00 0.00 4.00 ▇▁▁▁▁
gold male 0 1.00 0.05 0.25 0.00 0.00 0.00 0.00 5.00 ▇▁▁▁▁
silver female 0 1.00 0.06 0.25 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
silver male 0 1.00 0.05 0.23 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
bronze female 0 1.00 0.06 0.25 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
bronze male 0 1.00 0.06 0.24 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
medal_total female 0 1.00 0.19 0.45 0.00 0.00 0.00 0.00 5.00 ▇▁▁▁▁
medal_total male 0 1.00 0.17 0.41 0.00 0.00 0.00 0.00 6.00 ▇▁▁▁▁
medal_total_nonZero female 4332 0.17 1.11 0.42 1.00 1.00 1.00 1.00 5.00 ▇▁▁▁▁
medal_total_nonZero male 5349 0.16 1.08 0.34 1.00 1.00 1.00 1.00 6.00 ▇▁▁▁▁

EDA - relation

sex

athletes %>% 
  janitor::tabyl(sex) %>% 
  dplyr::arrange(sex) %>% 
  janitor::adorn_totals("row") %>% 
  dplyr::mutate(percent = scales::percent(percent)) %>% 
  kableExtra::kbl() %>% 
  kableExtra::kable_styling(full_width = FALSE)
sex n percent
female 5205 45.1%
male 6333 54.9%
Total 11538 100.0%

medal_rank

athletes %>% 
  dplyr::mutate(medal_rank = forcats::fct_rev(medal_rank)) %>% 
  janitor::tabyl(medal_rank) %>% 
  dplyr::arrange(medal_rank) %>% 
  janitor::adorn_totals("row") %>% 
  dplyr::mutate(percent = scales::percent(percent)) %>% 
  kableExtra::kbl() %>% 
  kableExtra::kable_styling(full_width = FALSE)
medal_rank n percent
gold 619 5.36%
silver 596 5.17%
bronze 642 5.56%
none 9681 83.91%
Total 11538 100.00%

medal_rank vs. sex (absoluts)

athletes %>% 
  janitor::tabyl(medal_rank, sex) %>% 
  dplyr::arrange(desc(medal_rank)) %>% 
  kableExtra::kbl() %>% 
  kableExtra::kable_styling(full_width = FALSE)
medal_rank female male
gold 294 325
silver 278 318
bronze 301 341
none 4332 5349
athletes %>% 
  janitor::tabyl(medal_rank, sex) %>% 
  dplyr::arrange(desc(medal_rank)) %>% 
  janitor::adorn_totals("col") %>%
  kableExtra::kbl() %>% 
  kableExtra::kable_styling(full_width = FALSE)
medal_rank female male Total
gold 294 325 619
silver 278 318 596
bronze 301 341 642
none 4332 5349 9681
athletes %>% 
  janitor::tabyl(medal_rank, sex) %>% 
  dplyr::arrange(desc(medal_rank)) %>% 
  janitor::adorn_totals("row") %>%
  kableExtra::kbl() %>% 
  kableExtra::kable_styling(full_width = FALSE)
medal_rank female male
gold 294 325
silver 278 318
bronze 301 341
none 4332 5349
Total 5205 6333

medal_rank vs. sex (relatives)

athletes %>% 
  janitor::tabyl(medal_rank, sex) %>% 
  dplyr::arrange(desc(medal_rank)) %>% 
  janitor::adorn_totals(c("row", "col")) %>%
  janitor::adorn_percentages("all") %>% 
  dplyr::mutate(female = scales::percent(female)) %>% 
  dplyr::mutate(male = scales::percent(male)) %>% 
  dplyr::mutate(Total = scales::percent(Total)) %>% 
  kableExtra::kbl() %>% 
  kableExtra::kable_styling(full_width = FALSE)
medal_rank female male Total
gold 2.548% 2.817% 5.36%
silver 2.409% 2.756% 5.17%
bronze 2.609% 2.955% 5.56%
none 37.546% 46.360% 83.91%
Total 45.112% 54.888% 100.00%

medal_rank vs. sex (relative by Cols)

athletes %>% 
  janitor::tabyl(medal_rank, sex) %>% 
  dplyr::arrange(desc(medal_rank)) %>% 
  janitor::adorn_totals(c("row", "col")) %>%
  janitor::adorn_percentages("col") %>% 
  dplyr::mutate(female = scales::percent(female)) %>% 
  dplyr::mutate(male = scales::percent(male)) %>% 
  dplyr::mutate(Total = scales::percent(Total)) %>% 
  kableExtra::kbl() %>% 
  kableExtra::kable_styling(full_width = FALSE)
medal_rank female male Total
gold 5.65% 5.13% 5.36%
silver 5.34% 5.02% 5.17%
bronze 5.78% 5.38% 5.56%
none 83.23% 84.46% 83.91%
Total 100.00% 100.00% 100.00%

medal_rank vs. sex (relative by Rows)

athletes %>% 
  janitor::tabyl(medal_rank, sex) %>% 
  dplyr::arrange(desc(medal_rank)) %>% 
  janitor::adorn_totals(c("row", "col")) %>%
  janitor::adorn_percentages("row") %>% 
  dplyr::mutate(female = scales::percent(female)) %>% 
  dplyr::mutate(male = scales::percent(male)) %>% 
  dplyr::mutate(Total = scales::percent(Total)) %>% 
  kableExtra::kbl() %>% 
  kableExtra::kable_styling(full_width = FALSE)
medal_rank female male Total
gold 47.50% 52.50% 100%
silver 46.64% 53.36% 100%
bronze 46.88% 53.12% 100%
none 44.75% 55.25% 100%
Total 45.11% 54.89% 100%