This task is performed using the data set of 2021 (2020) Olympics In Tokyo. This dataset contains the details of over 11,000 athletes, with 47 disciplines, along with 743 Teams taking part in the Games. This includes the details of the Athletes, Coaches, Teams participating as well as the Entries by gender. It contains their names, countries represented, discipline, gender of competitors, name of the coaches and also medals won.

Project Source: Kaggle - Arjun Prasad Sarkhel

Data anlysis steps include Ask, Prepare, Process, Analyze, Share and Act (Google Data Analytics For Professional).

Table of Content

  1. Ask

  2. Prepare

  3. Process & Analyze

  4. Share & Act

ASK

Objective: EDA on Countries participated, disciplines, events & coaches

Goals:

Key Deliverables:

Timeline:

Date Key Milestone
21 Sep 2021 Download, Prepare, Process and Analyze
22 Sep 2021 Update Kaggle Workbook
23 Sep 2021 Gather feedback and improve

PREPARE

## Load necessary packages
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(skimr)

install.packages("readxl")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(readxl)

install.packages("plotly")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Import datasets

athletes <- read_xlsx("data/Athletes.xlsx")
coaches <- read_xlsx("data/Coaches.xlsx")
gender <- read_xlsx("data/EntriesGender.xlsx")
medals <- read_xlsx("data/Medals.xlsx")
teams <- read_xlsx("data/Teams.xlsx")
## Preview "athletes"
glimpse(athletes)
## Rows: 11,085
## Columns: 3
## $ Name       <chr> "AALERUD Katrine", "ABAD Nestor", "ABAGNALE Giovanni", "ABA…
## $ NOC        <chr> "Norway", "Spain", "Italy", "Spain", "Spain", "France", "Ch…
## $ Discipline <chr> "Cycling Road", "Artistic Gymnastics", "Rowing", "Basketbal…
skim_without_charts(athletes)
Data summary
Name athletes
Number of rows 11085
Number of columns 3
_______________________
Column type frequency:
character 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1 4 35 0 11062 0
NOC 0 1 3 34 0 206 0
Discipline 0 1 4 21 0 46 0

Remarks:

## Preview "coaches"

glimpse(coaches)
## Rows: 394
## Columns: 4
## $ Name       <chr> "ABDELMAGID Wael", "ABE Junya", "ABE Katsuhiko", "ADAMA Che…
## $ NOC        <chr> "Egypt", "Japan", "Japan", "Côte d'Ivoire", "Japan", "Japan…
## $ Discipline <chr> "Football", "Volleyball", "Basketball", "Football", "Volley…
## $ Event      <chr> NA, NA, NA, NA, NA, "Men", "Men", "Softball", "Men", "Women…
skim_without_charts(coaches)
Data summary
Name coaches
Number of rows 394
Number of columns 4
_______________________
Column type frequency:
character 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1.00 8 30 0 381 0
NOC 0 1.00 3 26 0 61 0
Discipline 0 1.00 6 17 0 9 0
Event 145 0.63 3 8 0 6 0

Remarks:

## Preview "gender"

glimpse(gender)
## Rows: 46
## Columns: 4
## $ Discipline <chr> "3x3 Basketball", "Archery", "Artistic Gymnastics", "Artist…
## $ Female     <chr> "32", "64", "98", "105", "969", "86", "90", "144", "48", "1…
## $ Male       <chr> "32", "64", "98", "0", "1072", "87", "144", "144", "48", "1…
## $ Total      <chr> "64", "128", "196", "105", "2041", "173", "234", "288", "96…
gender <- gender %>%
  mutate(Female = as.numeric(Female),
         Male = as.numeric(Male),
         Total = as.numeric(Total))
skim_without_charts(gender)
Data summary
Name gender
Number of rows 46
Number of columns 4
_______________________
Column type frequency:
character 1
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Discipline 0 1 4 21 0 46 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Female 0 1 118.09 147.17 10 42.75 90.0 138.75 969
Male 0 1 127.91 166.90 0 40.25 97.5 149.75 1072
Total 0 1 246.00 312.46 19 85.50 190.0 288.75 2041

Remarks:

## Preview "medals"

glimpse(medals)
## Rows: 93
## Columns: 7
## $ Rank            <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11…
## $ `Team/NOC`      <chr> "United States of America", "People's Republic of Chin…
## $ Gold            <chr> "39", "38", "27", "22", "20", "17", "10", "10", "10", …
## $ Silver          <chr> "41", "32", "14", "21", "28", "7", "12", "12", "11", "…
## $ Bronze          <chr> "33", "18", "17", "22", "23", "22", "14", "11", "16", …
## $ Total           <chr> "113", "88", "58", "65", "71", "46", "36", "33", "37",…
## $ `Rank by Total` <chr> "1", "2", "5", "4", "3", "6", "9", "10", "8", "7", "11…
medals <- medals %>%
  mutate(Rank = as.numeric(Rank),
         Gold = as.numeric(Gold),
         Silver = as.numeric(Silver),
         Bronze = as.numeric(Bronze),
         Total = as.numeric(Total),
         'Rank by Total' = as.numeric(`Rank by Total`))
skim_without_charts(medals)
Data summary
Name medals
Number of rows 93
Number of columns 7
_______________________
Column type frequency:
character 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Team/NOC 0 1 3 26 0 93 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Rank 0 1 46.33 26.22 1 24 46 70 86
Gold 0 1 3.66 7.02 0 0 1 3 39
Silver 0 1 3.63 6.63 0 0 1 4 41
Bronze 0 1 4.32 6.21 0 1 2 5 33
Total 0 1 11.61 19.09 1 2 4 11 113
Rank by Total 0 1 43.49 24.17 1 23 47 66 77

Remarks:

## Preview "teams"

glimpse(teams)
## Rows: 743
## Columns: 4
## $ Name       <chr> "Belgium", "China", "China", "France", "Italy", "Japan", "J…
## $ Discipline <chr> "3x3 Basketball", "3x3 Basketball", "3x3 Basketball", "3x3 …
## $ NOC        <chr> "Belgium", "People's Republic of China", "People's Republic…
## $ Event      <chr> "Men", "Men", "Women", "Women", "Women", "Men", "Women", "M…
skim_without_charts(teams)
Data summary
Name teams
Number of rows 743
Number of columns 4
_______________________
Column type frequency:
character 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1 3 34 0 146 0
Discipline 0 1 6 19 0 20 0
NOC 0 1 3 26 0 84 0
Event 0 1 3 32 0 36 0

Remarks:

PREPARE & ANALYZE

Number of Athletes, breakdown by Discipline

disciplines <- athletes %>%
    group_by(Discipline) %>%
    summarize(Number_of_Athletes = n(),
             Number_of_Countries = n_distinct(NOC)) %>%
    arrange(desc(Number_of_Athletes))
    
mean(disciplines$Number_of_Athletes)
## [1] 240.9783
plot_ly(disciplines, 
        x = ~reorder(Discipline, - Number_of_Athletes), 
        y = ~Number_of_Athletes,
        type = "bar") %>%
        layout(title = 'Number of Athletes <br><sup>breakdown by disciplines</sup>', 
               xaxis = list(title = 'Discipline'), 
               yaxis = list(title = 'Number of Athletes'),
               annotations = list(x = "Archery", y = 1000, 
                                  text = "Average Number of Athletes per Discipline is 240", 
                                  showarrow = FALSE))

Number of Countries, breakdown by Discipline

plot_ly(data = disciplines, 
        x = ~reorder(Discipline, - Number_of_Countries), 
        y = ~Number_of_Countries,
        type = "bar") %>%
        layout(title = 'Number of Countries <br><sup>breakdown by disciplines</sup>', 
               xaxis = list(title = 'Discipline'), 
               yaxis = list(title = 'Number of Countries'),
               annotations = list(x = "Beach Volleyball", y = 100, text = "Average Number of Countries per Discipline is 46", 
                                  showarrow = FALSE))

Number of Athletes vs. Countries, breakdown by Discipline

plot_ly(data = disciplines,
        x =  ~Number_of_Athletes, y = ~Number_of_Countries,
        type = 'scatter', 
        mode = 'markers',
        hoverinfo = 'text',
        text = ~paste('</br> Discipline: ', Discipline,
                      '</br> # Athletes: ', Number_of_Athletes,
                      '</br> # Countries: ', Number_of_Countries)) %>%
        layout(title = 'Number of Countries vs. Number of Athletes <br><sup>breakdown by disciplines</sup>', 
               xaxis = list(title = 'Number of Athletes'), 
               yaxis = list(title = 'Number of Countries'))

Number of Athletes vs. Disciplines, breakdown by Country

countries <- athletes %>%
    group_by(NOC) %>%
    summarize(Number_of_Athletes = n(),
             Number_of_Disciplines = n_distinct(Discipline))
plot_ly(countries,
        x =  ~Number_of_Athletes, 
        y = ~Number_of_Disciplines,
        type = 'scatter', 
        mode = 'markers',
        hoverinfo = 'text',
        text = ~paste('</br> Country: ', NOC,
                      '</br> # Athletes: ', Number_of_Athletes,
                      '</br> # Disciplines: ', Number_of_Disciplines)) %>%
        layout(title = 'Number of Athletes vs. Number of Disciplines', 
               xaxis = list(title = 'Number of Athletes'), 
               yaxis = list(title = 'Number of Disciplines'))

Number of Athletes vs. Coaches, breakdown by Country

coaches_by_countries <- coaches %>%
    group_by(NOC) %>%
    summarize(Number_of_Coaches = n())

coaches_athletes <- merge(x = countries, 
                          y = coaches_by_countries, 
                          by = "NOC", 
                          all.x = TRUE)

head(coaches_athletes, n = 5)
##              NOC Number_of_Athletes Number_of_Disciplines Number_of_Coaches
## 1    Afghanistan                  5                     4                NA
## 2        Albania                  8                     6                NA
## 3        Algeria                 41                    14                NA
## 4 American Samoa                  5                     4                NA
## 5        Andorra                  2                     2                NA
plot_ly(coaches_athletes,
        x =  ~Number_of_Athletes, 
        y = ~Number_of_Coaches,
        type = 'scatter', 
        mode = 'markers',
        hoverinfo = 'text',
        text = ~paste('</br> Country: ', NOC,
                      '</br> # Coaches', Number_of_Coaches, 
                      '</br> # Athletes',Number_of_Athletes)) %>%
        layout(title = 'Number of Athletes vs. Number of Coaches <br><sup>Excluding 145 countries with no coaches coming</sup>', 
               xaxis = list(title = 'Number of Athletes'), 
               yaxis = list(title = 'Coaches'))
## Warning: Ignoring 145 observations

Number of Females vs. Males, breakdown by Discipline

plot_ly(gender, 
        x = ~Female, 
        y = ~reorder(Discipline, -Total), 
        type = 'bar', orientation = 'h', 
        name = 'Female',
        marker = list(color = 'rgba(246, 78, 139, 0.6)',
                      line = list(color = 'rgba(246, 78, 139, 1.0)',
                      width = 1))) %>% 
        add_trace(x = ~Male, 
                  name = 'Male',
                  marker = list(color = 'rgba(58, 71, 80, 0.6)',
                                line = list(color = 'rgba(58, 71, 80, 1.0)',
                                width = 1)))%>%
        layout(barmode = 'stack',
         xaxis = list(title = ""),
         yaxis = list(title ="", size = 0.2))

Number of Medals, breakdown by Country and Medal Type

names(medals)[2] <- 'NOC'

medals_longer <- medals %>%
    pivot_longer(cols = 3:6, 
                 names_to = "Medal_Type", 
                 values_to = "Number_of_Medals") %>%
    arrange('Rank by Total')

head(medals_longer, n=10)
## # A tibble: 10 × 5
##     Rank NOC                        `Rank by Total` Medal_Type Number_of_Medals
##    <dbl> <chr>                                <dbl> <chr>                 <dbl>
##  1     1 United States of America                 1 Gold                     39
##  2     1 United States of America                 1 Silver                   41
##  3     1 United States of America                 1 Bronze                   33
##  4     1 United States of America                 1 Total                   113
##  5     2 People's Republic of China               2 Gold                     38
##  6     2 People's Republic of China               2 Silver                   32
##  7     2 People's Republic of China               2 Bronze                   18
##  8     2 People's Republic of China               2 Total                    88
##  9     3 Japan                                    5 Gold                     27
## 10     3 Japan                                    5 Silver                   14
plot_ly(medals_longer,
        x = ~reorder(NOC, Rank), 
        y = ~Number_of_Medals,
        type = 'bar',
        hoverinfo = 'text',
        text = ~paste('</br> Country: ', NOC,
                    '</br> # Medals', Number_of_Medals),
        transforms = list(
        list(
        type = 'filter',
        target = ~Medal_Type,
        operation = 'in',
        value = unique(medals_longer$Medal_Type)[1]
      )
    )) %>%
  layout(
    updatemenus = list(
      list(
        type = 'dropdown',
        active = 0,
        buttons = list(
          list(method = "restyle",
               args = list("transforms[0].value", unique(medals_longer$Medal_Type)[1]),
               label = unique(medals_longer$Medal_Type)[1]),
          list(method = "restyle",
               args = list("transforms[0].value", unique(medals_longer$Medal_Type)[2]),
               label = unique(medals_longer$Medal_Type)[2]),
          list(method = "restyle",
               args = list("transforms[0].value", unique(medals_longer$Medal_Type)[3]),
               label = unique(medals_longer$Medal_Type)[3]),
          list(method = "restyle",
               args = list("transforms[0].value", unique(medals_longer$Medal_Type)[4]),
               label = unique(medals_longer$Medal_Type)[4])
        )
      )
    )
  ) %>%
layout(title = 'Countries by Rank <br><sup>Pick type of Medals</sup>', 
        xaxis = list(title = 'Country'), yaxis = list(title = 'Number of Medals'))

SHARE & ACT