Data Source: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results

Data Set:“120 years of Olympic history: athletes and results”. Basic bio data on athletes and medal results from Athens 1896 to Rio 2016.

  1. Installing necessary packages
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("ggplot2")
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
  1. Reading csv file (file is too big for placing it on GitHub)
athletes<- read.csv("/Users/Olga/Desktop/athlete_events.csv")
  1. Subsetting dataset: selecting only necessary for the analysis columns, filtering rows which do not have medal type.
sub_athletes<-athletes %>% 
  select(-c(NOC,Games,Event)) %>% 
   filter (Medal != "<NA>") 
head(sub_athletes)
##   ID                     Name Sex Age Height Weight           Team Year
## 1  4     Edgar Lindenau Aabye   M  34     NA     NA Denmark/Sweden 1900
## 2 15     Arvo Ossian Aaltonen   M  30     NA     NA        Finland 1920
## 3 15     Arvo Ossian Aaltonen   M  30     NA     NA        Finland 1920
## 4 16 Juhamatti Tapio Aaltonen   M  28    184     85        Finland 2014
## 5 17  Paavo Johannes Aaltonen   M  28    175     64        Finland 1948
## 6 17  Paavo Johannes Aaltonen   M  28    175     64        Finland 1948
##   Season      City      Sport  Medal
## 1 Summer     Paris Tug-Of-War   Gold
## 2 Summer Antwerpen   Swimming Bronze
## 3 Summer Antwerpen   Swimming Bronze
## 4 Winter     Sochi Ice Hockey Bronze
## 5 Summer    London Gymnastics Bronze
## 6 Summer    London Gymnastics   Gold

Genaral Analysis

  1. What counties have the biggest number of medals over the whole olympic history?
total_winners<-sub_athletes %>% 
  filter (Medal != "<NA>") %>% 
   group_by(Team) %>% 
   count(Medal) %>% 
  mutate(sum_n = sum(n)) %>% 
  select(Team,sum_n) %>% 
  slice(1) %>% 
  arrange(desc(sum_n)) 

total_winners 
## # A tibble: 498 x 2
## # Groups:   Team [498]
##    Team          sum_n
##    <fct>         <int>
##  1 United States  5219
##  2 Soviet Union   2451
##  3 Germany        1984
##  4 Great Britain  1673
##  5 France         1550
##  6 Italy          1527
##  7 Sweden         1434
##  8 Australia      1306
##  9 Canada         1243
## 10 Hungary        1127
## # ... with 488 more rows
  1. What kind of sport and what countries have the largest number of gold medals over the whole olympic history?
sport_winners<-sub_athletes %>% 
  filter(Medal=="Gold") %>% 
  group_by(Sport,Team) %>% 
  count(Medal) %>%
  arrange(desc(n))

head(sport_winners)
## # A tibble: 6 x 4
## # Groups:   Sport, Team [6]
##   Sport      Team          Medal     n
##   <fct>      <fct>         <fct> <int>
## 1 Swimming   United States Gold    645
## 2 Athletics  United States Gold    537
## 3 Basketball United States Gold    281
## 4 Ice Hockey Canada        Gold    220
## 5 Rowing     United States Gold    160
## 6 Fencing    Italy         Gold    151
tail(sport_winners)
## # A tibble: 6 x 4
## # Groups:   Sport, Team [6]
##   Sport     Team           Medal     n
##   <fct>     <fct>          <fct> <int>
## 1 Wrestling Austria        Gold      1
## 2 Wrestling Czechoslovakia Gold      1
## 3 Wrestling Greece         Gold      1
## 4 Wrestling Kazakhstan     Gold      1
## 5 Wrestling Serbia         Gold      1
## 6 Wrestling West Germany   Gold      1
  1. What country has the highest number of gold medals by each Olympic games.
winner_by_year<-sub_athletes %>%
  filter(Medal =="Gold") %>%
  select(Year,Team,Medal) %>% 
  group_by(Year,Team) %>% 
  count(Medal) %>%
  group_by(Year) %>% 
  arrange(Year,desc(n)) %>% 
  slice(1)
 
head(winner_by_year)
## # A tibble: 6 x 4
## # Groups:   Year [6]
##    Year Team          Medal     n
##   <int> <fct>         <fct> <int>
## 1  1896 Germany       Gold     24
## 2  1900 France        Gold     22
## 3  1904 United States Gold     65
## 4  1906 France        Gold     20
## 5  1908 Great Britain Gold     69
## 6  1912 Sweden        Gold     95
  1. What country appear most often across all Olympic games history as a gold medal winner?
absolute_leader<-winner_by_year %>% 
  group_by(Team) %>% 
  count(Team) %>% 
  arrange(desc(nn))
  
absolute_leader
## # A tibble: 8 x 2
## # Groups:   Team [8]
##   Team             nn
##   <fct>         <int>
## 1 United States    18
## 2 Soviet Union      5
## 3 Canada            3
## 4 Sweden            3
## 5 France            2
## 6 Germany           2
## 7 Great Britain     1
## 8 Unified Team      1
  1. Any correlation between kind of sports and height?
height_sport<-sub_athletes %>% 
  filter (Height != "<NA>") %>% 
  group_by(Sport) %>% 
  summarise(avg_height=mean(Height)) %>% 
  arrange(desc(avg_height))

head(height_sport)
## # A tibble: 6 x 2
##   Sport            avg_height
##   <fct>                 <dbl>
## 1 Basketball             192.
## 2 Volleyball             187.
## 3 Beach Volleyball       187.
## 4 Water Polo             185.
## 5 Rowing                 184.
## 6 Handball               183.
tail(height_sport)
## # A tibble: 6 x 2
##   Sport                 avg_height
##   <fct>                      <dbl>
## 1 Synchronized Swimming       169.
## 2 Figure Skating              169.
## 3 Weightlifting               167.
## 4 Trampolining                166.
## 5 Diving                      166.
## 6 Gymnastics                  162.
  1. Any correlation between kind of sports and weight?
Weight_sport<-sub_athletes %>% 
  filter (Weight != "<NA>") %>% 
  group_by(Sport) %>% 
  summarise(avg_Weight=mean(Weight)) %>% 
  arrange(desc(avg_Weight))

head(Weight_sport)
## # A tibble: 6 x 2
##   Sport        avg_Weight
##   <fct>             <dbl>
## 1 Tug-Of-War         94.1
## 2 Bobsleigh          90.4
## 3 Basketball         86.7
## 4 Baseball           85.8
## 5 Water Polo         85.6
## 6 Rugby Sevens       81.3
tail(Weight_sport)
## # A tibble: 6 x 2
##   Sport                 avg_Weight
##   <fct>                      <dbl>
## 1 Figure Skating              60.2
## 2 Trampolining                59.1
## 3 Diving                      58.7
## 4 Synchronized Swimming       56.4
## 5 Gymnastics                  55.1
## 6 Rhythmic Gymnastics         49.3