Data Source: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results
Data Set:“120 years of Olympic history: athletes and results”. Basic bio data on athletes and medal results from Athens 1896 to Rio 2016.
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("ggplot2")
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
athletes<- read.csv("/Users/Olga/Desktop/athlete_events.csv")
sub_athletes<-athletes %>%
select(-c(NOC,Games,Event)) %>%
filter (Medal != "<NA>")
head(sub_athletes)
## ID Name Sex Age Height Weight Team Year
## 1 4 Edgar Lindenau Aabye M 34 NA NA Denmark/Sweden 1900
## 2 15 Arvo Ossian Aaltonen M 30 NA NA Finland 1920
## 3 15 Arvo Ossian Aaltonen M 30 NA NA Finland 1920
## 4 16 Juhamatti Tapio Aaltonen M 28 184 85 Finland 2014
## 5 17 Paavo Johannes Aaltonen M 28 175 64 Finland 1948
## 6 17 Paavo Johannes Aaltonen M 28 175 64 Finland 1948
## Season City Sport Medal
## 1 Summer Paris Tug-Of-War Gold
## 2 Summer Antwerpen Swimming Bronze
## 3 Summer Antwerpen Swimming Bronze
## 4 Winter Sochi Ice Hockey Bronze
## 5 Summer London Gymnastics Bronze
## 6 Summer London Gymnastics Gold
Genaral Analysis
total_winners<-sub_athletes %>%
filter (Medal != "<NA>") %>%
group_by(Team) %>%
count(Medal) %>%
mutate(sum_n = sum(n)) %>%
select(Team,sum_n) %>%
slice(1) %>%
arrange(desc(sum_n))
total_winners
## # A tibble: 498 x 2
## # Groups: Team [498]
## Team sum_n
## <fct> <int>
## 1 United States 5219
## 2 Soviet Union 2451
## 3 Germany 1984
## 4 Great Britain 1673
## 5 France 1550
## 6 Italy 1527
## 7 Sweden 1434
## 8 Australia 1306
## 9 Canada 1243
## 10 Hungary 1127
## # ... with 488 more rows
sport_winners<-sub_athletes %>%
filter(Medal=="Gold") %>%
group_by(Sport,Team) %>%
count(Medal) %>%
arrange(desc(n))
head(sport_winners)
## # A tibble: 6 x 4
## # Groups: Sport, Team [6]
## Sport Team Medal n
## <fct> <fct> <fct> <int>
## 1 Swimming United States Gold 645
## 2 Athletics United States Gold 537
## 3 Basketball United States Gold 281
## 4 Ice Hockey Canada Gold 220
## 5 Rowing United States Gold 160
## 6 Fencing Italy Gold 151
tail(sport_winners)
## # A tibble: 6 x 4
## # Groups: Sport, Team [6]
## Sport Team Medal n
## <fct> <fct> <fct> <int>
## 1 Wrestling Austria Gold 1
## 2 Wrestling Czechoslovakia Gold 1
## 3 Wrestling Greece Gold 1
## 4 Wrestling Kazakhstan Gold 1
## 5 Wrestling Serbia Gold 1
## 6 Wrestling West Germany Gold 1
winner_by_year<-sub_athletes %>%
filter(Medal =="Gold") %>%
select(Year,Team,Medal) %>%
group_by(Year,Team) %>%
count(Medal) %>%
group_by(Year) %>%
arrange(Year,desc(n)) %>%
slice(1)
head(winner_by_year)
## # A tibble: 6 x 4
## # Groups: Year [6]
## Year Team Medal n
## <int> <fct> <fct> <int>
## 1 1896 Germany Gold 24
## 2 1900 France Gold 22
## 3 1904 United States Gold 65
## 4 1906 France Gold 20
## 5 1908 Great Britain Gold 69
## 6 1912 Sweden Gold 95
absolute_leader<-winner_by_year %>%
group_by(Team) %>%
count(Team) %>%
arrange(desc(nn))
absolute_leader
## # A tibble: 8 x 2
## # Groups: Team [8]
## Team nn
## <fct> <int>
## 1 United States 18
## 2 Soviet Union 5
## 3 Canada 3
## 4 Sweden 3
## 5 France 2
## 6 Germany 2
## 7 Great Britain 1
## 8 Unified Team 1
height_sport<-sub_athletes %>%
filter (Height != "<NA>") %>%
group_by(Sport) %>%
summarise(avg_height=mean(Height)) %>%
arrange(desc(avg_height))
head(height_sport)
## # A tibble: 6 x 2
## Sport avg_height
## <fct> <dbl>
## 1 Basketball 192.
## 2 Volleyball 187.
## 3 Beach Volleyball 187.
## 4 Water Polo 185.
## 5 Rowing 184.
## 6 Handball 183.
tail(height_sport)
## # A tibble: 6 x 2
## Sport avg_height
## <fct> <dbl>
## 1 Synchronized Swimming 169.
## 2 Figure Skating 169.
## 3 Weightlifting 167.
## 4 Trampolining 166.
## 5 Diving 166.
## 6 Gymnastics 162.
Weight_sport<-sub_athletes %>%
filter (Weight != "<NA>") %>%
group_by(Sport) %>%
summarise(avg_Weight=mean(Weight)) %>%
arrange(desc(avg_Weight))
head(Weight_sport)
## # A tibble: 6 x 2
## Sport avg_Weight
## <fct> <dbl>
## 1 Tug-Of-War 94.1
## 2 Bobsleigh 90.4
## 3 Basketball 86.7
## 4 Baseball 85.8
## 5 Water Polo 85.6
## 6 Rugby Sevens 81.3
tail(Weight_sport)
## # A tibble: 6 x 2
## Sport avg_Weight
## <fct> <dbl>
## 1 Figure Skating 60.2
## 2 Trampolining 59.1
## 3 Diving 58.7
## 4 Synchronized Swimming 56.4
## 5 Gymnastics 55.1
## 6 Rhythmic Gymnastics 49.3