Olympics Data

dataset_olympics <- read_delim("dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl  (5): ID, Age, Height, Weight, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Summary

summary(dataset_olympics)
##        ID            Name               Sex                 Age       
##  Min.   :    1   Length:70000       Length:70000       Min.   :11.00  
##  1st Qu.: 9326   Class :character   Class :character   1st Qu.:21.00  
##  Median :18032   Mode  :character   Mode  :character   Median :25.00  
##  Mean   :18082                                         Mean   :25.64  
##  3rd Qu.:26978                                         3rd Qu.:28.00  
##  Max.   :35658                                         Max.   :88.00  
##                                                        NA's   :2732   
##      Height          Weight          Team               NOC           
##  Min.   :127.0   Min.   : 25.0   Length:70000       Length:70000      
##  1st Qu.:168.0   1st Qu.: 61.0   Class :character   Class :character  
##  Median :175.0   Median : 70.0   Mode  :character   Mode  :character  
##  Mean   :175.5   Mean   : 70.9                                        
##  3rd Qu.:183.0   3rd Qu.: 79.0                                        
##  Max.   :223.0   Max.   :214.0                                        
##  NA's   :16254   NA's   :17101                                        
##     Games                Year         Season              City          
##  Length:70000       Min.   :1896   Length:70000       Length:70000      
##  Class :character   1st Qu.:1960   Class :character   Class :character  
##  Mode  :character   Median :1984   Mode  :character   Mode  :character  
##                     Mean   :1978                                        
##                     3rd Qu.:2002                                        
##                     Max.   :2016                                        
##                                                                         
##     Sport              Event              Medal          
##  Length:70000       Length:70000       Length:70000      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 

Saving the data as a data frame

as_tibble(data.frame(dataset_olympics))

Column Summary

Name

summary(dataset_olympics$Name)
##    Length     Class      Mode 
##     70000 character character
uniqueCount <- length(unique(dataset_olympics$Name))
print(uniqueCount)
## [1] 35556
head(sort(table(dataset_olympics$Name), decreasing = TRUE), 5)
## 
##      Oksana Aleksandrovna Chusovitina Adrianus Egbert Willem "Arie" de Jong 
##                                    29                                    27 
##                   Ole Einar Bjrndalen                  Gustaf Eric Carlberg 
##                                    27                                    26 
##                       Alberto Busnari 
##                                    24

There are 7000 records and 35556 unique athletes with the most commonly occurring names listed above.

Age

summary(dataset_olympics$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   11.00   21.00   25.00   25.64   28.00   88.00    2732
n_distinct(dataset_olympics$Age)
## [1] 69
# Barplot
ggplot(dataset_olympics, aes(x=Age, fill = Sex)) + 
  geom_bar() + 
  ggtitle("Age Frequency Distribution") +
  labs(x = "Age", y = "Frequency/Count") +
   scale_fill_manual(values = c("M" = "lightblue", "F" = "pink"))
## Warning: Removed 2732 rows containing non-finite values (`stat_count()`).

Height

summary(dataset_olympics$Height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   127.0   168.0   175.0   175.5   183.0   223.0   16254
n_distinct(dataset_olympics$Height)
## [1] 88
# Barplot
ggplot(dataset_olympics, aes(x=Height,fill=Sex)) + 
  geom_bar() + 
  ggtitle("Height Frequency Distribution") +
  labs(x = "Height", y = "Frequency") +
  scale_fill_manual(values = c("M" = "lightblue", "F" = "pink"))
## Warning: Removed 16254 rows containing non-finite values (`stat_count()`).

Weight

summary(dataset_olympics$Weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    25.0    61.0    70.0    70.9    79.0   214.0   17101
n_distinct(dataset_olympics$Weight)
## [1] 174
# Barplot
ggplot(data = dataset_olympics, aes(x = Weight, color = Sex)) +
  geom_freqpoly(binwidth = 1) +
  labs(x = "Weight", y = "Frequency") +
  ggtitle("Weight Frequency Distribution by Sex")
## Warning: Removed 17101 rows containing non-finite values (`stat_bin()`).

Team

summary(dataset_olympics$Team)
##    Length     Class      Mode 
##     70000 character character
uniqueCount <- length(unique(dataset_olympics$Team))
print(uniqueCount)
## [1] 827
# Top Countries participating (All Time)
head(sort(table(dataset_olympics$Team), decreasing = TRUE), 5)
## 
## United States        France         Italy Great Britain        Canada 
##          4979          4608          4023          3282          2536
# Basic Piechart of countries
dataTeam <- table(dataset_olympics$Team)
pie(dataTeam) 

The chart above is indicative of how the Teams do not correspond strongly with the Nationality (827 unique Teams is much greater than total number of countries that exist). To get a better bbreakdown, we will use the NOC column.

NOC

summary(dataset_olympics$NOC)
##    Length     Class      Mode 
##     70000 character character
# A better breakdown of Unique Countries (226)
uniqueCount <- length(unique(dataset_olympics$NOC))
print(uniqueCount)
## [1] 226
# Top 5 Countries participating (All Time)
head(sort(table(dataset_olympics$NOC), decreasing = TRUE), 5)
## 
##  USA  FRA  ITA  GBR  CAN 
## 5216 4902 4176 3526 2662
# Bottom 5 Countries participating (All Time)
head(sort(table(dataset_olympics$NOC)), 5)
## 
## BDI BUR KIR TUV UNK 
##   1   1   1   1   1
# Basic Piechart of countries
dataNOC <- table(dataset_olympics$NOC)
pie(dataNOC)

While this isn’t as styled as it can be, the data corresponds better to a Country based distribution.

Year

summary(dataset_olympics$Year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1896    1960    1984    1978    2002    2016
# Unique Years
(unique(dataset_olympics$Year))
##  [1] 1992 2012 1920 1900 1988 1994 1932 2002 1952 1980 2000 1996 1912 1924 2014
## [16] 1948 1998 2006 2008 2016 2004 1960 1964 1984 1968 1972 1936 1956 1928 1976
## [31] 2010 1906 1904 1908 1896
# Plotting Years

ggplot(dataset_olympics, aes(x=Year, fill = Sex)) + 
  geom_bar() + 
  ggtitle("Total Olympian Participation Distribution by Year") +
  labs(x = "Year", y = "Frequency/Count") +
   scale_fill_manual(values = c("M" = "lightblue", "F" = "pink"))

A fascinating observation is that we can see the period when the 4 year term between each olympics was established. We can also see when the Winter Olympics were established below:

dataset_olympics$IsWinterOlympics <- ifelse(grepl("Winter", dataset_olympics$Games), TRUE, FALSE)

ggplot(dataset_olympics, aes(x=Year, fill = IsWinterOlympics)) + 
  geom_bar() + 
  ggtitle("Total Olympian Participation Distribution by Year") +
  labs(x = "Year", y = "Frequency/Count") +
   scale_fill_manual(values = c("TRUE" = "blue", "FALSE" = "yellow"))

We can see certain years have Summer and Winter Olympics within the same year but that trend has been broken with the rule switch of alternating every two years.

Games

summary(dataset_olympics$Games)
##    Length     Class      Mode 
##     70000 character character
# Unique Years
(unique(dataset_olympics$Games))
##  [1] "1992 Summer" "2012 Summer" "1920 Summer" "1900 Summer" "1988 Winter"
##  [6] "1992 Winter" "1994 Winter" "1932 Summer" "2002 Winter" "1952 Summer"
## [11] "1980 Winter" "2000 Summer" "1996 Summer" "1912 Summer" "1924 Summer"
## [16] "2014 Winter" "1948 Summer" "1998 Winter" "2006 Winter" "2008 Summer"
## [21] "2016 Summer" "2004 Summer" "1960 Winter" "1964 Winter" "1984 Winter"
## [26] "1984 Summer" "1968 Summer" "1972 Summer" "1988 Summer" "1936 Summer"
## [31] "1952 Winter" "1956 Winter" "1956 Summer" "1960 Summer" "1928 Summer"
## [36] "1976 Summer" "1980 Summer" "1964 Summer" "2010 Winter" "1968 Winter"
## [41] "1906 Summer" "1972 Winter" "1976 Winter" "1924 Winter" "1904 Summer"
## [46] "1928 Winter" "1908 Summer" "1948 Winter" "1932 Winter" "1936 Winter"
## [51] "1896 Summer"
# Plotting Years

ggplot(dataset_olympics, aes(x=Games, fill = Sex)) + 
  geom_bar() + 
  ggtitle("Total Olympian Participation Distribution by Games") +
  labs(x = "Year", y = "Frequency/Count") +
   scale_fill_manual(values = c("M" = "lightblue", "F" = "pink")) + 
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Season

summary(dataset_olympics$Season)
##    Length     Class      Mode 
##     70000 character character
# Unique Seasons
(unique(dataset_olympics$Season))
## [1] "Summer" "Winter"
# Plotting Seasons
dataSeason <- dataset_olympics$Season
pie(table(dataset_olympics$Season))

Medals

This field has entries only for winning olympians

summary(dataset_olympics$Medal)
##    Length     Class      Mode 
##     70000 character character
# Unique Medals
(unique(dataset_olympics$Medal))
## [1] NA       "Gold"   "Bronze" "Silver"
# Plotting Medals
dataset_olympics$MedalsWon <- ifelse(dataset_olympics$Medal == "", "None", dataset_olympics$Medal)
Medals <- table(dataset_olympics$MedalsWon)
pie(Medals)

# Distribution by Medal types 
ggplot(data=dataset_olympics)+
 geom_bar(mapping=aes(x=MedalsWon, fill=MedalsWon))+
 theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Based on Gender
ggplot(data=dataset_olympics)+
 geom_bar(mapping=aes(x=MedalsWon, fill=Sex))+
 theme(axis.text.x = element_text(angle = 45, hjust = 1))

The winners are only a small subset of all olympians.

5 Questions worth investigating

  1. How has participation across both sexes changed over the years?
  2. Can we identify any patterns between wins for a nation and their country hosting the olympics?
  3. Can we see a correlation between age and wins for both sexes.
  4. Are there any benefits to total wins in years that have both olympics the same year versus the new structure of bi-annual games?
  5. Are there sports that have clear dominant genders depending on their nationalities (for ex: Women’s Rowing is dominated by Norwegians, etc.)
  6. Which nations have consistent failure based on sports?