dataset_olympics <- read_delim("dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl (5): ID, Age, Height, Weight, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(dataset_olympics)
## ID Name Sex Age
## Min. : 1 Length:70000 Length:70000 Min. :11.00
## 1st Qu.: 9326 Class :character Class :character 1st Qu.:21.00
## Median :18032 Mode :character Mode :character Median :25.00
## Mean :18082 Mean :25.64
## 3rd Qu.:26978 3rd Qu.:28.00
## Max. :35658 Max. :88.00
## NA's :2732
## Height Weight Team NOC
## Min. :127.0 Min. : 25.0 Length:70000 Length:70000
## 1st Qu.:168.0 1st Qu.: 61.0 Class :character Class :character
## Median :175.0 Median : 70.0 Mode :character Mode :character
## Mean :175.5 Mean : 70.9
## 3rd Qu.:183.0 3rd Qu.: 79.0
## Max. :223.0 Max. :214.0
## NA's :16254 NA's :17101
## Games Year Season City
## Length:70000 Min. :1896 Length:70000 Length:70000
## Class :character 1st Qu.:1960 Class :character Class :character
## Mode :character Median :1984 Mode :character Mode :character
## Mean :1978
## 3rd Qu.:2002
## Max. :2016
##
## Sport Event Medal
## Length:70000 Length:70000 Length:70000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
Saving the data as a data frame
as_tibble(data.frame(dataset_olympics))
summary(dataset_olympics$Name)
## Length Class Mode
## 70000 character character
uniqueCount <- length(unique(dataset_olympics$Name))
print(uniqueCount)
## [1] 35556
head(sort(table(dataset_olympics$Name), decreasing = TRUE), 5)
##
## Oksana Aleksandrovna Chusovitina Adrianus Egbert Willem "Arie" de Jong
## 29 27
## Ole Einar Bjrndalen Gustaf Eric Carlberg
## 27 26
## Alberto Busnari
## 24
There are 7000 records and 35556 unique athletes with the most commonly occurring names listed above.
summary(dataset_olympics$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11.00 21.00 25.00 25.64 28.00 88.00 2732
n_distinct(dataset_olympics$Age)
## [1] 69
# Barplot
ggplot(dataset_olympics, aes(x=Age, fill = Sex)) +
geom_bar() +
ggtitle("Age Frequency Distribution") +
labs(x = "Age", y = "Frequency/Count") +
scale_fill_manual(values = c("M" = "lightblue", "F" = "pink"))
## Warning: Removed 2732 rows containing non-finite values (`stat_count()`).
summary(dataset_olympics$Height)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 127.0 168.0 175.0 175.5 183.0 223.0 16254
n_distinct(dataset_olympics$Height)
## [1] 88
# Barplot
ggplot(dataset_olympics, aes(x=Height,fill=Sex)) +
geom_bar() +
ggtitle("Height Frequency Distribution") +
labs(x = "Height", y = "Frequency") +
scale_fill_manual(values = c("M" = "lightblue", "F" = "pink"))
## Warning: Removed 16254 rows containing non-finite values (`stat_count()`).
summary(dataset_olympics$Weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 25.0 61.0 70.0 70.9 79.0 214.0 17101
n_distinct(dataset_olympics$Weight)
## [1] 174
# Barplot
ggplot(data = dataset_olympics, aes(x = Weight, color = Sex)) +
geom_freqpoly(binwidth = 1) +
labs(x = "Weight", y = "Frequency") +
ggtitle("Weight Frequency Distribution by Sex")
## Warning: Removed 17101 rows containing non-finite values (`stat_bin()`).
summary(dataset_olympics$Team)
## Length Class Mode
## 70000 character character
uniqueCount <- length(unique(dataset_olympics$Team))
print(uniqueCount)
## [1] 827
# Top Countries participating (All Time)
head(sort(table(dataset_olympics$Team), decreasing = TRUE), 5)
##
## United States France Italy Great Britain Canada
## 4979 4608 4023 3282 2536
# Basic Piechart of countries
dataTeam <- table(dataset_olympics$Team)
pie(dataTeam)
The chart above is indicative of how the Teams do not correspond
strongly with the Nationality (827 unique Teams is much greater than
total number of countries that exist). To get a better bbreakdown, we
will use the NOC column.
summary(dataset_olympics$NOC)
## Length Class Mode
## 70000 character character
# A better breakdown of Unique Countries (226)
uniqueCount <- length(unique(dataset_olympics$NOC))
print(uniqueCount)
## [1] 226
# Top 5 Countries participating (All Time)
head(sort(table(dataset_olympics$NOC), decreasing = TRUE), 5)
##
## USA FRA ITA GBR CAN
## 5216 4902 4176 3526 2662
# Bottom 5 Countries participating (All Time)
head(sort(table(dataset_olympics$NOC)), 5)
##
## BDI BUR KIR TUV UNK
## 1 1 1 1 1
# Basic Piechart of countries
dataNOC <- table(dataset_olympics$NOC)
pie(dataNOC)
While this isn’t as styled as it can be, the data corresponds better to
a Country based distribution.
summary(dataset_olympics$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1896 1960 1984 1978 2002 2016
# Unique Years
(unique(dataset_olympics$Year))
## [1] 1992 2012 1920 1900 1988 1994 1932 2002 1952 1980 2000 1996 1912 1924 2014
## [16] 1948 1998 2006 2008 2016 2004 1960 1964 1984 1968 1972 1936 1956 1928 1976
## [31] 2010 1906 1904 1908 1896
# Plotting Years
ggplot(dataset_olympics, aes(x=Year, fill = Sex)) +
geom_bar() +
ggtitle("Total Olympian Participation Distribution by Year") +
labs(x = "Year", y = "Frequency/Count") +
scale_fill_manual(values = c("M" = "lightblue", "F" = "pink"))
A fascinating observation is that we can see the period when the 4 year
term between each olympics was established. We can also see when the
Winter Olympics were established below:
dataset_olympics$IsWinterOlympics <- ifelse(grepl("Winter", dataset_olympics$Games), TRUE, FALSE)
ggplot(dataset_olympics, aes(x=Year, fill = IsWinterOlympics)) +
geom_bar() +
ggtitle("Total Olympian Participation Distribution by Year") +
labs(x = "Year", y = "Frequency/Count") +
scale_fill_manual(values = c("TRUE" = "blue", "FALSE" = "yellow"))
We can see certain years have Summer and Winter Olympics within the same
year but that trend has been broken with the rule switch of alternating
every two years.
summary(dataset_olympics$Games)
## Length Class Mode
## 70000 character character
# Unique Years
(unique(dataset_olympics$Games))
## [1] "1992 Summer" "2012 Summer" "1920 Summer" "1900 Summer" "1988 Winter"
## [6] "1992 Winter" "1994 Winter" "1932 Summer" "2002 Winter" "1952 Summer"
## [11] "1980 Winter" "2000 Summer" "1996 Summer" "1912 Summer" "1924 Summer"
## [16] "2014 Winter" "1948 Summer" "1998 Winter" "2006 Winter" "2008 Summer"
## [21] "2016 Summer" "2004 Summer" "1960 Winter" "1964 Winter" "1984 Winter"
## [26] "1984 Summer" "1968 Summer" "1972 Summer" "1988 Summer" "1936 Summer"
## [31] "1952 Winter" "1956 Winter" "1956 Summer" "1960 Summer" "1928 Summer"
## [36] "1976 Summer" "1980 Summer" "1964 Summer" "2010 Winter" "1968 Winter"
## [41] "1906 Summer" "1972 Winter" "1976 Winter" "1924 Winter" "1904 Summer"
## [46] "1928 Winter" "1908 Summer" "1948 Winter" "1932 Winter" "1936 Winter"
## [51] "1896 Summer"
# Plotting Years
ggplot(dataset_olympics, aes(x=Games, fill = Sex)) +
geom_bar() +
ggtitle("Total Olympian Participation Distribution by Games") +
labs(x = "Year", y = "Frequency/Count") +
scale_fill_manual(values = c("M" = "lightblue", "F" = "pink")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
summary(dataset_olympics$Season)
## Length Class Mode
## 70000 character character
# Unique Seasons
(unique(dataset_olympics$Season))
## [1] "Summer" "Winter"
# Plotting Seasons
dataSeason <- dataset_olympics$Season
pie(table(dataset_olympics$Season))
This field has entries only for winning olympians
summary(dataset_olympics$Medal)
## Length Class Mode
## 70000 character character
# Unique Medals
(unique(dataset_olympics$Medal))
## [1] NA "Gold" "Bronze" "Silver"
# Plotting Medals
dataset_olympics$MedalsWon <- ifelse(dataset_olympics$Medal == "", "None", dataset_olympics$Medal)
Medals <- table(dataset_olympics$MedalsWon)
pie(Medals)
# Distribution by Medal types
ggplot(data=dataset_olympics)+
geom_bar(mapping=aes(x=MedalsWon, fill=MedalsWon))+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Based on Gender
ggplot(data=dataset_olympics)+
geom_bar(mapping=aes(x=MedalsWon, fill=Sex))+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The winners are only a small subset of all olympians.