knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr     1.1.3     âś” purrr     1.0.2
## âś” forcats   1.0.0     âś” stringr   1.5.0
## âś” ggplot2   3.4.3     âś” tibble    3.2.1
## âś” lubridate 1.9.2     âś” tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.3.2
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

The Data

The data covers various datapoints for Olympics games ranging from the year 1896 to 2016.

dataset_olympics <- read_delim("dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl  (5): ID, Age, Height, Weight, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
noc_olympics <- read_delim("noc_region.csv")
## Rows: 230 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): noc_region, reg, notes
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
noc_olympics$NOC <- noc_olympics$noc_region
dataset_olympics <- left_join(dataset_olympics, noc_olympics, by = "NOC")
dataset_olympics$NOC <- dataset_olympics$noc_region
dataset_olympics$NOCRegion <- dataset_olympics$reg

Scope

Let’s assume our audience for this project is the National Olympics Team Representatives and the International Sports Federation.

The audience is interested in learning about recent games and how they can impact future games best through marketing and investments. What does the olympics currently do well? Where does it lack? What can it do better?

Success Criteria: Find key impact points that can be used as the focus for marketing and patterns that need to be improved upon internally

Why? The NOC FRG refers to West Germany, this data point is irrelevant to our current analysis as both sides of the country were unified in 1991. We only need data for countries and athletes that are still playing the sports. We will limit our scope to include only sports with relevant data spanning up till 2014 Winter Olympics and 2016 Summer Olympics. This way we can filter out most discontinued sports and remove two century old athletes.

Exploratory Data Analysis

The data can be explored along multiple variables to gain insight. Let’s consider the following possible EDA items:

ggplot(dataset_olympics, aes(x=Year, fill = Sex)) + 
  geom_bar() + 
  ggtitle("Total Olympian Participation Distribution by Year") +
  labs(x = "Year", y = "Athlete Frequency/Count") +
   scale_fill_manual(values = c("M" = "lightblue", "F" = "pink"))

Inference: Athlete count per year has increased from the beginning of The Games.

eventCount <- dataset_olympics |> 
  filter(Games == '2016 Summer') |>
  select(Sport, Event) |>
  distinct() |>
  group_by(Sport) |>
  summarise(Event = n())

ggplot(data = eventCount, aes(x = reorder(Sport, -Event), y = Event)) + 
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(x = "Olympic Sport", y = "Number of Events", title = "Events in each Sport in Summer 2016") +
  geom_text(aes(label = Event), position=position_dodge(width=0.9), vjust=-0.25)+
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Inference: We can see that in Summer 2016, 34 Sports had 305 unique events. Athletics and Swimming had the most number of unique linked events. Each sport has a minimum of 2 events: Men’s and Women’s divisions.

# Aggregate year-based data 
total_medals <- dataset_olympics |>
  filter(!is.na(Medal)) |>
  group_by(NOC) |>
  summarise(TotalMedals = sum(!is.na(Medal))) |>
  arrange(desc(TotalMedals))

OtherMedals <- mutate(total_medals, Country = ifelse(row_number() > 29, "All Others", as.character(NOC))) |> group_by(Country) |>
  summarise(Count = sum(TotalMedals)) |>
  arrange(desc(Count))

ggplot(data = OtherMedals, aes(x = reorder(Country, -Count), y = Count, fill = Count)) + 
  geom_bar(stat = "identity") +
  labs(x = "Country", y = "Number of Medals", title = "Total Medal Counts by Country (Top 20)") +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

We see that the United States makes up the majority of all medals such that the “All Others” category, which contains 89 countries, is still short of medals earned by USA alone.

Growth: What does it look like?

We want to learn more about the growth within the countries’ performances as well as growth within the sports. We know what the overall medal count is, however, we would also like to know the trend in Medal count growth for each country.

# Aggregate year-based data 
growth_medal <- dataset_olympics |>
  group_by(NOC) |> 
  filter(any(Year %in% c(2014, 2016))) |>
  ungroup()

growth_medal <- growth_medal |>
  group_by(NOC, Year) |>
  count(Medal) |>
  spread(Medal, n, fill = 0) |>
  ungroup() |>
  mutate(TotalMedals = Gold + Silver + Bronze)

# Get growth YoY
growth_medal_data <- growth_medal |>
  filter(Year > 1945) |>
  group_by(NOC) |>
  arrange(Year) |>
  mutate(Growth = TotalMedals - lag(TotalMedals, default = 0)) |>
  mutate(GrowthPercentage = ((TotalMedals) - lag(ifelse(TotalMedals == 0, 1, TotalMedals))) / lag(TotalMedals) * 100) |>
  filter(!is.na(GrowthPercentage), is.finite(GrowthPercentage), !is.nan(GrowthPercentage))

growth_countries <- growth_medal_data |>
  group_by(NOC) |>
  summarise(TotalGrowth = mean(GrowthPercentage))

# Selecting Countries with best growth
growth_best <- growth_countries |> arrange(desc(TotalGrowth)) |>
  slice_head(n = 30)

ggplot(data = growth_best, aes(x = reorder(NOC, -TotalGrowth), y = TotalGrowth, fill = NOC)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(TotalGrowth)), position=position_dodge(width=0.9), vjust=-0.25)+
  labs(x = "Country", y = "Mean Growth YoY (%)", title = "Mean Growth by Country (Top 20)") +
   theme(legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

The data reveals Australia has had the best growth over the years. Our Top Medal holders USA, FRANCE, GREAT BRITAIN aren’t part of the best growth countries with the exception of Great Britain. What do their graphs look like in comparison?

growth_medals_best <- growth_countries |> filter(NOC %in% OtherMedals$Country)

OtherMedals <- OtherMedals |> filter(Country != "All Others")

plot1 <- ggplot(data = OtherMedals, aes(x = Country, y = Count, fill = Count)) + 
  geom_bar(stat = "identity") +
  labs(x = "Country", y = "Number of Medals", title = "Total Medal Counts by Country (Top 20)") +
   theme(legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

plot2 <- ggplot(data = growth_medals_best, aes(x = NOC, y = TotalGrowth, fill = NOC)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(TotalGrowth)), position=position_dodge(width=0.9), vjust=-0.25)+
  labs(x = "Country", y = "Mean Growth YoY (%)", title = "Mean Growth for Top 20 Medal Count Countries") +
   theme(legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

grid.arrange(plot1, plot2, nrow=2)

Which countries needs to realize their failures and where should they invest to fix them?

# Selecting Countries with best growth
growth_worst <- growth_countries |> arrange((TotalGrowth)) |>
  filter(TotalGrowth > -100) |>
  slice_head(n = 30)

ggplot(data = growth_worst, aes(x = reorder(NOC, -TotalGrowth), y = TotalGrowth, fill = NOC)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(TotalGrowth)), position=position_dodge(width=0.9), vjust=-0.25)+
  labs(x = "Country", y = "Mean Growth YoY (%)", title = "Mean Growth by Country (Bottom 30)") +
   theme(legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

The growth shows a better picture of what the countries progress looks like. FRG = West Germany. That does not exist anymore. Argentina is part of the TOP20 medal count countries however has seen a drastic 82% decrease in growth on average. India, while being a strong contender, still faces a loss of 22% average growth in total medals every year.

Which Sport has seen the highest growth?

# Aggregate and clean year-based data for current games 
growth_sport <- dataset_olympics |>
  group_by(Sport) |> 
  filter(any(Year %in% c(2014, 2016))) |>
  ungroup()

growth_sports <- growth_sport |>
  group_by(Sport, Year) |>
  summarise(AthleteCount = n())
## `summarise()` has grouped output by 'Sport'. You can override using the
## `.groups` argument.
# Get growth YoY
growth_sport_by_year <- growth_sports |>
  arrange(Year) |>
  mutate(Growth = AthleteCount - lag(AthleteCount, default = 0)) |>
  mutate(GrowthPercentage = ((AthleteCount) - lag(ifelse(AthleteCount == 0, 1, AthleteCount))) / lag(AthleteCount) * 100)

growth_sport_list <- growth_sport_by_year |>
  group_by(Sport) |>
  summarise(TotalGrowth = sum(Growth), MeanGrowthPercentage = mean(GrowthPercentage, na.rm = TRUE), AthleteCount = sum(AthleteCount))

# Selecting Sport with best growth
growth_best <- growth_sport_list |> arrange(desc(MeanGrowthPercentage)) |>
  slice_head(n = 30)

ggplot(data = growth_best, aes(x = reorder(Sport, -MeanGrowthPercentage), y = MeanGrowthPercentage, fill = Sport)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(MeanGrowthPercentage)), position=position_dodge(width=0.9), vjust=-0.25)+
  labs(x = "Sport", y = "Mean Athlete Growth YoY (%)", title = "Mean Athlete Growth by Sport (Top 20)") +
   theme(legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

eventSportrelation <- left_join(eventCount, growth_sport_list, by="Sport")
ggplot(data = eventSportrelation, aes(x = Event, y = MeanGrowthPercentage)) +
  geom_line() + 
  labs(title = "Correlation between Event count and Growth",
       x = "Event Count",
       y = "Growth Percentage") +
  theme()

No clear relation exists.

growth_worst_countries <- growth_sport_list |> arrange((MeanGrowthPercentage)) |>
  slice_head(n = 20)

ggplot(data = growth_worst_countries, aes(x = reorder(Sport, -MeanGrowthPercentage), y = MeanGrowthPercentage, fill = Sport)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(MeanGrowthPercentage)), position=position_dodge(width=0.9), vjust=-0.25)+
  labs(x = "Sport", y = "Mean Growth YoY (%)", title = "Mean Athlete Growth by Sport (Bottom 20)") +
   theme(legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Fencing has seen the most growth followed by Golf. Badminton is the only sport to have negative growth in total athletes and it needs to be invested further in to make sure that the sport doesn’t go the way of the Dodo.

Individual Athlete Growth

There are many athletes in the Game’s history who have participated multiple times and across multiple sports. Let’s try to learn some about that.

athlete_career <- dataset_olympics |>
  group_by(ID, Sport, Year) |>
  mutate(EventCount = n()) |>
  group_by(ID) |>
  summarise(
    Name = Name,
    FirstYear = min(Year),
    LastYear = max(Year),
    CareerDuration = max(Year) - min(Year) + 1,
    Sport = Sport,
    MeanEventCount = mean(EventCount),
    TotalEventCount = sum(EventCount),
    TotalMedals = sum(!is.na(Medal))
  ) |>
  filter(any(LastYear %in% c(2014, 2016))) |>
  filter(CareerDuration > 12)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
athlete_career_avg <- athlete_career |>
  group_by(Sport) |>
  summarise(AverageCareer = mean(CareerDuration))

ggplot(athlete_career_avg, aes(x = reorder(Sport, -AverageCareer), y = AverageCareer)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Average Athlete Career Duration by Sports in the Olympics",
       subtitle = "Career Duration greater than 12 years (3 Olympic Games)",
       x = "Sport",
       y = "Athlete Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1))

We can see that Equestrianism, Luge, and Gymnastics have the longest competing athletes. These sports can be shown as the marketing focus for the future marketing campaigns. Lets focus on Athletes to highlight this year:

athlete_career <- dataset_olympics |>
  group_by(ID, Sport, Year) |>
  mutate(EventCount = n()) |>
  group_by(ID) |>
  summarise(
    Name = Name,
    FirstYear = min(Year),
    LastYear = max(Year),
    CareerDuration = max(Year) - min(Year) + 1,
    Sport = Sport,
    MeanEventCount = mean(EventCount),
    TotalEventCount = sum(EventCount),
    TotalMedals = sum(!is.na(Medal)),
    MedalPercentage = sum(!is.na(Medal))/sum(EventCount) * 100,
    Age = max(Age)
  ) |>
  filter(any(LastYear %in% c(2014, 2016))) |>
  filter(CareerDuration > 10) |>
  filter(MedalPercentage > 0) |>
  distinct()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'ID'. You can override using the `.groups`
## argument.
ggplot(athlete_career, aes(x = Sport, y = CareerDuration, fill = CareerDuration)) +
  geom_bar(stat = "identity") +
  labs(title = "Athlete Career Duration by Sports in the Olympics",
       subtitle = "Career Duration over 10 years",
       x = "Sport",
       y = "Athlete Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 70, hjust = 1))

ggplot(athlete_career |> filter(CareerDuration > 15), aes(x = reorder(Name, -MedalPercentage), y = MedalPercentage, fill = Sport)) +
  geom_bar(stat = "identity") +
  labs(title = "Top Athlete Medal Percentage in the Olympics",
       subtitle = "(Career Duration > 15)",
       x = "Athlete",
       y = "Medal Percentage (%)", 
       fill = "Sport") +
  geom_text(aes(label = round(CareerDuration)), position=position_dodge(width=0.9), vjust=-0.25, size = 2)+
  theme_minimal() +
  theme(legend.position = "none", legend.text = element_text(size = 6), legend.title = element_text(size = 10), axis.text.x = element_text(angle = 90, hjust = 1))

Marketing Story

We can include the following items within the campaign: 1. Briefly introduce Sports and their unique events. 2. Share growth in athlete participation in sports over the years. 3. Highlight overall Medal Counts and focus on Australia, Netherlands, and Great Britain’s growth and success through the years. 4. Show Athlete popularity growth among sports such as Fencing, Golf, and Wrestling in recent years. 5. Focus on Nick Dempsey, Sofia B, and Tiago Henrique as the face of the campaign as they are the long-time athletes with the highest medal percentages.

Internal Causes for Concern

  1. Performance decline for Top20 Medal Countries such as Argentina, Belgium, Finland, and Hungary.
  2. Declining Athlete interests in the sports such as Basketball, Taekwondo, Table Tennis and Badminton.
  3. Major countries such as India, and Venezuela were found to be in performance decline.
  4. Focus on low athlete retention across sports.

Possible Areas of Improvement

  1. Further Data collection on Geopolitical data to focus on smaller countries.
  2. Find strong correlation between Medals won and other columns to build models.
  3. Expand dataset beyond 2016

====================

Thank you. I hope you enjoyed my Final Project. Sundeep Kakar