knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(lubridate)

Olympics Data

We structure the data to add a new column called “Medals earned”

dataset_olympics <- read_delim("dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl  (5): ID, Age, Height, Weight, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Time-based Data? Year

dataset_olympics$DateYear <- ymd(dataset_olympics$Year, truncated = 2L)

Analysis over Time? New Athlete Count

Converting to tsibble:

library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
ath_count <- dataset_olympics |> distinct(ID, .keep_all = TRUE) |> group_by(DateYear) |> 
  summarize(athcount = n())

head(ath_count)
## # A tibble: 6 × 2
##   DateYear   athcount
##   <date>        <int>
## 1 1896-01-01       34
## 2 1900-01-01      429
## 3 1904-01-01      163
## 4 1906-01-01      186
## 5 1908-01-01      523
## 6 1912-01-01      512

We removed duplicate entries based on each players Olympic ID

library(xts)
## Warning: package 'xts' was built under R version 4.3.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.2
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
## 
##     index
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
ath_countts <- as_tsibble(ath_count, index = DateYear)
ath_countxts <- xts(x = ath_countts$athcount, order.by = ath_countts$DateYear)
ath_countxts <- setNames(ath_countxts, "Count")
ath_countxts |>
  ggplot(mapping = aes(x = Index, y = Count)) +
  geom_line() +
  labs(title = "New Athlete Count over Time",
       subtitle = "Annual Athlete Count") +
  theme_hc()

The graph shows how there has been a growing trend in new athletes over the year and a point of stagnation closer to 2016 (latest year). We’re can see the growth before addition of the Winter Olympics system:

require(gridExtra)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
plot1 <- ath_countts |> filter_index("1896-01" ~ "1993-01") |>
  ggplot(mapping = aes(x = DateYear, y = athcount)) +
  geom_line() +
  labs(title = "New Athlete from 1896 to 1993",
       subtitle = "Annual Unique Athlete Count") +
  theme_hc()
plot2 <- ath_countts |> filter_index("1994-01" ~ "2016-01") |>
  ggplot(mapping = aes(x = DateYear, y = athcount)) +
  geom_line() +
  labs(title = "New Athletes from 1993 to 2016",
       subtitle = "Annual Unique Athlete Count") +
  theme_hc()
grid.arrange(plot1, plot2, ncol=2)

This is an interesting case where our data defers drastically after the threshold of the year 1993. We can try modelling for both.

plot1 <- ath_countts |> filter_index("1896-01" ~ "1993-01") |>
  ggplot(mapping = aes(x = DateYear, y = athcount)) +
  geom_line() +
  geom_smooth(method = 'lm', color = 'green', se=FALSE) +
  labs(title = "New Athlete from 1896 to 1993",
       subtitle = "Annual Unique Athlete Count") +
  theme_hc()
plot2 <- ath_countts |> filter_index("1994-01" ~ "2016-01") |>
  ggplot(mapping = aes(x = DateYear, y = athcount)) +
  geom_line() +
  geom_smooth(method = 'lm', color = 'green', se=FALSE) +
  labs(title = "New Athletes from 1993 to 2016",
       subtitle = "Annual Unique Athlete Count") +
  theme_hc()
grid.arrange(plot1, plot2, ncol=2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Both graphs show an upward trend over the years. We can see that while being plotted on the same y-axis, the growth before 1993 was of a higher magnitude to the growth we can see now. Upon smoothing the data, we can see:

plot1 <- ath_countts |> filter_index("1896-01" ~ "1993-01") |>
  ggplot(mapping = aes(x = DateYear, y = athcount)) +
  geom_point(size=1, shape='O') +
  geom_smooth(span=0.2, color = 'pink', se=FALSE) +
  labs(title = "New Athlete from 1896 to 1993",
       subtitle = "Annual Unique Athlete Count") +
  theme_hc()
plot2 <- ath_countts |> filter_index("1994-01" ~ "2016-01") |>
  ggplot(mapping = aes(x = DateYear, y = athcount)) +
  geom_line() +
  geom_smooth(span=0.2, color = 'pink', se=FALSE) +
  labs(title = "New Athletes from 1993 to 2016",
       subtitle = "Annual Unique Athlete Count") +
  theme_hc()
grid.arrange(plot1, plot2, ncol=2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at -22646
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1461
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 8.5381e+06
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small.  fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 8725.8
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 770.17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 5.9317e+05

The ACF and PACF:

acf(ath_countts, ci = 0.95, na.action = na.exclude)

pacf(ath_countts, ci = 0.95, na.action = na.exclude)