knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(miscset)
##
## Attaching package: 'miscset'
##
## The following object is masked from 'package:dplyr':
##
## collapse
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
dataset_olympics <- read_delim("dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl (5): ID, Age, Height, Weight, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
There are many hypothesis we can pull from the data. Two of them are:
Is there a correlation between age and performance? Younger athletes perform better than older athletes.
Assuming every year new athletes join the Games, Country performance over time gets better.
We can infer 2 null hypothesis from these statements:
# Group data by player
dataset_c <- dataset_olympics %>% group_by(Name) %>%
reframe(MedalCount = sum(!is.na(Medal)), Age = Age)
# Filter Data to get mean
data <- dataset_c %>% drop_na(Age)
mean_age <- mean(data$Age)
cat("Mean",mean_age)
## Mean 25.64465
cat("Median",median(data$Age))
## Median 25
# Dividing data based on old and Young participants
athletes_young <- data |> filter(Age > 25)
athletes_old <- data |> filter(Age <= 25)
par(mfrow = c(1,2))
ggplot(athletes_young, aes(x = MedalCount)) +
geom_boxplot(color="purple") + coord_flip() +
ggtitle("Younger Athletes")
ggplot(athletes_old, aes(x = MedalCount)) +
geom_boxplot(color="blue") + coord_flip() +
ggtitle("Older Athletes")
# T - Test for this data
t.test(athletes_young$MedalCount, athletes_old$MedalCount, var.equal = TRUE)
##
## Two Sample t-test
##
## data: athletes_young$MedalCount and athletes_old$MedalCount
## t = 17.877, df = 67266, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1538973 0.1917983
## sample estimates:
## mean of x mean of y
## 0.6247338 0.4518860
As we can see, the alternate hypothesis is the one that correctly represents our true hypothesis. The p value is very small and way under our preset limit of 0.01
# Group data by player
dataset_c <- dataset_olympics %>% group_by(Year, NOC) %>%
mutate(MedalCount = sum(!is.na(Medal)))
# Divide Data based on
NOC_Old <- dataset_c |> filter(Year < 1995)
NOC_new <- dataset_c |> filter(Year >= 1995)
par(mfrow = c(1,2))
ggplot(NOC_Old, aes(x = MedalCount)) +
geom_boxplot(color="purple") + coord_flip() +
ggtitle("NOC Older Year wins")
ggplot(NOC_new, aes(x = MedalCount)) +
geom_boxplot(color="blue") + coord_flip() +
ggtitle("NOC Newer Year Wins")
# T - Test for this data
t.test(NOC_new$MedalCount, NOC_Old$MedalCount, var.equal = TRUE)
##
## Two Sample t-test
##
## data: NOC_new$MedalCount and NOC_Old$MedalCount
## t = -25.061, df = 69998, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.476795 -3.827343
## sample estimates:
## mean of x mean of y
## 13.21149 17.36356
The p value again ends up being significantly smaller than our cut-off therefore the Null hypothesis is rejected and we can conclude that there is an improvement in performance over time for a given NOC.