knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(miscset)
## 
## Attaching package: 'miscset'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(gapminder)
library(ggthemes)

Olympics Data

# Load the Olympics records; no delimiter or column types are given, so readr
# infers them from the file (see the column specification printed below).
dataset_olympics <- read_delim(file = "dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl  (5): ID, Age, Height, Weight, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Goal?

To build sets of variable combinations.

# Five-number summary, mean, and NA count of the Age column.
summary(dataset_olympics[["Age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   11.00   21.00   25.00   25.64   28.00   88.00    2732

Age vs Medals Won

# Attach to every row with a known Age the total number of medals won by
# rows sharing that same Age. Rows with a missing Age are dropped first.
dataset <- dataset_olympics %>%
  drop_na(Age) %>%
  group_by(Age) %>%
  mutate(MedalCount = sum(!is.na(Medal))) %>%
  # Release the grouping so later verbs on `dataset` are not silently per-Age.
  ungroup()

# Peek at the first few per-age medal totals carried on each row.
head(dataset[["MedalCount"]])
## [1] 734 845 734 162 701 701
# Scatter plot of the per-age medal total against Age.
ggplot(dataset, aes(Age, MedalCount)) +
  geom_point()

Checking for correlation:

# Correlation between Age and the per-age medal total, computed over all rows.
with(dataset, cor(Age, MedalCount, use = "complete.obs"))
## [1] -0.3870976

The correlation is negative: as Age increases, the total number of medals won by athletes of that age decreases. At roughly -0.39, the correlation is moderate rather than strong.

Weight vs Age

# Attach to every row with a known Age and Weight the mean Age of all rows
# sharing that same Weight. Rows missing either value are dropped first, so
# the mean needs no NA handling.
dataset <- dataset_olympics %>%
  drop_na(Age, Weight) %>%
  group_by(Weight) %>%
  mutate(AgeMean = mean(Age)) %>%
  # Release the grouping so later verbs on `dataset` are not silently per-Weight.
  ungroup()

# Peek at the first few per-weight mean ages carried on each row.
head(dataset[["AgeMean"]])
## [1] 25.91173 24.25847 25.98312 25.98312 25.98312 25.98312
# Scatter plot of the per-weight mean Age against Weight.
ggplot(dataset, aes(Weight, AgeMean)) +
  geom_point()

# Correlation between the per-weight mean Age and Weight, computed over all rows.
with(dataset, cor(AgeMean, Weight, use = "complete.obs"))
## [1] 0.8268137

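Weight and mean Age (averaged over all athletes of the same weight) have a very strong positive correlation of roughly 0.83. Note that this describes the group averages rather than individual athletes. This can be observed in general and, as we’ve seen in the past, older athletes tend to participate in more weight-oriented sports such as Weight Lifting and Wrestling.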

Height vs Age

# Attach to every row with a known Age and Height the mean Age of all rows
# sharing that same Height. Rows missing either value are dropped first, so
# the mean needs no NA handling.
dataset <- dataset_olympics %>%
  drop_na(Age, Height) %>%
  group_by(Height) %>%
  mutate(AgeMean = mean(Age)) %>%
  # Release the grouping so later verbs on `dataset` are not silently per-Height.
  ungroup()

# Peek at the first few per-height mean ages carried on each row.
head(dataset[["AgeMean"]])
## [1] 25.75522 25.25586 25.46046 25.46046 25.46046 25.46046
# Scatter plot of the per-height mean Age against Height.
ggplot(dataset, aes(Height, AgeMean)) +
  geom_point()

# Correlation between the per-height mean Age and Height, computed over all rows.
with(dataset, cor(AgeMean, Height, use = "complete.obs"))
## [1] 0.7119424

This shows a strong correlation (roughly 0.71) between Height and the mean Age of athletes of that height. However, we can observe a slight curve at the end of the plot, which may be indicative of how, with age, the average height tends to reduce as humans shrink.