Import data

# Read the Olympics workbook and convert the age, weight and height columns
# to numeric. Values that as.numeric() cannot parse become NA.
olympics <- read_excel("../02_module5/data/myData.xlsx") %>%
  mutate(across(c(age, weight, height), as.numeric))

Introduction

In this analysis I will be exploring the Olympics dataset. This dataset contains information on Olympic athletes such as their age, weight, sport, and what medals they won.

Questions

  1. How old are most Olympic athletes?
  2. Do medal winners differ in age or weight?

Variation

Variation is the tendency of a variable's values to differ from one observation to the next. In this dataset, athlete age and weight vary a lot depending on the sport they compete in.

Visualizing distributions

# Number of rows per medal type, excluding rows whose medal is "NA".
# Author's note: the bars look uneven even though the medals are expected
# to be awarded in equal numbers.
ggplot(filter(olympics, medal != "NA"), aes(x = medal)) +
  geom_bar()

# Histogram of athlete age for the full dataset, using 1-year-wide bins.
ggplot(data = olympics, mapping = aes(x = age)) +
  geom_histogram(binwidth = 1)

# Same 1-year-bin age histogram, restricted to athletes younger than 50.
ggplot(data = filter(olympics, age < 50), mapping = aes(x = age)) +
  geom_histogram(binwidth = 1)

# Age distribution of medal winners, drawn as one frequency polygon per
# medal type so the distributions can be compared on a single panel.
# Rows whose medal is "NA" are excluded first.
olympics %>%
  filter(medal != "NA") %>%
  ggplot(aes(x = age, color = medal)) +
  # Explicit 1-year bins: consistent with the age histograms in this file
  # and avoids falling back to ggplot2's default of 30 bins.
  geom_freqpoly(binwidth = 1)

Typical values

# Typical ages: 1-year-bin histogram of age for athletes under 50.
under_50 <- olympics$age < 50
ggplot(olympics[which(under_50), ], aes(age)) +
  geom_histogram(binwidth = 1)

Unusual values

# Unusual ages: histogram of age with the y-axis zoomed to counts of 0-50
# so that sparsely populated bins become visible.
olympics %>%
  ggplot(aes(x = age)) +
  # Explicit 1-year bins (consistent with the other age histograms) instead
  # of ggplot2's default of 30 bins, so each rare age gets its own bar.
  geom_histogram(binwidth = 1) +
  # coord_cartesian() only zooms the view; no observations are dropped.
  coord_cartesian(ylim = c(0, 50))

Missing values

# Recode ages below 10 or above 80 as NA, then plot weight against age.
ggplot(
  data = mutate(olympics, age = if_else(age < 10 | age > 80, NA_real_, age)),
  mapping = aes(x = age, y = weight)
) +
  geom_point()

Covariation

A categorical and continuous variable

# Box plot of age for each value of medal (a categorical vs. a continuous
# variable).
ggplot(data = olympics, mapping = aes(x = medal, y = age)) +
  geom_boxplot()

Two categorical variables

# Heat map of the number of rows for every sport/medal combination,
# excluding rows whose medal is "NA". Tile fill encodes the count `n`.
ggplot(
  data = count(filter(olympics, medal != "NA"), sport, medal),
  mapping = aes(x = medal, y = sport, fill = n)
) +
  geom_tile() +
  # Shrink the y-axis labels (presumably because there are many sports).
  theme(axis.text.y = element_text(size = 4))

Two continuous variables

# Joint distribution of age and weight, binned into hexagons.
ggplot(data = olympics, mapping = aes(x = age, y = weight)) +
  geom_hex()

Patterns and models

# Weight summarised per age band: cut_width(age, 5) splits age into
# 5-year-wide bins and each bin gets its own box plot.
ggplot(
  data = olympics,
  mapping = aes(x = age, y = weight, group = cut_width(age, 5))
) +
  geom_boxplot()