Import data

# Load the Olympics records from the Excel workbook, then print the tibble.
# NOTE(review): the printed column types show age, height and weight arriving
# as character; possibly missing values are stored as the text "NA". Consider
# passing `na = "NA"` to the read call - confirm against the workbook first.
olympics <- readxl::read_excel(path = "../00_data/myData_apply2.xlsx")
olympics
## # A tibble: 271,116 × 16
##    Column1     id name  sex   age   height weight team  noc   games  year season
##      <dbl>  <dbl> <chr> <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr> 
##  1  270479 135289 Zzim… M     20    NA     NA     Braz… BRA   1952…  1952 Summer
##  2  207677 104222 zzet… M     26    178    74     Turk… TUR   2016…  2016 Summer
##  3  102956  52087 zzet… M     23    172    85     Turk… TUR   2004…  2004 Summer
##  4  102957  52087 zzet… M     27    172    85     Turk… TUR   2008…  2008 Summer
##  5  106601  53910 Zyta… F     26    187    85     Pola… POL   1988…  1988 Summer
##  6  219493 110259 Zygm… M     25    185    82     Pola… POL   1932…  1932 Summer
##  7  183779  92370 Zygm… M     21    179    72     Pola… POL   1952…  1952 Summer
##  8  183780  92370 Zygm… M     26    179    72     Pola… POL   1956…  1956 Summer
##  9  152047  76313 Zygm… M     27    175    71     Pola… POL   1972…  1972 Summer
## 10  152048  76313 Zygm… M     31    175    71     Pola… POL   1976…  1976 Summer
## # ℹ 271,106 more rows
## # ℹ 4 more variables: city <chr>, sport <chr>, event <chr>, medal <chr>

Introduction

Questions

Variation

# Bar chart with one bar per value of `sex`, bar height = number of rows.
ggplot(olympics, aes(x = sex)) +
  geom_bar()

# The same tally as a table.
count(olympics, sex)
## # A tibble: 2 × 2
##   sex        n
##   <chr>  <int>
## 1 F      74522
## 2 M     196594

Visualizing distributions

# Histogram of rows per `year`, using the default binning.
ggplot(olympics, aes(x = year)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of `year`, one line per value of `sex`.
olympics %>%
  ggplot(aes(x = year, colour = sex)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Histogram of `year` (default binning) to see which values are most common.
olympics %>%
  ggplot() +
  geom_histogram(aes(x = year))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values

Missing Values

# Show the data with `age` converted from character to numeric.
# The result is only printed; `olympics` itself is left unchanged.
mutate(olympics, age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 271,116 × 16
##    Column1     id name  sex     age height weight team  noc   games  year season
##      <dbl>  <dbl> <chr> <chr> <dbl> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr> 
##  1  270479 135289 Zzim… M        20 NA     NA     Braz… BRA   1952…  1952 Summer
##  2  207677 104222 zzet… M        26 178    74     Turk… TUR   2016…  2016 Summer
##  3  102956  52087 zzet… M        23 172    85     Turk… TUR   2004…  2004 Summer
##  4  102957  52087 zzet… M        27 172    85     Turk… TUR   2008…  2008 Summer
##  5  106601  53910 Zyta… F        26 187    85     Pola… POL   1988…  1988 Summer
##  6  219493 110259 Zygm… M        25 185    82     Pola… POL   1932…  1932 Summer
##  7  183779  92370 Zygm… M        21 179    72     Pola… POL   1952…  1952 Summer
##  8  183780  92370 Zygm… M        26 179    72     Pola… POL   1956…  1956 Summer
##  9  152047  76313 Zygm… M        27 175    71     Pola… POL   1972…  1972 Summer
## 10  152048  76313 Zygm… M        31 175    71     Pola… POL   1976…  1976 Summer
## # ℹ 271,106 more rows
## # ℹ 4 more variables: city <chr>, sport <chr>, event <chr>, medal <chr>
# Scatter plot of athlete age against Olympic year.
# `age` is still a character column in `olympics` (the numeric conversion
# shown just before this plot is printed but never saved), so convert it in
# the pipeline. Without this the y axis is discrete and entries that are not
# numbers are drawn as a category of their own; with it they become real NAs
# that `na.rm = TRUE` drops silently.
olympics %>%
  mutate(age = as.numeric(age)) %>%
  ggplot(mapping = aes(x = year, y = age)) +
  geom_point(na.rm = TRUE)

Covariation

# Bar chart of the number of rows at each age.
# `age` is stored as character, so convert it to numeric and drop the rows
# that do not parse. This gives a continuous x axis and keeps missing ages
# from appearing as a bar of their own.
olympics %>%
  mutate(age = as.numeric(age)) %>%
  filter(!is.na(age)) %>%
  ggplot() +
  geom_bar(mapping = aes(x = age))

A categorical and continuous variable

# Box plot of `year` for each value of `sex`.
ggplot(data = olympics, mapping = aes(x = sex, y = year)) +
  geom_boxplot()

Two categorical variables

# Heat map of row counts for every sport/sex combination:
# tally first, then map the count `n` to the tile fill.
ggplot(
  count(olympics, sport, sex),
  aes(x = sport, y = sex, fill = n)
) +
  geom_tile() +
  labs(
    title = "Participation in Olympic Sports by Gender",
    x = "Sport",
    y = "Gender",
    fill = "Count"
  ) +
  # Rotate x-axis labels
  theme(axis.text.x = element_text(angle = 70, hjust = 1))

Two continuous variables

# hexbin backs geom_hex(), which is used to plot this subset.
library(hexbin)

# Alpine Skiing rows with usable height and weight:
# drop missing values, convert both columns from character to numeric,
# then drop any rows whose values failed to convert.
skiing_data <- olympics %>%
  filter(sport == "Alpine Skiing", !is.na(height), !is.na(weight)) %>%
  mutate(
    height = as.numeric(height),
    weight = as.numeric(weight)
  ) %>%
  filter(!(is.na(height) | is.na(weight)))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `height = as.numeric(height)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Hexagonal-bin density plot of weight against height for the skiing subset.
skiing_data %>%
  ggplot(aes(x = height, y = weight)) +
  geom_hex(bins = 30)

# Box plots of height, one box per year (year treated as a category).
ggplot(skiing_data, aes(x = as.factor(year), y = height)) +
  labs(
    title = "Height of Alpine Skiing Athletes by Year",
    x = "Year",
    y = "Height (cm)"
  ) +
  geom_boxplot()

Patterns and models

library(modelr)

# Prepare the modelling data: numeric age, Alpine Skiing rows only, and no
# missing age, year or team.
# Note: this overwrites `olympics`, so everything after this point sees only
# the filtered subset.
olympics <- olympics %>%
  mutate(age = as.numeric(age)) %>%
  filter(sport == "Alpine Skiing", !is.na(age), !is.na(year), !is.na(team))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
# Linear model of log(year) on log(age), fitted to the current `olympics`.
mod <- lm(formula = log(year) ~ log(age), data = olympics)

# Attach the residuals of `mod` as column `resid`, then exponentiate them
# because the model was fitted on the log scale.
olympics <- mutate(
  modelr::add_residuals(olympics, mod),
  resid = exp(resid)
)

# Scatter plot of the exponentiated residuals against age.
ggplot(olympics, aes(x = age, y = resid)) +
  geom_point()

# Box plots of the exponentiated residuals for each team; team names are
# rotated to vertical so they fit on the x axis.
ggplot(olympics, aes(x = team, y = resid)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_boxplot()