Import Data

# Read the Olympics workbook from the shared data folder into a tibble.
olympics <- readxl::read_excel(path = "../00_data/myDataOlympics.xlsx")
# Display the first rows together with each column's type.
print(olympics)
## # A tibble: 271,116 × 15
##       id name     sex   age   height weight team  noc   games  year season city 
##    <dbl> <chr>    <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
##  1     1 A Dijia… M     24    180    80     China CHN   1992…  1992 Summer Barc…
##  2     2 A Lamusi M     23    170    60     China CHN   2012…  2012 Summer Lond…
##  3     3 Gunnar … M     24    NA     NA     Denm… DEN   1920…  1920 Summer Antw…
##  4     4 Edgar L… M     34    NA     NA     Denm… DEN   1900…  1900 Summer Paris
##  5     5 Christi… F     21    185    82     Neth… NED   1988…  1988 Winter Calg…
##  6     5 Christi… F     21    185    82     Neth… NED   1988…  1988 Winter Calg…
##  7     5 Christi… F     25    185    82     Neth… NED   1992…  1992 Winter Albe…
##  8     5 Christi… F     25    185    82     Neth… NED   1992…  1992 Winter Albe…
##  9     5 Christi… F     27    185    82     Neth… NED   1994…  1994 Winter Lill…
## 10     5 Christi… F     27    185    82     Neth… NED   1994…  1994 Winter Lill…
## # ℹ 271,106 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: one bar per distinct year, height = number of rows in that year.
olympics %>%
  ggplot(aes(x = year)) +
  geom_bar()

# Tabulate the number of rows per year (the numbers behind the bar chart).
count(olympics, year)
## # A tibble: 35 × 2
##     year     n
##    <dbl> <int>
##  1  1896   380
##  2  1900  1936
##  3  1904  1301
##  4  1906  1733
##  5  1908  3101
##  6  1912  4040
##  7  1920  4292
##  8  1924  5693
##  9  1928  5574
## 10  1932  3321
## # ℹ 25 more rows
# Histogram of year with 0.8-wide bins.
ggplot(data = olympics, mapping = aes(x = year)) +
  geom_histogram(binwidth = 0.8)

# Same histogram, limited to rows whose year is before 1992.
ggplot(
  data = filter(olympics, year < 1992),
  mapping = aes(x = year)
) +
  geom_histogram(binwidth = 0.8)

# Frequency polygons of year, one coloured line per season.
ggplot(data = olympics) +
  geom_freqpoly(
    mapping = aes(x = year, color = season),
    binwidth = 0.8
  )

Typical values

olympics %>%

    # `age` is imported as text, so comparing it with 50 directly would be an
    # alphabetical string comparison. Convert it to a number first; na_if()
    # turns the text "NA" into a real missing value so as.numeric() stays quiet.
    # NOTE(review): assumes missing ages are stored as the string "NA" - confirm.
    mutate(age = as.numeric(na_if(age, "NA"))) %>%

    # Keep athletes younger than 50 (rows with a missing age are dropped too)
    filter(age < 50) %>%

    # Plot the number of rows at each age
    ggplot(aes(x = age)) +
    stat_count()

# Count of observations at each distinct eruption value in `faithful`.
ggplot(data = faithful, mapping = aes(x = eruptions)) +
  stat_count()

Unusual values

# Histogram of year with 0.9-wide bins, full y range.
olympics %>%
  ggplot(aes(x = year)) +
  geom_histogram(binwidth = 0.9)

# Same histogram, with the visible y axis limited to 0-10000 rows.
# coord_cartesian() only zooms the view; no data are removed from the bins.
olympics %>%
  ggplot(aes(x = year)) +
  geom_histogram(binwidth = 0.9) +
  coord_cartesian(ylim = c(0, 10000))

Missing Values

olympics %>%

    # Replace years outside 1992-2016 with NA instead of dropping the rows.
    # The bounds must sit on the outside of the `|`: a condition such as
    # `year < 2016 | year > 1992` is TRUE for every row. The result is written
    # back to `year` so that the plot below actually uses it.
    # NOTE(review): the 1992-2016 window is inferred from the constants that
    # were here before - confirm it is the intended range.
    mutate(year = ifelse(year < 1992 | year > 2016, NA_real_, year)) %>%

    # Plot; geom_point() leaves out the rows whose year is now NA and warns
    ggplot(aes(x = year, y = season)) +
    geom_point()

Covariation

A categorical and continuous variable

# Box plot of year (x) for each season (y).
ggplot(data = olympics, mapping = aes(x = year, y = season)) +
  geom_boxplot()

Two categorical variables

# Tile plot: one tile per year/season combination, filled by its row count `n`.
ggplot(
  data = count(olympics, year, season),
  mapping = aes(x = year, y = season)
) +
  geom_tile(mapping = aes(fill = n))

Two continuous variables

# Copy of `olympics` with `age` coerced from text to numeric; entries that are
# not numbers become NA (hence the coercion warning).
data_clean <- mutate(olympics, age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
# Hexagonal-bin plot of age against year.
ggplot(data = data_clean, mapping = aes(x = year, y = age)) +
  geom_hex()
## Warning: Removed 9474 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# Box plot of year (x) for each medal value (y), rows before 1992 only.
ggplot(
  data = filter(data_clean, year < 1992),
  mapping = aes(x = year, y = medal)
) +
  geom_boxplot()

Patterns and models

library(modelr)

# A formula with log(year) on both sides makes R drop the right-hand term
# (with two warnings), which leaves an intercept-only fit. Writing `~ 1`
# gives the same fitted model without the warnings.
# NOTE(review): an intercept-only model explains nothing about year; replace
# the `1` with the intended predictor once it has been decided.
mod <- lm(log(year) ~ 1, data = olympics)
# Add a `resid` column holding the residuals of `mod` for every row, then
# exponentiate that column.
olympics2 <- mutate(
  modelr::add_residuals(olympics, mod),
  resid = exp(resid)
)

# Scatter plot of the exponentiated residuals against year.
ggplot(data = olympics2, mapping = aes(x = year, y = resid)) +
  geom_point()

olympics2 %>%
    # `year` is continuous, so geom_boxplot() needs an explicit grouping to
    # draw one box per year; without it all rows fall into a single box and
    # ggplot2 warns about the continuous x aesthetic.
    ggplot(aes(year, resid, group = year)) +
    geom_boxplot()