Import Data
# Read the Olympics workbook from the project data folder into a tibble.
olympics <- read_excel(path = "../00_data/myDataOlympics.xlsx")
# Auto-print the tibble to preview its dimensions and column types.
olympics
## # A tibble: 271,116 × 15
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 1 A Dijia… M 24 180 80 China CHN 1992… 1992 Summer Barc…
## 2 2 A Lamusi M 23 170 60 China CHN 2012… 2012 Summer Lond…
## 3 3 Gunnar … M 24 NA NA Denm… DEN 1920… 1920 Summer Antw…
## 4 4 Edgar L… M 34 NA NA Denm… DEN 1900… 1900 Summer Paris
## 5 5 Christi… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
## 6 5 Christi… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
## 7 5 Christi… F 25 185 82 Neth… NED 1992… 1992 Winter Albe…
## 8 5 Christi… F 25 185 82 Neth… NED 1992… 1992 Winter Albe…
## 9 5 Christi… F 27 185 82 Neth… NED 1994… 1994 Winter Lill…
## 10 5 Christi… F 27 185 82 Neth… NED 1994… 1994 Winter Lill…
## # ℹ 271,106 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: number of rows recorded for each value of `year`.
olympics %>%
  ggplot(aes(x = year)) +
  geom_bar()

# Tabulate the same per-year row counts that the bar chart displays.
count(olympics, year)
## # A tibble: 35 × 2
## year n
## <dbl> <int>
## 1 1896 380
## 2 1900 1936
## 3 1904 1301
## 4 1906 1733
## 5 1908 3101
## 6 1912 4040
## 7 1920 4292
## 8 1924 5693
## 9 1928 5574
## 10 1932 3321
## # ℹ 25 more rows
# Histogram of `year` using bins 0.8 wide.
ggplot(data = olympics, mapping = aes(x = year)) +
  geom_histogram(binwidth = 0.8)

# Same histogram, restricted to rows whose year is before 1992.
ggplot(data = filter(olympics, year < 1992), mapping = aes(x = year)) +
  geom_histogram(binwidth = 0.8)

# Frequency polygons of `year`, one coloured line per season.
ggplot(data = olympics) +
  geom_freqpoly(mapping = aes(x = year, color = season), binwidth = 0.8)

Typical values
olympics %>%
  # `age` is read in as text, so convert it before comparing: on strings `<`
  # is lexicographic (e.g. "9" < "50" is FALSE), not numeric.
  # presumably missing ages are stored as the literal string "NA" - verify
  mutate(age = as.numeric(na_if(age, "NA"))) %>%
  # Keep athletes younger than 50 (rows with a missing age are dropped too)
  filter(age < 50) %>%
  # Count the rows at each age
  ggplot(aes(x = age)) +
  stat_count()

# Count of observations at each distinct `eruptions` value in `faithful`.
ggplot(data = faithful, mapping = aes(x = eruptions)) +
  stat_count()

Unusual values
# Histogram of `year` with bins 0.9 wide, shown at full height.
olympics %>%
  ggplot(aes(x = year)) +
  geom_histogram(binwidth = 0.9)

# Same histogram, with the y-axis view limited to 0-10000 so that short bars
# are easier to see. coord_cartesian() zooms without discarding any data.
olympics %>%
  ggplot(aes(x = year)) +
  geom_histogram(binwidth = 0.9) +
  coord_cartesian(ylim = c(0, 10000))

Missing Values
olympics %>%
  # Recode years outside 1992-2016 as NA instead of dropping the rows. The
  # previous test (`year < 2016 | year > 1992`) was TRUE for every year, so it
  # blanked every value, and it wrote to a `y` column the plot never used.
  # NOTE(review): the 1992-2016 window is inferred from the constants in the
  # original condition - confirm the intended bounds.
  mutate(year = ifelse(year < 1992 | year > 2016, NA, year)) %>%
  # Plot; ggplot2 warns about the rows it removes for the missing years
  ggplot(aes(x = year, y = season)) +
  geom_point()

Covariation
A categorical and continuous variable
# Box plots of `year` (continuous) for each `season` (categorical).
ggplot(data = olympics, mapping = aes(x = year, y = season)) +
  geom_boxplot()

Two categorical variables
# Heat map of row counts for every year/season combination.
count(olympics, year, season) %>%
  ggplot(mapping = aes(x = year, y = season)) +
  geom_tile(mapping = aes(fill = n))

Two continuous variables
# Working copy of the data with `age` converted from text to numeric; entries
# that cannot be parsed as numbers become NA (hence the coercion warning).
data_clean <- mutate(olympics, age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
# Hexagonal-bin density of age against year.
ggplot(data = data_clean, mapping = aes(x = year, y = age)) +
  geom_hex()
## Warning: Removed 9474 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# Box plots of `year` for each `medal` value, using only rows before 1992.
ggplot(data = filter(data_clean, year < 1992), mapping = aes(x = year, y = medal)) +
  geom_boxplot()

Patterns and models
library(modelr)
# Model age as a function of games year on the log-log scale. The previous
# formula, log(year) ~ log(year), had the response on both sides, so R dropped
# the predictor and fitted an intercept-only model.
# NOTE(review): using `age` as the response is an assumption - confirm which
# variable the year trend should be removed from.
mod <- lm(log(age) ~ log(year), data = data_clean)
# Attach the residuals and back-transform them from the log scale with exp().
# data_clean is used because the model needs the numeric `age` column.
olympics2 <- data_clean %>%
  modelr::add_residuals(mod) %>%
  mutate(resid = exp(resid))
# Scatter plot of the back-transformed residuals against year.
ggplot(data = olympics2, mapping = aes(x = year, y = resid)) +
  geom_point()

olympics2 %>%
  # `year` is continuous, so without an explicit group ggplot2 draws a single
  # box for all years (and warns); group by year to get one box per year.
  ggplot(aes(x = year, y = resid, group = year)) +
  geom_boxplot()
