Import data
# Load the athlete-level Olympics records from the Excel workbook, then print
# the tibble to inspect its columns and types.
olympics <- readxl::read_excel("../00_data/myData_apply2.xlsx")
print(olympics)
## # A tibble: 271,116 × 16
## Column1 id name sex age height weight team noc games year season
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 270479 135289 Zzim… M 20 NA NA Braz… BRA 1952… 1952 Summer
## 2 207677 104222 zzet… M 26 178 74 Turk… TUR 2016… 2016 Summer
## 3 102956 52087 zzet… M 23 172 85 Turk… TUR 2004… 2004 Summer
## 4 102957 52087 zzet… M 27 172 85 Turk… TUR 2008… 2008 Summer
## 5 106601 53910 Zyta… F 26 187 85 Pola… POL 1988… 1988 Summer
## 6 219493 110259 Zygm… M 25 185 82 Pola… POL 1932… 1932 Summer
## 7 183779 92370 Zygm… M 21 179 72 Pola… POL 1952… 1952 Summer
## 8 183780 92370 Zygm… M 26 179 72 Pola… POL 1956… 1956 Summer
## 9 152047 76313 Zygm… M 27 175 71 Pola… POL 1972… 1972 Summer
## 10 152048 76313 Zygm… M 31 175 71 Pola… POL 1976… 1976 Summer
## # ℹ 271,106 more rows
## # ℹ 4 more variables: city <chr>, sport <chr>, event <chr>, medal <chr>
Introduction
Questions
Variation
# Bar chart: number of rows for each value of `sex`.
olympics %>%
  ggplot(aes(x = sex)) +
  geom_bar()

# Tabulate the same counts that the bar chart displays.
count(olympics, sex)
## # A tibble: 2 × 2
## sex n
## <chr> <int>
## 1 F 74522
## 2 M 196594
Visualizing distributions
# Histogram of `year`; no bin settings are given, so ggplot2 falls back to
# its default binning.
olympics %>%
  ggplot(aes(x = year)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of `year`, one line per value of `sex`, so the two
# distributions can be compared on a single panel.
ggplot(olympics) +
  geom_freqpoly(aes(x = year, colour = sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values
# Histogram of `year` again, this time read for where the bulk of the
# observations sit.
ggplot(olympics) +
  geom_histogram(aes(x = year))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values
Missing values
# Preview `age` coerced from character to numeric; entries that cannot be
# parsed become NA (with a coercion warning).
# NOTE: the result is only printed, not assigned, so `olympics$age` itself
# remains a character column after this statement.
mutate(olympics, age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 271,116 × 16
## Column1 id name sex age height weight team noc games year season
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 270479 135289 Zzim… M 20 NA NA Braz… BRA 1952… 1952 Summer
## 2 207677 104222 zzet… M 26 178 74 Turk… TUR 2016… 2016 Summer
## 3 102956 52087 zzet… M 23 172 85 Turk… TUR 2004… 2004 Summer
## 4 102957 52087 zzet… M 27 172 85 Turk… TUR 2008… 2008 Summer
## 5 106601 53910 Zyta… F 26 187 85 Pola… POL 1988… 1988 Summer
## 6 219493 110259 Zygm… M 25 185 82 Pola… POL 1932… 1932 Summer
## 7 183779 92370 Zygm… M 21 179 72 Pola… POL 1952… 1952 Summer
## 8 183780 92370 Zygm… M 26 179 72 Pola… POL 1956… 1956 Summer
## 9 152047 76313 Zygm… M 27 175 71 Pola… POL 1972… 1972 Summer
## 10 152048 76313 Zygm… M 31 175 71 Pola… POL 1976… 1976 Summer
## # ℹ 271,106 more rows
## # ℹ 4 more variables: city <chr>, sport <chr>, event <chr>, medal <chr>
# Scatter plot of athlete age against Olympic year.
# `age` is still stored as character at this point (the conversion shown
# earlier was never assigned), which would make the y-axis discrete and stop
# `na.rm = TRUE` from removing the non-numeric entries. Convert it inside the
# pipeline so the axis is continuous and unparseable ages become real NAs,
# which geom_point() then drops silently.
olympics %>%
  mutate(age = as.numeric(age)) %>%
  ggplot(mapping = aes(x = year, y = age)) +
  geom_point(na.rm = TRUE)

Covariation
# Bar chart of the number of rows per age.
# `age` is a character column here, so plotting it directly orders the bars
# as text rather than numerically and gives the non-numeric entries a bar of
# their own. Convert to numeric and drop the rows whose age cannot be parsed
# before counting.
olympics %>%
  mutate(age = as.numeric(age)) %>%
  filter(!is.na(age)) %>%
  ggplot(mapping = aes(x = age)) +
  geom_bar()

A categorical and continuous variable
# Box plots of `year` (continuous) split by `sex` (categorical).
ggplot(data = olympics, mapping = aes(x = sex, y = year)) +
  geom_boxplot()

Two categorical variables
# Heat map of row counts for every sport/sex combination present in the data.
# Combinations that never occur have no row in the count table and therefore
# no tile.
ggplot(
  data = count(olympics, sport, sex),
  mapping = aes(x = sport, y = sex, fill = n)
) +
  geom_tile() +
  labs(
    title = "Participation in Olympic Sports by Gender",
    x = "Sport",
    y = "Gender",
    fill = "Count"
  ) +
  # Rotate the x-axis labels so the sport names stay legible.
  theme(axis.text.x = element_text(angle = 70, hjust = 1))

Two continuous variables
library(hexbin)
# Alpine Skiing rows with usable numeric height and weight.
# `height` and `weight` are read in as character columns, so testing them
# with is.na() before conversion does not catch the non-numeric entries (the
# coercion still reports NAs being introduced). Convert first, then keep only
# the rows where both measurements parsed; this yields the same rows with a
# single, effective missing-value check.
skiing_data <- olympics %>%
  filter(sport == "Alpine Skiing") %>%
  mutate(
    height = as.numeric(height),
    weight = as.numeric(weight)
  ) %>%
  filter(!is.na(height), !is.na(weight))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `height = as.numeric(height)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Hexagonal binning of height against weight for the skiing subset; the fill
# of each hexagon encodes how many rows fall inside it.
skiing_data %>%
  ggplot(aes(height, weight)) +
  geom_hex(bins = 30)

# Box plot of skier height for each Olympic year; `year` is turned into a
# factor so every year gets its own box.
ggplot(skiing_data, aes(x = factor(year), y = height)) +
  geom_boxplot() +
  labs(
    title = "Height of Alpine Skiing Athletes by Year",
    x = "Year",
    y = "Height (cm)"
  )

Patterns and models
library(modelr)
# Prepare the modelling data: numeric age, Alpine Skiing only, and no missing
# values in the variables used below.
# NOTE: this overwrites `olympics`, so from here on the object holds only the
# Alpine Skiing subset rather than the full data set.
olympics <- olympics %>%
  mutate(age = as.numeric(age)) %>%
  filter(
    sport == "Alpine Skiing",
    !is.na(age),
    !is.na(year),
    !is.na(team)
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
# Fit a log-log linear model of year on age.
mod <- lm(log(year) ~ log(age), data = olympics)

# Attach the model residuals as `resid`. They are on the log scale, so
# exponentiating turns them into the ratio of observed to fitted year.
olympics <- mutate(
  modelr::add_residuals(olympics, mod),
  resid = exp(resid)
)

# Residuals against age: what remains once the fitted age pattern is removed.
ggplot(olympics, aes(x = age, y = resid)) +
  geom_point()

# Distribution of the back-transformed residuals for each team; the team
# labels are turned vertical so they do not overlap.
ggplot(olympics, aes(x = team, y = resid)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
