Import data

# Read the show's contestant data from the module's Excel workbook.
bakingshow <- read_excel(path = "../01_module4/data/myData.xlsx")

Introduction

This data set comes from The Great British Baking Show and contains information on each season's contestants. It covers 10 seasons of the show, each with many contestants.

Questions

Variation

# Bar chart: number of rows at each value of `technical_winner`.
ggplot(bakingshow, aes(x = technical_winner)) +
  geom_bar()

Visualizing distributions

# Histogram of `total_episodes_appeared` across all rows.
# An explicit binwidth replaces the 30-bin default, which ggplot2 flags
# with a "Pick better value with `binwidth`" message.
# NOTE(review): binwidth = 1 assumes whole-number episode counts — confirm.
ggplot(data = bakingshow, mapping = aes(x = total_episodes_appeared)) +
  geom_histogram(binwidth = 1)

Typical values

# Keep only the rows with a positive `technical_winner`, then draw the
# distribution of `total_episodes_appeared` for that subset.
winners <- filter(bakingshow, technical_winner > 0)
ggplot(winners, aes(x = total_episodes_appeared)) +
  geom_histogram(binwidth = .9)

Unusual values

# Bar chart of `technical_winner` restricted to the `winners` subset,
# used to look for unusual values.
winners %>%
  ggplot(aes(x = technical_winner)) +
  geom_bar()

Missing Values

# Recode the unusual value 5 as NA and store the result in column `y`.
# The previous call, ifelse(y = 5, NA, y), supplied `y = 5` as a named
# argument (partially matching `yes`) instead of an equality test, so no
# comparison was ever made and the whole column came back NA.
# NOTE(review): assumes `technical_winner` is the column being cleaned
# — confirm.
technical_win <- bakingshow %>%
  mutate(y = ifelse(technical_winner == 5, NA, technical_winner))

# Scatterplot of column `y` against `series` from `technical_win`.
technical_win %>%
  ggplot(aes(x = series, y = y)) +
  geom_point()

Covariation

A categorical and continuous variable

# One boxplot of `technical_winner` per series; the explicit `group`
# makes ggplot2 draw a separate box for each value of `series`.
ggplot(bakingshow, aes(x = series, y = technical_winner, group = series)) +
  geom_boxplot()

Two categorical variables

# Count plot: point size shows how many rows share each
# (`series`, `technical_highest`) combination.
ggplot(bakingshow, aes(x = series, y = technical_highest)) +
  geom_count()

Two continuous variables

# Scatterplot of contestant `age` against `series`.
bakingshow %>%
  ggplot(aes(x = series, y = age)) +
  geom_point()

# Same two variables as a boxplot, with one box per value of `series`.
ggplot(bakingshow) +
  geom_boxplot(aes(x = series, y = age, group = series))

Patterns and models

# Scatterplot of `technical_winner` against `series` for the `winners`
# subset, as a first look at any pattern between the two.
ggplot(winners, aes(x = series, y = technical_winner, group = series)) +
  geom_point()

# Linear model on the log-log scale: log(technical_winner) explained by
# log(series), fitted on the `winners` subset.
mod <- lm(
  log(technical_winner) ~ log(series),
  data = winners
)

# Append the residuals of `mod` to `winners` as `resid`, then
# exponentiate them to undo the log transform used in the model.
winners2 <- mutate(
  add_residuals(winners, mod),
  resid = exp(resid)
)

# Plot the back-transformed residuals in `winners2` against `series`.
# The earlier version put `technical_winner` on the y axis, which left
# the freshly computed `resid` column unused and repeated the raw
# scatterplot drawn before the model was fitted.
# NOTE(review): assumes the residual plot was the intent — confirm.
ggplot(data = winners2, mapping = aes(x = series, y = resid)) +
  geom_point()