Import data

# Read the CSV at ../00_data/myData.csv into the data frame `data`.
data <- read.csv(file = "../00_data/myData.csv")

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: one orange bar per birth_country, height = number of rows.
ggplot(data = data, mapping = aes(x = birth_country)) +
    geom_bar(fill = "orange")

# Histogram of birth_year using bins that are 0.5 units wide.
# NOTE(review): if birth_year holds whole years, a 0.5 binwidth leaves every
# other bin empty -- consider binwidth = 1; confirm against the data.
ggplot(data, aes(x = birth_year)) +
    geom_histogram(fill = "red", binwidth = 0.5)

Typical values

# Keep only the rows whose birth_year is greater than 1950, then draw a blue
# histogram of birth_year for that subset. No binwidth is given, so
# stat_bin() falls back to its default binning.
ggplot(
    filter(data, birth_year > 1950),
    aes(x = birth_year)
) +
    geom_histogram(fill = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values

Missing values

Covariation

A categorical and continuous variable

# Boxplots of birth_month (x axis) drawn separately for each birth_country
# (y axis), filled green.
ggplot(data) +
    aes(x = birth_month, y = birth_country) +
    geom_boxplot(fill = "green")

Two categorical variables

library(hexbin)
## Warning: package 'hexbin' was built under R version 4.4.3
# Both variables in this section are categorical. Hexagonal binning is meant
# for continuous x and y, so instead count how often each province/country
# pair occurs and draw one point per pair, sized by that count.
data %>%
    ggplot(aes(x = birth_state_province, y = birth_country)) +
    geom_count()

# A boxplot needs a continuous variable to summarise, which neither of these
# categorical columns provides. Tabulate the province/country pairs with
# count() (the tally lands in column `n`) and show it as a heatmap instead.
data %>%
    count(birth_state_province, birth_country) %>%
    ggplot(aes(x = birth_state_province, y = birth_country)) +
    geom_tile(aes(fill = n))

Two continuous variables

library(hexbin)

# Hexagonal heatmap: bin the (birth_year, birth_month) plane into hexagons
# and colour each one by the number of rows that fall inside it.
ggplot(data) +
    aes(x = birth_year, y = birth_month) +
    geom_hex()

# birth_year is continuous, so an ungrouped boxplot triggers ggplot2's
# "Continuous x aesthetic -- did you forget aes(group = ...)?" warning.
# Bin birth_year into intervals 10 units wide (decades, if the column holds
# calendar years) and draw one box of birth_month per interval.
data %>%
    ggplot(aes(x = birth_year, y = birth_month)) +
    geom_boxplot(aes(group = cut_width(birth_year, 10)))

Patterns and models

library(modelr)
## Warning: package 'modelr' was built under R version 4.4.3
# Fit a linear model of log(birth_year) on log(birth_month).
mod <- lm(log(birth_year) ~ log(birth_month), data = data)

# Add the model's residuals to the data as column `resid`, then
# back-transform them from the log scale with exp().
data4 <- mutate(
    modelr::add_residuals(data, mod),
    resid = exp(resid)
)

# Scatter plot of the back-transformed residuals against birth_year.
ggplot(data = data4, mapping = aes(x = birth_year, y = resid)) +
    geom_point()

# birth_year is continuous, so an ungrouped boxplot triggers ggplot2's
# "Continuous x aesthetic -- did you forget aes(group = ...)?" warning.
# Bin birth_year into intervals 10 units wide (decades, if the column holds
# calendar years) and draw one navy box of residuals per interval.
data4 %>%
    ggplot(aes(birth_year, resid)) +
    geom_boxplot(aes(group = cut_width(birth_year, 10)), fill = "navy")