Import data

# Read the hen/egg workbook (Excel) into a tibble, then display it.
my_data <- read_excel(path = "../00_data/myData_all.xlsx")
print(my_data)
## # A tibble: 220 × 7
##    observed_month      prod_type  prod_process n_hens n_eggs source Eggs_Per_Hen
##    <dttm>              <chr>      <chr>         <dbl>  <dbl> <chr>         <dbl>
##  1 2016-07-31 00:00:00 hatching … all          5.80e7 1.15e9 ChicE…         19.8
##  2 2016-08-31 00:00:00 hatching … all          5.76e7 1.14e9 ChicE…         19.8
##  3 2016-09-30 00:00:00 hatching … all          5.72e7 1.09e9 ChicE…         19.1
##  4 2016-10-31 00:00:00 hatching … all          5.69e7 1.13e9 ChicE…         19.8
##  5 2016-11-30 00:00:00 hatching … all          5.71e7 1.10e9 ChicE…         19.2
##  6 2016-12-31 00:00:00 hatching … all          5.77e7 1.13e9 ChicE…         19.6
##  7 2017-01-31 00:00:00 hatching … all          5.80e7 1.12e9 ChicE…         19.4
##  8 2017-02-28 00:00:00 hatching … all          5.83e7 1.01e9 ChicE…         17.4
##  9 2017-03-31 00:00:00 hatching … all          5.87e7 1.13e9 ChicE…         19.2
## 10 2017-04-30 00:00:00 hatching … all          5.91e7 1.10e9 ChicE…         18.6
## # ℹ 210 more rows

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows recorded for each production process.
ggplot(data = my_data, mapping = aes(x = prod_process)) +
  geom_bar()

# Frequency polygons of egg counts (shown in thousands), one coloured line
# per production process. No binwidth is given, so the default binning of
# stat_bin() applies.
my_data %>%
  ggplot(aes(x = n_eggs/1000, colour = prod_process)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Histogram of eggs per hen, using bins that are 0.5 wide.
ggplot(data = my_data, mapping = aes(x = Eggs_Per_Hen)) +
  geom_histogram(binwidth = 0.5)

Unusual values

Missing values

# Scatterplot of eggs against hens; both axes are shown in thousands.
ggplot(data = my_data, mapping = aes(x = n_hens/1000, y = n_eggs/1000)) +
  geom_point()

# Scatterplot restricted to hen counts from 1,000 to 100 million (inclusive).
# Counts outside that range are set to NA, and geom_point(na.rm = TRUE) drops
# them without a warning. Both axes are shown in thousands.
my_data %>%
  mutate(n_hens = ifelse(n_hens >= 1000 & n_hens <= 1e8, n_hens, NA)) %>%
  ggplot(aes(x = n_hens/1000, y = n_eggs/1000)) +
  geom_point(na.rm = TRUE)

# Scatterplot restricted to hen counts from 100 million to 10 billion
# (inclusive). Counts outside that range are set to NA, and
# geom_point(na.rm = TRUE) drops them without a warning. Both axes are shown
# in thousands.
my_data %>%
  mutate(n_hens = ifelse(n_hens >= 1e8 & n_hens <= 1e10, n_hens, NA)) %>%
  ggplot(aes(x = n_hens/1000, y = n_eggs/1000)) +
  geom_point(na.rm = TRUE)

Covariation

A categorical and continuous variable

# Boxplots of eggs per hen, one box per production process.
ggplot(data = my_data, mapping = aes(x = prod_process, y = Eggs_Per_Hen)) +
  geom_boxplot()

Two categorical variables

# Heatmap of how often each product type / production process pair occurs.
# count() adds the column `n`; mapping it to `fill` is what lets the tiles
# differ. Without that mapping every tile is drawn in the same colour and the
# counts computed above never appear in the plot.
my_data %>%
  count(prod_type, prod_process) %>%
  ggplot(aes(x = prod_type, y = prod_process)) +
  geom_tile(aes(fill = n))

Two continuous variables

# Hexagonal-bin density plot of eggs against hens; both axes are shown in
# thousands. geom_hex() relies on the hexbin package loaded here.
library(hexbin)
ggplot(data = my_data, mapping = aes(x = n_hens/1000, y = n_eggs/1000)) +
  geom_hex()

Patterns and models

# Scatterplot of eggs against hens on the raw (unscaled) counts.
my_data %>%
  ggplot(aes(x = n_hens, y = n_eggs)) +
  geom_point()

library(modelr)

# Fit eggs as a function of hens on the log-log scale. The response is n_eggs
# and the predictor is n_hens, matching the scatterplot of n_eggs against
# n_hens, so the residual plot below shows what is left in n_eggs once the
# effect of n_hens is removed. (Previously the formula was reversed, which
# meant the residuals were plotted against the model's own response.)
mod <- lm(log(n_eggs) ~ log(n_hens), data = my_data)

# add_residuals() appends a `resid` column on the log scale; exp() turns it
# back into a multiplicative ratio of observed to predicted egg counts.
my_data2 <- my_data %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))

# Residuals against the predictor: any remaining pattern is variation in
# n_eggs that n_hens does not explain.
ggplot(data = my_data2) +
  geom_point(mapping = aes(x = n_hens, y = resid))