Import data
# Read the Excel workbook into `my_data`, then auto-print it for inspection.
my_data <- read_excel(path = "../00_data/myData_all.xlsx")
my_data
## # A tibble: 220 × 7
## observed_month prod_type prod_process n_hens n_eggs source Eggs_Per_Hen
## <dttm> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 2016-07-31 00:00:00 hatching … all 5.80e7 1.15e9 ChicE… 19.8
## 2 2016-08-31 00:00:00 hatching … all 5.76e7 1.14e9 ChicE… 19.8
## 3 2016-09-30 00:00:00 hatching … all 5.72e7 1.09e9 ChicE… 19.1
## 4 2016-10-31 00:00:00 hatching … all 5.69e7 1.13e9 ChicE… 19.8
## 5 2016-11-30 00:00:00 hatching … all 5.71e7 1.10e9 ChicE… 19.2
## 6 2016-12-31 00:00:00 hatching … all 5.77e7 1.13e9 ChicE… 19.6
## 7 2017-01-31 00:00:00 hatching … all 5.80e7 1.12e9 ChicE… 19.4
## 8 2017-02-28 00:00:00 hatching … all 5.83e7 1.01e9 ChicE… 17.4
## 9 2017-03-31 00:00:00 hatching … all 5.87e7 1.13e9 ChicE… 19.2
## 10 2017-04-30 00:00:00 hatching … all 5.91e7 1.10e9 ChicE… 18.6
## # ℹ 210 more rows
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: number of rows in `my_data` for each value of `prod_process`.
ggplot(data = my_data, mapping = aes(x = prod_process)) +
  geom_bar()

# Frequency polygons of egg counts, one coloured line per `prod_process`.
# `n_eggs` is divided by 1000, so the x-axis is expressed in thousands.
ggplot(
  data = my_data,
  mapping = aes(x = n_eggs / 1000, colour = prod_process)
) +
  # Spell out the bin count instead of relying on the implicit stat_bin()
  # default: the plot is unchanged, but the "Pick better value with
  # `binwidth`" message is no longer emitted and the choice is visible.
  geom_freqpoly(bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values
# Histogram of `Eggs_Per_Hen` using bins that are 0.5 wide.
ggplot(data = my_data, mapping = aes(x = Eggs_Per_Hen)) +
  geom_histogram(binwidth = 0.5)

Unusual values
Missing values
# Scatter plot of eggs against hens; both columns are divided by 1000,
# so each axis is expressed in thousands.
ggplot(data = my_data) +
  geom_point(mapping = aes(x = n_hens / 1000, y = n_eggs / 1000))

# Scatter plot of eggs against hens (both divided by 1000, i.e. in thousands).
# `n_hens` is kept only when it lies in [1e3, 1e8]; values outside that
# range are replaced by NA and dropped silently by `na.rm = TRUE`.
my_data %>%
  mutate(n_hens = ifelse(n_hens >= 1e3 & n_hens <= 1e8, n_hens, NA)) %>%
  ggplot(mapping = aes(x = n_hens / 1000, y = n_eggs / 1000)) +
  geom_point(na.rm = TRUE)

# Scatter plot of eggs against hens (both divided by 1000, i.e. in thousands).
# `n_hens` is kept only when it lies in [1e8, 1e10]; values outside that
# range are replaced by NA and dropped silently by `na.rm = TRUE`.
my_data %>%
  mutate(n_hens = ifelse(n_hens >= 1e8 & n_hens <= 1e10, n_hens, NA)) %>%
  ggplot(mapping = aes(x = n_hens / 1000, y = n_eggs / 1000)) +
  geom_point(na.rm = TRUE)

Covariation
A categorical and continuous variable
# Box plots of `Eggs_Per_Hen`, one box per `prod_process`.
ggplot(data = my_data, mapping = aes(x = prod_process, y = Eggs_Per_Hen)) +
  geom_boxplot()

Two categorical variables
# Heat map of how often each (`prod_type`, `prod_process`) pair occurs.
my_data %>%
  # count() adds a column `n` holding the number of rows per combination.
  count(prod_type, prod_process) %>%
  ggplot(mapping = aes(x = prod_type, y = prod_process)) +
  # Map the count to the fill colour; without this every tile is drawn
  # identically and the plot only shows which combinations exist.
  geom_tile(mapping = aes(fill = n))

Two continuous variables
# Hexagonal-bin plot of eggs against hens; both columns are divided by
# 1000, so each axis is expressed in thousands.
library(hexbin)
ggplot(data = my_data) +
  geom_hex(mapping = aes(x = n_hens / 1000, y = n_eggs / 1000))

Patterns and models
# Scatter plot of raw egg counts against raw hen counts.
my_data %>%
  ggplot(mapping = aes(x = n_hens, y = n_eggs)) +
  geom_point()

library(modelr)

# Log-log linear model of egg count as a function of hen count. The
# response/predictor order matches the scatter plot of n_eggs (y) against
# n_hens (x); the formula was previously inverted, so the residual plot
# below was drawn against the model's own response.
mod <- lm(log(n_eggs) ~ log(n_hens), data = my_data)

# Append the model residuals as `resid`, then exponentiate them to undo
# the log transform applied to the response.
my_data2 <- my_data %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))

# Residuals against the predictor: whatever pattern is left in n_eggs
# after the fitted n_hens relationship has been removed.
ggplot(data = my_data2) +
  geom_point(mapping = aes(x = n_hens, y = resid))
