Import Data

# Read the chart data from the Excel workbook.
data <- read_excel(path = "../00_data/MyData-Charts.xlsx")

# Show a preview of the imported table.
print(data)
## # A tibble: 1,222 × 11
##     year months    state colon…¹ colon…² colon…³ colon…⁴ colon…⁵ colon…⁶ colon…⁷
##    <dbl> <chr>     <chr>   <dbl> <chr>     <dbl>   <dbl> <chr>   <chr>   <chr>  
##  1  2015 January-… Alab…    7000 7000       1800      26 2800    250     4      
##  2  2015 January-… Ariz…   35000 35000      4600      13 3400    2100    6      
##  3  2015 January-… Arka…   13000 14000      1500      11 1200    90      1      
##  4  2015 January-… Cali… 1440000 1690000  255000      15 250000  124000  7      
##  5  2015 January-… Colo…    3500 12500      1500      12 200     140     1      
##  6  2015 January-… Conn…    3900 3900        870      22 290     NA      NA     
##  7  2015 January-… Flor…  305000 315000    42000      13 54000   25000   8      
##  8  2015 January-… Geor…  104000 105000    14500      14 47000   9500    9      
##  9  2015 January-… Hawa…   10500 10500       380       4 3400    760     7      
## 10  2015 January-… Idaho   81000 88000      3700       4 2600    8000    9      
## # … with 1,212 more rows, 1 more variable: `Growth of colonies` <dbl>, and
## #   abbreviated variable names ¹​colony_n, ²​colony_max, ³​colony_lost,
## #   ⁴​colony_lost_pct, ⁵​colony_added, ⁶​colony_reno, ⁷​colony_reno_pct

Introduction

Questions

Variation

# Bar chart: number of rows for each value of `months`.
data %>%
  ggplot(aes(x = months)) +
  geom_bar()

Visualizing distributions

# Distribution of a categorical variable: row counts per `months` value.
ggplot(data, aes(x = months)) +
  geom_bar()

# Distribution of a numeric variable: histogram of `colony_lost_pct`
# using the default binning.
data %>%
  ggplot(aes(x = colony_lost_pct)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 54 rows containing non-finite values (`stat_bin()`).

Typical values

# Histogram of `colony_lost_pct` to see which values are most common.
ggplot(data) +
  geom_histogram(aes(x = colony_lost_pct))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 54 rows containing non-finite values (`stat_bin()`).

Unusual values

# Same histogram, zoomed to counts 0-40 on the y axis so that sparsely
# populated bins become visible. coord_cartesian() zooms without dropping data.
data %>%
  ggplot(aes(x = colony_lost_pct)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 40))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 54 rows containing non-finite values (`stat_bin()`).

Missing Values

# Add column `y`: a copy of `colony_lost_pct` in which values outside the
# 3-20 range are replaced by NA (values inside the range are kept as-is).
data <- mutate(
  data,
  y = ifelse(
    colony_lost_pct >= 3 & colony_lost_pct <= 20,
    colony_lost_pct,
    NA
  )
)

Covariation

A categorical and continuous variable

# Frequency polygons of `colony_lost_pct`, one line per `months` value,
# with bins one unit wide.
data %>%
  ggplot(aes(x = colony_lost_pct, colour = months)) +
  geom_freqpoly(binwidth = 1)
## Warning: Removed 54 rows containing non-finite values (`stat_bin()`).

### Two categorical variables

# Tile plot of how many rows exist for each months/year combination;
# the fill colour encodes the count `n` produced by count().
ggplot(
  data = count(data, months, year),
  mapping = aes(x = months, y = year, fill = n)
) +
  geom_tile()

Two continuous variables

# 2D binned counts of colony_max against colony_lost.
# The imported tibble stores `colony_max` as character (<chr>), which ggplot2
# would place on a discrete axis. Convert it to numeric first so both axes are
# continuous. Strings that cannot be parsed as numbers become NA.
data %>%
  mutate(colony_max = as.numeric(colony_max)) %>%
  ggplot() +
  geom_bin2d(mapping = aes(x = colony_max, y = colony_lost))
## Warning: Removed 47 rows containing non-finite values (`stat_bin2d()`).

# install.packages("hexbin")
# Hexagonal binned counts of colony_max against colony_lost.
# `colony_max` is imported as character (<chr>), so it is converted to numeric
# before plotting; otherwise the x axis is treated as discrete. Strings that
# cannot be parsed as numbers become NA.
data %>%
  mutate(colony_max = as.numeric(colony_max)) %>%
  ggplot() +
  geom_hex(mapping = aes(x = colony_max, y = colony_lost))
## Warning: Removed 47 rows containing non-finite values (`stat_binhex()`).

Two continuous variables

library(hexbin)

# Hexagonal binned counts of colony_added against colony_lost.
# `colony_added` is imported as character (<chr>), so it is converted to
# numeric before plotting to get a continuous x axis. Strings that cannot be
# parsed as numbers become NA.
data %>%
  mutate(colony_added = as.numeric(colony_added)) %>%
  ggplot(aes(x = colony_added, y = colony_lost)) +
  geom_hex()
## Warning: Removed 47 rows containing non-finite values (`stat_binhex()`).

# Boxplots of colony_lost for colony_added cut into bins of width 0.5,
# restricted to rows where colony_added is below 20.
# `colony_added` is imported as character (<chr>). Comparing a character
# column with 20 compares strings rather than numeric values, and cut_width()
# needs numeric input, so the column is converted to numeric before filtering.
data %>%
  mutate(colony_added = as.numeric(colony_added)) %>%
  filter(colony_added < 20) %>%
  ggplot(aes(x = colony_added, y = colony_lost)) +
  geom_boxplot(aes(group = cut_width(colony_added, 0.5)))

Patterns and models

# One boxplot of `colony_lost_pct` per year; `year` is turned into a factor
# so each year gets its own box.
data %>%
  ggplot(aes(x = as.factor(year), y = colony_lost_pct)) +
  geom_boxplot()
## Warning: Removed 54 rows containing non-finite values (`stat_boxplot()`).

# Scatter plot of `colony_lost_pct` against `year`.
data %>%
  ggplot(aes(x = year, y = colony_lost_pct)) +
  geom_point()
## Warning: Removed 54 rows containing missing values (`geom_point()`).