Import your data

# Read the TidyTuesday 2022-01-11 colony CSV straight from GitHub.
colony <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv"
)
## Rows: 1222 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): months, state
## dbl (8): year, colony_n, colony_max, colony_lost, colony_lost_pct, colony_ad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Chapter 15

Create a factor

# Quarter labels exactly as they appear in the data.
months <- c("January-March", "April-June", "July-September", "October-December")
months
## [1] "January-March"    "April-June"       "July-September"   "October-December"
# Levels must match the data values exactly: factor() silently turns any value
# that is not listed in `levels` into NA. The values are quarters, so the
# levels are the quarter labels, listed in calendar order.
month_levels <- c(
  "January-March", "April-June", "July-September", "October-December"
)

colony_months <- factor(months, levels = month_levels)
colony_months
## [1] January-March    April-June       July-September   October-December
## Levels: January-March April-June July-September October-December

Modify factor order

# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)

# Draw nine random rows and keep only the columns used in the examples below.
colony_small <- colony %>%
  sample_n(size = 9) %>%
  select(year, state, colony_lost, colony_added)

colony_small
## # A tibble: 9 × 4
##    year state      colony_lost colony_added
##   <dbl> <chr>            <dbl>        <dbl>
## 1  2017 Utah              2700         2900
## 2  2017 Vermont            170          390
## 3  2015 Texas            25000        13000
## 4  2017 Hawaii             130          970
## 5  2016 Florida          45000        36000
## 6  2019 Wyoming           3300          100
## 7  2021 Kansas            1400         2300
## 8  2020 California       69000        61000
## 9  2018 Florida          30000        53000

Before

# Per-state means of added and lost colonies, plus the number of rows per state.
loss_summary <- colony_small %>%
  group_by(state) %>%
  summarise(
    colony_added = mean(colony_added, na.rm = TRUE),
    colony_lost = mean(colony_lost, na.rm = TRUE),
    n = n()
  )

# Plot mean losses per state, leaving the states in the scale's default order.
ggplot(data = loss_summary, mapping = aes(x = colony_lost, y = state)) +
  geom_point()

After

# Same plot, but with the states ordered on the y axis by their colony_lost value.
ggplot(
  data = loss_summary,
  mapping = aes(x = colony_lost, y = fct_reorder(state, colony_lost))
) +
  geom_point()

Modify factor levels

Recode

# List the distinct values of the months column.
distinct(colony, months)
## # A tibble: 4 × 1
##   months          
##   <chr>           
## 1 January-March   
## 2 April-June      
## 3 July-September  
## 4 October-December
# Relabel the "January-March" level, then show only the rows it applies to.
colony %>%
  mutate(
    Jan_group = fct_recode(months, "Jan, Feb, Mar" = "January-March")
  ) %>%
  select(months, Jan_group) %>%
  filter(months == "January-March")
## # A tibble: 329 × 2
##    months        Jan_group    
##    <chr>         <fct>        
##  1 January-March Jan, Feb, Mar
##  2 January-March Jan, Feb, Mar
##  3 January-March Jan, Feb, Mar
##  4 January-March Jan, Feb, Mar
##  5 January-March Jan, Feb, Mar
##  6 January-March Jan, Feb, Mar
##  7 January-March Jan, Feb, Mar
##  8 January-March Jan, Feb, Mar
##  9 January-March Jan, Feb, Mar
## 10 January-March Jan, Feb, Mar
## # … with 319 more rows

Collapse

  • fct_collapse
# Merge Texas and Florida into a single "Southern state" level.
colony_small %>%
  mutate(
    state_col = fct_collapse(state, "Southern state" = c("Texas", "Florida"))
  ) %>%
  select(state, state_col)
## # A tibble: 9 × 2
##   state      state_col     
##   <chr>      <fct>         
## 1 Utah       Utah          
## 2 Vermont    Vermont       
## 3 Texas      Southern state
## 4 Hawaii     Hawaii        
## 5 Florida    Southern state
## 6 Wyoming    Wyoming       
## 7 Kansas     Kansas        
## 8 California California    
## 9 Florida    Southern state

Lump

# Number of sampled rows per state.
count(colony_small, state)
## # A tibble: 8 × 2
##   state          n
##   <chr>      <int>
## 1 California     1
## 2 Florida        2
## 3 Hawaii         1
## 4 Kansas         1
## 5 Texas          1
## 6 Utah           1
## 7 Vermont        1
## 8 Wyoming        1
# A bare fct_lump() leaves every state as its own level for these counts, so
# nothing gets lumped. Keep only the single most frequent state instead and
# gather all the others into "Other".
colony_small %>%
  mutate(state_lump = fct_lump_n(state, n = 1)) %>%
  distinct(state_lump)
## # A tibble: 2 × 1
##   state_lump
##   <fct>     
## 1 Other     
## 2 Florida   

Chapter 16

No need to do anything here.