Import data

# excel file
data <- read_excel("../00_data/MyData3.xlsx")
data
## # A tibble: 1,222 × 10
##     year months        state  colony_size colony_max colony_lost colony_lost_pct
##    <dbl> <chr>         <chr>        <dbl> <chr>            <dbl>           <dbl>
##  1  2015 January-March Alaba…        7000 7000              1800              26
##  2  2015 January-March Arizo…       35000 35000             4600              13
##  3  2015 January-March Arkan…       13000 14000             1500              11
##  4  2015 January-March Calif…     1440000 1690000         255000              15
##  5  2015 January-March Color…        3500 12500             1500              12
##  6  2015 January-March Conne…        3900 3900               870              22
##  7  2015 January-March Flori…      305000 315000           42000              13
##  8  2015 January-March Georg…      104000 105000           14500              14
##  9  2015 January-March Hawaii       10500 10500              380               4
## 10  2015 January-March Idaho        81000 88000             3700               4
## # ℹ 1,212 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## #   colony_reno_pct <chr>

Apply the following dplyr verbs to your data

Filter rows

filter(data, year == 2015, months == "January-March")
## # A tibble: 47 × 10
##     year months        state  colony_size colony_max colony_lost colony_lost_pct
##    <dbl> <chr>         <chr>        <dbl> <chr>            <dbl>           <dbl>
##  1  2015 January-March Alaba…        7000 7000              1800              26
##  2  2015 January-March Arizo…       35000 35000             4600              13
##  3  2015 January-March Arkan…       13000 14000             1500              11
##  4  2015 January-March Calif…     1440000 1690000         255000              15
##  5  2015 January-March Color…        3500 12500             1500              12
##  6  2015 January-March Conne…        3900 3900               870              22
##  7  2015 January-March Flori…      305000 315000           42000              13
##  8  2015 January-March Georg…      104000 105000           14500              14
##  9  2015 January-March Hawaii       10500 10500              380               4
## 10  2015 January-March Idaho        81000 88000             3700               4
## # ℹ 37 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## #   colony_reno_pct <chr>

Arrange rows

arrange(data, colony_size)
## # A tibble: 1,222 × 10
##     year months         state colony_size colony_max colony_lost colony_lost_pct
##    <dbl> <chr>          <chr>       <dbl> <chr>            <dbl>           <dbl>
##  1  2020 January-March  Maine        1300 1700               130               8
##  2  2020 April-June     Maine        1600 22000             1300               6
##  3  2017 April-June     Mass…        1900 8000               130               2
##  4  2018 January-March  Maine        1900 1900               240              13
##  5  2018 April-June     New …        1900 4300                50               1
##  6  2021 April-June     Maine        2000 18000             1100               6
##  7  2018 July-September Okla…        2100 2100               370              18
##  8  2016 January-March  Maine        2200 3700               170               5
##  9  2017 April-June     Maine        2200 40000              910               2
## 10  2018 April-June     Maine        2200 17000             1100               6
## # ℹ 1,212 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## #   colony_reno_pct <chr>

Select columns

select(data, year, months, state, colony_size, colony_lost, colony_reno)
## # A tibble: 1,222 × 6
##     year months        state       colony_size colony_lost colony_reno
##    <dbl> <chr>         <chr>             <dbl>       <dbl> <chr>      
##  1  2015 January-March Alabama            7000        1800 250        
##  2  2015 January-March Arizona           35000        4600 2100       
##  3  2015 January-March Arkansas          13000        1500 90         
##  4  2015 January-March California      1440000      255000 124000     
##  5  2015 January-March Colorado           3500        1500 140        
##  6  2015 January-March Connecticut        3900         870 NA         
##  7  2015 January-March Florida          305000       42000 25000      
##  8  2015 January-March Georgia          104000       14500 9500       
##  9  2015 January-March Hawaii            10500         380 760        
## 10  2015 January-March Idaho             81000        3700 8000       
## # ℹ 1,212 more rows
select(data, year, months, state, contains("colony"))
## # A tibble: 1,222 × 10
##     year months        state  colony_size colony_max colony_lost colony_lost_pct
##    <dbl> <chr>         <chr>        <dbl> <chr>            <dbl>           <dbl>
##  1  2015 January-March Alaba…        7000 7000              1800              26
##  2  2015 January-March Arizo…       35000 35000             4600              13
##  3  2015 January-March Arkan…       13000 14000             1500              11
##  4  2015 January-March Calif…     1440000 1690000         255000              15
##  5  2015 January-March Color…        3500 12500             1500              12
##  6  2015 January-March Conne…        3900 3900               870              22
##  7  2015 January-March Flori…      305000 315000           42000              13
##  8  2015 January-March Georg…      104000 105000           14500              14
##  9  2015 January-March Hawaii       10500 10500              380               4
## 10  2015 January-March Idaho        81000 88000             3700               4
## # ℹ 1,212 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## #   colony_reno_pct <chr>
select(data, year, months, state, ends_with("pct"))
## # A tibble: 1,222 × 5
##     year months        state       colony_lost_pct colony_reno_pct
##    <dbl> <chr>         <chr>                 <dbl> <chr>          
##  1  2015 January-March Alabama                  26 4              
##  2  2015 January-March Arizona                  13 6              
##  3  2015 January-March Arkansas                 11 1              
##  4  2015 January-March California               15 7              
##  5  2015 January-March Colorado                 12 1              
##  6  2015 January-March Connecticut              22 NA             
##  7  2015 January-March Florida                  13 8              
##  8  2015 January-March Georgia                  14 9              
##  9  2015 January-March Hawaii                    4 7              
## 10  2015 January-March Idaho                     4 9              
## # ℹ 1,212 more rows

Add columns

mutate(data,
       remaining_colony = colony_size - colony_lost) %>%
    
    # Select year, months, state and remaining_colony
    select(year:state, remaining_colony)
## # A tibble: 1,222 × 4
##     year months        state       remaining_colony
##    <dbl> <chr>         <chr>                  <dbl>
##  1  2015 January-March Alabama                 5200
##  2  2015 January-March Arizona                30400
##  3  2015 January-March Arkansas               11500
##  4  2015 January-March California           1185000
##  5  2015 January-March Colorado                2000
##  6  2015 January-March Connecticut             3030
##  7  2015 January-March Florida               263000
##  8  2015 January-March Georgia                89500
##  9  2015 January-March Hawaii                 10120
## 10  2015 January-March Idaho                  77300
## # ℹ 1,212 more rows
# Just keep remaining_colony
mutate(data,
       remaining_colony = colony_size - colony_lost) %>%
    # Select year, months, state and remaining_colony
    select(remaining_colony)
## # A tibble: 1,222 × 1
##    remaining_colony
##               <dbl>
##  1             5200
##  2            30400
##  3            11500
##  4          1185000
##  5             2000
##  6             3030
##  7           263000
##  8            89500
##  9            10120
## 10            77300
## # ℹ 1,212 more rows
# alternative using transmute()
transmute(data,
          remaining_colony = colony_size - colony_lost)
## # A tibble: 1,222 × 1
##    remaining_colony
##               <dbl>
##  1             5200
##  2            30400
##  3            11500
##  4          1185000
##  5             2000
##  6             3030
##  7           263000
##  8            89500
##  9            10120
## 10            77300
## # ℹ 1,212 more rows
# lag()
select(data, colony_lost) %>%
    
    mutate(colony_lost_lag1 = lag(colony_lost))
## # A tibble: 1,222 × 2
##    colony_lost colony_lost_lag1
##          <dbl>            <dbl>
##  1        1800               NA
##  2        4600             1800
##  3        1500             4600
##  4      255000             1500
##  5        1500           255000
##  6         870             1500
##  7       42000              870
##  8       14500            42000
##  9         380            14500
## 10        3700              380
## # ℹ 1,212 more rows

Summarize by groups

Collapsing data to a single row

data
## # A tibble: 1,222 × 10
##     year months        state  colony_size colony_max colony_lost colony_lost_pct
##    <dbl> <chr>         <chr>        <dbl> <chr>            <dbl>           <dbl>
##  1  2015 January-March Alaba…        7000 7000              1800              26
##  2  2015 January-March Arizo…       35000 35000             4600              13
##  3  2015 January-March Arkan…       13000 14000             1500              11
##  4  2015 January-March Calif…     1440000 1690000         255000              15
##  5  2015 January-March Color…        3500 12500             1500              12
##  6  2015 January-March Conne…        3900 3900               870              22
##  7  2015 January-March Flori…      305000 315000           42000              13
##  8  2015 January-March Georg…      104000 105000           14500              14
##  9  2015 January-March Hawaii       10500 10500              380               4
## 10  2015 January-March Idaho        81000 88000             3700               4
## # ℹ 1,212 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## #   colony_reno_pct <chr>
# average colony loss
summarise(data, loss = mean(colony_lost, na.rm = TRUE))
## # A tibble: 1 × 1
##     loss
##    <dbl>
## 1 16551.

Summarize by group

Which state has the biggest loss?

data %>%
    
    # Group by state
    group_by(state) %>%
    
    # Calculation of average colony loss
    summarise(loss = mean(colony_lost, na.rm = TRUE)) %>%
    
    # Sort it
    arrange(loss)
## # A tibble: 47 × 2
##    state          loss
##    <chr>         <dbl>
##  1 Connecticut    258 
##  2 Vermont        346 
##  3 New Jersey     690 
##  4 Massachusetts  731.
##  5 Hawaii         833.
##  6 Maine          877.
##  7 West Virginia  907.
##  8 Missouri       916.
##  9 Maryland      1004.
## 10 Virginia      1015.
## # ℹ 37 more rows

Missing values

data %>%
    
    # Remove missing values
    filter(!is.na(colony_size))
## # A tibble: 1,175 × 10
##     year months        state  colony_size colony_max colony_lost colony_lost_pct
##    <dbl> <chr>         <chr>        <dbl> <chr>            <dbl>           <dbl>
##  1  2015 January-March Alaba…        7000 7000              1800              26
##  2  2015 January-March Arizo…       35000 35000             4600              13
##  3  2015 January-March Arkan…       13000 14000             1500              11
##  4  2015 January-March Calif…     1440000 1690000         255000              15
##  5  2015 January-March Color…        3500 12500             1500              12
##  6  2015 January-March Conne…        3900 3900               870              22
##  7  2015 January-March Flori…      305000 315000           42000              13
##  8  2015 January-March Georg…      104000 105000           14500              14
##  9  2015 January-March Hawaii       10500 10500              380               4
## 10  2015 January-March Idaho        81000 88000             3700               4
## # ℹ 1,165 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## #   colony_reno_pct <chr>
data <- data %>% 
  filter(!is.na(colony_size), !is.na(colony_max))

data %>% 
  group_by(year, months, state) %>% 
  summarise(mean = mean(colony_size))
## # A tibble: 1,175 × 4
## # Groups:   year, months [25]
##     year months     state          mean
##    <dbl> <chr>      <chr>         <dbl>
##  1  2015 April-June Alabama        7500
##  2  2015 April-June Arizona       33000
##  3  2015 April-June Arkansas      13500
##  4  2015 April-June California  1040000
##  5  2015 April-June Colorado      12000
##  6  2015 April-June Connecticut    3000
##  7  2015 April-June Florida      210000
##  8  2015 April-June Georgia      114000
##  9  2015 April-June Hawaii        13500
## 10  2015 April-June Idaho         62000
## # ℹ 1,165 more rows

useful summary functions

What is the biggest and smallest colony?

data %>% 
  group_by(year, months, state) %>% 
  summarise(
    first = min(colony_size),
    last = max(colony_size)
  )
## # A tibble: 1,175 × 5
## # Groups:   year, months [25]
##     year months     state         first    last
##    <dbl> <chr>      <chr>         <dbl>   <dbl>
##  1  2015 April-June Alabama        7500    7500
##  2  2015 April-June Arizona       33000   33000
##  3  2015 April-June Arkansas      13500   13500
##  4  2015 April-June California  1040000 1040000
##  5  2015 April-June Colorado      12000   12000
##  6  2015 April-June Connecticut    3000    3000
##  7  2015 April-June Florida      210000  210000
##  8  2015 April-June Georgia      114000  114000
##  9  2015 April-June Hawaii        13500   13500
## 10  2015 April-June Idaho         62000   62000
## # ℹ 1,165 more rows

Which state has the most losses?

data %>% 
  group_by(state) %>% 
  summarise(colony_lost = n_distinct(colony_lost)) %>% 
  arrange(desc(colony_lost))
## # A tibble: 47 × 2
##    state         colony_lost
##    <chr>               <int>
##  1 United States          25
##  2 California             24
##  3 Minnesota              24
##  4 Oklahoma               24
##  5 Hawaii                 23
##  6 North Dakota           23
##  7 Kansas                 22
##  8 Maine                  22
##  9 New Jersey             22
## 10 Texas                  22
## # ℹ 37 more rows

Grouping multiple variables

data %>%
    group_by(year, state, colony_lost) %>%
    summarize(count = n()) %>%
    ungroup()
## # A tibble: 1,150 × 4
##     year state    colony_lost count
##    <dbl> <chr>          <dbl> <int>
##  1  2015 Alabama          610     1
##  2  2015 Alabama          860     1
##  3  2015 Alabama         1400     1
##  4  2015 Alabama         1800     1
##  5  2015 Arizona         4600     1
##  6  2015 Arizona         5500     1
##  7  2015 Arizona         6000     1
##  8  2015 Arizona        12000     1
##  9  2015 Arkansas        1500     1
## 10  2015 Arkansas        1900     1
## # ℹ 1,140 more rows