Import data

# excel file
data <- read_excel("../00_data/MyData-Charts.xlsx")
data
## # A tibble: 1,222 × 11
##     year months    state colon…¹ colon…² colon…³ colon…⁴ colon…⁵ colon…⁶ colon…⁷
##    <dbl> <chr>     <chr>   <dbl> <chr>     <dbl>   <dbl> <chr>   <chr>   <chr>  
##  1  2015 January-… Alab…    7000 7000       1800      26 2800    250     4      
##  2  2015 January-… Ariz…   35000 35000      4600      13 3400    2100    6      
##  3  2015 January-… Arka…   13000 14000      1500      11 1200    90      1      
##  4  2015 January-… Cali… 1440000 1690000  255000      15 250000  124000  7      
##  5  2015 January-… Colo…    3500 12500      1500      12 200     140     1      
##  6  2015 January-… Conn…    3900 3900        870      22 290     NA      NA     
##  7  2015 January-… Flor…  305000 315000    42000      13 54000   25000   8      
##  8  2015 January-… Geor…  104000 105000    14500      14 47000   9500    9      
##  9  2015 January-… Hawa…   10500 10500       380       4 3400    760     7      
## 10  2015 January-… Idaho   81000 88000      3700       4 2600    8000    9      
## # … with 1,212 more rows, 1 more variable: `Growth of colonies` <dbl>, and
## #   abbreviated variable names ¹​colony_n, ²​colony_max, ³​colony_lost,
## #   ⁴​colony_lost_pct, ⁵​colony_added, ⁶​colony_reno, ⁷​colony_reno_pct

Apply the following dplyr verbs to your data

Filter rows

filter(data, year == "2020", months == "January-March")
## # A tibble: 47 × 11
##     year months    state colon…¹ colon…² colon…³ colon…⁴ colon…⁵ colon…⁶ colon…⁷
##    <dbl> <chr>     <chr>   <dbl> <chr>     <dbl>   <dbl> <chr>   <chr>   <chr>  
##  1  2020 January-… Alab…    5500 8000        410       5 1000    780     10     
##  2  2020 January-… Ariz…   29000 32000      6500      20 150     120     NA     
##  3  2020 January-… Arka…   16500 17500      1200       7 700     30      NA     
##  4  2020 January-… Cali… 1350000 1710000  230000      13 240000  66000   4      
##  5  2020 January-… Colo…    4500 16000       650       4 850     500     3      
##  6  2020 January-… Conn…    3700 4100        330       8 60      NA      NA     
##  7  2020 January-… Flor…  295000 310000    30000      10 41000   8000    3      
##  8  2020 January-… Geor…  115000 121000    14500      12 32000   22000   18     
##  9  2020 January-… Hawa…   16000 16000      1500       9 1000    2200    14     
## 10  2020 January-… Idaho  156000 168000    18500      11 4400    920     1      
## # … with 37 more rows, 1 more variable: `Growth of colonies` <dbl>, and
## #   abbreviated variable names ¹​colony_n, ²​colony_max, ³​colony_lost,
## #   ⁴​colony_lost_pct, ⁵​colony_added, ⁶​colony_reno, ⁷​colony_reno_pct

Arrange rows

arrange(data, desc(colony_lost_pct))
## # A tibble: 1,222 × 11
##     year months    state colon…¹ colon…² colon…³ colon…⁴ colon…⁵ colon…⁶ colon…⁷
##    <dbl> <chr>     <chr>   <dbl> <chr>     <dbl>   <dbl> <chr>   <chr>   <chr>  
##  1  2018 January-… New …    7500 7500       3900      52 180     NA      NA     
##  2  2015 January-… Ohio    18000 22000     10500      48 5000    3300    15     
##  3  2016 January-… Okla…    6000 6000       2900      48 60      510     9      
##  4  2020 April-Ju… Alab…    8500 8500       4100      48 4700    2620    31     
##  5  2017 January-… Mass…    3300 3300       1500      45 120     110     3      
##  6  2015 January-… Mary…    7500 10000      4100      41 1600    870     9      
##  7  2015 January-… Illi…    6000 10500      4200      40 2300    390     4      
##  8  2015 October-… Kans…    8500 8500       3400      40 50      20      NA     
##  9  2015 January-… Kent…    7500 10500      4100      39 250     NA      NA     
## 10  2017 January-… New …    5500 5500       2100      38 3600    NA      NA     
## # … with 1,212 more rows, 1 more variable: `Growth of colonies` <dbl>, and
## #   abbreviated variable names ¹​colony_n, ²​colony_max, ³​colony_lost,
## #   ⁴​colony_lost_pct, ⁵​colony_added, ⁶​colony_reno, ⁷​colony_reno_pct

Select columns

select(data, state, colony_lost_pct, colony_added, starts_with("abc"))
## # A tibble: 1,222 × 3
##    state       colony_lost_pct colony_added
##    <chr>                 <dbl> <chr>       
##  1 Alabama                  26 2800        
##  2 Arizona                  13 3400        
##  3 Arkansas                 11 1200        
##  4 California               15 250000      
##  5 Colorado                 12 200         
##  6 Connecticut              22 290         
##  7 Florida                  13 54000       
##  8 Georgia                  14 47000       
##  9 Hawaii                    4 3400        
## 10 Idaho                     4 2600        
## # … with 1,212 more rows

Add columns

plot_data <- data %>% group_by(year, state) %>% summarise(avg_pct = mean(colony_lost_pct, na.rm = TRUE)) %>% filter(year == 2021) %>% arrange(-avg_pct) %>% slice(1:10)

plot_data %>% ggplot(aes(x = avg_pct, y = fct_reorder(state, avg_pct))) + geom_col()

Sumarise

summarise(data, delay = mean(colony_lost_pct, na.rm = TRUE))
## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1  11.4

Sumarise by Group

by_day <- group_by(data, year, months, state)
summarise(by_day, delay = mean(colony_lost_pct, na.rm = TRUE))
## # A tibble: 1,222 × 4
## # Groups:   year, months [26]
##     year months     state       delay
##    <dbl> <chr>      <chr>       <dbl>
##  1  2015 April-June Alabama        12
##  2  2015 April-June Arizona        17
##  3  2015 April-June Arkansas       14
##  4  2015 April-June California     10
##  5  2015 April-June Colorado        8
##  6  2015 April-June Connecticut     3
##  7  2015 April-June Florida        16
##  8  2015 April-June Georgia        19
##  9  2015 April-June Hawaii          1
## 10  2015 April-June Idaho           9
## # … with 1,212 more rows