# excel file
data <- read_excel("../00_data/MyData3.xlsx")
data
## # A tibble: 1,222 × 10
## year months state colony_size colony_max colony_lost colony_lost_pct
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2015 January-March Alaba… 7000 7000 1800 26
## 2 2015 January-March Arizo… 35000 35000 4600 13
## 3 2015 January-March Arkan… 13000 14000 1500 11
## 4 2015 January-March Calif… 1440000 1690000 255000 15
## 5 2015 January-March Color… 3500 12500 1500 12
## 6 2015 January-March Conne… 3900 3900 870 22
## 7 2015 January-March Flori… 305000 315000 42000 13
## 8 2015 January-March Georg… 104000 105000 14500 14
## 9 2015 January-March Hawaii 10500 10500 380 4
## 10 2015 January-March Idaho 81000 88000 3700 4
## # ℹ 1,212 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## # colony_reno_pct <chr>
filter(data, year == 2015, months == "January-March")
## # A tibble: 47 × 10
## year months state colony_size colony_max colony_lost colony_lost_pct
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2015 January-March Alaba… 7000 7000 1800 26
## 2 2015 January-March Arizo… 35000 35000 4600 13
## 3 2015 January-March Arkan… 13000 14000 1500 11
## 4 2015 January-March Calif… 1440000 1690000 255000 15
## 5 2015 January-March Color… 3500 12500 1500 12
## 6 2015 January-March Conne… 3900 3900 870 22
## 7 2015 January-March Flori… 305000 315000 42000 13
## 8 2015 January-March Georg… 104000 105000 14500 14
## 9 2015 January-March Hawaii 10500 10500 380 4
## 10 2015 January-March Idaho 81000 88000 3700 4
## # ℹ 37 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## # colony_reno_pct <chr>
arrange(data, colony_size)
## # A tibble: 1,222 × 10
## year months state colony_size colony_max colony_lost colony_lost_pct
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020 January-March Maine 1300 1700 130 8
## 2 2020 April-June Maine 1600 22000 1300 6
## 3 2017 April-June Mass… 1900 8000 130 2
## 4 2018 January-March Maine 1900 1900 240 13
## 5 2018 April-June New … 1900 4300 50 1
## 6 2021 April-June Maine 2000 18000 1100 6
## 7 2018 July-September Okla… 2100 2100 370 18
## 8 2016 January-March Maine 2200 3700 170 5
## 9 2017 April-June Maine 2200 40000 910 2
## 10 2018 April-June Maine 2200 17000 1100 6
## # ℹ 1,212 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## # colony_reno_pct <chr>
select(data, year, months, state, colony_size, colony_lost, colony_reno)
## # A tibble: 1,222 × 6
## year months state colony_size colony_lost colony_reno
## <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 2015 January-March Alabama 7000 1800 250
## 2 2015 January-March Arizona 35000 4600 2100
## 3 2015 January-March Arkansas 13000 1500 90
## 4 2015 January-March California 1440000 255000 124000
## 5 2015 January-March Colorado 3500 1500 140
## 6 2015 January-March Connecticut 3900 870 NA
## 7 2015 January-March Florida 305000 42000 25000
## 8 2015 January-March Georgia 104000 14500 9500
## 9 2015 January-March Hawaii 10500 380 760
## 10 2015 January-March Idaho 81000 3700 8000
## # ℹ 1,212 more rows
select(data, year, months, state, contains("colony"))
## # A tibble: 1,222 × 10
## year months state colony_size colony_max colony_lost colony_lost_pct
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2015 January-March Alaba… 7000 7000 1800 26
## 2 2015 January-March Arizo… 35000 35000 4600 13
## 3 2015 January-March Arkan… 13000 14000 1500 11
## 4 2015 January-March Calif… 1440000 1690000 255000 15
## 5 2015 January-March Color… 3500 12500 1500 12
## 6 2015 January-March Conne… 3900 3900 870 22
## 7 2015 January-March Flori… 305000 315000 42000 13
## 8 2015 January-March Georg… 104000 105000 14500 14
## 9 2015 January-March Hawaii 10500 10500 380 4
## 10 2015 January-March Idaho 81000 88000 3700 4
## # ℹ 1,212 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## # colony_reno_pct <chr>
select(data, year, months, state, ends_with("pct"))
## # A tibble: 1,222 × 5
## year months state colony_lost_pct colony_reno_pct
## <dbl> <chr> <chr> <dbl> <chr>
## 1 2015 January-March Alabama 26 4
## 2 2015 January-March Arizona 13 6
## 3 2015 January-March Arkansas 11 1
## 4 2015 January-March California 15 7
## 5 2015 January-March Colorado 12 1
## 6 2015 January-March Connecticut 22 NA
## 7 2015 January-March Florida 13 8
## 8 2015 January-March Georgia 14 9
## 9 2015 January-March Hawaii 4 7
## 10 2015 January-March Idaho 4 9
## # ℹ 1,212 more rows
mutate(data,
remaining_colony = colony_size - colony_lost) %>%
# Select year, months, state and remaining_colony
select(year:state, remaining_colony)
## # A tibble: 1,222 × 4
## year months state remaining_colony
## <dbl> <chr> <chr> <dbl>
## 1 2015 January-March Alabama 5200
## 2 2015 January-March Arizona 30400
## 3 2015 January-March Arkansas 11500
## 4 2015 January-March California 1185000
## 5 2015 January-March Colorado 2000
## 6 2015 January-March Connecticut 3030
## 7 2015 January-March Florida 263000
## 8 2015 January-March Georgia 89500
## 9 2015 January-March Hawaii 10120
## 10 2015 January-March Idaho 77300
## # ℹ 1,212 more rows
# Just keep remaining_colony
mutate(data,
remaining_colony = colony_size - colony_lost) %>%
# Select year, months, state and remaining_colony
select(remaining_colony)
## # A tibble: 1,222 × 1
## remaining_colony
## <dbl>
## 1 5200
## 2 30400
## 3 11500
## 4 1185000
## 5 2000
## 6 3030
## 7 263000
## 8 89500
## 9 10120
## 10 77300
## # ℹ 1,212 more rows
# alternative using transmute()
transmute(data,
remaining_colony = colony_size - colony_lost)
## # A tibble: 1,222 × 1
## remaining_colony
## <dbl>
## 1 5200
## 2 30400
## 3 11500
## 4 1185000
## 5 2000
## 6 3030
## 7 263000
## 8 89500
## 9 10120
## 10 77300
## # ℹ 1,212 more rows
# lag()
select(data, colony_lost) %>%
mutate(colony_lost_lag1 = lag(colony_lost))
## # A tibble: 1,222 × 2
## colony_lost colony_lost_lag1
## <dbl> <dbl>
## 1 1800 NA
## 2 4600 1800
## 3 1500 4600
## 4 255000 1500
## 5 1500 255000
## 6 870 1500
## 7 42000 870
## 8 14500 42000
## 9 380 14500
## 10 3700 380
## # ℹ 1,212 more rows
Collapsing data to a single row
data
## # A tibble: 1,222 × 10
## year months state colony_size colony_max colony_lost colony_lost_pct
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2015 January-March Alaba… 7000 7000 1800 26
## 2 2015 January-March Arizo… 35000 35000 4600 13
## 3 2015 January-March Arkan… 13000 14000 1500 11
## 4 2015 January-March Calif… 1440000 1690000 255000 15
## 5 2015 January-March Color… 3500 12500 1500 12
## 6 2015 January-March Conne… 3900 3900 870 22
## 7 2015 January-March Flori… 305000 315000 42000 13
## 8 2015 January-March Georg… 104000 105000 14500 14
## 9 2015 January-March Hawaii 10500 10500 380 4
## 10 2015 January-March Idaho 81000 88000 3700 4
## # ℹ 1,212 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## # colony_reno_pct <chr>
# average colony loss
summarise(data, loss = mean(colony_lost, na.rm = TRUE))
## # A tibble: 1 × 1
## loss
## <dbl>
## 1 16551.
Summarize by group
Which state has the biggest loss?
data %>%
# Group by state
group_by(state) %>%
# Calculation of average colony loss
summarise(loss = mean(colony_lost, na.rm = TRUE)) %>%
# Sort it
arrange(loss)
## # A tibble: 47 × 2
## state loss
## <chr> <dbl>
## 1 Connecticut 258
## 2 Vermont 346
## 3 New Jersey 690
## 4 Massachusetts 731.
## 5 Hawaii 833.
## 6 Maine 877.
## 7 West Virginia 907.
## 8 Missouri 916.
## 9 Maryland 1004.
## 10 Virginia 1015.
## # ℹ 37 more rows
Missing values
data %>%
# Remove missing values
filter(!is.na(colony_size))
## # A tibble: 1,175 × 10
## year months state colony_size colony_max colony_lost colony_lost_pct
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2015 January-March Alaba… 7000 7000 1800 26
## 2 2015 January-March Arizo… 35000 35000 4600 13
## 3 2015 January-March Arkan… 13000 14000 1500 11
## 4 2015 January-March Calif… 1440000 1690000 255000 15
## 5 2015 January-March Color… 3500 12500 1500 12
## 6 2015 January-March Conne… 3900 3900 870 22
## 7 2015 January-March Flori… 305000 315000 42000 13
## 8 2015 January-March Georg… 104000 105000 14500 14
## 9 2015 January-March Hawaii 10500 10500 380 4
## 10 2015 January-March Idaho 81000 88000 3700 4
## # ℹ 1,165 more rows
## # ℹ 3 more variables: colony_added <chr>, colony_reno <chr>,
## # colony_reno_pct <chr>
data <- data %>%
filter(!is.na(colony_size), !is.na(colony_max))
data %>%
group_by(year, months, state) %>%
summarise(mean = mean(colony_size))
## # A tibble: 1,175 × 4
## # Groups: year, months [25]
## year months state mean
## <dbl> <chr> <chr> <dbl>
## 1 2015 April-June Alabama 7500
## 2 2015 April-June Arizona 33000
## 3 2015 April-June Arkansas 13500
## 4 2015 April-June California 1040000
## 5 2015 April-June Colorado 12000
## 6 2015 April-June Connecticut 3000
## 7 2015 April-June Florida 210000
## 8 2015 April-June Georgia 114000
## 9 2015 April-June Hawaii 13500
## 10 2015 April-June Idaho 62000
## # ℹ 1,165 more rows
useful summary functions
data %>%
group_by(year, months, state) %>%
summarise(
first = min(colony_size),
last = max(colony_size)
)
## # A tibble: 1,175 × 5
## # Groups: year, months [25]
## year months state first last
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 2015 April-June Alabama 7500 7500
## 2 2015 April-June Arizona 33000 33000
## 3 2015 April-June Arkansas 13500 13500
## 4 2015 April-June California 1040000 1040000
## 5 2015 April-June Colorado 12000 12000
## 6 2015 April-June Connecticut 3000 3000
## 7 2015 April-June Florida 210000 210000
## 8 2015 April-June Georgia 114000 114000
## 9 2015 April-June Hawaii 13500 13500
## 10 2015 April-June Idaho 62000 62000
## # ℹ 1,165 more rows
data %>%
group_by(state) %>%
summarise(colony_lost = n_distinct(colony_lost)) %>%
arrange(desc(colony_lost))
## # A tibble: 47 × 2
## state colony_lost
## <chr> <int>
## 1 United States 25
## 2 California 24
## 3 Minnesota 24
## 4 Oklahoma 24
## 5 Hawaii 23
## 6 North Dakota 23
## 7 Kansas 22
## 8 Maine 22
## 9 New Jersey 22
## 10 Texas 22
## # ℹ 37 more rows
Grouping multiple variables
data %>%
group_by(year, state, colony_lost) %>%
summarize(count = n()) %>%
ungroup()
## # A tibble: 1,150 × 4
## year state colony_lost count
## <dbl> <chr> <dbl> <int>
## 1 2015 Alabama 610 1
## 2 2015 Alabama 860 1
## 3 2015 Alabama 1400 1
## 4 2015 Alabama 1800 1
## 5 2015 Arizona 4600 1
## 6 2015 Arizona 5500 1
## 7 2015 Arizona 6000 1
## 8 2015 Arizona 12000 1
## 9 2015 Arkansas 1500 1
## 10 2015 Arkansas 1900 1
## # ℹ 1,140 more rows