library(tidyverse)
library(here)
library(janitor)
library(ggbeeswarm)
We can see that the data is fairly well organized, but the column names are spread across 3 or more rows and contain units/other metadata. We’ll take care of that with some tidyverse
cleaning!
# check what the columns represent
readr::read_csv(here("week9/honey-production/honeyraw_1998to2002.csv"), skip = 4) %>%
.[1:10,] %>%
select(-`1`, -`h`)
## Warning: Missing column names filled in: 'X3' [3], 'X6' [6], 'X7' [7]
## Parsed with column specification:
## cols(
## `1` = col_integer(),
## h = col_character(),
## X3 = col_character(),
## Honey = col_character(),
## Yield = col_character(),
## X6 = col_character(),
## X7 = col_character(),
## Average = col_character(),
## Value = col_character()
## )
## # A tibble: 10 x 7
## X3 Honey Yield X6 X7 Average Value
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 State Producing per Production Stocks Price per of
## 2 <NA> Colonies Colony <NA> Dec 15 2/ Pound Production
## 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 4 <NA> 1,000 Pounds 1,000 Pounds 1,000 Pounds Cents 1,000 Dolla~
## 5 AL 16 71 1136 159 72 818
## 6 AZ 55 60 3300 1485 64 2112
## 7 AR 53 65 3445 1688 59 2033
## 8 CA 450 83 37350 12326 62 23157
## 9 CO 27 72 1944 1594 70 1361
## 10 FL 230 98 22540 4508 64 14426
I originally pasted a tribble here with all the state names/abbreviations (using Miles McBain’s datapasta
package addin) but realized R has a built in name/abbreviations for states w/ state.name
and state.abb
.
# state_abbreviations and names built in
state.abb
## [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL" "IN"
## [15] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV"
## [29] "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN"
## [43] "TX" "UT" "VT" "VA" "WA" "WV" "WI" "WY"
state.name
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
# pasted dataframe w/ abbreviations
state_names <- tibble::tribble(
~state, ~abbreviation,
"ALABAMA", "AL",
"ALASKA", "AK",
"ARIZONA", "AZ",
"ARKANSAS", "AR",
"CALIFORNIA", "CA",
"COLORADO", "CO",
"CONNECTICUT", "CT",
"DELAWARE", "DE",
"FLORIDA", "FL",
"GEORGIA", "GA",
"HAWAII", "HI",
"IDAHO", "ID",
"ILLINOIS", "IL",
"INDIANA", "IN",
"IOWA", "IA",
"KANSAS", "KS",
"KENTUCKY", "KY",
"LOUISIANA", "LA",
"MAINE", "ME",
"MARYLAND", "MD",
"MASSACHUSETTS", "MA",
"MICHIGAN", "MI",
"MINNESOTA", "MN",
"MISSISSIPPI", "MS",
"MISSOURI", "MO",
"MONTANA", "MT",
"NEBRASKA", "NE",
"NEVADA", "NV",
"NEW HAMPSHIRE", "NH",
"NEW JERSEY", "NJ",
"NEW MEXICO", "NM",
"NEW YORK", "NY",
"NORTH CAROLINA", "NC",
"NORTH DAKOTA", "ND",
"OHIO", "OH",
"OKLAHOMA", "OK",
"OREGON", "OR",
"PENNSYLVANIA", "PA",
"RHODE ISLAND", "RI",
"SOUTH CAROLINA", "SC",
"SOUTH DAKOTA", "SD",
"TENNESSEE", "TN",
"TEXAS", "TX",
"UTAH", "UT",
"VERMONT", "VT",
"VIRGINIA", "VA",
"WASHINGTON", "WA",
"WEST VIRGINIA", "WV",
"WISCONSIN", "WI",
"WYOMING", "WY"
)
We’re reading in each of the dataframes and cleaning things up with janitor
and some tidyverse
calls.
honey98_02 <- readr::read_csv(here("week9/honey-production/honeyraw_1998to2002.csv"), skip = 8) %>%
clean_names() %>%
select(-x1, -u) %>%
rename(state = x3) %>%
filter(state %in% state.abb)
honey03_07 <- readr::read_csv(here("week9/honey-production/honeyraw_2003to2007.csv"), skip = 80) %>%
clean_names() %>%
select(-x1, -u) %>%
rename(state = x3) %>%
filter(state %in% state.abb)
honey08_12 <- readr::read_csv(here("week9/honey-production/honeyraw_2008to2012.csv"), skip = 71) %>%
clean_names() %>%
select(-x1, -u) %>%
mutate(state = state.abb[match(x3, state.name)]) %>%
select(-x3) %>%
filter(state %in% state.abb)
Now that we have the somewhat clean dataframes we will merge via bind_rows to form one dataframe and then fully tidy and assign years based on the start and stop for each set of honey production. We know that the first state is always AL and the last state is always WY, so we can build up a logical set of vectors to repeat the years across.
clean_df <- honey98_02 %>%
bind_rows(honey03_07, honey08_12)
honey_df <- clean_df %>%
rename(bee_colonies = x1_000,
honey_per_colony = pounds,
honey_produced = x1_000_pounds,
honey_stock = x1_000_pounds_1,
honey_price = cents,
honey_production_value = x1_000_dollars) %>%
mutate_at(2:7, as.numeric) %>%
mutate(bee_colonies = bee_colonies * 1000,
honey_produced = honey_produced * 1000,
honey_stock = honey_stock * 1000,
honey_price = honey_price/100,
honey_production_value = honey_production_value * 1000)
ystart <- c(grep("AL", honey_df$state))
yend <- c(grep("WY", honey_df$state))
years <- 1998:2012
ynums <- yend - ystart + 1
honey_df$year <- rep(years, ynums)
# look at df
glimpse(honey_df)
## Observations: 626
## Variables: 8
## $ state <chr> "AL", "AZ", "AR", "CA", "CO", "FL", "GA...
## $ bee_colonies <dbl> 16000, 55000, 53000, 450000, 27000, 230...
## $ honey_per_colony <dbl> 71, 60, 65, 83, 72, 98, 56, 118, 50, 71...
## $ honey_produced <dbl> 1136000, 3300000, 3445000, 37350000, 19...
## $ honey_stock <dbl> 159000, 1485000, 1688000, 12326000, 159...
## $ honey_price <dbl> 0.72, 0.64, 0.59, 0.62, 0.70, 0.64, 0.6...
## $ honey_production_value <dbl> 818000, 2112000, 2033000, 23157000, 136...
## $ year <int> 1998, 1998, 1998, 1998, 1998, 1998, 199...
Just a quick sanity check to see how many states were recorded per year. Some states were missing from my initial lookthrough the data w/ View
.
honey_df %>%
group_by(year) %>% count()
## # A tibble: 15 x 2
## # Groups: year [15]
## year n
## <int> <int>
## 1 1998 43
## 2 1999 43
## 3 2000 43
## 4 2001 44
## 5 2002 44
## 6 2003 44
## 7 2004 41
## 8 2005 41
## 9 2006 41
## 10 2007 41
## 11 2008 41
## 12 2009 40
## 13 2010 40
## 14 2011 40
## 15 2012 40
According to the article, Beekeepers are shifting from using bees primarily in honey production to selling colonies and pollination, this is reflected in the number of colonies staying fairly stable while the honey production/colony is decreasing across the timeframe.
(bee_colonies <- honey_df %>%
ggplot(aes(x = year, y = bee_colonies)) +
geom_beeswarm(shape = 21, color = "black", fill = "yellow", size = 2) +
stat_summary(fun.y = "median", colour = "black", size = 4, geom = "point", alpha = 0.5) +
stat_summary(fun.y = "median", colour = "black", size = 1, geom = "line", alpha = 0.5) +
labs(x = "\nYear",
y = "Number of Colonies\n",
title = "Beekeepers are shifting from honey production to pollination",
subtitle = "The number of colonies is relatively stable over time",
caption = "\nData: USDA-NASS | Graphic: @thomas_mock") +
ggthemes::theme_fivethirtyeight() +
theme(axis.title = element_text(size = 12, face = "bold"),
plot.title = element_text(size = 16, face = "bold")) +
scale_y_continuous(breaks = seq(0, 500000, 100000), labels = c("0", "100,000", "200,000", "300,000", "400,000", "500,000")) +
scale_x_continuous(breaks = seq(1998, 2012, 2)))
ggsave("colony_number.png", bee_colonies, height = 8, width = 8, units = "in", dpi = 800)
(honey_yield_colony <- honey_df %>%
ggplot(aes(x = year, y = honey_per_colony)) +
geom_beeswarm(shape = 21, color = "black", fill = "yellow", size = 2) +
stat_summary(fun.y = "median", colour = "black", size = 4, geom = "point", alpha = 0.5) +
stat_summary(fun.y = "median", colour = "black", size = 1, geom = "line", alpha = 0.5) +
labs(x = "\nYear",
y = "Honey yield per colony (lbs)\n",
title = "Beekeepers are shifting from honey production to pollination",
subtitle = "Beekeepers are collecting less honey per colony",
caption = "\nData: USDA-NASS | Graphic: @thomas_mock") +
ggthemes::theme_fivethirtyeight() +
theme(axis.title = element_text(size = 12, face = "bold"),
plot.title = element_text(size = 16, face = "bold")) +
scale_x_continuous(breaks = seq(1998, 2012, 2)) +
scale_y_continuous(breaks = seq(25, 150, 25)))
ggsave("honey_per_colony.png", honey_yield_colony, height = 8, width = 8, units = "in", dpi = 800)