Load libraries and Column metadata

library(tidyverse)
library(here)
library(janitor)
library(ggbeeswarm)

state = abbreviation (all caps)
Producing Colonies = producing colonies in 1000s
Yield per colony = pound of honey per colony
Production = pounds of honey in 1000s
Stocks = stocks held by producers in 1000s of pounds
Average = price per pound of honey in cents
Value of production = dollars in 1000s

See what the columns look like

We can see that the data is fairly well organized, but the column names are spread across 3 or more rows and contain units/other metadata. We’ll take care of that with some tidyverse cleaning!

# check what the columns represent
readr::read_csv(here("week9/honey-production/honeyraw_1998to2002.csv"), skip = 4) %>% 
    .[1:10,] %>% 
    select(-`1`, -`h`)

## Warning: Missing column names filled in: 'X3' [3], 'X6' [6], 'X7' [7]

## Parsed with column specification:
## cols(
##   `1` = col_integer(),
##   h = col_character(),
##   X3 = col_character(),
##   Honey = col_character(),
##   Yield = col_character(),
##   X6 = col_character(),
##   X7 = col_character(),
##   Average = col_character(),
##   Value = col_character()
## )

## # A tibble: 10 x 7
##    X3    Honey     Yield  X6           X7           Average   Value       
##    <chr> <chr>     <chr>  <chr>        <chr>        <chr>     <chr>       
##  1 State Producing per    Production   Stocks       Price per of          
##  2 <NA>  Colonies  Colony <NA>         Dec 15 2/    Pound     Production  
##  3 <NA>  <NA>      <NA>   <NA>         <NA>         <NA>      <NA>        
##  4 <NA>  1,000     Pounds 1,000 Pounds 1,000 Pounds Cents     1,000 Dolla~
##  5 AL    16        71     1136         159          72        818         
##  6 AZ    55        60     3300         1485         64        2112        
##  7 AR    53        65     3445         1688         59        2033        
##  8 CA    450       83     37350        12326        62        23157       
##  9 CO    27        72     1944         1594         70        1361        
## 10 FL    230       98     22540        4508         64        14426

Dealing with State Abbreviations

I originally pasted a tribble here with all the state names/abbreviations (using Miles McBain’s datapasta package addin) but realized R has a built in name/abbreviations for states w/ state.name and state.abb.

# state_abbreviations and names built in
state.abb

##  [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "FL" "GA" "HI" "ID" "IL" "IN"
## [15] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV"
## [29] "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN"
## [43] "TX" "UT" "VT" "VA" "WA" "WV" "WI" "WY"

state.name

##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"

# pasted dataframe w/ abbreviations
state_names <- tibble::tribble(
              ~state, ~abbreviation,
           "ALABAMA",          "AL",
            "ALASKA",          "AK",
           "ARIZONA",          "AZ",
          "ARKANSAS",          "AR",
        "CALIFORNIA",          "CA",
          "COLORADO",          "CO",
       "CONNECTICUT",          "CT",
          "DELAWARE",          "DE",
           "FLORIDA",          "FL",
           "GEORGIA",          "GA",
            "HAWAII",          "HI",
             "IDAHO",          "ID",
          "ILLINOIS",          "IL",
           "INDIANA",          "IN",
              "IOWA",          "IA",
            "KANSAS",          "KS",
          "KENTUCKY",          "KY",
         "LOUISIANA",          "LA",
             "MAINE",          "ME",
          "MARYLAND",          "MD",
     "MASSACHUSETTS",          "MA",
          "MICHIGAN",          "MI",
         "MINNESOTA",          "MN",
       "MISSISSIPPI",          "MS",
          "MISSOURI",          "MO",
           "MONTANA",          "MT",
          "NEBRASKA",          "NE",
            "NEVADA",          "NV",
     "NEW HAMPSHIRE",          "NH",
        "NEW JERSEY",          "NJ",
        "NEW MEXICO",          "NM",
          "NEW YORK",          "NY",
    "NORTH CAROLINA",          "NC",
      "NORTH DAKOTA",          "ND",
              "OHIO",          "OH",
          "OKLAHOMA",          "OK",
            "OREGON",          "OR",
      "PENNSYLVANIA",          "PA",
      "RHODE ISLAND",          "RI",
    "SOUTH CAROLINA",          "SC",
      "SOUTH DAKOTA",          "SD",
         "TENNESSEE",          "TN",
             "TEXAS",          "TX",
              "UTAH",          "UT",
           "VERMONT",          "VT",
          "VIRGINIA",          "VA",
        "WASHINGTON",          "WA",
     "WEST VIRGINIA",          "WV",
         "WISCONSIN",          "WI",
           "WYOMING",          "WY"
    )

Read in the data

We’re reading in each of the dataframes and cleaning things up with janitor and some tidyverse calls.

honey98_02 <- readr::read_csv(here("week9/honey-production/honeyraw_1998to2002.csv"), skip = 8) %>% 
    clean_names() %>% 
    select(-x1, -u) %>%
    rename(state = x3) %>% 
    filter(state %in% state.abb)
    
honey03_07 <- readr::read_csv(here("week9/honey-production/honeyraw_2003to2007.csv"), skip = 80) %>% 
    clean_names() %>% 
    select(-x1, -u) %>% 
    rename(state = x3) %>% 
    filter(state %in% state.abb)

honey08_12 <- readr::read_csv(here("week9/honey-production/honeyraw_2008to2012.csv"), skip = 71) %>% 
    clean_names() %>% 
    select(-x1, -u) %>%
    mutate(state = state.abb[match(x3, state.name)]) %>% 
    select(-x3) %>% 
    filter(state %in% state.abb)

Merge dataframes

Now that we have the somewhat clean dataframes we will merge via bind_rows to form one dataframe and then fully tidy and assign years based on the start and stop for each set of honey production. We know that the first state is always AL and the last state is always WY, so we can build up a logical set of vectors to repeat the years across.

clean_df <- honey98_02 %>% 
    bind_rows(honey03_07, honey08_12)

honey_df <- clean_df %>% 
    rename(bee_colonies = x1_000, 
           honey_per_colony = pounds,
           honey_produced = x1_000_pounds, 
           honey_stock = x1_000_pounds_1, 
           honey_price = cents, 
           honey_production_value = x1_000_dollars) %>% 
    mutate_at(2:7, as.numeric) %>%
    mutate(bee_colonies = bee_colonies * 1000,
           honey_produced = honey_produced * 1000,
           honey_stock = honey_stock * 1000,
           honey_price = honey_price/100,
           honey_production_value = honey_production_value * 1000)

ystart <- c(grep("AL", honey_df$state))
yend <- c(grep("WY", honey_df$state))
years <- 1998:2012
ynums <- yend - ystart + 1
honey_df$year <- rep(years, ynums)

# look at df
glimpse(honey_df)

## Observations: 626
## Variables: 8
## $ state                  <chr> "AL", "AZ", "AR", "CA", "CO", "FL", "GA...
## $ bee_colonies           <dbl> 16000, 55000, 53000, 450000, 27000, 230...
## $ honey_per_colony       <dbl> 71, 60, 65, 83, 72, 98, 56, 118, 50, 71...
## $ honey_produced         <dbl> 1136000, 3300000, 3445000, 37350000, 19...
## $ honey_stock            <dbl> 159000, 1485000, 1688000, 12326000, 159...
## $ honey_price            <dbl> 0.72, 0.64, 0.59, 0.62, 0.70, 0.64, 0.6...
## $ honey_production_value <dbl> 818000, 2112000, 2033000, 23157000, 136...
## $ year                   <int> 1998, 1998, 1998, 1998, 1998, 1998, 199...

Just a quick sanity check to see how many states were recorded per year. Some states were missing from my initial lookthrough the data w/ View.

honey_df %>% 
    group_by(year) %>% count()

## # A tibble: 15 x 2
## # Groups:   year [15]
##     year     n
##    <int> <int>
##  1  1998    43
##  2  1999    43
##  3  2000    43
##  4  2001    44
##  5  2002    44
##  6  2003    44
##  7  2004    41
##  8  2005    41
##  9  2006    41
## 10  2007    41
## 11  2008    41
## 12  2009    40
## 13  2010    40
## 14  2011    40
## 15  2012    40

Build out the plots

According to the article, Beekeepers are shifting from using bees primarily in honey production to selling colonies and pollination, this is reflected in the number of colonies staying fairly stable while the honey production/colony is decreasing across the timeframe.

(bee_colonies <- honey_df %>% 
    ggplot(aes(x = year, y = bee_colonies)) +
    geom_beeswarm(shape = 21, color = "black", fill = "yellow", size = 2) +
    stat_summary(fun.y = "median", colour = "black", size = 4, geom = "point", alpha = 0.5) +
    stat_summary(fun.y = "median", colour = "black", size = 1, geom = "line", alpha = 0.5) +
    labs(x = "\nYear",
         y = "Number of Colonies\n",
         title = "Beekeepers are shifting from honey production to pollination",
         subtitle = "The number of colonies is relatively stable over time",
         caption = "\nData: USDA-NASS | Graphic: @thomas_mock") +
    ggthemes::theme_fivethirtyeight() +
    theme(axis.title = element_text(size = 12, face = "bold"),
    plot.title = element_text(size = 16, face = "bold")) +
        scale_y_continuous(breaks = seq(0, 500000, 100000), labels = c("0", "100,000", "200,000", "300,000", "400,000", "500,000")) +
     scale_x_continuous(breaks = seq(1998, 2012, 2)))

ggsave("colony_number.png", bee_colonies, height = 8, width = 8, units = "in", dpi = 800)

(honey_yield_colony <- honey_df %>% 
    ggplot(aes(x = year, y = honey_per_colony)) +
    geom_beeswarm(shape = 21, color = "black", fill = "yellow", size = 2) +
    stat_summary(fun.y = "median", colour = "black", size = 4, geom = "point", alpha = 0.5) +
    stat_summary(fun.y = "median", colour = "black", size = 1, geom = "line", alpha = 0.5) +
    
    labs(x = "\nYear",
         y = "Honey yield per colony (lbs)\n",
         title = "Beekeepers are shifting from honey production to pollination",
         subtitle = "Beekeepers are collecting less honey per colony",
         caption = "\nData: USDA-NASS | Graphic: @thomas_mock") +
    ggthemes::theme_fivethirtyeight() +
    theme(axis.title = element_text(size = 12, face = "bold"),
    plot.title = element_text(size = 16, face = "bold")) +
    scale_x_continuous(breaks = seq(1998, 2012, 2)) +
    scale_y_continuous(breaks = seq(25, 150, 25)))

ggsave("honey_per_colony.png", honey_yield_colony, height = 8, width = 8, units = "in", dpi = 800)

Honey Production

Tom

May 18, 2018