In 2006, global concern was raised over the rapid decline in the honeybee population, an integral component to American honey agriculture. Large numbers of hives were lost to Colony Collapse Disorder, a phenomenon of disappearing worker bees causing the remaining hive colony to collapse. Speculation to the cause of this disorder points to hive diseases and pesticides harming the pollinators, though no overall consensus has been reached. Twelve years later, some industries are observing recovery but the American honey industry is still largely struggling. The U.S. used to locally produce over half the honey it consumes per year. Now, honey mostly comes from overseas, with 350 of the 400 million pounds of honey consumed every year originating from imports. This dataset provides insight into honey production supply and demand in America by state from 1998 to 2012.
NASS(National Agricultural Statistics Service) is a part of United States Department of Agriculture(USDA). NASS conducts many surveys every year and prepares reports covering virtually every aspect of U.S. agriculture such as production and supplies of food and fiber, prices paid and recieived by farmers, farm labor and wages, farm finances, chemical use, and changes in the demographics of U.S. and so on. Those are a few of what they are doing. NSAA largely is responsible for collecting data in agricultural industry. Since this organization is backed by U.S. government, those datasets are reliable and accurate.
Keeping the following as the main points to be analyzed:
suppressMessages(library(lubridate))
suppressMessages(library(tidyverse))
suppressMessages(library(cowplot))
suppressMessages(library(scales))
suppressMessages(library(DataCombine))
suppressMessages(library(stargazer))
suppressMessages(library(sandwich))
suppressMessages(library(dyn) )
suppressMessages(library(lmtest))
suppressMessages(library(data.table))
suppressMessages(library(tidyr))
suppressMessages(library(readr))
suppressMessages(library(stringr))
suppressMessages(library(forecast))
suppressMessages(library(gridExtra))
dir <- "../Data/"
func <- "../Codes/"
output <- "../Output/"
source(paste0(func, "theme_bg.R"))
source(paste0(func, "da_helper_functions.R"))
read_lines()
and then convert each vector into a dataframe.#Load Data
honeyraw9802 <- read_lines(paste(dir, "honeyraw_1998to2002.csv", sep=""))
honeyraw9802 <- data.frame(honeyraw9802[-1:-4])
honeyraw0307 <- read_lines(paste(dir,"honeyraw_2003to2007.csv", sep=""))
honeyraw0307 <- data.frame(honeyraw0307[-1:-76])
honeyraw0812 <- read_lines(paste(dir, "honeyraw_2008to2012.csv", sep=""))
honeyraw0812 <- data.frame(honeyraw0812[-1:-67])
# Clean Data
honeyrawlist <- list(honeyraw9802,honeyraw0307,honeyraw0812)
honeyfinallist <- list()
suppressWarnings(
for(honey_idx in 1:length(honeyrawlist)) {
honey <- honeyrawlist[[honey_idx]]
honey <- honey %>%
separate(colnames(honey[1]), paste0("X", 1:9), sep = ",") %>%
setNames(c("X1", "X2", "state", "numcol", "yieldpercol","totalprod",
"stocks","priceperlb","prodvalue")) %>%
select(-X1, -X2)
honey$state <- gsub("\"", "", honey$state)
honey$numcol <- as.integer(honey$numcol)*1000
honey$yieldpercol <- as.integer(honey$yieldpercol)
honey$totalprod <- as.integer(honey$totalprod)*1000
honey$stocks <- as.integer(honey$stocks)*1000
honey$priceperlb <- as.numeric(honey$priceperlb)/100
honey$prodvalue <- as.numeric(honey$prodvalue)*1000
honey <- honey %>%
filter(!is.na(prodvalue)&!grepl("/", honey$state))
honeyfinallist[[honey_idx]] <- honey
}
)
# Combine all into one dataframe.
suppressWarnings(
honeyfinaldf <- bind_rows(honeyfinallist)
)
# Use match() to find full state names and replace with abbreviations
matches_and_nas <- match(honeyfinaldf$state, state.name)
matches <- !is.na(matches_and_nas)
honeyfinaldf$state[matches] <- state.abb[matches_and_nas[matches]]
# Lastly, we want to add a year column
ystart <- c(grep("AL", honeyfinaldf$state))
yend <- c(grep("WY", honeyfinaldf$state))
years <- 1998:2012
ynums <- yend - ystart + 1
honeyfinaldf$year <- rep(years, ynums)
#adding a new variable consumption
honeyfinaldf$consumption <- honeyfinaldf$totalprod-honeyfinaldf$stocks
suppressMessages(glimpse(honeyfinaldf))
## Observations: 626
## Variables: 9
## $ state <chr> "AL", "AZ", "AR", "CA", "CO", "FL", "GA", "HI", "I...
## $ numcol <dbl> 16000, 55000, 53000, 450000, 27000, 230000, 75000,...
## $ yieldpercol <int> 71, 60, 65, 83, 72, 98, 56, 118, 50, 71, 92, 78, 4...
## $ totalprod <dbl> 1136000, 3300000, 3445000, 37350000, 1944000, 2254...
## $ stocks <dbl> 159000, 1485000, 1688000, 12326000, 1594000, 45080...
## $ priceperlb <dbl> 0.72, 0.64, 0.59, 0.62, 0.70, 0.64, 0.69, 0.77, 0....
## $ prodvalue <dbl> 818000, 2112000, 2033000, 23157000, 1361000, 14426...
## $ year <int> 1998, 1998, 1998, 1998, 1998, 1998, 1998, 1998, 19...
## $ consumption <dbl> 977000, 1815000, 1757000, 25024000, 350000, 180320...
# Save working file
write.csv(honeyfinaldf, paste(output, "honeyproduction.csv", sep=""), row.names = FALSE)
colSums(is.na(honeyfinaldf))
## state numcol yieldpercol totalprod stocks priceperlb
## 0 0 0 0 0 0
## prodvalue year consumption
## 0 0 0
change <- honeyfinaldf %>%
group_by(year) %>%
summarise(YearTotal=sum(totalprod))
ggplot(data=change, aes(x=year, y=YearTotal/1000000)) +
geom_smooth(method = "lm") +
geom_line(color = "red", size = 2) +
geom_point() +
scale_x_continuous(breaks=seq(1998, 2012, 2)) +
ylab("Production (in Million lb)") +
xlab("Years") +
theme_bg()+
background_grid(major = "xy", minor="y", size.major = 0.2)
productionperstate <- honeyfinaldf %>%
group_by(state) %>%
summarise(sumprod=sum(totalprod)) %>%
arrange(desc(sumprod)) %>%
head(10) %>%
mutate(percentage=round(sumprod/sum(sumprod)*100,2))
ggplot(data=productionperstate, aes(x=reorder(state,sumprod),y=sumprod/1000000))+
geom_col(aes(fill=state),show.legend=F)+
geom_text(aes(label=percentage))+
coord_flip()+
labs(x="States in the USA",y="Total Honey Produced (in Million lb)",title="Total Honey Produced by Each State")
productnperstate <- honeyfinaldf %>%
group_by(state) %>%
summarise(sumprod=sum(totalprod)) %>%
arrange(sumprod) %>%
head(10) %>%
mutate(percentage=round(sumprod/sum(sumprod)*100,2))
ggplot(data=productnperstate, aes(x=reorder(state,sumprod),y=sumprod/1000000))+
geom_col(aes(fill=state),show.legend=F)+
geom_text(aes(label=percentage))+
coord_flip()+
labs(x="States in the USA",y="Total Honey Produced (in Million lb)",title="Total Honey Produced by Each State")
# Top 10 states producing most honey in 1998
productionnper1998 <- honeyfinaldf %>%
filter(year==1998) %>%
group_by(state) %>%
summarise(sumprod=sum(totalprod)) %>%
arrange(desc(sumprod)) %>%
head(10) %>%
mutate(percentage=round(sumprod/sum(sumprod)*100,2))
y1998 <- ggplot(data = productionnper1998, aes(x=reorder(state,sumprod),y=sumprod/1000000, fill = state)) +
geom_bar(stat = "identity") +
guides(fill = FALSE) +
xlab("State") +
ylab("Total Honey Produced (in Million lb)") +
ggtitle("Honey Production in 1998") +
coord_flip()
# Top 10 states producing most honey in 2012
productionnper2012 <- honeyfinaldf %>%
filter(year==2012) %>%
group_by(state) %>%
summarise(sumprod=sum(totalprod)) %>%
arrange(desc(sumprod)) %>%
head(10) %>%
mutate(percentage=round(sumprod/sum(sumprod)*100,2))
y2012 <- ggplot(data = productionnper2012, aes(x=reorder(state,sumprod),y=sumprod/1000000, fill = state)) +
geom_bar(stat = "identity") +
guides(fill = FALSE) +
xlab("State") +
ylab("Total Honey Produced (in Million lb)") +
ggtitle("Honey Production in 2012") +
coord_flip()
grid.arrange(y1998, y2012, ncol=2)
change <- honeyfinaldf %>%
group_by(year) %>%
summarise(YearTotal=sum(totalprod))
ggplot(data=change, aes(x=year, y=YearTotal/1000000)) +
geom_smooth(method = "lm") +
geom_line(color = "red", size = 2) +
geom_point() +
scale_x_continuous(breaks=seq(1998, 2012, 2)) +
ylab("Production (in Million lb)") +
xlab("Years") +
theme_bg()+
background_grid(major = "xy", minor="y", size.major = 0.2)
price <- honeyfinaldf %>%
group_by(year) %>%
summarise(YearAvgPrice=mean(priceperlb))
ggplot(data=price, aes(x=year, y=YearAvgPrice)) +
geom_smooth(method = "lm") +
geom_line(color = "red", size = 2) +
geom_point() +
scale_x_continuous(breaks=seq(1998, 2012, 2)) +
ylab("Prices (Avg)") +
xlab("Years") +
theme_bg()+
background_grid(major = "xy", minor="y", size.major = 0.2)
ProductionPerYear <- honeyfinaldf %>%
group_by(year) %>%
summarize(TotalConsumption=sum(consumption))
ggplot(ProductionPerYear,aes(x=year,y=TotalConsumption/1000000)) +
geom_smooth(method = "lm") +
geom_line(color="red", size=2) +
geom_point() +
scale_x_continuous(breaks=seq(1998, 2012, 2)) +
labs(x="Year", y="Total Consumption (in Million lb)", title="Total Consumption for years(1998-2012)") +
theme_bg()+
background_grid(major = "xy", minor="y", size.major = 0.2)
hny <- honeyfinaldf %>%
group_by(year) %>%
summarise(totalproduction=sum(totalprod), VoP=sum(prodvalue))
ggplot(hny) +
geom_smooth(aes(x=year,y=totalproduction/1000000), method = "lm", color="blue", size=1, se=FALSE) +
geom_smooth(aes(x=year,y=VoP/1000000), method = "lm", color="darkblue", size=1, se=FALSE) +
geom_line(aes(x=year,y=totalproduction/1000000), color="red", size=2) +
geom_line(aes(x=year,y=VoP/1000000), color="green", size=2) +
geom_point(aes(x=year,y=totalproduction/1000000)) +
geom_point(aes(x=year,y=VoP/1000000)) +
scale_x_continuous(breaks=seq(1998, 2012, 2)) +
labs(x="Year", y="", title="Total Honey Production Vs. Value of Production") +
theme_bg()+
background_grid(major = "xy", minor="y", size.major = 0.2)
honey <- honeyfinaldf %>%
group_by(year) %>%
summarise(total_prod=sum(totalprod))
honey_production <- ts(honey$total_prod, start=1998, frequency=1)
plot(honey_production/1000000, ylab="Honey Production (in Million lb)")
ets2013 <- forecast(honey_production, h=1)
plot(ets2013, ylab="Honey Production")
naive2013 <- forecast(naive(honey_production, h=1))
plot(naive2013, ylab="Honey Production")
ses2013 <- forecast(ses(honey_production, h = 1))
plot(ses2013, ylab="Honey Production")
arima2013 <- forecast(auto.arima(honey_production), h=1)
tbats2013 <- forecast(tbats(honey_production), h=1)
nnetar2013 <- forecast(nnetar(honey_production), h=1)
rbindlist(list(
data.table(method = 'naive', as.data.frame(accuracy(naive2013))),
data.table(method = 'ses', as.data.frame(accuracy(ses2013))),
data.table(method = 'ets', as.data.frame(accuracy(ets2013))),
data.table(method = 'arima', as.data.frame(accuracy(arima2013))),
data.table(method = 'tbats', as.data.frame(accuracy(tbats2013))),
data.table(method = 'nnetar', as.data.frame(accuracy(nnetar2013)))))
## method ME RMSE MAE MPE MAPE MASE
## 1: naive -5615142.8571 18619687 16216857 -3.7520688 9.580399 1.0000000
## 2: ses -5222305.9214 17688262 15333611 -3.8247471 8.993452 0.9455353
## 3: ets -5166539.9021 17777015 15371532 -3.8073382 9.011067 0.9478737
## 4: arima -5226165.4073 17988415 15150368 -3.4952642 8.948372 0.9342358
## 5: tbats 1939465.6880 11654869 9195955 0.4851627 5.230874 0.5670615
## 6: nnetar -100.1844 13252495 11534284 -0.6101339 6.821371 0.7112527
## ACF1
## 1: -0.48898070
## 2: -0.15065170
## 3: -0.13812952
## 4: -0.50069873
## 5: -0.08123227
## 6: -0.17337098