# get ready
# Load Necessary Libraries
library(RSQLite)
library(plyr)
library(dplyr)
library(ggplot2)
library(knitr)
library(xtable)
library(ggthemes)
#Define Custom Multiplot function
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
plots <- c(list(...), plotlist)
numPlots = length(plots)
if (is.null(layout)) {
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
Use RSQLite Package to connect to database and load Seasonal Farming table. Ensure database file is in Working Directory
#Connect to Database
con <- dbConnect(drv=RSQLite::SQLite(), dbname="Zambia_HHDatabase_Revision_4.db")
#Pull Tables
hh_sf <- dbGetQuery(conn=con, statement="SELECT * FROM hh_seasonal_farming")
dbDisconnect(con)
## [1] TRUE
hh_sf <- transform(hh_sf,
b7_d = as.factor(b7_d),
b7_f = as.factor(b7_f),
b7_a = as.factor(b7_a),
year_collected = as.factor(year_collected),
new_hh_code = as.factor(new_hh_code))
levels(factor(hh_sf$b7_d))
## [1] "-7" "0" "1" "4"
## [5] "acres" "hectares" "limas" "Not Applicable"
hh_sf$b7_d <- mapvalues(hh_sf$b7_d, from=c(-7,0,"Not Applicable",1,4), to=c(NA, NA, NA, "hectares", "limas"))
hh_sf$b7_a <- mapvalues(hh_sf$b7_a, from =c(22,103), to =c("Other bean", "Sunflower oil"))
hh_sf$hectares <- with(hh_sf,ifelse(b7_d == 'acres', b7_c*0.4,
ifelse(b7_d == 'hectares', b7_c*1,
ifelse(b7_d == 'hectares', b7_c*1,
ifelse(b7_d == 'limas', b7_c*0.25, NA)))))
hh_sf$kilograms <- with(hh_sf,ifelse(b7_f == '100 kg bags', b7_e*100,
ifelse(b7_f == '40 kg bags', b7_e*40,
ifelse(b7_f == '50 kg bags', b7_e*50,
ifelse(b7_f == '90 kg bags', b7_e*90,
ifelse(b7_f == "metric tonnes", b7_e/1000,
ifelse(b7_f == "quintals", b7_e/100, NA)))))))
hh_sf$days_hired_labor <- hh_sf$b7_p + hh_sf$b7_r
sfsub <- hh_sf[c("year_collected","new_hh_code","b7_a","hectares","kilograms","days_hired_labor")]
#rename crop variable
sfsub <- rename(sfsub, crop_type = b7_a)
sfg <- group_by(sfsub, year_collected)
#Farm Size in hectares
sfsum1 <- summarise(sfg,
Total_Rows = n(),
Complete_Cases = sum(complete.cases(hectares)),
Mean = mean(hectares, na.rm = TRUE),
Variance = var(hectares, na.rm = TRUE),
Max = max(hectares, na.rm = TRUE),
Min = min(hectares, na.rm = TRUE))
#Crops Harvested in Kg
sfsum2 <- summarise(sfg,
Total_Rows = n(),
Complete_Cases = sum(complete.cases(kilograms)),
Mean = mean(kilograms, na.rm = TRUE),
Variance = var(kilograms, na.rm = TRUE),
Max = max(kilograms, na.rm = TRUE),
Min = min(kilograms, na.rm = TRUE))
#Outside Man Hours Employes
sfsum3 <- summarise(sfg,
Total_Rows = n(),
Complete_Cases = sum(complete.cases(days_hired_labor)),
Mean = mean(days_hired_labor, na.rm = TRUE),
Variance = var(days_hired_labor, na.rm = TRUE),
Max = max(days_hired_labor, na.rm = TRUE),
Min = min(days_hired_labor, na.rm = TRUE))
kable(sfsum1)
| year_collected | Total_Rows | Complete_Cases | Mean | Variance | Max | Min |
|---|---|---|---|---|---|---|
| 2010 | 17555 | 2713 | 0.8380575 | 0.8004915 | 8 | 0 |
| 2014 | 1190 | 905 | 0.6996133 | 0.8226077 | 10 | 0 |
kable(sfsum2)
| year_collected | Total_Rows | Complete_Cases | Mean | Variance | Max | Min |
|---|---|---|---|---|---|---|
| 2010 | 17555 | 2291 | 1814.103 | 9610778 | 75000 | 0.003 |
| 2014 | 1190 | 734 | 1535.313 | 6930778 | 20000 | 0.090 |
kable(sfsum3)
| year_collected | Total_Rows | Complete_Cases | Mean | Variance | Max | Min |
|---|---|---|---|---|---|---|
| 2010 | 17555 | 501 | 12.11377 | 257.861 | 185 | 0 |
| 2014 | 1190 | 54 | 20.11111 | 4688.025 | 365 | 0 |
fplot <- qplot(x=year_collected, y=Mean, fill = year_collected, data=sfsum1, geom = "bar", stat="identity") +
labs(y = "Avg Farm Size(ha)", x = "Year") +
ggtitle("Avg Farm Size(Hectares)") +
theme_igray() +
theme(axis.title.x = element_text(color="#666666", face="bold", size=10),
axis.title.y = element_text(color="#666666", face="bold", size=10))
cplot <- qplot(x=year_collected, y=Mean, fill = year_collected, data=sfsum2, geom = "bar", stat="identity") +
labs(y = "Avg Harvest(Kg)", x = "Year") +
ggtitle("Avg Crops Harvested(kg)") +
theme_igray() +
theme(axis.title.x = element_text(color="#666666", face="bold", size=10),
axis.title.y = element_text(color="#666666", face="bold", size=10))
mplot <- qplot(x=year_collected, y=Mean, fill = year_collected, data=sfsum3, geom = "bar", stat="identity") +
labs(y = "Avg Outside Man-Hours", x = "Year") +
ggtitle("Avg Outside Man Hours") +
theme_igray() +
theme(axis.title.x = element_text(color="#666666", face="bold", size=10),
axis.title.y = element_text(color="#666666", face="bold", size=10))
multiplot(fplot,cplot,mplot,cols = 2)
Question: Has there been a notable change in productivity metrics?
Finding: Average Farm Size and Crop Production Per Respondant who Answered Appears to have slightly decreased from 2010 to 2014. This could be due to an increse in the total number of small-holder farmers. I suspect this reduction is statistically insignifant, however. This could be tested by:
Testing Each Group(year subset) For Normality
We also note that The Average # of Man-Hours Employed has increased. This is difficult to verify based on the large disparity of sample size and respondants between 2010 and 2014. It can be noted that roughly 3% of responses claimed to hire outside labor in 2010 while 4% did in 2014.
#Group Data - Harvest by crop type and year
sfgc <- group_by(sfsub, year_collected, crop_type)
sfsum4 <- summarise(sfgc, mean(kilograms, na.rm = TRUE))
sfsum4 <- sfsum4[complete.cases(sfsum4),]
colnames(sfsum4)[3] <- "kilograms"
#reorder largest to smallest
sfsum4$crop_type <- factor(sfsum4$crop_type, levels = sfsum4$crop_type[order(sfsum4$kilograms)])
#Plot
ggplot(sfsum4, aes(x = crop_type, y = kilograms, fill = crop_type)) + geom_bar(stat = "identity") +
coord_flip() + facet_wrap( ~ year_collected)+
ggtitle("Mean Crop Output by Type and Year") +
theme_igray() +
theme(legend.position="none") +
theme(axis.title.x = element_text(color="#666666", face="bold", size=10),
axis.title.y = element_text(color="#666666", face="bold", size=10))
Question: What were the most prevalent crops produced between the two years?
Finding: We see large differences in average crop type production per household between 2010 and 2014. Notably a large reduction in Tomato production and large increase in Onion production. Maize appears to be the most stable, averaging around 3300kg per household for both years in question. A detailed table can be seen below.