# get ready
# Load Necessary Libraries
library(RSQLite)
library(plyr)
library(dplyr)
library(ggplot2)
library(knitr)
library(xtable)
library(ggthemes)

#Define Custom Multiplot function
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  if (numPlots==1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

Summary

Seasonal Farming Indicators

Step 1. Load data source

Use RSQLite Package to connect to database and load Seasonal Farming table. Ensure database file is in Working Directory

#Connect to Database
con <- dbConnect(drv=RSQLite::SQLite(), dbname="Zambia_HHDatabase_Revision_4.db")

#Pull Tables
hh_sf <- dbGetQuery(conn=con, statement="SELECT * FROM hh_seasonal_farming")

dbDisconnect(con)
## [1] TRUE

Step 2. Deep data cleaning

  1. Factorize necessary variables
hh_sf <- transform(hh_sf,
                   b7_d = as.factor(b7_d),
                   b7_f = as.factor(b7_f),
                   b7_a = as.factor(b7_a),
                   year_collected = as.factor(year_collected),
                   new_hh_code = as.factor(new_hh_code))
  1. Look at plot size variable
levels(factor(hh_sf$b7_d))
## [1] "-7"             "0"              "1"              "4"             
## [5] "acres"          "hectares"       "limas"          "Not Applicable"
  1. Map inconsistent or missing values
hh_sf$b7_d <- mapvalues(hh_sf$b7_d, from=c(-7,0,"Not Applicable",1,4), to=c(NA, NA, NA, "hectares", "limas"))
hh_sf$b7_a <- mapvalues(hh_sf$b7_a, from =c(22,103), to =c("Other bean", "Sunflower oil"))
  1. Create new variable that converts all sizes to hectares (NA for unrecognized)
hh_sf$hectares <- with(hh_sf,ifelse(b7_d == 'acres', b7_c*0.4,
                                ifelse(b7_d == 'hectares', b7_c*1, 
                                        ifelse(b7_d == 'hectares', b7_c*1,
                                               ifelse(b7_d == 'limas', b7_c*0.25, NA)))))
  1. Create new variable that converts all amounts to kilograms (NA for unrecognized)
hh_sf$kilograms <- with(hh_sf,ifelse(b7_f == '100 kg bags', b7_e*100,
                                     ifelse(b7_f == '40 kg bags', b7_e*40, 
                                            ifelse(b7_f == '50 kg bags', b7_e*50,
                                                   ifelse(b7_f == '90 kg bags', b7_e*90,
                                                          ifelse(b7_f == "metric tonnes", b7_e/1000,
                                                                 ifelse(b7_f == "quintals", b7_e/100, NA)))))))
  1. Create new variable of hired manhours
hh_sf$days_hired_labor <- hh_sf$b7_p + hh_sf$b7_r
  1. Subset variables for analysis
sfsub <- hh_sf[c("year_collected","new_hh_code","b7_a","hectares","kilograms","days_hired_labor")]
#rename crop variable
sfsub <- rename(sfsub, crop_type = b7_a) 

Step 3. Summary Analysis

  1. Prepare Data for Summary Analysis - Farm Size in Hectacres, Crops Harvested in Kg, Outside Man Hours Employed
sfg <- group_by(sfsub, year_collected)

#Farm Size in hectares
sfsum1 <- summarise(sfg,
                   Total_Rows = n(),
                   Complete_Cases = sum(complete.cases(hectares)),
                   Mean = mean(hectares, na.rm = TRUE),
                   Variance = var(hectares, na.rm = TRUE),
                   Max = max(hectares, na.rm = TRUE),
                   Min = min(hectares, na.rm = TRUE))


#Crops Harvested in Kg
sfsum2 <- summarise(sfg,
                   Total_Rows = n(),
                   Complete_Cases = sum(complete.cases(kilograms)),
                   Mean = mean(kilograms, na.rm = TRUE),
                   Variance = var(kilograms, na.rm = TRUE),
                   Max = max(kilograms, na.rm = TRUE),
                   Min = min(kilograms, na.rm = TRUE))


#Outside Man Hours Employes
sfsum3 <- summarise(sfg,
                   Total_Rows = n(),
                   Complete_Cases = sum(complete.cases(days_hired_labor)),
                   Mean = mean(days_hired_labor, na.rm = TRUE),
                   Variance = var(days_hired_labor, na.rm = TRUE),
                   Max = max(days_hired_labor, na.rm = TRUE),
                   Min = min(days_hired_labor, na.rm = TRUE))
  1. Print Summary Statistics for Indicators
  1. Farm Size in Hectares
kable(sfsum1)
year_collected Total_Rows Complete_Cases Mean Variance Max Min
2010 17555 2713 0.8380575 0.8004915 8 0
2014 1190 905 0.6996133 0.8226077 10 0
  1. Crops Harvested in Kg
kable(sfsum2)
year_collected Total_Rows Complete_Cases Mean Variance Max Min
2010 17555 2291 1814.103 9610778 75000 0.003
2014 1190 734 1535.313 6930778 20000 0.090
  1. Outside Man Hours Employed
kable(sfsum3)
year_collected Total_Rows Complete_Cases Mean Variance Max Min
2010 17555 501 12.11377 257.861 185 0
2014 1190 54 20.11111 4688.025 365 0

Step 4. Reportable Indicator Analysis

Look at Change in Farm Size, Outside Man Hours, and Crops Harvested

fplot <- qplot(x=year_collected, y=Mean, fill = year_collected, data=sfsum1, geom = "bar", stat="identity") + 
  labs(y = "Avg Farm Size(ha)", x = "Year") + 
  ggtitle("Avg Farm Size(Hectares)") +
  theme_igray() +
  theme(axis.title.x = element_text(color="#666666", face="bold", size=10),
        axis.title.y = element_text(color="#666666", face="bold", size=10))

cplot <- qplot(x=year_collected, y=Mean, fill = year_collected, data=sfsum2, geom = "bar", stat="identity") + 
  labs(y = "Avg Harvest(Kg)", x = "Year") + 
  ggtitle("Avg Crops Harvested(kg)") +
  theme_igray() +
  theme(axis.title.x = element_text(color="#666666", face="bold", size=10),
        axis.title.y = element_text(color="#666666", face="bold", size=10))

mplot <- qplot(x=year_collected, y=Mean, fill = year_collected, data=sfsum3, geom = "bar", stat="identity") + 
  labs(y = "Avg Outside Man-Hours", x = "Year") + 
  ggtitle("Avg Outside Man Hours") +
  theme_igray() +
  theme(axis.title.x = element_text(color="#666666", face="bold", size=10),
        axis.title.y = element_text(color="#666666", face="bold", size=10))

multiplot(fplot,cplot,mplot,cols = 2)

Question: Has there been a notable change in productivity metrics?

Finding: Average Farm Size and Crop Production Per Respondant who Answered Appears to have slightly decreased from 2010 to 2014. This could be due to an increse in the total number of small-holder farmers. I suspect this reduction is statistically insignifant, however. This could be tested by:

  1. Testing Each Group(year subset) For Normality

  2. Testing Each Group For Equal Variance
  1. Testing Each Group For Equal Means

We also note that The Average # of Man-Hours Employed has increased. This is difficult to verify based on the large disparity of sample size and respondants between 2010 and 2014. It can be noted that roughly 3% of responses claimed to hire outside labor in 2010 while 4% did in 2014.

Look at Crop Output by type between years of the survey

#Group Data - Harvest by crop type and year
sfgc <- group_by(sfsub, year_collected, crop_type)
sfsum4 <- summarise(sfgc, mean(kilograms, na.rm = TRUE))
sfsum4 <- sfsum4[complete.cases(sfsum4),]
colnames(sfsum4)[3] <- "kilograms"

#reorder largest to smallest
sfsum4$crop_type <- factor(sfsum4$crop_type, levels = sfsum4$crop_type[order(sfsum4$kilograms)])

#Plot
ggplot(sfsum4, aes(x = crop_type, y = kilograms, fill = crop_type)) + geom_bar(stat = "identity") + 
  coord_flip() + facet_wrap( ~ year_collected)+ 
  ggtitle("Mean Crop Output by Type and Year") +
  theme_igray() +
  theme(legend.position="none") +
  theme(axis.title.x = element_text(color="#666666", face="bold", size=10),
        axis.title.y = element_text(color="#666666", face="bold", size=10))

Question: What were the most prevalent crops produced between the two years?

Finding: We see large differences in average crop type production per household between 2010 and 2014. Notably a large reduction in Tomato production and large increase in Onion production. Maize appears to be the most stable, averaging around 3300kg per household for both years in question. A detailed table can be seen below.