library(RSQLite)
library(plyr)
library(dplyr)
library(ggplot2)
library(reshape2)

Step 1. Load data source

#Connect to Database and get tables
con <- dbConnect(drv=RSQLite::SQLite(), dbname="Zambia_Database_Revision_6.sqlite3")
hh_serv <- dbGetQuery(conn=con, statement="SELECT * FROM hh_support_services")
fo_serv <- dbGetQuery(conn=con, statement="SELECT * FROM fo_services")
dbDisconnect(con)
## [1] TRUE

Step 2. Deep data cleaning

#################Clean fo_services############################

#Variables total_trained, women_trained, and men_trained need to be converted to numbers. Code Don't Know and -999 as NA
# replace don't know
fo_serc <- data.frame(
  apply(fo_serv, 2, function(x) {
    gsub(pattern = "’t", replacement = NA, x)
    tolower(x)
  }), stringsAsFactors = FALSE)

fo_serc <- fo_serc[, c(4, 5, 8:11)]

# convert numeric
fo_serc <- transform(fo_serc,
                        normalized_fo_code = as.factor(normalized_fo_code), 
                        year_collected = as.factor(year_collected),
                        b3_VALUE = as.factor(b3_VALUE))

#Add Column Names
colnames(fo_serc) <- c("year", "fo", "service_type", "other_provide", "service", "fo_provide")

#create numeric varaible for if FO provided Service
fo_serc$fo_yes <- with(fo_serc,ifelse(fo_provide == "yes", 1 , NA))

#Create Summary Statistic for fo_servies proived, convert to proportion data for accurate comparison given unequal n
fo_serc <- group_by(fo_serc, year, service_type)
fosg <- summarise(fo_serc, 
                  n = n(),
                  fo_yes = sum(fo_yes, na.rm = TRUE))
fosg$p_offer <- with(fosg,fo_yes/n)


###############Clean hh_services_offered######################## 
#Subset Variables
hh_serc <- select(hh_serv, year_collected, a2_VALUE, a2_b, a2_c, a2_d, a2_h, a2_f)

#Give readable var names
hh_serc <- rename(hh_serc, service_type = a2_VALUE, provider1 = a2_b, provider2 = a2_c, 
provider3 = a2_d, provider4 = a2_h, service_used = a2_f)

#Clean provider name variables for Farmers Organization
hh_serc$provider1 <- mapvalues(hh_serc$provider1, from =c("Farmer Organisation","Farmer organization"),to =c("FO", "FO"))
hh_serc$provider2 <- mapvalues(hh_serc$provider2, from = c("Farmer Organisation","Farmer organization"),to = c("FO", "FO"))
hh_serc$provider3 <- mapvalues(hh_serc$provider3, from = c("Farmer organization","Farmer Organisation"),to = c("FO", "FO"))
hh_serc$provider4 <- mapvalues(hh_serc$provider4, from = c("Farmers? organization"),to = c("FO"))

#Create boolean nominal Variable if HH used service
hh_serc$hh_use <- with(hh_serc,ifelse(service_used == "Yes", 1 , 0))

#Create boolean nominal variable if HH used service and it was provided by FO
hh_serc$hh_use_fo <- with(hh_serc,ifelse(service_used == "Yes" & 
                                           (provider1 == "FO" | provider2 == "FO" | provider3 == "FO" | provider4 == "FO"), 1 , 0))

#Group By Year and Service Type for Analysis

hh_serc <- group_by(hh_serc, year_collected, service_type)
hhsg <- summarise(hh_serc, 
                  n = n(),
                  hh_use = sum(hh_use, na.rm = TRUE),
                  hh_use_fo = sum(hh_use_fo, na.rm = TRUE))
hhsg$p_use <- with(hhsg, hh_use/n)
hhsg$p_use_fo <- with(hhsg, hh_use_fo/n)

Step 3. Plot Analysis

#Plot Services Offered by FO by year
fosg$service_type <- factor(fosg$service_type, levels = fosg$service_type[order(fosg$p_offer)])

ggplot(fosg, aes(x = service_type, y = p_offer, fill = service_type)) + geom_bar(stat = "identity") + 
  coord_flip() +
  facet_wrap(~ year) +
  theme(legend.position="none") + 
  ggtitle("Proportion of FOs That Offer Service")

#Plot proportion of HHs using Services from any provider
hhsg$service_type <- factor(hhsg$service_type, levels = hhsg$service_type[order(hhsg$p_use)])

ggplot(hhsg, aes(x = service_type, y = p_use, fill = service_type)) + geom_bar(stat = "identity") + 
  coord_flip() +
  facet_wrap(~ year_collected) +
  theme(legend.position="none") + 
  ggtitle("Proportion of HHs using Service Overall")

#Plot proportion of HHs using Services from an FO
hhsg$service_type <- factor(hhsg$service_type, levels = hhsg$service_type[order(hhsg$p_use_fo)])

ggplot(hhsg, aes(x = service_type, y = p_use_fo, fill = service_type)) + geom_bar(stat = "identity") + 
  coord_flip() +
  facet_wrap(~ year_collected) +
  theme(legend.position="none") + 
  ggtitle("Proportion of HHs using Service From an FO")

#Melt data to see comparison
hhsm <- melt(hhsg, id.vars = 1:5)
hhsm$variable <- factor(hhsm$variable, levels = c("p_use_fo", "p_use"))

#Plot proportion of HHs using Services from an FO
ggplot(hhsm, aes(x = service_type, y = value, fill = variable)) + geom_bar(stat = "identity", position="dodge") + 
  coord_flip() +
  facet_wrap(~ year_collected) +
  ggtitle("Proportion of HHs using Service vs Using Directly From FO")