library(RSQLite)
library(plyr)
library(dplyr)
library(ggplot2)
library(reshape2)
Step 1. Load data source
#Connect to Database and get tables
con <- dbConnect(drv=RSQLite::SQLite(), dbname="Zambia_Database_Revision_6.sqlite3")
hh_serv <- dbGetQuery(conn=con, statement="SELECT * FROM hh_support_services")
fo_serv <- dbGetQuery(conn=con, statement="SELECT * FROM fo_services")
dbDisconnect(con)
## [1] TRUE
Step 2. Deep data cleaning
#################Clean fo_services############################
#Variables total_trained, women_trained, and men_trained need to be converted to numbers. Code Don't Know and -999 as NA
# replace don't know
fo_serc <- data.frame(
apply(fo_serv, 2, function(x) {
gsub(pattern = "’t", replacement = NA, x)
tolower(x)
}), stringsAsFactors = FALSE)
fo_serc <- fo_serc[, c(4, 5, 8:11)]
# convert numeric
fo_serc <- transform(fo_serc,
normalized_fo_code = as.factor(normalized_fo_code),
year_collected = as.factor(year_collected),
b3_VALUE = as.factor(b3_VALUE))
#Add Column Names
colnames(fo_serc) <- c("year", "fo", "service_type", "other_provide", "service", "fo_provide")
#create numeric varaible for if FO provided Service
fo_serc$fo_yes <- with(fo_serc,ifelse(fo_provide == "yes", 1 , NA))
#Create Summary Statistic for fo_servies proived, convert to proportion data for accurate comparison given unequal n
fo_serc <- group_by(fo_serc, year, service_type)
fosg <- summarise(fo_serc,
n = n(),
fo_yes = sum(fo_yes, na.rm = TRUE))
fosg$p_offer <- with(fosg,fo_yes/n)
###############Clean hh_services_offered########################
#Subset Variables
hh_serc <- select(hh_serv, year_collected, a2_VALUE, a2_b, a2_c, a2_d, a2_h, a2_f)
#Give readable var names
hh_serc <- rename(hh_serc, service_type = a2_VALUE, provider1 = a2_b, provider2 = a2_c,
provider3 = a2_d, provider4 = a2_h, service_used = a2_f)
#Clean provider name variables for Farmers Organization
hh_serc$provider1 <- mapvalues(hh_serc$provider1, from =c("Farmer Organisation","Farmer organization"),to =c("FO", "FO"))
hh_serc$provider2 <- mapvalues(hh_serc$provider2, from = c("Farmer Organisation","Farmer organization"),to = c("FO", "FO"))
hh_serc$provider3 <- mapvalues(hh_serc$provider3, from = c("Farmer organization","Farmer Organisation"),to = c("FO", "FO"))
hh_serc$provider4 <- mapvalues(hh_serc$provider4, from = c("Farmers? organization"),to = c("FO"))
#Create boolean nominal Variable if HH used service
hh_serc$hh_use <- with(hh_serc,ifelse(service_used == "Yes", 1 , 0))
#Create boolean nominal variable if HH used service and it was provided by FO
hh_serc$hh_use_fo <- with(hh_serc,ifelse(service_used == "Yes" &
(provider1 == "FO" | provider2 == "FO" | provider3 == "FO" | provider4 == "FO"), 1 , 0))
#Group By Year and Service Type for Analysis
hh_serc <- group_by(hh_serc, year_collected, service_type)
hhsg <- summarise(hh_serc,
n = n(),
hh_use = sum(hh_use, na.rm = TRUE),
hh_use_fo = sum(hh_use_fo, na.rm = TRUE))
hhsg$p_use <- with(hhsg, hh_use/n)
hhsg$p_use_fo <- with(hhsg, hh_use_fo/n)
Step 3. Plot Analysis
#Plot Services Offered by FO by year
fosg$service_type <- factor(fosg$service_type, levels = fosg$service_type[order(fosg$p_offer)])
ggplot(fosg, aes(x = service_type, y = p_offer, fill = service_type)) + geom_bar(stat = "identity") +
coord_flip() +
facet_wrap(~ year) +
theme(legend.position="none") +
ggtitle("Proportion of FOs That Offer Service")

#Plot proportion of HHs using Services from any provider
hhsg$service_type <- factor(hhsg$service_type, levels = hhsg$service_type[order(hhsg$p_use)])
ggplot(hhsg, aes(x = service_type, y = p_use, fill = service_type)) + geom_bar(stat = "identity") +
coord_flip() +
facet_wrap(~ year_collected) +
theme(legend.position="none") +
ggtitle("Proportion of HHs using Service Overall")

#Plot proportion of HHs using Services from an FO
hhsg$service_type <- factor(hhsg$service_type, levels = hhsg$service_type[order(hhsg$p_use_fo)])
ggplot(hhsg, aes(x = service_type, y = p_use_fo, fill = service_type)) + geom_bar(stat = "identity") +
coord_flip() +
facet_wrap(~ year_collected) +
theme(legend.position="none") +
ggtitle("Proportion of HHs using Service From an FO")

#Melt data to see comparison
hhsm <- melt(hhsg, id.vars = 1:5)
hhsm$variable <- factor(hhsm$variable, levels = c("p_use_fo", "p_use"))
#Plot proportion of HHs using Services from an FO
ggplot(hhsm, aes(x = service_type, y = value, fill = variable)) + geom_bar(stat = "identity", position="dodge") +
coord_flip() +
facet_wrap(~ year_collected) +
ggtitle("Proportion of HHs using Service vs Using Directly From FO")
