Shiny Dashboard for BGAK Banana Survey Written by Leon Kim for Keystone Accountability July 22, 2015
# Required libraries
require(plyr)
require(rmongodb)
Initialize MongoDB and load data into R. Our dataset is small, so we will load the data into R (using up RAM).If using big dataset, consider querying variable from MongoDB without loading to R.
data_list <- mongo.find.all(mongo=mongo.create(db = "fbc", host = "50.97.79.169:29883", username = "fbcommons_user", password = "&sm1rKs4A"),
ns="fbc.results",
query='{"survey_id": 2199089}', # SurveyGizmo survey id
field= list( # Rmongodb shorthand for projection query
q631=1L, q590=1L, # COmmon Interest Group
q539=1L, # County
q603=1L, q604=1L, # Sex and Age, respectively
q534=1L, # Number of dairy cows
q605=1L, q606=1L, # Farm size and Land under bananas, repctively
q633=1L, # Question 1.1: Were you informed about market day on time
q634=1L, # Question 1.2: Was it easy for you take your bananas to aggregation centre
q635=1L, # Question 1.3: Do you feel that the group negotiated a fair/good price with the buyer
q636=1L, # Question 1.4: Were you paid on time as you expected
q637=1L, # Question 1.5: Were you paid in full as expected
q638=1L, # Question 2: How did you transportyour produce
q639=1L, # Question 3: Were you able to sell all the bananas that you expected
q640=1L, # Question 4: if no to Question 4, select reason
q641=1L, # Question 5: Recommoned Likelihood
q642=1L # Question 6: Reasons for question 5 ranking
)
)
#data_list element is document (one survey)
# ASSUMPTION: value of each data_list element is length of 1 (i.e. data_list is not nested)
df_from_db <- data.frame(matrix(unlist(data_list), nrow = length(data_list),
byrow = T))
colnames(df_from_db) <- names(data_list[[1]]) # assumes non-empty query
Renaming the questions to recognizable variable names using plyr
df_from_db <- rename(df_from_db,
replace= c("q631"="cig",
"q590"="cig1",
"q539"="county",
"q603"="sex",
"q604"="age",
"q534"="numDairyCows",
"q605"="sizeFarm",
"q606"="sizeBananaFarm",
"q633"="Informed on time", # Q1.1
"q634"="Easy to transport bananas", # Q1.2
"q635"="Fair price negotiated", # Q1.3
"q636"="Paid on time", # Q1.4
"q637"="Paid in full", # Q1.5
"q638"="Q2",
"q639"="Q3",
"q640"="Q4",
"q641"="Q5",
"q642"="Q6"
)
)
levels(df_from_db$cig1)[levels(df_from_db$cig1) == "KIBANGA"] <- "KIBANA"
trim <- function(x) gsub("^\\s+|\\s+$", "", x)
tmp <- apply(subset(df_from_db, select = colnames(df_from_db) == "cig" | colnames(df_from_db) ==
"cig1"), 1, function(x) {
if (trim(toupper(toString(x[1]))) == trim(toupper(toString(x[2])))) {
# if two cigs are same
return(trim(toupper(toString(x[1])))) # just return one of them
}
if (trim(toString(x[1])) == "") {
return(trim(toupper(toString(x[2])))) # if first one is empty, return the second one
}
if (trim(toString(x[2])) == "") {
return(trim(toupper(toString(x[1])))) # if secondone is empty, return the second one
}
# if above doesnt solve the problem, return them (for decoding)
# return(paste(toString(x[1]),toString(x[2]),sep='|'))
# if above doesnt solve the problem
return("")
})
# add combined cig.tmp as cig and take out two pre-existing columns
df <- subset(df_from_db, select = -c(cig, cig1))
df <- data.frame(df, cig = tmp)
This is fixed by imputation… because surveys are in groups, if the missing county is within a chunk of rows with same county names, assume it is indeed that county.
for (i in 1:length(df$county)) {
# skip the first two and last two
if (i <= 2) {
next
}
if (i >= length(df$county) - 2) {
next
}
if (df$county[i] == "") {
if (df$county[i - 1] == df$county[i + 1]) {
# if one before and after are the same
if (df$county[i - 2] == df$county[i + 1] | df$county[i - 1] == df$county[i +
2]) {
df$county[i] <- df$county[i - 1] # missing value is part of the chunk
}
}
}
}
df$county <- ifelse(df$county == "", NA, as.character(df$county))
for (i in 1:length(df$cig)) {
# skip the first two and last two
if (i <= 2) {
next
}
if (i >= length(df$cig) - 2) {
next
}
if (df$cig[i] == "") {
if (df$cig[i - 1] == df$cig[i + 1]) {
# if one before and after are the same
if (df$cig[i - 2] == df$cig[i + 1] | df$cig[i - 1] == df$cig[i +
2]) {
df$cig[i] <- df$cig[i - 1] # missing value is part of the chunk
}
}
}
}
rm(i)
df$cig <- ifelse(df$cig == "", NA, as.character(df$cig))
df$sex <- factor(df$sex, label = c("", "FEMALE", "MALE"))
df$sex <- ifelse(df$sex == "", NA, as.character(df$sex))
df$sizeFarm <- as.character(df$sizeFarm)
df$sizeFarm <- tolower(df$sizeFarm)
# Remove texts from farm size
df$sizeFarm <- gsub("point \\(", "\\.", df$sizeFarm)
df$sizeFarm <- gsub("\\)", "", df$sizeFarm)
df$sizeFarm <- gsub("[a-z]", "", df$sizeFarm)
df$sizeFarm <- trim(df$sizeFarm) # trim trailing and leading space
# Convert text to formula and evaluate df$sizeFarm <- gsub('
# ','+',df$sizeFarm) df$sizeFarm <- round(sapply(df$sizeFarm,
# function(txt){eval(parse(text=txt))}),2)
# ASSUMPTION: There are no fractional cows. Coercing the values to numeric
# is good enough
df$numDairyCows <- as.character(df$numDairyCows)
df$numDairyCows <- as.numeric(df$numDairyCows)
## Warning: NAs introduced by coercion
df$numDairyCows <- ifelse(df$numDairyCows == -1, NA, df$numDairyCows)
# #
df$wealth <- as.character(df$numDairyCows)
df$wealth[which(is.na(df$wealth))] <- "Unknown"
df$wealth[which(df$wealth == "0")] <- "Poorer"
df$wealth[which(df$wealth == "1")] <- "Middle"
df$wealth[which(df$wealth == "2")] <- "Middle"
df$wealth[-which(df$wealth %in% c("Unknown", "Poorer", "Middle"))] <- "Richer"
df$wealth <- as.factor(df$wealth)
# Question 1.1: Were you informed about market day on time Question 1.2: Was
# it easy for you take your bananas to aggregation centre Question 1.3: Do
# you feel that the group negotiated a fair/good price with the buyer
# Question 1.4: Were you paid on time as you expected Question 1.5: Were you
# paid in full as expected
# -1 => 'UNKNOWN', 0 => 'YES', 1 => 'NO'
levels(df$Informed.on.time) <- c("UNKNOWN", "YES", "NO")
levels(df$Easy.to.transport.bananas) <- c("UNKNOWN", "YES", "NO")
levels(df$Fair.price.negotiated) <- c("UNKNOWN", "YES", "NO")
levels(df$Paid.on.time) <- c("UNKNOWN", "YES", "NO")
levels(df$Paid.in.full) <- c("UNKNOWN", "YES", "NO")
Recoding levels.
# ------------------ Question 2
levels(df$Q2) <- c("Unknown", "bicycle", "Motor cycle", "Animal drawn cart",
"Hired pickup truck", "My own motor vehicle")
# ------------------ Question 3
levels(df$Q3) <- c("UNKNOWN", "YES", "NO")
# ------------------ Question 4
levels(df$Q4) <- c("Unknown", "The buyer did not buy all the stock that we brought",
"The buyer was not satisfied with the quality", "Damaged during transportation",
"When I arrived the buyer had already completed purchases", "Other reasons")
# ------------------
df$Q5 <- factor(df$Q5, ordered = T, levels = c("-1", "0", "1", "2", "3", "4",
"5", "6", "7", "8", "9", "10"))
DPP_transform <- function(x) {
# x: vector of likelihood to recommend in 0 - 10 scale returns vector of c(%
# of Detractors, % of Passives, % of Promoters)
DPP <- rep(NA, length(x))
x <- as.numeric(as.character(x))
x <- ifelse(x == -1, NA, x)
# ---- LEON'S ----------------------- DPP[0 <= x & x <= 6] <- 'Negative'
# DPP[7 <= x & x <= 8] <- 'Neutral' DPP[9 <= x & x <= 10] <- 'Positive'
# ---- Ahmed'S -----------------------
DPP[0 <= x & x <= 4] <- "Negative"
DPP[5 <= x & x <= 6] <- "Neutral"
DPP[7 <= x & x <= 10] <- "Positive"
DPP <- factor(DPP, levels = c("Negative", "Neutral", "Positive"), order = T)
return(DPP)
}
df <- data.frame(df, DPP = DPP_transform(df$Q5))
df <- df[!duplicated(df[, -1]), ]
df.orig <- df
rm(df)
# ... unfortunately we need to double the size of the dataset to achieve
# this do this for each var1
df.orig.2 <- df.orig
df.orig.2$cig <- rep("AVERAGE", nrow(df.orig))
df.orig.2$county <- rep("AVERAGE", nrow(df.orig))
df.orig <- rbind(df.orig.2, df.orig)
write.csv(df.orig, "bgak_farmers_table.csv")
rm(df.orig)
rm(df.orig.2)
rm(DPP_transform)
rm(df_from_db)
rm(data_list)
rm(tmp)
rm(trim)