LoadingData

Shiny Dashboard for BGAK Banana Survey Written by Leon Kim for Keystone Accountability July 22, 2015

# Required libraries
require(plyr)
require(rmongodb)

Initialize MongoDB and load data into R. Our dataset is small, so we will load the data into R (using up RAM).If using big dataset, consider querying variable from MongoDB without loading to R.

Query questions individually!

data_list <- mongo.find.all(mongo=mongo.create(db = "fbc", host = "50.97.79.169:29883", username = "fbcommons_user", password = "&sm1rKs4A"), 
                  ns="fbc.results", 
                  query='{"survey_id": 2199089}', # SurveyGizmo survey id
                  field= list(       # Rmongodb shorthand for projection query
                            q631=1L, q590=1L, # COmmon Interest Group
                            q539=1L,          # County
                            q603=1L, q604=1L, # Sex and Age, respectively
                            q534=1L,          # Number of dairy cows
                            q605=1L, q606=1L, # Farm size and Land under bananas, repctively
                            q633=1L,          # Question 1.1: Were you informed about market day on time
                            q634=1L,          # Question 1.2: Was it easy for you take your bananas to aggregation centre
                            q635=1L,          # Question 1.3: Do you feel that the group negotiated a fair/good price with the buyer
                            q636=1L,          # Question 1.4: Were you paid on time as you expected
                            q637=1L,          # Question 1.5: Were you paid in full as expected
                            q638=1L,          # Question 2: How did you transportyour produce
                            q639=1L,          # Question 3: Were you able to sell all the bananas that you expected
                            q640=1L,          # Question 4: if no to Question 4, select reason
                            q641=1L,           # Question 5: Recommoned Likelihood
                            q642=1L          # Question 6: Reasons for question 5 ranking
                          )
                  )
 
#data_list element is document (one survey)
# ASSUMPTION: value of each data_list element is length of 1 (i.e. data_list is not nested)

df_from_db <- data.frame(matrix(unlist(data_list), nrow = length(data_list), 
    byrow = T))
colnames(df_from_db) <- names(data_list[[1]])  # assumes non-empty query

Renaming the questions to recognizable variable names using plyr

df_from_db <- rename(df_from_db, 
                     replace= c("q631"="cig", 
                                "q590"="cig1",
                                "q539"="county",
                                "q603"="sex", 
                                "q604"="age",
                                "q534"="numDairyCows",
                                "q605"="sizeFarm",
                                "q606"="sizeBananaFarm",
                                "q633"="Informed on time",          # Q1.1 
                                "q634"="Easy to transport bananas", # Q1.2 
                                "q635"="Fair price negotiated",     # Q1.3
                                "q636"="Paid on time",              # Q1.4
                                "q637"="Paid in full",              # Q1.5
                                "q638"="Q2", 
                                "q639"="Q3",
                                "q640"="Q4", 
                                "q641"="Q5", 
                                "q642"="Q6"
                                )
                      )

Data cleaning

Manual editing of cig1 error: “KIBANGA” -> “KIBANA”

levels(df_from_db$cig1)[levels(df_from_db$cig1) == "KIBANGA"] <- "KIBANA"

Function for Getting rid of leading or trailing whitespace with string answers

trim <- function(x) gsub("^\\s+|\\s+$", "", x)

Combine two CIG labels into one cig

tmp <- apply(subset(df_from_db, select = colnames(df_from_db) == "cig" | colnames(df_from_db) == 
    "cig1"), 1, function(x) {
    if (trim(toupper(toString(x[1]))) == trim(toupper(toString(x[2])))) {
        # if two cigs are same
        return(trim(toupper(toString(x[1]))))  # just return one of them
    }
    if (trim(toString(x[1])) == "") {
        return(trim(toupper(toString(x[2]))))  # if first one is empty, return the second one
    }
    if (trim(toString(x[2])) == "") {
        return(trim(toupper(toString(x[1]))))  # if secondone is empty, return the second one
    }
    # if above doesnt solve the problem, return them (for decoding)
    # return(paste(toString(x[1]),toString(x[2]),sep='|'))
    
    # if above doesnt solve the problem
    return("")
})
# add combined cig.tmp as cig and take out two pre-existing columns
df <- subset(df_from_db, select = -c(cig, cig1))
df <- data.frame(df, cig = tmp)

Missing county ==> NA

This is fixed by imputation… because surveys are in groups, if the missing county is within a chunk of rows with same county names, assume it is indeed that county.

for (i in 1:length(df$county)) {
    # skip the first two and last two
    if (i <= 2) {
        next
    }
    if (i >= length(df$county) - 2) {
        next
    }
    if (df$county[i] == "") {
        if (df$county[i - 1] == df$county[i + 1]) {
            # if one before and after are the same
            if (df$county[i - 2] == df$county[i + 1] | df$county[i - 1] == df$county[i + 
                2]) {
                df$county[i] <- df$county[i - 1]  # missing value is part of the chunk
            }
        }
    }
}
df$county <- ifelse(df$county == "", NA, as.character(df$county))

Common Interest Group : missing Common Interest Group ==> NA; Same logical as in (4)

for (i in 1:length(df$cig)) {
    # skip the first two and last two
    if (i <= 2) {
        next
    }
    if (i >= length(df$cig) - 2) {
        next
    }
    if (df$cig[i] == "") {
        if (df$cig[i - 1] == df$cig[i + 1]) {
            # if one before and after are the same
            if (df$cig[i - 2] == df$cig[i + 1] | df$cig[i - 1] == df$cig[i + 
                2]) {
                df$cig[i] <- df$cig[i - 1]  # missing value is part of the chunk
            }
        }
    }
}
rm(i)
df$cig <- ifelse(df$cig == "", NA, as.character(df$cig))

Gender

df$sex <- factor(df$sex, label = c("", "FEMALE", "MALE"))
df$sex <- ifelse(df$sex == "", NA, as.character(df$sex))

Farm Size

df$sizeFarm <- as.character(df$sizeFarm)
df$sizeFarm <- tolower(df$sizeFarm)

# Remove texts from farm size
df$sizeFarm <- gsub("point \\(", "\\.", df$sizeFarm)
df$sizeFarm <- gsub("\\)", "", df$sizeFarm)
df$sizeFarm <- gsub("[a-z]", "", df$sizeFarm)
df$sizeFarm <- trim(df$sizeFarm)  # trim trailing and leading space

# Convert text to formula and evaluate df$sizeFarm <- gsub('
# ','+',df$sizeFarm) df$sizeFarm <- round(sapply(df$sizeFarm,
# function(txt){eval(parse(text=txt))}),2)

Wealth ~ Number of Dairy Cows Relationship

# ASSUMPTION: There are no fractional cows. Coercing the values to numeric
# is good enough

df$numDairyCows <- as.character(df$numDairyCows)
df$numDairyCows <- as.numeric(df$numDairyCows)

## Warning: NAs introduced by coercion

df$numDairyCows <- ifelse(df$numDairyCows == -1, NA, df$numDairyCows)
# #
df$wealth <- as.character(df$numDairyCows)
df$wealth[which(is.na(df$wealth))] <- "Unknown"
df$wealth[which(df$wealth == "0")] <- "Poorer"
df$wealth[which(df$wealth == "1")] <- "Middle"
df$wealth[which(df$wealth == "2")] <- "Middle"
df$wealth[-which(df$wealth %in% c("Unknown", "Poorer", "Middle"))] <- "Richer"
df$wealth <- as.factor(df$wealth)

Question 1

# Question 1.1: Were you informed about market day on time Question 1.2: Was
# it easy for you take your bananas to aggregation centre Question 1.3: Do
# you feel that the group negotiated a fair/good price with the buyer
# Question 1.4: Were you paid on time as you expected Question 1.5: Were you
# paid in full as expected

# -1 => 'UNKNOWN', 0 => 'YES', 1 => 'NO'

levels(df$Informed.on.time) <- c("UNKNOWN", "YES", "NO")
levels(df$Easy.to.transport.bananas) <- c("UNKNOWN", "YES", "NO")
levels(df$Fair.price.negotiated) <- c("UNKNOWN", "YES", "NO")
levels(df$Paid.on.time) <- c("UNKNOWN", "YES", "NO")
levels(df$Paid.in.full) <- c("UNKNOWN", "YES", "NO")

Question 2,3,4

Recoding levels.

# ------------------ Question 2
levels(df$Q2) <- c("Unknown", "bicycle", "Motor cycle", "Animal drawn cart", 
    "Hired pickup truck", "My own motor vehicle")

# ------------------ Question 3
levels(df$Q3) <- c("UNKNOWN", "YES", "NO")

# ------------------ Question 4
levels(df$Q4) <- c("Unknown", "The buyer did not buy all the stock that we brought", 
    "The buyer was not satisfied with the quality", "Damaged during transportation", 
    "When I arrived the buyer had already completed purchases", "Other reasons")
# ------------------

Question 5 into NPS categories

df$Q5 <- factor(df$Q5, ordered = T, levels = c("-1", "0", "1", "2", "3", "4", 
    "5", "6", "7", "8", "9", "10"))


DPP_transform <- function(x) {
    # x: vector of likelihood to recommend in 0 - 10 scale returns vector of c(%
    # of Detractors, % of Passives, % of Promoters)
    DPP <- rep(NA, length(x))
    x <- as.numeric(as.character(x))
    x <- ifelse(x == -1, NA, x)
    
    # ---- LEON'S ----------------------- DPP[0 <= x & x <= 6] <- 'Negative'
    # DPP[7 <= x & x <= 8] <- 'Neutral' DPP[9 <= x & x <= 10] <- 'Positive'
    
    # ---- Ahmed'S -----------------------
    DPP[0 <= x & x <= 4] <- "Negative"
    DPP[5 <= x & x <= 6] <- "Neutral"
    DPP[7 <= x & x <= 10] <- "Positive"
    
    DPP <- factor(DPP, levels = c("Negative", "Neutral", "Positive"), order = T)
    return(DPP)
}

df <- data.frame(df, DPP = DPP_transform(df$Q5))

Remove Duplicate Rows

df <- df[!duplicated(df[, -1]), ]
df.orig <- df
rm(df)

Getting average NPS for the whole dataset (???)

# ... unfortunately we need to double the size of the dataset to achieve
# this do this for each var1

df.orig.2 <- df.orig
df.orig.2$cig <- rep("AVERAGE", nrow(df.orig))
df.orig.2$county <- rep("AVERAGE", nrow(df.orig))
df.orig <- rbind(df.orig.2, df.orig)
write.csv(df.orig, "bgak_farmers_table.csv")
rm(df.orig)
rm(df.orig.2)
rm(DPP_transform)
rm(df_from_db)
rm(data_list)
rm(tmp)
rm(trim)

LoadingData

Ahmed Tadde

September 27, 2015

Query questions individually!

Data cleaning