'Research Assistant Code Challenge'
## [1] "Research Assistant Code Challenge"
'Catherine Peng'
## [1] "Catherine Peng"
'November 2018'
## [1] "November 2018"
# ===========================================================================
# SETUP 
# ===========================================================================
# Clear all objects
rm(list = ls(all = TRUE))
# Set seed
set.seed(7482)
# Load packages
library(labelled)
## 
## LABELLED 2.0.0: BREAKING CHANGE
## 
## Following version 2.0.0 of `haven`, `labelled()` and `labelled_spss()` now produce objects with class 'haven_labelled' and 'haven_labelled_spss', due to conflict between the previous 'labelled' class and the 'labelled' class used by `Hmisc`.
## 
## A new function `update_labelled()` could be used to convert data imported with an older version of `haven`/`labelled` to the new classes.
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.4
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
library(magrittr)
library(ggplot2)
library(stringr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Generate fake data set
df <- data.frame(
  # Respondent ID
  respondent_id = 1:1000,
  # Survey weights
  weights = runif(n = 1000, min = 0.5, max = 1.5),
  # Outcome
  Y1 = sample(x = c(2, 1, -1, -2, -77, -99, -88), size = 1000, replace = TRUE, 
              prob = c(0.2, 0.5, 0.4, 0.2, 0.05, 0.1, 0.1)),
  Y2 = sample(x = c(1, 2, 3, -77, -99, -88), size = 1000, replace = TRUE, 
              prob = c(0.2, 0.5, 0.4, 0.05, 0.05, 0.1))
)
df$Y1 <- labelled(x = df$Y1, labels = c("Strongly agree" = 2, 
                                        "Agree" = 1, 
                                        "Disagree" = -1, 
                                        "Strongly disagree" = -2, 
                                        "Not shown" = -77, "I don't know" = -99, "Missing" = -88))

df$Y2 <- labelled(x = df$Y2, labels = c("High" = 3, 
                                        "Medium" = 2, 
                                        "Low" = 1, 
                                        "Not shown" = -77,"I don't know" = -99, "Missing" = -88))
# ===========================================================================
# FUNCTIONS BY BAOBAO (grouped together for personal clarity)
# ===========================================================================
# Function to relabel variables
relabel_var <- function(old_var, old_labels, new_labels) {
  new_var <- rep(NA, length(old_var))
  if (is.factor(old_var)) {
    old_var <- as.character(old_var)
  }
  for (i in 1:length(old_labels)) {
    new_var[old_var == old_labels[i]] <- new_labels[i]
  }
  return(new_var)
}
# Trailing zeros rounding function
roundfunc <- function(x,
                      round_digits = 2,
                      lessthan = TRUE) {
  if (lessthan) {
    temp <- ifelse(x > 0 & round(x, round_digits) == 0,
                   paste0("<0.", rep(0, (round_digits - 1)), 1),
                   sprintf(paste0("%.", round_digits, "f"), round(x, round_digits)))
    temp <- ifelse(x < 0 & round(x, round_digits) == 0,
                   paste0(">-0.", rep(0, (round_digits - 1)), 1),
                   temp)
    temp[x == 0] <- 0
    return(temp)
  } else {
    return(sprintf(paste0("%.", round_digits, "f"), round(x, round_digits)))
  }
}
# Function to summarize categorical variables
catvar_func <-
  function(outcome, # outcome name
           outcome_var, # numeric outcome variable
           label_var, # labelled outcome variable 
           num_missing, # numerical value for missing or skipped
           num_DK, # numeric value for don't know
           shown, # variable for whether the question was shown to respondents
           output_type, # num_outcome = output clean data; value_table = frequency table; value_sum = mean/SE/N
           new_values, # new numerical values
           edit_labels = TRUE, # TRUE allows us to edit the value labels
           survey_weights, # survey weights
           missing_recode # the recode value for missing/skipped/don't know responses 
  ) {
    # Clean data to make the bar chart
    # Get the value labels
    value_labels <- as.data.frame(val_labels(label_var))
    value_labels$labels <- row.names(value_labels)
    names(value_labels)[1] <- "num"
    row.names(value_labels) <- NULL
    
    # Make data frame for the new values
    new_values_table <- data.frame(labels = value_labels$labels, new_values = new_values)
    
    # Make the frequency table
    sum_func <- function(outcome_var, value, survey_weights) {
      se_md <- lm(outcome_var[shown] == value ~ 1, 
                  weights = survey_weights[shown])
      out <- coeftest(se_md, vcov = vcovHC(se_md, type="HC2"))
      return(data.frame(num = value, 
                        Freq = sum(outcome_var[shown] == value),
                        Prop = as.numeric(out[1]), 
                        se = as.numeric(out[2])))
    }
    value_table <- do.call(rbind, lapply(value_labels$num, sum_func,
                                         outcome_var = outcome_var,
                                         survey_weights = survey_weights))
    # Merge the frequency table with the value labels
    value_table <-
      merge(x = value_table, y = value_labels, all.y = TRUE)
    value_table$group <-
      ifelse(value_table$num %in% c(num_missing, num_DK),
             "Don't know/Missing",
             "Responses")
    value_table$group <-
      factor(value_table$group,
             levels = c("Responses",
                        "Don't know/Missing"))
    value_table$labels <- capitalize(value_table$labels)
    value_table$outcome <- outcome
    value_table <- merge(x = value_table, y = new_values_table, all.x = TRUE)
    if (edit_labels) {
      value_table$labels <- ifelse(
        value_table$group == "Responses",
        paste0(value_table$new_values, ". ",
               value_table$labels),
        value_table$labels
      )
    }
    # Remove the not asked
    value_table <- value_table[!grepl(pattern = "not asked|not shown", 
                                      value_table$labels, 
                                      ignore.case = TRUE),]
    # Get the summary statistics
    num_outcome <- as.numeric(outcome_var[shown])
    survey_weights <- survey_weights[shown]
    num_outcome <-
      relabel_var(
        old_var = num_outcome,
        old_labels = value_table$num,
        new_labels = value_table$new_values
      )
    num_outcome_missing <- is.na(num_outcome)
    num_outcome[is.na(num_outcome)] <- missing_recode
    # Get the percent missing
    percent_missing <-
      sum(num_outcome_missing) / length(num_outcome_missing)
    # Get the mean and se
    md <- if (percent_missing > 0.1) {
      # If more than 10 percent is missing, then we condition on normalized dummy variable for missingness
      se_md <- lm(num_outcome ~ scale(num_outcome_missing),
                  weights = survey_weights)
      coeftest(se_md, vcov = vcovHC(se_md, type="HC2"))[1,]
    } else {
      se_md <- lm(num_outcome ~ 1,
                  weights = survey_weights)
      coeftest(se_md, vcov = vcovHC(se_md, type="HC2"))
    }
    # Put the summary statistics together
    value_sum <-
      data.frame(
        outcome = outcome,
        num = md[1],
        se = md[2],
        group = "Responses",
        sum_stat = paste0("Mean: ", roundfunc(md[1]), 
                          " (MOE: +/-", roundfunc(qnorm(0.975)*md[2]),
                          "); N = ",
                          sum(shown)),
        N = sum(shown)
      )
    if (output_type == "num_outcome") {
      return(num_outcome)
    } else if (output_type == "value_table") {
      return(value_table)
    } else {
      return(value_sum)
    }
  }
# ===========================================================================
# Y1 Output (used as a model for Y2 output)
# ===========================================================================
# Output for Y1
Y1_func <- function(output_type, dk_plot = NULL, skipped_plot = NULL) {
  out <- catvar_func(
    outcome = "Y1",
    outcome_var = as.numeric(df$Y1),
    label_var = df$Y1,
    output_type = output_type,
    shown = df$Y1 != -77,
    num_missing = -88,
    num_DK = -99,
    new_values <- c(2, 1, -1, -2, NA, NA, NA),
    survey_weights = df$weights, 
    missing_recode = mean(df$Y1[!df$Y1 %in% c(-77, -99, -88)]) # mean impute the don't know and missings
  )  
  # For frequency tables, change the position of the "num" variable for plotting 
  if (output_type == "value_table") {
    out$num[grep(pattern = "don't know", x = out$labels, ignore.case = TRUE)] <- dk_plot
    out$num[grep(pattern = "skipped|missing", x = out$labels, ignore.case = TRUE)] <- skipped_plot
  }
  return(out)
}
# Note that Baobao changed the position of the don't know and skipped responses
# *this means that when graphed, don't know/skipped responses are grouped to the right, 
# away from the other response types 

Y1_value_table <- Y1_func("value_table", dk_plot = 8, skipped_plot = 9) 
Y1_value_sum <- Y1_func("value_sum")
# ===========================================================================
# Y1 graph (used as a model for Y2 graph)
# with comments on my slight personal adjustments
# ===========================================================================
ggplot() +
  # Bar graph
  geom_bar(data = Y1_value_table, aes(x = num, y = Prop), 
           stat = "identity",
           fill = "grey70") +
  # Error bars
  geom_errorbar(data = 
                  Y1_value_table[Y1_value_table$Prop !=0,], 
                aes(x = num, ymin = Prop + qnorm(0.025)*se,
                    ymax = Prop + qnorm(0.975)*se), width = 0.1) +
  #-----------------------------------------------------------------------
  # Add in the percentage texts 

  # MODIFICATION 1: DELETE "NUDGE_X = 0.25"
  # the original code includes "nudge_x = 0.25" after y = 0.02
  # after running the code, however, the following error message is produced: 
  #"ERROR POSITION_NUDGE REQUIRES FOLLOWING MISSING AESTHETICS:y" 
  # Perhaps you meant to reposition x by including "nudge_x = 0.25"
  # however, aes() function already places the cleaned proportion number
  # under the matching "response" type using variable "num" provided 
  # in the value table. 

  # MODIFICATION 2: REFORMAT THE PERCENT LABEL TO INCLUDE % (PERCENT SYMBOL)
  # The Y axis is formatted in percents. For visual consistency, I converted
  # the cleaned percent into a string and pasted % to the end so that the 
  # label for each response type now reads: <rounded percent>%

  # MODIFICATION 3: REFORMAT THE Y POSITION OF THE PERCENT LABEL
  # y = 0.02 within geom_text() places the label inside the bar.
  # However, since the bar is grey, this makes text hard to read.
  # I have now placed the label in the white space directly under each bar.

  # deleted nudge_x and the graph works well
  geom_text(data = Y1_value_table, 
            aes(x = num,label = paste(as.character(roundfunc(Prop*100, 0)),"%", sep = "")), 
            y = -0.01) +
  
  #babao's code now continues normally 
  #-----------------------------------------------------------------------
  # Beautify the x-axis
  scale_x_continuous(breaks = Y1_value_table$num[order(Y1_value_table$num)],
                     labels = str_wrap(Y1_value_table$labels[order(Y1_value_table$num)], 
                                       width = 15)) +
  # Break up the responses by type
  facet_grid(~group, scales = "free_x", space = "free_x") + theme_bw() +
  # Add in the numerical summary statistics
  geom_text(data = Y1_value_sum, aes(x = 0, label = sum_stat,
                                     y = max(Y1_value_table$Prop)+0.05)) +
  # Change the scale to percentage points
  scale_y_continuous(labels = scales::percent, 
                     limits = c(0, max(Y1_value_table$Prop)+0.05)) +
  # Label the axis and add in the caption 
  xlab("Responses") + ylab("Percentage of respondents") + 
  labs(caption = "Source: Governance of AI Program")

# ===========================================================================
# RESPONSES TO CODING CHALLENGES
# We now begin the direct assignments 
# ===========================================================================

# ===========================================================================
# Q1: Recode the `Y2` variable such that "High" = 2, "Medium" = 1, "Low" = 0.
# ===========================================================================
# create list of old labels for Y2 using c()
old_labels <- c("High" = 3, "Medium" = 2, "Low" = 1, 
                "Not shown" = -77, "I don't know" = -99, "Missing" = -88)

# create list of new  labels for Y2 using c()
new_labels <- c("High" = 2, "Medium" = 1, "Low" = 0, 
                "Not shown" = -77, "I don't know" = -99, "Missing" = -88) 

# use arguments in relabel_var function to change VALUES of Y2
# replace old Y2 values with new Y2 values 
df$Y2 <- relabel_var(df$Y2, old_labels, new_labels)

# use labelled function to make sure the new VALUES of Y2 match new LABELS
df$Y2 <- labelled(x = df$Y2, 
                  labels <- new_labels)

# check to see if changes have taken place properly 
describe(df$Y2)
## df$Y2 : 2 df$Y2 : 1 df$Y2 : 0 df$Y2 : -77 df$Y2 : -99 df$Y2 : -88 
##        n  missing distinct     Info     Mean      Gmd 
##     1000        0        6    0.892   -10.83    21.47 
##                                               
## Value        -99   -88   -77     0     1     2
## Frequency     26    74    36   135   426   303
## Proportion 0.026 0.074 0.036 0.135 0.426 0.303
# frequencies match frequences of original values! 
# this means that values have been recoded properly

# ========================================================================
# Q2: Use the code above to generate a similar graph for the `Y2` variable
# ========================================================================
#(1)---------------------------------------------------------------------
# create Y2 output function based on Y1 output function 
# replace areas where "Y1" is hard coded with "Y2" and replace 
# Y1 values with Y2 values, etc. 

Y2_func <- function(output_type, dk_plot = NULL, skipped_plot = NULL) {
  out <- catvar_func(
    outcome = "Y2",
    outcome_var = as.numeric(df$Y2),
    label_var = df$Y2,
    output_type = output_type,
    shown = df$Y2 != -77,
    num_missing = -88,
    num_DK = -99,
    
    # assign Y2 values to new_values 
    
    # notice that it is also possible to change labels of Y2 within the output function ! 
    # as such, another way to "recode" Y2 is to skip the modifications I coded in Question 1. 
    # You can keep the original values of Y2 and recode using new_values <- c(2, 1, 0, NA, NA, NA)
    
    new_values <- c(2, 1, 0, NA, NA, NA),
    survey_weights = df$weights, 
    missing_recode = mean(df$Y2[!df$Y2 %in% c(-77, -99, -88)]) # mean impute the don't know and missings
  )  
  # For frequency tables, change the position of the "num" variable for plotting 
  if (output_type == "value_table") {
    out$num[grep(pattern = "don't know", x = out$labels, ignore.case = TRUE)] <- dk_plot
    out$num[grep(pattern = "skipped|missing", x = out$labels, ignore.case = TRUE)] <- skipped_plot
  }
  return(out)
}
# Note that Baobao changed the position of the don't know and skipped responses
# *this means that when graphed, don't know/skipped responses are grouped to the right, 
# away from the other response types 

#label Y2 output in the same convention/style as Y1 for consistency! 
Y2_value_table <- Y2_func("value_table", dk_plot = 8, skipped_plot = 9) 
Y2_value_sum <- Y2_func("value_sum")

#(2)---------------------------------------------------------------------
# Using Y2 outputs, we are ready to create Y2 graph 
# Replace hardcode of "Y1" with "Y2"
# other modifications are also noted in commented portions !

ggplot() +
  # Bar graph
  geom_bar(data = Y2_value_table, aes(x = num, y = Prop), 
           stat = "identity",
           fill = "grey70") +
  # Error bars
  geom_errorbar(data = 
                  Y2_value_table[Y2_value_table$Prop !=0,], 
                  aes(x = num, ymin = Prop + qnorm(0.025)*se,
                    ymax = Prop + qnorm(0.975)*se), width = 0.1) +
  #-----------------------------------------------------------------------
# Add in the percentage texts 

# MODIFICATION 1: DELETE "NUDGE_X = 0.25"
# the original code includes "nudge_x = 0.25" after y = 0.02
# after running the code, however, the following error message is produced: 
#"ERROR POSITION_NUDGE REQUIRES FOLLOWING MISSING AESTHETICS:y" 
# Perhaps you meant to reposition x by including "nudge_x = 0.25"
# however, aes() function already places the cleaned proportion number
# under the matching "response" type using variable "num" provided 
# in the value table. 

# MODIFICATION 2: REFORMAT THE PERCENT LABEL TO INCLUDE % (PERCENT SYMBOL)
# The Y axis is formatted in percents. For visual consistency, I converted
# the cleaned percent into a string and pasted % to the end so that the 
# label for each response type now reads: <rounded percent>%

# MODIFICATION 3: REFORMAT THE Y POSITION OF THE PERCENT LABEL
# y = 0.02 within geom_text() places the label inside the bar.
# However, since the bar is grey, this makes text hard to read.
# I have now placed the label in the white space directly under each bar.

  geom_text(data = Y2_value_table, 
          aes(x = num,label = paste(as.character(roundfunc(Prop*100, 0)),"%", sep = "")), 
          y = -0.01) +
  
  #babao's code now continues normally 
  #-----------------------------------------------------------------------
  # Beautify the x-axis
  scale_x_continuous(breaks = Y2_value_table$num[order(Y2_value_table$num)],
                   labels = str_wrap(Y2_value_table$labels[order(Y2_value_table$num)], 
                                     width = 15)) +
  # Break up the responses by type
  facet_grid(~group, scales = "free_x", space = "free_x") + theme_bw() +
  
  #-----------------------------------------------------------------------
  # Add in the numerical summary statistics
  # center the placement of the sum_stat label over relevant bars 
  # 1 is the median value of the "Responses" bars (not the don't know/missing types)
  
  geom_text(data = Y2_value_sum, aes(x = 1, label = sum_stat,
                                     y = max(Y2_value_table$Prop)+0.05)) +
  #babao's code now continues normally 
  #-----------------------------------------------------------------------
  # Change the scale to percentage points
  scale_y_continuous(labels = scales::percent, 
                     limits = c(0, max(Y2_value_table$Prop)+0.05)) +
  # Label the axis and add in the caption 
  xlab("Responses") + ylab("Percentage of respondents") + 
  labs(caption = "Source: Governance of AI Program")

# ========================================================================
# Bonus challenge: write a general function to generate graphs for Y1 and Y2
# ========================================================================
# create a general graph that takes three arguments 
# value_table is the Y1 or Y2 specific value table created above 
# value_sum is the Y1 or Y2 specific value sum table created above 
# median response value is the median value of the "Responses" bars (not the don't know/missing bars)
# including median response value improves formatting of the general graphing function 

# The code within the graph_function is the same as the code used to 
# generate Y1 and Y2 graphs above. However, specific inputs have now been
# replaced by the general function arguments 

graph_function <- function(value_table, value_sum, median_response_value){
  ggplot() +
    # Bar graph
    geom_bar(data = value_table, aes(x = num, y = Prop), 
             stat = "identity",
             fill = "grey70") +
    # Error bars
    geom_errorbar(data = 
                    value_table[value_table$Prop !=0,], 
                  aes(x = num, ymin = Prop + qnorm(0.025)*se,
                      ymax = Prop + qnorm(0.975)*se), width = 0.1) +
    # Add in the percentage texts
    geom_text(data = value_table, 
              aes(x = num,label = paste(as.character(roundfunc(Prop*100, 0)),"%", sep = "")), 
              y = -0.01) +
    # Beautify the x-axis
    scale_x_continuous(breaks = value_table$num[order(value_table$num)],
                       labels = str_wrap(value_table$labels[order(value_table$num)], 
                                         width = 15)) +
    # Break up the responses by type
    facet_grid(~group, scales = "free_x", space = "free_x") + theme_bw() +
    # Add in the numerical summary statistics
    geom_text(data = value_sum, aes(x = median_response_value, label = sum_stat,
                                    y = max(value_table$Prop)+0.05)) +
    # Change the scale to percentage points
    scale_y_continuous(labels = scales::percent, 
                       limits = c(0, max(value_table$Prop)+0.05)) +
    # Label the axis and add in the caption 
    xlab("Responses") + ylab("Percentage of respondents") + 
    labs(caption = "Source: Governance of AI Program")
}
# Test results 
Y1_test <- graph_function(Y1_value_table, Y1_value_sum, 0)
Y1_test

Y2_test <- graph_function(Y2_value_table, Y2_value_sum, 1)
Y2_test