'Research Assistant Code Challenge'
## [1] "Research Assistant Code Challenge"
'Catherine Peng'
## [1] "Catherine Peng"
'November 2018'
## [1] "November 2018"
# ===========================================================================
# SETUP
# ===========================================================================
# Clear all objects
rm(list = ls(all = TRUE))
# Set seed
set.seed(7482)
# Load packages
library(labelled)
##
## LABELLED 2.0.0: BREAKING CHANGE
##
## Following version 2.0.0 of `haven`, `labelled()` and `labelled_spss()` now produce objects with class 'haven_labelled' and 'haven_labelled_spss', due to conflict between the previous 'labelled' class and the 'labelled' class used by `Hmisc`.
##
## A new function `update_labelled()` could be used to convert data imported with an older version of `haven`/`labelled` to the new classes.
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.4
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
library(magrittr)
library(ggplot2)
library(stringr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Generate fake data set
df <- data.frame(
# Respondent ID
respondent_id = 1:1000,
# Survey weights
weights = runif(n = 1000, min = 0.5, max = 1.5),
# Outcome
Y1 = sample(x = c(2, 1, -1, -2, -77, -99, -88), size = 1000, replace = TRUE,
prob = c(0.2, 0.5, 0.4, 0.2, 0.05, 0.1, 0.1)),
Y2 = sample(x = c(1, 2, 3, -77, -99, -88), size = 1000, replace = TRUE,
prob = c(0.2, 0.5, 0.4, 0.05, 0.05, 0.1))
)
df$Y1 <- labelled(x = df$Y1, labels = c("Strongly agree" = 2,
"Agree" = 1,
"Disagree" = -1,
"Strongly disagree" = -2,
"Not shown" = -77, "I don't know" = -99, "Missing" = -88))
df$Y2 <- labelled(x = df$Y2, labels = c("High" = 3,
"Medium" = 2,
"Low" = 1,
"Not shown" = -77,"I don't know" = -99, "Missing" = -88))
# ===========================================================================
# FUNCTIONS BY BAOBAO (grouped together for personal clarity)
# ===========================================================================
# Function to relabel variables
relabel_var <- function(old_var, old_labels, new_labels) {
new_var <- rep(NA, length(old_var))
if (is.factor(old_var)) {
old_var <- as.character(old_var)
}
for (i in 1:length(old_labels)) {
new_var[old_var == old_labels[i]] <- new_labels[i]
}
return(new_var)
}
# Trailing zeros rounding function
roundfunc <- function(x,
round_digits = 2,
lessthan = TRUE) {
if (lessthan) {
temp <- ifelse(x > 0 & round(x, round_digits) == 0,
paste0("<0.", rep(0, (round_digits - 1)), 1),
sprintf(paste0("%.", round_digits, "f"), round(x, round_digits)))
temp <- ifelse(x < 0 & round(x, round_digits) == 0,
paste0(">-0.", rep(0, (round_digits - 1)), 1),
temp)
temp[x == 0] <- 0
return(temp)
} else {
return(sprintf(paste0("%.", round_digits, "f"), round(x, round_digits)))
}
}
# Function to summarize categorical variables
catvar_func <-
function(outcome, # outcome name
outcome_var, # numeric outcome variable
label_var, # labelled outcome variable
num_missing, # numerical value for missing or skipped
num_DK, # numeric value for don't know
shown, # variable for whether the question was shown to respondents
output_type, # num_outcome = output clean data; value_table = frequency table; value_sum = mean/SE/N
new_values, # new numerical values
edit_labels = TRUE, # TRUE allows us to edit the value labels
survey_weights, # survey weights
missing_recode # the recode value for missing/skipped/don't know responses
) {
# Clean data to make the bar chart
# Get the value labels
value_labels <- as.data.frame(val_labels(label_var))
value_labels$labels <- row.names(value_labels)
names(value_labels)[1] <- "num"
row.names(value_labels) <- NULL
# Make data frame for the new values
new_values_table <- data.frame(labels = value_labels$labels, new_values = new_values)
# Make the frequency table
sum_func <- function(outcome_var, value, survey_weights) {
se_md <- lm(outcome_var[shown] == value ~ 1,
weights = survey_weights[shown])
out <- coeftest(se_md, vcov = vcovHC(se_md, type="HC2"))
return(data.frame(num = value,
Freq = sum(outcome_var[shown] == value),
Prop = as.numeric(out[1]),
se = as.numeric(out[2])))
}
value_table <- do.call(rbind, lapply(value_labels$num, sum_func,
outcome_var = outcome_var,
survey_weights = survey_weights))
# Merge the frequency table with the value labels
value_table <-
merge(x = value_table, y = value_labels, all.y = TRUE)
value_table$group <-
ifelse(value_table$num %in% c(num_missing, num_DK),
"Don't know/Missing",
"Responses")
value_table$group <-
factor(value_table$group,
levels = c("Responses",
"Don't know/Missing"))
value_table$labels <- capitalize(value_table$labels)
value_table$outcome <- outcome
value_table <- merge(x = value_table, y = new_values_table, all.x = TRUE)
if (edit_labels) {
value_table$labels <- ifelse(
value_table$group == "Responses",
paste0(value_table$new_values, ". ",
value_table$labels),
value_table$labels
)
}
# Remove the not asked
value_table <- value_table[!grepl(pattern = "not asked|not shown",
value_table$labels,
ignore.case = TRUE),]
# Get the summary statistics
num_outcome <- as.numeric(outcome_var[shown])
survey_weights <- survey_weights[shown]
num_outcome <-
relabel_var(
old_var = num_outcome,
old_labels = value_table$num,
new_labels = value_table$new_values
)
num_outcome_missing <- is.na(num_outcome)
num_outcome[is.na(num_outcome)] <- missing_recode
# Get the percent missing
percent_missing <-
sum(num_outcome_missing) / length(num_outcome_missing)
# Get the mean and se
md <- if (percent_missing > 0.1) {
# If more than 10 percent is missing, then we condition on normalized dummy variable for missingness
se_md <- lm(num_outcome ~ scale(num_outcome_missing),
weights = survey_weights)
coeftest(se_md, vcov = vcovHC(se_md, type="HC2"))[1,]
} else {
se_md <- lm(num_outcome ~ 1,
weights = survey_weights)
coeftest(se_md, vcov = vcovHC(se_md, type="HC2"))
}
# Put the summary statistics together
value_sum <-
data.frame(
outcome = outcome,
num = md[1],
se = md[2],
group = "Responses",
sum_stat = paste0("Mean: ", roundfunc(md[1]),
" (MOE: +/-", roundfunc(qnorm(0.975)*md[2]),
"); N = ",
sum(shown)),
N = sum(shown)
)
if (output_type == "num_outcome") {
return(num_outcome)
} else if (output_type == "value_table") {
return(value_table)
} else {
return(value_sum)
}
}
# ===========================================================================
# Y1 Output (used as a model for Y2 output)
# ===========================================================================
# Output for Y1
Y1_func <- function(output_type, dk_plot = NULL, skipped_plot = NULL) {
out <- catvar_func(
outcome = "Y1",
outcome_var = as.numeric(df$Y1),
label_var = df$Y1,
output_type = output_type,
shown = df$Y1 != -77,
num_missing = -88,
num_DK = -99,
new_values <- c(2, 1, -1, -2, NA, NA, NA),
survey_weights = df$weights,
missing_recode = mean(df$Y1[!df$Y1 %in% c(-77, -99, -88)]) # mean impute the don't know and missings
)
# For frequency tables, change the position of the "num" variable for plotting
if (output_type == "value_table") {
out$num[grep(pattern = "don't know", x = out$labels, ignore.case = TRUE)] <- dk_plot
out$num[grep(pattern = "skipped|missing", x = out$labels, ignore.case = TRUE)] <- skipped_plot
}
return(out)
}
# Note that Baobao changed the position of the don't know and skipped responses
# *this means that when graphed, don't know/skipped responses are grouped to the right,
# away from the other response types
Y1_value_table <- Y1_func("value_table", dk_plot = 8, skipped_plot = 9)
Y1_value_sum <- Y1_func("value_sum")
# ===========================================================================
# Y1 graph (used as a model for Y2 graph)
# with comments on my slight personal adjustments
# ===========================================================================
ggplot() +
# Bar graph
geom_bar(data = Y1_value_table, aes(x = num, y = Prop),
stat = "identity",
fill = "grey70") +
# Error bars
geom_errorbar(data =
Y1_value_table[Y1_value_table$Prop !=0,],
aes(x = num, ymin = Prop + qnorm(0.025)*se,
ymax = Prop + qnorm(0.975)*se), width = 0.1) +
#-----------------------------------------------------------------------
# Add in the percentage texts
# MODIFICATION 1: DELETE "NUDGE_X = 0.25"
# the original code includes "nudge_x = 0.25" after y = 0.02
# after running the code, however, the following error message is produced:
#"ERROR POSITION_NUDGE REQUIRES FOLLOWING MISSING AESTHETICS:y"
# Perhaps you meant to reposition x by including "nudge_x = 0.25"
# however, aes() function already places the cleaned proportion number
# under the matching "response" type using variable "num" provided
# in the value table.
# MODIFICATION 2: REFORMAT THE PERCENT LABEL TO INCLUDE % (PERCENT SYMBOL)
# The Y axis is formatted in percents. For visual consistency, I converted
# the cleaned percent into a string and pasted % to the end so that the
# label for each response type now reads: <rounded percent>%
# MODIFICATION 3: REFORMAT THE Y POSITION OF THE PERCENT LABEL
# y = 0.02 within geom_text() places the label inside the bar.
# However, since the bar is grey, this makes text hard to read.
# I have now placed the label in the white space directly under each bar.
# deleted nudge_x and the graph works well
geom_text(data = Y1_value_table,
aes(x = num,label = paste(as.character(roundfunc(Prop*100, 0)),"%", sep = "")),
y = -0.01) +
#babao's code now continues normally
#-----------------------------------------------------------------------
# Beautify the x-axis
scale_x_continuous(breaks = Y1_value_table$num[order(Y1_value_table$num)],
labels = str_wrap(Y1_value_table$labels[order(Y1_value_table$num)],
width = 15)) +
# Break up the responses by type
facet_grid(~group, scales = "free_x", space = "free_x") + theme_bw() +
# Add in the numerical summary statistics
geom_text(data = Y1_value_sum, aes(x = 0, label = sum_stat,
y = max(Y1_value_table$Prop)+0.05)) +
# Change the scale to percentage points
scale_y_continuous(labels = scales::percent,
limits = c(0, max(Y1_value_table$Prop)+0.05)) +
# Label the axis and add in the caption
xlab("Responses") + ylab("Percentage of respondents") +
labs(caption = "Source: Governance of AI Program")

# ===========================================================================
# RESPONSES TO CODING CHALLENGES
# We now begin the direct assignments
# ===========================================================================
# ===========================================================================
# Q1: Recode the `Y2` variable such that "High" = 2, "Medium" = 1, "Low" = 0.
# ===========================================================================
# create list of old labels for Y2 using c()
old_labels <- c("High" = 3, "Medium" = 2, "Low" = 1,
"Not shown" = -77, "I don't know" = -99, "Missing" = -88)
# create list of new labels for Y2 using c()
new_labels <- c("High" = 2, "Medium" = 1, "Low" = 0,
"Not shown" = -77, "I don't know" = -99, "Missing" = -88)
# use arguments in relabel_var function to change VALUES of Y2
# replace old Y2 values with new Y2 values
df$Y2 <- relabel_var(df$Y2, old_labels, new_labels)
# use labelled function to make sure the new VALUES of Y2 match new LABELS
df$Y2 <- labelled(x = df$Y2,
labels <- new_labels)
# check to see if changes have taken place properly
describe(df$Y2)
## df$Y2 : 2 df$Y2 : 1 df$Y2 : 0 df$Y2 : -77 df$Y2 : -99 df$Y2 : -88
## n missing distinct Info Mean Gmd
## 1000 0 6 0.892 -10.83 21.47
##
## Value -99 -88 -77 0 1 2
## Frequency 26 74 36 135 426 303
## Proportion 0.026 0.074 0.036 0.135 0.426 0.303
# frequencies match frequences of original values!
# this means that values have been recoded properly
# ========================================================================
# Q2: Use the code above to generate a similar graph for the `Y2` variable
# ========================================================================
#(1)---------------------------------------------------------------------
# create Y2 output function based on Y1 output function
# replace areas where "Y1" is hard coded with "Y2" and replace
# Y1 values with Y2 values, etc.
Y2_func <- function(output_type, dk_plot = NULL, skipped_plot = NULL) {
out <- catvar_func(
outcome = "Y2",
outcome_var = as.numeric(df$Y2),
label_var = df$Y2,
output_type = output_type,
shown = df$Y2 != -77,
num_missing = -88,
num_DK = -99,
# assign Y2 values to new_values
# notice that it is also possible to change labels of Y2 within the output function !
# as such, another way to "recode" Y2 is to skip the modifications I coded in Question 1.
# You can keep the original values of Y2 and recode using new_values <- c(2, 1, 0, NA, NA, NA)
new_values <- c(2, 1, 0, NA, NA, NA),
survey_weights = df$weights,
missing_recode = mean(df$Y2[!df$Y2 %in% c(-77, -99, -88)]) # mean impute the don't know and missings
)
# For frequency tables, change the position of the "num" variable for plotting
if (output_type == "value_table") {
out$num[grep(pattern = "don't know", x = out$labels, ignore.case = TRUE)] <- dk_plot
out$num[grep(pattern = "skipped|missing", x = out$labels, ignore.case = TRUE)] <- skipped_plot
}
return(out)
}
# Note that Baobao changed the position of the don't know and skipped responses
# *this means that when graphed, don't know/skipped responses are grouped to the right,
# away from the other response types
#label Y2 output in the same convention/style as Y1 for consistency!
Y2_value_table <- Y2_func("value_table", dk_plot = 8, skipped_plot = 9)
Y2_value_sum <- Y2_func("value_sum")
#(2)---------------------------------------------------------------------
# Using Y2 outputs, we are ready to create Y2 graph
# Replace hardcode of "Y1" with "Y2"
# other modifications are also noted in commented portions !
ggplot() +
# Bar graph
geom_bar(data = Y2_value_table, aes(x = num, y = Prop),
stat = "identity",
fill = "grey70") +
# Error bars
geom_errorbar(data =
Y2_value_table[Y2_value_table$Prop !=0,],
aes(x = num, ymin = Prop + qnorm(0.025)*se,
ymax = Prop + qnorm(0.975)*se), width = 0.1) +
#-----------------------------------------------------------------------
# Add in the percentage texts
# MODIFICATION 1: DELETE "NUDGE_X = 0.25"
# the original code includes "nudge_x = 0.25" after y = 0.02
# after running the code, however, the following error message is produced:
#"ERROR POSITION_NUDGE REQUIRES FOLLOWING MISSING AESTHETICS:y"
# Perhaps you meant to reposition x by including "nudge_x = 0.25"
# however, aes() function already places the cleaned proportion number
# under the matching "response" type using variable "num" provided
# in the value table.
# MODIFICATION 2: REFORMAT THE PERCENT LABEL TO INCLUDE % (PERCENT SYMBOL)
# The Y axis is formatted in percents. For visual consistency, I converted
# the cleaned percent into a string and pasted % to the end so that the
# label for each response type now reads: <rounded percent>%
# MODIFICATION 3: REFORMAT THE Y POSITION OF THE PERCENT LABEL
# y = 0.02 within geom_text() places the label inside the bar.
# However, since the bar is grey, this makes text hard to read.
# I have now placed the label in the white space directly under each bar.
geom_text(data = Y2_value_table,
aes(x = num,label = paste(as.character(roundfunc(Prop*100, 0)),"%", sep = "")),
y = -0.01) +
#babao's code now continues normally
#-----------------------------------------------------------------------
# Beautify the x-axis
scale_x_continuous(breaks = Y2_value_table$num[order(Y2_value_table$num)],
labels = str_wrap(Y2_value_table$labels[order(Y2_value_table$num)],
width = 15)) +
# Break up the responses by type
facet_grid(~group, scales = "free_x", space = "free_x") + theme_bw() +
#-----------------------------------------------------------------------
# Add in the numerical summary statistics
# center the placement of the sum_stat label over relevant bars
# 1 is the median value of the "Responses" bars (not the don't know/missing types)
geom_text(data = Y2_value_sum, aes(x = 1, label = sum_stat,
y = max(Y2_value_table$Prop)+0.05)) +
#babao's code now continues normally
#-----------------------------------------------------------------------
# Change the scale to percentage points
scale_y_continuous(labels = scales::percent,
limits = c(0, max(Y2_value_table$Prop)+0.05)) +
# Label the axis and add in the caption
xlab("Responses") + ylab("Percentage of respondents") +
labs(caption = "Source: Governance of AI Program")

# ========================================================================
# Bonus challenge: write a general function to generate graphs for Y1 and Y2
# ========================================================================
# create a general graph that takes three arguments
# value_table is the Y1 or Y2 specific value table created above
# value_sum is the Y1 or Y2 specific value sum table created above
# median response value is the median value of the "Responses" bars (not the don't know/missing bars)
# including median response value improves formatting of the general graphing function
# The code within the graph_function is the same as the code used to
# generate Y1 and Y2 graphs above. However, specific inputs have now been
# replaced by the general function arguments
graph_function <- function(value_table, value_sum, median_response_value){
ggplot() +
# Bar graph
geom_bar(data = value_table, aes(x = num, y = Prop),
stat = "identity",
fill = "grey70") +
# Error bars
geom_errorbar(data =
value_table[value_table$Prop !=0,],
aes(x = num, ymin = Prop + qnorm(0.025)*se,
ymax = Prop + qnorm(0.975)*se), width = 0.1) +
# Add in the percentage texts
geom_text(data = value_table,
aes(x = num,label = paste(as.character(roundfunc(Prop*100, 0)),"%", sep = "")),
y = -0.01) +
# Beautify the x-axis
scale_x_continuous(breaks = value_table$num[order(value_table$num)],
labels = str_wrap(value_table$labels[order(value_table$num)],
width = 15)) +
# Break up the responses by type
facet_grid(~group, scales = "free_x", space = "free_x") + theme_bw() +
# Add in the numerical summary statistics
geom_text(data = value_sum, aes(x = median_response_value, label = sum_stat,
y = max(value_table$Prop)+0.05)) +
# Change the scale to percentage points
scale_y_continuous(labels = scales::percent,
limits = c(0, max(value_table$Prop)+0.05)) +
# Label the axis and add in the caption
xlab("Responses") + ylab("Percentage of respondents") +
labs(caption = "Source: Governance of AI Program")
}
# Test results
Y1_test <- graph_function(Y1_value_table, Y1_value_sum, 0)
Y1_test

Y2_test <- graph_function(Y2_value_table, Y2_value_sum, 1)
Y2_test
