The goal of this script is to estimate treatment effects (in line with our pre-analysis plan) for phase 1 of the vaccine hesitancy project.
# Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, texreg, compareGroups, multcomp)
set.seed(94305)# Data Loading and Merging
## read ads data for 4 pilot waves
ads_v7 <-
read.csv("../ads_data/ads_data_v7.csv") %>%
rename(original_ref = Ad.ID, ad_name = Ad.name) %>%
mutate(
`Analysis 3 - impediment theme` = str_sub(ad_name, 8) %>% str_to_lower()
) %>%
relocate(original_ref, ad_name)
# read in CURRENT chatfuel data
# full data
df_full_v7 <-
read_csv("../chatfuel_data/chatfuel_full_v7.csv") %>%
clean_names() %>%
mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
mutate(
first_name = if_else(is.na(first_name), "", first_name),
middle_name = if_else(is.na(middle_name), "", middle_name),
last_name = if_else(is.na(last_name), "", last_name),
full_name = str_c(first_name, middle_name, last_name, sep = " "),
full_name_short = str_c(first_name, last_name, sep = " "),
date = lubridate::date(signed_up)
) %>%
filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford",
"Saurabh Khanna", "Dingchen Sha", "Kristine Koutout",
"Susan Athey", "Dean Karlan"))) %>%
filter(version == "ALP_May") %>%
mutate(original_ref = parse_number(original_ref)) %>%
mutate(
motive = if_else(str_detect(motive, "yes"), "yes", "no"),
motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
) %>%
remove_empty()
# filter to completes and remove duplicates
df_v7 <-
df_full_v7 %>%
filter(full_complete == "yes") %>%
drop_na(vax_status) %>%
mutate(
phone_number = str_replace_all(phone_number, " ", ""),
phone_number = str_replace_all(phone_number, "-", ""),
) %>%
arrange(phone_number, last_seen) %>%
distinct(phone_number, .keep_all = T) %>%
remove_empty()The current script is built on simulated data using 5216 respondents who started the v7 pilot survey.
The 5 treatments (github issue) are listed below:
The 3 outcomes of interest (github issue) are listed below:
Treatment Generation
Dummy distribution of 5 treatments among 5216 respondents who started the survey.
D <- 1:5
df_full_v7$T <- factor(sample(D, nrow(df_full_v7), replace = T), levels = D)
df_full_v7$complete <- ifelse(!is.na(df_full_v7$full_complete), 1, 0)
df_full_v7 %>% transmute(Treatment = str_c("T", T)) %>% count(Treatment, name = "Count") %>% kable(format = "pipe")| Treatment | Count |
|---|---|
| T1 | 1048 |
| T2 | 1083 |
| T3 | 1022 |
| T4 | 1018 |
| T5 | 1045 |
As a quick overview before we dive into the hypotheses, the plot below summarizes the treatment effects by outcome:
estimate_effect <- function(df, outcome, outcome_name){
eq <- as.formula(paste(outcome, " ~ 0 + T"))
lm <- lm_robust(eq, data = df)
response <- bind_rows(list(tidy(lm))) %>%
mutate(term = gsub('treatment', '', term)) %>%
filter(!(term %in% c("(Intercept)")))
ggplot(response, aes(x = term, y = estimate)) +
geom_point(position = position_dodge(width = 0.5), size = 3) +
geom_errorbar(
aes(x = term,ymin = estimate - 1.96 * std.error,ymax = estimate + 1.96 * std.error),width = .1,position = position_dodge(width = 0.5)) +
xlab('Treatment') +
ylab('Estimate') +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
#geom_hline(yintercept = 0, colour = 'grey60', linetype = 2) +
ggtitle(paste0('Treatment Effect Estimate on ', outcome_name)) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
}
outcome <- "complete"
outcome_completion <- "Survey Completion Rate"plot1 <- estimate_effect(df_full_v7, outcome, outcome_completion)
plot1Now we undertake treatment effect estimation for each hypothesis (detailed in our pre-analysis plan) for the first outcome O1 – survey completion rate conditional on survey started.
Hypothesis: Participants in Treatment 4 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 1.
estimate_effect(df_full_v7 %>% filter(T %in% c(1, 4)), outcome, outcome_completion)df_full_v7 %>%
filter(T %in% c(1, 4)) %>%
lm_robust(complete ~ T, data = .) %>%
summary()##
## Call:
## lm_robust(formula = complete ~ T, data = .)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 0.48664 0.01545 31.5041 3.279e-178 0.45635 0.51693 2064
## T4 -0.01218 0.02200 -0.5538 5.798e-01 -0.05532 0.03095 2064
##
## Multiple R-squared: 0.0001486 , Adjusted R-squared: -0.0003358
## F-statistic: 0.3067 on 1 and 2064 DF, p-value: 0.5798
Hypothesis: Participants in Treatment 3 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 2.
estimate_effect(df_full_v7 %>% filter(T %in% c(2, 3)), outcome, outcome_completion)df_full_v7 %>%
filter(T %in% c(2, 3)) %>%
lm_robust(complete ~ T, data = .) %>%
summary()##
## Call:
## lm_robust(formula = complete ~ T, data = .)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 0.50416 0.01520 33.16827 2.115e-194 0.4743 0.53396 2103
## T3 -0.00122 0.02181 -0.05591 9.554e-01 -0.0440 0.04156 2103
##
## Multiple R-squared: 1.487e-06 , Adjusted R-squared: -0.000474
## F-statistic: 0.003126 on 1 and 2103 DF, p-value: 0.9554
Hypothesis: Participants in Treatment 4 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 3.
estimate_effect(df_full_v7 %>% filter(T %in% c(3, 4)), outcome, outcome_completion)df_full_v7 %>%
filter(T %in% c(3, 4)) %>%
lm_robust(complete ~ T, data = .) %>%
summary()##
## Call:
## lm_robust(formula = complete ~ T, data = .)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 0.50294 0.01565 32.141 1.037e-183 0.47225 0.53362 2038
## T4 -0.02848 0.02214 -1.286 1.985e-01 -0.07189 0.01494 2038
##
## Multiple R-squared: 0.0008113 , Adjusted R-squared: 0.000321
## F-statistic: 1.655 on 1 and 2038 DF, p-value: 0.1985
Hypothesis: Participants in Treatment 4 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 5.
estimate_effect(df_full_v7 %>% filter(T %in% c(4, 5)), outcome, outcome_completion)df_full_v7 %>%
filter(T %in% c(4, 5)) %>%
lm_robust(complete ~ T, data = .) %>%
summary()##
## Call:
## lm_robust(formula = complete ~ T, data = .)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 0.47446 0.01566 30.301 4.027e-167 0.44375 0.50517 2061
## T5 0.02506 0.02201 1.138 2.551e-01 -0.01811 0.06824 2061
##
## Multiple R-squared: 0.0006284 , Adjusted R-squared: 0.0001435
## F-statistic: 1.296 on 1 and 2061 DF, p-value: 0.2551
Hypothesis: Participants in Treatment 2 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 1.
estimate_effect(df_full_v7 %>% filter(T %in% c(1, 2)), outcome, outcome_completion)df_full_v7 %>%
filter(T %in% c(1, 2)) %>%
lm_robust(complete ~ T, data = .) %>%
summary()##
## Call:
## lm_robust(formula = complete ~ T, data = .)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 0.48664 0.01545 31.5041 3.785e-179 0.45635 0.51693 2129
## T2 0.01751 0.02167 0.8082 4.191e-01 -0.02499 0.06001 2129
##
## Multiple R-squared: 0.0003067 , Adjusted R-squared: -0.0001629
## F-statistic: 0.6531 on 1 and 2129 DF, p-value: 0.4191
Hypothesis: For all outcomes, the difference between participants in Treatment 2 and participants in Treatment 1 is greater than the difference between participants in Treatment 4 and participants in Treatment 5.
df_full_v7 %>%
lm_robust(complete ~ 0 + T, data = .) %>%
glht("(T2 - T1) - (T4 - T5) = 0") %>%
summary()##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm_robust(formula = complete ~ 0 + T, data = .)
##
## Linear Hypotheses:
## Estimate Std. Error z value Pr(>|z|)
## (T2 - T1) - (T4 - T5) == 0 0.04258 0.03089 1.378 0.168
## (Adjusted p values reported -- single-step method)
To be added with v8 pilot data.