1 Goal of the Script

The goal of this script is to estimate treatment effects (in line with our pre-analysis plan) for phase 1 of the vaccine hesitancy project.

# Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, texreg, compareGroups, multcomp)

set.seed(94305)

# Data Loading and Merging
## read ads data for 4 pilot waves
ads_v7 <-
  read.csv("../ads_data/ads_data_v7.csv") %>% 
  rename(original_ref = Ad.ID, ad_name = Ad.name) %>% 
  mutate(
    `Analysis 3 - impediment theme` = str_sub(ad_name, 8) %>% str_to_lower()
  ) %>% 
  relocate(original_ref, ad_name)


# read in CURRENT chatfuel data
# full data
df_full_v7 <- 
  read_csv("../chatfuel_data/chatfuel_full_v7.csv") %>%
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    full_name = str_c(first_name, middle_name, last_name, sep = " "),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", 
                            "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", 
                            "Susan Athey", "Dean Karlan"))) %>%
  filter(version == "ALP_May") %>%
  mutate(original_ref = parse_number(original_ref)) %>% 
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  ) %>%
  remove_empty()

# filter to completes and remove duplicates
df_v7 <-
  df_full_v7 %>%
  filter(full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T) %>%
  remove_empty()

2 Experimental Analysis

The current script is built on simulated data using 5216 respondents who started the v7 pilot survey.

The 5 treatments (github issue) are listed below:

T1: Questions asked with survey tone and deployed on Qualtrics
T2: Questions asked with survey tone and deployed on Chatbot
T3: Adding anthropomorphism to questions and languages in T2
T4: Adding politeness and gratitude to questions and languages in T3
T5: T4, but deployed on Qualtrics

The 3 outcomes of interest (github issue) are listed below:

O1: Quantity of information: P(Completion | Survey Started)
O2: Quality of information: Respondent-level proportion of free text answers with useful information
O3: Receptivity of respondents to new information

Treatment Generation

Dummy distribution of 5 treatments among 5216 respondents who started the survey.

D <- 1:5
df_full_v7$T <- factor(sample(D, nrow(df_full_v7), replace = T), levels = D)
df_full_v7$complete <- ifelse(!is.na(df_full_v7$full_complete), 1, 0)

df_full_v7 %>% transmute(Treatment = str_c("T", T)) %>% count(Treatment, name = "Count") %>% kable(format = "pipe")

Treatment	Count
T1	1048
T2	1083
T3	1022
T4	1018
T5	1045

As a quick overview before we dive into the hypotheses, the plot below summarizes the treatment effects by outcome:

estimate_effect <- function(df, outcome, outcome_name){
  
  eq <- as.formula(paste(outcome, " ~ 0 + T"))
  lm <- lm_robust(eq, data = df)
  
  response <- bind_rows(list(tidy(lm))) %>% 
    mutate(term = gsub('treatment', '', term)) %>%
    filter(!(term %in% c("(Intercept)")))
  
  ggplot(response, aes(x = term, y = estimate)) +
    geom_point(position = position_dodge(width = 0.5), size = 3) +
    geom_errorbar(
      aes(x = term,ymin = estimate - 1.96 * std.error,ymax = estimate + 1.96 * std.error),width = .1,position = position_dodge(width = 0.5)) + 
    xlab('Treatment') +
    ylab('Estimate') +
    theme(plot.title = element_text(hjust = 0.5),
          axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    #geom_hline(yintercept = 0, colour = 'grey60', linetype = 2) + 
    ggtitle(paste0('Treatment Effect Estimate on ', outcome_name)) +
    theme_minimal() +
    theme(plot.title = element_text(face = "bold", hjust = 0.5))
  
}

outcome <- "complete"
outcome_completion <- "Survey Completion Rate"

plot1 <- estimate_effect(df_full_v7, outcome, outcome_completion)
plot1

Now we undertake treatment effect estimation for each hypothesis (detailed in our pre-analysis plan) for the first outcome O1 – survey completion rate conditional on survey started.

2.1 Hypothesis 1

Hypothesis: Participants in Treatment 4 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 1.

estimate_effect(df_full_v7 %>% filter(T %in% c(1, 4)), outcome, outcome_completion)

df_full_v7 %>% 
  filter(T %in% c(1, 4)) %>% 
  lm_robust(complete ~ T, data = .) %>% 
  summary()

## 
## Call:
## lm_robust(formula = complete ~ T, data = .)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##             Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper   DF
## (Intercept)  0.48664    0.01545 31.5041 3.279e-178  0.45635  0.51693 2064
## T4          -0.01218    0.02200 -0.5538  5.798e-01 -0.05532  0.03095 2064
## 
## Multiple R-squared:  0.0001486 , Adjusted R-squared:  -0.0003358 
## F-statistic: 0.3067 on 1 and 2064 DF,  p-value: 0.5798

2.2 Hypothesis 2

Hypothesis: Participants in Treatment 3 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 2.

estimate_effect(df_full_v7 %>% filter(T %in% c(2, 3)), outcome, outcome_completion)

df_full_v7 %>% 
  filter(T %in% c(2, 3)) %>% 
  lm_robust(complete ~ T, data = .) %>% 
  summary()

## 
## Call:
## lm_robust(formula = complete ~ T, data = .)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##             Estimate Std. Error  t value   Pr(>|t|) CI Lower CI Upper   DF
## (Intercept)  0.50416    0.01520 33.16827 2.115e-194   0.4743  0.53396 2103
## T3          -0.00122    0.02181 -0.05591  9.554e-01  -0.0440  0.04156 2103
## 
## Multiple R-squared:  1.487e-06 , Adjusted R-squared:  -0.000474 
## F-statistic: 0.003126 on 1 and 2103 DF,  p-value: 0.9554

2.3 Hypothesis 3

Hypothesis: Participants in Treatment 4 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 3.

estimate_effect(df_full_v7 %>% filter(T %in% c(3, 4)), outcome, outcome_completion)

df_full_v7 %>% 
  filter(T %in% c(3, 4)) %>% 
  lm_robust(complete ~ T, data = .) %>% 
  summary()

## 
## Call:
## lm_robust(formula = complete ~ T, data = .)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##             Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper   DF
## (Intercept)  0.50294    0.01565  32.141 1.037e-183  0.47225  0.53362 2038
## T4          -0.02848    0.02214  -1.286  1.985e-01 -0.07189  0.01494 2038
## 
## Multiple R-squared:  0.0008113 , Adjusted R-squared:  0.000321 
## F-statistic: 1.655 on 1 and 2038 DF,  p-value: 0.1985

2.4 Hypothesis 4

Hypothesis: Participants in Treatment 4 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 5.

estimate_effect(df_full_v7 %>% filter(T %in% c(4, 5)), outcome, outcome_completion)

df_full_v7 %>% 
  filter(T %in% c(4, 5)) %>% 
  lm_robust(complete ~ T, data = .) %>% 
  summary()

## 
## Call:
## lm_robust(formula = complete ~ T, data = .)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##             Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper   DF
## (Intercept)  0.47446    0.01566  30.301 4.027e-167  0.44375  0.50517 2061
## T5           0.02506    0.02201   1.138  2.551e-01 -0.01811  0.06824 2061
## 
## Multiple R-squared:  0.0006284 , Adjusted R-squared:  0.0001435 
## F-statistic: 1.296 on 1 and 2061 DF,  p-value: 0.2551

2.5 Hypothesis 5

Hypothesis: Participants in Treatment 2 provide more information, higher-quality information, and are more receptive to new information about the vaccine compared to participants in Treatment 1.

estimate_effect(df_full_v7 %>% filter(T %in% c(1, 2)), outcome, outcome_completion)

df_full_v7 %>% 
  filter(T %in% c(1, 2)) %>% 
  lm_robust(complete ~ T, data = .) %>% 
  summary()

## 
## Call:
## lm_robust(formula = complete ~ T, data = .)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##             Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper   DF
## (Intercept)  0.48664    0.01545 31.5041 3.785e-179  0.45635  0.51693 2129
## T2           0.01751    0.02167  0.8082  4.191e-01 -0.02499  0.06001 2129
## 
## Multiple R-squared:  0.0003067 , Adjusted R-squared:  -0.0001629 
## F-statistic: 0.6531 on 1 and 2129 DF,  p-value: 0.4191

2.6 Hypothesis 6

Hypothesis: For all outcomes, the difference between participants in Treatment 2 and participants in Treatment 1 is greater than the difference between participants in Treatment 4 and participants in Treatment 5.

df_full_v7 %>% 
  lm_robust(complete ~ 0 + T, data = .) %>% 
  glht("(T2 - T1) - (T4 - T5) = 0") %>% 
  summary()

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm_robust(formula = complete ~ 0 + T, data = .)
## 
## Linear Hypotheses:
##                            Estimate Std. Error z value Pr(>|z|)
## (T2 - T1) - (T4 - T5) == 0  0.04258    0.03089   1.378    0.168
## (Adjusted p values reported -- single-step method)

3 Key takeaways

To be added with v8 pilot data.

Vaccine Hesitancy Pre-Analysis Plan Script

July 28, 2022