## Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, caret, randomForest)

set.seed(94305)
## Data Handling

### read ads data
ads_v6 <- 
  read.csv("data/old/ads_data_v6.csv") %>% 
  rename(original_ref = Ad.ID, ad_name = Ad.name) %>% 
  # group_by(ad_name) %>%
  # summarise_at(vars(Impressions, Link.clicks, Results, `Amount.spent..USD.`), ~ sum(., na.rm = TRUE)) %>%
  remove_empty() %>% 
  inner_join(
    read_csv(here::here("data/Pilot_creative_text.xlsx - Pilot_5.csv")) %>% rename(ad_name = Ad),
    by = "ad_name"
  ) %>% 
  relocate(original_ref, ad_name)

ads_v5 <-
  read.csv("data/old/ads_data_v5.csv") %>% 
  rename(original_ref = Ad.ID, ad_name = Ad.name) %>% 
  remove_empty() %>% 
  inner_join(
    read_csv(here::here("data/Pilot_creative_text.xlsx - Pilot_5.csv")) %>% rename(ad_name = Ad),
    by = "ad_name"
  ) %>% 
  relocate(original_ref, ad_name)

# ads_v4b <- read.csv("data/old/ads_data_v4b.csv")
# ads_v4a <- read.csv("data/old/ads_data_v4a.csv")
# ads %>% glimpse()

form_percent <- function(dec){
  return(paste0(round(dec* 100, 1), "%"))
}

form_cost <- function(num){
  if (num == 0) {
    return(NA_character_)
  } else {
    return(paste0("$", round(cost / num, 3)))
  }
}

# sum(ads_v6$Impressions)
# read in CURRENT survey data
df_full_v6 <- 
  read_csv("data/latest/Share_Your_Voice_v6_ALP.csv") %>%
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    full_name = str_c(first_name, middle_name, last_name, sep = " "),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
  filter(date > "2022-03-23", !is.na(version)) %>%
  mutate(original_ref = parse_number(original_ref)) %>% 
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  ) %>%
  left_join(ads_v6, by = "original_ref") %>% 
  remove_empty()

# filtering for current wave (main data for this wave)
df_v6 <-
  df_full_v6 %>%
  filter(version == "pilot_6", full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  remove_empty()

# removing duplicates (keep first response per phone no.)
df_v6 <-
  df_v6 %>%
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T) %>%
  remove_empty()


## OLD WAVES

# add full data for future wave
# place code from chunk above
df_full_v5 <-
  read_csv("data/latest/Share_Your_Voice_2022_02_22_03_35_54.csv") %>% 
  clean_names() %>% remove_empty() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    full_name = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha"))) %>%
  filter(date > "2022-02-18", !is.na(version)) %>%
  select(-contains("name"), -profile_pic_url, -messenger_user_id) %>% 
  mutate(original_ref = parse_number(original_ref)) %>% 
  remove_empty()


df_v5 <-
  df_full_v5 %>% 
  filter(version == "pilot_5", full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T) %>%
  remove_empty()
  

# combining chatfuel data with ads data. we use `df` in most analyses below. 
df <-
  bind_rows(
    df_v6 %>% left_join(ads_v6, by = "original_ref"),
    df_v5 %>% left_join(ads_v5, by = "original_ref"),
  ) %>% 
  remove_empty()

# combining full datatsets
df_full <- 
  bind_rows(
    df_full_v6 %>% left_join(ads_v6, by = "original_ref"), 
    df_full_v5 %>% left_join(ads_v5, by = "original_ref")
  ) %>% 
  remove_empty()

ads <- bind_rows(ads_v5, ads_v6) %>% remove_empty()

## Keep pilot 4b for some graphs as well 
# full data for 4b wave
df_full_v4b <-
  read_csv(here::here("data/Social_Impact_Research_Initiative_2022_02_04_17_40_24.csv")) %>% 
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%  
  mutate(
    full_name = str_c(first_name, last_name, sep = " ")
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha"))) %>%
  select(-contains("name"), -profile_pic_url, -messenger_user_id) %>% 
  mutate(original_ref = parse_number(original_ref)) %>% 
  remove_empty()

# full data for 4a wave
df_full_v4a <-
  read_csv(here::here("data/old/Share_Your_Voice_2022_01_18_07_51_30.csv")) %>%
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    full_name = str_c(first_name, last_name, sep = " "),
    treatment_complete = if_else(vax_status == "unvax" & vax_future == "no way!", "yes", treatment_complete) # pilot v4 error corrected here
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha"))) %>%
  select(-contains("name"), -profile_pic_url, -messenger_user_id) %>% 
  filter(ad_set %in% c("pilot_v4_1", "pilot_v4_2", "pilot_v4_3")) %>%  
  mutate(version = "pilot_4a") %>% 
  remove_empty()

# for the free text charts
df_text <-
  bind_rows(
    df_full_v5 %>% select(-original_ref), 
    df_full_v4a %>% select(-original_ref), 
    df_full_v4b %>% select(-original_ref)
  ) %>% 
  filter(full_complete == "yes", vax_status == "unvax") %>% 
  remove_empty()

# merging cleaned free text
motive_df <-
  readxl::read_excel(here::here("data/freetext/free_text_responses_tillv5.xlsx"), sheet = "motive_other") %>% 
  select(1:2) %>% 
  remove_empty()

ability_df <-
  readxl::read_excel(here::here("data/freetext/free_text_responses_tillv5.xlsx"), sheet = "ability_other") %>% 
  select(1:2) %>% 
  remove_empty()

treatment_df <-
  readxl::read_excel(here::here("data/freetext/free_text_responses_tillv5.xlsx"), sheet = "best_treatment_other") %>% 
  select(1:2) %>% 
  remove_empty()

df_text <-
  df_text %>% 
  left_join(motive_df) %>% 
  left_join(ability_df) %>%
  left_join(treatment_df)
# clean up demographic variables

clean_up_demog <- function(df){
  df$gender[!df$gender %in% c("male", "female", NA_character_)] <- "other"
  df$ethnicity[!df$ethnicity %in% c("black or african", "coloured", "white or caucasian", "prefer not to say", "asian or indian", NA_character_)] <- "other"
  df$income[!df$income %in% c("< R5,000", "R5,000 – R9,999", "R10,000 – R29,999", "R30,000 – R49,999", "R50,000 – R99,999", "> R100,000", "prefer not to say", NA_character_)] <- "other"
  df$education[!df$education %in% c("< high school", "high school", "some college", "2-year degree", "4-year degree", "graduate degree", "prefer not to say", NA_character_)] <- "other"
  df$politics[!df$politics %in% c("conservative", "moderate", "liberal", "prefer not to say", NA_character_)] <- "other"
  df$location[!df$location %in% c("urban", "suburban", "rural", "prefer not to say", NA_character_)] <- "other"
  df$religion[grep("christ", tolower(df$religion))] <- "christian"
  df$religion[!df$religion %in% c("christian", "african traditional", "islam", "hinduism", "no religion", "prefer not to say", NA_character_)] <- "other"
  return(df)
}

df <- clean_up_demog(df)

1 Project Overview

Background

COVID-19 vaccine hesitancy has been recognized as a problem across nations. A resistance to getting vaccinated is emerging as a major hurdle, especially in the developing world, where vaccine access issues are still being gradually resolved. Persistent pools of unvaccinated people around the world could present a greater risk for the emergence of new variants of concern. Addressing people’s vaccine hesitancy is hence crucial to curb the spread of COVID-19, and to consequently avert hospitalizations and death.

Objectives

We intend to understand why people are hesitant about getting the COVID-19 vaccine. Hesitancy could not only occur within the unvaccinated population but also in a subset of people who already got vaccinated. Therefore, the first phase of our project has the following objectives:

  1. Understand why people are hesitant to get the COVID-19 vaccines
  2. Understand ways to best elicit vaccine impediments from respondents
  3. Pinpoint what treatments will help people get vaccinated

Approach

We intend to use chatbot as a medium (on Facebook) to conduct conversations with people and understand how we can best achieve the above three objectives. We have run six pilots as of March 2022 – 2 in the United States using Lucid, and 4 in South Africa on Facebook. Our eventual goal will be running this using an interactive, personalized chatbot that enables the conversation to flow more naturally than in a survey format. We are running the pilots in order to – 1) achieve technical proofs of concept, 2) reduce participant recruitment & completion costs (survey completion of unvaccinated participants open to treatment ) before experiment launch, 3) improve chatbot script/forking/engagement before experiment launch, and 4) gather exploratory ideas for impactful covariates and treatments. Our insights from these pilots are detailed ahead.


2 Analysis glossary

Before starting with our key takeaways from previous pilots, here is a glossary of different detailed analyses scripts that we have generated so far:

  • Descriptive analysis: Descriptive analysis including analysis of relevant covariates, funnel drop and engagement analysis, and treatment analysis [issue link.]
  • Ads analysis: Comparing ad performance by impediment themes (unnecessary, risky, inaccessible), prompts (airtime, control), and images used.
  • General analysis: 1-way crosstabs, 2-way crosstabs, and free text response elaboration summaries of all pilot variables.
  • Free text analysis: Analysis of free text responses (both for impediments and treatments) from pilots v4a, v4b, and v5.
  • A/B Test Script: Analysis whether Facebook A/B test indeed is a valid A/B test and analyze ads performance for pilot v5.
  • ML analysis: Sentiment, politeness, and receptiveness distributions and demographic associations in impediment and treatment free text responses for pilots v4a, v4b, and v5.

3 Key Takeaways

The current analysis uses data from 3456 respondents combined across our two largest pilots – pilots 5 (1621 respondents in Feb 2022) and 6 (1835 respondents in March 2022).

Analysis script used for generating these results is linked here. Related GitHub issue is linked here.

We have split our key takeaways into 5 sections:

  • Demographics
    • We see a shift in the percentage of vaccinated people among respondents (38% in pilot 4 in January –> 62% in the latest pilot 6), reflecting South Africa’s current situation after the Omicron wave.
    • Our sample demographic is dominated by individuals who are black/african (80%), females (60%), have education till high school (60%), and earn less than 5000 ZAR (315 USD) per year (43%).
    • Associations among demographic variables are low, except for a moderate association between education and income.
  • Impediments
    • Within unvaccinated respondents, a majority lack motivation to get the vaccine.
    • As vaccination rates rise, motivation impediments shift significantly from “Vaccine has No benefits” to “Vaccine is Risky” (we are comparing samples across pilots though).
    • Within the small set (20%) of respondents with ability impediments across pilots, a majority mention (50%) lack of time.
    • Free text responses for impediments are both sparse, and not far from our encoded responses [except for Vaccinated people can die being a frequent free text response we do not have]. Around 24% responses in the no-motivation fork and 19% responses in the no-ability fork are redundant. Mis-forking (i.e. being assigned to the wrong fork) is negligible for the motivation fork, but at 10% for the ability fork (i.e. 10% respondents here should be in the motivation fork).
    • Models with/without interactions among demographics do not predict motivation or ability impediments well.
  • Treatments
    • In our February pilot version 5, no clear treatment was preferred by unvaccinated individuals, while vaccinated individuals strongly preferred family support. This trend changed with rising vaccinations in our March pilot version 6. A large proportion of unvaccinated and vaccinated respondents indicated job/school mandates as the treatment that worked for them.
    • Free text responses for preferred treatments were dominated by responses highlighting that nothing would work. The second most frequent response was job/school mandates, which has now been incorporated as an encoded response in our chatbot.
    • Models with demographics are not strong predictors of preferred treatments.
  • Funnel and Engagement
    • 85% respondents complete the survey conditional on consent. Cost per completed survey is around $0.5 (excluding airtime). Cost per completed survey for unvaccinated participants is relatively higher at $1.27 (excluding airtime) as we have less unvaccinated respondents in the most recent pilot.
    • Dropoff starts early among participants not completing our survey:
      • Around half provide consent.
      • Around 35% provide vaccination status
      • Around 33% provide motive information to get vaccine, but only 24% provide ability information to get vaccine
      • Around 13% provide preferred treatment
      • Less than 5% provide demographic information
    • Most participants reaching the end of the survey enjoyed interacting with/were comfortable with the chatbot.
  • Ads
    • Facebook A|B testing is not a “true” randomized A|B test as gender distribution is unbalanced across our five ad sets.
    • Ads with airtime outperform our control ads in recruiting participants, even after adjusting for the quality of responses.

3.1 Demographics

1. We see a shift in the percentage of vaccinated people among respondents (38% in pilot 4 in January –> 62% in the latest pilot 6), reflecting South Africa’s current situation after the Omicron wave.

bind_rows(
  df_full_v4b %>% filter(full_complete == "yes"),
  df_v5,
  df_v6
) %>%
  transmute(
    version,
    vaxxed = (vax_status == "vax")
  ) %>% 
  group_by(version) %>% 
  summarise(
    vax_mean = mean(vaxxed, na.rm = T),
    vax_sd = sd(vaxxed, na.rm = T),
    vax_n = sum(!is.na(vaxxed)),
    vax_se = vax_sd / sqrt(vax_n)
  ) %>% 
  mutate(
    confint_low = vax_mean - 1.96*vax_se,
    confint_high = vax_mean + 1.96*vax_se,
  ) %>%  
  ggplot() +
  geom_linerange(aes(x = version, ymin = confint_low, ymax = confint_high), width = 0.5) +
  geom_point(aes(x = version, y = vax_mean), size = 4, alpha = 0.8, color = "blue") +
  geom_hline(yintercept = 0.5, linetype = "dashed") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  theme_bw() +
  labs(
    y = "% Vaccinated",
    title = "Rising vaccination status in South Africa (Jan'22 → March'22)",
    subtitle = "Error bars represent 95% confidence intervals"
  )

2. Our sample demopgraphic is dominated by individuals who are black/african (80%), females (60%), have education till high school (60%), and earn less than 5000 ZAR (315 USD) per year (43%).

The table below provides an overview of relevant demographic variables in our sample. Roughly, we have:

  • 60% females
  • 80% Black/African, 6% White, 7% Colored
  • 43% are in households earning less than ZAR 5000 per year
  • 60% completed high school or less
df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  clean_names(case = "title") %>%
  papeR::summarize_factor()

Here are stacked bar charts representing different demographic variables:

df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  clean_names(case = "title") %>%
  papeR::summarize_factor() %>% 
  clean_names() %>% 
  transmute(x = na_if(x, ""), level, percent = parse_double(percent)/100) %>% 
  fill(x) %>% 
  filter(x == "Gender") %>% 
  ggplot(aes(x, percent)) +
  geom_col(aes(fill = level), color = "black") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  coord_flip() +
  labs(
    x = "", y = "", fill = ""
  )

df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  clean_names(case = "title") %>%
  papeR::summarize_factor() %>% 
  clean_names() %>% 
  transmute(x = na_if(x, ""), level, percent = parse_double(percent)/100) %>% 
  fill(x) %>%
  filter(x == "Education") %>% 
  ggplot(aes(x, percent)) +
  geom_col(aes(fill = level), color = "black") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  coord_flip() +
  labs(
    x = "", y = "", fill = ""
  )

df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  clean_names(case = "title") %>%
  papeR::summarize_factor() %>% 
  clean_names() %>% 
  transmute(x = na_if(x, ""), level, percent = parse_double(percent)/100) %>% 
  fill(x) %>% #count(x)
  filter(x == "Ethnicity") %>% 
  ggplot(aes(x, percent)) +
  geom_col(aes(fill = level), color = "black") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  coord_flip() +
  labs(
    x = "", y = "", fill = ""
  )

df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  clean_names(case = "title") %>%
  papeR::summarize_factor() %>% 
  clean_names() %>% 
  transmute(x = na_if(x, ""), level, percent = parse_double(percent)/100) %>% 
  fill(x) %>% #count(x)
  filter(x == "Income") %>% 
  ggplot(aes(x, percent)) +
  geom_col(aes(fill = level), color = "black") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  coord_flip() +
  labs(
    x = "", y = "", fill = ""
  )

df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  clean_names(case = "title") %>%
  papeR::summarize_factor() %>% 
  clean_names() %>% 
  transmute(x = na_if(x, ""), level, percent = parse_double(percent)/100) %>% 
  fill(x) %>% #count(x)
  filter(x == "Religion") %>% 
  ggplot(aes(x, percent)) +
  geom_col(aes(fill = level), color = "black") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  coord_flip() +
  labs(
    x = "", y = "", fill = ""
  )

df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  clean_names(case = "title") %>%
  papeR::summarize_factor() %>% 
  clean_names() %>% 
  transmute(x = na_if(x, ""), level, percent = parse_double(percent)/100) %>% 
  fill(x) %>% #count(x)
  filter(x == "Politics") %>% 
  ggplot(aes(x, percent)) +
  geom_col(aes(fill = level), color = "black") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  coord_flip() +
  labs(
    x = "", y = "", fill = ""
  )

df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  clean_names(case = "title") %>%
  papeR::summarize_factor() %>% 
  clean_names() %>% 
  transmute(x = na_if(x, ""), level, percent = parse_double(percent)/100) %>% 
  fill(x) %>% #count(x)
  filter(x == "Location") %>% 
  ggplot(aes(x, percent)) +
  geom_col(aes(fill = level), color = "black") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  coord_flip() +
  labs(
    x = "", y = "", fill = ""
  )

Here is a comparison of vaccinated and unvaccinated individuals by demographic. Unvaccinated sample includes more males than vaccinated sample, and is less likely to disclose their income.

df %>%
  mutate_if(is_character, ~ as_factor(.)) %>%
  select(vax_status, contains(c("gender", "income", "ethnicity", "education", "religion", "politics", "location"))) %>%
  select(-cv_gender) %>% 
  compareGroups(vax_status ~ ., data = .) %>% 
  createTable(show.all = F, show.p.overall = T)
## 
## --------Summary descriptives table by 'vax_status'---------
## 
## ___________________________________________________________ 
##                            unvax         vax      p.overall 
##                            N=1361       N=2095              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## gender:                                             0.003   
##     female              760 (56.1%)  1278 (61.3%)           
##     male                594 (43.9%)  808 (38.7%)            
## income:                                            <0.001   
##     < R5,000            519 (38.1%)  965 (46.1%)            
##     other                43 (3.16%)  127 (6.06%)            
##     prefer not to say   393 (28.9%)  405 (19.3%)            
##     R5,000 – R9,999     152 (11.2%)  260 (12.4%)            
##     R50,000 – R99,999    46 (3.38%)   47 (2.24%)            
##     > R100,000           63 (4.63%)   68 (3.25%)            
##     R10,000 – R29,999    91 (6.69%)  171 (8.16%)            
##     R30,000 – R49,999    54 (3.97%)   52 (2.48%)            
## ethnicity:                                         <0.001   
##     black or african    1065 (78.3%) 1666 (79.6%)           
##     coloured            113 (8.30%)  144 (6.88%)            
##     other                41 (3.01%)  102 (4.87%)            
##     prefer not to say    50 (3.67%)   31 (1.48%)            
##     white or caucasian   81 (5.95%)  114 (5.44%)            
##     asian or indian      11 (0.81%)   37 (1.77%)            
## education:                                         <0.001   
##     high school         528 (38.8%)  768 (36.7%)            
##     < high school       302 (22.2%)  569 (27.2%)            
##     other                29 (2.13%)   89 (4.25%)            
##     some college        281 (20.7%)  400 (19.1%)            
##     2-year degree        50 (3.68%)   52 (2.48%)            
##     prefer not to say    77 (5.66%)   77 (3.68%)            
##     graduate degree      56 (4.12%)   78 (3.72%)            
##     4-year degree        37 (2.72%)   62 (2.96%)            
## religion:                                          <0.001   
##     christian           1008 (74.1%) 1654 (78.9%)           
##     african traditional  88 (6.47%)  125 (5.97%)            
##     prefer not to say    73 (5.36%)   69 (3.29%)            
##     other                56 (4.11%)   77 (3.68%)            
##     islam                39 (2.87%)   55 (2.63%)            
##     no religion          93 (6.83%)   91 (4.34%)            
##     hinduism             4 (0.29%)    24 (1.15%)            
## politics:                                           0.005   
##     prefer not to say   679 (49.9%)  929 (44.3%)            
##     conservative        179 (13.2%)  347 (16.6%)            
##     liberal             148 (10.9%)  225 (10.7%)            
##     moderate            337 (24.8%)  553 (26.4%)            
##     other                17 (1.25%)   41 (1.96%)            
## location:                                          <0.001   
##     urban               467 (34.3%)  726 (34.7%)            
##     rural               508 (37.3%)  812 (38.8%)            
##     prefer not to say    94 (6.91%)   88 (4.20%)            
##     suburban            287 (21.1%)  434 (20.7%)            
##     other                5 (0.37%)    34 (1.62%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

3. Associations among demographic variables are low, except for a moderate association between education and income.

The correlation plot among just demographic variables can be seen below. We map binary and ordinal variables to continuous variables as follows:

  • female: 1 if female, 0 if male
  • income: 0 if the participant is unemployed, 1 if household income < R5,000, 2 if household income in R5,000 – R9,999, …, 6 if household income > R100,000
  • education: 1 if the participant’s education < high school, 2 if education is high school, …, 6 if education is a graduate degree
  • religiosity: 1 if the participant is not very religious, 2 if somewhat religious, 3 if very religious
  • politics: 1 if the participant is conservative, 2 if moderate, 3 if liberal
  • location: 1 if the participant lives in rural, 2 if suburban, 3 if urban,
  • white: 1 if the participant is a white or caucasian, 0 if not

Weak correlations here weed out multicollinearity concerns when using these variables together in logit models.

df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    ability = case_when(
      ability == "no" ~ 0,
      ability == "yes" ~ 1,
    ),
    motive = case_when(
      motive == "no" ~ 0,
      motive == "yes" ~ 1,
    ),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("female", "income", "education", "religiosity", "politics", "location", "white", "unvax")

ggcorrplot(cor(df_mc_numeric, use = "pairwise.complete.obs"), type = "lower", lab = TRUE, lab_size = 3, tl.cex = 10) + 
  ggtitle("Correlation Matrix (demographic variables only)")

3.2 Impediments

1. Within unvaccinated respondents, a majority lack motivation to get the vaccine.

The last column in the table below shows the distribution of impediments within each vaccination status.

For instance, among the unvaccinated respondents:

  • 62% lack motivation but not ability to get the vaccine
  • 18% lack both motivation and ability to get the vaccine
  • 3% lack ability but not motivation to get the vaccine
  • 17% lack neither

For instance, among the vaccinated respondents:

  • 33% lack motivation but not ability to get the vaccine
  • 6% lack both motivation and ability to get the vaccine
  • 6% lack ability but not motivation to get the vaccine
  • 55% lack neither
df %>% 
  group_by(vax_status, ability, motive) %>%
  summarise(count = n()) %>%
  drop_na(ability, motive) %>% 
  group_by(vax_status) %>% 
  mutate(`% of vax status` = round(100 * count/sum(count, na.rm = T), 2) %>% str_c(., "%")) %>%
  ungroup() %>% 
  arrange(vax_status, desc(count)) %>%
  kable(col.names = 
          c("Vaccination status", "Able to get vaccine", "Have motivation to get vaccine", "Count", "Percentage of vax status"), 
        align = "c",
        caption = "Distribution of forking segments of participants' impediments") %>%
  kable_styling()
Distribution of forking segments of participants’ impediments
Vaccination status Able to get vaccine Have motivation to get vaccine Count Percentage of vax status
unvax yes no 839 61.65%
unvax no no 245 18%
unvax yes yes 233 17.12%
unvax no yes 44 3.23%
vax yes yes 1151 54.97%
vax yes no 694 33.14%
vax no yes 131 6.26%
vax no no 118 5.64%

2. As vaccination rates rise, motivation impediments shift from “No benefits” to “Risky”.

Note: These trends are purely descriptive though, as these pilots were administered to different participants, and we cannot know if samples in either pilot are largely similar.

df_temp <-
  df %>% 
  transmute(version, motive_main) %>% 
  drop_na(motive_main) %>% 
  filter(motive_main != "other") %>%
  fastDummies::dummy_cols(select_columns = c("motive_main")) %>% 
  group_by(version) %>%
  summarise(
    mean_belief = mean(motive_main_belief, na.rm = T),
    sd_belief = sd(motive_main_belief, na.rm = T),
    belief_n = sum(!is.na(motive_main_belief)),
    se_belief = sd_belief / sqrt(belief_n),
    mean_benefit = mean(motive_main_benefit, na.rm = T),
    sd_benefit = sd(motive_main_benefit, na.rm = T),
    n_benefit = sum(!is.na(motive_main_benefit)),
    se_benefit = sd_benefit / sqrt(n_benefit),
    mean_risk = mean(motive_main_risk, na.rm = T),
    sd_risk = sd(motive_main_risk, na.rm = T),
    n_risk = sum(!is.na(motive_main_risk)),
    se_risk = sd_risk / sqrt(n_risk)
  ) %>%
  ungroup() %>% 
  transmute(
    version,
    mean_belief, mean_benefit, mean_risk,
    ci_low_belief = mean_belief - 1.96 * se_belief,
    ci_high_belief = mean_belief + 1.96 * se_belief,
    ci_low_benefit = mean_benefit - 1.96 * se_benefit,
    ci_high_benefit = mean_benefit + 1.96 * se_benefit,
    ci_low_risk = mean_risk - 1.96 * se_risk,
    ci_high_risk = mean_risk + 1.96 * se_risk
  ) %>% 
  pivot_longer(cols = -version)

df_temp %>% 
  filter(str_detect(name, "mean")) %>% 
  transmute(version, name = str_sub(name, 6, -1), mean = value) %>%
  inner_join(
    df_temp %>% filter(str_detect(name, "ci_low")) %>% transmute(version, name = str_sub(name, 8, -1), ci_low = value) 
  ) %>% 
  inner_join(
    df_temp %>% filter(str_detect(name, "ci_high")) %>% transmute(version, name = str_sub(name, 9, -1), ci_high = value)
  ) %>%
  ggplot() +
  geom_linerange(aes(x = name, ymin = ci_low, ymax = ci_high), width = 0.5) +
  geom_point(aes(x = name, y = mean), size = 4, alpha = 0.8, color = "blue") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  theme_bw() +
  facet_wrap(vars(version)) +
  labs(
    x = "Impediment",
    y = "% respondents",
    title = "Impediment shift from 'No benefits' to 'Risky' (Feb'22 → March'22)",
    subtitle = "Error bars represent 95% confidence intervals"
  )

rm(df_temp)

3. Within the small set (20%) of respondents with ability impediments across pilots, a majority mention (50%) lack of time.

df %>% 
  filter(ability_main %in% c("availability", "time", "money")) %>% 
  transmute(ability_main) %>% 
  fastDummies::dummy_cols(select_columns = c("ability_main")) %>% 
  summarise(
    mean1 = mean(ability_main_time, na.rm = T),
    sd1 = sd(ability_main_time, na.rm = T),
    n1 = sum(!is.na(ability_main_time)),
    se1 = sd1 / sqrt(n1),
    
    mean2 = mean(ability_main_money, na.rm = T),
    sd2 = sd(ability_main_money, na.rm = T),
    n2 = sum(!is.na(ability_main_money)),
    se2 = sd2 / sqrt(n2),
    
    mean3 = mean(ability_main_availability, na.rm = T),
    sd3 = sd(ability_main_availability, na.rm = T),
    n3 = sum(!is.na(ability_main_availability)),
    se3 = sd3 / sqrt(n3),
  ) %>% 
  mutate(
    id = 1,
    ci_low1 = mean1 - 1.96*se1,
    ci_high1 = mean1 + 1.96*se1,
    ci_low2 = mean2 - 1.96*se2,
    ci_high2 = mean2 + 1.96*se2,
    ci_low3 = mean3 - 1.96*se3,
    ci_high3 = mean3 + 1.96*se3,
  ) %>% 
  pivot_longer(cols = -id) %>% 
  transmute(
    measure = str_sub(name, 1, -2),
    imped = str_sub(name, -1, -1),
    value
  ) %>% 
  arrange(measure, imped) %>% 
  pivot_wider(names_from = measure, values_from = value) %>% 
  mutate(
    imped = case_when(
      imped == 1 ~ "Time",
      imped == 2 ~ "Money",
      imped == 3 ~ "Availability",
    ) %>% factor(levels = c("Time", "Availability", "Money"))
  ) %>% 
  ggplot() +
  # geom_col(aes(x = imped, y = mean), alpha = 0.8) +
  # geom_errorbar(aes(x = imped, ymin = ci_low, ymax = ci_high), width = 0.4) +
  geom_linerange(aes(x = imped, ymin = ci_low, ymax = ci_high), width = 0.5) +
  geom_point(aes(x = imped, y = mean), size = 4, alpha = 0.8, color = "blue") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = scales::breaks_width(0.1)) +
  theme_bw() +
  labs(
    x = "Ability Impediment",
    y = "% Respondents with Ability Impediment",
    title = "Majority ability impediments relate to lack of time",
    subtitle = "Error bars represent 95% confidence intervals"
  )

4. Free text responses for impediments are both sparse, and not far from our encoded responses [except for Vaccinated people can die being a frequent free text response we do not have]. Around 24% responses in the no-motivation fork and 19% responses in the no-ability fork are redundant. Mis-forking (i.e. being assigned to the wrong fork) is negligible for the motivation fork, but at 10% for the ability fork (i.e. 10% respondents here should be in the motivation fork).

A free text response is labeled redundant if it does not convey any information relevant to the question asked. For instance, “Can you say more?” can have redundant responses like “No”, “Yes”, “gibberish@#4”.

A free text response is labeled misforked if it mentions a reason that does not match the fork the participant is in. For instance, if a participant in the no ability, yes motivation to get vaxxed fork mentions a reason showing they have no motivation to get vaxxed, they are considered misforked.

The charts below are created from cleaned and standardized free text responses. A couple of notes:

  1. Free text response categories that are already encoded in our chatbot MCQs are highlighted in blue.
  2. Redundant responses (as defined above) are excluded from these charts.

Motivation impediments

Question: Are there other reasons why you might not want the vaccine?

df_text %>%
  count(motive_other_clean) %>% 
  arrange(n) %>% 
  drop_na(motive_other_clean) %>% 
  mutate(
    motive_other_clean = fct_inorder(motive_other_clean),
    encoded = (motive_other_clean %in% c("bad side effects", "dont trust pharma", "freedom to choose", "vaccines dont work", "religious reasons", "dont trust government", "unlikely to get sick", "no benefit", "needles/pain", "had covid already", "i can recover", "covid not dangerous")),
  ) %>% 
  filter(motive_other_clean != "no", motive_other_clean != "redundant") %>% 
  ggplot(aes(motive_other_clean, n, fill = encoded)) +
  geom_col() +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(
    y = "Respondent count",
    x = "Motivation impediment text",
    fill = "Encoded in chatbot"
  )

Proportion of redundant responses: 24%

df_text %>% 
  mutate(is_redundant = (motive_other_clean == "redundant")) %>%
  drop_na(motive_other_clean) %>% 
  tabyl(is_redundant) %>% 
  as_tibble() %>% 
  select(is_redundant, percent) %>% 
  kable(format = "pipe")
is_redundant percent
FALSE 0.757874
TRUE 0.242126


Ability impediments

Question: Are there other reasons it might be hard to get the vaccine?

df_text %>%
  count(ability_other_clean) %>% 
  arrange(n) %>% 
  drop_na(ability_other_clean) %>% 
  mutate(
    ability_other_clean = fct_inorder(ability_other_clean),
    encoded = (ability_other_clean %in% c("no motive", "too far away", "travel costs", "getting off work", "no cash", "no vaccines left"))
  ) %>% 
  filter(ability_other_clean != "no", ability_other_clean != "redundant") %>% 
  ggplot(aes(ability_other_clean, n, fill = encoded)) +
  geom_col() +
  coord_flip() +
  theme_ipsum() +
  theme(legend.position = "bottom") +
  labs(
    y = "Respondent count",
    x = "Ability impediment text",
    fill = "Encoded in chatbot"
  )

Proportion of redundant responses: 19%

df_text %>% 
  mutate(is_redundant = (ability_other_clean == "redundant")) %>%
  drop_na(ability_other_clean) %>% 
  tabyl(is_redundant) %>% 
  as_tibble() %>% 
  select(is_redundant, percent) %>% 
  kable(format = "pipe")
is_redundant percent
FALSE 0.8058252
TRUE 0.1941748

Proportion of misforked responses: 10%

df_text %>% 
  mutate(is_redundant = (ability_other_clean == "no motive")) %>%
  drop_na(ability_other_clean) %>% 
  tabyl(is_redundant) %>% 
  as_tibble() %>% 
  select(is_misforked = is_redundant, percent) %>% 
  kable(format = "pipe")
is_misforked percent
FALSE 0.9029126
TRUE 0.0970874

5. A multinomial logit model with polynomial interactions among demographics, or even a random forst model, does not predict motivation or ability impediments well for unvaccinated respondents. This is also corroborated by the correlation plots.

set.seed(929292911)

# multinomial logit
df_mlm <-
  df %>%
  transmute(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    against_beliefs = if_else(motive_main == "belief", 1L, 0L) %>% replace_na(0L),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0L),
    risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0L),
    treat_nchar = nchar(best_treatment_explain) %>% replace_na(0L),
    unvax = if_else(vax_status == "unvax", 1L, 0L),
    impediment = case_when(
      motive_main == "belief" ~ "Against beliefs",
      motive_main == "benefit" ~ "No benefits",
      motive_main == "risk" ~ "Risky",
      TRUE ~ "No impediment" 
    ) %>% as_factor(),
    vax_status
  ) %>% 
  filter(vax_status == "unvax")

df_mlm$impediment <- relevel(df_mlm$impediment, ref = "No impediment")

trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]

mlm1 <- 
  multinom(impediment ~ female + education + income + religiosity + politics + location + white, data = training)

mlm2 <-
  multinom(impediment ~ female * education * income * religiosity * politics * location * white, data = training)

Motivation impediments

Multinomial Logit

Baseline model with 70/30 train/test split and no interactions [impediment ~ female + education + income + religiosity + politics + location + white] for unvaccinated respondents does a bad job of predicting true motivation impediments. Misclassification error for this model is too high at 68%.

predicted_class <- predict(mlm1, test)

table(`Predicted class` = predicted_class, `True class` = test$impediment)
##                  True class
## Predicted class   No impediment Risky No benefits Against beliefs
##   No impediment              14    15          13               4
##   Risky                      14    17          19              11
##   No benefits                 7     5          12               3
##   Against beliefs             0     0           0               0
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)
## [1] 0.6791045

A slightly more complex model with interactions among all independent variables for unvaccinated respondents does a bad job too, as misclassification error is at 67%.

predicted_class <- predict(mlm2, test)

table(`Predicted class` = predicted_class, `True class` = test$impediment)
##                  True class
## Predicted class   No impediment Risky No benefits Against beliefs
##   No impediment              15    12          18               9
##   Risky                       9     8          16               6
##   No benefits                 9    10           5               3
##   Against beliefs             2     7           5               0
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)
## [1] 0.7910448

Random Forest

We also try a random forest model with 70/30 train/test splitfor this task. The misclassification error still stays as high as 69%.

# impute mising values before feeding into RF
training <-
  training %>% 
  simputation::impute_lm(politics + income + religiosity + location + education ~ female + white) %>% 
  drop_na()

rf_model <- 
  randomForest::randomForest(impediment ~ female + education + income + religiosity + politics + location + white, data = training, importance = TRUE)

predicted_class <- predict(rf_model, test)

table(`Predicted class` = predicted_class, `True class` = test$impediment)
##                  True class
## Predicted class   No impediment Risky No benefits Against beliefs
##   No impediment              17    13          12               9
##   Risky                      14    19          22               6
##   No benefits                 4     4          10               3
##   Against beliefs             0     1           0               0
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)
## [1] 0.6567164

Given the high misclassification error, the variable importance based on Mean decrease accuracy and Mean decrease Gini are inconsistent.

  • MeanDecreaseAccuracy gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
  • MeanDecreaseGini: GINI is a measure of node impurity. Highest purity means that each node contains only elements of a single class. Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.

Note: Both these measures are used to rank variables in terms of importance and their absolute values could be disregarded.

randomForest::varImpPlot(rf_model, sort = T, main = "Random Forest - Variable Importance")


Correlation Plots

As a next step, the correlation plots below show correlation coefficients between continuous variables. We map binary and ordinal variables to continuous variables as follows:

  • female: 1 if female, 0 if male
  • income: 0 if the participant is unemployed, 1 if household income < R5,000, 2 if household income in R5,000 – R9,999, …, 6 if household income > R100,000
  • education: 1 if the participant’s education < high school, 2 if education is high school, …, 6 if education is a graduate degree
  • religiosity: 1 if the participant is not very religious, 2 if somewhat religious, 3 if very religious
  • politics: 1 if the participant is conservative, 2 if moderate, 3 if liberal
  • location: 1 if the participant lives in rural, 2 if suburban, 3 if urban,
  • white: 1 if the participant is a white or caucasian, 0 if not
  • no_motive: 1 if the participant does not have the motive to get vax, 0 otherwise
    • no_benefits: 1 if participant believes vaccine has no benefits, 0 otherwise
    • against_beliefs:: 1 if participant believes vaccine is against their beliefs, 0 otherwise
    • risky:: 1 if participant believes vaccine is risky, 0 otherwise
  • no_ability: 1 if the participant does not have the ability to get vax, 0 otherwise
    • no_time: 1 if participant has no time to get vaxxed, 0 otherwise
    • no_money: 1 if participant has no money to get vaxxed, 0 otherwise
    • no_availability: 1 if participant faces vax availability issues, 0 otherwise
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    against_beliefs = if_else(motive_main == "belief", 1L, 0L),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L),
    risky = if_else(motive_main == "risk", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("no_motive", "female", "income", "education", "religiosity", "politics", "location", "white")


mat1 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat1_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    against_beliefs = if_else(motive_main == "belief", 1L, 0L),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L),
    risky = if_else(motive_main == "risk", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("no_benefits", "female", "income", "education", "religiosity", "politics", "location", "white")


mat2 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat2_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    against_beliefs = if_else(motive_main == "belief", 1L, 0L),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L),
    risky = if_else(motive_main == "risk", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("against_beliefs", "female", "income", "education", "religiosity", "politics", "location", "white")

mat3 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat3_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    against_beliefs = if_else(motive_main == "belief", 1L, 0L),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L),
    risky = if_else(motive_main == "risk", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("risky", "female", "income", "education", "religiosity", "politics", "location", "white")

mat4 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat4_p <- cor_pmat(df_mc_numeric)[1, 2:8]
final_mat <-
  tibble(
    mat4, mat3, mat2, mat1
  ) %>% 
  as.matrix()

final_mat_p <-
  tibble(
    mat4_p, mat3_p, mat2_p, mat1_p
  ) %>% 
  as.matrix() 

rownames(final_mat) <- c("female", "income", "education", "religiosity", "politics", "location", "white")
colnames(final_mat) <- c("risky", "against_beliefs", "no_benefits", "no_motive")
colnames(final_mat_p) <- c("risky", "against_beliefs", "no_benefits", "no_motive")

ggcorrplot(final_mat, lab_size = 3, lab = T, tl.cex = 10)

Ability impediments

Multinomial Logit

# multinomial logit
df_mlm <-
  df %>%
  transmute(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    against_beliefs = if_else(motive_main == "belief", 1L, 0L),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L),
    risky = if_else(motive_main == "risk", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L),
    impediment = case_when(
      ability_main == "availability" ~ "No Availability",
      ability_main == "money" ~ "No Money",
      ability_main == "time" ~ "No Time",
      TRUE ~ "No impediment" 
    ) %>% as_factor(),
    vax_status
  ) %>% 
  filter(vax_status == "unvax")

df_mlm$impediment <- relevel(df_mlm$impediment, ref = "No impediment")

trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]

mlm1 <- 
  multinom(impediment ~ female + education + income + religiosity + politics + location + white, data = training)

mlm2 <-
  multinom(impediment ~ female * education * income * religiosity * politics * location * white, data = training)

Baseline model with 70/30 train/test split and no interactions [impediment ~ female + education + income + religiosity + politics + location + white] for unvaccinated respondents classifies everyone as having no impediment. Since we have very few respondents with ability impediments, the model is not going to be useful despite the misclassification error being low (19%).

predicted_class <- predict(mlm1, test)

table(`Predicted class` = predicted_class, `True class` = test$impediment)
##                  True class
## Predicted class   No impediment No Money No Time No Availability
##   No impediment             107        3      19               3
##   No Money                    0        0       0               0
##   No Time                     0        0       0               0
##   No Availability             0        0       0               0
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)
## [1] 0.1893939

Random Forest

We also try a random forest model with 70/30 train/test splitfor this task. The misclassification error is around 20%, though most predictions are No impediment.

# impute mising values before feeding into RF
training <-
  training %>% 
  simputation::impute_lm(politics + income + religiosity + location + education ~ female + white) %>% 
  drop_na()

rf_model <- 
  randomForest::randomForest(impediment ~ female + education + income + religiosity + politics + location + white, data = training, importance = TRUE)

predicted_class <- predict(rf_model, test)

table(`Predicted class` = predicted_class, `True class` = test$impediment)
##                  True class
## Predicted class   No impediment No Money No Time No Availability
##   No impediment             106        3      19               3
##   No Money                    0        0       0               0
##   No Time                     1        0       0               0
##   No Availability             0        0       0               0
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)
## [1] 0.1969697

The variable importance based on Mean decrease accuracy and Mean decrease Gini are again inconsistent.

randomForest::varImpPlot(rf_model, sort = T, main = "Random Forest - Variable Importance")


Correlation plots

The correlation plots for ability impediments and demographics support these results.

df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    no_time = if_else(ability_main == "time", 1L, 0L),
    no_money = if_else(ability_main == "money", 1L, 0L),
    no_availability = if_else(ability_main == "availability", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("no_ability", "female", "income", "education", "religiosity", "politics", "location", "white")


mat5 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat5_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    no_time = if_else(ability_main == "time", 1L, 0L),
    no_money = if_else(ability_main == "money", 1L, 0L),
    no_availability = if_else(ability_main == "availability", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("no_time", "female", "income", "education", "religiosity", "politics", "location", "white")

mat6 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat6_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    no_time = if_else(ability_main == "time", 1L, 0L),
    no_money = if_else(ability_main == "money", 1L, 0L),
    no_availability = if_else(ability_main == "availability", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("no_money", "female", "income", "education", "religiosity", "politics", "location", "white")

mat7 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat7_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    no_ability = case_when(
      ability == "no" ~ 1,
      ability == "yes" ~ 0,
    ),
    no_motive = case_when(
      motive == "no" ~ 1,
      motive == "yes" ~ 0,
    ),
    no_time = if_else(ability_main == "time", 1L, 0L),
    no_money = if_else(ability_main == "money", 1L, 0L),
    no_availability = if_else(ability_main == "availability", 1L, 0L),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L)
  ) %>%
  select("no_availability", "female", "income", "education", "religiosity", "politics", "location", "white")

mat8 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat8_p <- cor_pmat(df_mc_numeric)[1, 2:8]
final_mat <-
  tibble(
    mat8, mat7, mat6, mat5
  ) %>% 
  as.matrix()

final_mat_p <-
  tibble(
    mat8_p, mat7_p, mat6_p, mat5_p
  ) %>% 
  as.matrix() 

rownames(final_mat) <- c("female", "income", "education", "religiosity", "politics", "location", "white")
colnames(final_mat) <- c("no_availability", "no_money", "no_time", "no_ability")
colnames(final_mat_p) <- c("no_availability", "no_money", "no_time", "no_ability")

ggcorrplot(final_mat, lab_size = 3, lab = T, tl.cex = 10)

3.3 Treatments

1. In our February pilot version 5, no clear treatment was preferred by unvaccinated individuals, while vaccinated individuals strongly preferred family support. This trend changed with rising vaccinations in our March pilot version 6. A large proportion of unvaccinated and vaccinated respondents indicated job/school mandates as the treatment that worked for them.

Note: Job/school required treatment was added after cleaning free text responses from pilot 5.

plot1 <-
  df_v5 %>% 
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
  ) %>%
  drop_na(best_treatment) %>% 
  count(vax_status, best_treatment) %>% 
  arrange(vax_status, -n) %>% 
  group_by(vax_status) %>% 
  mutate(perc = n/sum(n)) %>% 
  ungroup() %>% 
  mutate(best_treatment = fct_inorder(best_treatment) %>% fct_rev()) %>% 
  ggplot(aes(best_treatment, n)) +
  geom_col() +
  coord_flip() +
  theme_bw() + 
  facet_wrap(vars(vax_status)) +
  labs(x = "Best treatment", y = "# of respondents", title = "Treatments preferred to get vaxxed (Pilot 5, n = 1429)")

plot2 <-
  df_v6 %>% 
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
  ) %>%
  drop_na(best_treatment) %>% 
  count(vax_status, best_treatment) %>% 
  arrange(vax_status, -n) %>% 
  group_by(vax_status) %>% 
  mutate(perc = n/sum(n)) %>% 
  ungroup() %>% 
  mutate(best_treatment = fct_inorder(best_treatment) %>% fct_rev()) %>% 
  ggplot(aes(best_treatment, n)) +
  geom_col() +
  coord_flip() +
  theme_bw() + 
  facet_wrap(vars(vax_status)) +
  labs(x = "Best treatment", y = "# of respondents", title = "Treatments preferred to get vaxxed (Pilot 6, n = 1763)")

cowplot::plot_grid(plot1, plot2, nrow = 2)

df_v5 %>% 
  transmute(
    
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
  ) %>%
  drop_na(best_treatment) %>% 
  group_by(vax_status, best_treatment) %>% 
  summarise(
    vax_mean = mean(vaxxed, na.rm = T),
    vax_sd = sd(vaxxed, na.rm = T),
    vax_n = sum(!is.na(vaxxed)),
    vax_se = vax_sd / sqrt(vax_n)
  ) %>% 
  mutate(
    confint_low = vax_mean - 1.96*vax_se,
    confint_high = vax_mean + 1.96*vax_se,
  )

2. Free text responses for preferred treatments were dominated by responses highlighting that nothing would work. The second most frequent response was job/school mandates, which has now been incorporated as an encoded response in our chatbot.

Question: What would best help you to get vaccinated?

df_text %>%
  count(best_treatment_other_clean) %>% 
  arrange(n) %>% 
  drop_na(best_treatment_other_clean) %>% 
  filter(best_treatment_other_clean != "redundant") %>% 
  mutate(
    best_treatment_other_clean = fct_inorder(best_treatment_other_clean),
    encoded = (best_treatment_other_clean %in% c("nothing", "trusted info source", "more transparency"))
  ) %>% 
  ggplot(aes(best_treatment_other_clean, n, fill = encoded)) +
  geom_col() +
  coord_flip() +
  theme_ipsum() +
  theme(legend.position = "bottom") +
  labs(
    y = "Respondent count",
    x = "Treatment text",
    fill = "Encoded in chatbot"
  )

Proportion of redundant responses: 7.3%

df_text %>% 
  mutate(is_redundant = (best_treatment_other_clean == "redundant")) %>%
  drop_na(best_treatment_other_clean) %>% 
  tabyl(is_redundant) %>% 
  as_tibble() %>% 
  select(is_redundant, percent) %>% 
  kable(format = "pipe")
is_redundant percent
FALSE 0.9266667
TRUE 0.0733333

3. Models with demographics are not strong predictors of preferred treatments.

Multinomial logit

# multinomial logit
df_mlm <-
  df %>%
  transmute(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    best_treatment = str_to_lower(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^nothing"), "nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("job/school required", "family supports it", "trusted info source", "more transparency", "nothing", "rewards for vaxxing"), best_treatment, NA_character_) %>% as_factor(),
  )

df_mlm$best_treatment <- relevel(df_mlm$best_treatment, ref = "nothing")

trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]

mlm1 <- 
  multinom(best_treatment ~ female + education + income + religiosity + politics + location + white, data = training)

mlm2 <-
  multinom(best_treatment ~ female * education * income * religiosity * politics * location * white, data = training)

Baseline model with 70/30 train/test split and no interactions [best_treatment ~ female + education + income + religiosity + politics + location + white] for all respondents doesn’t do well as we have a misclassification error of 70%.

predicted_class <- predict(mlm1, test)

table(`Predicted class` = predicted_class, `True class` = test$best_treatment)
##                      True class
## Predicted class       nothing family supports it job/school required
##   nothing                   1                  0                   0
##   family supports it       26                 60                  29
##   job/school required      12                 55                  39
##   rewards for vaxxing       1                  3                   1
##   trusted info source       3                  1                   4
##   more transparency         1                  0                   1
##                      True class
## Predicted class       rewards for vaxxing trusted info source more transparency
##   nothing                               0                   1                 0
##   family supports it                    7                  31                17
##   job/school required                   8                  20                 7
##   rewards for vaxxing                   0                   3                 1
##   trusted info source                   1                   1                 2
##   more transparency                     1                   0                 0
mean(as.character(predicted_class) != as.character(test$best_treatment), na.rm = T)
## [1] 0.7002967

Baseline model with 70/30 train/test split and all possible independent variable interactions doesn’t do well as we have a misclassification error of 73%.

predicted_class <- predict(mlm2, test)

table(`Predicted class` = predicted_class, `True class` = test$best_treatment)
##                      True class
## Predicted class       nothing family supports it job/school required
##   nothing                   3                  8                   5
##   family supports it       18                 48                  30
##   job/school required      13                 45                  21
##   rewards for vaxxing       3                  1                   3
##   trusted info source       6                 14                  13
##   more transparency         1                  3                   2
##                      True class
## Predicted class       rewards for vaxxing trusted info source more transparency
##   nothing                               1                   1                 1
##   family supports it                    7                  29                16
##   job/school required                   7                  17                 5
##   rewards for vaxxing                   0                   3                 1
##   trusted info source                   1                   5                 4
##   more transparency                     1                   1                 0
mean(as.character(predicted_class) != as.character(test$best_treatment), na.rm = T)
## [1] 0.7715134

Random Forest

We also try a random forest model with 70/30 train/test splitfor this task. The misclassification error is around 71%.

# impute mising values before feeding into RF
training <-
  training %>% 
  simputation::impute_lm(politics + income + religiosity + location + education ~ female + white) %>% 
  drop_na()

rf_model <- 
  randomForest::randomForest(best_treatment ~ female + education + income + religiosity + politics + location + white, data = training, importance = TRUE)

predicted_class <- predict(rf_model, test)

table(`Predicted class` = predicted_class, `True class` = test$best_treatment)
##                      True class
## Predicted class       nothing family supports it job/school required
##   nothing                   5                  7                   1
##   family supports it       18                 50                  26
##   job/school required      19                 56                  40
##   rewards for vaxxing       0                  1                   1
##   trusted info source       2                  4                   5
##   more transparency         0                  1                   1
##                      True class
## Predicted class       rewards for vaxxing trusted info source more transparency
##   nothing                               0                   4                 0
##   family supports it                    6                  28                17
##   job/school required                   7                  23                 8
##   rewards for vaxxing                   0                   0                 0
##   trusted info source                   3                   1                 2
##   more transparency                     1                   0                 0
mean(as.character(predicted_class) != as.character(test$best_treatment), na.rm = T)
## [1] 0.7151335

The variable importance based on Mean decrease accuracy and Mean decrease Gini are again inconsistent.

randomForest::varImpPlot(rf_model, sort = T, main = "Random Forest - Variable Importance")


Correlation Plots

Correlation plots for treatments vs demographics support these results:

df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L),
    best_treatment = str_to_lower(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^nothing"), "nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("job/school required", "family supports it", "trusted info source", "more transparency", "nothing", "rewards for vaxxing"), best_treatment, NA_character_),
  ) %>%
  fastDummies::dummy_cols(select_columns = "best_treatment") %>% 
  select(family_supports_it = `best_treatment_family supports it`, "female", "income", "education", "religiosity", "politics", "location", "white")

mat9 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat9_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L),
    best_treatment = str_to_lower(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^nothing"), "nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("job/school required", "family supports it", "trusted info source", "more transparency", "nothing", "rewards for vaxxing"), best_treatment, NA_character_)
  ) %>%
  fastDummies::dummy_cols(select_columns = "best_treatment") %>% 
  select(trusted_info_source = `best_treatment_trusted info source`, "female", "income", "education", "religiosity", "politics", "location", "white")

mat10 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat10_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L),
    best_treatment = str_to_lower(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^nothing"), "nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("job/school required", "family supports it", "trusted info source", "more transparency", "nothing", "rewards for vaxxing"), best_treatment, NA_character_)
  ) %>%
  fastDummies::dummy_cols(select_columns = "best_treatment") %>% 
  select(job_school_required = `best_treatment_job/school required`, "female", "income", "education", "religiosity", "politics", "location", "white")

mat11 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat11_p <- cor_pmat(df_mc_numeric)[1, 2:8]
df_mc_numeric <-
  df %>%
  mutate(
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    treat_nchar = nchar(best_treatment_explain),
    unvax = if_else(vax_status == "unvax", 1L, 0L),
    best_treatment = str_to_lower(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^nothing"), "nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("job/school required", "family supports it", "trusted info source", "more transparency", "nothing", "rewards for vaxxing"), best_treatment, NA_character_)
  ) %>%
  fastDummies::dummy_cols(select_columns = "best_treatment") %>% 
  select(rewards_for_vaxxing = `best_treatment_rewards for vaxxing`, "female", "income", "education", "religiosity", "politics", "location", "white")

mat12 <- cor(df_mc_numeric, use = "pairwise.complete.obs")[1, 2:8]
mat12_p <- cor_pmat(df_mc_numeric)[1, 2:8]
final_mat <-
  tibble(
    mat12, mat11, mat10, mat9
  ) %>% 
  as.matrix()

final_mat_p <-
  tibble(
    mat12_p, mat11_p, mat10_p, mat9_p
  ) %>% 
  as.matrix() 

rownames(final_mat) <- c("female", "income", "education", "religiosity", "politics", "location", "white")
colnames(final_mat) <- c("rewards", "mandatory", "trusted_info_source", "family_support")
colnames(final_mat_p) <- c("rewards", "mandatory", "trusted_info_source", "family_support")

ggcorrplot(final_mat, lab_size = 3, lab = T, tl.cex = 10)

3.4 Funnel and Engagement

1. 85% respondents complete the survey conditional on consent. Cost per completed survey is around $0.5 (excluding airtime). Cost per completed survey for unvaccinated participants is relatively higher at $1.27 (excluding airtime) as we have less unvaccinated respondents in the most recent pilot.

Metrics used in the table below are defined as follows:

  • Impressions (Total Count) = the total number of times our ad has been viewed
  • Clickthrough (%) = #clicks / #impressions
  • Messages Sent (%) = #conversations / #clicks
  • Consent Obtained (%) = #consents / #conversations
  • Core Survey Complete (%) = #forking section completed / #consents
  • Treatment Complete (%) = #treatment section completed / #forking section completed
  • Demo Questions Complete (%) = #demog section completed / #treatment section completed
  • Full Survey Complete (%) = #full chat completed / #demog section completed
  • Cost per Impression = amount spent / #impressions (in USD)
  • Cost per Link Click = amount spent / #clicks (in USD)
  • Cost per Survey Complete (All participants) = amount spent / #full chat completed (in USD)
  • Cost per Survey Complete (Unvax) = amount spent / #full chat completed with unvaccinated participants (in USD)
  • Cost per Survey Complete (Unvax, Open to Treatment) = amount spent / #full chat completed with unvaccinated and open to treatment participants (in USD)

Note: Costs in below table exclude $0.5 airtime cost per respondent completing the survey.

# overall ad cost for full pilot
df_temp <- df_full
ads_latest <- ads

df_ads_unvax_current <- df_temp %>% filter(vax_status == "unvax")
df_ads_unvax_open_current <- df_ads_unvax_current %>% filter(vax_future %in% c("maybe", "of course!"))

cost <- sum(ads_latest$Amount.spent..USD., na.rm = T)
impressions <- sum(ads_latest$Impressions, na.rm = T)
clicks <- sum(ads_latest$Link.clicks, na.rm = T)
conversations <- sum(ads_latest$Results, na.rm = T)

consents <- sum(df_temp$consent == "yes", na.rm = T)
core_complete <- sum(!is.na(df_temp$main_complete), na.rm = T)
treatment_complete <- sum(df_temp$treatment_complete == "yes", na.rm = T)
demog_complete <- sum(!is.na(df_temp$demog_complete), na.rm = T)
full_complete <- sum(df_temp$full_complete == "yes", na.rm = T)
full_complete_unvax <- sum(df_ads_unvax_current$full_complete == "yes", na.rm = T)
full_complete_unvax_open <- sum(df_ads_unvax_open_current$full_complete == "yes", na.rm = T)


dropoff_current <- data.frame(
  metric = c("Impressions", "Link Clicks", "Messages Sent", 
             "Consent Obtained", "Core Survey Complete", "Treatment complete",
             "Demographic Questions Complete", "Full Survey Complete", 
             "Full Survey Complete (Unvax)", "Full Survey Complete (Unvax, Open to Treatment)"), 
  
  total = c(impressions, clicks, conversations, consents, 
            core_complete, treatment_complete, demog_complete, full_complete,
            full_complete_unvax, full_complete_unvax_open),
  
  perc_of_prev = c("-", form_percent(clicks / impressions),
                   form_percent(conversations / clicks),
                   form_percent(consents / conversations),
                   form_percent(core_complete / consents),
                   form_percent(treatment_complete/core_complete),
                   form_percent(demog_complete / treatment_complete),
                   form_percent(full_complete / demog_complete),
                   form_percent(full_complete_unvax / full_complete),
                   form_percent(full_complete_unvax_open / full_complete_unvax)),
  
  cost_per = c(form_cost(impressions), form_cost(clicks),
               form_cost(conversations), form_cost(consents),
               form_cost(core_complete), form_cost(treatment_complete),
               form_cost(demog_complete), form_cost(full_complete),
               form_cost(full_complete_unvax), form_cost(full_complete_unvax_open)))


dropoff_current %>% 
  rename(
    Metric = metric,
    `Total` = total,
    `% of Previous Funnel` = perc_of_prev,
    `$ Ad Cost per` = cost_per
  ) %>% 
  datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))

2. Dropoff starts early among participants not completing our survey.

For unvaccinated respondents (including those not reporting vax status):

  • Around half provide consent.
  • Around 35% provide vaccination status
  • Around 33% provide motive information to get vaccine, but only 24% provide ability information to get vaccine
  • Around 13% provide preferred treatment
  • Less than 5% provide demographic information

Question-wise completion rates for unvaccinated participants not fully completing the survey:

df_full %>% 
  select(continue_2,continue_3,vax_status,motive,ability,unvaxeasy_main,unvaxeasy_main_other,forget_main,forget_main_other,priority_main,priority_main_other,unvaxeasy_other,unvaxeasy_other_add,vax_future,best_treatment,best_treatment_explain,best_treatment_explain,best_treatment_explain,best_treatment_explain,post_vax_measure,cv_age,ethnicity,income,education,religion,christian_type,religiosity,politics,location,comfortable,engaging,enjoyable,enjoyable_not_parts,suggestions,phone_number,full_complete) %>%
  filter(is.na(full_complete)) %>%
  filter(is.na(vax_status) | vax_status == "unvax") %>% 
  summarize_all(~ mean(!is.na(.))) %>% 
  mutate(id = 1) %>% 
  pivot_longer(cols = -id, names_to = "question", values_to = "completion_rate") %>% 
  arrange(-completion_rate) %>% 
  transmute(
    completion_rate,
    question = case_when(
      question == "continue_2" ~ "Consent 1: Ask your opinion on vaccines",
      question == "continue_3" ~ "Consent 2: Continue if you're over 18 and wish to start",
      question == "vax_status" ~ "Vaccination status",
      question == "motive" ~ "Motive to get vaccine",
      question == "ability" ~ "Ability to get vaacine",
      question == "vax_future" ~ "Get vaccine in future",
      question == "best_treatment" ~ "Preferred treatment",
      question == "ethnicity" ~ "Ethnicity",
      question == "income" ~ "Income",
      question == "education" ~ "Education",
      question == "religion" ~ "Religion",
      question == "religiosity" ~ "Religiosity",
      question == "politics" ~ "Politics",
      question == "location" ~ "Location",
      question == "comfortable" ~ "Was chat comfortable?",
      question == "enjoyable" ~ "Was chat enjoyable?",
      TRUE ~ NA_character_
    ) %>% fct_inorder() %>% fct_rev()
  ) %>% 
  drop_na() %>% 
  ggplot(aes(question, completion_rate)) +
  geom_col(alpha = 0.85) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  coord_flip() +
  theme_minimal() +
  labs(
    x = "Question",
    y = "Completion rate (%)",
    # subtitle = "Completion rate by question for non-completers"
  )

3. Most participants reaching the end of the survey enjoyed interacting with/were comfortable with the chatbot.

df_v6 %>%
  select(version, enjoyable) %>% 
  bind_rows(
    df_full_v4b %>% filter(full_complete == "yes", version == "pilot_4b") %>% select(version, enjoyable),
    df_full_v5 %>% filter(full_complete == "yes", version == "pilot_5") %>% select(version, enjoyable)
  ) %>% 
  mutate(
    enjoyable = str_to_sentence(enjoyable) %>% str_squish()
  ) %>% 
  filter(enjoyable %in% c("Very enjoyable", "Moderately enjoyable", "Not enjoyable")) %>%
  count(version, enjoyable) %>% 
  group_by(version) %>% 
  mutate(percent = n/sum(n)) %>% 
  ungroup() %>% 
  arrange(version, n) %>% 
  mutate(
    enjoyable = enjoyable %>% fct_inorder() %>% fct_rev()
  ) %>% 
  #rename(ad_text = `Analysis 4 - body text`)
  ggplot(aes(version, percent, fill = enjoyable), color = "black") +
  geom_col(position = "stack", alpha = 0.8) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(
    x = "Version",
    y = "% respondents",
    color = ""
  )

df %>%
  select(version, enjoyable) %>% 
  mutate(
    enjoyable = str_to_sentence(enjoyable) %>% str_squish()
  ) %>% 
  filter(enjoyable %in% c("Very enjoyable", "Moderately enjoyable", "Not enjoyable")) %>%
  count(version, enjoyable) %>% 
  group_by(version) %>% 
  mutate(percent = n/sum(n)) %>% 
  ungroup() %>% 
  arrange(version, n) %>% 
  mutate(
    enjoyable = enjoyable %>% fct_inorder() %>% fct_rev()
  ) %>% 
  ggplot(aes(version, percent, fill = enjoyable), color = "black") +
  geom_col(position = "stack", alpha = 0.8) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(
    x = "Version",
    y = "% respondents",
    color = ""
  )

df_v6 %>%
  select(version, comfortable) %>%
  bind_rows(
    df_full_v4b %>% filter(full_complete == "yes", version == "pilot_4b") %>% select(version, comfortable),
    df_full_v5 %>% filter(full_complete == "yes", version == "pilot_5") %>% select(version, comfortable)
  ) %>% 
  mutate(
    comfortable = str_to_sentence(comfortable) %>% str_squish()
  ) %>% 
  filter(comfortable %in% c("Very comfortable", "Somewhat comfortable", "Not comfortable")) %>%
  count(version, comfortable) %>% 
  group_by(version) %>% 
  mutate(percent = n/sum(n)) %>% 
  ungroup() %>% 
  arrange(version, n) %>% 
  mutate(
    comfortable = comfortable %>% fct_inorder() %>% fct_rev()
  ) %>% 
  ggplot(aes(version, percent, fill = comfortable), color = "black") +
  geom_col(position = "stack", alpha = 0.8) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(
    x = "Version",
    y = "% respondents",
    color = ""
  )

3.5 Ads

For context on ads used in this pilot, we have:

  • 1 campaign
  • 5 ad sets, 3 ads in each ad set
  • 15 ads split into:
    • 3 impediment themes (3 inaccessible, 6 risky, 6 unnecessary)
    • 2 different prompts (9 ads mentioning mobile airtime, 6 ads without it)
    • 9 different images (Images 1-6 used twice, images 7-9 used once)

Details on each ad can be found here.

1. Facebook A|B testing is not a “true” randomized A|B test as gender distribution is unbalanced across the 5 ad sets.

Even though the randomization is at the ad set level, we can see that pilot_5_unncessary_control and pilot_5_risky_airtime have very different gender ratios (close to 1 and 2 respectively). Facebook essentially enables a comparative test by dividing up the total population into five equal samples here, then runs an algorithm to identify attractive target populations to maximize cost efficiency on each sample.

df_v5 %>% 
  inner_join(ads_v5, by = "original_ref") %>% 
  clean_names() %>%
  filter(gender != "other") %>%
  compareGroups(gender ~ ad_set, data = .) %>% 
  createTable(show.all = T, show.p.overall = F)
## 
## --------Summary descriptives table by 'gender'---------
## 
## _____________________________________________________________________ 
##                                      [ALL]      female       male     
##                                     N=1379       N=839       N=540    
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## ad_set:                                                               
##     pilot_5_unnecessary_control   139 (10.1%) 75 (8.94%)  64 (11.9%)  
##     pilot_v5_inaccessible_airtime 252 (18.3%) 143 (17.0%) 109 (20.2%) 
##     pilot_v5_risky_airtime        429 (31.1%) 285 (34.0%) 144 (26.7%) 
##     pilot_v5_risky_control        115 (8.34%) 79 (9.42%)  36 (6.67%)  
##     pilot_v5_unnecessary_airtime  444 (32.2%) 257 (30.6%) 187 (34.6%) 
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

2. Ads with airtime outperform our control ads in recruiting participants, even after adjusting for the quality of responses.

The table below compares two Ad body text approaches - control (share your opinion) vs (take a short survey and earn airtime) - in terms of the metrics defined in section 3.4.

ads_specific <- ads %>% filter(`Analysis 4 - body text` == "control")

df_ads <- df_full %>% filter(original_ref %in% (ads_specific %>% pull(original_ref)))
df_ads_unvax <- df_ads %>% filter(vax_status == "unvax")
df_ads_unvax_open <- df_ads_unvax %>% filter(vax_future %in% c("maybe", "of course!"))

cost <- sum(ads_specific$Amount.spent..USD., na.rm = T)
impressions <- sum(ads_specific$Impressions, na.rm = T)
clicks <- sum(ads_specific$Link.clicks, na.rm = T)
conversations <- sum(ads_specific$Results, na.rm = T)

consents <- sum(df_ads$consent == "yes", na.rm = T)
core_complete <- sum(!is.na(df_ads$main_complete), na.rm = T)
treat_complete <- sum(!is.na(df_ads$treatment_complete), na.rm = T)
demog_complete <- sum(!is.na(df_ads$demog_complete), na.rm = T)
full_complete <- sum(df_ads$full_complete == "yes", na.rm = T)
full_complete_unvax <- sum(df_ads_unvax$full_complete == "yes", na.rm = T)
full_complete_unvax_open <- sum(df_ads_unvax_open$full_complete == "yes", na.rm = T)

treat_nchar <-
  df_ads %>% 
  transmute(
    treat_nchar = nchar(best_treatment_explain)
  ) %>% 
  summarize_all(sum, na.rm = T) %>% 
  pull(treat_nchar)

free_text_vars <- df_ads %>% select(ends_with("_explain")) %>% colnames()
free_text_vars <- free_text_vars[!free_text_vars %in% c('best_treatment_explain')]
mean_nchar <-
  df_ads %>%
  transmute_at(
    vars(free_text_vars),
    ~ nchar(.)
  ) %>%
  summarize_all(sum, na.rm = T) %>% 
  mutate(
    mean_nchar =
      pmap_dbl(
        select(., ends_with("_explain")),
        ~ mean(c(...), na.rm = TRUE)
      )
  ) %>% 
  pull(mean_nchar) %>% 
  round(2)

dropoff_1 <- tibble(
  metric = c(
    "Impressions (Total Count)", 
    "Clickthrough (%)",
    "Messages Sent (%)",
    "Consent Obtained (%)", 
    "Core Survey Complete (%)", 
    "Treatment Complete (%)",
    "Demo Questions Complete (%)",
    "Full Survey Complete (%)",
    "Total characters elicited per completed survey (treatment)",
    "Avg characters elicited per completed survey (impediment explanations)",
    "Cost per Impression",
    "Cost per Link Click",
    "Cost per Survey Complete (All participants)",
    "Cost per Survey Complete (Unvax)",
    "Cost per Survey Complete (Unvax, Open to Treatment)"
  ),
  ad = c(
    impressions, 
    form_percent(clicks / impressions),
    form_percent(conversations / clicks),
    form_percent(consents / conversations),
    form_percent(core_complete / consents),
    form_percent(treat_complete / core_complete),
    form_percent(demog_complete / treat_complete),
    form_percent(full_complete / demog_complete),
    round(treat_nchar/full_complete, 2),
    round(mean_nchar/full_complete, 2),
    form_cost(impressions), 
    form_cost(clicks),
    form_cost(full_complete),
    form_cost(full_complete_unvax),
    form_cost(full_complete_unvax_open)
  )
)

colnames(dropoff_1) <- c("Metric of Interest", "Control (6)")
ads_specific <- ads %>% filter(`Analysis 4 - body text` == "airtime")

df_ads <- df_full %>% filter(original_ref %in% (ads_specific %>% pull(original_ref)))
df_ads_unvax <- df_ads %>% filter(vax_status == "unvax")
df_ads_unvax_open <- df_ads_unvax %>% filter(vax_future %in% c("maybe", "of course!"))

cost <- sum(ads_specific$Amount.spent..USD., na.rm = T)
impressions <- sum(ads_specific$Impressions, na.rm = T)
clicks <- sum(ads_specific$Link.clicks, na.rm = T)
conversations <- sum(ads_specific$Results, na.rm = T)

consents <- sum(df_ads$consent == "yes", na.rm = T)
core_complete <- sum(!is.na(df_ads$main_complete), na.rm = T)
treat_complete <- sum(!is.na(df_ads$treatment_complete), na.rm = T)
demog_complete <- sum(!is.na(df_ads$demog_complete), na.rm = T)
full_complete <- sum(df_ads$full_complete == "yes", na.rm = T)
full_complete_unvax <- sum(df_ads_unvax$full_complete == "yes", na.rm = T)
full_complete_unvax_open <- sum(df_ads_unvax_open$full_complete == "yes", na.rm = T)

treat_nchar <-
  df_ads %>% 
  transmute(
    treat_nchar = nchar(best_treatment_explain)
  ) %>% 
  summarize_all(sum, na.rm = T) %>% 
  pull(treat_nchar)

free_text_vars <- df_ads %>% select(ends_with("_explain")) %>% colnames()
free_text_vars <- free_text_vars[!free_text_vars %in% c('best_treatment_explain')]
mean_nchar <-
  df_ads %>%
  transmute_at(
    vars(free_text_vars),
    ~ nchar(.)
  ) %>%
  summarize_all(sum, na.rm = T) %>% 
  mutate(
    mean_nchar =
      pmap_dbl(
        select(., ends_with("_explain")),
        ~ mean(c(...), na.rm = TRUE)
      )
  ) %>% 
  pull(mean_nchar) %>% 
  round(2)

dropoff_2 <- tibble(
  metric = c(
    "Impressions (Total Count)", 
    "Clickthrough (%)",
    "Messages Sent (%)",
    "Consent Obtained (%)", 
    "Core Survey Complete (%)", 
    "Treatment Complete (%)",
    "Demo Questions Complete (%)",
    "Full Survey Complete (%)",
    "Total characters elicited per completed survey (treatment)",
    "Avg characters elicited per completed survey (impediment explanations)",
    "Cost per Impression",
    "Cost per Link Click",
    "Cost per Survey Complete (All participants)",
    "Cost per Survey Complete (Unvax)",
    "Cost per Survey Complete (Unvax, Open to Treatment)"
  ),
  ad = c(
    impressions, 
    form_percent(clicks / impressions),
    form_percent(conversations / clicks),
    form_percent(consents / conversations),
    form_percent(core_complete / consents),
    form_percent(treat_complete / core_complete),
    form_percent(demog_complete / treat_complete),
    form_percent(full_complete / demog_complete),
    round(treat_nchar/full_complete, 2),
    round(mean_nchar/full_complete, 2),
    form_cost(impressions), 
    form_cost(clicks),
    form_cost(full_complete),
    form_cost(full_complete_unvax),
    form_cost(full_complete_unvax_open)
  )
)

colnames(dropoff_2) <- c("Metric of Interest", "Airtime (9)")
## MERGE dfs ##
dropoff_1 %>% 
  full_join(dropoff_2, by = "Metric of Interest") %>%
  datatable(options = list(pageLength = 20, columnDefs = list(list(orderable = TRUE, targets = 0))))