## Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, topicmodels, ldatuning, lda, SnowballC, pals, flextable, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, here, parallel)

set.seed(94305)
## Data Loading and Merging
### read ads data for 4 pilot waves
ads_v7 <-
  read.csv("../ads_data/ads_data_v7.csv") %>% 
  rename(original_ref = Ad.ID, ad_name = Ad.name) %>% 
  mutate(
    `Analysis 3 - impediment theme` = str_sub(ad_name, 8) %>% str_to_lower()
  ) %>% 
  relocate(original_ref, ad_name)
## read in CURRENT chatfuel data
# full data
df_full_v7 <- 
  read_csv(here::here("chatfuel_data/chatfuel_full_v7.csv")) %>%
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    full_name = str_c(first_name, middle_name, last_name, sep = " "),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
  filter(version == "ALP_May") %>%
  mutate(original_ref = parse_number(original_ref)) %>% 
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  ) %>%
  remove_empty()

# filter to completes
df_v7 <-
  df_full_v7 %>%
  filter(full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_remove_all(phone_number, "[ -]")
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T) %>%
  remove_empty()



# combining chatfuel data with ads data. we use `df` in most analyses below. 
df <-
  df_v7 %>% 
  left_join(ads_v7, by = "original_ref") %>% 
  remove_empty()


# combining full datasets
df_full <- 
  df_full_v7 %>% 
  left_join(ads_v7, by = "original_ref") %>% 
  remove_empty()

ads <- ads_v7 %>% remove_empty()

rm(df_full_v7, df_v7, ads_v7)
# clean up demographic variables

clean_up_demog <- function(df){
  df$gender[!df$gender %in% c("male", "female", NA_character_)] <- "other"
  df$ethnicity[!df$ethnicity %in% c("black or african", "coloured", "white or caucasian", "prefer not to say", "asian or indian", NA_character_)] <- "other"
  df$income[!df$income %in% c("< R5,000", "R5,000 – R9,999", "R10,000 – R29,999", "R30,000 – R49,999", "R50,000 – R99,999", "> R100,000", "prefer not to say", NA_character_)] <- "other"
  df$education[!df$education %in% c("< high school", "high school", "some college", "2-year degree", "4-year degree", "graduate degree", "prefer not to say", NA_character_)] <- "other"
  df$politics[!df$politics %in% c("conservative", "moderate", "liberal", "prefer not to say", NA_character_)] <- "other"
  df$location[!df$location %in% c("urban", "suburban", "rural", "prefer not to say", NA_character_)] <- "other"
  df$religion[grep("christ", tolower(df$religion))] <- "christian"
  df$religion[!df$religion %in% c("christian", "african traditional", "islam", "hinduism", "no religion", "prefer not to say", NA_character_)] <- "other"
  return(df)
}

df <- clean_up_demog(df)
# sanity check: rows with an imputed `ability` but a missing raw response
df %>%
  filter(is.na(ability_raw), !is.na(ability)) %>% 
  glimpse()

# cross-tabulate raw vs cleaned vaccination status
df %>%
  tabyl(vax_status_raw, vax_status)

# inspect duplicate chatfuel user ids
df %>% get_dupes(chatfuel_user_id) %>% 
  transmute(chatfuel_user_id = as.character(chatfuel_user_id), version, last_seen, dupe_count)

1 Project Overview

Background

COVID-19 vaccine hesitancy is a recognized problem across nations. Resistance to getting vaccinated is emerging as a major hurdle, especially in the developing world, where vaccine access issues are still being gradually resolved. Persistent pools of unvaccinated people around the world could raise the risk that new variants of concern emerge. Addressing people’s vaccine hesitancy is hence crucial to curbing the spread of COVID-19 and, consequently, averting hospitalizations and deaths.

Objectives

We intend to understand why people are hesitant about getting the COVID-19 vaccine. Hesitancy occurs not only within the unvaccinated population but also among a subset of people who are already vaccinated. Therefore, the first phase of our project has the following objectives:

  1. Understand why people are hesitant to get the COVID-19 vaccines
  2. Understand ways to best elicit vaccine impediments from respondents
  3. Pinpoint what treatments will help people get vaccinated

Approach

We intend to use a chatbot (on Facebook) as a medium to conduct conversations with people and understand how we can best achieve the three objectives above. We have run seven pilots as of March 2022: 2 in the United States using Lucid, and 5 in South Africa on Facebook. Our eventual goal is to run this with an interactive, personalized chatbot that lets the conversation flow more naturally than a survey format would. We are running the pilots in order to 1) achieve technical proofs of concept, 2) reduce participant recruitment and completion costs (survey completion of unvaccinated participants open to treatment) before experiment launch, 3) improve chatbot script/forking/engagement before experiment launch, and 4) gather exploratory ideas for impactful covariates and treatments. Our insights from these pilots are detailed below.


The sample for this analysis consists of 2271 unvaccinated respondents recruited across pilot waves 5-7 (Jan-May 2022). The related GitHub issue is here.

We first clean all the chatbot features to be used later in our models.

2 Feature cleaning for models

We begin by cleaning and filtering our data frames down to the features we expect to be useful for modeling. We consider the following four sets of features:

  • Feature set 1 (Impediments):
    • Motivation impediments: against_beliefs (vaccine is against my beliefs), no_benefits (vaccine has no benefits), risky (vaccine is risky). These are coded as binary (0/1).
    • Ability impediments: no_time (no time to get vaxxed), no_money (no money to get vaxxed), no_availability (vaccine not easily available). These are coded as binary (0/1).
  • Feature set 2 (Ads):
    • Ad theme: ad_theme_risky (vaccine is risky) and ad_theme_unnecessary (vaccine is not necessary). These are coded as binary (0/1).
    • Ad text: ad_text_airtime is a binary (0/1) indicating whether the ad text explicitly mentions that respondents will receive mobile airtime.
  • Feature set 3 (Free text):
    • elaboration: The average number of characters typed by a respondent across all free-text questions they faced. We normalize this variable before clustering.
    • politeness: Politeness score calculated across all free text typed by a respondent. It ranges from 0 to 104 in our sample (based on the number of politeness attributes detected) and is constructed using Mike Yeomans’ politeness package. We normalize this variable before running the clustering algorithm. A higher score means more polite text.
    • receptivity: A continuous receptiveness score calculated across all free text typed by a respondent, also constructed using Mike Yeomans’ politeness package. We normalize it before running the clustering algorithm. A higher score means more receptive text.
    • sentiment: Sentiment polarity (range -1 to 1) across all free text typed by a respondent.
  • Feature set 4 (Demographics):
    • age: Participant age in years
    • female: 1 if female, 0 if male
    • income: 0 if the participant is unemployed, 1 if household income < R5,000, 2 if household income in R5,000 – R9,999, …, 6 if household income > R100,000
    • education: 1 if the participant’s education < high school, 2 if education is high school, …, 6 if education is a graduate degree
    • religiosity: 1 if the participant is not very religious, 2 if somewhat religious, 3 if very religious
    • politics: 1 if the participant is conservative, 2 if moderate, 3 if liberal
    • location: 1 if the participant lives in a rural area, 2 if suburban, 3 if urban
    • white: 1 if the participant is white or caucasian, 0 if not

All continuous variables are normalized before clustering.

Since we can’t feed missing values to a clustering algorithm, we impute any missingness in the demographic variables with a linear imputation model based on female, white, and age.
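As a minimal sketch of the formula interface this relies on (toy data; the actual impute_lm call appears at the end of the feature pipeline below):

# toy example of simputation::impute_lm -- missing `income` values are
# replaced by predictions from a linear model fit on the complete rows
toy <- tibble(
  income = c(3, NA, 5, NA),
  age    = c(25, 40, 31, 52),
  female = c(1, 0, 0, 1),
  white  = c(0, 1, 0, 1)
)
simputation::impute_lm(toy, income ~ female + white + age)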

df_sentiment <-
  df %>% 
  select(contains(c("_other", "_explain"))) %>%
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  pull(text) %>%
  get_sentences() %>%
  sentiment_by() %>% 
  bind_cols(df %>% select(chatfuel_user_id)) %>% 
  transmute(chatfuel_user_id, sentiment = ave_sentiment)

df_polite <-
  df %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  pull(text) %>% 
  politeness::politeness(parser = "spacy") %>% 
  transmute(politeness = rowSums(.)) %>% 
  bind_cols(df %>% select(chatfuel_user_id))
## ================================================================================
df_receptive <-
  df %>%
  select(contains(c("_other", "_explain"))) %>%
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  transmute(receptivity = politeness::receptiveness(text)) %>%
  bind_cols(df %>% select(chatfuel_user_id))
## ================================================================================
df_elaboration <-
  df %>%
  mutate(age = parse_number(cv_age)) %>% 
  filter(age <= 120) %>% 
  select(contains(c("_other", "_explain"))) %>%
  mutate_all(~ nchar(.)) %>% 
  transmute(elaboration = rowMeans(., na.rm = TRUE))
df_ad_features <-
  df %>% 
  mutate(age = parse_number(cv_age)) %>% 
  filter(age <= 120) %>%
  transmute(
    chatfuel_user_id,
    # ad_image = `Analysis 2 - image`,
    ad_theme = `Analysis 3 - impediment theme`,
    # ad_text = `Analysis 4 - body text`
  ) %>%
  fastDummies::dummy_cols(
    select_columns = c("ad_theme"), remove_first_dummy = TRUE,
    remove_selected_columns = TRUE, ignore_na = TRUE
  ) %>% 
  # mutate(ad_text_airtime = if_else(ad_text == "airtime", 1L, 0L) %>% replace_na(0)) %>% 
  select(chatfuel_user_id, starts_with(c("ad_text_", "ad_theme_"))) %>% 
  mutate_at(vars(starts_with("ad_theme")), ~ replace_na(., 0)) %>% 
  bind_cols(df_elaboration) %>% 
  left_join(df_polite, by = "chatfuel_user_id") %>% 
  left_join(df_receptive, by = "chatfuel_user_id") %>% 
  left_join(df_sentiment, by = "chatfuel_user_id") %>% 
  distinct(chatfuel_user_id, .keep_all = T)


df_features_raw <-
  df %>% 
  mutate(age = parse_number(cv_age)) %>% 
  filter(age <= 120) %>%
  transmute(
    unvax = if_else(vax_status == "unvax", 1L, 0L),
    chatfuel_user_id,
    age,
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    # no_ability = case_when(
    #   ability == "no" ~ 1,
    #   ability == "yes" ~ 0,
    # ),
    # no_motive = case_when(
    #   motive == "no" ~ 1,
    #   motive == "yes" ~ 0,
    # ),
    against_beliefs = if_else(motive_main == "belief", 1L, 0L) %>% replace_na(0),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0),
    risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0),
    no_time = if_else(ability_main == "time", 1L, 0L) %>% replace_na(0),
    no_money = if_else(ability_main == "money", 1L, 0L) %>% replace_na(0),
    no_availability = if_else(ability_main == "availability", 1L, 0L) %>% replace_na(0),
  ) %>%
  distinct(chatfuel_user_id, .keep_all = T) %>% 
  left_join(df_ad_features, by = "chatfuel_user_id") %>% 
  impute_lm(politics + income + religiosity + location + education ~ female + white + age) %>% 
  drop_na() %>% 
  mutate_at(vars(age, elaboration, sentiment, receptivity, politeness), ~ scale(.) %>% as.vector()) %>% 
  relocate(chatfuel_user_id)

#df_features %>% summarise_all(~ sum(is.na(.), na.rm = T))

df_features <- df_features_raw %>% select(-chatfuel_user_id)

Here are summary statistics for the features fed into the algorithm:

df_features %>%
  clean_names(case = "title") %>%
  papeR::summarize_numeric() %>% 
  datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))

3 Summarizing cleaned free text


4 Wordclouds by preferred treatment

The wordclouds below are generated from a participant’s complete free-text conversation with the chatbot. We generate a separate wordcloud for each subgroup of participants choosing a preferred treatment.
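Since the six blocks below differ only in the treatment filter, they could be collapsed into a helper along these lines (a sketch with a hypothetical plot_treatment_wordcloud(); it assumes best_treatment has already been recoded as in the blocks below):

# hypothetical helper wrapping the shared wordcloud steps
plot_treatment_wordcloud <- function(df, treatment) {
  text <-
    df %>%
    filter(best_treatment == treatment) %>%
    select(contains(c("_other", "_explain"))) %>%
    unite("text", everything(), na.rm = TRUE, sep = ". ") %>%
    mutate(
      text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
      text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine")
    ) %>%
    pull(text)
  docs <-
    Corpus(VectorSource(text)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("english"))
  words <- sort(rowSums(as.matrix(TermDocumentMatrix(docs))), decreasing = TRUE)
  wordcloud(
    words = names(words), freq = words, min.freq = 1, max.words = 200,
    random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2")
  )
}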

Family supports it:

vector_wc <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else") %>% as_factor()
    ) %>% 
  filter(best_treatment == "Family supports it") %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  mutate(
    text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
    text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine") # longest variants first, so no stray fragments remain
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Trusted info source:

vector_wc <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else") %>% as_factor()
    ) %>% 
  filter(best_treatment == "Trusted info source") %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  mutate(
    text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
    text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

More transparency:

vector_wc <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else") %>% as_factor()
    ) %>% 
  filter(best_treatment == "More transparency") %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  mutate(
    text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
    text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Rewards for vaxxing:

vector_wc <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else") %>% as_factor()
    ) %>% 
  filter(best_treatment == "Rewards for vaxxing") %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  mutate(
    text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
    text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Job/school required:

vector_wc <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else") %>% as_factor()
    ) %>% 
  filter(best_treatment == "Job/school required") %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  mutate(
    text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
    text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Nothing would work:

vector_wc <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else") %>% as_factor()
    ) %>% 
  filter(best_treatment == "Nothing") %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  mutate(
    text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
    text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Something else would work:

vector_wc <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else") %>% as_factor()
    ) %>% 
  filter(best_treatment == "Something else") %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  mutate(
    text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
    text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))


5 Multinomial Inverse Regressions

We begin with multinomial inverse regression (MNIR) to understand how chatbot free-text responses connect with preferred treatments. In MNIR, we regress the token counts on a set of known attributes (preferred treatments in our case).
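Concretely, in Taddy’s MNIR formulation, the token-count vector $x_i$ for respondent $i$, with total count $m_i$, is modeled as multinomial with each token’s log-odds linear in the covariates $v_i$ (here, the preferred-treatment dummies):

$$
x_i \sim \mathrm{MN}(q_i, m_i), \qquad
q_{ij} = \frac{\exp(\alpha_j + v_i^\top \varphi_j)}{\sum_{k} \exp(\alpha_k + v_i^\top \varphi_k)},
$$

so the loadings $\varphi_j$ estimated below measure how each preferred treatment shifts usage of token $j$.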

Creating counts and covariate matrices:

## TOKENS ##
freetext <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else"),
  ) %>%
  drop_na(best_treatment) %>%
  select(contains(c("_other", "_explain"))) %>%
  mutate_all(~ str_to_lower(.) %>% str_squish() %>% str_remove_all("\\bno\\b|\\byes\\b")) %>%
  unite("text", 1:49, na.rm = T, remove = T, sep = ". ") %>%
  drop_na() %>%
  pull(text)

# Create corpus
docs <- Corpus(VectorSource(freetext))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
matrix <- t(matrix)


## COVARIATES ##
covars_df <-
  df %>%
  transmute(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else"),
  ) %>%
  drop_na(best_treatment) %>%
  dummy_cols(select_columns = "best_treatment", remove_first_dummy = TRUE, remove_selected_columns = TRUE)

covars <- covars_df %>% as.matrix()

covar_names <-
  covars_df %>% 
  colnames() %>% 
  str_remove("^best_treatment_")

# words[1:99]

Fitting the model:

cl <- makeCluster(detectCores())
fit <- mnlm(cl, covars, matrix)
stopCluster(cl)
B <- coef(fit)
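
For orientation, `coef(fit)` returns a matrix whose rows are the intercept plus the treatment covariates and whose columns are tokens; this is what the `tail(6)` calls below exploit to recover the six covariate rows. A quick sketch of pulling a single token’s loadings (assuming the token "side" survived preprocessing, as the plots below do):

# intercept plus per-treatment loadings for the token "side"
round(as.matrix(B)[, "side"], 2)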

5.1 Lasso paths for high frequency terms

Consolidating takeaways:

  • Side effect utterances are positively related to the more-transparency treatment, but not to other treatments.
  • Family related utterances are negatively related to all treatments.
  • Fear related utterances are negatively related to all treatments, except mandates, where we see no effect.
  • Time related utterances are positively (but weakly) related to the rewards-for-vaxxing treatment, and negatively related to the other treatments.
  • Death related utterances are positively (though weakly) related to the job/school mandates treatment, and negatively related to the other treatments.

5.1.1 Side effects

Side effect utterances are positively related to the more-transparency treatment, but not to other treatments.

plot(fit$side, col = c("red", "green", "blue", "orange", "black", "purple"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "purple"))

5.1.2 Family

Family related utterances are negatively related to all treatments.

plot(fit$family, col = c("red", "green", "blue", "orange", "black", "purple"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "purple"))

5.1.3 Scared

Fear related utterances are negatively related to all treatments, except mandates, where we see no effect.

plot(fit$scared, col = c("red", "green", "blue", "orange", "black", "purple"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "purple"))

5.1.4 Time

Time related utterances are positively (but weakly) related to the rewards-for-vaxxing treatment, and negatively related to the other treatments.

plot(fit$time, col = c("red", "green", "blue", "orange", "black", "purple"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "purple"))

5.1.5 Die

Death related utterances are positively (though weakly) related to the job/school mandates treatment, and negatively related to the other treatments.

plot(fit$die, col = c("red", "green", "blue", "orange", "black", "purple"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "purple"))


5.2 Utterances by preferred treatment

Top 10 utterances by treatment:

B %>% 
  as.matrix() %>% 
  as_tibble() %>% 
  tail(6) %>% 
  bind_cols(
    preferred_treatment = covar_names, .
  ) %>% 
  pivot_longer(cols = -preferred_treatment) %>% 
  arrange(preferred_treatment, -value) %>%
  mutate(value = round(value, 2)) %>% 
  group_by(preferred_treatment) %>% 
  mutate(row = row_number()) %>% 
  filter(row <= 10) %>% 
  ungroup() %>% 
  select(-value) %>% 
  pivot_wider(names_from = row, values_from = name)

Bottom 10 utterances by treatment:

B %>% 
  as.matrix() %>% 
  as_tibble() %>% 
  tail(6) %>% 
  bind_cols(
    preferred_treatment = covar_names, .
  ) %>% 
  pivot_longer(cols = -preferred_treatment) %>% 
  arrange(preferred_treatment, -value) %>%
  mutate(value = round(value, 2)) %>% 
  group_by(preferred_treatment) %>% 
  mutate(row = row_number()) %>%
  arrange(preferred_treatment, -row) %>%
  mutate(row = row_number()) %>% 
  filter(row <= 10) %>% 
  ungroup() %>% 
  select(-value) %>% 
  pivot_wider(names_from = row, values_from = name)

The following table shows all respondent utterances, sorted within each preferred treatment by MNIR coefficient size.

B %>% 
  as.matrix() %>% 
  as_tibble() %>% 
  tail(6) %>% 
  bind_cols(
    preferred_treatment = covar_names, .
  ) %>% 
  pivot_longer(cols = -preferred_treatment) %>% 
  arrange(preferred_treatment, -value) %>%
  mutate(value = round(value, 2)) %>% 
  select(`Preferred treatment` = preferred_treatment, `Utterance` = name, `Coefficient` = value) %>%
  DT::datatable()

6 Topic models

# build the free-text vector for topic modeling
data_vector <-
  df %>% 
  select(best_treatment, contains(c("_other", "_explain"))) %>% 
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, "Something else"),
  ) %>% 
  unite("text", 2:50, na.rm = T, remove = T, sep = ". ") %>% 
  mutate(
    text = gsub("http[^[:space:]]*", "", text), # remove URLs from text
    text = str_remove_all(text, "vaccination|vaccinated|vaccinate|vaccines|vaccine")
  ) %>% 
  pull(text)

# Create corpus
corpus <- Corpus(VectorSource(data_vector))

# Clean corpus
docs <-
  corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)


minimumFrequency <- 5
DTM <- DocumentTermMatrix(docs, control = list(bounds = list(global = c(minimumFrequency, Inf))))

sel_idx <- slam::row_sums(DTM) > 0
DTM <- DTM[sel_idx, ]

We next inspect the number of topics using two metrics, CaoJuan2009 and Deveaud2014. The best number of topics shows low values for CaoJuan2009 and high values for Deveaud2014.

result <- 
  ldatuning::FindTopicsNumber(
    DTM,
    topics = seq(from = 2, to = 20, by = 1),
    metrics = c("CaoJuan2009",  "Deveaud2014"),
    method = "Gibbs",
    control = list(seed = 77),
    verbose = TRUE
  )
## fit models... done.
## calculate metrics:
##   CaoJuan2009... done.
##   Deveaud2014... done.
ldatuning::FindTopicsNumber_plot(result)

Based on the plots above, 3 seems a decent number of topics to start with, as it has a low CaoJuan2009 value and a high Deveaud2014 value.

K <- 3

# set random number generator seed
set.seed(94305)

# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))
## K = 3; V = 809; M = 2128
## Sampling 1000 iterations!
## Iteration 25 ...
## Iteration 50 ...
## ...
## Iteration 975 ...
## Iteration 1000 ...
## Gibbs sampling completed!

Top 10 terms per topic:

# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)

# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)

# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms   # get beta from results

# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

terms(topicModel, 10)
##       Topic 1   Topic 2  Topic 3  
##  [1,] "get"     "people" "dont"   
##  [2,] "sick"    "covid"  "yes"    
##  [3,] "can"     "will"   "side"   
##  [4,] "got"     "time"   "effects"
##  [5,] "family"  "many"   "like"   
##  [6,] "getting" "say"    "trust"  
##  [7,] "take"    "one"    "believe"
##  [8,] "scared"  "really" "think"  
##  [9,] "even"    "also"   "just"   
## [10,] "virus"   "taking" "work"
top3termsPerTopic <- terms(topicModel, 3)

# concatenate the three most likely terms of each topic into a string that serves as a pseudo-name for the topic
topicNames <- apply(top3termsPerTopic, 2, paste, collapse = " ")

Word cloud for topic 1:

# visualize topics as word cloud

topicToViz <- 1 # change for your own topic of interest

# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)

# the sorted probabilities of these terms double as word frequencies
probabilities <- top100terms

# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")

wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 2:

# visualize topics as word cloud

topicToViz <- 2 # change for your own topic of interest

# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)

# the sorted probabilities of these terms double as word frequencies
probabilities <- top100terms

# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")

wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 3:

# visualize topics as word cloud

topicToViz <- 3 # change for your own topic of interest

# select the 100 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)

# the sorted probabilities of these terms double as word frequencies
probabilities <- top100terms

# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")

wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)