## Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir)

set.seed(94305)
## Data Loading and Merging

### read ads data for 4 pilot waves
ads_v7 <-
  read.csv("data/ads_data_latest.csv") %>% 
  rename(original_ref = Ad.ID, ad_name = Ad.name) %>% 
  mutate(
    `Analysis 3 - impediment theme` = str_sub(ad_name, 8) %>% str_to_lower()
  ) %>% 
  relocate(original_ref, ad_name)

ads_v6 <- 
  read.csv("data/old/ads_data_v6.csv") %>% 
  rename(original_ref = Ad.ID, ad_name = Ad.name) %>% 
  # group_by(ad_name) %>%
  # summarise_at(vars(Impressions, Link.clicks, Results, `Amount.spent..USD.`), ~ sum(., na.rm = TRUE)) %>%
  remove_empty() %>% 
  inner_join(
    read_csv(here::here("data/Pilot_creative_text.xlsx - Pilot_5.csv")) %>% rename(ad_name = Ad),
    by = "ad_name"
  ) %>% 
  relocate(original_ref, ad_name)

ads_v5 <-
  read.csv("data/old/ads_data_v5.csv") %>% 
  rename(original_ref = Ad.ID, ad_name = Ad.name) %>% 
  remove_empty() %>% 
  inner_join(
    read_csv(here::here("data/Pilot_creative_text.xlsx - Pilot_5.csv")) %>% rename(ad_name = Ad),
    by = "ad_name"
  ) %>% 
  relocate(original_ref, ad_name)

ads_v4b <- 
  read.csv("data/old/ads_data_v4b.csv") %>% 
  rename(original_ref = Ad.ID, ad_name = Ad.name) %>% 
  remove_empty() %>% 
  inner_join(
    read_csv(here::here("data/Pilot_4b_creative_text.xlsx - Creative Reporting.csv")) %>% rename(ad_name = Ad),
    by = "ad_name"
  ) %>% 
  relocate(original_ref, ad_name)
## read in CURRENT chatfuel data
# full data
df_full_v7 <- 
  read_csv("data/latest/Share_Your_Voice_v7.csv") %>%
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    full_name = str_c(first_name, middle_name, last_name, sep = " "),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
  filter(version == "ALP_May") %>%
  mutate(original_ref = parse_number(original_ref)) %>% 
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  ) %>%
  remove_empty()

# filter to completes
df_v7 <-
  df_full_v7 %>%
  filter(full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T) %>%
  remove_empty()


## OLD WAVES

# v6
df_full_v6 <- 
  read_csv("data/latest/Share_Your_Voice_v6_ALP.csv") %>%
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    full_name = str_c(first_name, middle_name, last_name, sep = " "),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
  filter(date > "2022-03-23", !is.na(version)) %>%
  mutate(original_ref = parse_number(original_ref)) %>% 
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  ) %>%
  # left_join(ads_v6, by = "original_ref") %>% 
  remove_empty()

# filtering for survey completes
df_v6 <-
  df_full_v6 %>%
  filter(version == "pilot_6", full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T) %>%
  remove_empty()

# v5
df_full_v5 <-
  read_csv("data/latest/Share_Your_Voice_2022_02_22_03_35_54.csv") %>% 
  clean_names() %>% remove_empty() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    full_name = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha"))) %>%
  filter(date > "2022-02-18", !is.na(version)) %>%
  select(-contains("name"), -profile_pic_url, -messenger_user_id) %>% 
  mutate(original_ref = parse_number(original_ref)) %>% 
  remove_empty()


df_v5 <-
  df_full_v5 %>% 
  filter(version == "pilot_5", full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T) %>%
  remove_empty()
  

# 4b
df_full_v4b <-
  read_csv(here::here("data/Social_Impact_Research_Initiative_2022_02_04_17_40_24.csv")) %>% 
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%  
  mutate(
    full_name = str_c(first_name, last_name, sep = " ")
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha"))) %>%
  select(-contains("name"), -profile_pic_url, -messenger_user_id) %>% 
  mutate(original_ref = parse_number(original_ref)) %>% 
  select(-motive_explain) %>% 
  remove_empty()

df_v4b <-
  df_full_v4b %>% 
  filter(version == "pilot_4b", full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T) %>% 
  remove_empty()


# combining chatfuel data with ads data. we use `df` in most analyses below. 
df <-
  bind_rows(
    df_v7 %>% left_join(ads_v7, by = "original_ref"),
    df_v6 %>% left_join(ads_v6, by = "original_ref"),
    df_v5 %>% left_join(ads_v5, by = "original_ref"),
    df_v4b %>% left_join(ads_v4b, by = "original_ref"),
  ) %>% 
  remove_empty()


# combining full datatsets
df_full <- 
  bind_rows(
    df_full_v7 %>% left_join(ads_v7, by = "original_ref"),
    df_full_v6 %>% left_join(ads_v6, by = "original_ref"), 
    df_full_v5 %>% left_join(ads_v5, by = "original_ref"),
    df_full_v4b %>% left_join(ads_v6, by = "original_ref"),
  ) %>% 
  remove_empty()

ads <- 
  bind_rows(ads_v4b, ads_v5, ads_v6, ads_v7) %>% 
  remove_empty()
# clean up demographic variables

clean_up_demog <- function(df){
  df$gender[!df$gender %in% c("male", "female", NA_character_)] <- "other"
  df$ethnicity[!df$ethnicity %in% c("black or african", "coloured", "white or caucasian", "prefer not to say", "asian or indian", NA_character_)] <- "other"
  df$income[!df$income %in% c("< R5,000", "R5,000 – R9,999", "R10,000 – R29,999", "R30,000 – R49,999", "R50,000 – R99,999", "> R100,000", "prefer not to say", NA_character_)] <- "other"
  df$education[!df$education %in% c("< high school", "high school", "some college", "2-year degree", "4-year degree", "graduate degree", "prefer not to say", NA_character_)] <- "other"
  df$politics[!df$politics %in% c("conservative", "moderate", "liberal", "prefer not to say", NA_character_)] <- "other"
  df$location[!df$location %in% c("urban", "suburban", "rural", "prefer not to say", NA_character_)] <- "other"
  df$religion[grep("christ", tolower(df$religion))] <- "christian"
  df$religion[!df$religion %in% c("christian", "african traditional", "islam", "hinduism", "no religion", "prefer not to say", NA_character_)] <- "other"
  return(df)
}

df <- clean_up_demog(df)
df_v7 %>%
  filter(is.na(ability_raw), !is.na(ability)) %>% 
  glimpse()

df_v7 %>%
  tabyl(vax_status_raw, vax_status)

df %>% get_dupes(chatfuel_user_id) %>% 
  transmute(chatfuel_user_id = as.character(chatfuel_user_id), version, last_seen, dupe_count)

1 Project Overview

Background

COVID-19 vaccine hesitancy has been recognized as a problem across nations. A resistance to getting vaccinated is emerging as a major hurdle, especially in the developing world, where vaccine access issues are still being gradually resolved. Persistent pools of unvaccinated people around the world could present a greater risk for the emergence of new variants of concern. Addressing people’s vaccine hesitancy is hence crucial to curb the spread of COVID-19, and to consequently avert hospitalizations and death.

Objectives

We intend to understand why people are hesitant about getting the COVID-19 vaccine. Hesitancy could not only occur within the unvaccinated population but also in a subset of people who already got vaccinated. Therefore, the first phase of our project has the following objectives:

  1. Understand why people are hesitant to get the COVID-19 vaccines
  2. Understand ways to best elicit vaccine impediments from respondents
  3. Pinpoint what treatments will help people get vaccinated

Approach

We intend to use chatbot as a medium (on Facebook) to conduct conversations with people and understand how we can best achieve the above three objectives. We have run six pilots as of March 2022 – 2 in the United States using Lucid, and 5 in South Africa on Facebook. Our eventual goal will be running this using an interactive, personalized chatbot that enables the conversation to flow more naturally than in a survey format. We are running the pilots in order to – 1) achieve technical proofs of concept, 2) reduce participant recruitment & completion costs (survey completion of unvaccinated participants open to treatment ) before experiment launch, 3) improve chatbot script/forking/engagement before experiment launch, and 4) gather exploratory ideas for impactful covariates and treatments. Our insights from these pilots are detailed ahead.


The sample for this analysis consists of 2271 unvaccinated respondents recruited across pilot waves 4-7 (Jan-May 2022). The related GitHub issue is here.

We start with cleaning all chatbot features to be used later in our models.

2 Feature cleaning for models

We start by cleaning and filtering our data frames to features we expect to be useful for modeling. We consider the following 4 sets of features:

  • Feature set 1 (Impediments):
    • Motivation impediments: against_beliefs (vaccine is against my beliefs), no_benefits (vaccine has no benefits), risky (vaccine is risky). These are coded as binary (0/1).
    • Ability impediments: no_time (no time to get vaxxed), no_money (no money to get vaxxed), no_availability (vaccine not easily available). These are coded as binary (0/1).
  • Feature set 2 (Ads):
    • Ad theme: ad_theme_risky (vaccine is risky) and ad_theme_unnecessary (vaccine is not necessary). These are coded as binary (0/1).
    • Ad text: ad_text_airtime is a binary (0/1) indicating whether the ad text explicitly mentions that respondents will receive mobile airtime.
  • Feature set 3 (Free text):
    • elaboration: The average number of characters typed by a respondent across all free text questions faced. We normalize this variable before clustering.
    • politeness: Politeness score calculated across all free text typed by a respondent. This ranges from 0 to 104 in our sample (based on number of politeness attributes touched) and is constructed using Mike Yeomans’ politeness package. We normalize this variable before running the clustering algorithm. Higher score means more polite text.
    • receptivity: Receptivity score is a continuous score calculated across all free text typed by a respondent. This is also constructed using Mike Yeomans’ politeness package. We normalize it before running the clustering algorithm. Higher score means more receptive text.
    • sentiment: Sentiment polarity (range -1 to 1) for all free text typed by a respondent.
  • Feature set 4 (Demographics):
    • age: Participant age in years
    • female: 1 if female, 0 if male
    • income: 0 if the participant is unemployed, 1 if household income < R5,000, 2 if household income in R5,000 – R9,999, …, 6 if household income > R100,000
    • education: 1 if the participant’s education < high school, 2 if education is high school, …, 6 if education is a graduate degree
    • religiosity: 1 if the participant is not very religious, 2 if somewhat religious, 3 if very religious
    • politics: 1 if the participant is conservative, 2 if moderate, 3 if liberal
    • location: 1 if the participant lives in rural, 2 if suburban, 3 if urban,
    • white: 1 if the participant is a white or caucasian, 0 if not

All continuous variables are normalized before clustering.

Since we can’t feed missing values to a clustering algorithm, we impute any missingness in demographic variables using a linear imputation model building on values from female, white, and age.

df_sentiment <-
  df %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:56, na.rm = T, remove = T, sep = ". ") %>%
  pull(text) %>%
  get_sentences() %>%
  sentiment_by() %>% 
  bind_cols(df %>% select(chatfuel_user_id)) %>% 
  transmute(chatfuel_user_id, sentiment = ave_sentiment)

df_polite <-
  df %>% 
  select(contains(c("_other", "_explain"))) %>% 
  unite("text", 1:56, na.rm = T, remove = T, sep = ". ") %>%
  politeness::politeness(.$text, parser = "spacy") %>% 
  transmute(politeness = rowSums(.)) %>% 
  bind_cols(df %>% select(chatfuel_user_id))
## ================================================================================
# df_receptive <-
#   df %>% 
#   select(contains(c("_other", "_explain"))) %>% 
#   unite("text", 1:56, na.rm = T, remove = T, sep = ". ") %>%
#   transmute(receptivity = politeness::receptiveness(.)) %>% 
#   bind_cols(df %>% select(chatfuel_user_id))


df_elaboration <-
  df %>%
  mutate(age = parse_number(cv_age)) %>% 
  filter(vax_status == "unvax", age <= 120) %>% 
  select(ends_with(c("_other", "_explain"))) %>% 
  mutate_all(~ nchar(.)) %>% 
  transmute(
    elaboration =
      pmap_dbl(
        .,
        ~ mean(c(...), na.rm = TRUE)
      )
  )
df_ad_features <-
  df %>% 
  mutate(age = parse_number(cv_age)) %>% 
  filter(vax_status == "unvax", age <= 120) %>%
  transmute(
    chatfuel_user_id,
    # ad_image = `Analysis 2 - image`,
    ad_theme = `Analysis 3 - impediment theme`,
    ad_text = `Analysis 4 - body text`
  ) %>%
  fastDummies::dummy_cols(
    select_columns = c("ad_theme"), remove_first_dummy = TRUE,
    remove_selected_columns = TRUE, ignore_na = TRUE
  ) %>% 
  mutate(ad_text_airtime = if_else(ad_text == "airtime", 1L, 0L) %>% replace_na(0)) %>% 
  select(chatfuel_user_id, starts_with(c("ad_text_", "ad_theme_"))) %>% 
  mutate_at(vars(starts_with("ad_theme")), ~ replace_na(., 0)) %>% 
  bind_cols(df_elaboration) %>% 
  left_join(df_polite, by = "chatfuel_user_id") %>% 
  #left_join(df_receptive, by = "chatfuel_user_id") %>% 
  left_join(df_sentiment, by = "chatfuel_user_id") %>% 
  distinct(chatfuel_user_id, .keep_all = T)


df_features_raw <-
  df %>% 
  mutate(age = parse_number(cv_age)) %>% 
  filter(vax_status == "unvax", age <= 120) %>%
  transmute(
    # unvax = if_else(vax_status == "unvax", 1L, 0L),
    chatfuel_user_id,
    age,
    female = case_when(
      gender == "female" ~ 1,
      gender == "male" ~ 0,
    ),
    income = case_when(
      income == "Unemployed" ~ 0,
      income == "< R5,000" ~ 1,
      income == "R5,000 – R9,999" ~ 2,
      income == "R10,000 – R29,999" ~ 3,
      income == "R30,000 – R49,999" ~ 4,
      income == "R50,000 – R99,999" ~ 5,
      income == "> R100,000" ~ 6,
    ),
    education = case_when(
      education == "< high school" ~ 1,
      education == "high school" ~ 2,
      education == "some college" ~ 3,
      education == "2-year degree" ~ 4,
      education == "4-year degree" ~ 5,
      education == "graduate degree" ~ 6,
    ),
    religiosity = case_when(
      religiosity == "not very religious" ~ 1,
      religiosity == "somewhat religious" ~ 2,
      religiosity == "very religious" ~ 3,
    ),
    politics = case_when(
      politics == "conservative" ~ 1,
      politics == "moderate" ~ 2,
      politics == "liberal" ~ 3,
    ),
    location = case_when(
      location == "rural" ~ 1,
      location == "suburban" ~ 2,
      location == "urban" ~ 3,
    ),
    white = case_when(
      ethnicity == "white or caucasian" ~ 1,
      ethnicity != "white or caucasian" ~ 0
    ),
    # no_ability = case_when(
    #   ability == "no" ~ 1,
    #   ability == "yes" ~ 0,
    # ),
    # no_motive = case_when(
    #   motive == "no" ~ 1,
    #   motive == "yes" ~ 0,
    # ),
    against_beliefs = if_else(motive_main == "belief", 1L, 0L) %>% replace_na(0),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0),
    risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0),
    no_time = if_else(ability_main == "time", 1L, 0L) %>% replace_na(0),
    no_money = if_else(ability_main == "money", 1L, 0L) %>% replace_na(0),
    no_availability = if_else(ability_main == "availability", 1L, 0L) %>% replace_na(0),
  ) %>%
  distinct(chatfuel_user_id, .keep_all = T) %>% 
  left_join(df_ad_features, by = "chatfuel_user_id") %>% 
  impute_lm(politics + income + religiosity + location + education ~ female + white + age) %>% 
  drop_na() %>% 
  mutate_at(vars(age, elaboration, sentiment, politeness), ~ scale(.) %>% as.vector()) %>% 
  relocate(chatfuel_user_id)

#df_features %>% summarise_all(~ sum(is.na(.), na.rm = T))

df_features <- df_features_raw %>% select(-chatfuel_user_id)

Here are summary statistics for the features fed into the algorithm:

df_features %>%
  select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>%
  clean_names(case = "title") %>%
  papeR::summarize_numeric() %>% 
  datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))

3 Multinomial Inverse Regressions

We start with looking at multinomial inverse regressions (MNIR) to understand how chatbot free text responses connect with preferred treatments. In MNIR, we regress the token counts on a set of known attributes (preferred treatments in our case).

Creating counts and covariate matrices:

## TOKENS ##
freetext <-
  df %>%
  mutate(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
  ) %>%
  drop_na(best_treatment) %>%
  select(contains(c("_other", "_explain"))) %>%
  mutate_all(~ str_to_lower(.) %>% str_squish() %>% str_remove_all("no|yes")) %>%
  unite("text", 1:56, na.rm = T, remove = T, sep = ". ") %>%
  drop_na() %>%
  pull(text)

# Create corpus
docs <- Corpus(VectorSource(freetext))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
matrix <- t(matrix)


## COVARIATES ##
covars_df <-
  df %>%
  transmute(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
  ) %>%
  drop_na(best_treatment) %>%
  dummy_cols(select_columns = "best_treatment", remove_first_dummy = TRUE, remove_selected_columns = TRUE)

covars <- covars_df %>% as.matrix()

covar_names <-
  df %>%
  transmute(
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
  ) %>%
  drop_na(best_treatment) %>%
  dummy_cols(select_columns = "best_treatment", remove_first_dummy = TRUE, remove_selected_columns = TRUE) %>% 
  colnames() %>% 
  str_sub(., 16, -1)

# words[1:99]

Fitting the model:

cl <- makeCluster(detectCores())
fit <- mnlm(cl, covars, matrix)
stopCluster(cl)
B <- coef(fit)

3.1 Lasso paths for high frequency terms

Consolidating takeaways:

  • Side effect utterances are positively related to more transparency treatment, but not to other treatments.
  • Family related utterances are negatively related to all treatments.
  • Fear related utterances are negatively related to all treatments, except mandates where it sees no effect.
  • Time related utterances are positively (but weakly) related to rewards for vaxxing treatment, but negatively related to other treatments.
  • Death related utterances are positively (though weakly) related to job/school mandates treatment, but negatively related to other treatments.

3.1.1 Side effects

Side effect utterances are positively related to more transparency treatment, but not to other treatments.

plot(fit$side, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))

3.1.2 Family

Family related utterances are negatively related to all treatments.

plot(fit$family, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))

3.1.3 Scared

Fear related utterances are negatively related to all treatments, except mandates where it sees no effect.

plot(fit$scared, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))

3.1.4 Time

Time related utterances are positively (but weakly) related to rewards for vaxxing treatment, but negatively related to other treatments.

plot(fit$time, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))

3.1.5 Die

Death related utterances are positively (thougly weakly) related to job/school mandates treatment, but negatively related to other treatments.

plot(fit$die, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))


3.2 Utterances by preferred treatment

The following table shows all respondent utterances, sorted within each treatment preferred by MNIR coefficient size.

B %>% 
  as.matrix() %>% 
  as_tibble() %>% 
  tail(5) %>% 
  bind_cols(
    preferred_treatment = covar_names, .
  ) %>% 
  pivot_longer(cols = -preferred_treatment) %>% 
  arrange(preferred_treatment, -value) %>%
  mutate(value = round(value, 2)) %>% 
  select(`Preferred treatment` = preferred_treatment, `Utterance` = name, `Coefficient` = value) %>% 
  DT::datatable()

4 Supervised models

Let’s move ahead and look at two supervised learning approaches – multinomial logit and random forest. The task here is to try and predict – i) motivation impediments, and ii) preferred treatments, using all available chatbot features.

Here’s an overview of misclassification rates from these supervised approaches:

Predicting Preferred Treatments Predicting Impediments
Multinomial Logit 65% 63%
Random Forest 64% 68%

Details on each implementation ahead.

4.1 Multinomial Logit

4.1.1 Impediments

df_mlm <-
  df_features_raw %>% 
  inner_join(
    df %>% 
      transmute(
        chatfuel_user_id,
        impediment = case_when(
          motive_main == "belief" ~ "Against beliefs",
          motive_main == "benefit" ~ "No benefits",
          motive_main == "risk" ~ "Risky",
          TRUE ~ "No impediment" 
        ) %>% as_factor()
      ) %>% distinct(chatfuel_user_id, .keep_all = T),
    by = "chatfuel_user_id"
  ) %>% 
  select(-c(against_beliefs, no_benefits, risky, chatfuel_user_id))


df_mlm$impediment <- relevel(df_mlm$impediment, ref = "No impediment")

trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]

mlm1 <- 
  multinom(impediment ~ ., data = training)
## # weights:  96 (69 variable)
## initial  value 2202.821740 
## iter  10 value 2076.207493
## iter  20 value 2042.066608
## iter  30 value 2020.569589
## iter  40 value 2017.239678
## iter  50 value 2016.942216
## iter  60 value 2016.925929
## final  value 2016.925519 
## converged

Baseline model with 70/30 train/test split and no interactions [impediment ~ linear_combination_of_features] for unvaccinated respondents does a bad job of predicting true motivation impediments. Misclassification error for this model is high at 63%.

predicted_class <- predict(mlm1, test)

table(`Predicted class` = predicted_class, `True class` = test$impediment)
##                  True class
## Predicted class   No impediment No benefits Risky Against beliefs
##   No impediment              76          46    52              27
##   No benefits                33          49    46              20
##   Risky                      75          82   117              34
##   Against beliefs             7           1     9               8
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)
## [1] 0.6334311

4.1.2 Preferred treatments

df_mlm <-
  df_features_raw %>% 
  inner_join(
    df %>% 
      transmute(
        chatfuel_user_id,
        best_treatment = str_to_sentence(best_treatment),
        best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
        best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_) %>% 
      as_factor()) %>% 
      distinct(chatfuel_user_id, .keep_all = T),
    by = "chatfuel_user_id"
  ) %>% 
  select(-c(chatfuel_user_id))


df_mlm$best_treatment <- relevel(df_mlm$best_treatment, ref = "Nothing")

trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]

mlm1 <- 
  multinom(best_treatment ~ ., data = training)
## # weights:  162 (130 variable)
## initial  value 2748.559026 
## iter  10 value 2450.044814
## iter  20 value 2347.836419
## iter  30 value 2305.934606
## iter  40 value 2294.525788
## iter  50 value 2288.230803
## iter  60 value 2285.837110
## iter  70 value 2285.173795
## iter  80 value 2284.788372
## iter  90 value 2284.629819
## iter 100 value 2284.596110
## final  value 2284.596110 
## stopped after 100 iterations

Baseline model with 70/30 train/test split and no interactions [preferred_treatment ~ linear_combination_of_features] for unvaccinated respondents does a bad job of predicting preferred treatments. Misclassification error for this model is high at 65%.

predicted_class <- predict(mlm1, test)

table(`Predicted class` = predicted_class, `True class` = test$best_treatment)
##                      True class
## Predicted class       Nothing Rewards for vaxxing Trusted info source
##   Nothing                  82                  20                  32
##   Rewards for vaxxing       2                   2                   1
##   Trusted info source       3                   7                   3
##   Family supports it        8                   4                   7
##   Job/school required      75                  27                  46
##   More transparency         4                   0                   0
##                      True class
## Predicted class       Family supports it Job/school required More transparency
##   Nothing                             23                  53                28
##   Rewards for vaxxing                  4                   3                 1
##   Trusted info source                  2                   4                 5
##   Family supports it                  18                  10                 2
##   Job/school required                 43                 123                11
##   More transparency                    0                   1                 0
mean(as.character(predicted_class) != as.character(test$best_treatment), na.rm = T)
## [1] 0.6513761

4.2 Random Forest

4.2.1 Impediments

We now try a random forest model with 70/30 train/test split for the task of predicting impediments from a linear combination of features. The misclassification error still stays as high as 68%.

df_mlm <-
  df_features_raw %>% 
  inner_join(
    df %>% 
      transmute(
        chatfuel_user_id,
        impediment = case_when(
          motive_main == "belief" ~ "Against beliefs",
          motive_main == "benefit" ~ "No benefits",
          motive_main == "risk" ~ "Risky",
          TRUE ~ "No impediment" 
        ) %>% as_factor()
      ) %>% distinct(chatfuel_user_id, .keep_all = T),
    by = "chatfuel_user_id"
  ) %>% 
  select(-c(against_beliefs, no_benefits, risky, chatfuel_user_id)) %>% 
  rename(ad_theme_innocence = `ad_theme_innocence/curiosity`)


df_mlm$impediment <- relevel(df_mlm$impediment, ref = "No impediment")

trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]
# impute missing values before feeding into RF
training <-
  training %>% 
  simputation::impute_lm(politics + income + religiosity + location + education ~ female + white) %>% 
  drop_na()

rf_model <- 
  randomForest::randomForest(impediment ~ ., data = training %>% clean_names(), importance = TRUE)

predicted_class <- predict(rf_model, test %>% clean_names())

table(`Predicted class` = predicted_class, `True class` = test$impediment)
##                  True class
## Predicted class   No impediment No benefits Risky Against beliefs
##   No impediment              71          47    60              20
##   No benefits                52          46    53              34
##   Risky                      73          78    99              27
##   Against beliefs             5           7     7               3
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)
## [1] 0.6788856

Given the high misclassification error (68%), the variable importance based on Mean decrease accuracy and Mean decrease Gini are inconsistent.

  • MeanDecreaseAccuracy gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
  • MeanDecreaseGini: GINI is a measure of node impurity. Highest purity means that each node contains only elements of a single class. Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.

Note: Both these measures are used to rank variables in terms of importance and their absolute values could be disregarded.

randomForest::varImpPlot(rf_model, sort = T, main = "Random Forest - Variable Importance")

4.2.2 Treatments

We now try a random forest model with 70/30 train/test split for the task of predicting preferred treatments from a linear combination of features. The misclassification error still stays as high as 64%.

df_mlm <-
  df_features_raw %>% 
  inner_join(
    df %>% 
      transmute(
        chatfuel_user_id,
        best_treatment = str_to_sentence(best_treatment),
        best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
        best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_) %>% 
      as_factor()) %>% 
      distinct(chatfuel_user_id, .keep_all = T),
    by = "chatfuel_user_id"
  ) %>% 
  select(-c(chatfuel_user_id)) %>%
  rename(ad_theme_innocence = `ad_theme_innocence/curiosity`)


df_mlm$best_treatment <- relevel(df_mlm$best_treatment, ref = "Nothing")

trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]
# impute missing values before feeding into RF
training <-
  training %>% 
  simputation::impute_lm(politics + income + religiosity + location + education ~ female + white) %>% 
  drop_na()

rf_model <- 
  randomForest::randomForest(best_treatment ~ ., data = training %>% clean_names(), importance = TRUE)

predicted_class <- predict(rf_model, test %>% clean_names())

table(`Predicted class` = predicted_class, `True class` = test$best_treatment)
##                      True class
## Predicted class       Nothing Rewards for vaxxing Trusted info source
##   Nothing                  89                  13                  31
##   Rewards for vaxxing       2                   3                   1
##   Trusted info source       2                   2                   4
##   Family supports it        7                   3                   5
##   Job/school required      90                  37                  55
##   More transparency         1                   1                   0
##                      True class
## Predicted class       Family supports it Job/school required More transparency
##   Nothing                             23                  36                24
##   Rewards for vaxxing                  4                   2                 0
##   Trusted info source                  4                   2                 4
##   Family supports it                   8                  10                 2
##   Job/school required                 53                 122                11
##   More transparency                    1                   0                 1
mean(as.character(predicted_class) != as.character(test$best_treatment), na.rm = T)
## [1] 0.6523737

Given the high misclassification error (65%), the variable importance based on Mean decrease accuracy and Mean decrease Gini are inconsistent.

  • MeanDecreaseAccuracy gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
  • MeanDecreaseGini: GINI is a measure of node impurity. Highest purity means that each node contains only elements of a single class. Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.

Note: Both these measures are used to rank variables in terms of importance and their absolute values could be disregarded.

randomForest::varImpPlot(rf_model, sort = T, main = "Random Forest - Variable Importance")


5 Unsupervised models

5.1 K-means clustering

We start with k-means clustering on all relevant variables collected through the chatbot. We will proceed with the clustering analyses in three steps.

5.1.1 Optimal number of clusters.

We start by examining the optimal number of clusters we will split our data into. We will look at 2 approaches for this: the elbow method, and the silhouette method. We have also tried the gap statistic in a previous iteration.

First, for the elbow method, we plot the number of clusters versus the within cluster sum of squares. The location of a bend (elbow) in the plot is considered as an indicator of the appropriate number of clusters.

fviz_nbclust(df_features, kmeans, method = "wss")

We see there is no clear elbow here. Maybe a very small bend around \(k = 3\).

Second, let’s try the average silhouette approach. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters k is the one that maximizes the average silhouette over a range of possible values for k.

fviz_nbclust(df_features, kmeans, method = "silhouette")

We see 2 as the optimal \(k\) from the above pilot, but we err in favor of the next closest silhouette width value at \(k = 3\) (and as we expect to have more than 2 clusters eventually).

Taken together, we have limited consensus around 3 clusters to start us off (we try for larger \(k\)s in the appendix section).

5.1.2 Cluster

We cluster the data with \(k = 3\). The plot below projects the generated clusters on a 2-d plane, with the axes denoting the first 2 principal components explaining the most variance. We see that the first two components explain very little variation in the data.

km <- kmeans(df_features, centers = 3, nstart = 25)

fviz_cluster(km, data = df_features, geom = "point", ggtheme = theme_minimal())

Size of each cluster is reported below:

tibble(
  Cluster = c(1:3),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 356
2 1528
3 387

5.1.3 Aggregated statistics by cluster

The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.

bind_cols(df_features, cluster = km$cluster) %>% 
  mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
  group_by(cluster) %>% 
  summarise_all(~ mean(., na.rm = T)) %>%
  bind_rows(
    bind_cols(df_features, cluster = km$cluster) %>% 
      mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
      group_by(cluster) %>% 
      summarise_all(~ sd(., na.rm = T)),
    .id = "stat"
  ) %>% 
  mutate(stat = if_else(stat == 1, "mean", "sd")) %>% 
  select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>% 
  clean_names(case = "title") %>%
  pivot_longer(cols = -c(Cluster, Stat)) %>%
  mutate(
    Cluster = factor(Cluster),
    name = name %>% fct_inorder() %>% fct_rev()
  ) %>%
  pivot_wider(names_from = Stat, values_from = value) %>% 
  mutate(
    mean = round(mean, 2),
    mean_char = as.character(mean),
    mean_char = if_else(mean > 0.4, "0.4+", mean_char),
    mean = if_else(mean > 0.4, 0.4, mean),
    sd = round(sd, 2),
    label_str = str_c(mean_char, " (", sd, ")"),
  ) %>% 
  ggplot() +
  geom_tile(aes(Cluster, name, fill = mean)) +
  geom_text(aes(Cluster, name, label = label_str), size = 3) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
    limits = c(-0.4, 0.4)
  ) +
  theme_minimal() +
  labs(
    x = "Cluster", y = "Feature",
    fill = "Means (std)"
  )

Let’s move ahead and look at respondents’ free text by each cluster.

5.1.4 Free text by cluster

While looking at respondents’ free text by each cluster, we will consider both word clouds and tabulated text.

5.1.4.1 Motivation Impediments - Word clouds


Cluster 1:

vector_wc <-
  bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 1) %>% 
  count(motive_other) %>% 
  arrange(-n) %>%
  mutate(
    motive_other = str_to_lower(motive_other),
    motive_other = str_replace(motive_other, "side effects", "sideeffects"),
    motive_other = str_replace(motive_other, "vaccine", ""),
    motive_other = str_replace(motive_other, "yes", ""),
    motive_other = str_replace(motive_other, "no", ""),
    motive_other = str_replace(motive_other, "dont", ""),
    motive_other = str_replace(motive_other, "don't", ""),
  ) %>% 
  filter(n < 5) %>%  # (filtering out generic responses like just yes, no)
  pull(motive_other)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))


Cluster 2:

vector_wc <-
  bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 2) %>% 
  count(motive_other) %>% 
  arrange(-n) %>% 
  mutate(
    motive_other = str_to_lower(motive_other),
    motive_other = str_replace(motive_other, "side effects", "sideeffects"),
    motive_other = str_replace(motive_other, "vaccine", ""),
    motive_other = str_replace(motive_other, "yes", ""),
    motive_other = str_replace(motive_other, "no", ""),
    motive_other = str_replace(motive_other, "dont", ""),
    motive_other = str_replace(motive_other, "don't", ""),
  ) %>% 
  filter(n < 5) %>%  # (filtering out generic responses like just yes, no)
  pull(motive_other)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))


Cluster 3:

vector_wc <-
  bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 3) %>% 
  count(motive_other) %>% 
  arrange(-n) %>%
  mutate(
    motive_other = str_to_lower(motive_other),
    motive_other = str_replace(motive_other, "side effects", "sideeffects"),
    motive_other = str_replace(motive_other, "vaccine", ""),
    motive_other = str_replace(motive_other, "yes", ""),
    motive_other = str_replace(motive_other, "no", ""),
    motive_other = str_replace(motive_other, "dont", ""),
    motive_other = str_replace(motive_other, "don't", ""),
  ) %>% 
  filter(n < 5) %>%  # (filtering out generic responses like just yes, no)
  pull(motive_other)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

5.1.4.2 Motivation Impediments - Raw responses


Cluster 1:

bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 1) %>% 
  count(motive_other) %>% 
  arrange(-n, motive_other) %>% 
  filter(n < 2)  # (filtering out generic responses like just yes, no)


Cluster 2:

bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 2) %>% 
  count(motive_other) %>% 
  arrange(-n, motive_other) %>% 
  filter(n < 2)  # (filtering out generic responses like just yes, no)


Cluster 3:

bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 3) %>% 
  count(motive_other) %>% 
  arrange(-n, motive_other) %>% 
  filter(n < 5)  # (filtering out generic responses like just yes, no)

5.1.4.3 Preferred Treatments - Word clouds


Cluster 1:

vector_wc <-
  bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 1) %>% 
  count(best_treatment_other) %>% 
  arrange(-n) %>% 
  mutate(
    best_treatment_other = str_to_lower(best_treatment_other),
    best_treatment_other = str_replace(best_treatment_other, "side effects", "sideeffects"),
    best_treatment_other = str_replace(best_treatment_other, "vaccine", ""),
    best_treatment_other = str_replace(best_treatment_other, "yes", ""),
    best_treatment_other = str_replace(best_treatment_other, "no", ""),
    best_treatment_other = str_replace(best_treatment_other, "dont", ""),
    best_treatment_other = str_replace(best_treatment_other, "don't", ""),
  ) %>% 
  filter(n < 5) %>%  # (filtering out generic responses like just yes, no)
  pull(best_treatment_other)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))


Cluster 2:

vector_wc <-
  bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 2) %>% 
  count(best_treatment_other) %>% 
  arrange(-n) %>% 
  mutate(
    best_treatment_other = str_to_lower(best_treatment_other),
    best_treatment_other = str_replace(best_treatment_other, "side effects", "sideeffects"),
    best_treatment_other = str_replace(best_treatment_other, "vaccine", ""),
    best_treatment_other = str_replace(best_treatment_other, "yes", ""),
    best_treatment_other = str_replace(best_treatment_other, "no", ""),
    best_treatment_other = str_replace(best_treatment_other, "dont", ""),
    best_treatment_other = str_replace(best_treatment_other, "don't", ""),
  ) %>% 
  filter(n < 5) %>%  # (filtering out generic responses like just yes, no)
  pull(best_treatment_other)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))


Cluster 3:

vector_wc <-
  bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 3) %>% 
  count(best_treatment_other) %>% 
  arrange(-n) %>% 
  mutate(
    best_treatment_other = str_to_lower(best_treatment_other),
    best_treatment_other = str_replace(best_treatment_other, "side effects", "sideeffects"),
    best_treatment_other = str_replace(best_treatment_other, "vaccine", ""),
    best_treatment_other = str_replace(best_treatment_other, "yes", ""),
    best_treatment_other = str_replace(best_treatment_other, "no", ""),
    best_treatment_other = str_replace(best_treatment_other, "dont", ""),
    best_treatment_other = str_replace(best_treatment_other, "don't", ""),
  ) %>% 
  filter(n < 5) %>%  # (filtering out generic responses like just yes, no)
  pull(best_treatment_other)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

5.1.4.4 Preferred Treatments - Raw responses


Cluster 1:

bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 1) %>% 
  count(best_treatment_other) %>% 
  arrange(-n, best_treatment_other) %>% 
  filter(n < 5)  # (filtering out generic responses like just yes, no)


Cluster 2:

bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 2) %>% 
  count(best_treatment_other) %>% 
  arrange(-n, best_treatment_other) %>% 
  filter(n < 2)  # (filtering out generic responses like just yes, no)


Cluster 3:

bind_cols(df_features_raw, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  left_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 3) %>% 
  count(best_treatment_other) %>% 
  arrange(-n, best_treatment_other) %>% 
  filter(n < 5)  # (filtering out generic responses like just yes, no)

6 Appendix

Clustering trials with 4/5/6 clusters:

6.1 4 clusters

We cluster the data with \(k = 4\) here. The plot below projects the generated clusters on a 2-d plane, with the axes denoting the first 2 principal components explaining the most variance. We see that the first two components explain very little variation in the data.

km <- kmeans(df_features, centers = 4, nstart = 25)

fviz_cluster(km, data = df_features, geom = "point", ggtheme = theme_minimal())

Size of each cluster is reported below:

tibble(
  Cluster = c(1:4),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 287
2 160
3 1480
4 344

The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.

bind_cols(df_features, cluster = km$cluster) %>% 
  mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
  group_by(cluster) %>% 
  summarise_all(~ mean(., na.rm = T)) %>%
  bind_rows(
    bind_cols(df_features, cluster = km$cluster) %>% 
      mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
      group_by(cluster) %>% 
      summarise_all(~ sd(., na.rm = T)),
    .id = "stat"
  ) %>% 
  mutate(stat = if_else(stat == 1, "mean", "sd")) %>% 
  select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>% 
  clean_names(case = "title") %>%
  pivot_longer(cols = -c(Cluster, Stat)) %>%
  mutate(
    Cluster = factor(Cluster),
    name = name %>% fct_inorder() %>% fct_rev()
  ) %>%
  pivot_wider(names_from = Stat, values_from = value) %>% 
  mutate(
    mean = round(mean, 2),
    mean_char = as.character(mean),
    mean_char = if_else(mean > 0.4, "0.4+", mean_char),
    mean = if_else(mean > 0.4, 0.4, mean),
    sd = round(sd, 2),
    label_str = str_c(mean_char, " (", sd, ")"),
  ) %>% 
  ggplot() +
  geom_tile(aes(Cluster, name, fill = mean)) +
  geom_text(aes(Cluster, name, label = label_str), size = 3) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
    limits = c(-0.4, 0.4)
  ) +
  theme_minimal() +
  labs(
    x = "Cluster", y = "Feature",
    fill = "Means (std)"
  )

6.2 5 clusters

We cluster the data with \(k = 5\) here. The plot below projects the generated clusters on a 2-d plane, with the axes denoting the first 2 principal components explaining the most variance. We see that the first two components explain very little variation in the data.

km <- kmeans(df_features, centers = 5, nstart = 25)

fviz_cluster(km, data = df_features, geom = "point", ggtheme = theme_minimal())

Size of each cluster is reported below:

tibble(
  Cluster = c(1:5),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 258
2 542
3 320
4 171
5 980

The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.

bind_cols(df_features, cluster = km$cluster) %>% 
  mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
  group_by(cluster) %>% 
  summarise_all(~ mean(., na.rm = T)) %>%
  bind_rows(
    bind_cols(df_features, cluster = km$cluster) %>% 
      mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
      group_by(cluster) %>% 
      summarise_all(~ sd(., na.rm = T)),
    .id = "stat"
  ) %>% 
  mutate(stat = if_else(stat == 1, "mean", "sd")) %>% 
  select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>% 
  clean_names(case = "title") %>%
  pivot_longer(cols = -c(Cluster, Stat)) %>%
  mutate(
    Cluster = factor(Cluster),
    name = name %>% fct_inorder() %>% fct_rev()
  ) %>%
  pivot_wider(names_from = Stat, values_from = value) %>% 
  mutate(
    mean = round(mean, 2),
    mean_char = as.character(mean),
    mean_char = if_else(mean > 0.4, "0.4+", mean_char),
    mean = if_else(mean > 0.4, 0.4, mean),
    mean_char = if_else(mean < -0.4, "-0.4", mean_char),
    mean = if_else(mean < -0.4, -0.4, mean),
    sd = round(sd, 2),
    label_str = str_c(mean_char, " (", sd, ")"),
  ) %>% 
  ggplot() +
  geom_tile(aes(Cluster, name, fill = mean)) +
  geom_text(aes(Cluster, name, label = label_str), size = 3) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
    limits = c(-0.4, 0.4)
  ) +
  theme_minimal() +
  labs(
    x = "Cluster", y = "Feature",
    fill = "Means (std)"
  )

6.3 6 clusters

We cluster the data with \(k = 6\) here. The plot below projects the generated clusters on a 2-d plane, with the axes denoting the first 2 principal components explaining the most variance. We see that the first two components explain very little variation in the data.

km <- kmeans(df_features, centers = 6, nstart = 25)

fviz_cluster(km, data = df_features, geom = "point", ggtheme = theme_minimal())

Size of each cluster is reported below:

tibble(
  Cluster = c(1:6),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 224
2 299
3 271
4 426
5 903
6 148

The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.

bind_cols(df_features, cluster = km$cluster) %>% 
  mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
  group_by(cluster) %>% 
  summarise_all(~ mean(., na.rm = T)) %>%
  bind_rows(
    bind_cols(df_features, cluster = km$cluster) %>% 
      mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
      group_by(cluster) %>% 
      summarise_all(~ sd(., na.rm = T)),
    .id = "stat"
  ) %>% 
  mutate(stat = if_else(stat == 1, "mean", "sd")) %>% 
  select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>% 
  clean_names(case = "title") %>%
  pivot_longer(cols = -c(Cluster, Stat)) %>%
  mutate(
    Cluster = factor(Cluster),
    name = name %>% fct_inorder() %>% fct_rev()
  ) %>%
  pivot_wider(names_from = Stat, values_from = value) %>% 
  mutate(
    mean = round(mean, 2),
    mean_char = as.character(mean),
    mean_char = if_else(mean > 0.4, "0.4+", mean_char),
    mean = if_else(mean > 0.4, 0.4, mean),
    mean_char = if_else(mean < -0.4, "-0.4", mean_char),
    mean = if_else(mean < -0.4, -0.4, mean),
    sd = round(sd, 2),
    label_str = str_c(mean_char, " (", sd, ")"),
  ) %>% 
  ggplot() +
  geom_tile(aes(Cluster, name, fill = mean)) +
  geom_text(aes(Cluster, name, label = label_str), size = 3) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
    limits = c(-0.4, 0.4)
  ) +
  theme_minimal() +
  labs(
    x = "Cluster", y = "Feature",
    fill = "Means (std)"
  )