1 Overview

This user segmentation analysis is based on the 7318 respondents who completed our pilot wave 8 survey (August 2022) and on 81 features (both MCQ and free text). The related GitHub issue is here.

Analysis outline

  • Free text analyses
    • Multinomial inverse regressions on free text tokens (Section 2.1)
    • Free text word clouds for each preferred treatment (Section 2.2)
    • Topic models for free text responses (Section 2.3)


# Load packages
pacman::p_load(DT, estimatr, kableExtra, parallel, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, glmnet, gamlr, topicmodels, ldatuning, lda, SnowballC, olsrr)

set.seed(94305)

# read in current chatfuel data

# full data
df_full_v8 <- 
  read_csv(here::here("pilot8/data/full_df_clean.csv")) %>% 
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main),
    best_treatment = str_remove_all(best_treatment, "\\.") %>% str_to_sentence(),
    best_treatment = if_else(best_treatment == "New trusted info", "More safety evidence", best_treatment),
  ) %>%
  remove_empty("rows") # keep all columns (even if empty); drop only fully empty rows

# filter to completes
df <-
  df_full_v8 %>%
  filter(full_complete == "complete") %>% 
  drop_na(vax_status) %>%
  left_join(read_csv(here::here("pilot8/data/df_features_treatment_proposal.csv")) %>% select(id, treatment_text = best_treatment_proposal_feature), by = "id") %>%
  left_join(read_csv(here::here("pilot8/data/df_features_ability_reason.csv")) %>% select(id, lack_ability = ability_reason_feature), by = "id") %>%
  mutate(
    best_treatment = case_when(
      best_treatment == "Other" & str_detect(treatment_text, "easier access to vaccine") ~ "Convenient to vaccinate",
      best_treatment == "Other" & str_detect(treatment_text, "nothing") ~ "Nothing",
      best_treatment == "Other" & str_detect(treatment_text, "family|encourag") ~ "Family/friend endorses it",
      best_treatment == "Other" & str_detect(treatment_text, "safe|info") ~ "More safety evidence",
      best_treatment == "Other" & str_detect(treatment_text, "mandate") ~ "Job/school required it",
      best_treatment == "Other" & str_detect(treatment_text, "rewards") ~ "Rewards for vaccinating",
      best_treatment == "Other" & str_detect(treatment_text, "time") ~ "Time off work",
      TRUE ~ best_treatment,
    )
  ) %>% 
  remove_empty("rows")


2 Free text analysis

We start by cleaning the chatbot features used in our models. Below are summary statistics for the cleaned features fed into the models:

# clean up and engineer features (demographics, motivation, ability, info sources)

df_features <-
  df %>% 
  transmute(
    age,
    education = education_num,
    religiosity = religiosity_num,
    location = location_num,
    black_or_african = if_else(ethnicity == "black or african", 1L, 0L) %>% replace_na(0),
    vaccinated = vax_status_num,
    covid_already = if_else(str_detect(covid_already, "No|no"), 0L, 1L) %>% replace_na(0),
    covid_vax_future = if_else(str_detect(covid_vax_future, "Likely|Maybe"), 1L, 0L) %>% replace_na(0),
    
    no_motive = case_when(
      motive == "yes" ~ 0L,
      motive == "no" ~ 1L,
    ),
    motive_elaboration = motive_nchar,
    
    no_ability = case_when(
      ability == "easy" ~ 0L,
      ability != "easy" & !is.na(ability) ~ 1L,
    ) %>% replace_na(0),
    ability_elaboration = ability_nchar %>% replace_na(0),
    
    against_beliefs = if_else(motive_main == "beliefs", 1L, 0L) %>% replace_na(0),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0),
    risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0),
    
    no_time = if_else(ability_main == "time", 1L, 0L) %>% replace_na(0),
    no_money = if_else(ability_main == "money", 1L, 0L) %>% replace_na(0),
    no_availability = if_else(ability_main == "availability", 1L, 0L) %>% replace_na(0),
    
    # info variables
    info_source = str_squish(info_source) %>% str_to_lower(),
    info_source = case_when(
      str_detect(info_source, "social|facebook|twitter|whatsapp|fb|insta|tiktok|youtube") ~ "social media",
      str_detect(info_source, "online|google|internet|web") ~ "internet",
      str_detect(info_source, "media|tv|television") ~ "television",
      str_detect(info_source, "radio") ~ "radio",
      str_detect(info_source, "news") ~ "news",
      str_detect(info_source, "friend|family|work|elder") ~ "friends & family",
      str_detect(info_source, "health|hospital|clinic") ~ "hospital",
      str_detect(info_source, "school|teach|educ") ~ "school",
    ),
    info_confidence = str_to_lower(info_confidence),
    info_confidence_high = if_else(str_detect(info_confidence, "very"), 1L, 0L) %>% replace_na(0),
    want_link = if_else(str_detect(want_link, "Sure"), 1L, 0L) %>% replace_na(0),
    want_answer = if_else(str_detect(want_answer, "Sure"), 1L, 0L) %>% replace_na(0),
    self_reflection = if_else(self_reflection == "A lot!", 1L, 0L) %>% replace_na(0),
    
    best_treatment,
    
    lack_motive  = case_when(
      
      str_detect(motive_reason, "(((negative|sides?) ?-?)?(a|e)fff?ects?[^ive]|ma(k|d)(e|es|ing) (me|people|them|you) sick|g(e|o)t(ting)? sick|more sick|sick effect)|(after|side|the) effect|react|bad effect|aftermath|vomit|ache|cause health problems|dizzy|negative effect|sick after|clot|fever|fatigue|health complication afterward|weakens my body|getting ill|collapse|falling down|dizziness") ~ "side_effects",
      
      str_detect(motive_reason, "needles?|injections?") ~ "needles_injection",
      
      str_detect(motive_reason, "not good for my body|safe|harm|dangerous|dangers|unhealthy|bad for my health") ~ "unsafe",
      
      str_detect(motive_reason, "lab rats|paralyzed|illuminati|rumurs|infertil|misinformed|mis ?information|ro?umou?rs?|myths?|theories|conspiracy|scam|zombie|controvers|romours|robot|political|politics|criticism|propaganda|gossip|online|misleading info|false info|misconceptions?|(fake )?news|article|magnetic|hoax|fake") ~ "misinformation",
      
      str_detect(motive_reason, "scar(e|y)|fear|afraid") ~ "scared", 
      
      str_detect(motive_reason, "((don\'?t|didn'?t|did not|do not|no) (really )?trust|trusted|distrust|doubt|s(k|c)eptical)|not trusting it|untrustworthy|the reviews") ~ "trust",
      
      str_detect(motive_reason, "preg3|am expectant|breastfeeding|pregnan|lactating|newborn|infant|nursing") ~ "pregnancy_nursing",
      
      str_detect(motive_reason, "pain|hurt") ~ "pain", 
      
      str_detect(motive_reason, "demonic|religio|church|god|christian|antichrist|666") ~ "religion", 
      
      str_detect(motive_reason, "die|death|dead|kill|dying") ~ "death", 
      
      str_detect(motive_reason, "soceity judgment|sibling|family|friends?|parents?|brother|sister|uncle|aunt|community|cousin") ~ "family_friends",
      
      str_detect(motive_reason, "risk") ~ "risk", 
      
      str_detect(motive_reason, "more research|enough proof|i did see enough proof|no sure prove|unanswered questions|limited data|not informed|knowledge|information|didn't know|i don't (really )?know") ~ "more_information", 
      
      str_detect(motive_reason, "government") ~ "government", 
      
      str_detect(motive_reason, "benefit") ~ "no_benefit", 
      
      str_detect(motive_reason, "effective|no proof it works") ~ "ineffective", 
      
      str_detect(motive_reason, "i don'?t have reason|^not really$|no reason|nothing|not? interest|don't feel like|^none$") ~ "no_reason", 
      
      str_detect(motive_reason, "not real[^ly]|no covid|exist|is real") ~ "covid_not_real", 
      
      str_detect(motive_reason, "did not think it'?s important|not common|didn'?t think it was necessary|is gone|few cases|less prevalent|almost over|post\\-? ?covid|isn't anything serious|not important|already controlled|i don't find it necessary|isn't as prevalent|covid is gone|i don't really hear of the covid|rates have decreased|spread.*is low|not serious|not that important|no need|not help|not? use|no more|no longer|is over|lowered|need |finished|necessity|not necessary|i don't think it'?s necessary|i don'?t see the point") ~ "no_need",
      
      str_detect(motive_reason, "listened to people|negative reviews|people talk|talking negative things|what people say|(bad|negative) report|negative things from social media|condemning|opinion|bad reviews|hearing negative things|people say|negativity|heare?d|hearsay|stories|talks|discussion|speculation|saying|feedback") ~ "heard_hearsay", 
      
      str_detect(motive_reason, "belie(f|ve)") ~ "beliefs", 
      
      str_detect(motive_reason, "never met a victim|i haven't seen someone|never saw|never seen|no one (has|from)|i have never someone") ~ "never_saw_covid",
      
      str_detect(motive_reason, "my immune is strong|okay with my health|am good/fine|am technically okay|my immunity system|(i'?m|am) not sick|good health|i feel (healthy|fine)|i feel (very )?ok(ay)?|healthy? is good|am healthy|healthy person|high immunity|healthy lifestyle|am just healthy|fine healthy|still healthy|i was healthy|i am just healthy|always been healthy|eating healthy|^(?!.*(weak|affect my)).*immune system|strong immunity") ~ "healthy",
      
      str_detect(motive_reason, "asthmatic|weak immune system|health issue|it's not good for human health|not good for my health|no good for health|allergies|health issues|body weakness|chronic illness|high blood pressure") ~ "health_issues",
      
      str_detect(motive_reason, "distant visit|near the place|nearby|nearest place|far |not available in my area") ~ "distance",
      
      str_detect(motive_reason, "office|not living close|at work|requirement at work|work related|job") ~ "work",
      
      str_detect(motive_reason, "travel") ~ "travel", 
      
      str_detect(motive_reason, "nowhere near contacting|am not infected|i had not gotten corona|don'?t have.*(virus|covid|symptoms|it$)") ~ "dont_have_covid",
      
      str_detect(motive_reason, "^(yes|nice|noo?|ok(ay)?|bad|yes sure|nope?|unsure|pardon|lol)$|https\\:\\/\\/|personal reasons") | nchar(motive_reason) <3 ~ "nonsensical",
      
      str_detect(motive_reason, "availab") ~ "no_availability",
      
      str_detect(motive_reason, "expensive|money|lack of money|cost") ~ "need_money",
      
      TRUE ~ NA_character_,
    ),
    treatment = treatment_text,
    
  ) %>% 
  select(-info_confidence) %>%
  dummy_cols(select_columns = "info_source", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
  dummy_cols(select_columns = "lack_motive", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
  dummy_cols(select_columns = "treatment", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
  mutate_at(vars(starts_with(c("info", "motive", "treatment"))), ~ replace_na(., 0))
df_features %>%
  clean_names(case = "title") %>%
  papeR::summarize_numeric() %>% 
  datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))

2.1 Multinomial inverse regressions

We use multinomial inverse regression (MNIR) to understand how chatbot free text responses connect with preferred treatments. Rather than predicting an outcome from text, an inverse regression runs the other way: we regress the free text token counts on a set of known attributes (preferred treatments in our case).
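
Concretely, MNIR (Taddy 2013, as implemented in textir) models respondent $i$'s vector of token counts $x_i$ as multinomial over the vocabulary, with token probabilities shifted by the attributes $v_i$ (here, treatment dummies):

$$x_i \sim \mathrm{MN}(q_i, m_i), \qquad q_{ij} = \frac{\exp(\alpha_j + v_i^{\top}\varphi_j)}{\sum_{k}\exp(\alpha_k + v_i^{\top}\varphi_k)},$$

where $m_i$ is respondent $i$'s total token count and the loadings $\varphi_j$ measure how each treatment shifts the log odds of token $j$.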

Creating counts and covariate matrices:

## TOKENS ##
freetext <-
  df %>%
  filter(!(best_treatment %in% c("Other", "No, something else", "Reminders"))) %>%
  drop_na(best_treatment) %>%
  select(contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>% 
  unite("text", 1:29, na.rm = T, remove = T, sep = ". ") %>% 
  mutate(
    text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")  
  ) %>% 
  drop_na() %>% 
  pull(text)

# Create corpus
docs <- Corpus(VectorSource(freetext))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create term-document matrix and overall token counts
tdm <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(tdm), decreasing = TRUE)

# transpose to a documents-by-terms count matrix, as expected by mnlm()
counts <- t(tdm)


## COVARIATES ##
covars_df <-
  df %>%
  select(best_treatment) %>% 
  drop_na(best_treatment) %>%
  filter(!(best_treatment %in% c("Other", "No, something else", "Reminders"))) %>%
  dummy_cols(select_columns = "best_treatment", remove_first_dummy = TRUE, remove_selected_columns = TRUE)

covars <- covars_df %>% as.matrix()

# dummy column names, stripped of the "best_treatment_" prefix (used as plot legends)
covar_names <- colnames(covars_df) %>% str_remove("^best_treatment_")

Top 100 tokens with counts:

words[1:100]
##         good       people          get         dont         know      nothing 
##         3015         2962         2240         2167         2152         1935 
##        think  information       health      effects         will         side 
##         1744         1546         1462         1395         1364         1279 
##      getting         just       really         safe       family         take 
##         1277         1218         1155         1118         1082         1061 
##         much          say         sure         like        virus      friends 
##         1021          940          899          874          848          831 
##         true   government          got         well       saying         time 
##          828          825          769          769          760          741 
##      believe          can          one    awareness          yes       taking 
##          740          721          699          676          668          649 
##         many         work         help         made         fear         real 
##          623          591          588          586          574          547 
##      helpful          now         need    important         said        still 
##          546          543          541          528          528          525 
##         want         also    isolation        trust          bad       scared 
##          512          510          502          495          487          486 
##       others         feel        since         make        great    available 
##          483          464          464          459          455          454 
##        money        cause      disease         nice        media        didnt 
##          449          447          437          434          431          429 
##   importance availability         body        going          job    education 
##          426          416          399          396          393          386 
##         news       safety       source         okay          lot     hospital 
##          385          383          383          383          379          376 
##      thought       things        right        first     actually         able 
##          375          375          372          360          360          349 
##        taken    challenge       afraid     positive         sick          see 
##          348          343          341          338          337          332 
##         alot       always       helped        might       public       around 
##          327          322          321          319          317          316 
##         cant        heard       social   understand 
##          315          314          314          312

Fitting the model:

# fit the MNIR model, parallelized across tokens
cl <- makeCluster(detectCores())
fit <- mnlm(cl, covars, counts)
stopCluster(cl)

# extract the fitted token loadings
B <- coef(fit)

Below we explore the fitted coefficient paths on preferred treatments for five frequent and relevant free text tokens (information, safe, family, health, government).
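
The loadings can also be read off B directly. A minimal sketch, assuming coef() returns a covariates-by-tokens matrix whose row names match the treatment dummy columns (the row name below is an assumption; adjust it to your data):

# rank tokens by their loading on one treatment dummy
B_dense <- as.matrix(B)
sort(B_dense["best_treatment_More safety evidence", ], decreasing = TRUE)[1:10]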

2.1.1 Information

pal <- c("red", "green", "blue", "orange", "black", "magenta", "cyan")

plot(fit$information, col = pal, main = "'Information' token in free text")
legend("bottomleft", legend = covar_names, fill = pal)

2.1.2 Safe

plot(fit$safe, col = pal, main = "'Safe' token in free text")
legend("bottomright", legend = covar_names, fill = pal)

2.1.3 Family

plot(fit$family, col = pal, main = "'Family' token in free text")
legend("bottomleft", legend = covar_names, fill = pal)

2.1.4 Health

plot(fit$health, col = pal, main = "'Health' token in free text")
legend("topleft", legend = covar_names, fill = pal)

2.1.5 Government

plot(fit$government, col = pal, main = "'Government' token in free text")
legend("bottomright", legend = covar_names, fill = pal)


2.2 Free text: Wordclouds by preferred treatment

The word clouds below are generated from each participant’s complete free text conversation with the chatbot. We generate a separate word cloud for each subgroup of participants choosing a given preferred treatment, using one shared helper defined below.

# Helper: assemble a subgroup's free text, clean the corpus, and draw a word cloud
make_treatment_wordcloud <- function(df_sub) {
  vector_wc <-
    df_sub %>%
    select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>% 
    unite("text", 2:30, na.rm = TRUE, remove = TRUE, sep = ". ") %>% 
    mutate(
      text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
      text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
    ) %>% 
    pull(text)

  # Create and clean corpus
  docs <-
    Corpus(VectorSource(vector_wc)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("english"))

  # Term frequencies from the term-document matrix
  tdm <- as.matrix(TermDocumentMatrix(docs))
  words <- sort(rowSums(tdm), decreasing = TRUE)

  # Draw the word cloud
  wordcloud(words = names(words), freq = words, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
}

More safety evidence:

make_treatment_wordcloud(df %>% filter(best_treatment == "More safety evidence"))

Family/friend endorses it:

make_treatment_wordcloud(df %>% filter(best_treatment == "Family/friend endorses it"))

Job/school required it:

make_treatment_wordcloud(df %>% filter(best_treatment == "Job/school required it"))

Convenient to vaccinate:

make_treatment_wordcloud(df %>% filter(best_treatment == "Convenient to vaccinate"))

Nothing:

make_treatment_wordcloud(df %>% filter(best_treatment == "Nothing"))

Other/Something else:

make_treatment_wordcloud(df %>% filter(best_treatment %in% c("Other", "No, something else")))


2.3 Topic Models

As another unsupervised approach, we fit an LDA topic model to the free text answers given by respondents.

data_vector <-
  df %>% 
  select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>% 
  unite("text", 2:30, na.rm = T, remove = T, sep = ". ") %>% 
  mutate(
    text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")  
  ) %>% 
  pull(text)

# Create corpus
corpus <- Corpus(VectorSource(data_vector))

# Clean corpus
docs <-
  corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))



# keep only terms appearing in at least 5 documents
minimumFrequency <- 5
DTM <- DocumentTermMatrix(docs, control = list(bounds = list(global = c(minimumFrequency, Inf))))

# drop documents left empty after the frequency filter
sel_idx <- slam::row_sums(DTM) > 0
DTM <- DTM[sel_idx, ]

We first inspect the ideal number of topics using two metrics, CaoJuan2009 and Deveaud2014. A good number of topics has a low CaoJuan2009 value (low average similarity between topics) and a high Deveaud2014 value (high average divergence between topics).

result <- 
  ldatuning::FindTopicsNumber(
    DTM,
    topics = seq(from = 2, to = 20, by = 1),
    metrics = c("CaoJuan2009",  "Deveaud2014"),
    method = "Gibbs",
    control = list(seed = 77),
    verbose = TRUE
  )
## fit models... done.
## calculate metrics:
##   CaoJuan2009... done.
##   Deveaud2014... done.
ldatuning::FindTopicsNumber_plot(result)

Based on the plots above, 6 seems a decent number of topics to start with, as it combines a low CaoJuan2009 value with a high Deveaud2014 value.

K <- 6

# set random number generator seed
set.seed(94305)

# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))
## K = 6; V = 2578; M = 7227
## Sampling 1000 iterations!
## Iteration 25 ...
## ...
## Iteration 1000 ...
## Gibbs sampling completed!

Top 10 terms per topic:

# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)

# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)

# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms   # get beta from results

# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

terms(topicModel, 10)
##       Topic 1   Topic 2      Topic 3   Topic 4   Topic 5        Topic 6  
##  [1,] "people"  "good"       "dont"    "effects" "information"  "think"  
##  [2,] "get"     "virus"      "know"    "side"    "health"       "take"   
##  [3,] "will"    "yes"        "nothing" "safe"    "awareness"    "believe"
##  [4,] "family"  "help"       "just"    "true"    "isolation"    "can"    
##  [5,] "friends" "getting"    "really"  "getting" "available"    "need"   
##  [6,] "many"    "helpful"    "much"    "sure"    "importance"   "taking" 
##  [7,] "got"     "real"       "well"    "fear"    "availability" "say"    
##  [8,] "also"    "important"  "like"    "body"    "media"        "one"    
##  [9,] "say"     "disease"    "said"    "safety"  "government"   "didnt"  
## [10,] "time"    "government" "trust"   "others"  "education"    "made"
top3termsPerTopic <- terms(topicModel, 3)

# concatenate the three most likely terms of each topic into a pseudo-name for the topic
topicNames <- apply(top3termsPerTopic, 2, paste, collapse = " ")
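
With theta and these pseudo-names in hand, each response can also be tagged with its most probable topic; a minimal sketch:

# label each document with its most likely topic and tabulate the result
dominant_topic <- apply(theta, 1, which.max)
sort(table(topicNames[dominant_topic]), decreasing = TRUE)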

Word cloud for topic 1:

# Helper: visualize a topic as a word cloud of its 100 most probable terms
plot_topic_wordcloud <- function(topicToViz) {
  # sort the term-topic-probability vector in decreasing order and keep the top 100 terms
  top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
  # visualize the terms as a word cloud, weighted by their probabilities
  wordcloud(names(top100terms), top100terms, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
}

plot_topic_wordcloud(1)

Word cloud for topic 2:

plot_topic_wordcloud(2)

Word cloud for topic 3:

plot_topic_wordcloud(3)

Word cloud for topic 4:

plot_topic_wordcloud(4)

Word cloud for topic 5:

plot_topic_wordcloud(5)

Word cloud for topic 6:

plot_topic_wordcloud(6)