This user segmentation analysis is based on the 7,318 respondents who completed our survey in pilot wave 8 (August 2022), and on 81 features (both multiple-choice and free text).
Analysis outline

1. Load and clean the pilot wave 8 chatbot data and construct model features.
2. Fit multinomial inverse regressions linking free text tokens to preferred treatments.
3. Generate wordclouds of the free text for each preferred-treatment subgroup.
4. Fit an LDA topic model to the free text and inspect the resulting topics.
# Load packages
# parallel provides makeCluster()/detectCores(), used for mnlm() below
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, glmnet, gamlr, topicmodels, ldatuning, lda, SnowballC, olsrr, parallel)
set.seed(94305)
# read in CURRENT chatfuel data
# full data
df_full_v8 <-
read_csv(here::here("pilot8/data/full_df_clean.csv")) %>%
clean_names() %>%
mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
mutate(
motive = if_else(str_detect(motive, "yes"), "yes", "no"),
motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main),
best_treatment = str_remove_all(best_treatment, "\\.") %>% str_to_sentence(),
best_treatment = if_else(best_treatment == "New trusted info", "More safety evidence", best_treatment),
) %>%
remove_empty()
# filter to completes
df <-
df_full_v8 %>%
filter(full_complete == "complete") %>%
drop_na(vax_status) %>%
left_join(read_csv(here::here("pilot8/data/df_features_treatment_proposal.csv")) %>% select(id, treatment_text = best_treatment_proposal_feature), by = "id") %>%
left_join(read_csv(here::here("pilot8/data/df_features_ability_reason.csv")) %>% select(id, lack_ability = ability_reason_feature), by = "id") %>%
mutate(
best_treatment = case_when(
best_treatment == "Other" & str_detect(treatment_text, "easier access to vaccine") ~ "Convenient to vaccinate",
best_treatment == "Other" & str_detect(treatment_text, "nothing") ~ "Nothing",
best_treatment == "Other" & str_detect(treatment_text, "family|encourag") ~ "Family/friend endorses it",
best_treatment == "Other" & str_detect(treatment_text, "safe|info") ~ "More safety evidence",
best_treatment == "Other" & str_detect(treatment_text, "mandate") ~ "Job/school required it",
best_treatment == "Other" & str_detect(treatment_text, "rewards") ~ "Rewards for vaccinating",
best_treatment == "Other" & str_detect(treatment_text, "time") ~ "Time off work",
TRUE ~ best_treatment,
)
) %>%
remove_empty("rows")
#df %>% select(contains("treatm")) %>% glimpse()
#df %>% count(best_treatment) %>% arrange(-n)
#glimpse(df)

We start by cleaning the chatbot features to be used in our models. Below are summary statistics for the cleaned features fed into the models:
# clean up demographic variables
df_features <-
df %>%
transmute(
age,
education = education_num,
religiosity = religiosity_num,
location = location_num,
black_or_african = if_else(ethnicity == "black or african", 1L, 0L) %>% replace_na(0),
vaccinated = vax_status_num,
covid_already = if_else(str_detect(covid_already, "No|no"), 0L, 1L) %>% replace_na(0),
covid_vax_future = if_else(str_detect(covid_vax_future, "Likely|Maybe"), 1L, 0L) %>% replace_na(0),
no_motive = case_when(
motive == "yes" ~ 0L,
motive == "no" ~ 1L,
),
motive_elaboration = motive_nchar,
no_ability = case_when(
ability == "easy" ~ 0L,
ability != "easy" & !is.na(ability) ~ 1L,
) %>% replace_na(0),
ability_elaboration = ability_nchar %>% replace_na(0),
against_beliefs = if_else(motive_main == "beliefs", 1L, 0L) %>% replace_na(0),
no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0),
risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0),
no_time = if_else(ability_main == "time", 1L, 0L) %>% replace_na(0),
no_money = if_else(ability_main == "money", 1L, 0L) %>% replace_na(0),
no_availability = if_else(ability_main == "availability", 1L, 0L) %>% replace_na(0),
# info variables
info_source = str_squish(info_source) %>% str_to_lower(),
info_source = case_when(
str_detect(info_source, "social|facebook|twitter|whatsapp|fb|insta|tiktok|youtube") ~ "social media",
str_detect(info_source, "online|google|internet|web") ~ "internet",
str_detect(info_source, "media|tv|television") ~ "television",
str_detect(info_source, "radio") ~ "radio",
str_detect(info_source, "news") ~ "news",
str_detect(info_source, "friend|family|work|elder") ~ "friends & family",
str_detect(info_source, "health|hospital|clinic") ~ "hospital",
str_detect(info_source, "school|teach|educ") ~ "school",
),
info_confidence = str_to_lower(info_confidence),
info_confidence_high = if_else(str_detect(info_confidence, "very"), 1L, 0L) %>% replace_na(0),
want_link = if_else(str_detect(want_link, "Sure"), 1L, 0L) %>% replace_na(0),
want_answer = if_else(str_detect(want_answer, "Sure"), 1L, 0L) %>% replace_na(0),
self_reflection = if_else(self_reflection == "A lot!", 1L, 0L) %>% replace_na(0),
best_treatment,
lack_motive = case_when(
str_detect(motive_reason, "(((negative|sides?) ?-?)?(a|e)fff?ects?[^ive]|ma(k|d)(e|es|ing) (me|people|them|you) sick|g(e|o)t(ting)? sick|more sick|sick effect)|(after|side|the) effect|react|bad effect|aftermath|vomit|ache|cause health problems|dizzy|negative effect|sick after|clot|fever|fatigue|health complication afterward|weakens my body|getting ill|collapse|falling down|dizziness") ~ "side_effects",
str_detect(motive_reason, "needles?|injections?") ~ "needles_injection",
str_detect(motive_reason, "not good for my body|safe|harm|dangerous|dangers|unhealthy|bad for my health") ~ "unsafe",
str_detect(motive_reason, "lab rats|paralyzed|illuminati|rumurs|infertil|misinformed|mis ?information|ro?umou?rs?|myths?|theories|conspiracy|scam|zombie|controvers|romours|robot|political|politics|criticism|propaganda|gossip|online|misleading info|false info|misconceptions?|(fake )?news|article|magnetic|hoax|fake") ~ "misinformation",
str_detect(motive_reason, "scar(e|y)|fear|afraid") ~ "scared",
str_detect(motive_reason, "((don\'?t|didn'?t|did not|do not|no) (really )?trust|trusted|distrust|doubt|s(k|c)eptical)|not trusting it|untrustworthy|the reviews") ~ "trust",
str_detect(motive_reason, "preg3|am expectant|breastfeeding|pregnan|lactating|newborn|infant|nursing") ~ "pregnancy_nursing",
str_detect(motive_reason, "pain|hurt") ~ "pain",
str_detect(motive_reason, "demonic|religio|church|god|christian|antichrist|666") ~ "religion",
str_detect(motive_reason, "die|death|dead|kill|dying") ~ "death",
str_detect(motive_reason, "soceity judgment|sibling|family|friends?|parents?|brother|sister|uncle|aunt|community|cousin") ~ "family_friends",
str_detect(motive_reason, "risk") ~ "risk",
str_detect(motive_reason, "more research|enough proof|i did see enough proof|no sure prove|unanswered questions|limited data|not informed|knowledge|information|didn't know|i don't (really )?know") ~ "more_information",
str_detect(motive_reason, "government") ~ "government",
str_detect(motive_reason, "benefit") ~ "no_benefit",
str_detect(motive_reason, "effective|no proof it works") ~ "ineffective",
str_detect(motive_reason, "i don'?t have reason|^not really$|no reason|nothing|not? interest|don't feel like|^none$") ~ "no_reason",
str_detect(motive_reason, "not real[^ly]|no covid|exist|is real") ~ "covid_not_real",
str_detect(motive_reason, "did not think it'?s important|not common|didn'?t think it was necessary|is gone|few cases|less prevalent|almost over|post\\-? ?covid|isn't anything serious|not important|already controlled|i don't find it necessary|isn't as prevalent|covid is gone|i don't really hear of the covid|rates have decreased|spread.*is low|not serious|not that important|no need|not help|not? use|no more|no longer|is over|lowered|need |finished|necessity|not necessary|i don't think it'?s necessary|i don'?t see the point") ~ "no_need",
str_detect(motive_reason, "listened to people|negative reviews|people talk|talking negative things|what people say|(bad|negative) report|negative things from social media|condemning|opinion|bad reviews|hearing negative things|people say|negativity|heare?d|hearsay|stories|talks|discussion|speculation|saying|feedback") ~ "heard_hearsay",
str_detect(motive_reason, "belie(f|ve)") ~ "beliefs",
str_detect(motive_reason, "never met a victim|i haven't seen someone|never saw|never seen|no one (has|from)|i have never someone") ~ "never_saw_covid",
str_detect(motive_reason, "my immune is strong|okay with my health|am good/fine|am technically okay|my immunity system|(i'?m|am) not sick|good health|i feel (healthy|fine)|i feel (very )?ok(ay)?|healthy? is good|am healthy|healthy person|high immunity|healthy lifestyle|am just healthy|fine healthy|still healthy|i was healthy|i am just healthy|always been healthy|eating healthy|^(?!.*(weak|affect my)).*immune system|strong immunity") ~ "healthy",
str_detect(motive_reason, "asthmatic|weak immune system|health issue|it's not good for human health|not good for my health|no good for health|allergies|health issues|body weakness|chronic illness|high blood pressure") ~ "health_issues",
str_detect(motive_reason, "distant visit|near the place|nearby|nearest place|far |not available in my area") ~ "distance",
str_detect(motive_reason, "office|not living close|at work|requirement at work|work related|job") ~ "work",
str_detect(motive_reason, "travel") ~ "travel",
str_detect(motive_reason, "nowhere near contacting|am not infected|i had not gotten corona|don'?t have.*(virus|covid|symptoms|it$)") ~ "dont_have_covid",
str_detect(motive_reason, "^(yes|nice|noo?|ok(ay)?|bad|yes sure|nope?|unsure|pardon|lol)$|https\\:\\/\\/|personal reasons") | nchar(motive_reason) <3 ~ "nonsensical",
str_detect(motive_reason, "availab") ~ "no_availability",
str_detect(motive_reason, "expensive|money|lack of money|cost") ~ "need_money",
TRUE ~ NA_character_,
),
treatment = treatment_text,
) %>%
select(-info_confidence) %>%
dummy_cols(select_columns = "info_source", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
dummy_cols(select_columns = "lack_motive", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
dummy_cols(select_columns = "treatment", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
mutate_at(vars(starts_with(c("info", "motive", "treatment"))), ~ replace_na(., 0))

## Error: Problem with `mutate()` column `covid_vax_future`.
## ℹ `covid_vax_future = `%>%`(...)`.
## x object 'covid_vax_future' not found
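The error indicates that `covid_vax_future` is not a column of `df` in this wave's export. A minimal guard (a sketch, assuming the column is genuinely absent rather than renamed) is to create it as missing before running the `transmute()` above, so that the `replace_na(., 0)` recode maps it to 0:

# hypothetical guard: if this wave's export lacks covid_vax_future, create it
# as NA; str_detect(NA) then yields NA and replace_na(., 0) recodes it to 0
if (!"covid_vax_future" %in% names(df)) {
  df <- df %>% mutate(covid_vax_future = NA_character_)
}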
df_features %>%
clean_names(case = "title") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))

## Error in clean_names(., case = "title"): object 'df_features' not found
df %>% count(best_treatment) %>% arrange(-n)

We use multinomial inverse regression to understand how chatbot free text responses connect with preferred treatments: we regress the free text token counts on a set of known attributes (here, the preferred treatments).
Creating counts and covariate matrices:
## TOKENS ##
freetext <-
df %>%
filter(!(best_treatment %in% c("Other", "No, something else", "Reminders"))) %>%
drop_na(best_treatment) %>%
select(contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>%
unite("text", 1:29, na.rm = T, remove = T, sep = ". ") %>%
mutate(
text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
) %>%
drop_na() %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(freetext))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix of token counts
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
# transpose so that documents are rows and tokens are columns, as mnlm() expects
matrix <- t(matrix)
## COVARIATES ##
covars_df <-
df %>%
select(best_treatment) %>%
drop_na(best_treatment) %>%
filter(!(best_treatment %in% c("Other", "No, something else", "Reminders"))) %>%
dummy_cols(select_columns = "best_treatment", remove_first_dummy = TRUE, remove_selected_columns = TRUE)
covars <- covars_df %>% as.matrix()
covar_names <- colnames(covars_df) %>% str_sub(16, -1) # drop the "best_treatment_" prefix

Top 100 tokens with counts:
words[1:100]## good people get dont know nothing
## 3015 2962 2240 2167 2152 1935
## think information health effects will side
## 1744 1546 1462 1395 1364 1279
## getting just really safe family take
## 1277 1218 1155 1118 1082 1061
## much say sure like virus friends
## 1021 940 899 874 848 831
## true government got well saying time
## 828 825 769 769 760 741
## believe can one awareness yes taking
## 740 721 699 676 668 649
## many work help made fear real
## 623 591 588 586 574 547
## helpful now need important said still
## 546 543 541 528 528 525
## want also isolation trust bad scared
## 512 510 502 495 487 486
## others feel since make great available
## 483 464 464 459 455 454
## money cause disease nice media didnt
## 449 447 437 434 431 429
## importance availability body going job education
## 426 416 399 396 393 386
## news safety source okay lot hospital
## 385 383 383 383 379 376
## thought things right first actually able
## 375 375 372 360 360 349
## taken challenge afraid positive sick see
## 348 343 341 338 337 332
## alot always helped might public around
## 327 322 321 319 317 316
## cant heard social understand
## 315 314 314 312
Fitting the model:
# fit the multinomial inverse regression in parallel, one regression per token
cl <- makeCluster(detectCores())
fit <- mnlm(cl, covars, matrix)
stopCluster(cl)
B <- coef(fit)
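The per-token loadings can also be collapsed into per-document scores. A minimal sketch, assuming textir's `srproj()` accepts the coefficient and count matrices as in the package examples:

# sketch: sufficient reduction projection of token counts onto the treatment
# directions; z gives a low-dimensional text summary of each document
z <- srproj(B, matrix)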
We explore the coefficients on preferred treatments for five frequent and relevant free text tokens (information, safe, family, health, government) below.

plot(fit$information, col = c("red", "green", "blue", "orange", "black", "magenta", "cyan"), main = "'Information' token in free text")
legend("bottomleft", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "magenta", "cyan"))

plot(fit$safe, col = c("red", "green", "blue", "orange", "black", "magenta", "cyan"), main = "'Safe' token in free text")
legend("bottomright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "magenta", "cyan"))

plot(fit$family, col = c("red", "green", "blue", "orange", "black", "magenta", "cyan"), main = "'Family' token in free text")
legend("bottomleft", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "magenta", "cyan"))

plot(fit$health, col = c("red", "green", "blue", "orange", "black", "magenta", "cyan"), main = "'Health' token in free text")
legend("topleft", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "magenta", "cyan"))

plot(fit$government, col = c("red", "green", "blue", "orange", "black", "magenta", "cyan"), main = "'Government' token in free text")
legend("bottomright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black", "magenta", "cyan"))

The wordclouds below are generated from participants' complete free text conversations with the chatbot. We generate a separate wordcloud for each subgroup of participants by preferred treatment.
More safety evidence:
vector_wc <-
df %>%
filter(best_treatment == "More safety evidence") %>%
select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>%
unite("text", 2:30, na.rm = T, remove = T, sep = ". ") %>%
mutate(
text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Family/friend endorses it:
vector_wc <-
df %>%
filter(best_treatment == "Family/friend endorses it") %>%
select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>%
unite("text", 2:30, na.rm = T, remove = T, sep = ". ") %>%
mutate(
text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Job/school required it:
vector_wc <-
df %>%
filter(best_treatment == "Job/school required it") %>%
select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>%
unite("text", 2:30, na.rm = T, remove = T, sep = ". ") %>%
mutate(
text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Convenient to vaccinate:
vector_wc <-
df %>%
filter(best_treatment == "Convenient to vaccinate") %>%
select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>%
unite("text", 2:30, na.rm = T, remove = T, sep = ". ") %>%
mutate(
text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Nothing:
vector_wc <-
df %>%
filter(best_treatment == "Nothing") %>%
select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>%
unite("text", 2:30, na.rm = T, remove = T, sep = ". ") %>%
mutate(
text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Other/Something else:
vector_wc <-
df %>%
filter(best_treatment %in% c("Other", "No, something else")) %>%
select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>%
unite("text", 2:30, na.rm = T, remove = T, sep = ". ") %>%
mutate(
text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

As another unsupervised approach, we fit a topic model to respondents' free text answers.
data_vector <-
df %>%
select(best_treatment, contains(c("_other", "_explain")), opinion_friend_family, opinion_conversation, motive_reason, motive_misunderstood_explain, ability_reason, ability_misunderstood_explain, best_treatment_proposal, best_treatment_none_explain, info_confidence_explain, challenge_have_covid, self_reflection_explain) %>%
unite("text", 2:30, na.rm = T, remove = T, sep = ". ") %>%
mutate(
text = gsub("http[^[:space:]]*", "", text) %>% str_to_lower(),
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|covid|vaccination")
) %>%
pull(text)
# Create corpus
corpus <- Corpus(VectorSource(data_vector))
# Clean corpus
docs <-
corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
minimumFrequency <- 5
# keep only terms that appear in at least `minimumFrequency` documents
DTM <- DocumentTermMatrix(docs, control = list(bounds = list(global = c(minimumFrequency, Inf))))
# drop documents left empty after pruning (LDA requires non-empty rows)
sel_idx <- slam::row_sums(DTM) > 0
DTM <- DTM[sel_idx, ]

We first inspect the ideal number of topics using two metrics: CaoJuan2009, which measures average similarity between topic-word distributions (lower is better), and Deveaud2014, which measures average divergence between topics (higher is better).
result <-
ldatuning::FindTopicsNumber(
DTM,
topics = seq(from = 2, to = 20, by = 1),
metrics = c("CaoJuan2009", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
verbose = TRUE
)

## fit models... done.
## calculate metrics:
## CaoJuan2009... done.
## Deveaud2014... done.
ldatuning::FindTopicsNumber_plot(result)

Based on the plots above, 6 seems a decent number of topics to start with, as it has a low CaoJuan2009 value and a high Deveaud2014 value.
K <- 6
# set random number generator seed
set.seed(94305)
# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))

## K = 6; V = 2578; M = 7227
## Sampling 1000 iterations!
## Iteration 25 ...
## [... progress printed every 25 iterations ...]
## Iteration 1000 ...
## Gibbs sampling completed!
Top 10 terms per topic:
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)
# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)
# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms # get beta from results
# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics
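# sanity check: beta rows (one distribution over terms per topic) and theta
# rows (one distribution over topics per document) should each sum to 1, e.g.
# rowSums(beta)[1:3]; rowSums(theta)[1:3]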
terms(topicModel, 10)

## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "people" "good" "dont" "effects" "information" "think"
## [2,] "get" "virus" "know" "side" "health" "take"
## [3,] "will" "yes" "nothing" "safe" "awareness" "believe"
## [4,] "family" "help" "just" "true" "isolation" "can"
## [5,] "friends" "getting" "really" "getting" "available" "need"
## [6,] "many" "helpful" "much" "sure" "importance" "taking"
## [7,] "got" "real" "well" "fear" "availability" "say"
## [8,] "also" "important" "like" "body" "media" "one"
## [9,] "say" "disease" "said" "safety" "government" "didnt"
## [10,] "time" "government" "trust" "others" "education" "made"
top3termsPerTopic <- terms(topicModel, 3)
# concatenate the three most likely terms of each topic into a pseudo-name for that topic
topicNames <- apply(top3termsPerTopic, 2, paste, collapse = " ")

Word cloud for topic 1:
# visualize topic 1 as a word cloud
topicToViz <- 1
# select the 100 most probable terms for the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# the corresponding probabilities are the sorted values themselves
probabilities <- top100terms
# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 2:
# visualize topic 2 as a word cloud
topicToViz <- 2
# select the 100 most probable terms for the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# the corresponding probabilities are the sorted values themselves
probabilities <- top100terms
# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 3:
# visualize topic 3 as a word cloud
topicToViz <- 3
# select the 100 most probable terms for the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# the corresponding probabilities are the sorted values themselves
probabilities <- top100terms
# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 4:
# visualize topic 4 as a word cloud
topicToViz <- 4
# select the 100 most probable terms for the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# the corresponding probabilities are the sorted values themselves
probabilities <- top100terms
# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 5:
# visualize topic 5 as a word cloud
topicToViz <- 5
# select the 100 most probable terms for the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# the corresponding probabilities are the sorted values themselves
probabilities <- top100terms
# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)

Word cloud for topic 6:
# visualize topic 6 as a word cloud
topicToViz <- 6
# select the 100 most probable terms for the topic by sorting the term-topic probability vector in decreasing order
top100terms <- sort(tmResult$terms[topicToViz, ], decreasing = TRUE)[1:100]
words <- names(top100terms)
# the corresponding probabilities are the sorted values themselves
probabilities <- top100terms
# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)
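Finally, the topic pseudo-names can be attached back to individual conversations. A minimal sketch using the `theta` matrix computed above:

# sketch: label each document with the pseudo-name of its most probable topic
primary_topic <- topicNames[apply(theta, 1, which.max)]
table(primary_topic)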