## Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir)
set.seed(94305)## Data Loading and Merging
### read ads data for 4 pilot waves
ads_v7 <-
read.csv("data/ads_data_latest.csv") %>%
rename(original_ref = Ad.ID, ad_name = Ad.name) %>%
mutate(
`Analysis 3 - impediment theme` = str_sub(ad_name, 8) %>% str_to_lower()
) %>%
relocate(original_ref, ad_name)
ads_v6 <-
read.csv("data/old/ads_data_v6.csv") %>%
rename(original_ref = Ad.ID, ad_name = Ad.name) %>%
# group_by(ad_name) %>%
# summarise_at(vars(Impressions, Link.clicks, Results, `Amount.spent..USD.`), ~ sum(., na.rm = TRUE)) %>%
remove_empty() %>%
inner_join(
read_csv(here::here("data/Pilot_creative_text.xlsx - Pilot_5.csv")) %>% rename(ad_name = Ad),
by = "ad_name"
) %>%
relocate(original_ref, ad_name)
ads_v5 <-
read.csv("data/old/ads_data_v5.csv") %>%
rename(original_ref = Ad.ID, ad_name = Ad.name) %>%
remove_empty() %>%
inner_join(
read_csv(here::here("data/Pilot_creative_text.xlsx - Pilot_5.csv")) %>% rename(ad_name = Ad),
by = "ad_name"
) %>%
relocate(original_ref, ad_name)
ads_v4b <-
read.csv("data/old/ads_data_v4b.csv") %>%
rename(original_ref = Ad.ID, ad_name = Ad.name) %>%
remove_empty() %>%
inner_join(
read_csv(here::here("data/Pilot_4b_creative_text.xlsx - Creative Reporting.csv")) %>% rename(ad_name = Ad),
by = "ad_name"
) %>%
relocate(original_ref, ad_name)## read in CURRENT chatfuel data
# full data
df_full_v7 <-
read_csv("data/latest/Share_Your_Voice_v7.csv") %>%
clean_names() %>%
mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
mutate(
first_name = if_else(is.na(first_name), "", first_name),
middle_name = if_else(is.na(middle_name), "", middle_name),
last_name = if_else(is.na(last_name), "", last_name),
full_name = str_c(first_name, middle_name, last_name, sep = " "),
full_name_short = str_c(first_name, last_name, sep = " "),
date = lubridate::date(signed_up)
) %>%
filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
filter(version == "ALP_May") %>%
mutate(original_ref = parse_number(original_ref)) %>%
mutate(
motive = if_else(str_detect(motive, "yes"), "yes", "no"),
motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
) %>%
remove_empty()
# filter to completes
df_v7 <-
df_full_v7 %>%
filter(full_complete == "yes") %>%
drop_na(vax_status) %>%
mutate(
phone_number = str_replace_all(phone_number, " ", ""),
phone_number = str_replace_all(phone_number, "-", ""),
) %>%
arrange(phone_number, last_seen) %>%
distinct(phone_number, .keep_all = T) %>%
remove_empty()
## OLD WAVES
# v6
df_full_v6 <-
read_csv("data/latest/Share_Your_Voice_v6_ALP.csv") %>%
clean_names() %>%
mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
mutate(
first_name = if_else(is.na(first_name), "", first_name),
middle_name = if_else(is.na(middle_name), "", middle_name),
last_name = if_else(is.na(last_name), "", last_name),
full_name = str_c(first_name, middle_name, last_name, sep = " "),
full_name_short = str_c(first_name, last_name, sep = " "),
date = lubridate::date(signed_up)
) %>%
filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
filter(date > "2022-03-23", !is.na(version)) %>%
mutate(original_ref = parse_number(original_ref)) %>%
mutate(
motive = if_else(str_detect(motive, "yes"), "yes", "no"),
motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
) %>%
# left_join(ads_v6, by = "original_ref") %>%
remove_empty()
# filtering for survey completes
df_v6 <-
df_full_v6 %>%
filter(version == "pilot_6", full_complete == "yes") %>%
drop_na(vax_status) %>%
mutate(
phone_number = str_replace_all(phone_number, " ", ""),
phone_number = str_replace_all(phone_number, "-", ""),
) %>%
arrange(phone_number, last_seen) %>%
distinct(phone_number, .keep_all = T) %>%
remove_empty()
# v5
df_full_v5 <-
read_csv("data/latest/Share_Your_Voice_2022_02_22_03_35_54.csv") %>%
clean_names() %>% remove_empty() %>%
mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
mutate(
full_name = str_c(first_name, last_name, sep = " "),
date = lubridate::date(signed_up)
) %>%
filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha"))) %>%
filter(date > "2022-02-18", !is.na(version)) %>%
select(-contains("name"), -profile_pic_url, -messenger_user_id) %>%
mutate(original_ref = parse_number(original_ref)) %>%
remove_empty()
df_v5 <-
df_full_v5 %>%
filter(version == "pilot_5", full_complete == "yes") %>%
drop_na(vax_status) %>%
mutate(
phone_number = str_replace_all(phone_number, " ", ""),
phone_number = str_replace_all(phone_number, "-", ""),
) %>%
arrange(phone_number, last_seen) %>%
distinct(phone_number, .keep_all = T) %>%
remove_empty()
# 4b
df_full_v4b <-
read_csv(here::here("data/Social_Impact_Research_Initiative_2022_02_04_17_40_24.csv")) %>%
clean_names() %>%
mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
mutate(
full_name = str_c(first_name, last_name, sep = " ")
) %>%
filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha"))) %>%
select(-contains("name"), -profile_pic_url, -messenger_user_id) %>%
mutate(original_ref = parse_number(original_ref)) %>%
select(-motive_explain) %>%
remove_empty()
df_v4b <-
df_full_v4b %>%
filter(version == "pilot_4b", full_complete == "yes") %>%
drop_na(vax_status) %>%
mutate(
phone_number = str_replace_all(phone_number, " ", ""),
phone_number = str_replace_all(phone_number, "-", ""),
) %>%
arrange(phone_number, last_seen) %>%
distinct(phone_number, .keep_all = T) %>%
remove_empty()
# combining chatfuel data with ads data. we use `df` in most analyses below.
df <-
bind_rows(
df_v7 %>% left_join(ads_v7, by = "original_ref"),
df_v6 %>% left_join(ads_v6, by = "original_ref"),
df_v5 %>% left_join(ads_v5, by = "original_ref"),
df_v4b %>% left_join(ads_v4b, by = "original_ref"),
) %>%
remove_empty()
# combining full datatsets
df_full <-
bind_rows(
df_full_v7 %>% left_join(ads_v7, by = "original_ref"),
df_full_v6 %>% left_join(ads_v6, by = "original_ref"),
df_full_v5 %>% left_join(ads_v5, by = "original_ref"),
df_full_v4b %>% left_join(ads_v6, by = "original_ref"),
) %>%
remove_empty()
ads <-
bind_rows(ads_v4b, ads_v5, ads_v6, ads_v7) %>%
remove_empty()# clean up demographic variables
clean_up_demog <- function(df){
df$gender[!df$gender %in% c("male", "female", NA_character_)] <- "other"
df$ethnicity[!df$ethnicity %in% c("black or african", "coloured", "white or caucasian", "prefer not to say", "asian or indian", NA_character_)] <- "other"
df$income[!df$income %in% c("< R5,000", "R5,000 – R9,999", "R10,000 – R29,999", "R30,000 – R49,999", "R50,000 – R99,999", "> R100,000", "prefer not to say", NA_character_)] <- "other"
df$education[!df$education %in% c("< high school", "high school", "some college", "2-year degree", "4-year degree", "graduate degree", "prefer not to say", NA_character_)] <- "other"
df$politics[!df$politics %in% c("conservative", "moderate", "liberal", "prefer not to say", NA_character_)] <- "other"
df$location[!df$location %in% c("urban", "suburban", "rural", "prefer not to say", NA_character_)] <- "other"
df$religion[grep("christ", tolower(df$religion))] <- "christian"
df$religion[!df$religion %in% c("christian", "african traditional", "islam", "hinduism", "no religion", "prefer not to say", NA_character_)] <- "other"
return(df)
}
df <- clean_up_demog(df)df_v7 %>%
filter(is.na(ability_raw), !is.na(ability)) %>%
glimpse()
df_v7 %>%
tabyl(vax_status_raw, vax_status)
df %>% get_dupes(chatfuel_user_id) %>%
transmute(chatfuel_user_id = as.character(chatfuel_user_id), version, last_seen, dupe_count)Background
COVID-19 vaccine hesitancy has been recognized as a problem across nations. A resistance to getting vaccinated is emerging as a major hurdle, especially in the developing world, where vaccine access issues are still being gradually resolved. Persistent pools of unvaccinated people around the world could present a greater risk for the emergence of new variants of concern. Addressing people’s vaccine hesitancy is hence crucial to curb the spread of COVID-19, and to consequently avert hospitalizations and death.
Objectives
We intend to understand why people are hesitant about getting the COVID-19 vaccine. Hesitancy could not only occur within the unvaccinated population but also in a subset of people who already got vaccinated. Therefore, the first phase of our project has the following objectives:
Approach
We intend to use chatbot as a medium (on Facebook) to conduct conversations with people and understand how we can best achieve the above three objectives. We have run six pilots as of March 2022 – 2 in the United States using Lucid, and 5 in South Africa on Facebook. Our eventual goal will be running this using an interactive, personalized chatbot that enables the conversation to flow more naturally than in a survey format. We are running the pilots in order to – 1) achieve technical proofs of concept, 2) reduce participant recruitment & completion costs (survey completion of unvaccinated participants open to treatment ) before experiment launch, 3) improve chatbot script/forking/engagement before experiment launch, and 4) gather exploratory ideas for impactful covariates and treatments. Our insights from these pilots are detailed ahead.
The sample for this analysis consists of 2271 unvaccinated respondents recruited across pilot waves 4-7 (Jan-May 2022). The related GitHub issue is here.
We start with cleaning all chatbot features to be used later in our models.
We start by cleaning and filtering our data frames to features we expect to be useful for modeling. We consider the following 4 sets of features:
against_beliefs (vaccine is
against my beliefs), no_benefits (vaccine has no benefits),
risky (vaccine is risky). These are coded as binary
(0/1).no_time (no time to get vaxxed),
no_money (no money to get vaxxed),
no_availability (vaccine not easily available). These are
coded as binary (0/1).ad_theme_risky (vaccine is risky) and
ad_theme_unnecessary (vaccine is not necessary). These are
coded as binary (0/1).ad_text_airtime is a binary (0/1) indicating
whether the ad text explicitly mentions that respondents will receive
mobile airtime.elaboration: The average number of characters typed by
a respondent across all free text questions faced. We normalize this
variable before clustering.politeness: Politeness score calculated across all free
text typed by a respondent. This ranges from 0 to 104 in our sample
(based on number of politeness attributes touched) and is constructed
using Mike Yeomans’ politeness
package. We normalize this variable before running the clustering
algorithm. Higher score means more polite text.receptivity: Receptivity score is a continuous score
calculated across all free text typed by a respondent. This is also
constructed using Mike Yeomans’ politeness package. We normalize it
before running the clustering algorithm. Higher score means more
receptive text.sentiment: Sentiment polarity (range -1 to 1) for all
free text typed by a respondent.age: Participant age in yearsfemale: 1 if female, 0 if maleincome: 0 if the participant is unemployed, 1 if
household income < R5,000, 2 if household income in R5,000 – R9,999,
…, 6 if household income > R100,000education: 1 if the participant’s education < high
school, 2 if education is high school, …, 6 if education is a graduate
degreereligiosity: 1 if the participant is not very
religious, 2 if somewhat religious, 3 if very religiouspolitics: 1 if the participant is conservative, 2 if
moderate, 3 if liberallocation: 1 if the participant lives in rural, 2 if
suburban, 3 if urban,white: 1 if the participant is a white or caucasian, 0
if notAll continuous variables are normalized before clustering.
Since we can’t feed missing values to a clustering algorithm, we
impute any missingness in demographic variables using a linear
imputation model building on values from female,
white, and age.
df_sentiment <-
df %>%
select(contains(c("_other", "_explain"))) %>%
unite("text", 1:56, na.rm = T, remove = T, sep = ". ") %>%
pull(text) %>%
get_sentences() %>%
sentiment_by() %>%
bind_cols(df %>% select(chatfuel_user_id)) %>%
transmute(chatfuel_user_id, sentiment = ave_sentiment)
df_polite <-
df %>%
select(contains(c("_other", "_explain"))) %>%
unite("text", 1:56, na.rm = T, remove = T, sep = ". ") %>%
politeness::politeness(.$text, parser = "spacy") %>%
transmute(politeness = rowSums(.)) %>%
bind_cols(df %>% select(chatfuel_user_id))## ================================================================================
# df_receptive <-
# df %>%
# select(contains(c("_other", "_explain"))) %>%
# unite("text", 1:56, na.rm = T, remove = T, sep = ". ") %>%
# transmute(receptivity = politeness::receptiveness(.)) %>%
# bind_cols(df %>% select(chatfuel_user_id))
df_elaboration <-
df %>%
mutate(age = parse_number(cv_age)) %>%
filter(vax_status == "unvax", age <= 120) %>%
select(ends_with(c("_other", "_explain"))) %>%
mutate_all(~ nchar(.)) %>%
transmute(
elaboration =
pmap_dbl(
.,
~ mean(c(...), na.rm = TRUE)
)
)df_ad_features <-
df %>%
mutate(age = parse_number(cv_age)) %>%
filter(vax_status == "unvax", age <= 120) %>%
transmute(
chatfuel_user_id,
# ad_image = `Analysis 2 - image`,
ad_theme = `Analysis 3 - impediment theme`,
ad_text = `Analysis 4 - body text`
) %>%
fastDummies::dummy_cols(
select_columns = c("ad_theme"), remove_first_dummy = TRUE,
remove_selected_columns = TRUE, ignore_na = TRUE
) %>%
mutate(ad_text_airtime = if_else(ad_text == "airtime", 1L, 0L) %>% replace_na(0)) %>%
select(chatfuel_user_id, starts_with(c("ad_text_", "ad_theme_"))) %>%
mutate_at(vars(starts_with("ad_theme")), ~ replace_na(., 0)) %>%
bind_cols(df_elaboration) %>%
left_join(df_polite, by = "chatfuel_user_id") %>%
#left_join(df_receptive, by = "chatfuel_user_id") %>%
left_join(df_sentiment, by = "chatfuel_user_id") %>%
distinct(chatfuel_user_id, .keep_all = T)
df_features_raw <-
df %>%
mutate(age = parse_number(cv_age)) %>%
filter(vax_status == "unvax", age <= 120) %>%
transmute(
# unvax = if_else(vax_status == "unvax", 1L, 0L),
chatfuel_user_id,
age,
female = case_when(
gender == "female" ~ 1,
gender == "male" ~ 0,
),
income = case_when(
income == "Unemployed" ~ 0,
income == "< R5,000" ~ 1,
income == "R5,000 – R9,999" ~ 2,
income == "R10,000 – R29,999" ~ 3,
income == "R30,000 – R49,999" ~ 4,
income == "R50,000 – R99,999" ~ 5,
income == "> R100,000" ~ 6,
),
education = case_when(
education == "< high school" ~ 1,
education == "high school" ~ 2,
education == "some college" ~ 3,
education == "2-year degree" ~ 4,
education == "4-year degree" ~ 5,
education == "graduate degree" ~ 6,
),
religiosity = case_when(
religiosity == "not very religious" ~ 1,
religiosity == "somewhat religious" ~ 2,
religiosity == "very religious" ~ 3,
),
politics = case_when(
politics == "conservative" ~ 1,
politics == "moderate" ~ 2,
politics == "liberal" ~ 3,
),
location = case_when(
location == "rural" ~ 1,
location == "suburban" ~ 2,
location == "urban" ~ 3,
),
white = case_when(
ethnicity == "white or caucasian" ~ 1,
ethnicity != "white or caucasian" ~ 0
),
# no_ability = case_when(
# ability == "no" ~ 1,
# ability == "yes" ~ 0,
# ),
# no_motive = case_when(
# motive == "no" ~ 1,
# motive == "yes" ~ 0,
# ),
against_beliefs = if_else(motive_main == "belief", 1L, 0L) %>% replace_na(0),
no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0),
risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0),
no_time = if_else(ability_main == "time", 1L, 0L) %>% replace_na(0),
no_money = if_else(ability_main == "money", 1L, 0L) %>% replace_na(0),
no_availability = if_else(ability_main == "availability", 1L, 0L) %>% replace_na(0),
) %>%
distinct(chatfuel_user_id, .keep_all = T) %>%
left_join(df_ad_features, by = "chatfuel_user_id") %>%
impute_lm(politics + income + religiosity + location + education ~ female + white + age) %>%
drop_na() %>%
mutate_at(vars(age, elaboration, sentiment, politeness), ~ scale(.) %>% as.vector()) %>%
relocate(chatfuel_user_id)
#df_features %>% summarise_all(~ sum(is.na(.), na.rm = T))
df_features <- df_features_raw %>% select(-chatfuel_user_id)Here are summary statistics for the features fed into the algorithm:
df_features %>%
select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>%
clean_names(case = "title") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))We start with looking at multinomial inverse regressions (MNIR) to understand how chatbot free text responses connect with preferred treatments. In MNIR, we regress the token counts on a set of known attributes (preferred treatments in our case).
Creating counts and covariate matrices:
## TOKENS ##
freetext <-
df %>%
mutate(
best_treatment = str_to_sentence(best_treatment),
best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
) %>%
drop_na(best_treatment) %>%
select(contains(c("_other", "_explain"))) %>%
mutate_all(~ str_to_lower(.) %>% str_squish() %>% str_remove_all("no|yes")) %>%
unite("text", 1:56, na.rm = T, remove = T, sep = ". ") %>%
drop_na() %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(freetext))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
matrix <- t(matrix)
## COVARIATES ##
covars_df <-
df %>%
transmute(
best_treatment = str_to_sentence(best_treatment),
best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
) %>%
drop_na(best_treatment) %>%
dummy_cols(select_columns = "best_treatment", remove_first_dummy = TRUE, remove_selected_columns = TRUE)
covars <- covars_df %>% as.matrix()
covar_names <-
df %>%
transmute(
best_treatment = str_to_sentence(best_treatment),
best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_),
) %>%
drop_na(best_treatment) %>%
dummy_cols(select_columns = "best_treatment", remove_first_dummy = TRUE, remove_selected_columns = TRUE) %>%
colnames() %>%
str_sub(., 16, -1)
# words[1:99]Fitting the model:
cl <- makeCluster(detectCores())
fit <- mnlm(cl, covars, matrix)
stopCluster(cl)
B <- coef(fit)Consolidating takeaways:
Side effect utterances are positively related to
more transparency treatment, but not to other
treatments.Family related utterances are negatively related to all
treatments.Fear related utterances are negatively related to all
treatments, except mandates where it sees no effect.Time related utterances are positively (but weakly)
related to rewards for vaxxing treatment, but negatively
related to other treatments.Death related utterances are positively (though weakly)
related to job/school mandates treatment, but negatively
related to other treatments.Side effect utterances are positively related to
more transparency treatment, but not to other
treatments.
plot(fit$side, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))Family related utterances are negatively related to all treatments.
plot(fit$family, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))Fear related utterances are negatively related to all treatments, except mandates where it sees no effect.
plot(fit$scared, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))Time related utterances are positively (but weakly) related to
rewards for vaxxing treatment, but negatively related to
other treatments.
plot(fit$time, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))Death related utterances are positively (thougly weakly) related to
job/school mandates treatment, but negatively related to
other treatments.
plot(fit$die, col = c("red", "green", "blue", "orange", "black"))
legend("topright", legend = covar_names, fill = c("red", "green", "blue", "orange", "black"))The following table shows all respondent utterances, sorted within each treatment preferred by MNIR coefficient size.
B %>%
as.matrix() %>%
as_tibble() %>%
tail(5) %>%
bind_cols(
preferred_treatment = covar_names, .
) %>%
pivot_longer(cols = -preferred_treatment) %>%
arrange(preferred_treatment, -value) %>%
mutate(value = round(value, 2)) %>%
select(`Preferred treatment` = preferred_treatment, `Utterance` = name, `Coefficient` = value) %>%
DT::datatable()Let’s move ahead and look at two supervised learning approaches – multinomial logit and random forest. The task here is to try and predict – i) motivation impediments, and ii) preferred treatments, using all available chatbot features.
Here’s an overview of misclassification rates from these supervised approaches:
| Predicting Preferred Treatments | Predicting Impediments | |
|---|---|---|
| Multinomial Logit | 65% | 63% |
| Random Forest | 64% | 68% |
Details on each implementation ahead.
df_mlm <-
df_features_raw %>%
inner_join(
df %>%
transmute(
chatfuel_user_id,
impediment = case_when(
motive_main == "belief" ~ "Against beliefs",
motive_main == "benefit" ~ "No benefits",
motive_main == "risk" ~ "Risky",
TRUE ~ "No impediment"
) %>% as_factor()
) %>% distinct(chatfuel_user_id, .keep_all = T),
by = "chatfuel_user_id"
) %>%
select(-c(against_beliefs, no_benefits, risky, chatfuel_user_id))
df_mlm$impediment <- relevel(df_mlm$impediment, ref = "No impediment")
trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]
mlm1 <-
multinom(impediment ~ ., data = training)## # weights: 96 (69 variable)
## initial value 2202.821740
## iter 10 value 2076.207493
## iter 20 value 2042.066608
## iter 30 value 2020.569589
## iter 40 value 2017.239678
## iter 50 value 2016.942216
## iter 60 value 2016.925929
## final value 2016.925519
## converged
Baseline model with 70/30 train/test split and no interactions
[impediment ~ linear_combination_of_features] for
unvaccinated respondents does a bad job of predicting true motivation
impediments. Misclassification error for this model is high at 63%.
predicted_class <- predict(mlm1, test)
table(`Predicted class` = predicted_class, `True class` = test$impediment)## True class
## Predicted class No impediment No benefits Risky Against beliefs
## No impediment 76 46 52 27
## No benefits 33 49 46 20
## Risky 75 82 117 34
## Against beliefs 7 1 9 8
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)## [1] 0.6334311
df_mlm <-
df_features_raw %>%
inner_join(
df %>%
transmute(
chatfuel_user_id,
best_treatment = str_to_sentence(best_treatment),
best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_) %>%
as_factor()) %>%
distinct(chatfuel_user_id, .keep_all = T),
by = "chatfuel_user_id"
) %>%
select(-c(chatfuel_user_id))
df_mlm$best_treatment <- relevel(df_mlm$best_treatment, ref = "Nothing")
trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]
mlm1 <-
multinom(best_treatment ~ ., data = training)## # weights: 162 (130 variable)
## initial value 2748.559026
## iter 10 value 2450.044814
## iter 20 value 2347.836419
## iter 30 value 2305.934606
## iter 40 value 2294.525788
## iter 50 value 2288.230803
## iter 60 value 2285.837110
## iter 70 value 2285.173795
## iter 80 value 2284.788372
## iter 90 value 2284.629819
## iter 100 value 2284.596110
## final value 2284.596110
## stopped after 100 iterations
Baseline model with 70/30 train/test split and no interactions
[preferred_treatment ~ linear_combination_of_features] for
unvaccinated respondents does a bad job of predicting preferred
treatments. Misclassification error for this model is high at 65%.
predicted_class <- predict(mlm1, test)
table(`Predicted class` = predicted_class, `True class` = test$best_treatment)## True class
## Predicted class Nothing Rewards for vaxxing Trusted info source
## Nothing 82 20 32
## Rewards for vaxxing 2 2 1
## Trusted info source 3 7 3
## Family supports it 8 4 7
## Job/school required 75 27 46
## More transparency 4 0 0
## True class
## Predicted class Family supports it Job/school required More transparency
## Nothing 23 53 28
## Rewards for vaxxing 4 3 1
## Trusted info source 2 4 5
## Family supports it 18 10 2
## Job/school required 43 123 11
## More transparency 0 1 0
mean(as.character(predicted_class) != as.character(test$best_treatment), na.rm = T)## [1] 0.6513761
We now try a random forest model with 70/30 train/test split for the task of predicting impediments from a linear combination of features. The misclassification error still stays as high as 68%.
df_mlm <-
df_features_raw %>%
inner_join(
df %>%
transmute(
chatfuel_user_id,
impediment = case_when(
motive_main == "belief" ~ "Against beliefs",
motive_main == "benefit" ~ "No benefits",
motive_main == "risk" ~ "Risky",
TRUE ~ "No impediment"
) %>% as_factor()
) %>% distinct(chatfuel_user_id, .keep_all = T),
by = "chatfuel_user_id"
) %>%
select(-c(against_beliefs, no_benefits, risky, chatfuel_user_id)) %>%
rename(ad_theme_innocence = `ad_theme_innocence/curiosity`)
df_mlm$impediment <- relevel(df_mlm$impediment, ref = "No impediment")
trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]# impute missing values before feeding into RF
training <-
training %>%
simputation::impute_lm(politics + income + religiosity + location + education ~ female + white) %>%
drop_na()
rf_model <-
randomForest::randomForest(impediment ~ ., data = training %>% clean_names(), importance = TRUE)
predicted_class <- predict(rf_model, test %>% clean_names())
table(`Predicted class` = predicted_class, `True class` = test$impediment)## True class
## Predicted class No impediment No benefits Risky Against beliefs
## No impediment 71 47 60 20
## No benefits 52 46 53 34
## Risky 73 78 99 27
## Against beliefs 5 7 7 3
mean(as.character(predicted_class) != as.character(test$impediment), na.rm = T)## [1] 0.6788856
Given the high misclassification error (68%), the variable importance based on Mean decrease accuracy and Mean decrease Gini are inconsistent.
MeanDecreaseAccuracy gives a rough estimate of the loss
in prediction performance when that particular variable is omitted from
the training set.MeanDecreaseGini: GINI is a measure of node impurity.
Highest purity means that each node contains only elements of a single
class. Assessing the decrease in GINI when that feature is omitted leads
to an understanding of how important that feature is to split the data
correctly.Note: Both these measures are used to rank variables in terms of importance and their absolute values could be disregarded.
randomForest::varImpPlot(rf_model, sort = T, main = "Random Forest - Variable Importance")We now try a random forest model with 70/30 train/test split for the task of predicting preferred treatments from a linear combination of features. The misclassification error still stays as high as 64%.
df_mlm <-
df_features_raw %>%
inner_join(
df %>%
transmute(
chatfuel_user_id,
best_treatment = str_to_sentence(best_treatment),
best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing", best_treatment),
best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required"), best_treatment, NA_character_) %>%
as_factor()) %>%
distinct(chatfuel_user_id, .keep_all = T),
by = "chatfuel_user_id"
) %>%
select(-c(chatfuel_user_id)) %>%
rename(ad_theme_innocence = `ad_theme_innocence/curiosity`)
df_mlm$best_treatment <- relevel(df_mlm$best_treatment, ref = "Nothing")
trainingRows <- sample(1:nrow(df_mlm), 0.7 * nrow(df_mlm))
training <- df_mlm[trainingRows, ]
test <- df_mlm[-trainingRows, ]# impute missing values before feeding into RF
training <-
training %>%
simputation::impute_lm(politics + income + religiosity + location + education ~ female + white) %>%
drop_na()
rf_model <-
randomForest::randomForest(best_treatment ~ ., data = training %>% clean_names(), importance = TRUE)
predicted_class <- predict(rf_model, test %>% clean_names())
table(`Predicted class` = predicted_class, `True class` = test$best_treatment)## True class
## Predicted class Nothing Rewards for vaxxing Trusted info source
## Nothing 89 13 31
## Rewards for vaxxing 2 3 1
## Trusted info source 2 2 4
## Family supports it 7 3 5
## Job/school required 90 37 55
## More transparency 1 1 0
## True class
## Predicted class Family supports it Job/school required More transparency
## Nothing 23 36 24
## Rewards for vaxxing 4 2 0
## Trusted info source 4 2 4
## Family supports it 8 10 2
## Job/school required 53 122 11
## More transparency 1 0 1
mean(as.character(predicted_class) != as.character(test$best_treatment), na.rm = T)## [1] 0.6523737
Given the high misclassification error (65%), the variable importance based on Mean decrease accuracy and Mean decrease Gini are inconsistent.
MeanDecreaseAccuracy gives a rough estimate of the loss
in prediction performance when that particular variable is omitted from
the training set.MeanDecreaseGini: GINI is a measure of node impurity.
Highest purity means that each node contains only elements of a single
class. Assessing the decrease in GINI when that feature is omitted leads
to an understanding of how important that feature is to split the data
correctly.Note: Both these measures are used to rank variables in terms of importance and their absolute values could be disregarded.
randomForest::varImpPlot(rf_model, sort = T, main = "Random Forest - Variable Importance")We start with k-means clustering on all relevant variables collected through the chatbot. We will proceed with the clustering analyses in three steps.
We start by examining the optimal number of clusters we will split our data into. We will look at 2 approaches for this: the elbow method, and the silhouette method. We have also tried the gap statistic in a previous iteration.
First, for the elbow method, we plot the number of clusters versus the within cluster sum of squares. The location of a bend (elbow) in the plot is considered as an indicator of the appropriate number of clusters.
fviz_nbclust(df_features, kmeans, method = "wss")We see there is no clear elbow here. Maybe a very small bend around \(k = 3\).
Second, let’s try the average silhouette approach. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters k is the one that maximizes the average silhouette over a range of possible values for k.
fviz_nbclust(df_features, kmeans, method = "silhouette")We see 2 as the optimal \(k\) from the above pilot, but we err in favor of the next closest silhouette width value at \(k = 3\) (and as we expect to have more than 2 clusters eventually).
Taken together, we have limited consensus around 3 clusters to start us off (we try for larger \(k\)s in the appendix section).
We cluster the data with \(k = 3\). The plot below projects the generated clusters on a 2-d plane, with the axes denoting the first 2 principal components explaining the most variance. We see that the first two components explain very little variation in the data.
km <- kmeans(df_features, centers = 3, nstart = 25)
fviz_cluster(km, data = df_features, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 356 |
| 2 | 1528 |
| 3 | 387 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
bind_cols(df_features, cluster = km$cluster) %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ mean(., na.rm = T)) %>%
bind_rows(
bind_cols(df_features, cluster = km$cluster) %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ sd(., na.rm = T)),
.id = "stat"
) %>%
mutate(stat = if_else(stat == 1, "mean", "sd")) %>%
select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>%
clean_names(case = "title") %>%
pivot_longer(cols = -c(Cluster, Stat)) %>%
mutate(
Cluster = factor(Cluster),
name = name %>% fct_inorder() %>% fct_rev()
) %>%
pivot_wider(names_from = Stat, values_from = value) %>%
mutate(
mean = round(mean, 2),
mean_char = as.character(mean),
mean_char = if_else(mean > 0.4, "0.4+", mean_char),
mean = if_else(mean > 0.4, 0.4, mean),
sd = round(sd, 2),
label_str = str_c(mean_char, " (", sd, ")"),
) %>%
ggplot() +
geom_tile(aes(Cluster, name, fill = mean)) +
geom_text(aes(Cluster, name, label = label_str), size = 3) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-0.4, 0.4)
) +
theme_minimal() +
labs(
x = "Cluster", y = "Feature",
fill = "Means (std)"
)Let’s move ahead and look at respondents’ free text by each cluster.
While looking at respondents’ free text by each cluster, we will consider both word clouds and tabulated text.
Cluster 1:
vector_wc <-
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 1) %>%
count(motive_other) %>%
arrange(-n) %>%
mutate(
motive_other = str_to_lower(motive_other),
motive_other = str_replace(motive_other, "side effects", "sideeffects"),
motive_other = str_replace(motive_other, "vaccine", ""),
motive_other = str_replace(motive_other, "yes", ""),
motive_other = str_replace(motive_other, "no", ""),
motive_other = str_replace(motive_other, "dont", ""),
motive_other = str_replace(motive_other, "don't", ""),
) %>%
filter(n < 5) %>% # (filtering out generic responses like just yes, no)
pull(motive_other)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Cluster 2:
vector_wc <-
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 2) %>%
count(motive_other) %>%
arrange(-n) %>%
mutate(
motive_other = str_to_lower(motive_other),
motive_other = str_replace(motive_other, "side effects", "sideeffects"),
motive_other = str_replace(motive_other, "vaccine", ""),
motive_other = str_replace(motive_other, "yes", ""),
motive_other = str_replace(motive_other, "no", ""),
motive_other = str_replace(motive_other, "dont", ""),
motive_other = str_replace(motive_other, "don't", ""),
) %>%
filter(n < 5) %>% # (filtering out generic responses like just yes, no)
pull(motive_other)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Cluster 3:
vector_wc <-
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 3) %>%
count(motive_other) %>%
arrange(-n) %>%
mutate(
motive_other = str_to_lower(motive_other),
motive_other = str_replace(motive_other, "side effects", "sideeffects"),
motive_other = str_replace(motive_other, "vaccine", ""),
motive_other = str_replace(motive_other, "yes", ""),
motive_other = str_replace(motive_other, "no", ""),
motive_other = str_replace(motive_other, "dont", ""),
motive_other = str_replace(motive_other, "don't", ""),
) %>%
filter(n < 5) %>% # (filtering out generic responses like just yes, no)
pull(motive_other)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Cluster 1:
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 1) %>%
count(motive_other) %>%
arrange(-n, motive_other) %>%
filter(n < 2) # (filtering out generic responses like just yes, no)Cluster 2:
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 2) %>%
count(motive_other) %>%
arrange(-n, motive_other) %>%
filter(n < 2) # (filtering out generic responses like just yes, no)Cluster 3:
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 3) %>%
count(motive_other) %>%
arrange(-n, motive_other) %>%
filter(n < 5) # (filtering out generic responses like just yes, no)Cluster 1:
vector_wc <-
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 1) %>%
count(best_treatment_other) %>%
arrange(-n) %>%
mutate(
best_treatment_other = str_to_lower(best_treatment_other),
best_treatment_other = str_replace(best_treatment_other, "side effects", "sideeffects"),
best_treatment_other = str_replace(best_treatment_other, "vaccine", ""),
best_treatment_other = str_replace(best_treatment_other, "yes", ""),
best_treatment_other = str_replace(best_treatment_other, "no", ""),
best_treatment_other = str_replace(best_treatment_other, "dont", ""),
best_treatment_other = str_replace(best_treatment_other, "don't", ""),
) %>%
filter(n < 5) %>% # (filtering out generic responses like just yes, no)
pull(best_treatment_other)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Cluster 2:
vector_wc <-
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 2) %>%
count(best_treatment_other) %>%
arrange(-n) %>%
mutate(
best_treatment_other = str_to_lower(best_treatment_other),
best_treatment_other = str_replace(best_treatment_other, "side effects", "sideeffects"),
best_treatment_other = str_replace(best_treatment_other, "vaccine", ""),
best_treatment_other = str_replace(best_treatment_other, "yes", ""),
best_treatment_other = str_replace(best_treatment_other, "no", ""),
best_treatment_other = str_replace(best_treatment_other, "dont", ""),
best_treatment_other = str_replace(best_treatment_other, "don't", ""),
) %>%
filter(n < 5) %>% # (filtering out generic responses like just yes, no)
pull(best_treatment_other)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Cluster 3:
vector_wc <-
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 3) %>%
count(best_treatment_other) %>%
arrange(-n) %>%
mutate(
best_treatment_other = str_to_lower(best_treatment_other),
best_treatment_other = str_replace(best_treatment_other, "side effects", "sideeffects"),
best_treatment_other = str_replace(best_treatment_other, "vaccine", ""),
best_treatment_other = str_replace(best_treatment_other, "yes", ""),
best_treatment_other = str_replace(best_treatment_other, "no", ""),
best_treatment_other = str_replace(best_treatment_other, "dont", ""),
best_treatment_other = str_replace(best_treatment_other, "don't", ""),
) %>%
filter(n < 5) %>% # (filtering out generic responses like just yes, no)
pull(best_treatment_other)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))Cluster 1:
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 1) %>%
count(best_treatment_other) %>%
arrange(-n, best_treatment_other) %>%
filter(n < 5) # (filtering out generic responses like just yes, no)Cluster 2:
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 2) %>%
count(best_treatment_other) %>%
arrange(-n, best_treatment_other) %>%
filter(n < 2) # (filtering out generic responses like just yes, no)Cluster 3:
bind_cols(df_features_raw, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
left_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 3) %>%
count(best_treatment_other) %>%
arrange(-n, best_treatment_other) %>%
filter(n < 5) # (filtering out generic responses like just yes, no)Clustering trials with 4/5/6 clusters:
We cluster the data with \(k = 4\) here. The plot below projects the generated clusters on a 2-d plane, with the axes denoting the first 2 principal components explaining the most variance. We see that the first two components explain very little variation in the data.
km <- kmeans(df_features, centers = 4, nstart = 25)
fviz_cluster(km, data = df_features, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 287 |
| 2 | 160 |
| 3 | 1480 |
| 4 | 344 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
bind_cols(df_features, cluster = km$cluster) %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ mean(., na.rm = T)) %>%
bind_rows(
bind_cols(df_features, cluster = km$cluster) %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ sd(., na.rm = T)),
.id = "stat"
) %>%
mutate(stat = if_else(stat == 1, "mean", "sd")) %>%
select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>%
clean_names(case = "title") %>%
pivot_longer(cols = -c(Cluster, Stat)) %>%
mutate(
Cluster = factor(Cluster),
name = name %>% fct_inorder() %>% fct_rev()
) %>%
pivot_wider(names_from = Stat, values_from = value) %>%
mutate(
mean = round(mean, 2),
mean_char = as.character(mean),
mean_char = if_else(mean > 0.4, "0.4+", mean_char),
mean = if_else(mean > 0.4, 0.4, mean),
sd = round(sd, 2),
label_str = str_c(mean_char, " (", sd, ")"),
) %>%
ggplot() +
geom_tile(aes(Cluster, name, fill = mean)) +
geom_text(aes(Cluster, name, label = label_str), size = 3) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-0.4, 0.4)
) +
theme_minimal() +
labs(
x = "Cluster", y = "Feature",
fill = "Means (std)"
)We cluster the data with \(k = 5\) here. The plot below projects the generated clusters on a 2-d plane, with the axes denoting the first 2 principal components explaining the most variance. We see that the first two components explain very little variation in the data.
km <- kmeans(df_features, centers = 5, nstart = 25)
fviz_cluster(km, data = df_features, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 258 |
| 2 | 542 |
| 3 | 320 |
| 4 | 171 |
| 5 | 980 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
bind_cols(df_features, cluster = km$cluster) %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ mean(., na.rm = T)) %>%
bind_rows(
bind_cols(df_features, cluster = km$cluster) %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ sd(., na.rm = T)),
.id = "stat"
) %>%
mutate(stat = if_else(stat == 1, "mean", "sd")) %>%
select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>%
clean_names(case = "title") %>%
pivot_longer(cols = -c(Cluster, Stat)) %>%
mutate(
Cluster = factor(Cluster),
name = name %>% fct_inorder() %>% fct_rev()
) %>%
pivot_wider(names_from = Stat, values_from = value) %>%
mutate(
mean = round(mean, 2),
mean_char = as.character(mean),
mean_char = if_else(mean > 0.4, "0.4+", mean_char),
mean = if_else(mean > 0.4, 0.4, mean),
mean_char = if_else(mean < -0.4, "-0.4", mean_char),
mean = if_else(mean < -0.4, -0.4, mean),
sd = round(sd, 2),
label_str = str_c(mean_char, " (", sd, ")"),
) %>%
ggplot() +
geom_tile(aes(Cluster, name, fill = mean)) +
geom_text(aes(Cluster, name, label = label_str), size = 3) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-0.4, 0.4)
) +
theme_minimal() +
labs(
x = "Cluster", y = "Feature",
fill = "Means (std)"
)We cluster the data with \(k = 6\) here. The plot below projects the generated clusters on a 2-d plane, with the axes denoting the first 2 principal components explaining the most variance. We see that the first two components explain very little variation in the data.
km <- kmeans(df_features, centers = 6, nstart = 25)
fviz_cluster(km, data = df_features, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 224 |
| 2 | 299 |
| 3 | 271 |
| 4 | 426 |
| 5 | 903 |
| 6 | 148 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
bind_cols(df_features, cluster = km$cluster) %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ mean(., na.rm = T)) %>%
bind_rows(
bind_cols(df_features, cluster = km$cluster) %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ sd(., na.rm = T)),
.id = "stat"
) %>%
mutate(stat = if_else(stat == 1, "mean", "sd")) %>%
select(against_beliefs, no_benefits, risky, no_time, no_money, no_availability, ad_text_airtime, ad_theme_risky, ad_theme_unnecessary, ad_theme_inaccessible, ad_theme_neutral, ad_theme_innocence = `ad_theme_innocence/curiosity`, ad_theme_fear, ad_theme_familyvalues, elaboration, politeness, sentiment, age, female, income, education, religiosity, politics, location, white, everything()) %>%
clean_names(case = "title") %>%
pivot_longer(cols = -c(Cluster, Stat)) %>%
mutate(
Cluster = factor(Cluster),
name = name %>% fct_inorder() %>% fct_rev()
) %>%
pivot_wider(names_from = Stat, values_from = value) %>%
mutate(
mean = round(mean, 2),
mean_char = as.character(mean),
mean_char = if_else(mean > 0.4, "0.4+", mean_char),
mean = if_else(mean > 0.4, 0.4, mean),
mean_char = if_else(mean < -0.4, "-0.4", mean_char),
mean = if_else(mean < -0.4, -0.4, mean),
sd = round(sd, 2),
label_str = str_c(mean_char, " (", sd, ")"),
) %>%
ggplot() +
geom_tile(aes(Cluster, name, fill = mean)) +
geom_text(aes(Cluster, name, label = label_str), size = 3) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-0.4, 0.4)
) +
theme_minimal() +
labs(
x = "Cluster", y = "Feature",
fill = "Means (std)"
)