The present user segmentation analysis is based on 7318 respondents completing our survey for pilot wave 8 (August 2022), and 81 features (both MCQ and free text). The related GitHub issue is here.
Analysis outline
WIP Takeaways for user segments
Summary stats:
Clustering:
Old:
more safety evidence can work as treatment.family/friend endorsement can work as treatment.getting time off work can work as
treatment.Rewards to get vaccinated can work as treatment for all
kinds of impediments (both motivation and ability), though less so for
more educated and Black participants.no treatment will work (which is 4% of our sample) consists
of respondents who are more religious and lack motivation to get the
vaccine (vaccine is against beliefs/is risky/has no benefits).# Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, glmnet, gamlr, tm, topicmodels, ldatuning, lda, SnowballC, olsrr, here)
set.seed(94305)
# read in CURRENT chatfuel data
# full data
df_full_v8 <-
read_csv(here("pilot8/data/full_df_clean.csv")) %>%
clean_names() %>%
mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
mutate(
motive = if_else(str_detect(motive, "yes"), "yes", "no"),
motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main),
best_treatment = str_remove_all(best_treatment, "\\.") %>% str_to_sentence(),
best_treatment = if_else(best_treatment == "New trusted info", "More safety evidence", best_treatment),
) %>%
remove_empty("rows")
# filter to completes
df <-
df_full_v8 %>%
filter(full_complete == "complete") %>%
drop_na(vax_status) %>%
left_join(read_csv(here("pilot8/data/df_features_treatment_proposal.csv")) %>% select(id, treatment_text = best_treatment_proposal_feature), by = "id") %>%
remove_empty("rows")We start with cleaning chatbot features to be used in our models. Below are summary statistics for the cleaned features fed into the models.
# clean up demographic variables
df_features <-
df %>%
transmute(
age,
education = education_num,
religiosity = religiosity_num,
location = location_num,
black_or_african = if_else(ethnicity == "black or african", 1L, 0L) %>% replace_na(0),
vaccinated = vax_status_num,
covid_already = if_else(str_detect(covid_already, "No|no"), 0L, 1L) %>% replace_na(0),
covid_vax_future = if_else(str_detect(covid_vax_future, "Likely|Maybe"), 1L, 0L) %>% replace_na(0),
no_motive = case_when(
motive == "yes" ~ 0L,
motive == "no" ~ 1L,
),
motive_elaboration = motive_nchar,
no_ability = case_when(
ability == "easy" ~ 0L,
ability != "easy" & !is.na(ability) ~ 1L,
) %>% replace_na(0),
ability_elaboration = ability_nchar %>% replace_na(0),
against_beliefs = if_else(motive_main == "beliefs", 1L, 0L) %>% replace_na(0),
no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0),
risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0),
no_time = if_else(ability_main == "time", 1L, 0L) %>% replace_na(0),
no_money = if_else(ability_main == "money", 1L, 0L) %>% replace_na(0),
no_availability = if_else(ability_main == "availability", 1L, 0L) %>% replace_na(0),
# info variables
info_source = str_squish(info_source) %>% str_to_lower(),
info_source = case_when(
str_detect(info_source, "social|facebook|twitter|whatsapp|fb|insta|tiktok|youtube") ~ "social media",
str_detect(info_source, "online|google|internet|web") ~ "internet",
str_detect(info_source, "media|tv|television") ~ "television",
str_detect(info_source, "radio") ~ "radio",
str_detect(info_source, "news") ~ "news",
str_detect(info_source, "friend|family|work|elder") ~ "friends & family",
str_detect(info_source, "health|hospital|clinic") ~ "hospital",
str_detect(info_source, "school|teach|educ") ~ "school",
),
info_confidence = str_to_lower(info_confidence),
info_confidence_high = if_else(str_detect(info_confidence, "very"), 1L, 0L) %>% replace_na(0),
want_link = if_else(str_detect(want_link, "Sure"), 1L, 0L) %>% replace_na(0),
want_answer = if_else(str_detect(want_answer, "Sure"), 1L, 0L) %>% replace_na(0),
self_reflection = if_else(self_reflection == "A lot!", 1L, 0L) %>% replace_na(0),
best_treatment,
lack_motive = case_when(
str_detect(motive_reason, "(((negative|sides?) ?-?)?(a|e)fff?ects?[^ive]|ma(k|d)(e|es|ing) (me|people|them|you) sick|g(e|o)t(ting)? sick|more sick|sick effect)|(after|side|the) effect|react|bad effect|aftermath|vomit|ache|cause health problems|dizzy|negative effect|sick after|clot|fever|fatigue|health complication afterward|weakens my body|getting ill|collapse|falling down|dizziness") ~ "side_effects",
str_detect(motive_reason, "needles?|injections?") ~ "needles_injection",
str_detect(motive_reason, "not good for my body|safe|harm|dangerous|dangers|unhealthy|bad for my health") ~ "unsafe",
str_detect(motive_reason, "lab rats|paralyzed|illuminati|rumurs|infertil|misinformed|mis ?information|ro?umou?rs?|myths?|theories|conspiracy|scam|zombie|controvers|romours|robot|political|politics|criticism|propaganda|gossip|online|misleading info|false info|misconceptions?|(fake )?news|article|magnetic|hoax|fake") ~ "misinformation",
str_detect(motive_reason, "scar(e|y)|fear|afraid") ~ "scared",
str_detect(motive_reason, "((don\'?t|didn'?t|did not|do not|no) (really )?trust|trusted|distrust|doubt|s(k|c)eptical)|not trusting it|untrustworthy|the reviews") ~ "trust",
str_detect(motive_reason, "preg3|am expectant|breastfeeding|pregnan|lactating|newborn|infant|nursing") ~ "pregnancy_nursing",
str_detect(motive_reason, "pain|hurt") ~ "pain",
str_detect(motive_reason, "demonic|religio|church|god|christian|antichrist|666") ~ "religion",
str_detect(motive_reason, "die|death|dead|kill|dying") ~ "death",
str_detect(motive_reason, "soceity judgment|sibling|family|friends?|parents?|brother|sister|uncle|aunt|community|cousin") ~ "family_friends",
str_detect(motive_reason, "risk") ~ "risk",
str_detect(motive_reason, "more research|enough proof|i did see enough proof|no sure prove|unanswered questions|limited data|not informed|knowledge|information|didn't know|i don't (really )?know") ~ "more_information",
str_detect(motive_reason, "government") ~ "government",
str_detect(motive_reason, "benefit") ~ "no_benefit",
str_detect(motive_reason, "effective|no proof it works") ~ "ineffective",
str_detect(motive_reason, "i don'?t have reason|^not really$|no reason|nothing|not? interest|don't feel like|^none$") ~ "no_reason",
str_detect(motive_reason, "not real[^ly]|no covid|exist|is real") ~ "covid_not_real",
str_detect(motive_reason, "did not think it'?s important|not common|didn'?t think it was necessary|is gone|few cases|less prevalent|almost over|post\\-? ?covid|isn't anything serious|not important|already controlled|i don't find it necessary|isn't as prevalent|covid is gone|i don't really hear of the covid|rates have decreased|spread.*is low|not serious|not that important|no need|not help|not? use|no more|no longer|is over|lowered|need |finished|necessity|not necessary|i don't think it'?s necessary|i don'?t see the point") ~ "no_need",
str_detect(motive_reason, "listened to people|negative reviews|people talk|talking negative things|what people say|(bad|negative) report|negative things from social media|condemning|opinion|bad reviews|hearing negative things|people say|negativity|heare?d|hearsay|stories|talks|discussion|speculation|saying|feedback") ~ "heard_hearsay",
str_detect(motive_reason, "belie(f|ve)") ~ "beliefs",
str_detect(motive_reason, "never met a victim|i haven't seen someone|never saw|never seen|no one (has|from)|i have never someone") ~ "never_saw_covid",
str_detect(motive_reason, "my immune is strong|okay with my health|am good/fine|am technically okay|my immunity system|(i'?m|am) not sick|good health|i feel (healthy|fine)|i feel (very )?ok(ay)?|healthy? is good|am healthy|healthy person|high immunity|healthy lifestyle|am just healthy|fine healthy|still healthy|i was healthy|i am just healthy|always been healthy|eating healthy|^(?!.*(weak|affect my)).*immune system|strong immunity") ~ "healthy",
str_detect(motive_reason, "asthmatic|weak immune system|health issue|it's not good for human health|not good for my health|no good for health|allergies|health issues|body weakness|chronic illness|high blood pressure") ~ "health_issues",
str_detect(motive_reason, "distant visit|near the place|nearby|nearest place|far |not available in my area") ~ "distance",
str_detect(motive_reason, "office|not living close|at work|requirement at work|work related|job") ~ "work",
str_detect(motive_reason, "travel") ~ "travel",
str_detect(motive_reason, "nowhere near contacting|am not infected|i had not gotten corona|don'?t have.*(virus|covid|symptoms|it$)") ~ "dont_have_covid",
str_detect(motive_reason, "^(yes|nice|noo?|ok(ay)?|bad|yes sure|nope?|unsure|pardon|lol)$|https\\:\\/\\/|personal reasons") | nchar(motive_reason) <3 ~ "nonsensical",
str_detect(motive_reason, "availab") ~ "no_availability",
str_detect(motive_reason, "expensive|money|lack of money|cost") ~ "need_money",
TRUE ~ NA_character_,
),
treatment = treatment_text,
) %>%
select(-info_confidence) %>%
dummy_cols(select_columns = "info_source", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
dummy_cols(select_columns = "lack_motive", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
dummy_cols(select_columns = "treatment", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE, ignore_na = TRUE) %>%
mutate_at(vars(starts_with(c("info", "motive", "treatment"))), ~ replace_na(., 0))
df_features_unvax <-
df_features %>%
filter(vaccinated == 0)
df_unvax <-
df %>%
filter(vax_status_num == 0)df_features_unvax %>%
clean_names(case = "title") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))The heatmap below shows the mean values of features (along the y-axis) for each of the motive impediments(along the x-axis). Each feature is normalized before aggregating so that its mean across motive impediments is 0. Hence, for each , reds on the plot show values below the mean across motive impediments, whereas greens on the plot show values above it.
df_features_unvax %>%
mutate(motive_main = df_unvax$motive_main) %>%
select(!best_treatment & !no_motive & !risky & !against_beliefs & !no_benefits & !motive_elaboration) %>%
filter(!is.na(motive_main)) %>%
mutate_at(vars(-motive_main), ~ scale(.) %>% as.vector()) %>%
group_by(motive_main) %>%
summarise_all(list(sep_mean = ~ mean(., na.rm = T), sep_sd = ~ sd(., na.rm =T))) %>%
pivot_longer(cols = !motive_main, names_to = c("name"))%>%
separate(name, into = c("name", "value_type"), sep = "_sep_") %>%
mutate(
motive_main = factor(motive_main),
name =str_to_title(str_replace_all(string = name, pattern = "_", replacement = " ")) %>% fct_inorder() %>% fct_rev()
) %>%
pivot_wider(names_from = value_type, values_from = value) %>%
mutate(
mean = round(mean, 2),
mean_char = as.character(mean),
mean_char = if_else(mean > 0.4, "0.4+", mean_char),
mean = if_else(mean > 0.4, 0.4, mean),
mean_char = if_else(mean < -0.4, "-0.4", mean_char),
mean = if_else(mean < -0.4, -0.4, mean),
sd = round(sd, 2),
label_str = str_c(mean_char, " (", sd, ")"),
motive_main = factor(str_to_title(str_replace_all(string = motive_main, pattern = " ", replacement = "\n")), levels = c("Beliefs", "Benefit", "Risk", "Misunderstood", "Other"))
) %>%
ggplot() +
geom_tile(aes(as.numeric(motive_main), name, fill = mean), color = "white", lwd = .5) +
geom_text(aes(as.numeric(motive_main), name, label = label_str), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-0.5, 0.5)
) +
scale_x_continuous(sec.axis = dup_axis(),breaks = 1:5,
limits = c(0.5, 5.5),
expand = c(0,0),
labels = c("Beliefs", "Benefit", "Risk", "Misunderstood", "Other")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(
x = "Motive Impediment", y = "Feature",
fill = "Means (std)"
) df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Against Beliefs` == 1) %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`No Benefits` == 1) %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Risky` == 1) %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))The heatmap below shows the mean values of features (along the y-axis) for each of the ability impediments(along the x-axis). Each feature is normalized before aggregating so that its mean across ability impediments is 0. Hence, for each , reds on the plot show values below the mean across ability impediments, whereas greens on the plot show values above it.
df_features_unvax %>%
mutate(ability_main = df_unvax$ability_main) %>%
select(!best_treatment & !no_availability & !no_time & !no_money & !no_ability & !ability_elaboration) %>%
filter(!is.na(ability_main)) %>%
mutate_at(vars(-ability_main), ~ scale(.) %>% as.vector()) %>%
group_by(ability_main) %>%
summarise_all(list(sep_mean = ~ mean(., na.rm = T), sep_sd = ~ sd(., na.rm =T))) %>%
pivot_longer(cols = !ability_main, names_to = c("name"))%>%
separate(name, into = c("name", "value_type"), sep = "_sep_") %>%
mutate(
ability_main = factor(ability_main),
name =str_to_title(str_replace_all(string = name, pattern = "_", replacement = " ")) %>% fct_inorder() %>% fct_rev()
) %>%
pivot_wider(names_from = value_type, values_from = value) %>%
mutate(
mean = round(mean, 2),
mean_char = as.character(mean),
mean_char = if_else(mean > 0.4, "0.4+", mean_char),
mean = if_else(mean > 0.4, 0.4, mean),
mean_char = if_else(mean < -0.4, "-0.4", mean_char),
mean = if_else(mean < -0.4, -0.4, mean),
sd = round(sd, 2),
label_str = str_c(mean_char, " (", sd, ")"),
ability_main = factor(str_to_title(str_replace_all(string = ability_main, pattern = " ", replacement = "\n")), levels =c("Availability", "Money", "Time", "Misunderstood", "Other"))
) %>%
ggplot() +
geom_tile(aes(as.numeric(ability_main), name, fill = mean), color = "white", lwd = .5) +
geom_text(aes(as.numeric(ability_main), name, label = label_str), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-0.5, 0.5)
)+
scale_x_continuous(sec.axis = dup_axis(),breaks = 1:5,
limits = c(0.5, 5.5),
expand = c(0,0),
labels = c("Availability", "Money", "Time", "Misunderstood", "Other")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(
x = "Ability Impediment", y = "Feature",
fill = "Means (std)"
) df_features_unvax %>%
clean_names(case = "title") %>%
filter(`No Availability` == 1) %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`No Money` == 1) %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`No Time` == 1) %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))The heatmap below shows the mean values of features (along the y-axis) for each of the best treatments (along the x-axis). Each feature is normalized before aggregating so that its mean across best treatments is 0. Hence, for each, reds on the plot show values below the mean across best treatments, whereas greens on the plot show values above it.
df_features_unvax %>%
filter(!(best_treatment %in%c("Other", NA))) %>%
mutate_at(vars(-best_treatment), ~ scale(.) %>% as.vector()) %>%
group_by(best_treatment) %>%
summarise_all(list(sep_mean = ~ mean(., na.rm = T), sep_sd = ~ sd(., na.rm =T))) %>%
pivot_longer(cols = !best_treatment, names_to = c("name"))%>%
separate(name, into = c("name", "value_type"), sep = "_sep_") %>%
mutate(
best_treatment = factor(best_treatment),
name =str_to_title(str_replace_all(string = name, pattern = "_", replacement = " ")) %>% fct_inorder() %>% fct_rev()
) %>%
pivot_wider(names_from = value_type, values_from = value) %>%
mutate(
mean = round(mean, 2),
mean_char = as.character(mean),
mean_char = if_else(mean > 0.4, "0.4+", mean_char),
mean = if_else(mean > 0.4, 0.4, mean),
mean_char = if_else(mean < -0.4, "-0.4", mean_char),
mean = if_else(mean < -0.4, -0.4, mean),
sd = round(sd, 2),
label_str = str_c(mean_char, " (", sd, ")"),
best_treatment = str_replace_all(string = best_treatment, pattern = " ", replacement = "\n")
) %>%
ggplot() +
geom_tile(aes(as.numeric(as.factor(best_treatment)), name, fill = mean), color = "white", lwd = .5) +
geom_text(aes(as.numeric(as.factor(best_treatment)), name, label = label_str), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-0.5, 0.5)
) +
scale_x_continuous(breaks = 1:10, sec.axis = dup_axis(),
expand = c(0,0),
limits = c(0.5, 10.5),
labels= c("Appointment", "Convenient\nto\nvaccinate", "Family/friend\nendorses\nit", "Job/school\nrequired", "More\nsafety\nevidence", "No,\nsomething\nelse", "Nothing", "Reminders", "Rewards\nfor\nvaccinating", "Time\noff\nwork")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(
angle = 0))+
labs(
x = "Best Treatment", y = "Feature",
fill = "Means (std)"
) df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "Family/friend endorses it") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "More safety evidence") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "Reminders") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "Rewards for vaccinating") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "Convenient to vaccinate") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "Time off work") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "Job/school required it") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "Appointment") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "No, something else") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_features_unvax %>%
clean_names(case = "title") %>%
filter(`Best Treatment` == "Nothing") %>%
papeR::summarize_numeric() %>%
datatable(options = list(pageLength = 10, columnDefs = list(list(orderable = TRUE, targets = 0))))df_cluster <-
df_features %>%
filter(no_motive == 1) %>%
drop_na(best_treatment) %>%
dummy_cols(select_columns = "best_treatment", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE) %>%
select(-`best_treatment_No, something else`) %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty("rows") %>%
drop_na()We run k-means clustering here on 2488 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters \(k\) is the one that maximizes the average silhouette over a range of possible values for \(k\).
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 2062 |
| 2 | 426 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters <- function(df_plot) {
df_plot %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ mean(., na.rm = T)) %>%
bind_rows(
df_plot %>%
mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
group_by(cluster) %>%
summarise_all(~ sd(., na.rm = T)),
.id = "stat"
) %>%
mutate(stat = if_else(stat == 1, "mean", "sd")) %>%
clean_names(case = "title") %>%
pivot_longer(cols = -c(Cluster, Stat)) %>%
mutate(
Cluster = factor(Cluster),
name = name %>% fct_inorder() %>% fct_rev()
) %>%
pivot_wider(names_from = Stat, values_from = value) %>%
mutate(
mean = round(mean, 2),
mean_char = as.character(mean),
mean_char = if_else(mean > 0.4, "0.4+", mean_char),
mean = if_else(mean > 0.4, 0.4, mean),
mean_char = if_else(mean < -0.4, "-0.4", mean_char),
mean = if_else(mean < -0.4, -0.4, mean),
sd = round(sd, 2),
label_str = str_c(mean_char, " (", sd, ")"),
) %>%
ggplot() +
geom_tile(aes(Cluster, name, fill = mean)) +
geom_text(aes(Cluster, name, label = label_str), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-0.5, 0.5)
) +
theme_minimal() +
theme(legend.position = "bottom") +
labs(
x = "Cluster", y = "Feature",
fill = "Means (std)"
)
}
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 735 |
| 2 | 1598 |
| 3 | 155 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 664 |
| 2 | 1531 |
| 3 | 124 |
| 4 | 169 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 51 |
| 2 | 606 |
| 3 | 312 |
| 4 | 1405 |
| 5 | 114 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 316 |
| 2 | 655 |
| 3 | 94 |
| 4 | 52 |
| 5 | 1088 |
| 6 | 283 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))df_cluster <-
df_features %>%
filter(no_ability == 1) %>%
drop_na(best_treatment) %>%
dummy_cols(select_columns = "best_treatment", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE) %>%
select(-`best_treatment_No, something else`) %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty("rows") %>%
drop_na()We run k-means clustering here on 1907 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters k is the one that maximizes the average silhouette over a range of possible values for k.
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 1564 |
| 2 | 343 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 1422 |
| 2 | 257 |
| 3 | 228 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 84 |
| 2 | 220 |
| 3 | 401 |
| 4 | 1202 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 1037 |
| 2 | 73 |
| 3 | 370 |
| 4 | 78 |
| 5 | 349 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 176 |
| 2 | 351 |
| 3 | 380 |
| 4 | 869 |
| 5 | 73 |
| 6 | 58 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))df_cluster <-
df_features %>%
filter(no_motive == 1, no_ability == 1) %>%
drop_na(best_treatment) %>%
dummy_cols(select_columns = "best_treatment", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE) %>%
select(-`best_treatment_No, something else`) %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty() %>%
drop_na()We run k-means clustering here on 865 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters k is the one that maximizes the average silhouette over a range of possible values for k.
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 716 |
| 2 | 149 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 291 |
| 2 | 507 |
| 3 | 67 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 283 |
| 2 | 27 |
| 3 | 55 |
| 4 | 500 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 55 |
| 2 | 111 |
| 3 | 480 |
| 4 | 24 |
| 5 | 195 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 377 |
| 2 | 90 |
| 3 | 108 |
| 4 | 36 |
| 5 | 232 |
| 6 | 22 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))df_cluster <-
df_features %>%
filter(no_motive != 1, no_ability != 1) %>%
drop_na(best_treatment) %>%
dummy_cols(select_columns = "best_treatment", remove_most_frequent_dummy = TRUE, remove_selected_columns = TRUE) %>%
select(-`best_treatment_No, something else`) %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty() %>%
drop_na()We run k-means clustering here on 2512 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters k is the one that maximizes the average silhouette over a range of possible values for k.
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 2086 |
| 2 | 426 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 1514 |
| 2 | 825 |
| 3 | 173 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 1111 |
| 2 | 975 |
| 3 | 73 |
| 4 | 353 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 276 |
| 2 | 856 |
| 3 | 802 |
| 4 | 58 |
| 5 | 520 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 262 |
| 2 | 572 |
| 3 | 726 |
| 4 | 788 |
| 5 | 43 |
| 6 | 121 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))df_cluster <-
df_features %>%
filter(best_treatment == "More safety evidence") %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty("rows") %>%
drop_na()We run k-means clustering here on 402 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters \(k\) is the one that maximizes the average silhouette over a range of possible values for \(k\).
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 336 |
| 2 | 66 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 132 |
| 2 | 27 |
| 3 | 243 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 205 |
| 2 | 60 |
| 3 | 27 |
| 4 | 110 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 204 |
| 2 | 59 |
| 3 | 9 |
| 4 | 109 |
| 5 | 21 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 56 |
| 2 | 9 |
| 3 | 96 |
| 4 | 187 |
| 5 | 10 |
| 6 | 44 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))df_cluster <-
df_features %>%
filter(best_treatment == "Family/friend endorses it") %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty("rows") %>%
drop_na()We run k-means clustering here on 168 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters \(k\) is the one that maximizes the average silhouette over a range of possible values for \(k\).
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 5 |
| 2 | 163 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 110 |
| 2 | 53 |
| 3 | 5 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 96 |
| 2 | 16 |
| 3 | 51 |
| 4 | 5 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 66 |
| 2 | 23 |
| 3 | 59 |
| 4 | 16 |
| 5 | 4 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 56 |
| 2 | 1 |
| 3 | 16 |
| 4 | 22 |
| 5 | 69 |
| 6 | 4 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))df_cluster <-
df_features %>%
filter(best_treatment == "Job/school required it") %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty("rows") %>%
drop_na()We run k-means clustering here on 753 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters \(k\) is the one that maximizes the average silhouette over a range of possible values for \(k\).
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 131 |
| 2 | 622 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 58 |
| 2 | 121 |
| 3 | 574 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 33 |
| 2 | 156 |
| 3 | 507 |
| 4 | 57 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 15 |
| 2 | 33 |
| 3 | 89 |
| 4 | 150 |
| 5 | 466 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 24 |
| 2 | 82 |
| 3 | 89 |
| 4 | 150 |
| 5 | 393 |
| 6 | 15 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))df_cluster <-
df_features %>%
filter(best_treatment == "Nothing") %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty("rows") %>%
drop_na()We run k-means clustering here on 191 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters \(k\) is the one that maximizes the average silhouette over a range of possible values for \(k\).
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 29 |
| 2 | 162 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 144 |
| 2 | 19 |
| 3 | 28 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 2 |
| 2 | 19 |
| 3 | 28 |
| 4 | 142 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 99 |
| 2 | 25 |
| 3 | 19 |
| 4 | 2 |
| 5 | 46 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 46 |
| 2 | 97 |
| 3 | 1 |
| 4 | 20 |
| 5 | 25 |
| 6 | 2 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))df_cluster <-
df_features %>%
filter(best_treatment == "Convenient to vaccinate") %>%
select(-starts_with(c("lack", "treatment"))) %>% # excludes free text for this iteration
remove_constant() %>%
remove_empty("rows") %>%
drop_na()We run k-means clustering here on 770 respondents on all relevant features collected from the chatbot.
We start by examining the optimal number of clusters we will split our data into. Let’s try the average silhouette approach to find an optimum number of clusters. The average silhouette approach measures the quality of a clustering by determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The optimal number of clusters \(k\) is the one that maximizes the average silhouette over a range of possible values for \(k\).
fviz_nbclust(df_cluster, kmeans, method = "silhouette")We cluster the data with \(k = 2\).
km <- kmeans(df_cluster, centers = 2, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:2),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 701 |
| 2 | 69 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 3\).
km <- kmeans(df_cluster, centers = 3, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 109 |
| 2 | 635 |
| 3 | 26 |
Aggregated statistics by cluster:
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 4\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 4, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 108 |
| 2 | 24 |
| 3 | 612 |
| 4 | 26 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 5\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 5, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 34 |
| 2 | 7 |
| 3 | 24 |
| 4 | 98 |
| 5 | 607 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))We cluster the data with \(k = 6\) here. We see that the first two components explain very little variation in the data.
km <- kmeans(df_cluster, centers = 6, nstart = 25)
# fviz_cluster(km, data = df_cluster, geom = "point", ggtheme = theme_minimal())Size of each cluster is reported below:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()| Cluster | Size |
|---|---|
| 1 | 98 |
| 2 | 597 |
| 3 | 34 |
| 4 | 7 |
| 5 | 6 |
| 6 | 28 |
The heatmap below shows the mean values of features (along the y-axis) for each cluster (along the x-axis). Each feature is normalized before aggregating so that its mean across clusters is 0. Hence, for each cluster, reds on the plot show values below the mean across clusters, whereas greens on the plot show values above it.
plot_clusters(bind_cols(df_cluster, cluster = km$cluster))