The present analysis is based on 1066 respondents from South Africa, Nigeria, Kenya, and Ghana who completed our survey for pilot wave 9 (January 2023). The related GitHub issue is here.
# function to plot heatmaps
plot_clusters <- function(df_plot) {
  df_plot %>%
    mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
    group_by(cluster) %>%
    summarise_all(~ mean(., na.rm = T)) %>%
    bind_rows(
      df_plot %>%
        mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>%
        group_by(cluster) %>%
        summarise_all(~ sd(., na.rm = T)),
      .id = "stat"
    ) %>%
    mutate(stat = if_else(stat == 1, "mean", "sd")) %>%
    clean_names(case = "title") %>%
    pivot_longer(cols = -c(Cluster, Stat)) %>%
    mutate(
      Cluster = factor(Cluster),
      name = name %>% fct_inorder() %>% fct_rev()
    ) %>%
    pivot_wider(names_from = Stat, values_from = value) %>%
    mutate(
      mean = round(mean, 2),
      mean_char = as.character(mean),
      # mean_char = if_else(mean > 0.6, "0.6+", mean_char),
      # mean = if_else(mean > 0.6, 0.6, mean),
      # mean_char = if_else(mean < -0.6, "-0.6", mean_char),
      # mean = if_else(mean < -0.6, -0.6, mean),
      sd = round(sd, 2),
      label_str = str_c(mean_char, " (", sd, ")"),
    ) %>%
    ggplot() +
    geom_tile(aes(Cluster, name, fill = mean)) +
    geom_text(aes(Cluster, name, label = label_str), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      #limits = c(-0.6, 0.6)
    ) +
    theme_minimal() +
    theme(legend.position = "bottom") +
    labs(
      x = "Cluster", y = "Feature",
      fill = "Means (std)"
    )
}

df_cluster <-
df %>%
select_if(is.numeric) %>%
select(-c(custom_attributes, messenger_user_id, sessions, timezone, page_id_2, covid_is_problem_other_country, cv_age)) %>%
mutate_at(vars(2:46), ~ scale(.) %>% as.vector) %>%
relocate(num_dose, female, white, education, religiosity, location) %>%
drop_na() %>%
remove_empty()

We run k-means clustering here on the 1057 respondents with a complete set of numeric features collected from the chatbot. The heatmap below shows the mean value of each feature (y-axis) within each cluster (x-axis). Every feature is standardized before aggregating, so its overall mean across respondents is 0; hence reds mark cluster means below the overall average for that feature, whereas greens mark cluster means above it.
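As a quick illustration of the standardization (a minimal sketch with toy values, not part of the pipeline, assuming the tidyverse is already attached): scale() centres a feature at mean 0 with standard deviation 1, so a cluster mean of, say, 0.5 on the heatmap reads as half a standard deviation above the overall average for that feature.
# Minimal illustration of the standardization used above (toy values)
x <- c(2, 4, 6, 8)
z <- scale(x) %>% as.vector()
mean(z) # ~0
sd(z)   # 1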
km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 3, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster in the plot above:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()

| Cluster | Size |
|---|---|
| 1 | 434 |
| 2 | 451 |
| 3 | 172 |
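The choice of three clusters is not diagnosed above. One common sanity check is the within-cluster sum-of-squares ("elbow") curve; the sketch below is hypothetical and not part of the original pipeline, and assumes df_cluster as constructed above.
# Hypothetical diagnostic: total within-cluster sum of squares for k = 2..8
wss <- sapply(2:8, function(k) {
  kmeans(df_cluster %>% select(-chatfuel_user_id), centers = k, nstart = 25)$tot.withinss
})
plot(2:8, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster sum of squares")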
Wordcloud for cluster 1:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 1) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 2:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 2) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 3:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 3) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

df_cluster <-
df %>%
filter(vax_next_year == 1) %>%
select_if(is.numeric) %>%
select(-c(custom_attributes, messenger_user_id, sessions, timezone, page_id_2, covid_is_problem_other_country, cv_age, vax_next_year)) %>%
mutate_at(vars(2:45), ~ scale(.) %>% as.vector) %>%
relocate(num_dose, female, white, education, religiosity, location) %>%
drop_na() %>%
remove_empty()

We run k-means clustering here on the 243 respondents with vax_next_year == 1, using all numeric features collected from the chatbot. The heatmap below is read as before: each feature is standardized, so reds mark cluster means below the overall average and greens mark cluster means above it.
km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 3, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster in the plot above:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()

| Cluster | Size |
|---|---|
| 1 | 100 |
| 2 | 118 |
| 3 | 25 |
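Another quick check on the k = 3 solution (a hypothetical sketch, not run in the original analysis; it requires the cluster package) is the average silhouette width, where values closer to 1 indicate better-separated clusters.
# Hypothetical check: average silhouette width for this k = 3 solution
library(cluster)
sil <- silhouette(km$cluster, dist(df_cluster %>% select(-chatfuel_user_id)))
mean(sil[, "sil_width"])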
Wordcloud for cluster 1:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 1) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 2:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 2) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 3:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 3) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

df_cluster <-
df %>%
filter(vax_next_year %in% c(2, 3, 4)) %>%
select_if(is.numeric) %>%
select(-c(custom_attributes, messenger_user_id, sessions, timezone, page_id_2, covid_is_problem_other_country, cv_age, vax_next_year)) %>%
mutate_at(vars(2:45), ~ scale(.) %>% as.vector) %>%
relocate(num_dose, female, white, education, religiosity, location) %>%
drop_na() %>%
remove_empty()

We run k-means clustering here on the 574 respondents with vax_next_year in 2–4, using all numeric features collected from the chatbot. The heatmap below is read as before: each feature is standardized, so reds mark cluster means below the overall average and greens mark cluster means above it.
km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 3, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster in the plot above:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()

| Cluster | Size |
|---|---|
| 1 | 231 |
| 2 | 201 |
| 3 | 142 |
Wordcloud for cluster 1:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 1) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 2:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 2) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 3:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 3) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

df_cluster <-
df %>%
filter(vax_next_year == 5) %>%
select_if(is.numeric) %>%
select(-c(custom_attributes, messenger_user_id, sessions, timezone, page_id_2, covid_is_problem_other_country, cv_age, vax_next_year)) %>%
mutate_at(vars(2:45), ~ scale(.) %>% as.vector) %>%
relocate(num_dose, female, white, education, religiosity, location) %>%
drop_na() %>%
remove_empty()

We run k-means clustering here on the 240 respondents with vax_next_year == 5, using all numeric features collected from the chatbot. The heatmap below is read as before: each feature is standardized, so reds mark cluster means below the overall average and greens mark cluster means above it.
km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 3, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster in the plot above:
tibble(
Cluster = c(1:3),
Size = km$size
) %>%
kable() %>%
kable_styling()

| Cluster | Size |
|---|---|
| 1 | 135 |
| 2 | 11 |
| 3 | 94 |
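Cluster 2 here contains only 11 respondents, so its heatmap column should be read with caution. A quick look at its members (a hypothetical sketch, not part of the original analysis) can help judge whether the cluster is interpretable.
# Hypothetical inspection of the 11-member cluster
bind_cols(df_cluster, cluster = km$cluster) %>%
  filter(cluster == 2) %>%
  select(num_dose, female, education, religiosity) %>%
  summary()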
Wordcloud for cluster 1:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 1) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 2:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 2) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 3:
vector_wc <-
bind_cols(df_cluster, cluster = km$cluster) %>%
select(chatfuel_user_id, cluster) %>%
inner_join(df, by = "chatfuel_user_id") %>%
filter(cluster == 3) %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
) %>%
pull(text)
# Create corpus
docs <- Corpus(VectorSource(vector_wc))
# Clean corpus
docs <-
docs %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

data_vector <-
df %>%
select(free_text_vars) %>%
select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
mutate(
text = str_to_lower(text),
text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|can|don|less")
) %>%
pull(text)
# Create corpus
corpus <- Corpus(VectorSource(data_vector))
# Clean corpus
docs <-
corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english"))
# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)
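# Build a document-term matrix keeping only terms that appear in at least
# minimumFrequency documents, then drop any documents left empty by that filter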
minimumFrequency <- 5
DTM <- DocumentTermMatrix(docs, control = list(bounds = list(global = c(minimumFrequency, Inf))))
sel_idx <- slam::row_sums(DTM) > 0
DTM <- DTM[sel_idx, ]

We now inspect the number of topics using two metrics, CaoJuan2009 and Deveaud2014. The preferred number of topics is one with a low CaoJuan2009 value and a high Deveaud2014 value.
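As a hypothetical convenience (not part of the original pipeline), that rule can be turned into code: rank the candidate values of K by CaoJuan2009 ascending and Deveaud2014 descending and take the best combined rank. The helper below could be applied to the result object computed in the next chunk.
# Hypothetical helper: pick K with low CaoJuan2009 and high Deveaud2014
pick_k <- function(result) {
  result %>%
    mutate(score = rank(CaoJuan2009) + rank(-Deveaud2014)) %>%
    slice_min(score, n = 1) %>%
    pull(topics)
}
# e.g. pick_k(result), once FindTopicsNumber() below has run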
result <-
ldatuning::FindTopicsNumber(
DTM,
topics = seq(from = 3, to = 10, by = 1),
metrics = c("CaoJuan2009", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
verbose = F
)
ldatuning::FindTopicsNumber_plot(result)

K <- 3
# set random number generator seed
set.seed(94305)
# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))

## K = 3; V = 792; M = 1065
## Sampling 1000 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Iteration 525 ...
## Iteration 550 ...
## Iteration 575 ...
## Iteration 600 ...
## Iteration 625 ...
## Iteration 650 ...
## Iteration 675 ...
## Iteration 700 ...
## Iteration 725 ...
## Iteration 750 ...
## Iteration 775 ...
## Iteration 800 ...
## Iteration 825 ...
## Iteration 850 ...
## Iteration 875 ...
## Iteration 900 ...
## Iteration 925 ...
## Iteration 950 ...
## Iteration 975 ...
## Iteration 1000 ...
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)
# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)
# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms # get beta from results
# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

Top 10 terms per topic:
terms(topicModel, 10)

## Topic 1 Topic 2 Topic 3
## [1,] "people" "hrs" "effects"
## [2,] "care" "car" "side"
## [3,] "will" "prevent" "virus"
## [4,] "get" "safe" "body"
## [5,] "health" "good" "long"
## [6,] "one" "really" "system"
## [7,] "walking" "morning" "immune"
## [8,] "safe" "effect" "walking"
## [9,] "getting" "cause" "term"
## [10,] "disease" "walking" "last"
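A hypothetical follow-up (not in the original analysis) is to label each free-text response with its most probable topic, using the document-topic matrix theta computed above.
# Assign each document its dominant topic and tabulate
dominant_topic <- apply(theta, 1, which.max)
table(dominant_topic)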
We also re-run k-means on the df_cluster from the previous clustering section (the 240 respondents with vax_next_year == 5), this time with 4, 5, and 6 centers in the next three chunks.

km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 4, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster in the plot above:
tibble(
Cluster = c(1:4),
Size = km$size
) %>%
kable() %>%
kable_styling()

| Cluster | Size |
|---|---|
| 1 | 58 |
| 2 | 93 |
| 3 | 11 |
| 4 | 78 |
km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 5, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster in the plot above:
tibble(
Cluster = c(1:5),
Size = km$size
) %>%
kable() %>%
kable_styling()

| Cluster | Size |
|---|---|
| 1 | 67 |
| 2 | 36 |
| 3 | 10 |
| 4 | 52 |
| 5 | 75 |
km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 6, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster in the plot above:
tibble(
Cluster = c(1:6),
Size = km$size
) %>%
kable() %>%
kable_styling()

| Cluster | Size |
|---|---|
| 1 | 43 |
| 2 | 50 |
| 3 | 10 |
| 4 | 55 |
| 5 | 36 |
| 6 | 46 |
K <- 4
# set random number generator seed
set.seed(94305)
# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))

## K = 4; V = 792; M = 1065
## Sampling 1000 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Iteration 525 ...
## Iteration 550 ...
## Iteration 575 ...
## Iteration 600 ...
## Iteration 625 ...
## Iteration 650 ...
## Iteration 675 ...
## Iteration 700 ...
## Iteration 725 ...
## Iteration 750 ...
## Iteration 775 ...
## Iteration 800 ...
## Iteration 825 ...
## Iteration 850 ...
## Iteration 875 ...
## Iteration 900 ...
## Iteration 925 ...
## Iteration 950 ...
## Iteration 975 ...
## Iteration 1000 ...
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)
# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)
# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms # get beta from results
# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

Top 10 terms per topic:
terms(topicModel, 10)

## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "people" "walking" "hrs" "effects"
## [2,] "will" "health" "morning" "side"
## [3,] "safe" "one" "car" "virus"
## [4,] "get" "care" "good" "body"
## [5,] "think" "disease" "effect" "long"
## [6,] "care" "immunity" "really" "system"
## [7,] "getting" "cause" "work" "immune"
## [8,] "life" "effective" "prevent" "term"
## [9,] "many" "like" "walking" "fight"
## [10,] "protect" "time" "works" "benefits"
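To see how evenly the four topics are used across responses (a hypothetical aid, not part of the original analysis), the average topic proportions from theta can be summarized.
# Average topic proportion across all responses for the K = 4 model
round(colMeans(theta), 3)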
K <- 5
# set random number generator seed
set.seed(94305)
# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))

## K = 5; V = 792; M = 1065
## Sampling 1000 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Iteration 525 ...
## Iteration 550 ...
## Iteration 575 ...
## Iteration 600 ...
## Iteration 625 ...
## Iteration 650 ...
## Iteration 675 ...
## Iteration 700 ...
## Iteration 725 ...
## Iteration 750 ...
## Iteration 775 ...
## Iteration 800 ...
## Iteration 825 ...
## Iteration 850 ...
## Iteration 875 ...
## Iteration 900 ...
## Iteration 925 ...
## Iteration 950 ...
## Iteration 975 ...
## Iteration 1000 ...
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)
# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)
# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms # get beta from results
# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

Top 10 terms per topic:
terms(topicModel, 10)

## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "virus" "safe" "people" "health" "effects"
## [2,] "body" "will" "walking" "good" "side"
## [3,] "prevent" "hrs" "get" "really" "long"
## [4,] "cause" "car" "think" "morning" "system"
## [5,] "disease" "take" "getting" "work" "immune"
## [6,] "immunity" "effect" "care" "walking" "walking"
## [7,] "effective" "care" "life" "works" "term"
## [8,] "one" "bus" "time" "question" "benefits"
## [9,] "fight" "one" "many" "year" "help"
## [10,] "diseases" "affect" "still" "idea" "clinic"
K <- 6
# set random number generator seed
set.seed(94305)
# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))

## K = 6; V = 792; M = 1065
## Sampling 1000 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Iteration 525 ...
## Iteration 550 ...
## Iteration 575 ...
## Iteration 600 ...
## Iteration 625 ...
## Iteration 650 ...
## Iteration 675 ...
## Iteration 700 ...
## Iteration 725 ...
## Iteration 750 ...
## Iteration 775 ...
## Iteration 800 ...
## Iteration 825 ...
## Iteration 850 ...
## Iteration 875 ...
## Iteration 900 ...
## Iteration 925 ...
## Iteration 950 ...
## Iteration 975 ...
## Iteration 1000 ...
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)
# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)
# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms # get beta from results
# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

Top 10 terms per topic:
terms(topicModel, 10)

## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "virus" "walking" "will" "people" "safe" "effects"
## [2,] "body" "health" "good" "get" "hrs" "side"
## [3,] "system" "care" "effect" "one" "car" "long"
## [4,] "immune" "morning" "walking" "getting" "really" "term"
## [5,] "disease" "affect" "works" "many" "prevent" "bus"
## [6,] "cause" "questions" "think" "still" "hospital" "effective"
## [7,] "immunity" "free" "year" "clinic" "question" "benefits"
## [8,] "fight" "just" "life" "infected" "work" "time"
## [9,] "diseases" "september" "help" "think" "care" "tested"
## [10,] "boost" "center" "like" "taking" "doesnt" "sure"