1 Overview

This analysis is based on 1066 respondents from South Africa, Nigeria, Kenya, and Ghana who completed our survey for pilot wave 9 (January 2023). The related GitHub issue is here.


2 K-means clustering

# function to plot heatmaps

plot_clusters <- function(df_plot) {
  df_plot %>% 
  mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
  group_by(cluster) %>% 
  summarise_all(~ mean(., na.rm = T)) %>%
  bind_rows(
    df_plot %>% 
      mutate_at(vars(-cluster), ~ scale(.) %>% as.vector()) %>% 
      group_by(cluster) %>% 
      summarise_all(~ sd(., na.rm = T)),
    .id = "stat"
  ) %>% 
  mutate(stat = if_else(stat == 1, "mean", "sd")) %>% 
  clean_names(case = "title") %>%
  pivot_longer(cols = -c(Cluster, Stat)) %>%
  mutate(
    Cluster = factor(Cluster),
    name = name %>% fct_inorder() %>% fct_rev()
  ) %>%
  pivot_wider(names_from = Stat, values_from = value) %>% 
  mutate(
    mean = round(mean, 2),
    mean_char = as.character(mean),
    # mean_char = if_else(mean > 0.6, "0.6+", mean_char),
    # mean = if_else(mean > 0.6, 0.6, mean),
    # mean_char = if_else(mean < -0.6, "-0.6", mean_char),
    # mean = if_else(mean < -0.6, -0.6, mean),
    sd = round(sd, 2),
    label_str = str_c(mean_char, " (", sd, ")"),
  ) %>% 
  ggplot() +
  geom_tile(aes(Cluster, name, fill = mean)) +
  geom_text(aes(Cluster, name, label = label_str), size = 2.75) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
    #limits = c(-0.6, 0.6)
  ) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    x = "Cluster", y = "Feature",
    fill = "Means (std)"
  ) 
}

2.1 Overall

df_cluster <-
  df %>% 
  # keep only numeric columns and drop identifier / metadata fields
  select_if(is.numeric) %>%
  select(-c(custom_attributes, messenger_user_id, sessions, timezone, page_id_2, covid_is_problem_other_country, cv_age)) %>% 
  # standardize the feature columns (the leading id column is left unscaled)
  mutate_at(vars(2:46), ~ scale(.) %>% as.vector) %>%
  # put key demographics first
  relocate(num_dose, female, white, education, religiosity, location) %>% 
  # keep only respondents with complete data
  drop_na() %>% 
  remove_empty()

We run k-means clustering here on the 1057 respondents with complete data on the numeric features collected from the chatbot. The heatmap below shows the mean value of each feature (along the y-axis) for each cluster (along the x-axis). Each feature is standardized (z-scored) across respondents before aggregating, so its overall mean is 0. Hence, for each cluster, red cells show values below the overall average for that feature, whereas green cells show values above it. Each cell is labelled with the cluster mean and, in parentheses, the cluster standard deviation.
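
The chunks below use centers = 3. One quick sanity check of that choice is an elbow curve of the total within-cluster sum of squares; a minimal sketch, assuming purrr is loaded along with the rest of the tidyverse used above:

# Sketch: total within-cluster sum of squares for k = 2..8 on the same feature matrix
wss <- map_dbl(2:8, ~ kmeans(df_cluster %>% select(-chatfuel_user_id), centers = .x, nstart = 25)$tot.withinss)
plot(2:8, wss, type = "b", xlab = "Number of clusters", ylab = "Total within-cluster SS")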

km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 3, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))
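
As a sanity check, any cell of the heatmap can be reproduced directly. A minimal sketch for one of the retained features, num_dose (the plot rounds these values to two decimals):

bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster) %>%
  mutate(num_dose = scale(num_dose) %>% as.vector()) %>%  # re-standardize, as plot_clusters does
  group_by(cluster) %>%
  summarise(mean_num_dose = mean(num_dose), sd_num_dose = sd(num_dose))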

Size of each cluster for the plot above:

tibble(
  Cluster = c(1:3),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 434
2 451
3 172

Wordcloud for cluster 1:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 1) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
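
The same pipeline is repeated for each cluster below (and in Sections 2.2-2.4), changing only the cluster filter, so it can be wrapped in a helper. A minimal sketch, assuming df, df_cluster, km, and the character vector free_text_vars are in scope as above:

# Sketch: wordcloud of free-text answers for a given k-means cluster
plot_cluster_wordcloud <- function(cluster_id) {
  vector_wc <-
    bind_cols(df_cluster, cluster = km$cluster) %>% 
    select(chatfuel_user_id, cluster) %>% 
    inner_join(df, by = "chatfuel_user_id") %>% 
    filter(cluster == cluster_id) %>% 
    select(all_of(free_text_vars)) %>% 
    select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
    unite("text", 1:25, na.rm = TRUE, remove = TRUE, sep = " ") %>%
    mutate(
      text = str_to_lower(text),
      text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
      # note: these patterns also match inside longer words (e.g. "no" in "know")
      text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
    ) %>%
    pull(text)

  # build and clean the corpus
  docs <-
    Corpus(VectorSource(vector_wc)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("english"))

  # term frequencies and wordcloud
  words <- sort(rowSums(as.matrix(TermDocumentMatrix(docs))), decreasing = TRUE)
  wordcloud(
    words = names(words), freq = words, min.freq = 1, max.words = 200,
    random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2")
  )
}

With this helper, the remaining wordclouds below reduce to plot_cluster_wordcloud(2), plot_cluster_wordcloud(3), and so on.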

Wordcloud for cluster 2:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 2) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 3:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 3) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

2.2 Strong hesitancy

df_cluster <-
  df %>%
  filter(vax_next_year == 1) %>% 
  select_if(is.numeric) %>%
  select(-c(custom_attributes, messenger_user_id, sessions, timezone, page_id_2, covid_is_problem_other_country, cv_age, vax_next_year)) %>% 
  mutate_at(vars(2:45), ~ scale(.) %>% as.vector) %>% 
  relocate(num_dose, female, white, education, religiosity, location) %>% 
  drop_na() %>% 
  remove_empty()

We run k-means clustering here on the 243 respondents in this group with complete data on the numeric features collected from the chatbot. As in Section 2.1, each feature is standardized across respondents before aggregating, so for each cluster red cells show means below the overall average for that feature and green cells show means above it.

km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 3, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster for the plot above:

tibble(
  Cluster = c(1:3),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 100
2 118
3 25

Wordcloud for cluster 1:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 1) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 2:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 2) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 3:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 3) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

2.3 Open but hesitant

df_cluster <-
  df %>% 
  filter(vax_next_year %in% c(2, 3, 4)) %>% 
  select_if(is.numeric) %>%
  select(-c(custom_attributes, messenger_user_id, sessions, timezone, page_id_2, covid_is_problem_other_country, cv_age, vax_next_year)) %>% 
  mutate_at(vars(2:45), ~ scale(.) %>% as.vector) %>% 
  relocate(num_dose, female, white, education, religiosity, location) %>% 
  drop_na() %>% 
  remove_empty()

We run k-means clustering here on the 574 respondents in this group with complete data on the numeric features collected from the chatbot. As in Section 2.1, each feature is standardized across respondents before aggregating, so for each cluster red cells show means below the overall average for that feature and green cells show means above it.

km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 3, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster for the plot above:

tibble(
  Cluster = c(1:3),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 231
2 201
3 142

Wordcloud for cluster 1:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 1) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 2:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 2) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 3:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 3) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

2.4 No hesitancy

df_cluster <-
  df %>% 
  filter(vax_next_year == 5) %>% 
  select_if(is.numeric) %>%
  select(-c(custom_attributes, messenger_user_id, sessions, timezone, page_id_2, covid_is_problem_other_country, cv_age, vax_next_year)) %>% 
  mutate_at(vars(2:45), ~ scale(.) %>% as.vector) %>% 
  relocate(num_dose, female, white, education, religiosity, location) %>% 
  drop_na() %>% 
  remove_empty()

We run k-means clustering here on the 240 respondents in this group with complete data on the numeric features collected from the chatbot. As in Section 2.1, each feature is standardized across respondents before aggregating, so for each cluster red cells show means below the overall average for that feature and green cells show means above it.

km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 3, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster for the plot above:

tibble(
  Cluster = c(1:3),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 135
2 11
3 94

Wordcloud for cluster 1:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 1) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 2:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 2) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))

Wordcloud for cluster 3:

vector_wc <-
  bind_cols(df_cluster, cluster = km$cluster) %>% 
  select(chatfuel_user_id, cluster) %>% 
  inner_join(df, by = "chatfuel_user_id") %>% 
  filter(cluster == 3) %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|don|can|less")
  ) %>%
  pull(text)

# Create corpus 
docs <- Corpus(VectorSource(vector_wc))

# Clean corpus
docs <-
  docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)

# Create wordcloud
wordcloud(words = df_freetext$word, freq = df_freetext$freq, min.freq = 1, max.words = 200, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))




3 Topic models

data_vector <-
  df %>% 
  select(free_text_vars) %>% 
  select(-contains(c("phone", "percent", "postal", "_id", "_name"))) %>%
  unite("text", 1:25, na.rm = T, remove = T, sep = " ") %>%
  mutate(
    text = str_to_lower(text),
    text = gsub("http[^[:space:]]*", "", text), # remove bad URLs in text
    text = str_remove_all(text, "vaccine|vaccines|vaccinate|vaccinated|vaccination|none|nothing|min|dont|covid|yes|no|can|don|less")
  ) %>%
  pull(text)

# Create corpus
corpus <- Corpus(VectorSource(data_vector))

# Clean corpus
docs <-
  corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))

# Create doc-term matrix
matrix <- as.matrix(TermDocumentMatrix(docs))
words <- sort(rowSums(matrix), decreasing = TRUE)
df_freetext <- data.frame(word = names(words), freq = words)


# keep terms that appear in at least 5 documents
minimumFrequency <- 5
DTM <- DocumentTermMatrix(docs, control = list(bounds = list(global = c(minimumFrequency, Inf))))

# drop documents left empty after the frequency filter
sel_idx <- slam::row_sums(DTM) > 0
DTM <- DTM[sel_idx, ]

Next, we inspect how many topics to use. Here we use two metrics, CaoJuan2009 and Deveaud2014: a good number of topics has low values of CaoJuan2009 and high values of Deveaud2014.

result <- 
  ldatuning::FindTopicsNumber(
    DTM,
    topics = seq(from = 3, to = 10, by = 1),
    metrics = c("CaoJuan2009",  "Deveaud2014"),
    method = "Gibbs",
    control = list(seed = 77),
    verbose = F
  )

ldatuning::FindTopicsNumber_plot(result)
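
The plot gives a visual read, but the same result object can also be ranked numerically. A minimal sketch, assuming ldatuning names the columns after the metrics (topics, CaoJuan2009, Deveaud2014):

result %>%
  mutate(
    rank_cao = rank(CaoJuan2009),   # lower CaoJuan2009 is better
    rank_dev = rank(-Deveaud2014)   # higher Deveaud2014 is better
  ) %>%
  arrange(rank_cao + rank_dev)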

3.1 3 topics

K <- 3

# set random number generator seed
set.seed(94305)

# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))
## K = 3; V = 792; M = 1065
## Sampling 1000 iterations!
## Iteration 25 ...
## ...
## Iteration 1000 ...
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)

# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)

# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms   # get beta from results

# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

Top 10 terms per topic:

terms(topicModel, 10)
##       Topic 1   Topic 2   Topic 3  
##  [1,] "people"  "hrs"     "effects"
##  [2,] "care"    "car"     "side"   
##  [3,] "will"    "prevent" "virus"  
##  [4,] "get"     "safe"    "body"   
##  [5,] "health"  "good"    "long"   
##  [6,] "one"     "really"  "system" 
##  [7,] "walking" "morning" "immune" 
##  [8,] "safe"    "effect"  "walking"
##  [9,] "getting" "cause"   "term"   
## [10,] "disease" "walking" "last"



4 Appendix

4.1 More clusters

These sections re-run k-means with more centers on the most recently constructed df_cluster, i.e. the "No hesitancy" subset from Section 2.4.

4.1.1 4 clusters

km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 4, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster for the plot above:

tibble(
  Cluster = c(1:4),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 58
2 93
3 11
4 78

4.1.2 5 clusters

km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 5, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster for the plot above:

tibble(
  Cluster = c(1:5),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 67
2 36
3 10
4 52
5 75

4.1.3 6 clusters

km <- kmeans(df_cluster %>% select(-chatfuel_user_id), centers = 6, nstart = 25)
plot_clusters(bind_cols(df_cluster %>% select(-chatfuel_user_id), cluster = km$cluster))

Size of each cluster for the plot above:

tibble(
  Cluster = c(1:6),
  Size = km$size
) %>% 
  kable() %>% 
  kable_styling()
Cluster Size
1 43
2 50
3 10
4 55
5 36
6 46

4.2 More topics

4.2.1 4 topics

K <- 4

# set random number generator seed
set.seed(94305)

# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))
## K = 4; V = 792; M = 1065
## Sampling 1000 iterations!
## Iteration 25 ...
## ...
## Iteration 1000 ...
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)

# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)

# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms   # get beta from results

# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

Top 10 terms per topic:

terms(topicModel, 10)
##       Topic 1   Topic 2     Topic 3   Topic 4   
##  [1,] "people"  "walking"   "hrs"     "effects" 
##  [2,] "will"    "health"    "morning" "side"    
##  [3,] "safe"    "one"       "car"     "virus"   
##  [4,] "get"     "care"      "good"    "body"    
##  [5,] "think"   "disease"   "effect"  "long"    
##  [6,] "care"    "immunity"  "really"  "system"  
##  [7,] "getting" "cause"     "work"    "immune"  
##  [8,] "life"    "effective" "prevent" "term"    
##  [9,] "many"    "like"      "walking" "fight"   
## [10,] "protect" "time"      "works"   "benefits"

4.2.2 5 topics

K <- 5

# set random number generator seed
set.seed(94305)

# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))
## K = 5; V = 792; M = 1065
## Sampling 1000 iterations!
## Iteration 25 ...
## ...
## Iteration 1000 ...
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)

# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)

# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms   # get beta from results

# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

Top 10 terms per topic:

terms(topicModel, 10)
##       Topic 1     Topic 2  Topic 3   Topic 4    Topic 5   
##  [1,] "virus"     "safe"   "people"  "health"   "effects" 
##  [2,] "body"      "will"   "walking" "good"     "side"    
##  [3,] "prevent"   "hrs"    "get"     "really"   "long"    
##  [4,] "cause"     "car"    "think"   "morning"  "system"  
##  [5,] "disease"   "take"   "getting" "work"     "immune"  
##  [6,] "immunity"  "effect" "care"    "walking"  "walking" 
##  [7,] "effective" "care"   "life"    "works"    "term"    
##  [8,] "one"       "bus"    "time"    "question" "benefits"
##  [9,] "fight"     "one"    "many"    "year"     "help"    
## [10,] "diseases"  "affect" "still"   "idea"     "clinic"

4.2.3 6 topics

K <- 6

# set random number generator seed
set.seed(94305)

# compute the LDA model, inference via 1000 iterations of Gibbs sampling
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25))
## K = 6; V = 792; M = 1065
## Sampling 1000 iterations!
## Iteration 25 ...
## ...
## Iteration 1000 ...
## Gibbs sampling completed!
# have a look at some of the results (posterior distributions)
tmResult <- posterior(topicModel)

# format of the resulting object
# attributes(tmResult)
# nTerms(DTM)

# topics are probability distributions over the entire vocabulary
beta <- tmResult$terms   # get beta from results

# for every document we have a probability distribution of its contained topics
theta <- tmResult$topics

Top 10 terms per topic:

terms(topicModel, 10)
##       Topic 1    Topic 2     Topic 3   Topic 4    Topic 5    Topic 6    
##  [1,] "virus"    "walking"   "will"    "people"   "safe"     "effects"  
##  [2,] "body"     "health"    "good"    "get"      "hrs"      "side"     
##  [3,] "system"   "care"      "effect"  "one"      "car"      "long"     
##  [4,] "immune"   "morning"   "walking" "getting"  "really"   "term"     
##  [5,] "disease"  "affect"    "works"   "many"     "prevent"  "bus"      
##  [6,] "cause"    "questions" "think"   "still"    "hospital" "effective"
##  [7,] "immunity" "free"      "year"    "clinic"   "question" "benefits" 
##  [8,] "fight"    "just"      "life"    "infected" "work"     "time"     
##  [9,] "diseases" "september" "help"    "think"    "care"     "tested"   
## [10,] "boost"    "center"    "like"    "taking"   "doesnt"   "sure"