## Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, entropy)

set.seed(94305)

1 Overview

This script reads in raw chatfuel data and generates a complete transcript of each participant’s interaction with the chatbot. The transcripts exclude (i) chatbot prompts that do not expect an answer from the user, and (ii) non-informative questions, such as the cellphone number collected for airtime payments. The current version of the script shows conversation transcripts for 2499 participants from pilot wave 7.

These user-level transcripts will feed into the text classification algorithms that we expect to use to identify features that are highly predictive of user segments. The related GitHub issue is here.

2 Data wrangling

2.1 Load raw chatfuel data

## read in CURRENT chatfuel data
# full data
df_full_v7 <- 
  read_csv(here::here("chatfuel_data/chatfuel_full_v7.csv")) %>%
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    full_name = str_c(first_name, middle_name, last_name, sep = " "),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
  filter(version == "ALP_May") %>%
  mutate(original_ref = parse_number(original_ref)) %>% 
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  )

# filter to completes
df_v7 <-
  df_full_v7 %>%
  filter(full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T)


## OLD WAVES

# v6
df_full_v6 <- 
  read_csv(here::here("chatfuel_data/chatfuel_full_v6.csv")) %>%
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    full_name = str_c(first_name, middle_name, last_name, sep = " "),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
  filter(date > "2022-03-23", !is.na(version)) %>%
  mutate(original_ref = parse_number(original_ref)) %>% 
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  )

# filtering for survey completes
df_v6 <-
  df_full_v6 %>%
  filter(version == "pilot_6", full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T)

# v5
df_full_v5 <-
  read_csv(here::here("chatfuel_data/chatfuel_full_v5.csv")) %>% 
  clean_names() %>% remove_empty() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>% 
  mutate(
    full_name = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>% 
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha"))) %>%
  filter(date > "2022-02-18", !is.na(version)) %>%
  select(-contains("name"), -profile_pic_url, -messenger_user_id) %>% 
  mutate(original_ref = parse_number(original_ref))


df_v5 <-
  df_full_v5 %>% 
  filter(version == "pilot_5", full_complete == "yes") %>% 
  drop_na(vax_status) %>% 
  mutate(
    phone_number = str_replace_all(phone_number, " ", ""),
    phone_number = str_replace_all(phone_number, "-", ""),
  ) %>% 
  arrange(phone_number, last_seen) %>% 
  distinct(phone_number, .keep_all = T)


# combining chatfuel data from pilots 5 and 6. we use `df` in most analyses below. 
df <-
  bind_rows(df_v5, df_v6)

# combining full datatsets too
df_full <- 
  bind_rows(df_full_v6, df_full_v5)

2.2 Generating user ID

df <- df_v7 

# create user id variable
df <-
  df %>% 
  mutate(
    user_id = str_c(version, chatfuel_user_id, sep = "_")
  )

df <- df %>% distinct(user_id, .keep_all = TRUE)

2.3 Variable cleaning

# clean up demographic variables

clean_up_demog <- function(df){
  df$gender[!df$gender %in% c("male", "female", NA_character_)] <- "other"
  df$ethnicity[!df$ethnicity %in% c("black or african", "coloured", "white or caucasian", "prefer not to say", "asian or indian", NA_character_)] <- "other"
  df$income[!df$income %in% c("< R5,000", "R5,000 – R9,999", "R10,000 – R29,999", "R30,000 – R49,999", "R50,000 – R99,999", "> R100,000", "prefer not to say", NA_character_)] <- "other"
  df$education[!df$education %in% c("< high school", "high school", "some college", "2-year degree", "4-year degree", "graduate degree", "prefer not to say", NA_character_)] <- "other"
  df$politics[!df$politics %in% c("conservative", "moderate", "liberal", "prefer not to say", NA_character_)] <- "other"
  df$location[!df$location %in% c("urban", "suburban", "rural", "prefer not to say", NA_character_)] <- "other"
  df$religion[grep("christ", tolower(df$religion))] <- "christian"
  df$religion[!df$religion %in% c("christian", "african traditional", "islam", "hinduism", "no religion", "prefer not to say", NA_character_)] <- "other"
  return(df)
}

df <- clean_up_demog(df)

2.4 Separate out treatment answers

# separating out treatment for binding later

df_treatment <-
  df %>% 
  transmute(
    user_id,
    vax_status = vax_status %>% as_factor(),
    have_motivation = motive %>% as_factor(),
    have_ability = ability %>% as_factor(),
    best_treatment = str_to_sentence(best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing would help", best_treatment),
    best_treatment = if_else(str_detect(best_treatment, "Other reason"), "Something else", best_treatment),
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required", "Something else", "Nothing would help"), best_treatment, NA_character_) %>% as_factor()
  )

2.5 Conversation generation step

perplexity <- function(p) 2^(rowSums(-(p * log2(p)), na.rm = TRUE))

c(1/6,1/6,1/6,1/6,1/6,1/6) %>% 
 as_tibble() %>% 
  mutate(
    row = row_number()
  ) %>% 
  pivot_wider(values_from = value, names_from = row) %>% 
  mutate(
    perplexity = perplexity(.)
  )
  
c(1,0,0,0,0,0) %>% 
 as_tibble() %>% 
  mutate(
    row = row_number()
  ) %>% 
  pivot_wider(values_from = value, names_from = row) %>% 
  mutate(
    perplexity = perplexity(.)
  )

# remove later

# df %>% 
#   filter(vax_status == "unvax") %>%
#   sample_n(100) %>% 
#   select(user_id, consent, vax_status_raw, motive_raw, motive_main_raw, benefit_main, benefit_main_other, benefit_main_explain, risk_main, risk_main_other, risky_main_explain, belief_main, belief_main_other, belief_main_explain, motive_main_other, motive_other_main_explain, motive_other, ability_raw, ability_main_raw, ability_main_other, ability_other_main_explain, availability_main, availability_main_other, availability_main_explain, time_main, time_main_other, time_main_explain, money_main, money_main_other, money_main_explain, ability_other, ability_other_add, vax_future, best_treatment_explain, best_treatment_other, best_treatment_add, post_vax_measure, cv_age, ethnicity, income, education, religion, christian_type, religiosity, politics, location, comfortable, comfortable_not_parts, engaging, enjoyable, enjoyable_not_parts, suggestions) %>% 
#   select(user_id, contains(c("_other", "_explain"))) %>% 
#   remove_empty() %>%
#   mutate_all(~ as.character(.)) %>% 
#   pivot_longer(cols = -user_id, values_drop_na = T, names_to = "question", values_to = "answer") %>% 
#   mutate(
#     answer = gsub("http[^[:space:]]*", "", answer),
#     contains_information = ""  
#   ) %>% 
#   writexl::write_xlsx(here::here("proportion_of_useful_freetext_answers_100_unvax_users.xlsx"))

# selecting variable order from data dictionary: https://docs.google.com/spreadsheets/d/168Ol4mR4umwqdhYiJhahsDvQjPYcw4hQfbYri9ZJO74/edit#gid=23855991

df <-
  df %>%
  select(user_id, consent, vax_status_raw, motive_raw, motive_main_raw, benefit_main, benefit_main_other, benefit_main_explain, risk_main, risk_main_other, risky_main_explain, belief_main, belief_main_other, belief_main_explain, motive_main_other, motive_other_main_explain, motive_other, ability_raw, ability_main_raw, ability_main_other, ability_other_main_explain, availability_main, availability_main_other, availability_main_explain, time_main, time_main_other, time_main_explain, money_main, money_main_other, money_main_explain, ability_other, ability_other_add, vax_future, best_treatment_explain, best_treatment_other, best_treatment_add, post_vax_measure, cv_age, ethnicity, income, education, religion, christian_type, religiosity, politics, location, comfortable, comfortable_not_parts, engaging, enjoyable, enjoyable_not_parts, suggestions)

colnames(df) <-
  c("user_id", "if you're over the age of 18 and are ready to start the survey now, let me know below!", "Have you received the covid vaccine already?", "can you share if you want to get a covid vaccine?","which of these best describes what you think?","which of these is closest to what you mean?","what reason is that?","can you say more?","which of these is closest to what you mean?","what reason is that?","can you say more?","which of these is closest to what you mean?","what reason is that?","can you say more?","what reason for not wanting the vaccine is most important to you?","can you say more?","are there other reasons why you might not want the vaccine?","if you decided you wanted the vaccine, how easy would it be to get it?","is there a particular challenge that would affect you the most if you wanted the vaccine?","what makes getting the vaccine particularly challenging for you?","can you explain a bit more?","is there a reason below that makes it particularly hard?","what reason is that?","can you explain a bit more?","is there a reason below that makes it particularly hard?","what reason is that?","can you explain a bit more?","is there a reason below that makes it particularly costly?","what reason was that?","can you explain a bit more?","Are there other reasons it might be hard to get the vaccine?","anything else you'd like to add?","would you ever consider getting a vaccine in the future?","what would help the most?","what would make the most impact for you to consider getting vaccinated?","is there anything else you'd like to add?","can you share if you want to get a covid vaccine?","what's your current age?", "which best describes you?", "what's your household's total yearly income?", "what’s your highest level of education?", "what's your religion, if any?", "is there a particular Christian denomination you belong to?", "how religious would you say you are?", "what's your political position?", "what best describes where you live?", "how 
comfortable did you feel during this conversation?", "what parts weren't comfortable?", "how engaging was the conversation?", "how enjoyable was the conversation?", "what parts weren't enjoyable?", "any suggestions for improving this survey?")

df <-
  df %>%
  pivot_longer(cols = -user_id, values_drop_na = T, names_to = "question", values_to = "answer") %>% 
  transmute(
    user_id,
    qna = str_c(question, answer, sep = " ")
  ) %>% 
  group_by(user_id) %>% 
  mutate(row = row_number()) %>% 
  ungroup() %>% 
  relocate(row) %>% 
  mutate(qna = str_to_sentence(qna)) %>% 
  pivot_wider(names_from = row, values_from = qna) %>% 
  #mutate_all(~ replace_na(., "")) %>%
  unite("conversation", `1`:`39`, sep = ". ", na.rm = TRUE) %>% 
  mutate(
    conversation = str_c(conversation, "Would any of the below make you more interested in getting a vaccine in the future? (it's ok to say nothing!)", sep = ". ")
  )

df <-
  df %>% 
  inner_join(df_treatment, by = "user_id")


# write conversation to data file
# df %>% write_csv(here::here("conversations/conversation_pilot_7.csv"))

3 Output

For context, the preferred treatments chosen by 2499 participants can be seen below:

df %>% 
  drop_na(best_treatment) %>%
  tabyl(best_treatment) %>% 
  as_data_frame() %>% 
  arrange(-n) %>% 
  rename(preferrred_treatment = best_treatment) %>% 
  kable() %>% 
  kable_styling()

preferrred_treatment	n	percent
Job/school required	970	0.3887776
Nothing would help	512	0.2052104
Family supports it	453	0.1815631
Trusted info source	315	0.1262525
Something else	124	0.0496994
Rewards for vaxxing	110	0.0440882
More transparency	11	0.0044088

3.1 Overall

The table below shows the entire conversation each participant had with the chatbot. The last question in each conversation asks about their preferred treatment. The preferred treatment answer is separated out as a label for use in supervised algorithms.

df %>% 
  select(user_id, conversation, preferred_treatment = best_treatment) %>%
  drop_na() %>% 
  datatable(filter = "top", options = list(pageLength = 5))

3.2 Filterable by vaccination status and impediments

The table below shows the entire conversation each user had with the chatbot, along with their vaccination status, motivation, and ability to get vaccinated. The last question in each conversation asks about their preferred treatment. The preferred treatment answer is separated out as a label for use in supervised algorithms.

df %>% 
  select(user_id, vax_status, have_motivation, have_ability, conversation, preferred_treatment = best_treatment) %>%
  drop_na() %>% 
  datatable(filter = "top", options = list(pageLength = 5))

Generating user level transcripts

Updated: July 19, 2022