## Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, entropy)
set.seed(94305)
# This script reads in the raw Chatfuel data and generates a complete transcript of each participant's interaction with the chatbot. The transcripts exclude (i) chatbot prompts that the user is not expected to answer and (ii) non-informative questions, such as the cellphone number collected for airtime payments. The current version of the script produces conversation transcripts for 2499 participants from pilot wave 7.
# These user-level transcripts will feed into the text classification algorithms we expect to use to identify features that are highly predictive of user segments. The related GitHub issue is here.
## read in CURRENT chatfuel data
# full data: every row whose version is "ALP_May", whether or not the survey
# was completed, with research-team test accounts removed
df_full_v7 <-
  read_csv(here::here("chatfuel_data/chatfuel_full_v7.csv")) %>%
  clean_names() %>%
  # strip embedded newlines and tabs from every text column
  mutate(across(where(is.character), ~ str_replace_all(.x, "[\n\t]", ""))) %>%
  mutate(
    # missing name parts become "" so that str_c() does not return NA
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    # str_squish() collapses the double space left behind by an empty middle
    # name; without it "First  Last" never matches the exclusion list below
    full_name = str_squish(str_c(first_name, middle_name, last_name, sep = " ")),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>%
  # drop conversations created by the research team
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
  filter(version == "ALP_May") %>%
  # keep only the numeric part of the referral code
  mutate(original_ref = parse_number(original_ref)) %>%
  mutate(
    # any answer containing "yes" counts as "yes", everything else as "no"
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    # any answer containing "risk" is collapsed to the single label "risk"
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  )
# filter to completes: finished surveys with a known vaccination status,
# reduced to one row per phone number
df_v7 <-
  df_full_v7 %>%
  filter(full_complete == "yes", !is.na(vax_status)) %>%
  # normalise phone numbers by removing spaces and hyphens
  mutate(phone_number = str_remove_all(phone_number, "[ -]")) %>%
  # after sorting, distinct() keeps the first row seen for each phone number
  arrange(phone_number, last_seen) %>%
  distinct(phone_number, .keep_all = TRUE)
## OLD WAVES
# v6: every row signed up after 2022-03-23 with a recorded version, whether or
# not the survey was completed, with research-team test accounts removed
df_full_v6 <-
  read_csv(here::here("chatfuel_data/chatfuel_full_v6.csv")) %>%
  clean_names() %>%
  # strip embedded newlines and tabs from every text column
  mutate(across(where(is.character), ~ str_replace_all(.x, "[\n\t]", ""))) %>%
  mutate(
    # missing name parts become "" so that str_c() does not return NA
    first_name = if_else(is.na(first_name), "", first_name),
    middle_name = if_else(is.na(middle_name), "", middle_name),
    last_name = if_else(is.na(last_name), "", last_name),
    # str_squish() collapses the double space left behind by an empty middle
    # name; without it "First  Last" never matches the exclusion list below
    full_name = str_squish(str_c(first_name, middle_name, last_name, sep = " ")),
    full_name_short = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>%
  # drop conversations created by the research team
  filter(!(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha", "Kristine Koutout", "Susan Athey", "Dean Karlan"))) %>%
  filter(date > "2022-03-23", !is.na(version)) %>%
  # keep only the numeric part of the referral code
  mutate(original_ref = parse_number(original_ref)) %>%
  mutate(
    # any answer containing "yes" counts as "yes", everything else as "no"
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    # any answer containing "risk" is collapsed to the single label "risk"
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main)
  )
# filtering for survey completes: finished "pilot_6" surveys with a known
# vaccination status, reduced to one row per phone number
df_v6 <-
  df_full_v6 %>%
  filter(version == "pilot_6", full_complete == "yes", !is.na(vax_status)) %>%
  # normalise phone numbers by removing spaces and hyphens
  mutate(phone_number = str_remove_all(phone_number, "[ -]")) %>%
  # after sorting, distinct() keeps the first row seen for each phone number
  arrange(phone_number, last_seen) %>%
  distinct(phone_number, .keep_all = TRUE)
# v5: every row signed up after 2022-02-18 with a recorded version; name and
# profile columns are removed once the team filter has been applied
df_full_v5 <-
  read_csv(here::here("chatfuel_data/chatfuel_full_v5.csv")) %>%
  clean_names() %>%
  remove_empty() %>%
  # strip embedded newlines and tabs from every text column
  mutate(across(where(is.character), ~ str_replace_all(.x, "[\n\t]", ""))) %>%
  mutate(
    full_name = str_c(first_name, last_name, sep = " "),
    date = lubridate::date(signed_up)
  ) %>%
  filter(
    # drop conversations created by the research team
    !(full_name %in% c("Robert Kuan", "James Li", "Kaylin Rochford", "Saurabh Khanna", "Dingchen Sha")),
    date > "2022-02-18",
    !is.na(version)
  ) %>%
  select(-c(contains("name"), profile_pic_url, messenger_user_id)) %>%
  # keep only the numeric part of the referral code
  mutate(original_ref = parse_number(original_ref))
# v5 completes: finished "pilot_5" surveys with a known vaccination status,
# reduced to one row per phone number
df_v5 <-
  df_full_v5 %>%
  filter(version == "pilot_5", full_complete == "yes", !is.na(vax_status)) %>%
  # normalise phone numbers by removing spaces and hyphens
  mutate(phone_number = str_remove_all(phone_number, "[ -]")) %>%
  # after sorting, distinct() keeps the first row seen for each phone number
  arrange(phone_number, last_seen) %>%
  distinct(phone_number, .keep_all = TRUE)
# combining chatfuel data (survey completes) from pilots 5 and 6
df <- bind_rows(df_v5, df_v6)
# combining full datasets too
df_full <- bind_rows(df_full_v6, df_full_v5)
# NOTE: `df` is re-pointed at the wave 7 completes here, so everything below
# works on wave 7 only; the combined pilot 5/6 table above is overwritten.
df <- df_v7
# create user id variable ("<version>_<chatfuel_user_id>") and keep the first
# row for each id
df <-
  df %>%
  mutate(user_id = str_c(version, chatfuel_user_id, sep = "_")) %>%
  distinct(user_id, .keep_all = TRUE)
# clean up demographic variables
# Collapse the demographic answers into a fixed set of categories.
#
# For each demographic column, every value outside the recognised categories
# is recoded to "other"; missing values stay missing. Any religion containing
# "christ" (case-insensitive) is first normalised to "christian".
#
# df: data frame with the columns gender, ethnicity, income, education,
#   politics, location and religion.
# Returns `df` with those seven columns recoded.
clean_up_demog <- function(df){
  # replace every value that is neither listed in `keep` nor NA with "other"
  collapse_to_other <- function(values, keep) {
    values[!values %in% c(keep, NA_character_)] <- "other"
    values
  }

  df$gender <- collapse_to_other(df$gender, c("male", "female"))
  df$ethnicity <- collapse_to_other(
    df$ethnicity,
    c("black or african", "coloured", "white or caucasian", "prefer not to say", "asian or indian")
  )
  df$income <- collapse_to_other(
    df$income,
    c("< R5,000", "R5,000 – R9,999", "R10,000 – R29,999", "R30,000 – R49,999", "R50,000 – R99,999", "> R100,000", "prefer not to say")
  )
  df$education <- collapse_to_other(
    df$education,
    c("< high school", "high school", "some college", "2-year degree", "4-year degree", "graduate degree", "prefer not to say")
  )
  df$politics <- collapse_to_other(
    df$politics,
    c("conservative", "moderate", "liberal", "prefer not to say")
  )
  df$location <- collapse_to_other(
    df$location,
    c("urban", "suburban", "rural", "prefer not to say")
  )

  # fold every Christian denomination into one label before collapsing
  df$religion[grep("christ", tolower(df$religion))] <- "christian"
  df$religion <- collapse_to_other(
    df$religion,
    c("christian", "african traditional", "islam", "hinduism", "no religion", "prefer not to say")
  )

  df
}
df <- clean_up_demog(df)
# separating out treatment for binding later: the label columns are set aside
# here, keyed by user_id
df_treatment <-
  df %>%
  transmute(
    user_id,
    vax_status = vax_status %>% as_factor(),
    have_motivation = motive %>% as_factor(),
    have_ability = ability %>% as_factor(),
    best_treatment = str_to_sentence(best_treatment),
    # every answer starting with "Nothing" becomes "Nothing would help"
    best_treatment = if_else(str_detect(best_treatment, "^Nothing"), "Nothing would help", best_treatment),
    # answers containing "Other reason" become "Something else"
    best_treatment = if_else(str_detect(best_treatment, "Other reason"), "Something else", best_treatment),
    # anything outside the recognised answers is treated as missing
    best_treatment = if_else(best_treatment %in% c("Family supports it", "Trusted info source", "More transparency", "Nothing", "Rewards for vaxxing", "Job/school required", "Something else", "Nothing would help"), best_treatment, NA_character_) %>% as_factor()
  )

# Perplexity of each row of `p`: 2 raised to the row's Shannon entropy in bits.
#
# p: matrix or data frame; the examples that follow pass one probability
#   distribution per row.
# Returns one value per row. Zero probabilities give 0 * log2(0) = NaN, which
# na.rm = TRUE drops, so they contribute nothing to the sum.
perplexity <- function(p) 2^(rowSums(-(p * log2(p)), na.rm = TRUE))
# Sanity checks for perplexity(): each vector is spread into a one-row table
# with columns `1`..`6`, and the row's perplexity is appended.

# uniform distribution over six outcomes (perplexity 6)
tibble(value = rep(1 / 6, 6)) %>%
  mutate(row = row_number()) %>%
  pivot_wider(names_from = row, values_from = value) %>%
  mutate(perplexity = perplexity(.))

# all mass on a single outcome (perplexity 1)
tibble(value = c(1, 0, 0, 0, 0, 0)) %>%
  mutate(row = row_number()) %>%
  pivot_wider(names_from = row, values_from = value) %>%
  mutate(perplexity = perplexity(.))
# remove later
# df %>%
# filter(vax_status == "unvax") %>%
# sample_n(100) %>%
# select(user_id, consent, vax_status_raw, motive_raw, motive_main_raw, benefit_main, benefit_main_other, benefit_main_explain, risk_main, risk_main_other, risky_main_explain, belief_main, belief_main_other, belief_main_explain, motive_main_other, motive_other_main_explain, motive_other, ability_raw, ability_main_raw, ability_main_other, ability_other_main_explain, availability_main, availability_main_other, availability_main_explain, time_main, time_main_other, time_main_explain, money_main, money_main_other, money_main_explain, ability_other, ability_other_add, vax_future, best_treatment_explain, best_treatment_other, best_treatment_add, post_vax_measure, cv_age, ethnicity, income, education, religion, christian_type, religiosity, politics, location, comfortable, comfortable_not_parts, engaging, enjoyable, enjoyable_not_parts, suggestions) %>%
# select(user_id, contains(c("_other", "_explain"))) %>%
# remove_empty() %>%
# mutate_all(~ as.character(.)) %>%
# pivot_longer(cols = -user_id, values_drop_na = T, names_to = "question", values_to = "answer") %>%
# mutate(
# answer = gsub("http[^[:space:]]*", "", answer),
# contains_information = ""
# ) %>%
# writexl::write_xlsx(here::here("proportion_of_useful_freetext_answers_100_unvax_users.xlsx"))
# selecting variable order from data dictionary: https://docs.google.com/spreadsheets/d/168Ol4mR4umwqdhYiJhahsDvQjPYcw4hQfbYri9ZJO74/edit#gid=23855991
# Survey columns kept for the transcript, in conversation order, each paired
# with the chatbot question it answers (name = column, value = question text).
# Keeping the two side by side stops the column list and the question list
# from drifting out of step.
question_text <- c(
  user_id = "user_id",
  consent = "if you're over the age of 18 and are ready to start the survey now, let me know below!",
  vax_status_raw = "Have you received the covid vaccine already?",
  motive_raw = "can you share if you want to get a covid vaccine?",
  motive_main_raw = "which of these best describes what you think?",
  benefit_main = "which of these is closest to what you mean?",
  benefit_main_other = "what reason is that?",
  benefit_main_explain = "can you say more?",
  risk_main = "which of these is closest to what you mean?",
  risk_main_other = "what reason is that?",
  risky_main_explain = "can you say more?",
  belief_main = "which of these is closest to what you mean?",
  belief_main_other = "what reason is that?",
  belief_main_explain = "can you say more?",
  motive_main_other = "what reason for not wanting the vaccine is most important to you?",
  motive_other_main_explain = "can you say more?",
  motive_other = "are there other reasons why you might not want the vaccine?",
  ability_raw = "if you decided you wanted the vaccine, how easy would it be to get it?",
  ability_main_raw = "is there a particular challenge that would affect you the most if you wanted the vaccine?",
  ability_main_other = "what makes getting the vaccine particularly challenging for you?",
  ability_other_main_explain = "can you explain a bit more?",
  availability_main = "is there a reason below that makes it particularly hard?",
  availability_main_other = "what reason is that?",
  availability_main_explain = "can you explain a bit more?",
  time_main = "is there a reason below that makes it particularly hard?",
  time_main_other = "what reason is that?",
  time_main_explain = "can you explain a bit more?",
  money_main = "is there a reason below that makes it particularly costly?",
  money_main_other = "what reason was that?",
  money_main_explain = "can you explain a bit more?",
  ability_other = "Are there other reasons it might be hard to get the vaccine?",
  ability_other_add = "anything else you'd like to add?",
  vax_future = "would you ever consider getting a vaccine in the future?",
  best_treatment_explain = "what would help the most?",
  best_treatment_other = "what would make the most impact for you to consider getting vaccinated?",
  best_treatment_add = "is there anything else you'd like to add?",
  post_vax_measure = "can you share if you want to get a covid vaccine?",
  cv_age = "what's your current age?",
  ethnicity = "which best describes you?",
  income = "what's your household's total yearly income?",
  education = "what’s your highest level of education?",
  religion = "what's your religion, if any?",
  christian_type = "is there a particular Christian denomination you belong to?",
  religiosity = "how religious would you say you are?",
  politics = "what's your political position?",
  location = "what best describes where you live?",
  # the original label had a line break inside the string, which put a newline
  # into every transcript; it is written on one line here
  comfortable = "how comfortable did you feel during this conversation?",
  comfortable_not_parts = "what parts weren't comfortable?",
  engaging = "how engaging was the conversation?",
  enjoyable = "how enjoyable was the conversation?",
  enjoyable_not_parts = "what parts weren't enjoyable?",
  suggestions = "any suggestions for improving this survey?"
)
df <-
  df %>%
  select(all_of(names(question_text)))
# Relabel each column with its question; several questions share the same
# wording, so the resulting column names are not unique.
colnames(df) <- unname(question_text)
# Build one transcript string per participant: each answered question becomes
# "<question> <answer>", and a participant's entries are joined with ". ".
df <-
  df %>%
  # one row per answered question; unanswered (NA) questions are dropped
  pivot_longer(cols = -user_id, values_drop_na = TRUE, names_to = "question", values_to = "answer") %>%
  transmute(
    user_id,
    qna = str_c(question, answer, sep = " ")
  ) %>%
  # number each participant's answers in the order they were asked
  group_by(user_id) %>%
  mutate(row = row_number()) %>%
  ungroup() %>%
  relocate(row) %>%
  mutate(qna = str_to_sentence(qna)) %>%
  pivot_wider(names_from = row, values_from = qna) %>%
  #mutate_all(~ replace_na(., "")) %>%
  # join every numbered answer column instead of a hard-coded `1`:`39` range,
  # which fails when the longest conversation has fewer than 39 answers and
  # leaves answers out when it has more
  unite("conversation", -user_id, sep = ". ", na.rm = TRUE) %>%
  mutate(
    # the transcript ends with the preferred-treatment question
    conversation = str_c(conversation, "Would any of the below make you more interested in getting a vaccine in the future? (it's ok to say nothing!)", sep = ". ")
  )
# attach the treatment and status labels to each transcript; only user_ids
# present in both tables are kept
df <- inner_join(df, df_treatment, by = "user_id")
# write conversation to data file
# df %>% write_csv(here::here("conversations/conversation_pilot_7.csv"))
# For context, the preferred treatments chosen by 2499 participants can be seen below:
# Frequency table of preferred treatments, most common first; participants
# without a recognised treatment answer are left out.
df %>%
  drop_na(best_treatment) %>%
  tabyl(best_treatment) %>%
  as_tibble() %>%
  arrange(desc(n)) %>%
  rename(preferred_treatment = best_treatment) %>%
  kable() %>%
  kable_styling()
# Resulting table as captured in this file:
# | preferred_treatment | n | percent |
# |---|---|---|
# | Job/school required | 970 | 0.3887776 |
# | Nothing would help | 512 | 0.2052104 |
# | Family supports it | 453 | 0.1815631 |
# | Trusted info source | 315 | 0.1262525 |
# | Something else | 124 | 0.0496994 |
# | Rewards for vaxxing | 110 | 0.0440882 |
# | More transparency | 11 | 0.0044088 |
# The table below shows the entire conversation each participant had with the chatbot. The last question in each conversation asks about their preferred treatment. The preferred-treatment answer is separated out as a label for use in supervised algorithms.
# Interactive table with one row per participant: the full transcript and the
# preferred-treatment label. Rows with any missing value are dropped.
df %>%
  transmute(user_id, conversation, preferred_treatment = best_treatment) %>%
  drop_na() %>%
  datatable(filter = "top", options = list(pageLength = 5))
# The table below shows the entire conversation each user had with the chatbot, along with their vaccination status, motivation, and ability to get vaccinated. The last question in each conversation asks about their preferred treatment. The preferred-treatment answer is separated out as a label for use in supervised algorithms.
# Interactive table with one row per participant: vaccination status,
# motivation, ability, the full transcript and the preferred-treatment label.
# Rows with any missing value are dropped.
df %>%
  transmute(
    user_id,
    vax_status,
    have_motivation,
    have_ability,
    conversation,
    preferred_treatment = best_treatment
  ) %>%
  drop_na() %>%
  datatable(filter = "top", options = list(pageLength = 5))