Chatfuel

# Load the Chatfuel export (empty cells become NA) and normalise it:
#   * user ids are stored as character
#   * `opinion_friend` is unused and dropped
#   * the ad id is pulled out of `ref` (prefix "ad id: "), then `ref` is dropped
#   * `full_complete` becomes "complete" / "incomplete" (NA counts as incomplete)
#   * pairs of columns that hold the same question are pasted into one column
# NOTE(review): the ad ids printed further down have 17 digits, which is above
# 2^53, so the as.numeric() round trip may not preserve the exact id. It is
# kept as-is because `ads$Ad.ID` is converted from a number the same way --
# TODO confirm both sides still agree if either is ever read as text.
chatfuel <- here(
  "pilot8", "data", "chatfuel",
  "Share_Your_Voice_2022_08_15_23_59_12_full.csv"
) %>%
  read.csv(na.strings = "") %>%
  mutate(
    chatfuel.user.id = as.character(chatfuel.user.id),
    ad_id = as.character(as.numeric(str_remove(ref, "ad id: ")))
  ) %>%
  dplyr::select(!c(opinion_friend, ref)) %>%
  mutate(
    full_complete = ifelse(full_complete %in% "yes", "complete", "incomplete")
  ) %>%
  # unite variables that are the same
  unite(
    "ability_misunderstood_explain",
    c("ability_misunderstood", "ability_misunderstood_explain"),
    na.rm = TRUE, remove = TRUE, sep = ""
  ) %>%
  unite(
    "availability_main",
    c("availability_main", "availability_main."),
    na.rm = TRUE, remove = TRUE, sep = ""
  ) %>%
  unite(
    "intro_complete",
    c("intro_complete", "consent_complete"),
    na.rm = TRUE, remove = TRUE, sep = ""
  ) %>%
  # both source columns answered "yes" -> collapse the pasted value
  mutate(
    intro_complete = ifelse(intro_complete == "yesyes", "yes", intro_complete)
  )

Ads

# Theme keywords that can appear inside an ad set name
ad_themes <- c("Risky", "Inaccessible", "Unnecessary", "Fear")

# Load the ads export, tag each row with the first theme keyword found in its
# ad set name, and store the ad id as character.
ads <- here("pilot8", "data", "ads_data_v8.csv") %>%
  read.csv() %>%
  mutate(
    theme = str_extract(Ad.Set.Name, paste(ad_themes, collapse = "|")),
    Ad.ID = as.character(Ad.ID)
  )

# the first row of the export is a summary line, not an ad
ads <- ads[-1, ]

# ad ids seen in chatfuel that have no row in the ads table
setdiff(chatfuel$ad_id, ads$Ad.ID)
## [1] NA                  "23850143823590220" "23850139217980156"
## [4] "23850143823490220" "23850143823480220"
# number of chatfuel rows behind each unmatched ad id
chatfuel %>%
  filter(!(ad_id %in% ads$Ad.ID)) %>%
  group_by(ad_id) %>%
  count()
## # A tibble: 5 × 2
## # Groups:   ad_id [5]
##   ad_id                 n
##   <chr>             <int>
## 1 23850139217980156     1
## 2 23850143823480220     1
## 3 23850143823490220     1
## 4 23850143823590220     1
## 5 <NA>               1197
# drop the rows whose (non-missing) ad id is unknown to the ads table;
# rows with a missing ad id are kept
chatfuel <- chatfuel %>%
  filter(is.na(ad_id) | ad_id %in% ads$Ad.ID)


# Lookup table: one row per chatfuel user id with its ad id and the matching
# ads columns (users without a matching ad keep NA in the ads columns).
ad_to_qualtrics_id <- merge(
  select(chatfuel, chatfuel.user.id, ad_id),
  ads,
  by.x = "ad_id", by.y = "Ad.ID", all.x = TRUE
)

# ad_id is merged back in later by chatfuel id, so remove it here
chatfuel <- select(chatfuel, -ad_id)

Sign up time

# Sign-up time by chatfuel id; set aside here and merged back in later
chatfeul_signup <- select(chatfuel, chatfuel.user.id, signed.up)

chatfuel <- select(chatfuel, -signed.up)

Gender

# Gender by chatfuel id; set aside here and merged back in later
chatfuel_gender <- select(chatfuel, chatfuel.user.id, gender)
chatfuel <- select(chatfuel, -gender)

Qualtrics

# Paths of the Qualtrics exports whose path contains "full"
# NOTE(review): the pattern is matched against the whole path, not only the
# file name -- confirm no parent directory contains "full".
fullpath <- str_subset(
  list.files(here("pilot8", "data", "qualtrics"), full.names = TRUE),
  "full"
)

# Read every export, drop its first two data rows, tag it with the treatment
# arm found in its path ("T1" or "T5"), and stack the files with smartbind.
qualtrics <- do.call(
  "smartbind",
  lapply(fullpath, function(path) {
    raw_export <- read.csv(path, na.strings = "")
    mutate(
      raw_export[-c(1:2), ],
      treatment_assign = str_extract(path, "T(1|5)")
    )
  })
) %>%
  # drop pii / unused columns
  dplyr::select(!c(
    IPAddress, RecipientLastName, RecipientFirstName, RecipientEmail,
    LocationLatitude, LocationLongitude, UserLanguage, version.,
    ExternalReference, ResponseId, DistributionChannel, Status, Progress,
    Duration..in.seconds., Finished, RecordedDate
  )) %>%
  # keep rows that have a chatfuel id without letters in it
  # NOTE(review): "[A-z]" also matches the ASCII characters between "Z" and
  # "a" ([ \ ] ^ _ `), so ids containing those are dropped as well.
  filter(chatfuel_id != "", !str_detect(chatfuel_id, "[A-z]"))

Correct for Qualtrics shutdown issue

# Shutdown cut-off: T5 responses started after this moment are flagged "drop"
A <- ymd_hms("2022-08-05 02:50:00")
qualtrics <- qualtrics %>%
  mutate(
    StartDate = ymd_hms(StartDate),
    # TRUE when a T5 response started after the shutdown date
    drop = treatment_assign == "T5" & StartDate > A,
    # rebuilt completion status: "drop", otherwise complete / incomplete
    # depending on whether full_complete was recorded
    correction = case_when(
      drop ~ "drop",
      !drop & is.na(full_complete) ~ "incomplete",
      !drop & !is.na(full_complete) ~ "complete",
      TRUE ~ NA_character_
    )
  ) %>%
  select(!full_complete) %>%
  rename(full_complete = correction)


# responses per treatment arm and corrected completion status
qualtrics %>%
  group_by(treatment_assign, full_complete) %>%
  count()
## # A tibble: 5 × 3
## # Groups:   treatment_assign, full_complete [5]
##   treatment_assign full_complete     n
##   <chr>            <chr>         <int>
## 1 T1               complete       1249
## 2 T1               incomplete      137
## 3 T5               complete       2734
## 4 T5               drop            648
## 5 T5               incomplete     1018
# number of responses per chatfuel id, most frequent first
qual_chatid_sum <- qualtrics %>%
  filter(chatfuel_id != "") %>%
  group_by(chatfuel_id) %>%
  count() %>%
  arrange(desc(n))

qual_chatid_sum
## # A tibble: 1,088 × 2
## # Groups:   chatfuel_id [1,088]
##    chatfuel_id          n
##    <chr>            <int>
##  1 5098864440235852  3208
##  2 5357116787706479   303
##  3 7711755595561101   264
##  4 5103974433004691   158
##  5 6065549646806340    57
##  6 5130844860370768    33
##  7 7712433932163547    33
##  8 8024409180934271    30
##  9 4965765240190005    29
## 10 5207892332603804    29
## # … with 1,078 more rows
# how many chatfuel ids have exactly one response (1) versus several (0)?
qual_chatid_sum %>%
  ungroup() %>%
  mutate(one_obs = as.numeric(n == 1)) %>%
  group_by(one_obs) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   one_obs [2]
##   one_obs     n
##     <dbl> <int>
## 1       0   162
## 2       1   926
# Within each chatfuel id the response with the earliest start date is the
# randomized observation (1); every later response is non-randomized (0).
# NOTE(review): a missing StartDate inside a group makes min() NA, so the
# whole group gets NA here -- confirm there are none.
qualtrics <- qualtrics %>%
  arrange(StartDate) %>%
  group_by(chatfuel_id) %>%
  mutate(randomized = as.numeric(StartDate == min(StartDate))) %>%
  ungroup() %>%
  dplyr::select(!c(StartDate, EndDate))


# share of all responses by treatment arm and randomized flag
qualtrics %>%
  group_by(treatment_assign, randomized) %>%
  count() %>%
  ungroup() %>%
  mutate(percent = round(n / sum(n) * 100, 2))
## # A tibble: 4 × 4
##   treatment_assign randomized     n percent
##   <chr>                 <dbl> <int>   <dbl>
## 1 T1                        0   852   14.7 
## 2 T1                        1   534    9.23
## 3 T5                        0  3846   66.5 
## 4 T5                        1   554    9.57

Fixing Qualtrics column names like consent, consent.1, consent.2, etc.

# Repeated questions arrive as name, name.1, name.2, ...
# Collect the base name of every column that ends in the literal suffix ".1".
# The suffix is escaped and anchored: an unescaped ".1" would treat "." as
# "any character" and strip the first such match anywhere in the name, and an
# unanchored match would also pick up ".10", ".11", ...
colnames_to_fix <- unique(str_remove(
  str_subset(colnames(qualtrics), "\\.1$"),
  "\\.1$"
))

# For each base name, paste the base column and all of its "base.<suffix>"
# variants into a single column named after the base (NAs are skipped, the
# source columns are removed). seq_along() keeps the loop from running when
# there is nothing to fix. Column names are compared literally, so characters
# such as "." inside a base name are not interpreted as a regex.
for (i in seq_along(colnames_to_fix)) {
  qualtrics <- qualtrics %>%
    unite(
      !!colnames_to_fix[i],
      all_of(colnames(qualtrics)[
        colnames(qualtrics) == colnames_to_fix[i] |
          startsWith(colnames(qualtrics), paste0(colnames_to_fix[i], "."))
      ]),
      na.rm = TRUE, remove = TRUE, sep = ""
    )
}

manually fix the column names with just a period at the end

# Paste each remaining pair of same-question columns into one column.
# The second name in each pair is the variant spelling; it is removed after
# the merge and NA parts are skipped.
qualtrics <- qualtrics %>%
  unite(
    "ability_mis_explain",
    c("ability_mis_explain", "ability_mis_explain."),
    na.rm = TRUE, remove = TRUE, sep = ""
  ) %>%
  unite(
    "motive_mis_explain",
    c("motive_mis_explain", "motive_mis_explain."),
    na.rm = TRUE, remove = TRUE, sep = ""
  ) %>%
  unite(
    "best_treatment_2",
    c("best_treatment_2", "best_treatment2"),
    na.rm = TRUE, remove = TRUE, sep = ""
  ) %>%
  unite(
    "treat_no_explain",
    c("treat_no_explain", "treat_no_explain."),
    na.rm = TRUE, remove = TRUE, sep = ""
  )

rename qualtrics column names to match chatfuel column names using crosswalk

# Crosswalk from Qualtrics column names to Chatfuel column names; rows without
# a Chatfuel name are ignored.
xwalk <- read.csv(here("pilot8", "scripts", "qualtrics_to_chatfuel.csv")) %>%
  dplyr::select(qualtrics_names, chatfuel_col) %>%
  filter(chatfuel_col != "")

# Rename by looking each column up in the crosswalk. A positional assignment
# (names[names %in% x] <- y) would hand out the new names in crosswalk order,
# which mislabels columns whenever the data frame's column order differs from
# the crosswalk's or a crosswalk entry is absent from the data.
colnames(qualtrics) <- local({
  current <- colnames(qualtrics)
  hit <- match(current, xwalk$qualtrics_names)
  found <- !is.na(hit)
  current[found] <- xwalk$chatfuel_col[hit[found]]
  current
})

make data frames for T1 and T5 that will merge with chatfuel

# chatfuel restricted to the columns that also exist in qualtrics
chatfuel_matches_qualtrics <- dplyr::select(chatfuel, any_of(colnames(qualtrics)))

# chatfuel rows for T2, T3 and T4
chatfuel_t2_t3_t4 <- filter(
  chatfuel_matches_qualtrics,
  treatment_assign %in% c("T2", "T3", "T4")
)

# T1 / T5 rows whose id never shows up in qualtrics:
# these users did not actually start the qualtrics survey
t1_t5_not_start_qualtrics <- filter(
  chatfuel_matches_qualtrics,
  treatment_assign %in% c("T1", "T5"),
  !(chatfuel.user.id %in% qualtrics$chatfuel.user.id)
)

# T1 / T5 rows whose id does show up in qualtrics (survey started)
t1_t5_start_qualtrics <- filter(
  chatfuel_matches_qualtrics,
  treatment_assign %in% c("T1", "T5"),
  chatfuel.user.id %in% qualtrics$chatfuel.user.id
)

# T1: keep only the id from chatfuel and take every other column from the
# T1 qualtrics responses
chatfuel_t1 <- merge(
  t1_t5_start_qualtrics %>%
    filter(treatment_assign == "T1") %>%
    dplyr::select(chatfuel.user.id),
  qualtrics[qualtrics$treatment_assign == "T1", ],
  by = "chatfuel.user.id", all.x = TRUE
)

# T5: same, using the T5 qualtrics responses
chatfuel_t5 <- merge(
  t1_t5_start_qualtrics %>%
    filter(treatment_assign == "T5") %>%
    dplyr::select(chatfuel.user.id),
  qualtrics[qualtrics$treatment_assign == "T5", ],
  by = "chatfuel.user.id", all.x = TRUE
)

Final data frame binds these all together

# Stack the four pieces (T2-T4 from chatfuel, T1/T5 that never started
# qualtrics, T1 from qualtrics, T5 from qualtrics) into one data frame.
# smartbind fills columns that a piece lacks with NA.
df <- chatfuel_t2_t3_t4 %>%
  smartbind(t1_t5_not_start_qualtrics) %>%
  smartbind(chatfuel_t1) %>%
  smartbind(chatfuel_t5) %>%
  # corrects for T2-T4 not having a randomized column
  # NOTE(review): every NA becomes 1, so any other row that reaches this point
  # with a missing `randomized` is also counted as randomized -- confirm that
  # is intended.
  mutate(randomized = ifelse(is.na(randomized), 1, randomized))
## Warning in smartbind(., chatfuel_t1): Column class mismatch for 'continue_0'.
## Converting column to class 'character'.

Dealing with duplicate phone numbers

# clean phone numbers
# NOTE(review): str_replace() removes only the FIRST space and the FIRST dash,
# so "080 123 4567" becomes "080123 4567". Switching to a replace-all would
# change which rows count as duplicates and therefore the row count; the
# row-number `id` assigned later is merged with hand-coded files, so re-check
# that merge before changing this.
df$phone_number <- str_replace(df$phone_number, " ", "")
df$phone_number <- str_replace(df$phone_number, "-", "")

# get list of duplicated phone numbers
dup_phone_numbers <- df %>%
  filter(phone_number != "") %>%
  group_by(phone_number) %>%
  count() %>%
  # if count is greater than 1, set indicator equal to 1
  mutate(dup_phone = ifelse(n > 1, 1, 0)) %>%
  filter(dup_phone == 1) %>%
  pull(phone_number)

# we want to keep these - the earliest observation for a phone number
# NOTE(review): rows that tie on the earliest intro_start_time are all kept.
df_original_phonenumbers <- df %>%
  filter(phone_number %in% dup_phone_numbers) %>%
  group_by(phone_number) %>%
  mutate(original_phone_number = ifelse(intro_start_time == min(intro_start_time), 1, 0)) %>%
  filter(original_phone_number == 1) %>%
  select(!original_phone_number)

# how many observations will be removed?
# note that we will keep the first observations of duplicated phone numbers
# as the "non duplicated observations". We drop the additional observations.
# removed = all rows - (rows without a duplicated number + first observations)
# The parentheses matter: without them the first observations are added to
# the count instead of subtracted. The value 834 printed by an earlier run
# came from that unparenthesised formula and is not the number removed.
nrow(df) -
  (nrow(df %>% filter(!(phone_number %in% dup_phone_numbers))) +
     nrow(df_original_phonenumbers))
df <- df %>%
  # drop observations with duplicated phonenumbers
  filter(!(phone_number %in% dup_phone_numbers)) %>%
  # bind back the first observations for the duplicated phone numbers
  rbind(df_original_phonenumbers)
# lab member test accounts, removed from the data
# dylan, #sarah, #james, #saurabh, #kristine
lab_ids <- c("5036136209807958", "7720209621383182", "5059178407427161", "4655082084550853", "4255488774554006")
df <- df %>%
  filter(!(chatfuel.user.id %in% lab_ids))

Clean the variables used in the analysis

# Harmonise the answer codings used in the analysis. Chatfuel and Qualtrics
# record the same answer with different labels, so each variable is mapped to
# one short code; unrecognised non-missing answers become "other" and missing
# answers stay NA.
df_clean <- df %>%
  mutate(country_answer = tolower(country_answer)) %>%
  # anything outside the known country options is set to NA
  mutate(country_answer = ifelse(!(country_answer %in% c("kenya", "ghana", "nigeria", "south africa", "other")), NA, country_answer)) %>%
  mutate(consent = case_when(consent %in% c("yes", "Yes, let's start.") ~ "yes",
                             consent %in% c("no", "No, let's stop here.") ~ "no",
                             is.na(consent) ~ NA_character_,
                             TRUE ~ "other"),
         vax_status = case_when(vax_status %in% c("no", "No, I haven't.", "unvax") ~ "unvax",
                                vax_status %in% c("vax", "yes", "Yes, I have.") ~ "vax",
                                is.na(vax_status) ~ NA_character_,
                                TRUE ~ "other"),
         # lower-case motive with its first period removed
         motive = tolower(str_remove(motive_raw, "\\.")),
         motive_main = case_when(motive_main_raw == "No clear benefit." ~ "benefit",
                                 motive_main_raw %in% c("It's too risky.", "Too risky.") ~ "risky",
                                 motive_main_raw == "Against my beliefs." ~ "beliefs",
                                 motive_main_raw %in% c("No,you misunderstood", "No, you misunderstood", "No, you misunderstood.") ~ "misunderstood",
                                 is.na(motive_main_raw) ~ NA_character_,
                                 TRUE ~ "other"),
         ability_raw = tolower(ability_raw),
         ability = ifelse(ability_raw %in% c("easy", "somewhat hard", "really hard") | is.na(ability_raw), ability_raw, "other"),
         ability_main = case_when(ability_main %in% c("availability", "No availability.") ~ "availability",
                                  ability_main %in% c("time", "No time.") ~ "time",
                                  ability_main %in% c("money", "Too costly.") ~ "money",
                                  ability_main %in% c("misunderstood", "No, you misunderstood.") ~ "misunderstood",
                                  is.na(ability_main) ~ NA_character_,
                                  TRUE ~ "other")) %>%
  select(!ability_raw & !motive_main_raw) %>%
  # Turn empty strings into NA. Only character columns are touched: na_if()
  # casts "" to the column's type in dplyr >= 1.1 and fails on numeric
  # columns, and mutate_all() is superseded by across().
  mutate(across(where(is.character), ~ na_if(., "")))

clean best treatment

# Translate the numeric best_treatment answer into its text label. The meaning
# of a number depends on vax_status and motive, so each combination has its
# own list of options. A missing best_treatment stays NA; any other value that
# matches no rule becomes "other". The result replaces best_treatment.
# First group: unvaccinated, motive "probably not" (options 1-7).
df_clean <- df_clean %>%
  mutate(new_best_treatment = case_when(vax_status == "unvax" & motive == "probably not" & best_treatment == "1" ~ "Job/school required it.",
                                        vax_status == "unvax" & motive == "probably not" & best_treatment == "2" ~ "Family/friend endorses it.",
                                        vax_status == "unvax" & motive == "probably not" & best_treatment == "3" ~ "More safety evidence.",
                                        vax_status == "unvax" & motive == "probably not" & best_treatment == "4" ~ "New trusted info.",
                                        vax_status == "unvax" & motive == "probably not" & best_treatment == "5" ~ "Rewards for vaccinating.",
                                        vax_status == "unvax" & motive == "probably not" & best_treatment == "6" ~ "Nothing.",
                                        vax_status == "unvax" & motive == "probably not" & best_treatment == "7" ~ "No, something else.",
                                        # unvaccinated, motive "unsure" (options 1-11)
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "1" ~ "Job/school required it.",
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "2" ~ "Family/friend endorses it.",
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "3" ~ "More safety evidence.",
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "4" ~ "New trusted info.",
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "5" ~ "Rewards for vaccinating.",
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "6" ~ "Convenient to vaccinate.",
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "7" ~ "Time off work.", 
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "8" ~ "Reminders.", 
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "9" ~ "Appointment.", 
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "10" ~ "Nothing.", 
                                        vax_status == "unvax" & motive == "unsure" & best_treatment == "11" ~ "No, something else.",
                                        # unvaccinated, motive "probably" (options 1-7)
                                        vax_status == "unvax" & motive == "probably" & best_treatment == "1" ~ "Job/school required it.",
                                        vax_status == "unvax" & motive == "probably" & best_treatment == "2" ~ "Convenient to vaccinate.",
                                        vax_status == "unvax" & motive == "probably" & best_treatment == "3" ~ "Time off work.", 
                                        vax_status == "unvax" & motive == "probably" & best_treatment == "4" ~ "Reminders.", 
                                        vax_status == "unvax" & motive == "probably" & best_treatment == "5" ~ "Appointment.", 
                                        vax_status == "unvax" & motive == "probably" & best_treatment == "6" ~ "Nothing.", 
                                        vax_status == "unvax" & motive == "probably" & best_treatment == "7" ~ "No, something else.",
                                        # vaccinated, motive "was unsure" or "no" (options 1-11)
                                        vax_status == "vax" & motive %in% c("was unsure", "no") & best_treatment == "1" ~ "Job/school required it.",
                                        vax_status == "vax" & motive %in% c("was unsure", "no")  & best_treatment == "2" ~ "Family/friend endorses it.",
                                        vax_status == "vax" & motive %in% c("was unsure", "no")  & best_treatment == "3" ~ "More safety evidence.",
                                        vax_status == "vax" & motive %in% c("was unsure", "no") & best_treatment == "4" ~ "New trusted info.",
                                        vax_status == "vax" & motive %in% c("was unsure", "no")  & best_treatment == "5" ~ "Rewards for vaccinating.",
                                        vax_status == "vax" & motive %in% c("was unsure", "no") & best_treatment == "6" ~ "Convenient to vaccinate.",
                                        vax_status == "vax" & motive %in%c("was unsure", "no") & best_treatment == "7" ~ "Time off work.", 
                                        vax_status == "vax" & motive %in%c("was unsure", "no")  & best_treatment == "8" ~ "Reminders.", 
                                        vax_status == "vax" & motive %in% c("was unsure", "no")  & best_treatment == "9" ~ "Appointment.", 
                                        vax_status == "vax" & motive %in% c("was unsure", "no")  & best_treatment == "10" ~ "Nothing.", 
                                        vax_status == "vax" & motive %in% c("was unsure", "no")  & best_treatment == "11" ~ "No, something else.",
                                        # vaccinated, motive "yes" (options 1-7)
                                        vax_status == "vax" & motive == "yes" & best_treatment == "1" ~ "Job/school required it.",
                                        vax_status == "vax" & motive == "yes" & best_treatment == "2" ~ "Convenient to vaccinate.",
                                        vax_status == "vax" & motive == "yes" & best_treatment == "3" ~ "Time off work.", 
                                        vax_status == "vax" & motive == "yes" & best_treatment == "4" ~ "Reminders.", 
                                        vax_status == "vax" & motive == "yes" & best_treatment == "5" ~ "Appointment.", 
                                        vax_status == "vax" & motive == "yes" & best_treatment == "6" ~ "Nothing.", 
                                        vax_status == "vax" & motive == "yes" & best_treatment == "7" ~ "No, something else.", 
                                        # no answer stays missing; everything else is "other"
                                        is.na(best_treatment) ~ NA_character_,
                                        TRUE ~ "other")) %>%
  # swap the numeric column for the labelled one
  select(!best_treatment) %>%
  rename(best_treatment = new_best_treatment)

Clean Motive

| Vax        | Unvax        | Label  |
|------------|--------------|--------|
| Yes        | Probably     | Yes    |
| No         | Probably Not | No     |
| Was Unsure | Unsure       | Unsure |
# Collapse the vaccinated and unvaccinated wordings of motive into one
# three-level label; missing stays NA, anything unrecognised becomes "other".
df_clean <- df_clean %>%
  mutate(
    motive = case_when(
      is.na(motive) ~ NA_character_,
      motive %in% c("yes", "probably") ~ "yes",
      motive %in% c("no", "probably not") ~ "no",
      motive %in% c("unsure", "was unsure") ~ "unsure",
      TRUE ~ "other"
    )
  )

Add IDs for dylan outcomes

# Remove phone numbers, then give every row a running id.
# seq_len() also works on an empty data frame, where 1:nrow() would yield
# c(1, 0) and the assignment would fail.
# NOTE(review): the id is just the row position at this point, so any upstream
# change to row count or order changes which person an id refers to; the
# hand-coded outcome files merged later are keyed on it.
df_clean <- df_clean %>% dplyr::select(!phone_number)
df_clean$id <- seq_len(nrow(df_clean))

Free text nchar/nquestions outcome

# Free-text question columns, grouped by who is asked each question.

# asked of everyone
everyone <- c(
  "opinion_friend_family", "opinion_conversation",
  "best_treatment_proposal", "opinion_other_people_treatment",
  "info_source", "info_confidence_explain", "challenge_have_covid",
  "why_change_mind", "info_likelihood_why", "self_reflection_explain",
  "comfortable_not_.parts", "enjoyable_not_parts", "suggestions",
  "greating_answer"
)

# asked if motive != yes
### Check this - i am seeing inconsistencies
motive_free_text <- c("motive_reason", "motive_other")

# asked if risk_main, benefit_main or belief_main is "no, none of these"
motive_sub_other_free_text <- "motive_sub_other"

# asked if motive_main_raw == "No, you misunderstood."
motive_main_raw_free_text <- "motive_misunderstood_explain"


# asked if ability_raw != "easy"
ability_free_text <- c("ability_reason", "ability_other")

# asked if availability_main, time_main or money_main == "No, something else."
ability_sub_other_free_text <- "ability_sub_other"

# asked if ability_main == "misunderstood"
ability_main_raw_free_text <- "ability_misunderstood_explain"

# condition not sure
best_treatment_none_free_text <- "best_treatment_none_explain"

# asked if best treatment is not one of the buttons, or is "something else"
best_treatment_other_free_text <- "best_treatment_other"


# asked if vax_status == unvax and ability == "easy"
unvax_easy_free_text <- "unvaxeasy_explain"


# Free-text engagement outcome.
# For every group of free-text questions two helper columns are built:
#   <group>_nchar  characters typed across the group's questions (NA counts 0)
#   <group>_nq     number of questions in the group that applied to the row
# A group only counts for a row when its condition holds; otherwise both
# helpers are 0 (or NA when the condition itself is NA, which the row sums
# below skip). The helpers are summed into nchar_all / nq_all, and
# nchar_per_question is their ratio.
df_clean <- df_clean %>%
  mutate(everyone_nchar = rowSums(across(all_of(everyone), nchar), na.rm = TRUE),
         everyone_nq = length(everyone),
         motive_nchar = ifelse(motive != "yes",
                               rowSums(across(all_of(motive_free_text), nchar), na.rm = TRUE),
                               0),
         motive_nq = ifelse(motive != "yes",
                            length(motive_free_text),
                            0),

         motive_sub_other_nchar = ifelse(benefit_main == "No, none of these." | belief_main == "No, none of these." | risk_main == "No, none of these.",
                                         rowSums(across(all_of(motive_sub_other_free_text), nchar), na.rm = TRUE), 0),
         motive_sub_other_nq = ifelse(benefit_main == "No, none of these." | belief_main == "No, none of these." | risk_main == "No, none of these.",
                                      length(motive_sub_other_free_text), 0),
         motive_misunderstood_explain_nchar = ifelse(motive_main == "misunderstood", rowSums(across(all_of(motive_main_raw_free_text), nchar), na.rm = TRUE), 0),
         motive_misunderstood_explain_nq = ifelse(motive_main == "misunderstood", length(motive_main_raw_free_text), 0),

         ability_nchar = ifelse(ability != "easy", rowSums(across(all_of(ability_free_text), nchar), na.rm = TRUE), 0),
         ability_nq = ifelse(ability != "easy", length(ability_free_text), 0),

         ability_sub_other_nchar = ifelse(time_main == "No, something else." | availability_main == "No, something else." | money_main == "No, something else.",
                                          rowSums(across(all_of(ability_sub_other_free_text), nchar), na.rm = TRUE), 0),
         ability_sub_other_nq = ifelse(time_main == "No, something else." | availability_main == "No, something else." | money_main == "No, something else.",
                                       length(ability_sub_other_free_text), 0),
         ability_misunderstood_explain_nchar = ifelse(ability_main == "misunderstood", rowSums(across(all_of(ability_main_raw_free_text), nchar), na.rm = TRUE), 0),
         ability_misunderstood_explain_nq = ifelse(ability_main == "misunderstood", length(ability_main_raw_free_text), 0),
         best_treatment_none_nchar = ifelse(best_treatment == "Nothing.", rowSums(across(all_of(best_treatment_none_free_text), nchar), na.rm = TRUE), 0),
         best_treatment_none_nq = ifelse(best_treatment == "Nothing.", length(best_treatment_none_free_text), 0),

         best_treatment_other_nchar = ifelse(!(best_treatment %in% c("Convenient to vaccinate.", "Job/school required it.", "More safety evidence.",
                                                                      "Time off work.", "Family/friend endorses it.", "New trusted info.", "Reminders.", "Appointment.")),
                                             rowSums(across(all_of(best_treatment_other_free_text), nchar), na.rm = TRUE), 0),
         best_treatment_other_nq = ifelse(!(best_treatment %in% c("Convenient to vaccinate.", "Job/school required it.", "More safety evidence.",
                                                                   "Time off work.", "Family/friend endorses it.", "New trusted info.", "Reminders.", "Appointment.")),
                                          length(best_treatment_other_free_text), 0),
         unvaxeasy_other_nchar = ifelse(vax_status == "unvax" & ability == "easy", rowSums(across(all_of(unvax_easy_free_text), nchar), na.rm = TRUE), 0),
         unvaxeasy_other_nq = ifelse(vax_status == "unvax" & ability == "easy", length(unvax_easy_free_text), 0)

         ) %>%
  # totals over every column whose name contains "nchar" / "nq"
  mutate(nchar_all = rowSums(across(colnames(.)[grepl(pattern = "nchar", x = colnames(.))]), na.rm = TRUE),
         nq_all = rowSums(across(colnames(.)[grepl(pattern = "nq", x = colnames(.))]), na.rm = TRUE),
         nchar_per_question = nchar_all / nq_all) %>%
  # Drop the per-group helpers. The two conditions must be joined with `&`:
  # passed as separate arguments they are combined as a union, and the union
  # of the two complements is every column, so nothing would be dropped.
  select(!ends_with("_nchar") & !ends_with("_nq"))

Randomized qualtrics treatments by complete/incomplete

# T1 / T5 rows counted by treatment arm, randomized flag and completion status
df_clean %>%
  filter(treatment_assign %in% c("T1", "T5")) %>%
  group_by(treatment_assign, randomized, full_complete) %>%
  count()
## # A tibble: 10 × 4
## # Groups:   treatment_assign, randomized, full_complete [10]
##    treatment_assign randomized full_complete     n
##    <chr>                 <dbl> <chr>         <int>
##  1 T1                        0 complete        719
##  2 T1                        0 incomplete       58
##  3 T1                        1 complete        491
##  4 T1                        1 incomplete     1390
##  5 T5                        0 complete       2250
##  6 T5                        0 drop            612
##  7 T5                        0 incomplete      927
##  8 T5                        1 complete        476
##  9 T5                        1 drop             23
## 10 T5                        1 incomplete     1434

Add dylan’s heuristic outcomes

# Hand-coded heuristic outcomes, keyed by `id`.
# In the "none_explain" file the coded column is named
# outcome_heuristics_best_treatment_proposal; it is renamed on load so it does
# not clash with the column from the proposal file.
outcome_none <- here(
  "pilot8", "data", "hand_coded",
  "outcome_heuristics_best_treatment_none_explain.csv"
) %>%
  read.csv(na.strings = "NA") %>%
  select(
    id,
    outcome_heuristics_best_treatment_none = outcome_heuristics_best_treatment_proposal
  )

outcome_proposal <- here(
  "pilot8", "data", "hand_coded",
  "outcome_heuristics_best_treatment_proposal.csv"
) %>%
  read.csv(na.strings = "NA") %>%
  select(id, outcome_heuristics_best_treatment_proposal)

# attach both outcomes; rows without a hand-coded value keep NA
df_clean <- merge(
  merge(df_clean, outcome_none, by = "id", all.x = TRUE),
  outcome_proposal,
  by = "id", all.x = TRUE
)

Filter out participants who have gone through the script multiple times

# Flag rows whose answers come from mutually exclusive branches of the script.
# Each *_q column records whether a branch-specific question was answered;
# the mult_* columns mark combinations of branches on the same row, and
# `overwriting` is 1 when any of them is set.
df_clean <- df_clean %>%
  mutate(
    # answered both the vaccinated-only and the unvaccinated-only question
    vax_q = !is.na(why_change_mind),
    unvax_q = !is.na(vax_future),
    both_q = vax_q & unvax_q,
    # which ability sub-branches were answered
    avail_q = !is.na(availability_main),
    time_q = !is.na(time_main),
    money_q = !is.na(money_main),
    misunderstood_ability_q = ability_main %in% "misunderstood"
  ) %>%
  rowwise() %>%
  mutate(
    # more than one ability sub-branch answered
    mult_ability = sum(avail_q, time_q, money_q, misunderstood_ability_q) > 1,
    # which motive sub-branches were answered
    belief_q = !is.na(belief_main),
    risk_q = !is.na(risk_main),
    benefit_q = !is.na(benefit_main),
    misunderstood_q = motive_main %in% "misunderstood",
    # more than one motive sub-branch answered
    mult_motive = sum(belief_q, risk_q, benefit_q, misunderstood_q) > 1,
    # motive is "yes" but a motive follow-up was still answered
    mult_motive2 = motive %in% "yes" &
      (!is.na(motive_reason) | !is.na(motive_other) |
         !is.na(motive_misunderstood_explain) | !is.na(belief_main) |
         !is.na(benefit_main) | !is.na(risk_main)),
    # ability is "easy" but an ability follow-up was still answered
    mult_ability2 = ability %in% "easy" &
      (!is.na(ability_reason) | !is.na(ability_other) |
         !is.na(ability_misunderstood_explain) | !is.na(time_main) |
         !is.na(money_main) | !is.na(availability_main)),
    overwriting = sum(mult_motive | mult_motive2 | mult_ability2 | mult_ability | both_q)
  )

# overwriting by treatment
df_clean %>%
  select( overwriting, treatment_assign) %>%
  filter(overwriting == 1) %>%
  group_by(overwriting,  treatment_assign) %>% 
  count()
## # A tibble: 5 × 3
## # Groups:   overwriting, treatment_assign [5]
##   overwriting treatment_assign     n
##         <int> <chr>            <int>
## 1           1 T1                   4
## 2           1 T2                 121
## 3           1 T3                 135
## 4           1 T4                 119
## 5           1 T5                   3
df_clean <- df_clean %>%
  filter(overwriting != 1) %>% 
  select(!overwriting)

Day time Cleaning

# Bring the sign-up time back by chatfuel id. Non-randomized rows get NA, and
# the raw `signed.up` column is replaced by `signed_up`.
df_clean <- df_clean %>%
  merge(chatfeul_signup, by = "chatfuel.user.id", all.x = TRUE) %>%
  mutate(signed_up = ifelse(randomized == 0, NA, signed.up)) %>%
  select(-signed.up)

nrow(df_clean)
## [1] 13572
# Derive calendar features from the sign-up time and keep only August
# sign-ups (rows without a sign-up time are kept too).
df_clean <- df_clean %>%
  mutate(
    main_start_time = ymd_hms(signed_up),
    month = month(signed_up, label = TRUE),
    hour = hour(signed_up),
    wday = wday(signed_up, label = TRUE),
    # daytime is 08:00 up to, but not including, 20:00
    time_group = ifelse(hour >= 8 & hour < 20, "8 am to 8 pm", "8 pm to 8 am"),
    # weekday and time-of-day label, NA when either part is missing
    day_time_group = ifelse(
      is.na(wday) | is.na(time_group),
      NA,
      paste(wday, time_group, sep = " ")
    )
  ) %>%
  filter(month == "Aug" | is.na(month))

nrow(df_clean)
## [1] 13331

Clean comfortable and enjoyable

# Remove the first period from the comfortable / enjoyable answers
df_clean <- df_clean %>%
  mutate(
    comfortable = str_remove(comfortable, "\\."),
    enjoyable = str_remove(enjoyable, "\\.")
  )

Merge Ads

# Attach the ad information, then gender, by chatfuel id
df_clean <- merge(
  df_clean, ad_to_qualtrics_id,
  by = "chatfuel.user.id", all.x = TRUE
)
df_clean <- df_clean %>%
  merge(chatfuel_gender, by = "chatfuel.user.id", all.x = TRUE) %>%
  # we don't know gender of non randomized folks not in chatfuel
  mutate(gender = ifelse(randomized == 0, NA, gender)) %>%
  # lower-case the demographic answers so they compare cleanly
  mutate(across(
    c(ethnicity, education, location, religion, religiosity),
    tolower
  ))



# Clean up the demographic variables.
#
# Every demographic column is limited to its known answer options: a value
# outside the options becomes "other" and NA stays NA. Any religion containing
# "christ" (case-insensitive) is first mapped to "christian".
#
# df: data frame with columns gender, ethnicity, education, location,
#     religion and religiosity.
# Returns df with those six columns recoded.
clean_up_demog <- function(df) {
  # replace values that are neither allowed nor NA with "other"
  recode_unlisted <- function(values, allowed) {
    values[!(values %in% c(allowed, NA_character_))] <- "other"
    values
  }

  df$gender <- recode_unlisted(df$gender, c("male", "female"))
  df$ethnicity <- recode_unlisted(
    df$ethnicity,
    c("black or african", "white or caucasian", "prefer not to say",
      "asian or indian", "other")
  )
  df$education <- recode_unlisted(
    df$education,
    c("< high school", "high school", "some college", "2-year degree",
      "4-year degree", "graduate degree", "prefer not to say")
  )
  df$location <- recode_unlisted(
    df$location,
    c("urban", "suburban", "rural", "prefer not to say")
  )

  # fold every spelling that mentions "christ" into one label first
  df$religion[grepl("christ", tolower(df$religion))] <- "christian"
  df$religion <- recode_unlisted(
    df$religion,
    c("christian", "african traditional", "islam", "hinduism", "judaism",
      "no religion", "prefer not to say")
  )
  df$religiosity <- recode_unlisted(
    df$religiosity,
    c("somewhat religious", "not very religious", "very religious",
      "prefer not to say")
  )

  df
}

# Recode the demographic columns of the analysis data frame in place
df_clean <- clean_up_demog(df_clean)

Clean MCQ

# Clean MCQs
# Make sure values match the encoded responses, any leftovers marked as "other".
#
# The sub-question columns (availability_main, money_main, time_main,
# belief_main, risk_main, benefit_main) are lower-cased and lose their first
# period before being checked. NA always stays NA.
#
# df: data frame with columns ability_main, ability, availability_main,
#     money_main, time_main, motive, belief_main, risk_main, benefit_main,
#     vax_status and vax_future.
# Returns df with those columns recoded.
clean_up_mcq <- function(df) {
  # set every value of `col` that is neither allowed nor NA to "other"
  mark_other <- function(data, col, allowed) {
    data[!(data[[col]] %in% c(allowed, NA_character_)), col] <- "other"
    data
  }
  # lower-case and remove the first literal period
  normalise <- function(values) {
    sub(".", "", tolower(values), fixed = TRUE)
  }

  # ability
  df <- mark_other(df, "ability_main", c("availability", "money", "other", "time", "misunderstood"))
  # The check and the assignment both target `ability`. Writing the result to
  # "ability_raw" (a column dropped earlier in the pipeline) would leave
  # `ability` untouched and add a stray column.
  df <- mark_other(df, "ability", c("easy", "really hard", "somewhat hard"))

  df$availability_main <- normalise(df$availability_main)
  df$availability_main <- ifelse(df$availability_main %in% c("it's too far away", "it was too far away"), "too far away", df$availability_main)
  df <- mark_other(df, "availability_main", c("no vaccines left", "too far away", "no, something else"))

  df$money_main <- normalise(df$money_main)
  df <- mark_other(df, "money_main", c("no cash", "no insurance", "travel costs", "no, something else"))

  df$time_main <- normalise(df$time_main)
  df <- mark_other(df, "time_main", c("no childcare", "hard to get off work", "no time to research", "no, something else"))

  # motive
  df <- mark_other(df, "motive", c("no", "yes", "unsure"))

  df$belief_main <- normalise(df$belief_main)
  df <- mark_other(df, "belief_main", c("not trust pharm/gov", "freedom to choose", "religious reasons", "no, none of these"))

  df$risk_main <- normalise(df$risk_main)
  df <- mark_other(df, "risk_main", c("bad side effects", "not trust pharm/gov", "not enough testing", "no, none of these"))

  df$benefit_main <- normalise(df$benefit_main)
  df$benefit_main <- ifelse(df$benefit_main %in% c("i had covid already"), "had covid before", df$benefit_main)
  df <- mark_other(df, "benefit_main", c("had covid before", "unlikely to get sick", "covid not dangerous", "vaccines don't work", "no, none of these"))

  df <- mark_other(df, "vax_status", c("unvax", "vax"))

  df <- mark_other(df, "vax_future", c("Sure", "No", "Maybe"))

  df
}

# apply the multiple-choice clean-up to the merged data
df_clean <- clean_up_mcq(df_clean)

Numeric Columns

# Add numeric encodings of the cleaned answers and turn location / ethnicity
# into factors.
# Adds vax_status_num, age, gender_num, education_num, religiosity_num,
# location_num and black; answers without a listed code become NA.
numeric_cols <- function(df){
  df %>%
    mutate(
      # 1 = vax, 0 = unvax, NA for anything else (including missing)
      vax_status_num = ifelse(vax_status %in% "vax", 1, ifelse(vax_status %in% "unvax", 0, NA)),
      # only whole ages 18..99 are accepted; everything else becomes NA
      age = as.numeric(ifelse(cv_age %in% as.character(18:99), cv_age, "")),
      gender_num = case_when(gender == "female" ~ 1, gender == "male" ~ 0),
      # ordinal codes: the position of the answer in the ordered list
      education_num = as.numeric(match(
        education,
        c("< high school", "high school", "some college", "2-year degree", "4-year degree", "graduate degree")
      )),
      religiosity_num = as.numeric(match(
        religiosity,
        c("not very religious", "somewhat religious", "very religious")
      )),
      location_num = as.numeric(match(location, c("rural", "suburban", "urban"))),
      # indicator for "black or african"; NA when ethnicity is missing
      black = ifelse(ethnicity == "black or african", 1, 0)
    ) %>%
    # factor conversions come last so the codes above see the plain values
    mutate(
      location = factor(location, levels = c("urban", "suburban", "rural", "other", "prefer not to say")),
      ethnicity = factor(
        ethnicity,
        levels = c("asian or indian", "black or african", "white or caucasian", "other", "prefer not to say")
      )
    )
}
# add the numeric / factor encodings to the cleaned data
df_clean <- numeric_cols(df_clean)

Add in hand coded values

# Read the three hand-coded consistency files and attach them to df_clean.
# select() is namespace-qualified, as elsewhere in this file, so the step
# does not depend on which attached package currently owns `select`.
motive_consistency <- read.csv(here("pilot8", "data", "hand_coded", "motive_reason_consistency.csv")) %>%
  # keep everything except these three columns (vax_status already exists in
  # df_clean, so keeping it would yield duplicated .x/.y columns in the merge)
  dplyr::select(!Note & !vax_status & !motive_reason)
opinion_consistency <- read.csv(here("pilot8", "data", "hand_coded", "opinion_friend_family_consistency.csv")) %>%
  dplyr::select(id, opinion_friend_family_consistency, opinion_friend_family_consistency_2)

motive_track_consistency <- read.csv(here("pilot8", "data", "hand_coded", "motive_track_consistency.csv")) %>%
  dplyr::select(id, motive_track_consistency)

# left joins on id: rows of df_clean without a hand-coded entry are kept
# NOTE(review): assumes id is unique in each hand-coded file; duplicates
# would multiply df_clean rows -- confirm.
df_clean <- merge(df_clean, motive_consistency, by = "id", all.x = TRUE)
df_clean <- merge(df_clean, opinion_consistency, by = "id", all.x = TRUE)
df_clean <- merge(df_clean, motive_track_consistency, by = "id", all.x = TRUE)

Dylan’s csv with shutdown

# Save df_clean before the rows flagged "drop" are removed; row names are
# written out as well.
write.csv(
  x = df_clean,
  file = here("pilot8", "data", "full_df_clean_with_shutdown.csv"),
  row.names = TRUE
)

Drop T5 shutdown observations

# how many will be dropped?
df_clean %>%
  filter(full_complete == "drop") %>%
  nrow()
## [1] 635
# NOTE(review): dplyr's filter() also removes rows whose condition is NA, so
# any row with a missing full_complete is removed here too without being
# part of the count above -- confirm that is intended.
df_clean <- df_clean %>%
  filter(!(full_complete == "drop"))
# save the final data (row names included) and read it back in
write.csv(
  x = df_clean,
  file = here("pilot8", "data", "full_df_clean.csv"),
  row.names = TRUE
)
test <- read.csv(here("pilot8", "data", "full_df_clean.csv"))