# Full data: read the cleaned pilot-8 export and normalise its text columns.
df_full_v8 <-
  read_csv(here("pilot8/data/full_df_clean.csv")) %>%
  clean_names() %>%
  # strip embedded newlines and tabs from every character column
  mutate(
    across(where(is.character), ~ str_replace_all(.x, "[\n\t]", ""))
  ) %>%
  mutate(
    # any answer containing "yes" becomes "yes", every other answer "no"
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    # any answer containing "risk" is collapsed to the single label "risk"
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main),
    # drop periods, then sentence-case the label
    best_treatment = str_remove_all(best_treatment, "\\.") %>%
      str_to_sentence(),
    # relabel "New trusted info" as "More safety evidence"
    best_treatment = if_else(
      best_treatment == "New trusted info",
      "More safety evidence",
      best_treatment
    )
  ) %>%
  remove_empty("rows")

# Keep rows marked "complete" that also have a non-missing vax_status.
df <- df_full_v8 %>%
  filter(full_complete == "complete", !is.na(vax_status))


# Integer indicator: 1L where `x` contains the regex `pattern`, 0L otherwise.
# Missing values in `x` are coded 0L.
flag_detect <- function(x, pattern) {
  if_else(str_detect(x, pattern), 1L, 0L, missing = 0L)
}

# Integer indicator: 1L where `x` equals `value`, 0L otherwise.
# Missing values in `x` are coded 0L.
flag_equal <- function(x, value) {
  if_else(x == value, 1L, 0L, missing = 0L)
}

# Feature table: heuristic flag columns plus recoded survey answers,
# demographics and the sentiment score of the opinion_conversation text.
# All 0/1 indicators built here are integer columns.
df_features <-
  df %>%
  bind_cols(
    # one average sentiment score per opinion_conversation entry
    df %>%
      pull(opinion_conversation) %>%
      get_sentences() %>%
      sentiment_by() %>%
      transmute(opinion_conv_sentiment = ave_sentiment)
  ) %>%
  mutate(
    # 0 when the answer contains "No"/"no" or is missing, 1 otherwise
    covid_already = if_else(
      str_detect(covid_already, "No|no"), 0L, 1L,
      missing = 0L
    ),
    # stays NA when motive is missing (no default is applied here)
    no_motive = case_when(
      motive == "yes" ~ 0L,
      motive == "no" ~ 1L
    ),
    motive_elaboration = motive_nchar,
    # any non-missing answer other than "easy" is flagged; missing is 0
    no_ability = if_else(ability != "easy", 1L, 0L, missing = 0L),
    ability_elaboration = ability_nchar %>% replace_na(0),

    # main motive
    against_beliefs = flag_equal(motive_main, "beliefs"),
    no_benefits = flag_equal(motive_main, "benefit"),
    risky = flag_equal(motive_main, "risk"),

    # main ability barrier
    no_time = flag_equal(ability_main, "time"),
    no_money = flag_equal(ability_main, "money"),
    no_availability = flag_equal(ability_main, "availability"),

    # risk
    bad_side_effects = flag_detect(risk_main, "bad side effects"),
    lack_of_testing = flag_detect(risk_main, "not enough testing"),
    # flagged when either risk_main or belief_main mentions it
    not_trust_pharma = if_else(
      str_detect(risk_main, "not trust phar") |
        str_detect(belief_main, "not trust phar"),
      1L, 0L,
      missing = 0L
    ),
    # benefit
    covid_not_dangerous = flag_detect(benefit_main, "covid not dangerous"),
    # had_covid_before = flag_detect(benefit_main, "had covid before"),
    vaccines_dont_work = flag_detect(benefit_main, "vaccines don't work"),
    # belief
    freedom_to_choose = flag_detect(belief_main, "freedom to choose"),
    religious_reasons = flag_detect(belief_main, "religious reasons"),

    # time
    no_time_off_work = flag_detect(time_main, "hard to get off work"),
    no_time_to_research = flag_detect(time_main, "no time to research"),
    no_childcare = flag_detect(time_main, "no childcare"),
    # money
    no_cash = flag_detect(money_main, "no cash"),
    no_insurance = flag_detect(money_main, "no insurance"),
    travel_costs = flag_detect(money_main, "travel costs"),
    # availability
    no_vax_left = flag_detect(availability_main, "no vaccines left"),
    too_far = flag_detect(availability_main, "too far away"),

    # case-insensitive match on "very"
    info_confidence_high = flag_detect(str_to_lower(info_confidence), "very"),
    want_link = flag_detect(want_link, "Sure"),
    want_answer = flag_detect(want_answer, "Sure"),
    self_reflection = flag_equal(self_reflection, "A lot!"),

    # demographics (age is carried through unchanged)
    education = education_num,
    religiosity = religiosity_num,
    location = location_num,
    black_or_african = flag_equal(ethnicity, "black or african"),
    vaccinated = vax_status_num,
    nigeria = flag_equal(country_answer, "nigeria"),
    kenya = flag_equal(country_answer, "kenya"),
    ghana = flag_equal(country_answer, "ghana"),
    south_africa = flag_equal(country_answer, "south africa"),
    opinion_conv_sentiment = opinion_conv_sentiment %>% replace_na(0)
  ) %>%
  # Final column order. The info_source_hr_* flags are placed directly
  # before info_confidence_high; the raw info_confidence text is not kept.
  select(
    starts_with(c("impediments_hr_", "best_treat_", "opinion_hr_")),
    covid_already, no_motive, motive_elaboration, no_ability,
    ability_elaboration, against_beliefs, no_benefits, risky,
    no_time, no_money, no_availability,
    bad_side_effects, lack_of_testing, not_trust_pharma,
    covid_not_dangerous, vaccines_dont_work, freedom_to_choose,
    religious_reasons, no_time_off_work, no_time_to_research,
    no_childcare, no_cash, no_insurance, travel_costs, no_vax_left, too_far,
    starts_with("info_source_hr_"), info_confidence_high,
    want_link, want_answer, self_reflection,
    age, education, religiosity, location, black_or_african, vaccinated,
    nigeria, kenya, ghana, south_africa,
    best_treatment, opinion_conv_sentiment, post_want_vax
  )

# Feature rows whose vaccinated flag is 0.
df_features_unvax <- filter(df_features, vaccinated == 0)

# Survey rows whose vax_status_num is 0, with all original columns kept.
df_unvax <- filter(df, vax_status_num == 0)

1 Impediments

  1. Keep only observations with a non-missing value in at least one of motive_reason, ability_reason, motive_other, or ability_other.

1.1 Classification Rate

# Share of df_unvax rows with at least one impediments_hr_* flag set, among
# rows that have any motive/ability free-text answer. Returns a string such
# as "78%", and "0%" (not an empty vector) when no row is classified.
df_unvax %>%
  filter(
    !is.na(motive_reason) | !is.na(ability_reason) |
      !is.na(motive_other) | !is.na(ability_other)
  ) %>%
  select(starts_with("impediments_hr_")) %>%
  transmute(classified = rowSums(.) > 0) %>%
  summarise(
    # rows whose flag total is NA stay in the denominator as unclassified
    percent_classified = paste0(
      round(sum(classified, na.rm = TRUE) / n() * 100), "%"
    )
  ) %>%
  pull(percent_classified)
## [1] "78%"

1.2 Number of Categories per Observation

# How many impediments_hr_* flags each row has, tabulated as counts and as
# a percentage of the filtered rows.
df_unvax %>%
  filter(
    !is.na(motive_reason) | !is.na(ability_reason) |
      !is.na(motive_other) | !is.na(ability_other)
  ) %>%
  select(starts_with("impediments_hr_")) %>%
  mutate(number_of_categories = rowSums(.)) %>%
  count(number_of_categories, name = "number_of_observations") %>%
  mutate(
    percent_of_observations = paste0(
      round(number_of_observations / sum(number_of_observations) * 100), "%"
    )
  )

1.3 Number of Observations in Each Category

# Column totals of the impediments_hr_* flags, in long format with the
# prefix stripped from the names, largest count first.
df_unvax %>%
  filter(
    !is.na(motive_reason) | !is.na(ability_reason) |
      !is.na(motive_other) | !is.na(ability_other)
  ) %>%
  select(starts_with("impediments_hr_")) %>%
  summarise(across(everything(), sum)) %>%
  rename_with(~ str_remove(.x, "impediments_hr_")) %>%
  pivot_longer(everything(), names_to = "Heuristic", values_to = "Count") %>%
  arrange(desc(Count))

1.4 Correlations

# Correlation plot of the impediments_hr_* flags (prefix stripped from the
# labels), computed on pairwise-complete observations.
df_features_unvax %>%
  select(starts_with("impediments_hr_")) %>%
  rename_with(~ str_remove(.x, "impediments_hr_")) %>%
  cor(use = "pairwise.complete.obs") %>%
  ggcorrplot(type = "lower", lab = TRUE, lab_size = 12 / .pt, tl.cex = 10) +
  labs(
    y = "",
    x = "",
    title = "Correlation Matrix: \nImpediment Heuristics"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2 Info source

  1. Keep only observations with a non-missing value for info_source.

2.1 Classification Rate

# Share of df_unvax rows with at least one info_source_hr_* flag set, among
# rows with a non-missing info_source. Returns a string such as "81%", and
# "0%" (not an empty vector) when no row is classified.
df_unvax %>%
  filter(!is.na(info_source)) %>%
  select(starts_with("info_source_hr_")) %>%
  transmute(classified = rowSums(.) > 0) %>%
  summarise(
    # rows whose flag total is NA stay in the denominator as unclassified
    percent_classified = paste0(
      round(sum(classified, na.rm = TRUE) / n() * 100), "%"
    )
  ) %>%
  pull(percent_classified)
## [1] "81%"

2.2 Number of Categories per Observation

# How many info_source_hr_* flags each row has, tabulated as counts and as
# a percentage of the filtered rows.
df_unvax %>%
  filter(!is.na(info_source)) %>%
  select(starts_with("info_source_hr_")) %>%
  mutate(number_of_categories = rowSums(.)) %>%
  count(number_of_categories, name = "number_of_observations") %>%
  mutate(
    percent_of_observations = paste0(
      round(number_of_observations / sum(number_of_observations) * 100), "%"
    )
  )

2.3 Number of Observations in Each Category

# Column totals of the info_source_hr_* flags, in long format with the
# prefix stripped from the names, largest count first.
df_unvax %>%
  filter(!is.na(info_source)) %>%
  select(starts_with("info_source_hr_")) %>%
  summarise(across(everything(), sum)) %>%
  rename_with(~ str_remove(.x, "info_source_hr_")) %>%
  pivot_longer(everything(), names_to = "Heuristic", values_to = "Count") %>%
  arrange(desc(Count))

2.4 Correlations

# Correlation plot of the info_source_hr_* flags (prefix stripped from the
# labels), computed on pairwise-complete observations.
df_features_unvax %>%
  select(starts_with("info_source_hr_")) %>%
  rename_with(~ str_remove(.x, "info_source_hr_")) %>%
  cor(use = "pairwise.complete.obs") %>%
  ggcorrplot(type = "lower", lab = TRUE, lab_size = 12 / .pt, tl.cex = 10) +
  labs(
    y = "",
    x = "",
    title = "Correlation Matrix: \nInfo Source Heuristics"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3 Best Treatment Proposal

  1. Keep only observations with a non-missing value for best_treatment_proposal.

3.1 Classification Rate

# Share of df_unvax rows with at least one best_treat_* flag set, among rows
# with a non-missing best_treatment_proposal. Returns a string such as
# "65%", and "0%" (not an empty vector) when no row is classified.
df_unvax %>%
  filter(!is.na(best_treatment_proposal)) %>%
  select(starts_with("best_treat_")) %>%
  transmute(classified = rowSums(.) > 0) %>%
  summarise(
    # rows whose flag total is NA stay in the denominator as unclassified
    percent_classified = paste0(
      round(sum(classified, na.rm = TRUE) / n() * 100), "%"
    )
  ) %>%
  pull(percent_classified)
## [1] "65%"

3.2 Number of Categories per Observation

# How many best_treat_* flags each row has, tabulated as counts and as a
# percentage of the filtered rows.
df_unvax %>%
  filter(!is.na(best_treatment_proposal)) %>%
  select(starts_with("best_treat_")) %>%
  mutate(number_of_categories = rowSums(.)) %>%
  count(number_of_categories, name = "number_of_observations") %>%
  mutate(
    percent_of_observations = paste0(
      round(number_of_observations / sum(number_of_observations) * 100), "%"
    )
  )

3.3 Number of Observations in Each Category

# Column totals of the best_treat_* flags, in long format with the prefix
# stripped from the names, largest count first.
df_unvax %>%
  filter(!is.na(best_treatment_proposal)) %>%
  select(starts_with("best_treat_")) %>%
  summarise(across(everything(), sum)) %>%
  rename_with(~ str_remove(.x, "best_treat_")) %>%
  pivot_longer(everything(), names_to = "Heuristic", values_to = "Count") %>%
  arrange(desc(Count))

3.4 Correlations

# Correlation plot of the best_treat_* flags (prefix stripped from the
# labels), computed on pairwise-complete observations.
df_features_unvax %>%
  select(starts_with("best_treat_")) %>%
  rename_with(~ str_remove(.x, "best_treat_")) %>%
  cor(use = "pairwise.complete.obs") %>%
  ggcorrplot(type = "lower", lab = TRUE, lab_size = 12 / .pt, tl.cex = 10) +
  labs(
    y = "",
    x = "",
    title = "Correlation Matrix: \nBest Treatment Proposal Heuristics"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

4 Opinion

  1. Keep only observations with a non-missing value in at least one of opinion_friend_family or opinion_conversation.

4.1 Classification Rate

# Share of df_unvax rows with at least one opinion_hr_* flag set, among rows
# with a non-missing opinion_friend_family or opinion_conversation. Returns
# a string such as "70%", and "0%" (not an empty vector) when no row is
# classified.
df_unvax %>%
  filter(!is.na(opinion_friend_family) | !is.na(opinion_conversation)) %>%
  select(starts_with("opinion_hr_")) %>%
  transmute(classified = rowSums(.) > 0) %>%
  summarise(
    # rows whose flag total is NA stay in the denominator as unclassified
    percent_classified = paste0(
      round(sum(classified, na.rm = TRUE) / n() * 100), "%"
    )
  ) %>%
  pull(percent_classified)
## [1] "70%"

4.2 Number of Categories per Observation

# How many opinion_hr_* flags each row has, tabulated as counts and as a
# percentage of the filtered rows.
df_unvax %>%
  filter(!is.na(opinion_friend_family) | !is.na(opinion_conversation)) %>%
  select(starts_with("opinion_hr_")) %>%
  mutate(number_of_categories = rowSums(.)) %>%
  count(number_of_categories, name = "number_of_observations") %>%
  mutate(
    percent_of_observations = paste0(
      round(number_of_observations / sum(number_of_observations) * 100), "%"
    )
  )

4.3 Number of Observations in Each Category

# Column totals of the opinion_hr_* flags, in long format with the prefix
# stripped from the names, largest count first.
df_unvax %>%
  filter(!is.na(opinion_friend_family) | !is.na(opinion_conversation)) %>%
  select(starts_with("opinion_hr_")) %>%
  summarise(across(everything(), sum)) %>%
  rename_with(~ str_remove(.x, "opinion_hr_")) %>%
  pivot_longer(everything(), names_to = "Heuristic", values_to = "Count") %>%
  arrange(desc(Count))

4.4 Correlations

# Correlation plot of the opinion_hr_* flags (prefix stripped from the
# labels), computed on pairwise-complete observations.
df_features_unvax %>%
  select(starts_with("opinion_hr_")) %>%
  rename_with(~ str_remove(.x, "opinion_hr_")) %>%
  cor(use = "pairwise.complete.obs") %>%
  ggcorrplot(type = "lower", lab = TRUE, lab_size = 12 / .pt, tl.cex = 10) +
  labs(
    y = "",
    x = "",
    title = "Correlation Matrix: \nOpinion Heuristics"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))