1 Overview

The present user segmentation analysis is based on 2,309 unvaccinated responsdents completing our survey for pilot wave 8 (August 2022), and 144 features (both MCQ and free text). The related GitHub issue is here.

We take the following steps to pinpoint user segments from pilot 8 data:

We start by hypothesizing 17 user segments based on our understanding of the pilot 8 data (N = 2309 unvaccinated participants) (Section 2)
For each of the 17 user segments, we refer to our data dictionary and identify variables (or logical combinations of variables) that help identify each user segment in the pilot data (Section 2)
We drop segments that have less than 50 users in them (5 segments in the current script) (Section 3)
For each of 12 user segment left after step 3, we generate four sets of covariate heatmap tables: (Section 6)
- Set 1: Compares user segments on vaccine impediment related covariates
- Set 2: Compares user segments on preferred treatment related covariates
- Set 3: Compares user segments on vaccine opinion related covariates
- Set 4: Compares user segments on vaccine information sources and demographic covariates
Our next steps would be to propose new questions for further refining these segments, and to propose what treatments would work for each segment.

# Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, glmnet, gamlr, tm, topicmodels, ldatuning, lda, SnowballC, olsrr, here)

set.seed(94305)

# read in CURRENT chatfuel data

# full data
df_full_v8 <- 
  read_csv(here("pilot8/data/full_df_clean.csv")) %>% 
  clean_names() %>% 
  mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
  mutate(
    motive = if_else(str_detect(motive, "yes"), "yes", "no"),
    motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main),
    best_treatment = str_remove_all(best_treatment, "\\.") %>% str_to_sentence(),
    best_treatment = if_else(best_treatment == "New trusted info", "More safety evidence", best_treatment),
  ) %>%
  remove_empty("rows")

# filter to completes and add features
df <-
  df_full_v8 %>%
  filter(full_complete == "complete") %>% 
  drop_na(vax_status)

2 Defining segments and associated heuristics

We define the following 17 segments:

2.1 List

Segment 1: Heard bad things about the vaccine (N = 284)
Segment 2: Haven’t gotten around to getting the vaccine (N = 591)
Segment 3: Covid not relevant anymore, pandemic over (N = 255)
Segment 4: Covid did not exist (N = 20)
Segment 5: Misinformation (N = 69)
Segment 6: Nothing would work (N = 299)
Segment 7: Scared of side effects (N = 313)
Segment 8: Side effects with pregnancy/maternity (N = 16)
Segment 9: Scared of needles/injections (N = 51)
Segment 10: Death of someone known (N = 145)
Segment 11: Against principles related to freedom/choice (N = 120)
Segment 12: Against religious beliefs (N = 8)
Segment 13: Believe body is healthy/immune system is strong (N = 5)
Segment 14: Super busy and poor (N = 411)
Segment 15: Need time off work to get vaxxed (N = 133)
Segment 16: Can’t get off work to get vaxxed and for a long time to deal with side effects (N = 24)
Segment 17: Too far away from vaccination site (N = 337)

# clean up demographic variables

df_features <-
  df %>% 
  bind_cols(
    df %>%
      pull(opinion_conversation) %>%
      get_sentences() %>%
      sentiment_by() %>% 
      transmute(opinion_conv_sentiment = ave_sentiment)
  ) %>%
  mutate(
    
    covid_already = if_else(str_detect(covid_already, "No|no"), 0L, 1L) %>% replace_na(0),
    no_motive = case_when(
      motive == "yes" ~ 0L,
      motive == "no" ~ 1L,
    ),
    motive_elaboration = motive_nchar,
    # post_want_vax,
    no_ability = case_when(
      ability == "easy" ~ 0L,
      ability != "easy" & !is.na(ability) ~ 1L,
    ) %>% replace_na(0),
    ability_elaboration = ability_nchar %>% replace_na(0),
    
    against_beliefs = if_else(motive_main == "beliefs", 1L, 0L) %>% replace_na(0),
    no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0),
    risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0),
    
    no_time = if_else(ability_main == "time", 1L, 0L) %>% replace_na(0),
    no_money = if_else(ability_main == "money", 1L, 0L) %>% replace_na(0),
    no_availability = if_else(ability_main == "availability", 1L, 0L) %>% replace_na(0),
    
    # risk
    bad_side_effects = if_else(str_detect(risk_main, "bad side effects"), 1L, 0L) %>% replace_na(0),
    lack_of_testing = if_else(str_detect(risk_main, "not enough testing"), 1L, 0L) %>% replace_na(0),
    not_trust_pharma = if_else(str_detect(risk_main, "not trust phar") | str_detect(belief_main, "not trust phar"), 1L, 0L) %>% replace_na(0),
    # benefit
    covid_not_dangerous = if_else(str_detect(benefit_main, "covid not dangerous"), 1L, 0L) %>% replace_na(0),
    # had_covid_before = if_else(str_detect(benefit_main, "had covid before"), 1L, 0L) %>% replace_na(0),
    vaccines_dont_work = if_else(str_detect(benefit_main, "vaccines don't work"), 1L, 0L) %>% replace_na(0),
    # belief
    freedom_to_choose = if_else(str_detect(belief_main, "freedom to choose"), 1L, 0L) %>% replace_na(0),
    religious_reasons = if_else(str_detect(belief_main, "religious reasons"), 1L, 0L) %>% replace_na(0),
    
    # time
    no_time_off_work = if_else(str_detect(time_main, "hard to get off work"), 1L, 0L) %>% replace_na(0),
    no_time_to_research = if_else(str_detect(time_main, "no time to research"), 1L, 0L) %>% replace_na(0),
    no_childcare = if_else(str_detect(time_main, "no childcare"), 1L, 0L) %>% replace_na(0),
    # money
    no_cash = if_else(str_detect(money_main, "no cash"), 1L, 0L) %>% replace_na(0),
    no_insurance = if_else(str_detect(money_main, "no insurance"), 1L, 0L) %>% replace_na(0),
    travel_costs = if_else(str_detect(money_main, "travel costs"), 1L, 0L) %>% replace_na(0),
    # availability
    no_vax_left = if_else(str_detect(availability_main, "no vaccines left"), 1L, 0L) %>% replace_na(0),
    too_far = if_else(str_detect(availability_main, "too far away"), 1L, 0L) %>% replace_na(0),
    info_confidence = str_to_lower(info_confidence),
    info_confidence_high = if_else(str_detect(info_confidence, "very"), 1L, 0L) %>% replace_na(0),
    want_link = if_else(str_detect(want_link, "Sure"), 1L, 0L) %>% replace_na(0),
    want_answer = if_else(str_detect(want_answer, "Sure"), 1L, 0L) %>% replace_na(0),
    self_reflection = if_else(self_reflection == "A lot!", 1L, 0L) %>% replace_na(0),
    
    # demographics
    age,
    education = education_num,
    religiosity = religiosity_num,
    location = location_num,
    black_or_african = if_else(ethnicity == "black or african", 1L, 0L) %>% replace_na(0),
    vaccinated = vax_status_num,
    nigeria = if_else(country_answer == "nigeria", 1, 0) %>% replace_na(0),
    kenya = if_else(country_answer == "kenya", 1, 0) %>% replace_na(0),
    ghana = if_else(country_answer == "ghana", 1, 0) %>% replace_na(0),
    south_africa = if_else(country_answer == "south africa", 1, 0) %>% replace_na(0),
    opinion_conv_sentiment = opinion_conv_sentiment %>% replace_na(0)
  ) %>% 
  select(starts_with(c("impediments_hr_", "info_source_hr_", "best_treat_", "opinion_hr_")), 
       covid_already, no_motive, motive_elaboration, no_ability, 
       ability_elaboration, against_beliefs, no_benefits, risky, no_time, no_money, no_availability,
       bad_side_effects, lack_of_testing, not_trust_pharma,
       covid_not_dangerous, vaccines_dont_work, freedom_to_choose,
       religious_reasons, no_time_off_work, no_time_to_research,
       no_childcare, no_cash, no_insurance, travel_costs, no_vax_left, too_far,
       info_confidence, info_confidence_high, want_link, want_answer, self_reflection,
       age, education, religiosity, location, black_or_african, vaccinated, 
       nigeria, kenya, ghana, south_africa,
       best_treatment, opinion_conv_sentiment,post_want_vax) %>%
  select(!info_confidence) %>%
  relocate(starts_with("info_source"), .before = info_confidence_high)

df_features_unvax <- 
  df_features %>%
  filter(vaccinated == 0) 

df_unvax <- 
  df %>%
  filter(vax_status_num == 0)


segments <- df_unvax %>%
  mutate(heard_bad_things_seg = as.numeric(impediments_hr_heard_hearsay == 1 |
(impediments_hr_more_information == 1 & impediments_hr_risk == 1)|
(impediments_hr_unsafe == 1 & impediments_hr_risk == 1)|
impediments_hr_scared == 1 |
(impediments_hr_family_friends ==1  & impediments_hr_risk == 1) |
impediments_hr_death == 1)) %>%
  mutate(havent_gotten_vax_seg =as.numeric(
           impediments_hr_no_time == 1 | 
impediments_hr_distance == 1 | 
best_treat_easier_access_to_vax == 1|
best_treat_reminders == 1)
) %>%
  mutate(not_relevant_seg = as.numeric(impediments_hr_no_need == 1 | 
           opinion_hr_no_need ==1 |
           impediments_hr_never_saw_covid == 1| 
           opinion_hr_never_saw_covid == 1)) %>%
  mutate(covid_not_exis_seg = impediments_hr_covid_not_real ==1) %>%
  mutate(misinformation_seg = as.numeric(impediments_hr_misinformation == 1)) %>%
  mutate(nothing_would_work_seg = as.numeric(impediments_hr_no_reason == 1 |
           best_treat_nothing == 1 |
           best_treat_dont_know ==1 | 
          impediments_hr_never_saw_covid == 1|
           impediments_hr_covid_not_real == 1)) %>%
  mutate(side_effect_scare_seg = ifelse((impediments_hr_side_effects == 1) | (impediments_hr_pain == 1), 1L, 0L) %>% replace_na(0L)) %>%
  mutate(side_effect_maternity_seg = ifelse(impediments_hr_pregnancy_nursing == 1, 1L, 0L) %>% replace_na(0L)) %>%
  mutate(scared_of_needles_seg = ifelse(impediments_hr_needles_injection == 1, 1L, 0L) %>% replace_na(0L)) %>%
  mutate(death_concerns_seg = ifelse((impediments_hr_death == 1 | opinion_hr_death ==1), 1L, 0L) %>% replace_na(0L)) %>%
  mutate(
      against_freedom_choice_principles_seg = 
        if_else(
          (impediments_hr_government == 1 | impediments_hr_trust == 1) &
            (impediments_hr_religion == 0) &
            (impediments_hr_scared == 0),
          1L,
          0L
        ) %>% replace_na(0L)
    ) %>%
  mutate(
    against_religious_beliefs_seg = 
      if_else(
        impediments_hr_religion == 1,
        1L,
        0L
      ) %>% replace_na(0L)
  ) %>%
    mutate(
   believe_body_is_healthy_seg = 
      if_else(
        impediments_hr_healthy == 1 & impediments_hr_no_need == 1,
        1L,
        0L
      ) %>% replace_na(0L)
  ) %>%
  mutate(super_busy_and_poor_seg = 
                 ifelse(
                    (impediments_hr_financial == 1) |  (impediments_hr_no_time) | (impediments_hr_work) | (best_treat_rewards),
                1L,
                0L
)%>% replace_na(0L))  %>%
  mutate(need_time_off_work_to_get_vaxxed_seg = 
         ifelse(
           (impediments_hr_work == 1) |(impediments_hr_no_time == 1),
           1L,
           0L
)%>% replace_na(0L)) %>%
  mutate(cant_get_off_work_to_get_vaxed_and_side_effects_seg = 
         ifelse(
           ((impediments_hr_work == 1) | (impediments_hr_no_time == 1)) & (impediments_hr_side_effects == 1),
           1L,
           0L
)%>% replace_na(0L)) %>%
mutate(too_far_away_from_vaccination_site_seg = 
         ifelse(
           (impediments_hr_distance == 1),
           1L,
           0L
)%>% replace_na(0L)) 

n_by_group_tbl <- segments %>%
  select(ends_with("_seg")) %>%
  dplyr::summarize(across(everything(), sum))

2.2 Definitions

Below we show all our definitions for each segments:

2.2.1 Segment 1: Heard bad things about the vaccine

impediments_hearsay == 1 OR

(impediments_more_information == 1 AND impediments_risk == 1) OR

(impediments_unsafe == 1 AND impediments_risk == 1) OR

impediments_scared == 1 OR

(impediments_family_friends == 1 AND impediments_risk == 1) OR

impediments_death == 1

2.2.2 Segment 2: Haven’t gotten around to getting the vaccine

impediments_no_time == 1 OR

impediments_distance == 1 OR

treatment_easier_to_get_vax == 1 OR

treatment_reminder == 1

2.2.3 Segment 3: Covid not relevant anymore, pandemic over

impediments_no_need == 1 OR

opinion_no_need == 1 OR

impediments_never_saw_covid == 1 OR

opinion_never_saw_covid == 1

2.2.4 Segment 4: Covid did not exist

impediments_covid_not_real == 1

2.2.5 Segment 5: Misinformation

impediments_misinformation == 1

2.2.6 Segment 6: Nothing would work

impediments_no_reason == 1 OR

impediments_nothing_dont_know == 1 OR

best_treatment_proposal_nothing == 1 OR

best_treatment_proposal_dont_know == 1 OR

impediments_never_saw_covid == 1 OR

impediments_covid_not_real == 1

2.2.7 Segment 7: Scared of side effects

impediments_side_effects == 1 OR

impediments_pain == 1

2.2.8 Segment 8: Side effects with pregnancy/maternity

impediments_pregnancy_nursing == 1

2.2.9 Segment 9: Scared of needles/injections

impediments_needles_injections == 1

2.2.10 Segment 10: Death of someone known

impediments_death == 1 OR

opinion_death == 1

2.2.12 Segment 12: Against religious beliefs

impediments_religion == 1

2.2.13 Segment 13: Believe body is healthy/immune system is strong

impediments_healthy == 1 AND impediments_no_need == 1

2.2.14 Segment 14: Super busy and poor

impediments_too_costly == 1 OR

impediments_need_money == 1 OR

impediments_no_time == 1 OR

impediments_hard_to_get_off_work == 1 OR

best_treatment_proposal_rewards_for_vaccinating == 1

2.2.15 Segment 15: Need time off work to get vaxxed

impediments_hard_to_get_off_work == 1 AND impediments_no_time == 1

2.2.16 Segment 16: Can’t get off work to get vaxxed and for a long time to deal with side effects

impediments_work == 1 OR

impediments_no_time == 1 AND impediments_side_effects == 1

2.2.17 Segment 17: Too far away from vaccination site

impediments_distance == 1

3 Dropping segments

The following segments were dropped due to have a size of less than 50 observations.

###Drop Segments with $n < 50$. 

drop <- names(n_by_group_tbl)[n_by_group_tbl < 50]
drop

## [1] "covid_not_exis_seg"                                 
## [2] "side_effect_maternity_seg"                          
## [3] "against_religious_beliefs_seg"                      
## [4] "believe_body_is_healthy_seg"                        
## [5] "cant_get_off_work_to_get_vaxed_and_side_effects_seg"

segments <- segments %>%
  select(!contains(drop))
list_of_segments <- segments %>%
  select(ends_with("_seg")) %>% colnames()

4 Segment Descriptives

Below, we present a correlation matrix for our remaining segments.

segments[, list_of_segments]  %>%
  rename_all(~str_to_title(str_replace_all(str_remove(., "_seg"), "_", " "))) %>%
  cor(use = "pairwise.complete.obs")  %>%
  ggcorrplot(type = "lower", lab = TRUE, lab_size = 12/.pt, tl.cex = 10) + 
    labs(y = "", x = "", title = "Correlation Matrix: \nSegments")+
    theme(axis.text.x = element_text(angle = 45, hjust=1))+
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0, 
      limits = c(-1, 1))

Next, we provide the observations counts and percents for each of our segments. The percent denominator is total number of observations for unvaccinated participants (2,309).

n_by_group_tbl <- n_by_group_tbl[!(names(n_by_group_tbl) %in% drop)]
n_by_group_tbl %>%
  as.data.frame() %>%
  pivot_longer(cols = everything(), names_to = "segment", values_to = "number_of_observations") %>%
  arrange(desc(number_of_observations)) %>%
  mutate(segment = str_to_title(str_squish(str_replace_all(string = segment, pattern = "_seg|_", " ")))) %>%
  mutate(percent_of_observations = paste(round(number_of_observations/nrow(segments)*100), "%", sep = "")) %>%
  rename_all(~str_to_title(str_replace_all(., "_", " "))) %>%
  datatable

Next, we want to know how many segments each user is assigned to using our definitions. The figure below shows the percent of unvaccinated users by the number segments assigned.

segments %>% 
  select(contains("_seg")) %>% 
  mutate(n_segments_per_user = rowSums(.)) %>% 
  tabyl(n_segments_per_user) %>% 
  as_tibble() %>% 
  mutate(flag = (n_segments_per_user == 1)) %>% 
  ggplot(aes(n_segments_per_user, percent)) +
  geom_col(alpha = 0.9, aes(fill = flag), show.legend = F) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = seq(0, 1, 0.05)) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    x = "Number of segments",
    y = "Percent of unvaccinated users",
    title = "Number of segments assigned per unvaccinated user (N = 2309)"
  ) + 
  theme( panel.grid.minor = element_blank())

5 Segment Groups

Next, we group the segments by common theme, as shown below.

Group 1: Ability impediments to getting vaccine

Havent Gotten Vax
Super Busy And Poor
Too Far Away From Vaccination Site
Need Time Off Work To Get Vaxxed

Group 2: Health Concerns

Side Effect Scare
Death Concerns
Scared Of Needles

Group 3: Beliefs

Nothing Would Work
Heard Bad Things
Not Relevant
Against Freedom Choice Principles
Misinformation

group_ability = list_of_segments[list_of_segments %in% c("havent_gotten_vax_seg","super_busy_and_poor_seg", "too_far_away_from_vaccination_site_seg", "need_time_off_work_to_get_vaxxed_seg")]

group_health_concerns = list_of_segments[list_of_segments %in% c("side_effect_scare_seg","death_concerns_seg", "scared_of_needles_seg")]

group_beliefs =  list_of_segments[list_of_segments %in% c("nothing_would_work_seg","heard_bad_things_seg", "not_relevant_seg", "against_freedom_choice_principles_seg", "misinformation_seg")]

list_of_segments = c(group_ability, group_health_concerns, group_beliefs)

# Calculate mean and sd for all features
feature_stats <- df_features_unvax %>%
  select(!best_treatment & !vaccinated) %>%
  summarize_all(list(mean = function(x)mean(x, na.rm = T), 
                     sd = function(x)sd(x, na.rm = T))) %>%
  pivot_longer(cols = everything(),
               names_to = "feature",
               values_to = "value") %>%
  mutate(measure = paste("total", str_extract(feature, "mean|sd"), sep = "_"), 
         feature = str_remove(feature, "_mean|_sd")) %>%
  pivot_wider(names_from = "measure", 
              values_from = "value")

#Calculate mean and sd by feature group


feature_stats_group <- df_features_unvax %>%
  select(!best_treatment & !vaccinated) %>%
  cbind(segments[, list_of_segments]) %>%
  mutate(group_ability = case_when(
    havent_gotten_vax_seg == 1| super_busy_and_poor_seg == 1| need_time_off_work_to_get_vaxxed_seg == 1| too_far_away_from_vaccination_site_seg ==1 ~ 1, 
    TRUE ~ 0), 
    group_health_concerns = case_when(
      side_effect_scare_seg == 1| scared_of_needles_seg == 1| death_concerns_seg == 1 ~ 1, 
      TRUE ~ 0
    ), 
    group_beliefs = case_when(heard_bad_things_seg == 1 | not_relevant_seg == 1 | misinformation_seg == 1| nothing_would_work_seg == 1 | against_freedom_choice_principles_seg == 1 ~ 1, 
    TRUE ~ 0)
    )


feature_stats_ability <- feature_stats_group %>%
  filter(group_ability == 1) %>%
  select(!starts_with("group_")) %>%
  select(!ends_with("_seg"))  %>%
  summarize_all(list(mean = function(x)mean(x, na.rm = T), 
                     sd = function(x)sd(x, na.rm = T))) %>%
  pivot_longer(cols = everything(),
               names_to = "feature",
               values_to = "value") %>%
  mutate(measure = paste("ability", str_extract(feature, "mean|sd"), sep = "_"), 
         feature = str_remove(feature, "_mean|_sd")) %>%
  pivot_wider(names_from = "measure", 
              values_from = "value")

feature_stats_health_concerns <- feature_stats_group %>%
  filter(group_health_concerns == 1) %>%
  select(!starts_with("group_")) %>%
  select(!ends_with("_seg"))  %>%
  summarize_all(list(mean = function(x)mean(x, na.rm = T), 
                     sd = function(x)sd(x, na.rm = T))) %>%
  pivot_longer(cols = everything(),
               names_to = "feature",
               values_to = "value") %>%
  mutate(measure = paste("health_concerns", str_extract(feature, "mean|sd"), sep = "_"), 
         feature = str_remove(feature, "_mean|_sd")) %>%
  pivot_wider(names_from = "measure", 
              values_from = "value")

feature_stats_beliefs <- feature_stats_group %>%
  filter(group_beliefs == 1) %>%
  select(!starts_with("group_")) %>%
  select(!ends_with("_seg"))  %>%
  summarize_all(list(mean = function(x)mean(x, na.rm = T), 
                     sd = function(x)sd(x, na.rm = T))) %>%
  pivot_longer(cols = everything(),
               names_to = "feature",
               values_to = "value") %>%
  mutate(measure = paste("beliefs", str_extract(feature, "mean|sd"), sep = "_"), 
         feature = str_remove(feature, "_mean|_sd")) %>%
  pivot_wider(names_from = "measure", 
              values_from = "value")

#Calculate mean and sd by Segment Group
in_segment_stats <- data.frame()
for(column in list_of_segments){
  summary <- df_features_unvax %>% 
    select(!best_treatment & !vaccinated) %>%
    mutate(segment = segments[[column]]) %>%
    filter(segment == 1) %>%
    group_by(segment) %>% 
    summarise_all(list(sep_mean = ~ mean(., na.rm = T), 
                       sep_sd = ~ sd(., na.rm =T), 
                       sep_n = ~sum(., na.rm = T))) %>%
    pivot_longer(cols = !segment, names_to = c("name"))  %>%
    separate(name, into = c("feature", "value_type"), sep = "_sep_") %>%
    mutate(segment = column) %>%
    pivot_wider(names_from = "value_type", values_from = "value") %>%
    mutate(se = sd/sqrt(n)) %>%
    select(!sd & !n) 
    
  
 in_segment_stats<-  rbind(in_segment_stats, summary)
}

in_segment_stats <- in_segment_stats %>%
  mutate_if(is.numeric, function(x)round(x, 3))

# Total Comparions table
tbl <- left_join(in_segment_stats, feature_stats ,by = "feature") %>%
  mutate(n = unlist(n_by_group_tbl[match(segment, names(n_by_group_tbl))])) %>%
  mutate(label = paste(mean, paste("(", se, ")", sep = ""), sep = "\n"), 
         color = (mean - total_mean)/total_sd, 
         color = case_when(color > 2 ~ 2, 
                           color < -2 ~ -2, 
                           is.nan(se) ~ NA_real_,
                           TRUE ~ color),
         group = case_when(segment %in% group_ability ~ "Ability", 
                           segment %in% group_health_concerns ~ "Health Concerns", 
                           segment %in% group_beliefs ~ "Beliefs", 
                           TRUE ~ NA_character_
                           ), 
         feature_group = case_when(grepl("impediments_hr_", feature) ~ "Impediments", 
                                   grepl("best_treat_", feature) ~ "Best Treatment", 
                                   grepl("opinion_hr", feature) ~ "Opinion", 
                                   TRUE ~ "Demo"), 
         segment = str_to_title(str_replace_all(str_remove(segment, "_seg"), "_", "\n")), 
         feature = str_to_title(str_replace_all(str_remove(feature, "impediments_hr_|opinion_hr_|best_treat_"), "_", " ")), 
         prop = round(n/nrow(df_features_unvax), 2)) %>%
  mutate(segment_label = paste(segment, paste("N: ", n, sep = ""), paste("Prop: ", prop, sep = ""), sep = "\n"))

# ability comparions table
tbl_ability <- left_join(in_segment_stats, feature_stats_ability,by = "feature") %>%
  mutate(n = unlist(n_by_group_tbl[match(segment, names(n_by_group_tbl))])) %>%
  mutate(label = paste(mean, paste("(", se, ")", sep = ""), sep = "\n"), 
         color = (mean - ability_mean)/ability_sd, 
         color = case_when(color > 2 ~ 2, 
                           color < -2 ~ -2, 
                           is.nan(se) ~ NA_real_,
                           TRUE ~ color),
         group = case_when(segment %in% group_ability ~ "Ability", 
                           segment %in% group_health_concerns ~ "Health Concerns", 
                           segment %in% group_beliefs ~ "Beliefs", 
                           TRUE ~ NA_character_
                           ), 
         feature_group = case_when(grepl("impediments_hr_", feature) ~ "Impediments", 
                                   grepl("best_treat_", feature) ~ "Best Treatment", 
                                   grepl("opinion_hr", feature) ~ "Opinion", 
                                   TRUE ~ "Demo"), 
         segment = str_to_title(str_replace_all(str_remove(segment, "_seg"), "_", "\n")), 
         feature = str_to_title(str_replace_all(str_remove(feature, "impediments_hr_|opinion_hr_|best_treat_"), "_", " ")), 
         prop = round(n/nrow(df_features_unvax), 2))  %>%
  filter(group == "Ability")  %>%
  mutate(segment_label = paste(segment, paste("N: ", n, sep = ""), paste("Prop: ", prop, sep = ""), sep = "\n"))

# health concerns table
tbl_health_concerns <- left_join(in_segment_stats, feature_stats_health_concerns ,by = "feature") %>%
  mutate(n = unlist(n_by_group_tbl[match(segment, names(n_by_group_tbl))])) %>%
  mutate(label = paste(mean, paste("(", se, ")", sep = ""), sep = "\n"), 
         color = (mean - health_concerns_mean)/health_concerns_sd, 
         color = case_when(color > 2 ~ 2, 
                           color < -2 ~ -2, 
                           is.nan(se) ~ NA_real_,
                           TRUE ~ color),
         group = case_when(segment %in% group_ability ~ "Ability", 
                           segment %in% group_health_concerns ~ "Health Concerns", 
                           segment %in% group_beliefs ~ "Beliefs", 
                           TRUE ~ NA_character_
                           ), 
         feature_group = case_when(grepl("impediments_hr_", feature) ~ "Impediments", 
                                   grepl("best_treat_", feature) ~ "Best Treatment", 
                                   grepl("opinion_hr", feature) ~ "Opinion", 
                                   TRUE ~ "Demo"), 
         segment = str_to_title(str_replace_all(str_remove(segment, "_seg"), "_", "\n")), 
         feature = str_to_title(str_replace_all(str_remove(feature, "impediments_hr_|opinion_hr_|best_treat_"), "_", " ")), 
         prop = round(n/nrow(df_features_unvax), 2))  %>%
  filter(group == "Health Concerns")  %>%
  mutate(segment_label = paste(segment, paste("N: ", n, sep = ""), paste("Prop: ", prop, sep = ""), sep = "\n"))

# beliefs concerns table
tbl_beliefs <- left_join(in_segment_stats, feature_stats_beliefs,by = "feature") %>%
  mutate(n = unlist(n_by_group_tbl[match(segment, names(n_by_group_tbl))])) %>%
  mutate(label = paste(mean, paste("(", se, ")", sep = ""), sep = "\n"), 
         color = (mean - beliefs_mean)/beliefs_sd, 
         color = case_when(color > 2 ~ 2, 
                           color < -2 ~ -2, 
                           is.nan(se) ~ NA_real_,
                           TRUE ~ color),
         group = case_when(segment %in% group_ability ~ "Ability", 
                           segment %in% group_health_concerns ~ "Health Concerns", 
                           segment %in% group_beliefs ~ "Beliefs", 
                           TRUE ~ NA_character_
                           ), 
         feature_group = case_when(grepl("impediments_hr_", feature) ~ "Impediments", 
                                   grepl("best_treat_", feature) ~ "Best Treatment", 
                                   grepl("opinion_hr", feature) ~ "Opinion", 
                                   TRUE ~ "Demo"), 
         segment = str_to_title(str_replace_all(str_remove(segment, "_seg"), "_", "\n")), 
         feature = str_to_title(str_replace_all(str_remove(feature, "impediments_hr_|opinion_hr_|best_treat_"), "_", " ")), 
         prop = round(n/nrow(df_features_unvax), 2))  %>%
  filter(group == "Beliefs")  %>%
  mutate(segment_label = paste(segment, paste("N: ", n, sep = ""), paste("Prop: ", prop, sep = ""), sep = "\n"))

6 Comparison by Segment Group

This section provides covariate heatmaps to make comparisons within segment groups. The text labels are the mean and standard error for the segment.The color is the standard deviation of the normalized distribution (the mean of a given segment scaled to the distribution of the other segments in that group):

(mean in segment_feature - mean in feature segment_group) / std dev feature segment_group

The heatmaps are presented for each of the three segment groups. Further, for each group, the heatmaps are divided by groups of features for readability:

Motive/Ability Impediments
Best Treatment Proposal
Opinion
Demographics

6.1 Group 1: Ability

6.1.1 Motive and Ability Impediments

ggplot(tbl_ability %>% filter(feature_group == "Impediments")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 1: Ability Concerns", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.1.2 Best Treatment

ggplot(tbl_ability %>% filter(feature_group == "Best Treatment")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 1: Ability Concerns", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.1.3 Opinion

ggplot(tbl_ability %>% filter(feature_group == "Opinion")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 1: Ability Concerns", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.1.4 Demographics

ggplot(tbl_ability %>% filter(feature_group == "Demo")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 1: Ability Concerns", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.2 Group 2: Health Concerns

6.2.1 Motive and Ability Impediments

ggplot(tbl_health_concerns %>% filter(feature_group == "Impediments")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 2: Health Concerns", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.2.2 Best Treatment

ggplot(tbl_health_concerns %>% filter(feature_group == "Best Treatment")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
        limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 2: Health Concerns", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.2.3 Opinion

ggplot(tbl_health_concerns %>% filter(feature_group == "Opinion")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 2: Health Concerns", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.2.4 Demographics

ggplot(tbl_health_concerns %>% filter(feature_group == "Demo")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 2: Health Concerns", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.3 Group 3: Beliefs

6.3.1 Motive and Ability Impediments

ggplot(tbl_beliefs %>% filter(feature_group == "Impediments")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 3: Beliefs", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.3.2 Best Treatment

ggplot(tbl_beliefs %>% filter(feature_group == "Best Treatment")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 3: Beliefs", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.3.3 Opinion

ggplot(tbl_beliefs %>% filter(feature_group == "Opinion")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom",
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 3: Beliefs", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

6.3.4 Demographics

ggplot(tbl_beliefs %>% filter(feature_group == "Demo")) +
    geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
    geom_text(aes(segment_label, feature, label = label), size = 2.75) +
    scale_fill_gradient2(
      low = "red",
      mid = "white",
      high = "green",
      midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
    theme_minimal() +
    theme(legend.position = "bottom", 
          axis.title.y = element_text(angle = 0)) +
    labs(title = "Group 3: Beliefs", 
         x = "Segment", 
         y = "Feature",
         fill = "Std Deviation on Normalized Distribution")  +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

7 Comparison by All Segments

This section provides covariate heatmaps to make comparisons between all segment groups. The text labels are the mean and standard error for the segment. The color is the standard deviation of the normalized distribution (the mean of a given segment scaled to the distribution of the other segments):

(mean in segment_feature - mean in feature) / std dev feature

The heatmaps are divided by groups of features for readability:

Motive/Ability Impediments
Best Treatment Proposal
Opinion
Demographics

7.1 Impediments

ggplot(tbl %>% filter(feature_group == "Impediments")) +
  geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
  geom_text(aes(segment_label, feature, label = label), size = 2.75) +
  facet_grid(~ group,  scales = "free_x", space = "free_x", labeller = labeller(group = str_to_title)) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
  theme_minimal() +
  theme(legend.position = "bottom", 
        axis.title.y = element_text(angle = 0)) +
  labs(title = "All Segments", 
       x = "Segment", 
       y = "Feature",
       fill = "Std Deviation on Normalized Distribution")   +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

7.2 Best Treatment

ggplot(tbl %>% filter(feature_group == "Best Treatment")) +
  geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
  geom_text(aes(segment_label, feature, label = label), size = 2.75) +
  facet_grid(~ group,  scales = "free_x", space = "free_x", labeller = labeller(group = str_to_title)) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
  theme_minimal() +
  theme(legend.position = "bottom", 
        axis.title.y = element_text(angle = 0)) +
  labs(title = "All Segments", 
       x = "Segment", 
       y = "Feature",
       fill = "Std Deviation on Normalized Distribution")    +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

7.3 Opinion

ggplot(tbl %>% filter(feature_group == "Opinion")) +
  geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
  geom_text(aes(segment_label, feature, label = label), size = 2.75) +
  facet_grid(~ group,  scales = "free_x", space = "free_x", labeller = labeller(group = str_to_title)) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
  theme_minimal() +
  theme(legend.position = "bottom", 
        axis.title.y = element_text(angle = 0)) +
  labs(title = "All Segments", 
       x = "Segment", 
       y = "Feature",
       fill = "Std Deviation on Normalized Distribution")    +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

7.4 Demographics

ggplot(tbl %>% filter(feature_group == "Demo")) +
  geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
  geom_text(aes(segment_label, feature, label = label), size = 2.75) +
  facet_grid(~ group,  scales = "free_x", space = "free_x", labeller = labeller(group = str_to_title)) +
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "green",
    midpoint = 0,
      limits = c(-2,2), 
      labels = c( ">-2", "-1", "0", "1", "2<")) +
  theme_minimal() +
  theme(legend.position = "bottom", 
        axis.title.y = element_text(angle = 0)) +
  labs(title = "All Segments", 
       x = "Segment", 
       y = "Feature",
       fill = "Std Deviation on Normalized Distribution")   +
  guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))

Vaccine Hesitancy User Segmentation

Updated: October 05, 2022