The present user segmentation analysis is based on 2,309 unvaccinated responsdents completing our survey for pilot wave 8 (August 2022), and 144 features (both MCQ and free text). The related GitHub issue is here.
We take the following steps to pinpoint user segments from pilot 8 data:
# Load packages
pacman::p_load(DT, estimatr, kableExtra, readr, reshape2, tidyverse, xtable, dataMaid, ggcorrplot, ggmap, rpart, rpart.plot, pollster, wordcloud, tm, RColorBrewer, hrbrthemes, janitor, purrr, gridExtra, cowplot, rcompanion, nnet, texreg, compareGroups, factoextra, cluster, fastDummies, simputation, sentimentr, politeness, textir, glmnet, gamlr, tm, topicmodels, ldatuning, lda, SnowballC, olsrr, here)
set.seed(94305)
# read in CURRENT chatfuel data
# full data
df_full_v8 <-
read_csv(here("pilot8/data/full_df_clean.csv")) %>%
clean_names() %>%
mutate_if(is.character, ~ str_replace_all(., '[\n\t]', '')) %>%
mutate(
motive = if_else(str_detect(motive, "yes"), "yes", "no"),
motive_main = if_else(str_detect(motive_main, "risk"), "risk", motive_main),
best_treatment = str_remove_all(best_treatment, "\\.") %>% str_to_sentence(),
best_treatment = if_else(best_treatment == "New trusted info", "More safety evidence", best_treatment),
) %>%
remove_empty("rows")
# filter to completes and add features
df <-
df_full_v8 %>%
filter(full_complete == "complete") %>%
drop_na(vax_status)We define the following 17 segments:
# clean up demographic variables
df_features <-
df %>%
bind_cols(
df %>%
pull(opinion_conversation) %>%
get_sentences() %>%
sentiment_by() %>%
transmute(opinion_conv_sentiment = ave_sentiment)
) %>%
mutate(
covid_already = if_else(str_detect(covid_already, "No|no"), 0L, 1L) %>% replace_na(0),
no_motive = case_when(
motive == "yes" ~ 0L,
motive == "no" ~ 1L,
),
motive_elaboration = motive_nchar,
# post_want_vax,
no_ability = case_when(
ability == "easy" ~ 0L,
ability != "easy" & !is.na(ability) ~ 1L,
) %>% replace_na(0),
ability_elaboration = ability_nchar %>% replace_na(0),
against_beliefs = if_else(motive_main == "beliefs", 1L, 0L) %>% replace_na(0),
no_benefits = if_else(motive_main == "benefit", 1L, 0L) %>% replace_na(0),
risky = if_else(motive_main == "risk", 1L, 0L) %>% replace_na(0),
no_time = if_else(ability_main == "time", 1L, 0L) %>% replace_na(0),
no_money = if_else(ability_main == "money", 1L, 0L) %>% replace_na(0),
no_availability = if_else(ability_main == "availability", 1L, 0L) %>% replace_na(0),
# risk
bad_side_effects = if_else(str_detect(risk_main, "bad side effects"), 1L, 0L) %>% replace_na(0),
lack_of_testing = if_else(str_detect(risk_main, "not enough testing"), 1L, 0L) %>% replace_na(0),
not_trust_pharma = if_else(str_detect(risk_main, "not trust phar") | str_detect(belief_main, "not trust phar"), 1L, 0L) %>% replace_na(0),
# benefit
covid_not_dangerous = if_else(str_detect(benefit_main, "covid not dangerous"), 1L, 0L) %>% replace_na(0),
# had_covid_before = if_else(str_detect(benefit_main, "had covid before"), 1L, 0L) %>% replace_na(0),
vaccines_dont_work = if_else(str_detect(benefit_main, "vaccines don't work"), 1L, 0L) %>% replace_na(0),
# belief
freedom_to_choose = if_else(str_detect(belief_main, "freedom to choose"), 1L, 0L) %>% replace_na(0),
religious_reasons = if_else(str_detect(belief_main, "religious reasons"), 1L, 0L) %>% replace_na(0),
# time
no_time_off_work = if_else(str_detect(time_main, "hard to get off work"), 1L, 0L) %>% replace_na(0),
no_time_to_research = if_else(str_detect(time_main, "no time to research"), 1L, 0L) %>% replace_na(0),
no_childcare = if_else(str_detect(time_main, "no childcare"), 1L, 0L) %>% replace_na(0),
# money
no_cash = if_else(str_detect(money_main, "no cash"), 1L, 0L) %>% replace_na(0),
no_insurance = if_else(str_detect(money_main, "no insurance"), 1L, 0L) %>% replace_na(0),
travel_costs = if_else(str_detect(money_main, "travel costs"), 1L, 0L) %>% replace_na(0),
# availability
no_vax_left = if_else(str_detect(availability_main, "no vaccines left"), 1L, 0L) %>% replace_na(0),
too_far = if_else(str_detect(availability_main, "too far away"), 1L, 0L) %>% replace_na(0),
info_confidence = str_to_lower(info_confidence),
info_confidence_high = if_else(str_detect(info_confidence, "very"), 1L, 0L) %>% replace_na(0),
want_link = if_else(str_detect(want_link, "Sure"), 1L, 0L) %>% replace_na(0),
want_answer = if_else(str_detect(want_answer, "Sure"), 1L, 0L) %>% replace_na(0),
self_reflection = if_else(self_reflection == "A lot!", 1L, 0L) %>% replace_na(0),
# demographics
age,
education = education_num,
religiosity = religiosity_num,
location = location_num,
black_or_african = if_else(ethnicity == "black or african", 1L, 0L) %>% replace_na(0),
vaccinated = vax_status_num,
nigeria = if_else(country_answer == "nigeria", 1, 0) %>% replace_na(0),
kenya = if_else(country_answer == "kenya", 1, 0) %>% replace_na(0),
ghana = if_else(country_answer == "ghana", 1, 0) %>% replace_na(0),
south_africa = if_else(country_answer == "south africa", 1, 0) %>% replace_na(0),
opinion_conv_sentiment = opinion_conv_sentiment %>% replace_na(0)
) %>%
select(starts_with(c("impediments_hr_", "info_source_hr_", "best_treat_", "opinion_hr_")),
covid_already, no_motive, motive_elaboration, no_ability,
ability_elaboration, against_beliefs, no_benefits, risky, no_time, no_money, no_availability,
bad_side_effects, lack_of_testing, not_trust_pharma,
covid_not_dangerous, vaccines_dont_work, freedom_to_choose,
religious_reasons, no_time_off_work, no_time_to_research,
no_childcare, no_cash, no_insurance, travel_costs, no_vax_left, too_far,
info_confidence, info_confidence_high, want_link, want_answer, self_reflection,
age, education, religiosity, location, black_or_african, vaccinated,
nigeria, kenya, ghana, south_africa,
best_treatment, opinion_conv_sentiment,post_want_vax) %>%
select(!info_confidence) %>%
relocate(starts_with("info_source"), .before = info_confidence_high)
df_features_unvax <-
df_features %>%
filter(vaccinated == 0)
df_unvax <-
df %>%
filter(vax_status_num == 0)
segments <- df_unvax %>%
mutate(heard_bad_things_seg = as.numeric(impediments_hr_heard_hearsay == 1 |
(impediments_hr_more_information == 1 & impediments_hr_risk == 1)|
(impediments_hr_unsafe == 1 & impediments_hr_risk == 1)|
impediments_hr_scared == 1 |
(impediments_hr_family_friends ==1 & impediments_hr_risk == 1) |
impediments_hr_death == 1)) %>%
mutate(havent_gotten_vax_seg =as.numeric(
impediments_hr_no_time == 1 |
impediments_hr_distance == 1 |
best_treat_easier_access_to_vax == 1|
best_treat_reminders == 1)
) %>%
mutate(not_relevant_seg = as.numeric(impediments_hr_no_need == 1 |
opinion_hr_no_need ==1 |
impediments_hr_never_saw_covid == 1|
opinion_hr_never_saw_covid == 1)) %>%
mutate(covid_not_exis_seg = impediments_hr_covid_not_real ==1) %>%
mutate(misinformation_seg = as.numeric(impediments_hr_misinformation == 1)) %>%
mutate(nothing_would_work_seg = as.numeric(impediments_hr_no_reason == 1 |
best_treat_nothing == 1 |
best_treat_dont_know ==1 |
impediments_hr_never_saw_covid == 1|
impediments_hr_covid_not_real == 1)) %>%
mutate(side_effect_scare_seg = ifelse((impediments_hr_side_effects == 1) | (impediments_hr_pain == 1), 1L, 0L) %>% replace_na(0L)) %>%
mutate(side_effect_maternity_seg = ifelse(impediments_hr_pregnancy_nursing == 1, 1L, 0L) %>% replace_na(0L)) %>%
mutate(scared_of_needles_seg = ifelse(impediments_hr_needles_injection == 1, 1L, 0L) %>% replace_na(0L)) %>%
mutate(death_concerns_seg = ifelse((impediments_hr_death == 1 | opinion_hr_death ==1), 1L, 0L) %>% replace_na(0L)) %>%
mutate(
against_freedom_choice_principles_seg =
if_else(
(impediments_hr_government == 1 | impediments_hr_trust == 1) &
(impediments_hr_religion == 0) &
(impediments_hr_scared == 0),
1L,
0L
) %>% replace_na(0L)
) %>%
mutate(
against_religious_beliefs_seg =
if_else(
impediments_hr_religion == 1,
1L,
0L
) %>% replace_na(0L)
) %>%
mutate(
believe_body_is_healthy_seg =
if_else(
impediments_hr_healthy == 1 & impediments_hr_no_need == 1,
1L,
0L
) %>% replace_na(0L)
) %>%
mutate(super_busy_and_poor_seg =
ifelse(
(impediments_hr_financial == 1) | (impediments_hr_no_time) | (impediments_hr_work) | (best_treat_rewards),
1L,
0L
)%>% replace_na(0L)) %>%
mutate(need_time_off_work_to_get_vaxxed_seg =
ifelse(
(impediments_hr_work == 1) |(impediments_hr_no_time == 1),
1L,
0L
)%>% replace_na(0L)) %>%
mutate(cant_get_off_work_to_get_vaxed_and_side_effects_seg =
ifelse(
((impediments_hr_work == 1) | (impediments_hr_no_time == 1)) & (impediments_hr_side_effects == 1),
1L,
0L
)%>% replace_na(0L)) %>%
mutate(too_far_away_from_vaccination_site_seg =
ifelse(
(impediments_hr_distance == 1),
1L,
0L
)%>% replace_na(0L))
n_by_group_tbl <- segments %>%
select(ends_with("_seg")) %>%
dplyr::summarize(across(everything(), sum))Below we show all our definitions for each segments:
impediments_hearsay == 1 OR
(impediments_more_information == 1 AND impediments_risk == 1) OR
(impediments_unsafe == 1 AND impediments_risk == 1) OR
impediments_scared == 1 OR
(impediments_family_friends == 1 AND impediments_risk == 1) OR
impediments_death == 1
impediments_no_time == 1 OR
impediments_distance == 1 OR
treatment_easier_to_get_vax == 1 OR
treatment_reminder == 1
impediments_no_need == 1 OR
opinion_no_need == 1 OR
impediments_never_saw_covid == 1 OR
opinion_never_saw_covid == 1
impediments_covid_not_real == 1
impediments_misinformation == 1
impediments_no_reason == 1 OR
impediments_nothing_dont_know == 1 OR
best_treatment_proposal_nothing == 1 OR
best_treatment_proposal_dont_know == 1 OR
impediments_never_saw_covid == 1 OR
impediments_covid_not_real == 1
impediments_side_effects == 1 OR
impediments_pain == 1
impediments_pregnancy_nursing == 1
impediments_needles_injections == 1
impediments_death == 1 OR
opinion_death == 1
impediments_religion == 1
impediments_healthy == 1 AND impediments_no_need == 1
impediments_too_costly == 1 OR
impediments_need_money == 1 OR
impediments_no_time == 1 OR
impediments_hard_to_get_off_work == 1 OR
best_treatment_proposal_rewards_for_vaccinating == 1
impediments_hard_to_get_off_work == 1 AND impediments_no_time == 1
impediments_work == 1 OR
impediments_no_time == 1 AND impediments_side_effects == 1
impediments_distance == 1
The following segments were dropped due to have a size of less than 50 observations.
###Drop Segments with $n < 50$.
drop <- names(n_by_group_tbl)[n_by_group_tbl < 50]
drop## [1] "covid_not_exis_seg"
## [2] "side_effect_maternity_seg"
## [3] "against_religious_beliefs_seg"
## [4] "believe_body_is_healthy_seg"
## [5] "cant_get_off_work_to_get_vaxed_and_side_effects_seg"
segments <- segments %>%
select(!contains(drop))
list_of_segments <- segments %>%
select(ends_with("_seg")) %>% colnames()Below, we present a correlation matrix for our remaining segments.
segments[, list_of_segments] %>%
rename_all(~str_to_title(str_replace_all(str_remove(., "_seg"), "_", " "))) %>%
cor(use = "pairwise.complete.obs") %>%
ggcorrplot(type = "lower", lab = TRUE, lab_size = 12/.pt, tl.cex = 10) +
labs(y = "", x = "", title = "Correlation Matrix: \nSegments")+
theme(axis.text.x = element_text(angle = 45, hjust=1))+
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-1, 1))Next, we provide the observations counts and percents for each of our segments. The percent denominator is total number of observations for unvaccinated participants (2,309).
n_by_group_tbl <- n_by_group_tbl[!(names(n_by_group_tbl) %in% drop)]
n_by_group_tbl %>%
as.data.frame() %>%
pivot_longer(cols = everything(), names_to = "segment", values_to = "number_of_observations") %>%
arrange(desc(number_of_observations)) %>%
mutate(segment = str_to_title(str_squish(str_replace_all(string = segment, pattern = "_seg|_", " ")))) %>%
mutate(percent_of_observations = paste(round(number_of_observations/nrow(segments)*100), "%", sep = "")) %>%
rename_all(~str_to_title(str_replace_all(., "_", " "))) %>%
datatableNext, we want to know how many segments each user is assigned to using our definitions. The figure below shows the percent of unvaccinated users by the number segments assigned.
segments %>%
select(contains("_seg")) %>%
mutate(n_segments_per_user = rowSums(.)) %>%
tabyl(n_segments_per_user) %>%
as_tibble() %>%
mutate(flag = (n_segments_per_user == 1)) %>%
ggplot(aes(n_segments_per_user, percent)) +
geom_col(alpha = 0.9, aes(fill = flag), show.legend = F) +
theme_minimal() +
scale_x_continuous(breaks = seq(0, 10, 1)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1), breaks = seq(0, 1, 0.05)) +
scale_fill_brewer(palette = "Set1") +
labs(
x = "Number of segments",
y = "Percent of unvaccinated users",
title = "Number of segments assigned per unvaccinated user (N = 2309)"
) +
theme( panel.grid.minor = element_blank())Next, we group the segments by common theme, as shown below.
Group 1: Ability impediments to getting vaccine
Group 2: Health Concerns
Group 3: Beliefs
group_ability = list_of_segments[list_of_segments %in% c("havent_gotten_vax_seg","super_busy_and_poor_seg", "too_far_away_from_vaccination_site_seg", "need_time_off_work_to_get_vaxxed_seg")]
group_health_concerns = list_of_segments[list_of_segments %in% c("side_effect_scare_seg","death_concerns_seg", "scared_of_needles_seg")]
group_beliefs = list_of_segments[list_of_segments %in% c("nothing_would_work_seg","heard_bad_things_seg", "not_relevant_seg", "against_freedom_choice_principles_seg", "misinformation_seg")]
list_of_segments = c(group_ability, group_health_concerns, group_beliefs)# Calculate mean and sd for all features
feature_stats <- df_features_unvax %>%
select(!best_treatment & !vaccinated) %>%
summarize_all(list(mean = function(x)mean(x, na.rm = T),
sd = function(x)sd(x, na.rm = T))) %>%
pivot_longer(cols = everything(),
names_to = "feature",
values_to = "value") %>%
mutate(measure = paste("total", str_extract(feature, "mean|sd"), sep = "_"),
feature = str_remove(feature, "_mean|_sd")) %>%
pivot_wider(names_from = "measure",
values_from = "value")#Calculate mean and sd by feature group
feature_stats_group <- df_features_unvax %>%
select(!best_treatment & !vaccinated) %>%
cbind(segments[, list_of_segments]) %>%
mutate(group_ability = case_when(
havent_gotten_vax_seg == 1| super_busy_and_poor_seg == 1| need_time_off_work_to_get_vaxxed_seg == 1| too_far_away_from_vaccination_site_seg ==1 ~ 1,
TRUE ~ 0),
group_health_concerns = case_when(
side_effect_scare_seg == 1| scared_of_needles_seg == 1| death_concerns_seg == 1 ~ 1,
TRUE ~ 0
),
group_beliefs = case_when(heard_bad_things_seg == 1 | not_relevant_seg == 1 | misinformation_seg == 1| nothing_would_work_seg == 1 | against_freedom_choice_principles_seg == 1 ~ 1,
TRUE ~ 0)
)
feature_stats_ability <- feature_stats_group %>%
filter(group_ability == 1) %>%
select(!starts_with("group_")) %>%
select(!ends_with("_seg")) %>%
summarize_all(list(mean = function(x)mean(x, na.rm = T),
sd = function(x)sd(x, na.rm = T))) %>%
pivot_longer(cols = everything(),
names_to = "feature",
values_to = "value") %>%
mutate(measure = paste("ability", str_extract(feature, "mean|sd"), sep = "_"),
feature = str_remove(feature, "_mean|_sd")) %>%
pivot_wider(names_from = "measure",
values_from = "value")
feature_stats_health_concerns <- feature_stats_group %>%
filter(group_health_concerns == 1) %>%
select(!starts_with("group_")) %>%
select(!ends_with("_seg")) %>%
summarize_all(list(mean = function(x)mean(x, na.rm = T),
sd = function(x)sd(x, na.rm = T))) %>%
pivot_longer(cols = everything(),
names_to = "feature",
values_to = "value") %>%
mutate(measure = paste("health_concerns", str_extract(feature, "mean|sd"), sep = "_"),
feature = str_remove(feature, "_mean|_sd")) %>%
pivot_wider(names_from = "measure",
values_from = "value")
feature_stats_beliefs <- feature_stats_group %>%
filter(group_beliefs == 1) %>%
select(!starts_with("group_")) %>%
select(!ends_with("_seg")) %>%
summarize_all(list(mean = function(x)mean(x, na.rm = T),
sd = function(x)sd(x, na.rm = T))) %>%
pivot_longer(cols = everything(),
names_to = "feature",
values_to = "value") %>%
mutate(measure = paste("beliefs", str_extract(feature, "mean|sd"), sep = "_"),
feature = str_remove(feature, "_mean|_sd")) %>%
pivot_wider(names_from = "measure",
values_from = "value")#Calculate mean and sd by Segment Group
in_segment_stats <- data.frame()
for(column in list_of_segments){
summary <- df_features_unvax %>%
select(!best_treatment & !vaccinated) %>%
mutate(segment = segments[[column]]) %>%
filter(segment == 1) %>%
group_by(segment) %>%
summarise_all(list(sep_mean = ~ mean(., na.rm = T),
sep_sd = ~ sd(., na.rm =T),
sep_n = ~sum(., na.rm = T))) %>%
pivot_longer(cols = !segment, names_to = c("name")) %>%
separate(name, into = c("feature", "value_type"), sep = "_sep_") %>%
mutate(segment = column) %>%
pivot_wider(names_from = "value_type", values_from = "value") %>%
mutate(se = sd/sqrt(n)) %>%
select(!sd & !n)
in_segment_stats<- rbind(in_segment_stats, summary)
}
in_segment_stats <- in_segment_stats %>%
mutate_if(is.numeric, function(x)round(x, 3)) # Total Comparions table
tbl <- left_join(in_segment_stats, feature_stats ,by = "feature") %>%
mutate(n = unlist(n_by_group_tbl[match(segment, names(n_by_group_tbl))])) %>%
mutate(label = paste(mean, paste("(", se, ")", sep = ""), sep = "\n"),
color = (mean - total_mean)/total_sd,
color = case_when(color > 2 ~ 2,
color < -2 ~ -2,
is.nan(se) ~ NA_real_,
TRUE ~ color),
group = case_when(segment %in% group_ability ~ "Ability",
segment %in% group_health_concerns ~ "Health Concerns",
segment %in% group_beliefs ~ "Beliefs",
TRUE ~ NA_character_
),
feature_group = case_when(grepl("impediments_hr_", feature) ~ "Impediments",
grepl("best_treat_", feature) ~ "Best Treatment",
grepl("opinion_hr", feature) ~ "Opinion",
TRUE ~ "Demo"),
segment = str_to_title(str_replace_all(str_remove(segment, "_seg"), "_", "\n")),
feature = str_to_title(str_replace_all(str_remove(feature, "impediments_hr_|opinion_hr_|best_treat_"), "_", " ")),
prop = round(n/nrow(df_features_unvax), 2)) %>%
mutate(segment_label = paste(segment, paste("N: ", n, sep = ""), paste("Prop: ", prop, sep = ""), sep = "\n"))# ability comparions table
tbl_ability <- left_join(in_segment_stats, feature_stats_ability,by = "feature") %>%
mutate(n = unlist(n_by_group_tbl[match(segment, names(n_by_group_tbl))])) %>%
mutate(label = paste(mean, paste("(", se, ")", sep = ""), sep = "\n"),
color = (mean - ability_mean)/ability_sd,
color = case_when(color > 2 ~ 2,
color < -2 ~ -2,
is.nan(se) ~ NA_real_,
TRUE ~ color),
group = case_when(segment %in% group_ability ~ "Ability",
segment %in% group_health_concerns ~ "Health Concerns",
segment %in% group_beliefs ~ "Beliefs",
TRUE ~ NA_character_
),
feature_group = case_when(grepl("impediments_hr_", feature) ~ "Impediments",
grepl("best_treat_", feature) ~ "Best Treatment",
grepl("opinion_hr", feature) ~ "Opinion",
TRUE ~ "Demo"),
segment = str_to_title(str_replace_all(str_remove(segment, "_seg"), "_", "\n")),
feature = str_to_title(str_replace_all(str_remove(feature, "impediments_hr_|opinion_hr_|best_treat_"), "_", " ")),
prop = round(n/nrow(df_features_unvax), 2)) %>%
filter(group == "Ability") %>%
mutate(segment_label = paste(segment, paste("N: ", n, sep = ""), paste("Prop: ", prop, sep = ""), sep = "\n"))
# health concerns table
tbl_health_concerns <- left_join(in_segment_stats, feature_stats_health_concerns ,by = "feature") %>%
mutate(n = unlist(n_by_group_tbl[match(segment, names(n_by_group_tbl))])) %>%
mutate(label = paste(mean, paste("(", se, ")", sep = ""), sep = "\n"),
color = (mean - health_concerns_mean)/health_concerns_sd,
color = case_when(color > 2 ~ 2,
color < -2 ~ -2,
is.nan(se) ~ NA_real_,
TRUE ~ color),
group = case_when(segment %in% group_ability ~ "Ability",
segment %in% group_health_concerns ~ "Health Concerns",
segment %in% group_beliefs ~ "Beliefs",
TRUE ~ NA_character_
),
feature_group = case_when(grepl("impediments_hr_", feature) ~ "Impediments",
grepl("best_treat_", feature) ~ "Best Treatment",
grepl("opinion_hr", feature) ~ "Opinion",
TRUE ~ "Demo"),
segment = str_to_title(str_replace_all(str_remove(segment, "_seg"), "_", "\n")),
feature = str_to_title(str_replace_all(str_remove(feature, "impediments_hr_|opinion_hr_|best_treat_"), "_", " ")),
prop = round(n/nrow(df_features_unvax), 2)) %>%
filter(group == "Health Concerns") %>%
mutate(segment_label = paste(segment, paste("N: ", n, sep = ""), paste("Prop: ", prop, sep = ""), sep = "\n"))
# beliefs concerns table
tbl_beliefs <- left_join(in_segment_stats, feature_stats_beliefs,by = "feature") %>%
mutate(n = unlist(n_by_group_tbl[match(segment, names(n_by_group_tbl))])) %>%
mutate(label = paste(mean, paste("(", se, ")", sep = ""), sep = "\n"),
color = (mean - beliefs_mean)/beliefs_sd,
color = case_when(color > 2 ~ 2,
color < -2 ~ -2,
is.nan(se) ~ NA_real_,
TRUE ~ color),
group = case_when(segment %in% group_ability ~ "Ability",
segment %in% group_health_concerns ~ "Health Concerns",
segment %in% group_beliefs ~ "Beliefs",
TRUE ~ NA_character_
),
feature_group = case_when(grepl("impediments_hr_", feature) ~ "Impediments",
grepl("best_treat_", feature) ~ "Best Treatment",
grepl("opinion_hr", feature) ~ "Opinion",
TRUE ~ "Demo"),
segment = str_to_title(str_replace_all(str_remove(segment, "_seg"), "_", "\n")),
feature = str_to_title(str_replace_all(str_remove(feature, "impediments_hr_|opinion_hr_|best_treat_"), "_", " ")),
prop = round(n/nrow(df_features_unvax), 2)) %>%
filter(group == "Beliefs") %>%
mutate(segment_label = paste(segment, paste("N: ", n, sep = ""), paste("Prop: ", prop, sep = ""), sep = "\n"))This section provides covariate heatmaps to make comparisons within segment groups. The text labels are the mean and standard error for the segment.The color is the standard deviation of the normalized distribution (the mean of a given segment scaled to the distribution of the other segments in that group):
(mean in segment_feature - mean in feature segment_group) / std dev feature segment_group
The heatmaps are presented for each of the three segment groups. Further, for each group, the heatmaps are divided by groups of features for readability:
Motive/Ability Impediments
Best Treatment Proposal
Opinion
Demographics
ggplot(tbl_ability %>% filter(feature_group == "Impediments")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 1: Ability Concerns",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_ability %>% filter(feature_group == "Best Treatment")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 1: Ability Concerns",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_ability %>% filter(feature_group == "Opinion")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 1: Ability Concerns",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_ability %>% filter(feature_group == "Demo")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 1: Ability Concerns",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_health_concerns %>% filter(feature_group == "Impediments")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 2: Health Concerns",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_health_concerns %>% filter(feature_group == "Best Treatment")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 2: Health Concerns",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_health_concerns %>% filter(feature_group == "Opinion")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 2: Health Concerns",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_health_concerns %>% filter(feature_group == "Demo")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 2: Health Concerns",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_beliefs %>% filter(feature_group == "Impediments")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 3: Beliefs",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_beliefs %>% filter(feature_group == "Best Treatment")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 3: Beliefs",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_beliefs %>% filter(feature_group == "Opinion")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 3: Beliefs",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl_beliefs %>% filter(feature_group == "Demo")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "Group 3: Beliefs",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))This section provides covariate heatmaps to make comparisons between all segment groups. The text labels are the mean and standard error for the segment. The color is the standard deviation of the normalized distribution (the mean of a given segment scaled to the distribution of the other segments):
(mean in segment_feature - mean in feature) / std dev feature
The heatmaps are divided by groups of features for readability:
Motive/Ability Impediments
Best Treatment Proposal
Opinion
Demographics
ggplot(tbl %>% filter(feature_group == "Impediments")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
facet_grid(~ group, scales = "free_x", space = "free_x", labeller = labeller(group = str_to_title)) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "All Segments",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl %>% filter(feature_group == "Best Treatment")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
facet_grid(~ group, scales = "free_x", space = "free_x", labeller = labeller(group = str_to_title)) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "All Segments",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl %>% filter(feature_group == "Opinion")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
facet_grid(~ group, scales = "free_x", space = "free_x", labeller = labeller(group = str_to_title)) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "All Segments",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))ggplot(tbl %>% filter(feature_group == "Demo")) +
geom_tile(aes(segment_label, feature, fill = color), color = "white", lwd = .5) +
geom_text(aes(segment_label, feature, label = label), size = 2.75) +
facet_grid(~ group, scales = "free_x", space = "free_x", labeller = labeller(group = str_to_title)) +
scale_fill_gradient2(
low = "red",
mid = "white",
high = "green",
midpoint = 0,
limits = c(-2,2),
labels = c( ">-2", "-1", "0", "1", "2<")) +
theme_minimal() +
theme(legend.position = "bottom",
axis.title.y = element_text(angle = 0)) +
labs(title = "All Segments",
x = "Segment",
y = "Feature",
fill = "Std Deviation on Normalized Distribution") +
guides(fill = guide_colourbar(barwidth = 10, title.position = "top"))