5 Main Takeaways
Will need to update for Pilot 2:
# data dictionary
data_dictionary <- read.csv("~/Documents/git/fb_misinfo_interventions/data/data_dictionary.csv")
# ads data by ad
ads <- read.csv("~/Documents/git/fb_misinfo_interventions/data/cleaned_ads.csv")
# ads data by age and gender
ads_demo <- read.csv("~/Documents/git/fb_misinfo_interventions/data/cleaned_ads_age_gender.csv")
# order of variables by chatbot-treatment combination
out <- list()
for(i in c("control", "game", "video", "sms")){
out [[i]] <- read_sheet("https://docs.google.com/spreadsheets/d/1Kwjm6tEHycGuy_tnlRZURLb_rrGUoP8jpZNRcaeSyyc/edit#gid=0", sheet = i)
}
detailed_funnel <- do.call("rbind", out)
# this has ggplot theme preset
source(paste0("https://raw.githubusercontent.com/scmcdonald/visualization/main/theme_sarah.R"))
# load function that defines factor levels so tables and figures have correct ordering
source("~/Documents/git/fb_misinfo_interventions/pilot2_analysis/factor_cols.R")
# define discrete colorblind palette
cb_colors <- brewer.pal(n = 8, name = "Dark2")
cb_colors_extra <- brewer.pal(n = 8, name = "Set2")
cb_colors <- c(cb_colors, cb_colors_extra)
### intro
df_sim <- data.frame(ad = sample(c("attention 1", "outrage 1", "real 1", "real 2",
"fake", "fake news 2", "help friends", "protect", "join"),
size = 10000, replace = T, prob = rep(1/9, 9)),
conversations_started = rep(1, 10000),
consent = sample(c(0, 1), size = 10000, replace = T, prob = c(0.3, 0.7))
)
df_sim$intro_go_on <- ifelse(df_sim$consent == 0, NA,
sample(c(0, 1),
size = sum(df_sim$consent),
replace = T, prob = c(0.02, 0.98)))
df_sim$intro_complete <- ifelse(df_sim$intro_go_on == 1 & !is.na(df_sim$intro_go_on ), 1, 0)
### treatment
df_sim$treatment <- ifelse(df_sim$intro_complete == 0 | is.na(df_sim$intro_complete), NA,
sample(c("control", "sms", "video", "game"), size = sum(df_sim$intro_complete, na.rm = T), replace = T,
prob = c(2/3, 1/3 * 1/3, 1/3 * 1/3, 1/3 * 1/3)))
df_sim$treatment_group <- ifelse(df_sim$treatment == "control", 0,
ifelse(is.na(df_sim$treatment), NA, 1))
df_sim$treatment_group_label <- ifelse(df_sim$treatment == "control", "control",
ifelse(is.na(df_sim$treatment), NA, "treatment"))
df_sim$treatment_sms <- ifelse(df_sim$treatment== "sms", 1,
ifelse(df_sim$treatment== "control", 0, NA))
df_sim$treatment_game <- ifelse(df_sim$treatment== "game", 1,
ifelse(df_sim$treatment== "control", 0, NA))
df_sim$treatment_video <- ifelse(df_sim$treatment== "video", 1,
ifelse(df_sim$treatment== "control", 0, NA))
#### sms
df_sim$sms_quiz <- ifelse(df_sim$treatment_sms== 1 & !is.na(df_sim$treatment_sms),
sample(c("a", "b", NA_character_),
size = sum(df_sim$treatment_sms== 1, na.rm = T), replace = T, prob = c(.45, .45, .1)), NA)
df_sim$family_friend_check <- ifelse(!is.na(df_sim$sms_quiz),
sample(c("a", "b", NA_character_),
size = sum(!is.na(df_sim$sms_quiz)), replace = T, prob = c(.45, .45, .1)), NA)
df_sim$fear_quiz <- ifelse(!is.na(df_sim$family_friend_check),
sample(c("a", "b", NA_character_),
size = sum(!is.na(df_sim$family_friend_check)), replace = T, prob = c(.45, .45, .1)), NA)
df_sim$anger_quiz <- ifelse(!is.na(df_sim$fear_quiz),
sample(c("a", "b", NA_character_),
size = sum(!is.na(df_sim$fear_quiz)), replace = T, prob = c(.45, .45, .1)), NA)
#### game
df_sim$game_completed <- ifelse(df_sim$treatment_game== 1 & !is.na(df_sim$treatment_game),
sample(c(1, 0),
size = sum(df_sim$treatment_game== 1, na.rm = T), replace = T, prob = c(.8, .2)), NA)
df_sim$score <- ifelse(df_sim$game_completed== 1 & !is.na(df_sim$game_completed),
sample(c(1:5, NA_integer_),
size = sum(df_sim$game_completed== 1, na.rm = T), replace = T, prob = c(.3, .2, .15, .10, .2, .05)), NA)
#### video
df_sim$video_1 <- ifelse(df_sim$treatment_video== 1 & !is.na(df_sim$treatment_video),
sample(c(1, 0),
size = sum(df_sim$treatment_video== 1, na.rm = T), replace = T, prob = c(.95, .05)), NA)
df_sim$video_2 <- ifelse(df_sim$video_1== 1 & !is.na(df_sim$video_1),
sample(c(1, 0),
size = sum(df_sim$video_1== 1, na.rm = T), replace = T, prob = c(.96, .04)), NA)
df_sim$video_3 <- ifelse(df_sim$video_2== 1 & !is.na(df_sim$video_2),
sample(c(1, 0),
size = sum(df_sim$video_2== 1, na.rm = T), replace = T, prob = c(.97, .03)), NA)
df_sim$video_4 <- ifelse(df_sim$video_3== 1 & !is.na(df_sim$video_3),
sample(c(1, 0),
size = sum(df_sim$video_3== 1, na.rm = T), replace = T, prob = c(.98, .02)), NA)
df_sim$video_5 <- ifelse(df_sim$video_4== 1 & !is.na(df_sim$video_4),
sample(c(1, 0),
size = sum(df_sim$video_4== 1, na.rm = T), replace = T, prob = c(.98, .02)), NA)
df_sim$treatment_complete <- ifelse((df_sim$treatment == "control" & !is.na(df_sim$treatment))|
df_sim$video_5 == 1 & !is.na(df_sim$video_5)|
!is.na(df_sim$score) | !is.na(df_sim$anger_quiz)
, 1, 0)
### misinfo quiz - attention check
df_sim$headline1_answer <- ifelse(df_sim$treatment_complete == 1, sample(c("a", "b", NA_character_),
size = sum(df_sim$treatment_complete), replace = T, prob = c(.45, .45, .1)), NA)
df_sim$headline2_answer <- ifelse(!is.na(df_sim$headline1_answer), sample(c("a", "b", NA_character_),
size = sum(!is.na(df_sim$headline1_answer)), replace = T, prob = c(.45, .40, .05)), NA)
df_sim$attention_complete <- ifelse(!is.na(df_sim$headline2_answer), 1, 0)
### consequences nudge
df_sim$per_unfollow <- ifelse(df_sim$attention_complete == 1, sample(c("a", "b", NA_character_),
size = sum(df_sim$attention_complete), replace = T, prob = c(.45, .45, .1)), NA)
df_sim$consequences_complete <- ifelse(!is.na(df_sim$per_unfollow), 1, 0)
### feedback
df_sim$enjoy_most <- ifelse(df_sim$consequences_complete == 1, sample(c("aaa", "bbb", "ccc", NA_character_),
size = sum(df_sim$consequences_complete), replace = T, prob = c(.3, .3, .3, .1)), NA)
df_sim$enjoy_least <- ifelse(!is.na(df_sim$enjoy_most), sample(c("aaa", "bbb", "ccc", NA_character_),
size = sum(!is.na(df_sim$enjoy_most)), replace = T, prob = c(.3, .3, .35, .05)), NA)
df_sim$feedback_complete <- ifelse(!is.na(df_sim$enjoy_most), 1, 0)
### DEMOGRAPHICS
df_sim$demog_age <- ifelse(df_sim$feedback_complete == 1, sample(c(18:99, NA_integer_), size = sum(df_sim$feedback_complete == 1), replace = T,
prob = rep(1/(length(18:99)+1), length(18:99)+1)), NA)
df_sim$demog_gender <- ifelse(!is.na(df_sim$demog_age),
sample(c("woman", "man", "non-binary", NA_character_), size = sum(!is.na(df_sim$demog_age)), replace = T,
prob = c(0.85/2, 0.85/2, 0.10, 0.05)) , NA
)
df_sim$demog_ethnic <- ifelse(!is.na(df_sim$demog_gender),
sample(c("white or caucasian", "black or african",
"hispanic",
"asian or indian",
"native american",
"pacific islander", NA_character_), size = sum(!is.na(df_sim$demog_gender)), replace = T,
prob = c(rep(0.9/6, 6), 0.1)) , NA )
df_sim$demog_income <- ifelse(!is.na(df_sim$demog_ethnic),
sample(c("less than $30,000",
"$30,000 - $49,999",
"$50,000 - $69,999",
"$70,000 - $99,999",
"$100,000 - $150,000",
"more than $150,000", NA_character_), size = sum(!is.na(df_sim$demog_ethnic)), replace = T,
prob = c(rep(0.9/6, 6), 0.1)) , NA )
df_sim$demog_education <- ifelse(!is.na(df_sim$demog_income),
sample(c("less than $30,000",
"$30,000 - $49,999",
"$50,000 - $69,999",
"$70,000 - $99,999",
"$100,000 - $150,000",
"more than $150,000", NA_character_), size = sum(!is.na(df_sim$demog_income)), replace = T,
prob = c(rep(0.9/6, 6), 0.1)) , NA )
df_sim$demog_religion <- ifelse(!is.na(df_sim$demog_education),
sample(c(1:6, NA_character_), size = sum(!is.na(df_sim$demog_education)), replace = T,
prob = c(rep(0.9/6, 6), 0.1)) , NA )
df_sim$demog_political <- ifelse(!is.na(df_sim$demog_religion),
sample(c("democrat", "independent", "republican", NA_character_), size = sum(!is.na(df_sim$demog_religion)), replace = T,
prob = c(rep(0.95/3, 3), 0.1)) , NA )
df_sim$demographics_complete <- ifelse(!is.na(df_sim$demog_political), 1, 0)
### end
df_sim$amazon_credit <- ifelse(df_sim$demographics_complete == 1,
sample(c(letters, NA_character_),
replace =T, size = sum(df_sim$demographics_complete ),
prob = rep(1/27, 27)), NA)
df_sim$full_complete <- ifelse(!is.na(df_sim$amazon_credit), 1, 0)
# outcomes for everyone who started survey
df_sim$fb_outcome_binary <- sample(c(0, 1), size =10000,
replace = T, prob = c(0.4, 0.6))
df_sim$fb_outcome_continuous <- runif(10000, 0, 1)
df_sim$fb_demog_age <- sample(18:99, size = 10000, replace = T,
prob = rep(1/length(18:99), length(18:99)))
df_sim$fb_demog_gender <-as.factor(sample(c("woman", "man", "non-binary"), size = 10000, replace = T,
prob = c(0.85/2, 0.85/2, 0.15)))
df_sim$fb_demog_race <- sample(c("white or caucasian", "black or african",
"hispanic",
"asian or indian",
"native american",
"pacific islander"), 10000,
replace = T, prob = rep(1/6, 6))
df_sim$fb_demog_education <- as.factor(sample(c("< high school",
"high school",
"some college",
"2-year degree",
"4-year degree",
"graduate degree"), 10000,
replace = T, prob = rep(1/6, 6)))
df_sim$fb_demog_income <- as.factor(sample(c("less than $30,000",
"$30,000 - $49,999",
"$50,000 - $69,999",
"$70,000 - $99,999",
"$100,000 - $150,000",
"more than $150,000"), 10000,
replace = T, prob = rep(1/6, 6)))
df_sim <- df_sim %>%
mutate(fb_demog_age_group =
as.factor(case_when(fb_demog_age <25 & fb_demog_age> 17 ~ "18-24",
fb_demog_age <35 & fb_demog_age> 24 ~ "25-34",
fb_demog_age <45 & fb_demog_age> 34 ~ "35-44",
fb_demog_age <55 & fb_demog_age> 44 ~ "45-54",
fb_demog_age <65 & fb_demog_age> 54 ~ "55-64",
fb_demog_age> 64 ~ "65+",
TRUE ~ NA_character_))
)
df_sim <- as.data.frame(one_hot(as.data.table(df_sim), dropCols=FALSE))
ads[1:9, "ad_name"] <- unique(df_sim$ad)
# format numbers to percent
form_percent <- function(dec){
if(is.na(dec)){
return("-")
} else {
return(paste0(round(dec, 4) * 100, "%"))
}
}
# format number dollar type
form_cost <- function(num){
sapply(num, function(x) if (x == Inf| is.na(x)) {
return("-")
} else {
return(paste0("$", round(x, 3)))
})
}
# round number to 4 with trailing 0s
# converts to string
round_4 <- function(x){
formatC(x, digits = 4, format = "f")
}
# format number with commas for
# large numbers without decimals
form_num <- function(num){
sapply(num, function(x) if (x == Inf| is.na(x)) {
return("-")
} else {
return( formatC(x, big.mark = ","))
})
}
# this puts sms to all caps and control to Control
label_fun <- function(x){
ifelse(nchar(x) <4 & x != "all", toupper(x), str_to_title(x))
}
The goal of the Facebook Misinformation Project is to experimentally evaluate scaled versions of promising interventions to reduce misinformation spread on social media using on-platform behavioral data.
Update for Pilot 2:
Update for Pilot 2: The analysis script cleans and analyzes pilot data from Ads Manager, Chatfuel, and chatbot responses. The output includes the following tables:
Descriptive Statistics
Funnel Statistics
Treatment Analysis
Subgroup Analysis
Update for Pilot 2: Our targeting strategy is estimated to target 53 million FB users, which is 22% of total FB users in the US (240 million). The pilot target users (1) in the union of the top 10 states that are most conservative + top 20 most rural states (2) OR in the top 2,500 largest ZIP codes by population in other states that have high republican votes (# votes for Republican candidates / # total votes > 95%) or high rural population rate (% rural population > 85%) (3) Among users in (1) or (2), target users with age 35+.
Update for Pilot 2: The pilot includes samples to test 9 versions of ads - Outrage 1, Outrage 2, Attention 1, Attention 2, Real 1, Real 2, Fake News 1, Fake News 2, Fake.
After users click on the ads, they are randomized to be connected with one of the 4 chatbot conditions: two pre-consent chat conditions to test two different factoids (Factoid 1, Factoid 2), and two post-consent intro conditions to test the performance of the intro section with vs. without an additional motivator (Consent only, Motivator).
| 1a | Consent only | no factoid | no motivator |
| 1b | Motivator | no factoid | motivator |
| 2a | Factoid 1 | factoid 1 | no motivator |
| 2b | Factoid 2 | factoid 2 | no motivator |
After users consent to participate in the study, they are randomized into an intervention group and a control group. The intervention group is exposed to the SMS course, consequences nudge and content replacement; the control group is exposed to the facts baseline course only.
Will need to update for Pilot 2:
The data dictionary below outlines each variable used in analysis from the Chatfuel dataset. Group denotes the section of the chatbot. Derived denotes if this variable was created during the cleaning process.
# print data dictionary
data_dictionary %>%
select(cols, description,values, group, type, derived) %>%
datatable(rownames = F)
This table reports the mean, standard deviations, and total number of non-missing observations for numeric variables. Note that individuals without an arm assignment are not included in these tables. Participants who do not consent to the study do not receive a treatment assignment.
Here we report the summary statistics for all FB demographics variables of interest by treatment group.
# summary for all treatments
df_sim %>%
filter(!is.na(treatment)) %>%
select(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog")]) %>%
select(!fb_demog_gender & !fb_demog_race & !fb_demog_education & !fb_demog_income &!fb_demog_age_group ) %>%
summarise_all(list(n= function(x) sum(!is.na(x)),
mean = function(x) mean(x, na.rm = T),
sd = function(x) sd(x, na.rm = T))) %>%
mutate(treatment = "all treatments") %>%
# summary by treatmet
rbind(df_sim %>%
filter(!is.na(treatment)) %>%
select(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog")], treatment) %>%
select(!fb_demog_gender & !fb_demog_race & !fb_demog_education & !fb_demog_income &!fb_demog_age_group ) %>%
group_by(treatment) %>%
summarise_all(list(n= function(x) sum(!is.na(x)),
mean = function(x) mean(x, na.rm = T),
sd = function(x) sd(x, na.rm = T))))%>%
pivot_longer(cols = !treatment,
names_to = "variable",
values_to = "value") %>%
separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
pivot_wider(names_from = c("treatment", "measure"))%>%
# round mean and sd to 4 digits but not n
mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
add_header_above(c(" " = 1, "All" = 3,"Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Variable | N | Mean | SD | N | Mean | SD | N | Mean | SD | N | Mean | SD | N | Mean | SD |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fb_demog_age | 6857 | 58.2907 | 23.7599 | 4615 | 58.6540 | 23.7090 | 745 | 57.4188 | 24.0327 | 759 | 56.2648 | 23.7804 | 738 | 58.9824 | 23.6955 |
| fb_demog_gender_man | 6857 | 0.4279 | 0.4948 | 4615 | 0.4295 | 0.4951 | 745 | 0.3852 | 0.4870 | 759 | 0.4466 | 0.4975 | 738 | 0.4417 | 0.4969 |
| fb_demog_gender_non-binary | 6857 | 0.1429 | 0.3500 | 4615 | 0.1428 | 0.3499 | 745 | 0.1490 | 0.3563 | 759 | 0.1383 | 0.3455 | 738 | 0.1423 | 0.3496 |
| fb_demog_gender_woman | 6857 | 0.4292 | 0.4950 | 4615 | 0.4277 | 0.4948 | 745 | 0.4658 | 0.4992 | 759 | 0.4150 | 0.4931 | 738 | 0.4160 | 0.4932 |
| fb_demog_education_< high school | 6857 | 0.1776 | 0.3822 | 4615 | 0.1805 | 0.3846 | 745 | 0.1893 | 0.3920 | 759 | 0.1634 | 0.3699 | 738 | 0.1626 | 0.3693 |
| fb_demog_education_2-year degree | 6857 | 0.1663 | 0.3723 | 4615 | 0.1671 | 0.3731 | 745 | 0.1490 | 0.3563 | 759 | 0.1647 | 0.3711 | 738 | 0.1802 | 0.3846 |
| fb_demog_education_4-year degree | 6857 | 0.1712 | 0.3767 | 4615 | 0.1714 | 0.3769 | 745 | 0.1772 | 0.3821 | 759 | 0.1713 | 0.3770 | 738 | 0.1640 | 0.3705 |
| fb_demog_education_graduate degree | 6857 | 0.1600 | 0.3666 | 4615 | 0.1590 | 0.3658 | 745 | 0.1584 | 0.3654 | 759 | 0.1542 | 0.3613 | 738 | 0.1734 | 0.3789 |
| fb_demog_education_high school | 6857 | 0.1587 | 0.3654 | 4615 | 0.1567 | 0.3635 | 745 | 0.1611 | 0.3678 | 759 | 0.1726 | 0.3781 | 738 | 0.1545 | 0.3616 |
| fb_demog_education_some college | 6857 | 0.1663 | 0.3723 | 4615 | 0.1653 | 0.3715 | 745 | 0.1651 | 0.3715 | 759 | 0.1739 | 0.3793 | 738 | 0.1653 | 0.3717 |
| fb_demog_income_$100,000 - $150,000 | 6857 | 0.1574 | 0.3642 | 4615 | 0.1536 | 0.3606 | 745 | 0.1517 | 0.3589 | 759 | 0.1647 | 0.3711 | 738 | 0.1789 | 0.3835 |
| fb_demog_income_$30,000 - $49,999 | 6857 | 0.1759 | 0.3807 | 4615 | 0.1751 | 0.3801 | 745 | 0.1758 | 0.3809 | 759 | 0.1739 | 0.3793 | 738 | 0.1829 | 0.3869 |
| fb_demog_income_$50,000 - $69,999 | 6857 | 0.1619 | 0.3684 | 4615 | 0.1655 | 0.3717 | 745 | 0.1490 | 0.3563 | 759 | 0.1634 | 0.3699 | 738 | 0.1504 | 0.3577 |
| fb_demog_income_$70,000 - $99,999 | 6857 | 0.1639 | 0.3702 | 4615 | 0.1699 | 0.3756 | 745 | 0.1490 | 0.3563 | 759 | 0.1607 | 0.3675 | 738 | 0.1450 | 0.3523 |
| fb_demog_income_less than $30,000 | 6857 | 0.1660 | 0.3721 | 4615 | 0.1653 | 0.3715 | 745 | 0.1852 | 0.3887 | 759 | 0.1713 | 0.3770 | 738 | 0.1450 | 0.3523 |
| fb_demog_income_more than $150,000 | 6857 | 0.1750 | 0.3800 | 4615 | 0.1705 | 0.3761 | 745 | 0.1893 | 0.3920 | 759 | 0.1660 | 0.3723 | 738 | 0.1978 | 0.3986 |
| fb_demog_age_group_18-24 | 6857 | 0.0887 | 0.2843 | 4615 | 0.0858 | 0.2801 | 745 | 0.0993 | 0.2993 | 759 | 0.1014 | 0.3021 | 738 | 0.0827 | 0.2755 |
| fb_demog_age_group_25-34 | 6857 | 0.1228 | 0.3282 | 4615 | 0.1196 | 0.3245 | 745 | 0.1262 | 0.3323 | 759 | 0.1423 | 0.3496 | 738 | 0.1192 | 0.3243 |
| fb_demog_age_group_35-44 | 6857 | 0.1245 | 0.3302 | 4615 | 0.1270 | 0.3330 | 745 | 0.1221 | 0.3277 | 759 | 0.1212 | 0.3266 | 738 | 0.1152 | 0.3195 |
| fb_demog_age_group_45-54 | 6857 | 0.1237 | 0.3292 | 4615 | 0.1211 | 0.3263 | 745 | 0.1181 | 0.3230 | 759 | 0.1318 | 0.3384 | 738 | 0.1369 | 0.3439 |
| fb_demog_age_group_55-64 | 6857 | 0.1168 | 0.3212 | 4615 | 0.1129 | 0.3165 | 745 | 0.1369 | 0.3440 | 759 | 0.1238 | 0.3296 | 738 | 0.1138 | 0.3178 |
| fb_demog_age_group_65+ | 6857 | 0.4235 | 0.4942 | 4615 | 0.4336 | 0.4956 | 745 | 0.3973 | 0.4897 | 759 | 0.3794 | 0.4856 | 738 | 0.4322 | 0.4957 |
funnel_vars <- c("conversations_started",
"consent",
"intro_complete",
"treatment_complete",
"consequences_complete",
"feedback_complete",
"demographics_complete",
"full_complete")
df_sim %>%
filter(!is.na(treatment) ) %>%
select(all_of(funnel_vars)) %>%
summarise_all(list(n= length, mean = mean, sd =sd)) %>%
mutate(treatment = "all") %>%
rbind(df_sim %>%
select(all_of(funnel_vars), treatment) %>%
filter(!is.na(treatment) ) %>%
group_by(treatment) %>%
summarise_all(list(n= length, mean = mean, sd =sd))) %>%
pivot_longer( cols = !treatment,
names_to = "variable",
values_to = "value") %>%
separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
pivot_wider(names_from = c("treatment", "measure")) %>%
# round mean and sd to 4 digits but not n
mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
add_header_above(c(" " = 1, "All" = 3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Variable | N | Mean | SD | N | Mean | SD | N | Mean | SD | N | Mean | SD | N | Mean | SD |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| conversations_started | 6857 | 1.0000 | 0.0000 | 4615 | 1.0000 | 0.0000 | 745 | 1.0000 | 0.0000 | 759 | 1.0000 | 0.0000 | 738 | 1.0000 | 0.0000 |
| consent | 6857 | 1.0000 | 0.0000 | 4615 | 1.0000 | 0.0000 | 745 | 1.0000 | 0.0000 | 759 | 1.0000 | 0.0000 | 738 | 1.0000 | 0.0000 |
| intro_complete | 6857 | 1.0000 | 0.0000 | 4615 | 1.0000 | 0.0000 | 745 | 1.0000 | 0.0000 | 759 | 1.0000 | 0.0000 | 738 | 1.0000 | 0.0000 |
| treatment_complete | 6857 | 0.9262 | 0.2615 | 4615 | 1.0000 | 0.0000 | 745 | 0.7839 | 0.4119 | 759 | 0.6640 | 0.4726 | 738 | 0.8780 | 0.3275 |
| consequences_complete | 6857 | 0.7047 | 0.4562 | 4615 | 0.7569 | 0.4290 | 745 | 0.6000 | 0.4902 | 759 | 0.5204 | 0.4999 | 738 | 0.6734 | 0.4693 |
| feedback_complete | 6857 | 0.6358 | 0.4812 | 4615 | 0.6871 | 0.4637 | 745 | 0.5262 | 0.4996 | 759 | 0.4625 | 0.4989 | 738 | 0.6043 | 0.4893 |
| demographics_complete | 6857 | 0.3564 | 0.4790 | 4615 | 0.3844 | 0.4865 | 745 | 0.2966 | 0.4571 | 759 | 0.2675 | 0.4429 | 738 | 0.3333 | 0.4717 |
| full_complete | 6857 | 0.3429 | 0.4747 | 4615 | 0.3686 | 0.4825 | 745 | 0.2805 | 0.4496 | 759 | 0.2648 | 0.4415 | 738 | 0.3252 | 0.4688 |
# all
df_sim %>%
filter(!is.na(treatment)) %>%
select(fb_outcome_binary, fb_outcome_continuous) %>%
summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))) %>%
mutate(treatment = "all") %>%
# by treatment
rbind(df_sim %>%
select(fb_outcome_binary, fb_outcome_continuous, treatment) %>%
filter(!is.na(treatment)) %>%
group_by(treatment) %>%
summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T)))) %>%
pivot_longer( cols = !treatment,
names_to = "variable",
values_to = "value") %>%
separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
pivot_wider(names_from = c("treatment", "measure")) %>%
# round mean and sd to 4 digits but not n
mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
add_header_above(c(" " = 1, "All" = 3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Variable | N | Mean | SD | N | Mean | SD | N | Mean | SD | N | Mean | SD | N | Mean | SD |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fb_outcome_binary | 6857 | 0.6025 | 0.4894 | 4615 | 0.6054 | 0.4888 | 745 | 0.6148 | 0.4870 | 759 | 0.6074 | 0.4887 | 738 | 0.5664 | 0.4959 |
| fb_outcome_continuous | 6857 | 0.5016 | 0.2860 | 4615 | 0.5007 | 0.2865 | 745 | 0.4971 | 0.2864 | 759 | 0.5120 | 0.2846 | 738 | 0.5003 | 0.2840 |
# summary overall
df_sim %>%
select(!contains(c("demog", "outcome", "complete", funnel_vars))) %>%
filter(!is.na(treatment)) %>%
select_if(is.numeric) %>%
summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))) %>%
mutate(treatment = "all") %>%
rbind(df_sim %>%
select(!contains(c("demog", "outcome", "complete", funnel_vars))) %>%
group_by(treatment) %>%
select_if(is.numeric) %>%
filter(!is.na(treatment)) %>%
group_by(treatment) %>%
summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))))%>%
pivot_longer( cols = !treatment,
names_to = "variable",
values_to = "value") %>%
separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
pivot_wider(names_from = c("treatment", "measure")) %>%
# round mean and sd to 4 digits but not n
mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
# if NaN or NA, replace with "-"
mutate_all(~ replace(., .==" NaN" | . == " NA", "-")) %>%
kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
add_header_above(c(" " = 1, "All" =3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Variable | N | Mean | SD | N | Mean | SD | N | Mean | SD | N | Mean | SD | N | Mean | SD |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| intro_go_on | 6857 | 1.0000 | 0.0000 | 4615 | 1.0000 | 0.0000 | 745 | 1.0000 | 0.0000 | 759 | 1.0000 | 0.0000 | 738 | 1.0000 | 0.0000 |
| treatment_group | 6857 | 0.3270 | 0.4691 | 4615 | 0.0000 | 0.0000 | 745 | 1.0000 | 0.0000 | 759 | 1.0000 | 0.0000 | 738 | 1.0000 | 0.0000 |
| treatment_sms | 5374 | 0.1412 | 0.3483 | 4615 | 0.0000 | 0.0000 | 0 |
|
|
759 | 1.0000 | 0.0000 | 0 |
|
|
| treatment_game | 5360 | 0.1390 | 0.3460 | 4615 | 0.0000 | 0.0000 | 745 | 1.0000 | 0.0000 | 0 |
|
|
0 |
|
|
| treatment_video | 5353 | 0.1379 | 0.3448 | 4615 | 0.0000 | 0.0000 | 0 |
|
|
0 |
|
|
738 | 1.0000 | 0.0000 |
| score | 584 | 2.5599 | 1.5001 | 0 |
|
|
584 | 2.5599 | 1.5001 | 0 |
|
|
0 |
|
|
| video_1 | 738 | 0.9593 | 0.1976 | 0 |
|
|
0 |
|
|
0 |
|
|
738 | 0.9593 | 0.1976 |
| video_2 | 708 | 0.9576 | 0.2016 | 0 |
|
|
0 |
|
|
0 |
|
|
708 | 0.9576 | 0.2016 |
| video_3 | 678 | 0.9794 | 0.1423 | 0 |
|
|
0 |
|
|
0 |
|
|
678 | 0.9794 | 0.1423 |
| video_4 | 664 | 0.9849 | 0.1219 | 0 |
|
|
0 |
|
|
0 |
|
|
664 | 0.9849 | 0.1219 |
| video_5 | 654 | 0.9908 | 0.0954 | 0 |
|
|
0 |
|
|
0 |
|
|
654 | 0.9908 | 0.0954 |
This table reports the distribution of observations in each category for categorical variables. Note that individuals without an arm assignment are not included in these tables.
#function to calculate count and shares of categorical variables
cat_summary <- function(variable){
var <- sym(variable)
df_sim %>%
filter(treatment != "no treatment") %>%
select(!!var) %>%
group_by(!!var) %>%
count(name = "Count") %>%
ungroup() %>%
mutate(Share = round(Count/sum(Count), 2))%>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
The arm variable denotes participants’ treatment assignment.
cat_summary("treatment")
| treatment | Count | Share |
|---|---|---|
| control | 4615 | 0.67 |
| game | 745 | 0.11 |
| sms | 759 | 0.11 |
| video | 738 | 0.11 |
Answer to the question: ❓How about you? If a family member or a friend posts something online, do you stop to check whether it’s misinformation?
A. I always trust my family and friends!
B. I always check whether something is misinformation.
cat_summary("family_friend_check")
| family_friend_check | Count | Share |
|---|---|---|
| a | 297 | 0.04 |
| b | 329 | 0.05 |
| NA | 6231 | 0.91 |
Answer to the question: ❓What motive might someone have for wanting you to believe the ingredients of another product are safer?
A. To sell their product.
B. Because they genuinely think the other product is dangerous.
C. All of the above.
cat_summary("fear_quiz")
| fear_quiz | Count | Share |
|---|---|---|
| a | 286 | 0.04 |
| b | 289 | 0.04 |
| NA | 6282 | 0.92 |
Answer to the question: Would you share this [anger inducing ad] on your social media feed?
A. Yes
B. No
cat_summary("anger_quiz")
| anger_quiz | Count | Share |
|---|---|---|
| a | 264 | 0.04 |
| b | 240 | 0.04 |
| NA | 6353 | 0.93 |
Answer to the question: What percentage of people do you think have unfollowed someone who shared misinformation?
10%
25%
50% or more
cat_summary("per_unfollow")
| per_unfollow | Count | Share |
|---|---|---|
| a | 2436 | 0.36 |
| b | 2396 | 0.35 |
| NA | 2025 | 0.30 |
demog_figures <- function(variable){
variable <- enquo(variable)
df_plot <- df_sim %>%
select(treatment, !!variable) %>%
filter(!is.na(treatment) & !is.na(!!variable)) %>%
group_by(treatment, !!variable) %>%
count()%>%
group_by(treatment) %>%
mutate(percent = paste(round(n/sum(n) * 100, 0), "%", sep = "")) %>%
ungroup() %>%
mutate(percent = case_when(!duplicated(!!variable) ~
paste(percent, "\n", str_to_title(!!variable), sep = "")
,
TRUE ~ percent))
x <- ggplot(df_plot,
aes(x = treatment,
y = n,
fill = !!variable)) +
geom_bar(position = "fill", stat ="identity", width = .7, color = "white") +
theme_sarah() +
scale_x_discrete(labels = label_fun )+
scale_fill_manual(values = c(cb_colors)) +
geom_text(aes( label = percent),
color = "white",
position = position_fill(vjust = 0.5),
size = 12/.pt,
fontface = "bold") +
labs(x = "",
y ="",
title = paste("Percentage of Participants by\n Treatment and", str_to_title(str_replace(str_remove(as_label(variable), "demog_"), "_", " ")), sep = " "),
caption = paste("Number of Observations:", scales::comma(sum(df_plot$n)))) +
scale_y_continuous(breaks = c(0, 1), labels = c("0%", "100%")) +
theme(legend.position = "none")
print(x)
}
The next figures show the share of different demographic groups by treatment. Note that the following figures only include observations where the treatment and demographic variables are both non-missing.
demog_vars <- c("fb_demog_gender", "fb_demog_age_group",
"fb_demog_race", "fb_demog_education", "fb_demog_income")
for(i in demog_vars){
cat("####", str_to_title(str_replace(str_remove(i, "demog_"), "_", " ")), "\n\n\n")
demog_figures(!!sym(paste(i, sep = "")))
cat("\n\n\n")
}
Facebook Ads Manager provides cost, impression, and conversation counts at the ad-age-gender level. Below, we show the cost per impression and cost per conversation by age or gender and ad or ad theme. Note that here, we use the Facebook Ads definition of conversations.
# figure is a function of the variable measure (impressions or conversations)
# group (ad theme group or individual ads)
# demographic variable (age or gender)
plot_cost_per <- function(variable, group, demo){
# choose a group to sort bars descending by
# plot colors
# filter out unknown gender group since it is a small size
if(demo == "age"){
label_group <-"65+"
plot_colors <- cb_colors[3:6]
} else if(demo == "gender"){
label_group <-"female"
plot_colors <- cb_colors[1:2]
ads_demo <- ads_demo %>%
filter(gender != "unknown")
}
# set plot title and x axis label
plot_title <- paste("Cost per", str_to_title(str_remove(variable, "s$")), "by", str_to_title(demo), sep =" ")
x_title <- paste("Cost per", str_to_title(str_remove(variable, "s$")), sep = " ")
group <- enquo(group)
variable <- sym(variable)
demo <- sym(demo)
# calculate cost per measure by group
plot_data <- ads_demo %>%
select(!!group, !!demo, !!variable, cost) %>%
group_by(!!group, !!demo) %>%
summarize_all(sum) %>%
mutate(cost_var = cost/ !!variable,
cost_var = ifelse(is.infinite(cost_var), 0, cost_var)
) %>%
group_by(!!demo) %>%
# order for plotting
mutate(ordervar = ifelse(!!demo == label_group, rank(cost_var), 10)) %>%
ungroup()
# this finds which ad or ad theme has the max cost per measure for female or 65+.
# this is where the demographic group labels are plotted
labels <- plot_data %>%
filter(!!demo == label_group) %>%
filter(cost_var ==max(cost_var)) %>%
pull(!!group)
# create labels for demographic groups
plot_data <- plot_data %>%
mutate(demo_label = ifelse(!!group== labels, paste(str_to_title(!!demo), " "), ""))
# plot bars
ggplot(plot_data,
aes(y = reorder(!!group, ordervar),
x = cost_var,
group = !!demo,
fill = !!demo)) +
geom_bar(stat = "identity",
position = "dodge",
color = "white") +
theme_sarah() +
theme(legend.position = "none",
panel.grid.major.y = element_blank()) +
labs(y = "",
x = x_title,
title = plot_title) +
# set colors
scale_fill_manual(values = plot_colors) +
# this plots bar demographic labels
geom_text(aes(label = demo_label),
position = position_dodge(0.9),
color="white",
vjust = 0.5,
hjust = 1)
}
# only thing to add after running function is
# the scale limits and breaks, which is done manually
# since its different for each combination
Note that Facebook Ads also has a gender category called unknown. Since the size of this category is so small, we do no included these observations below.
plot_cost_per("impressions", ad_name, "gender") +
scale_x_continuous(limits = c(0, 0.065),
expand = c(0,0),
breaks = seq(0, 0.06, 0.01))
plot_cost_per("impressions", ad_name, "age") +
scale_x_continuous(limits = c(0, 0.056),
expand = c(0,0),
breaks = seq(0, 0.055, 0.01))
Note that Facebook Ads also has a gender category called unknown. Since the size of this category is so small, we do no included these observations below. We calculate cost per conversation = cost / #conversations. Missing bars indicates that we could not calculate cost per conversation because there were zero conversations for that subgroup.
plot_cost_per("conversations", ad_name, "gender") +
scale_x_continuous(limits = c(0, 65),
expand = c(0,0),
breaks = seq(0, 60, 10))
We calculate cost per conversation = cost / #conversations. Missing bars indicates that we could not calculate cost per conversation because there were zero conversations for that subgroup.
plot_cost_per("conversations", ad_name, "age") +
scale_x_continuous(limits = c(0, 65),
expand = c(0,0),
breaks = seq(0, 65, 10))
This table presents an overview of funnel statistics at the ad-, chatbot-, and treatment-level.
All responses, regardless of treatment-level, are included. Note that here, we use the Facebook Ads definition of conversations.
ads %>%
mutate(cost_per_impression = form_cost(cost/impressions),
cost_per_consent = form_cost(cost/consents),
cost_per_click = form_cost(cost/clicks),
cost_per_completion = form_cost(cost/completions),
impressions = form_num(impressions),
reach = form_num(reach),
clicks = form_num(clicks)) %>%
select("Ad" = ad_name,
"Impressions" = impressions,
"Reach" = reach,
"Clicks" = clicks,
"Conversations" = conversations,
"Consents" = consents,
"Completions" = completions,
"Cost per Impression" = cost_per_impression,
"Cost per Click" = cost_per_click,
"Cost per Consent" = cost_per_consent,
"Cost per Completion" = cost_per_completion) %>%
arrange(Ad) %>%
kable(escape = F) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Ad | Impressions | Reach | Clicks | Conversations | Consents | Completions | Cost per Impression | Cost per Click | Cost per Consent | Cost per Completion |
|---|---|---|---|---|---|---|---|---|---|---|
| attention 1 | 4,274 | 3,499 | 231 | 9 | 7 | 3 | $0.037 | $0.677 | $22.353 | $52.157 |
| fake | 6,185 | 4,989 | 305 | 13 | 3 | 2 | $0.029 | $0.595 | $60.443 | $90.665 |
| fake news 2 | 12,629 | 9,490 | 711 | 25 | 11 | 4 | $0.043 | $0.767 | $49.576 | $136.335 |
| help friends | 1,148 | 960 | 97 | 1 | 0 | 0 | $0.025 | $0.301 |
|
|
| join | 18,192 | 14,110 | 1,099 | 65 | 25 | 7 | $0.042 | $0.697 | $30.644 | $109.443 |
| No Ad ID |
|
|
|
5 | 4 | 2 |
|
|
|
|
| outrage 1 | 2,954 | 2,284 | 254 | 4 | 2 | 1 | $0.019 | $0.222 | $28.17 | $56.34 |
| protect | 2,778 | 2,203 | 160 | 1 | 0 | 0 | $0.03 | $0.522 |
|
|
| real 1 | 3,733 | 3,123 | 108 | 15 | 6 | 1 | $0.027 | $0.939 | $16.908 | $101.45 |
| real 2 | 1,983 | 1,442 | 105 | 5 | 2 | 1 | $0.04 | $0.765 | $40.155 | $80.31 |
All users who consented are included.
Cost per Consent = total experiment costs / consents. Cost per Completion = total experiment costs / completions.
total_cost <- sum(ads$cost, na.rm =T)
# treatment level
df_sim %>%
select(treatment, consent, full_complete) %>%
filter(!is.na(treatment)) %>%
group_by(treatment) %>%
summarize(consents = sum(consent, na.rm = T),
completions = sum(full_complete, na.rm = T)) %>%
mutate(cost_per_consent = form_cost(total_cost/consents),
cost_per_completion = form_cost(total_cost/completions),
treatment = label_fun(treatment)) %>%
# all treatments
rbind(df_sim %>%
filter(!is.na(treatment)) %>%
select( consent, full_complete) %>%
summarize(consents = sum(consent, na.rm = T),
completions = sum(full_complete, na.rm = T)) %>%
mutate(cost_per_consent = form_cost(total_cost/consents),
cost_per_completion = form_cost(total_cost/completions),
treatment = "All")) %>%
# rename column names to titles
select("Treatment" = treatment,
"Consents" = consents,
"Completions" = completions,
"Cost per Consent" = cost_per_consent,
"Cost per Completion" = cost_per_completion) %>%
kable(escape = F) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Treatment | Consents | Completions | Cost per Consent | Cost per Completion |
|---|---|---|---|---|
| Control | 4615 | 1701 | $0.433 | $1.176 |
| Game | 745 | 209 | $2.685 | $9.569 |
| SMS | 759 | 201 | $2.635 | $9.95 |
| Video | 738 | 240 | $2.71 | $8.333 |
| All | 6857 | 2351 | $0.292 | $0.851 |
This table shows the detailed funnel by all, treatment, and control participants. Here, “All” refers to all participants from the stage of ad impressions, including those who did not start a conversation or did not consent to participate in the study (these users were not assigned to an arm). “SMS participants” refer to consenting users who are assigned to the SMS intervention arm, and “Control participants” refer to consenting users who are assigned to the control arm.
Funnel Parameters
# funnel for overall and by add
get_funnel <- function(group, ad = NULL){
# filter data for ad level analysis
if(!is.null(ad)){
df_temp <- df_sim[df_sim$ad_name == ad & !is.na(df_sim$ad_name), ]
ads_temp <- ads[ads$ad_name == ad & !is.na(ads$ad_name), ]
} else{
df_temp <- df_sim
ads_temp <- ads
}
# filter data for treatment level analysis
if (group == "all"){
df_temp <- df_temp
} else {
df_temp <- df_temp[df_temp$treatment == group & !is.na(df_temp$treatment), ]
}
# calculate total costs, impressions, clicks, from ads
cost <- sum(ads_temp$cost, na.rm = T)
impressions <- sum(ads_temp$impressions, na.rm = T)
reach <- sum(ads_temp$reach, na.rm = T)
clicks <- sum(ads_temp$clicks, na.rm = T)
# get rest of metrics from chatfuel
conversations <- nrow(df_temp)
consents <- sum(df_temp$consent, na.rm = T)
treatment_complete <- sum(df_temp$treatment_complete)
feedback_complete <- sum(df_temp$feedback_complete)
demog_complete <- sum(df_temp$demographics_complete)
full_complete <- sum(df_temp$full_complete)
# metric names
metric = c(
"Impressions",
"Reach",
"Ad Clickthrough",
"Conversation Started",
"Consent Obtained",
"Treatment Completed",
"Feedback Complete",
"Demographics Complete",
"Full Complete")
cost_fun <- function(x){
return(form_cost(cost/x))
}
metrics_list <- c(impressions, reach, clicks, conversations, consents, treatment_complete
, feedback_complete,
demog_complete, full_complete)
# cost of each metric
costs <- sapply(metrics_list, cost_fun)
# counts formatted with comma
counts <- sapply(metrics_list, scales::comma)
percent_se <- function(numerator, denominator){
p <- numerator/denominator
se <- sqrt(p*(1-p)/denominator)
p_se <- paste(form_percent(p), " (", form_percent(se), ")", sep = "")
return(p_se)
}
# percent of previous
percent <- mapply(function(x, y) percent_se(x, y),
metrics_list[-1],
metrics_list[-length(metrics_list)]
)
percent <- c("-", percent)
percent_of_impressions <- sapply(metrics_list, function(x) percent_se(x, impressions))
percent_of_impressions[1] <- "-"
# percent of consents
percent_of_consents = sapply(metrics_list, function(x) percent_se(x, consents))
percent_of_consents[1:4] <- "-"
if(group == "all" & is.null(ad)){
dropoff_1 <- data.frame(cbind(metric, counts, percent, percent_of_impressions, costs))
} else {
dropoff_1 <- data.frame(cbind(metric, counts, percent, percent_of_consents))
}
if(!is.null(ad)){
if(ad == "No Ad ID"){
dropoff_1[dropoff_1$metric == "Impressions", c("counts", "percent", "percent_of_consents") ] <- "-"
dropoff_1[dropoff_1$metric == "Reach", c("counts", "percent", "percent_of_consents") ] <- "-"
dropoff_1[dropoff_1$metric == "Ad Clickthrough", c("counts", "percent", "percent_of_consents")] <- "-"
dropoff_1[dropoff_1$metric == "Conversation Started", "percent"] <- "-"
}
}
if(group == "all" & is.null(ad)){
colnames(dropoff_1) <- c("Metric", "Number of Obs.",
"Percent of Previous", "Percent of Impressions", "Cost per Stage")
} else {
dropoff_1[dropoff_1$metric %in% c("Impressions", "Reach", "Ad Clickthrough", "Conversation Started"), c("counts", "percent")] <- "-"
dropoff_1[dropoff_1$metric == "Consent Obtained", c("percent", "percent_of_consents")] <- "-"
colnames(dropoff_1) <- c("Metric", "Number of Obs.",
"Percent of Previous", "Percent of Consents")
}
print(kable(dropoff_1) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")))
}
get_funnel(group = "all")
| Metric | Number of Obs. | Percent of Previous | Percent of Impressions | Cost per Stage |
|---|---|---|---|---|
| Impressions | 53,876 |
|
|
$0.037 |
| Reach | 42,100 | 78.14% (0.18%) | 78.14% (0.18%) | $0.048 |
| Ad Clickthrough | 3,070 | 7.29% (0.13%) | 5.7% (0.1%) | $0.651 |
| Conversation Started | 10,000 | 325.73% (-) | 18.56% (0.17%) | $0.2 |
| Consent Obtained | 6,999 | 69.99% (0.46%) | 12.99% (0.14%) | $0.286 |
| Treatment Completed | 6,351 | 90.74% (0.35%) | 11.79% (0.14%) | $0.315 |
| Feedback Complete | 4,360 | 68.65% (0.58%) | 8.09% (0.12%) | $0.459 |
| Demographics Complete | 2,444 | 56.06% (0.75%) | 4.54% (0.09%) | $0.818 |
| Full Complete | 2,351 | 96.19% (0.39%) | 4.36% (0.09%) | $0.851 |
get_funnel("game")
| Metric | Number of Obs. | Percent of Previous | Percent of Consents |
|---|---|---|---|
| Impressions |
|
|
|
| Reach |
|
|
|
| Ad Clickthrough |
|
|
|
| Conversation Started |
|
|
|
| Consent Obtained | 745 |
|
|
| Treatment Completed | 584 | 78.39% (1.51%) | 78.39% (1.51%) |
| Feedback Complete | 392 | 67.12% (1.94%) | 52.62% (1.83%) |
| Demographics Complete | 221 | 56.38% (2.5%) | 29.66% (1.67%) |
| Full Complete | 209 | 94.57% (1.52%) | 28.05% (1.65%) |
get_funnel("sms")
| Metric | Number of Obs. | Percent of Previous | Percent of Consents |
|---|---|---|---|
| Impressions |
|
|
|
| Reach |
|
|
|
| Ad Clickthrough |
|
|
|
| Conversation Started |
|
|
|
| Consent Obtained | 759 |
|
|
| Treatment Completed | 504 | 66.4% (1.71%) | 66.4% (1.71%) |
| Feedback Complete | 351 | 69.64% (2.05%) | 46.25% (1.81%) |
| Demographics Complete | 203 | 57.83% (2.64%) | 26.75% (1.61%) |
| Full Complete | 201 | 99.01% (0.69%) | 26.48% (1.6%) |
get_funnel("video")
| Metric | Number of Obs. | Percent of Previous | Percent of Consents |
|---|---|---|---|
| Impressions |
|
|
|
| Reach |
|
|
|
| Ad Clickthrough |
|
|
|
| Conversation Started |
|
|
|
| Consent Obtained | 738 |
|
|
| Treatment Completed | 648 | 87.8% (1.2%) | 87.8% (1.2%) |
| Feedback Complete | 446 | 68.83% (1.82%) | 60.43% (1.8%) |
| Demographics Complete | 246 | 55.16% (2.35%) | 33.33% (1.74%) |
| Full Complete | 240 | 97.56% (0.98%) | 32.52% (1.72%) |
get_funnel("control")
| Metric | Number of Obs. | Percent of Previous | Percent of Consents |
|---|---|---|---|
| Impressions |
|
|
|
| Reach |
|
|
|
| Ad Clickthrough |
|
|
|
| Conversation Started |
|
|
|
| Consent Obtained | 4,615 |
|
|
| Treatment Completed | 4,615 | 100% (0%) | 100% (0%) |
| Feedback Complete | 3,171 | 68.71% (0.68%) | 68.71% (0.68%) |
| Demographics Complete | 1,774 | 55.94% (0.88%) | 38.44% (0.72%) |
| Full Complete | 1,701 | 95.89% (0.47%) | 36.86% (0.71%) |
This table shows the detailed funnel by ad split for all, treatment, and control participants. Here, “All” refers to all participants from the stage of ad impressions, including those who did not start a conversation or did not consent to participate in the study (these users were not assigned to an arm).
# merge ad data
df_sim <- merge(df_sim, ads, by. ="ad", by.y = "ad_name", all.x = T)
# loop over ads
for(i in sort(unique(df_sim$ad_name))) {
cat("###", "Ad", i, "{.tabset} \n")
# loop over treatmnet
for(j in c("all", "sms", "control")){
cat("####", paste(label_fun(j), "Participants", sep = " "), " \n")
print(get_funnel(ad = i, group = j))
cat(" \n")
}
cat(" \n")
}
# function to compute completion for detailed funnel
detailed_funnel_fun <- function(x){
# if any missing, count non-missing observations
if(any(is.na(x))){
sum(!is.na(x))
# is there are no non-missings,
# and the variable is numeric, sum over the
# variable (these are binary),
# else, sum non-missing
} else {
if(is.numeric(x)){
sum(x)
} else {
sum(!is.na(x))
}
}
}
# loop over treatments
for(treat in c( "sms", "control", "game", "video")){
cat("###", paste(label_fun(treat), "{.tabset}", sep = " "), " \n")
plot_df <- df_sim %>%
filter( treatment == treat) %>%
# select variables from detailed funnel correpsonding to the treatment-chatbot combo
select(any_of(detailed_funnel[detailed_funnel$treatment == treat, "variable"] %>%pull())) %>%
summarize_all(.funs =detailed_funnel_fun) %>%
pivot_longer(cols = everything(), values_to = "count") %>%
mutate(perc_of_consents = count/pull(.[.$name == "consent", "count"]))
# get number of consents for plot title
consents <- plot_df %>% filter(name == "consent") %>% pull(count)
# figure
gg <- plot_df %>%
# order variables for plotting
mutate(name = factor(name, levels = rev(name))) %>%
# group = 1 adds the line through the dots
ggplot(aes(x = name, y = perc_of_consents, group = 1)) +
geom_point(size = 3) +
geom_line() +
coord_flip() +
theme_sarah() +
labs(x= "",
y = "Percent of Conversations",
title = paste("Number of Consents:", consents)) +
scale_y_continuous(limits = c(0, 1),
breaks = seq(0, 1, by = 0.2),
labels = scales::percent_format(accuracy = 1))
# table output
plot_df %>%
mutate(perc_of_consents = form_percent(perc_of_consents)) %>%
kable(col.names = c("Variable", "Number. of Obs", "Percent of Consents")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
print()
# plot output
print(gg)
cat(" \n\n\n")
}
| Variable | Number. of Obs | Percent of Consents |
|---|---|---|
| consent | 759 | 100% |
| intro_go_on | 759 | 100% |
| sms_quiz | 685 | 90.25% |
| family_friend_check | 626 | 82.48% |
| fear_quiz | 575 | 75.76% |
| anger_quiz | 504 | 66.4% |
| headline1_answer | 462 | 60.87% |
| headline2_answer | 432 | 56.92% |
| per_unfollow | 395 | 52.04% |
| enjoy_most | 351 | 46.25% |
| enjoy_least | 337 | 44.4% |
| demog_age | 350 | 46.11% |
| demog_gender | 336 | 44.27% |
| demog_ethnic | 303 | 39.92% |
| demog_income | 274 | 36.1% |
| demog_education | 245 | 32.28% |
| demog_religion | 220 | 28.99% |
| demog_political | 203 | 26.75% |
| amazon_credit | 201 | 26.48% |
| Variable | Number. of Obs | Percent of Consents |
|---|---|---|
| consent | 4615 | 100% |
| intro_go_on | 4615 | 100% |
| headline1_answer | 4134 | 89.58% |
| headline2_answer | 3915 | 84.83% |
| per_unfollow | 3493 | 75.69% |
| enjoy_most | 3171 | 68.71% |
| enjoy_least | 3020 | 65.44% |
| demog_age | 3154 | 68.34% |
| demog_gender | 3004 | 65.09% |
| demog_ethnic | 2744 | 59.46% |
| demog_income | 2471 | 53.54% |
| demog_education | 2219 | 48.08% |
| demog_religion | 1965 | 42.58% |
| demog_political | 1774 | 38.44% |
| amazon_credit | 1701 | 36.86% |
| Variable | Number. of Obs | Percent of Consents |
|---|---|---|
| consent | 745 | 100% |
| intro_go_on | 745 | 100% |
| game_completed | 599 | 80.4% |
| score | 584 | 78.39% |
| headline1_answer | 523 | 70.2% |
| headline2_answer | 494 | 66.31% |
| per_unfollow | 447 | 60% |
| enjoy_most | 392 | 52.62% |
| enjoy_least | 381 | 51.14% |
| demog_age | 389 | 52.21% |
| demog_gender | 375 | 50.34% |
| demog_ethnic | 340 | 45.64% |
| demog_income | 293 | 39.33% |
| demog_education | 273 | 36.64% |
| demog_religion | 245 | 32.89% |
| demog_political | 221 | 29.66% |
| amazon_credit | 209 | 28.05% |
| Variable | Number. of Obs | Percent of Consents |
|---|---|---|
| consent | 738 | 100% |
| intro_go_on | 738 | 100% |
| video_1 | 708 | 95.93% |
| video_2 | 708 | 95.93% |
| video_3 | 678 | 91.87% |
| video_4 | 664 | 89.97% |
| video_5 | 654 | 88.62% |
| headline1_answer | 589 | 79.81% |
| headline2_answer | 556 | 75.34% |
| per_unfollow | 497 | 67.34% |
| enjoy_most | 446 | 60.43% |
| enjoy_least | 422 | 57.18% |
| demog_age | 440 | 59.62% |
| demog_gender | 420 | 56.91% |
| demog_ethnic | 379 | 51.36% |
| demog_income | 339 | 45.93% |
| demog_education | 314 | 42.55% |
| demog_religion | 273 | 36.99% |
| demog_political | 246 | 33.33% |
| amazon_credit | 240 | 32.52% |
estimate_effect <- function(df, outcome, outcome_name){
# get mean and std. errors by treatment for plotting
eq <- as.formula(paste(outcome, " ~ 0+ treatment"))
lm <- lm_robust(eq, data = df)
# make df out of lm results
plot <- tidy(lm)
gg <- ggplot(plot, aes(x = term, y = estimate)) +
geom_point(position = position_dodge(width = 0.5), size = 3) +
geom_pointrange(
aes(x = term,ymin = estimate - 1.96 * std.error,
ymax = estimate + 1.96 * std.error),
width = .1,
position = position_dodge(width = 0.5)) +
# function to clean label names
scale_x_discrete(labels = function(x){
label <- str_remove(x, "treatment")
label <- label_fun(label)
return(label)
}) +
labs(x = 'Treatment',
y = 'Estimate',
title = paste0('Treatment Effect Estimate on ', outcome_name)) +
theme_sarah() +
scale_y_continuous(limits = c(0, 1),
breaks = seq(0, 1, 0.2))
print(gg)
# get difference between treatments estimates
eq <- as.formula(paste(outcome, " ~ treatment"))
lm2 <- lm_robust(eq, data = df)
out <- tidy(lm2)
video_n = sum(df$treatment == "video", na.rm = T)
game_n = sum(df$treatment == "game", na.rm = T)
sms_n = sum(df$treatment == "sms", na.rm = T)
control_n = sum(df$treatment == "control", na.rm = T)
plot %>%
select(term, estimate_value = estimate, std.error_value = std.error) %>%
mutate(estimate_std_value = paste(round_4(estimate_value), "\n(", round_4(std.error_value), ")", sep = "")) %>%
select(!estimate_value & !std.error_value) %>%
merge(out[2:4, c("term", "estimate",
"std.error", "p.value",
"conf.low", "conf.high")], by = "term", all.x = T) %>%
mutate(estimate_std_diff = paste(round_4(estimate), "\n(", round_4(std.error), ")", sep = "")) %>%
select(!estimate & !std.error) %>%
mutate(conf = paste("(", round_4(conf.low), ", ", round_4(conf.high), ")", sep = ""),
n =c(control_n, game_n, sms_n, video_n)) %>%
select(!conf.low & !conf.high) %>%
mutate(term = str_remove(term, "treatment"),
term = factor(term, c("game", "sms", "video", "control"))) %>%
arrange(term) %>%
mutate(p.value = ifelse(term == "control", "-", p.value),
estimate_std_diff = ifelse(term == "control", "-", estimate_std_diff),
conf = ifelse(term == "control", "-", conf)) %>%
select("Treatment"= term,
"Estimate" = estimate_std_value,
"N" = n,
"Difference from Control" = estimate_std_diff,
"95% Conf. Int." = conf,
"p-value" = p.value) %>%
kable(row.names = F, escape = T) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
print()
}
estimate_effect(df_sim, "fb_outcome_binary", "Outcome Binary")
| Treatment | Estimate | N | Difference from Control | 95% Conf. Int. | p-value |
|---|---|---|---|---|---|
| game | 0.6148 (0.0178) | 745 | 0.0093 (0.0192) | (-0.0284, 0.0471) | 0.627041234059019 |
| sms | 0.6074 (0.0177) | 759 | 0.0020 (0.0191) | (-0.0356, 0.0395) | 0.918401926499294 |
| video | 0.5664 (0.0183) | 738 | -0.0390 (0.0196) | (-0.0775, -0.0006) | 0.0467737948032825 |
| control | 0.6054 (0.0072) | 4615 |
|
|
|
estimate_effect(df_sim, "fb_outcome_continuous", "Outcome Continuous")
| Treatment | Estimate | N | Difference from Control | 95% Conf. Int. | p-value |
|---|---|---|---|---|---|
| game | 0.4971 (0.0105) | 745 | -0.0036 (0.0113) | (-0.0258, 0.0186) | 0.749572344614976 |
| sms | 0.5120 (0.0103) | 759 | 0.0112 (0.0112) | (-0.0107, 0.0331) | 0.314803204222693 |
| video | 0.5003 (0.0105) | 738 | -0.0004 (0.0113) | (-0.0225, 0.0217) | 0.969258083314189 |
| control | 0.5007 (0.0042) | 4615 |
|
|
|
Age Group : 18-24, 25-34, 35-44, 45-54, 55-64, 65+
Education: High school or less; Some college; Bachelor’s degree; Graduate degree
Gender: Man, Woman, Non-binary
Political: Republic, Independent, Democrat
# get mean and std. errors by treatment for plotting
tests <- combn(sort(unique(df_sim$treatment)), 2, simplify = F)
covariates <- c("fb_demog_gender_man", "fb_demog_age")
outcomes <- c("fb_outcome_binary", "fb_outcome_continuous")
subgroup_analysis <- function(covariates, outcomes, tests){
outlist <- list()
for(t in 1:length(tests)){
for (cov in covariates){
for(outcome in outcomes){
test = tests[[t]]
df_test <- df_sim[df_sim$treatment %in% test, c("treatment", cov, outcome)]
form <- as.formula(paste(outcome, " ~ 0 +", " treatment * ", cov))
model <- tidy(lm_robust(form, data = df_test))
means <- model[1:2, "estimate"]
names(means) <- test
outlist[[paste(t,cov,outcome, sep = "")]] <- cbind(t(as.matrix(means)), model[3, c("outcome", "estimate", "std.error", "conf.low", "conf.high", "p.value")], nobs = nrow(df_test), covariate = cov)
}
}
}
final <- do.call("smartbind", outlist)
rownames(final) <- NULL
final_out <- final[, c("outcome", "covariate", "control", "game", "sms", "video", "estimate", "std.error", "conf.low", "conf.high", "p.value", "nobs")] %>%
arrange(outcome, covariate)
return(final_out)
}
options(knitr.kable.NA = '')
subgroup <- subgroup_analysis(covariates, outcomes, tests)
subgroup%>%
kable(digits = 4)%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| outcome | covariate | control | game | sms | video | estimate | std.error | conf.low | conf.high | p.value | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| fb_outcome_binary | fb_demog_age | 0.6042 | 0.6363 | 0.0000 | 0.0003 | -0.0006 | 0.0006 | 0.9444 | 5360 | ||
| fb_outcome_binary | fb_demog_age | 0.6042 | 0.6168 | 0.0000 | 0.0003 | -0.0006 | 0.0006 | 0.9444 | 5374 | ||
| fb_outcome_binary | fb_demog_age | 0.6042 | 0.6390 | 0.0000 | 0.0003 | -0.0006 | 0.0006 | 0.9444 | 5353 | ||
| fb_outcome_binary | fb_demog_age | 0.6363 | 0.6168 | -0.0004 | 0.0007 | -0.0018 | 0.0011 | 0.6132 | 1504 | ||
| fb_outcome_binary | fb_demog_age | 0.6363 | 0.6390 | -0.0004 | 0.0007 | -0.0018 | 0.0011 | 0.6132 | 1483 | ||
| fb_outcome_binary | fb_demog_age | 0.6168 | 0.6390 | -0.0002 | 0.0007 | -0.0016 | 0.0013 | 0.8216 | 1497 | ||
| fb_outcome_binary | fb_demog_gender_man | 0.6111 | 0.6070 | -0.0132 | 0.0145 | -0.0417 | 0.0153 | 0.3640 | 5360 | ||
| fb_outcome_binary | fb_demog_gender_man | 0.6111 | 0.6048 | -0.0132 | 0.0145 | -0.0417 | 0.0153 | 0.3640 | 5374 | ||
| fb_outcome_binary | fb_demog_gender_man | 0.6111 | 0.5728 | -0.0132 | 0.0145 | -0.0417 | 0.0153 | 0.3640 | 5353 | ||
| fb_outcome_binary | fb_demog_gender_man | 0.6070 | 0.6048 | 0.0202 | 0.0366 | -0.0516 | 0.0920 | 0.5813 | 1504 | ||
| fb_outcome_binary | fb_demog_gender_man | 0.6070 | 0.5728 | 0.0202 | 0.0366 | -0.0516 | 0.0920 | 0.5813 | 1483 | ||
| fb_outcome_binary | fb_demog_gender_man | 0.6048 | 0.5728 | 0.0059 | 0.0357 | -0.0642 | 0.0759 | 0.8697 | 1497 | ||
| fb_outcome_continuous | fb_demog_age | 0.5114 | 0.5101 | -0.0002 | 0.0002 | -0.0005 | 0.0002 | 0.3027 | 5360 | ||
| fb_outcome_continuous | fb_demog_age | 0.5114 | 0.4743 | -0.0002 | 0.0002 | -0.0005 | 0.0002 | 0.3027 | 5374 | ||
| fb_outcome_continuous | fb_demog_age | 0.5114 | 0.4967 | -0.0002 | 0.0002 | -0.0005 | 0.0002 | 0.3027 | 5353 | ||
| fb_outcome_continuous | fb_demog_age | 0.5101 | 0.4743 | -0.0002 | 0.0004 | -0.0011 | 0.0006 | 0.6089 | 1504 | ||
| fb_outcome_continuous | fb_demog_age | 0.5101 | 0.4967 | -0.0002 | 0.0004 | -0.0011 | 0.0006 | 0.6089 | 1483 | ||
| fb_outcome_continuous | fb_demog_age | 0.4743 | 0.4967 | 0.0007 | 0.0004 | -0.0002 | 0.0015 | 0.1310 | 1497 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.4940 | 0.5073 | 0.0156 | 0.0085 | -0.0011 | 0.0324 | 0.0674 | 5360 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.4940 | 0.5069 | 0.0156 | 0.0085 | -0.0011 | 0.0324 | 0.0674 | 5374 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.4940 | 0.5133 | 0.0156 | 0.0085 | -0.0011 | 0.0324 | 0.0674 | 5353 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.5073 | 0.5069 | -0.0264 | 0.0212 | -0.0680 | 0.0153 | 0.2145 | 1504 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.5073 | 0.5133 | -0.0264 | 0.0212 | -0.0680 | 0.0153 | 0.2145 | 1483 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.5069 | 0.5133 | 0.0114 | 0.0207 | -0.0293 | 0.0520 | 0.5832 | 1497 |
The following code prints the table/figures for gender. We can repeat it for other groups we want to look at.
df_test <- df_sim[df_sim$treatment %in% c("control", "game"), c("treatment", "fb_demog_gender_man", "fb_outcome_continuous")]
form <- as.formula(paste("fb_outcome_continuous", " ~ 0 +", " treatment * fb_demog_gender_man"))
model <- tidy(lm_robust(form, data = df_test))
df_sim %>%
select(fb_demog_gender_man, treatment, fb_outcome_continuous) %>%
filter(!is.na(treatment)) %>%
group_by(treatment, fb_demog_gender_man) %>%
summarize_all(list(mean = mean, sd =sd, n = length)) %>%
mutate(se = sd/sqrt(n))%>%
ggplot(aes(group = treatment, color = as.factor(fb_demog_gender_man))) +
geom_segment(aes(x = mean -1.96 * se, xend = mean + 1.96 *se, y = fb_demog_gender_man, yend = fb_demog_gender_man)) +
geom_point(aes(x = mean, y = fb_demog_gender_man)) +
facet_grid(rows =vars(treatment), switch = "y") +
scale_y_continuous(breaks = c(0, 1),
limits = c(-.25, 1.25)) +
theme_sarah() +
labs(y = "", x = "Continuous Outcome", title = "Continuous Outcome for Gender\nBy Treatment Group") +
theme(panel.grid.minor.y= element_blank(),
axis.text.y = element_blank(),
strip.text.y.left = element_text(angle = 0),
legend.position = "none")
subgroup %>%
filter(outcome == "fb_outcome_continuous" & covariate == "fb_demog_gender_man") %>%
kable(digits = 4)%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| outcome | covariate | control | game | sms | video | estimate | std.error | conf.low | conf.high | p.value | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| fb_outcome_continuous | fb_demog_gender_man | 0.494 | 0.5073 | 0.0156 | 0.0085 | -0.0011 | 0.0324 | 0.0674 | 5360 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.494 | 0.5069 | 0.0156 | 0.0085 | -0.0011 | 0.0324 | 0.0674 | 5374 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.494 | 0.5133 | 0.0156 | 0.0085 | -0.0011 | 0.0324 | 0.0674 | 5353 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.5073 | 0.5069 | -0.0264 | 0.0212 | -0.0680 | 0.0153 | 0.2145 | 1504 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.5073 | 0.5133 | -0.0264 | 0.0212 | -0.0680 | 0.0153 | 0.2145 | 1483 | ||
| fb_outcome_continuous | fb_demog_gender_man | 0.5069 | 0.5133 | 0.0114 | 0.0207 | -0.0293 | 0.0520 | 0.5832 | 1497 |
# Make nice labels for plot
labels <- data.frame(covariate = c(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog_(gender|education|age_group)_.+")],
"fb_demog_age"))
labels$covariate_label <- str_remove(labels$covariate, "fb_demog_")
labels$covariate_group <- str_to_title(str_extract(labels$covariate, "age$|age_group|education|gender"))
labels$covariate_label <- str_to_title(str_remove(labels$covariate_label, "(education|gender|age_group)_"))
labels$covariate_label <- factor(labels$covariate_label, levels = labels$covariate_label)
# standardized mean difference function
# https://statisticaloddsandends.wordpress.com/2021/10/31/standardized-mean-difference-smd-in-causal-inference/
# https://cran.r-project.org/web/packages/TOSTER/vignettes/SMD_calcs.html#:~:text=The%20SMD%20is%20then%20the,d%3D%CB%89xs
smd_fun <- function(mean_1, mean_0, var_1, var_0){
smd <- (mean_1 - mean_0)/(sqrt((var_1 + var_0)/2))
return(smd)
}
# standardized mean difference function by covaraite
smd_by_covariate <- function(data, groups, covs){
smd_list <- list()
# combinations of all covariate - groups to calculate means for
combos <- expand.grid(covs, groups)
for(i in 1:nrow(combos)){
combo_i<- combos[i, ]
cov_sym <- combo_i[[1]][[1]]
cov <- enquo(cov_sym)
group_sym <- combo_i[[2]][[1]]
group <- enquo(group_sym)
df_subset <- data %>%
select(!!cov, !!group) %>%
filter(!is.na(!!group)) %>%
group_by(!!group) %>%
summarize(mean = mean(!!cov),
var = var(!!cov))
mean_1 = df_subset %>% filter(!!group == 1) %>% pull(mean)
mean_0 = df_subset %>% filter(!!group == 0) %>% pull(mean)
var_1 = df_subset %>% filter(!!group == 1) %>% pull(var)
var_0 = df_subset %>% filter(!!group == 0) %>% pull(var)
smd_out <- smd_fun(mean_1, mean_0, var_1, var_0)
smd_list[[i]] <- list(group = as_label(group),
covariate = as_label(cov),
smd = smd_out,
mean_1 = mean_1,
mean_0 = mean_0)
}
smd_list_out <-do.call("rbind.data.frame", smd_list)
rownames(smd_list_out) <-NULL
#return(smd_list)
return(smd_list_out)
}
covs <- syms(c(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog_(gender|education|age_group)_.+")],
"fb_demog_age"))
groups <- syms(c("full_complete", "treatment_group", "treatment_sms", "treatment_game","treatment_video"))
out <- smd_by_covariate(data = df_sim, groups = groups, covs = covs) %>%
merge(labels, by = "covariate", all.x = T)
cat(paste("### By Completion"))
out[out$group == "full_complete", ] %>%
ggplot(aes(x = smd, y = covariate_label)) +
geom_vline(xintercept = 0.1, linetype = "dashed") +
geom_vline(xintercept = -0.1, linetype = "dashed") +
geom_vline(xintercept = 0) +
geom_point(size =3) +
labs(y = "", x = "Standardize Mean Difference",
title = "Difference in Samples by Survey Completion",
subtitle ="Mean Completers - Mean Non-completers") +
theme_sarah()
cat("\n\n\n")
cat(paste("### By Treatment Group" ))
out[out$group == "treatment_group", ] %>%
ggplot(aes(x = smd, y = covariate_label)) +
geom_vline(xintercept = 0.1, linetype = "dashed") +
geom_vline(xintercept = -0.1, linetype = "dashed") +
geom_vline(xintercept = 0) +
geom_point(size =3) +
labs(y = "", x = "Standardize Mean Difference",
title = "Difference in Samples by Treatment Group",
subtitle ="Mean Treated Group- Mean Control Group") +
theme_sarah()
cat("\n\n\n")
cat(paste("### By Treatment" ))
out[out$group %in% c("treatment_sms", "treatment_game", "treatment_video"), ] %>%
ggplot(aes(x = smd, y = covariate_label, color = group, shape = group)) +
geom_vline(xintercept = 0.1, linetype = "dashed") +
geom_vline(xintercept = -0.1, linetype = "dashed") +
geom_vline(xintercept = 0) +
geom_point(size =3) +
labs(y = "", x = "Standardize Mean Difference",
title = "Difference in Samples by Treatment",
subtitle ="Mean Treatment - Mean Control") +
theme_sarah()
cat("\n\n\n")
I wrote two functions for distributions of continuous variables and factor variables at multiple treatment comparison levels.
Below I show them for continuous age and discrete gender, but we can apply these to pilot 2 variables as needed.
continuous_plots <- function(variable, label){
plots <- list()
plots[["overall"]] <- ggplot(df_sim[!is.na(df_sim$treatment_group), ], aes(x = get(variable))) +
geom_density(alpha= 0.1, fill = "black") +
theme_sarah() +
scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) +
scale_y_continuous(limits = c(0, 0.035)) +
labs(x = label, y = "")
plots[["treatment_group"]] <- ggplot(df_sim[!is.na(df_sim$treatment_group ),], aes(x = get(variable),
group = as.factor(treatment_group),
color = as.factor(treatment_group),
fill = as.factor(treatment_group))) +
geom_density(alpha= 0.1) +
scale_fill_manual(values = cb_colors[c(7,9)], labels = c("Control", "Treatment")) +
scale_color_manual(values = cb_colors[c(7, 9)], labels = c("Control", "Treatment")) +
theme_sarah() +
scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) +
scale_y_continuous(limits = c(0, 0.035)) +
labs(x = label, y = "", fill = "Treatment Group", color = "Treatment Group")
plots[["treatment"]] <-ggplot(df_sim[!is.na(df_sim$treatment), ], aes(x = get(variable), group = treatment, color = treatment, fill = treatment)) +
geom_density(alpha= 0.1) +
scale_fill_manual(values = cb_colors[3:6]) +
scale_color_manual(values = cb_colors[3:6]) +
theme_sarah() +
scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) +
scale_y_continuous(limits = c(0, 0.035)) +
labs(x = label,
y = "",
fill = "Treatment",
color = "Treatment")
return(plots)
}
cat("#### Overall")
continuous_plots("fb_demog_age", "Age")$overall
cat("\n\n\n")
cat("#### Treatment Group")
continuous_plots("fb_demog_age", "Age")$treatment_group
cat("\n\n\n")
cat("#### Treatment")
continuous_plots("fb_demog_age", "Age")$treatment
discrete_plots <- function(variable, label){
plots <- list()
plots[["overall"]] <- df_sim %>%
filter(!is.na(variable)) %>%
ggplot(
aes(x = get(variable),
fill = get(variable))) +
geom_bar(width = 0.75) +
labs(fill = label, y = "", x = label) +
theme_sarah() +
theme(legend.position = "none",
panel.grid.major.x = element_blank())
plots[["treatment_group"]] <- df_sim %>%
filter(!is.na(variable) & !is.na(treatment_group_label)) %>%
group_by_at(c(variable, "treatment_group_label")) %>%
count() %>%
group_by(treatment_group_label) %>%
mutate(freq = n/sum(n))%>%
ggplot(aes(x = treatment_group_label,
y = freq,
group = get(variable))) +
geom_bar(aes(fill =treatment_group_label), stat = "identity") +
facet_grid(~get(variable)) +
theme_sarah() +
scale_fill_manual(values = cb_colors[c(7,9)]) +
scale_color_manual(values = cb_colors[c(7, 9)]) +
theme(legend.position = "none",
panel.grid.major.x = element_blank()) +
labs(y = "", x= "")
plots[["treatment"]] <-df_sim %>%
filter(!is.na(variable) & !is.na(treatment)) %>%
group_by_at(c(variable, "treatment")) %>%
count() %>%
group_by(treatment) %>%
mutate(freq = n/sum(n))%>%
ggplot(aes(x = treatment,
y = freq,
group = get(variable))) +
geom_bar(aes(fill = treatment), stat = "identity") +
facet_grid(~get(variable)) +
theme_sarah() +
scale_fill_manual(values = cb_colors[3:6]) +
scale_color_manual(values = cb_colors[3:6]) +
theme(legend.position = "none") +
labs(y = "", x= "")
return(plots)
}
cat("#### Overall")
discrete_plots("fb_demog_gender", "Gender")$overall
cat("\n\n\n")
cat("#### Treatment Group")
discrete_plots("fb_demog_gender", "Gender")$treatment_group
cat("\n\n\n")
cat("#### Treatment")
discrete_plots("fb_demog_gender", "Gender")$treatment