# data dictionary 
data_dictionary <- read.csv("~/Documents/git/fb_misinfo_interventions/data/data_dictionary.csv")
# ads data by ad
ads <- read.csv("~/Documents/git/fb_misinfo_interventions/data/cleaned_ads.csv")
# ads data by age and gender
ads_demo <- read.csv("~/Documents/git/fb_misinfo_interventions/data/cleaned_ads_age_gender.csv")

# order of variables by chatbot-treatment combination

out <- list()
for(i in c("control", "game", "video", "sms")){
  out [[i]] <- read_sheet("https://docs.google.com/spreadsheets/d/1Kwjm6tEHycGuy_tnlRZURLb_rrGUoP8jpZNRcaeSyyc/edit#gid=0", sheet = i) 
}

detailed_funnel <- do.call("rbind", out)

# this has ggplot theme preset
source(paste0("https://raw.githubusercontent.com/scmcdonald/visualization/main/theme_sarah.R"))

# load function that defines factor levels so tables and figures have correct ordering
source("~/Documents/git/fb_misinfo_interventions/pilot2_analysis/factor_cols.R")


# define discrete colorblind palette
cb_colors <- brewer.pal(n = 8, name = "Dark2")
cb_colors_extra <- brewer.pal(n = 8, name = "Set2")
cb_colors <- c(cb_colors, cb_colors_extra)
### intro
df_sim <- data.frame(ad = sample(c("attention 1", "outrage 1", "real 1", "real 2", 
                "fake", "fake news 2", "help friends", "protect", "join"), 
                size = 10000, replace = T, prob = rep(1/9, 9)), 
                conversations_started = rep(1, 10000), 
                consent = sample(c(0, 1), size = 10000, replace = T, prob = c(0.3, 0.7))
                
                )


df_sim$intro_go_on <- ifelse(df_sim$consent == 0, NA, 
                                 sample(c(0, 1), 
                                        size = sum(df_sim$consent), 
                                        replace = T, prob = c(0.02, 0.98)))
 

df_sim$intro_complete  <- ifelse(df_sim$intro_go_on == 1 & !is.na(df_sim$intro_go_on ), 1, 0)
 

### treatment
df_sim$treatment <- ifelse(df_sim$intro_complete == 0 | is.na(df_sim$intro_complete), NA, 
                           
                           sample(c("control", "sms", "video", "game"), size = sum(df_sim$intro_complete, na.rm = T), replace = T, 
                                  prob = c(2/3, 1/3 * 1/3, 1/3 * 1/3,  1/3 * 1/3)))


df_sim$treatment_group <- ifelse(df_sim$treatment == "control", 0, 
                                 ifelse(is.na(df_sim$treatment), NA, 1))
df_sim$treatment_group_label <- ifelse(df_sim$treatment == "control", "control", 
                                 ifelse(is.na(df_sim$treatment), NA, "treatment"))

df_sim$treatment_sms <- ifelse(df_sim$treatment== "sms", 1, 
                               ifelse(df_sim$treatment== "control", 0, NA))

df_sim$treatment_game <- ifelse(df_sim$treatment== "game", 1, 
                               ifelse(df_sim$treatment== "control", 0, NA))
df_sim$treatment_video <- ifelse(df_sim$treatment== "video", 1, 
                                ifelse(df_sim$treatment== "control", 0, NA))


#### sms

df_sim$sms_quiz <-  ifelse(df_sim$treatment_sms== 1 & !is.na(df_sim$treatment_sms), 
                           sample(c("a", "b", NA_character_), 
                                  size = sum(df_sim$treatment_sms== 1, na.rm = T), replace = T, prob = c(.45, .45, .1)), NA)

df_sim$family_friend_check <-  ifelse(!is.na(df_sim$sms_quiz), 
                           sample(c("a", "b", NA_character_), 
                                  size = sum(!is.na(df_sim$sms_quiz)), replace = T, prob = c(.45, .45, .1)), NA)

df_sim$fear_quiz <-  ifelse(!is.na(df_sim$family_friend_check), 
                           sample(c("a", "b", NA_character_), 
                                  size = sum(!is.na(df_sim$family_friend_check)), replace = T, prob = c(.45, .45, .1)), NA)


 df_sim$anger_quiz <-  ifelse(!is.na(df_sim$fear_quiz), 
                           sample(c("a", "b", NA_character_), 
                                  size = sum(!is.na(df_sim$fear_quiz)), replace = T, prob = c(.45, .45, .1)), NA)  
 
 
 #### game
 
df_sim$game_completed <-  ifelse(df_sim$treatment_game== 1 & !is.na(df_sim$treatment_game), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$treatment_game== 1, na.rm = T), replace = T, prob = c(.8, .2)), NA)
 
 
df_sim$score <-  ifelse(df_sim$game_completed== 1 & !is.na(df_sim$game_completed), 
                           sample(c(1:5, NA_integer_), 
                                  size = sum(df_sim$game_completed== 1, na.rm = T), replace = T, prob = c(.3, .2, .15, .10, .2, .05)), NA)


#### video

df_sim$video_1 <-  ifelse(df_sim$treatment_video== 1 & !is.na(df_sim$treatment_video), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$treatment_video== 1, na.rm = T), replace = T, prob = c(.95, .05)), NA)
 

df_sim$video_2 <-  ifelse(df_sim$video_1== 1 & !is.na(df_sim$video_1), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$video_1== 1, na.rm = T), replace = T, prob = c(.96, .04)), NA)


df_sim$video_3 <-  ifelse(df_sim$video_2== 1 & !is.na(df_sim$video_2), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$video_2== 1, na.rm = T), replace = T, prob = c(.97, .03)), NA)

df_sim$video_4 <-  ifelse(df_sim$video_3== 1 & !is.na(df_sim$video_3), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$video_3== 1, na.rm = T), replace = T, prob = c(.98, .02)), NA)

df_sim$video_5 <-  ifelse(df_sim$video_4== 1 & !is.na(df_sim$video_4), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$video_4== 1, na.rm = T), replace = T, prob = c(.98, .02)), NA)



df_sim$treatment_complete <- ifelse((df_sim$treatment == "control" & !is.na(df_sim$treatment))|
                                      df_sim$video_5 == 1 & !is.na(df_sim$video_5)|
                                      !is.na(df_sim$score) | !is.na(df_sim$anger_quiz)
                                      , 1, 0)
### misinfo quiz - attention check

df_sim$headline1_answer <- ifelse(df_sim$treatment_complete == 1,   sample(c("a", "b", NA_character_), 
                                  size = sum(df_sim$treatment_complete), replace = T, prob = c(.45, .45, .1)), NA) 


df_sim$headline2_answer <- ifelse(!is.na(df_sim$headline1_answer),   sample(c("a", "b", NA_character_), 
                                  size = sum(!is.na(df_sim$headline1_answer)), replace = T, prob = c(.45, .40, .05)), NA) 



df_sim$attention_complete <- ifelse(!is.na(df_sim$headline2_answer), 1, 0)

### consequences nudge


df_sim$per_unfollow <- ifelse(df_sim$attention_complete == 1,   sample(c("a", "b", NA_character_), 
                                  size = sum(df_sim$attention_complete), replace = T, prob = c(.45, .45, .1)), NA) 

df_sim$consequences_complete <-  ifelse(!is.na(df_sim$per_unfollow), 1, 0)


### feedback

df_sim$enjoy_most <- ifelse(df_sim$consequences_complete == 1,   sample(c("aaa", "bbb", "ccc", NA_character_), 
                                  size = sum(df_sim$consequences_complete), replace = T, prob = c(.3, .3, .3, .1)), NA) 

df_sim$enjoy_least <- ifelse(!is.na(df_sim$enjoy_most),   sample(c("aaa", "bbb", "ccc", NA_character_), 
                                  size = sum(!is.na(df_sim$enjoy_most)), replace = T, prob = c(.3, .3, .35, .05)), NA) 



df_sim$feedback_complete <- ifelse(!is.na(df_sim$enjoy_most), 1, 0)

### DEMOGRAPHICS


df_sim$demog_age <- ifelse(df_sim$feedback_complete == 1, sample(c(18:99, NA_integer_), size = sum(df_sim$feedback_complete == 1), replace = T, 
                        prob = rep(1/(length(18:99)+1), length(18:99)+1)), NA)
  
  
df_sim$demog_gender <- ifelse(!is.na(df_sim$demog_age), 
                        sample(c("woman", "man", "non-binary", NA_character_), size = sum(!is.na(df_sim$demog_age)), replace = T, 
                                  prob = c(0.85/2, 0.85/2, 0.10, 0.05))  , NA
                          
                          )

df_sim$demog_ethnic <- ifelse(!is.na(df_sim$demog_gender), 
                        sample(c("white or caucasian", "black or african", 
                                       "hispanic", 
                                       "asian or indian", 
                                       "native american", 
                                       "pacific islander", NA_character_), size = sum(!is.na(df_sim$demog_gender)), replace = T, 
                                  prob = c(rep(0.9/6, 6), 0.1))  , NA )


df_sim$demog_income <- ifelse(!is.na(df_sim$demog_ethnic), 
                        sample(c("less than $30,000", 
                                                           "$30,000 - $49,999",
                                                           "$50,000 - $69,999", 
                                                           "$70,000 - $99,999", 
                                                           "$100,000 - $150,000", 
                                                           "more than $150,000", NA_character_), size = sum(!is.na(df_sim$demog_ethnic)), replace = T, 
                                  prob = c(rep(0.9/6, 6), 0.1))  , NA )



df_sim$demog_education <- ifelse(!is.na(df_sim$demog_income), 
                        sample(c("less than $30,000", 
                                                           "$30,000 - $49,999",
                                                           "$50,000 - $69,999", 
                                                           "$70,000 - $99,999", 
                                                           "$100,000 - $150,000", 
                                                           "more than $150,000", NA_character_), size = sum(!is.na(df_sim$demog_income)), replace = T, 
                                  prob = c(rep(0.9/6, 6), 0.1))  , NA )


df_sim$demog_religion <- ifelse(!is.na(df_sim$demog_education), 
                        sample(c(1:6, NA_character_), size = sum(!is.na(df_sim$demog_education)), replace = T, 
                                  prob = c(rep(0.9/6, 6), 0.1))  , NA )

df_sim$demog_political <- ifelse(!is.na(df_sim$demog_religion), 
                        sample(c("democrat", "independent", "republican", NA_character_), size = sum(!is.na(df_sim$demog_religion)), replace = T, 
                                  prob = c(rep(0.95/3, 3), 0.1))  , NA )




df_sim$demographics_complete <- ifelse(!is.na(df_sim$demog_political), 1, 0)



### end

df_sim$amazon_credit <- ifelse(df_sim$demographics_complete  == 1, 
                               sample(c(letters, NA_character_), 
                                      replace =T, size = sum(df_sim$demographics_complete ), 
                                      prob = rep(1/27, 27)), NA)

df_sim$full_complete <- ifelse(!is.na(df_sim$amazon_credit), 1, 0)


# outcomes for everyone who started survey

df_sim$fb_outcome_binary <- sample(c(0, 1), size =10000, 
                                   replace = T, prob = c(0.4, 0.6))

df_sim$fb_outcome_continuous <- runif(10000, 0, 1)




df_sim$fb_demog_age <- sample(18:99, size = 10000, replace = T, 
                        prob = rep(1/length(18:99), length(18:99)))
                 


df_sim$fb_demog_gender <-as.factor(sample(c("woman", "man", "non-binary"), size = 10000, replace = T, 
                                  prob = c(0.85/2, 0.85/2, 0.15)))
                           


df_sim$fb_demog_race <- sample(c("white or caucasian", "black or african", 
                                       "hispanic", 
                                       "asian or indian", 
                                       "native american", 
                                       "pacific islander"), 10000, 
                            replace = T, prob = rep(1/6, 6))

df_sim$fb_demog_education <- as.factor(sample(c("< high school", 
                                 "high school", 
                                 "some college", 
                                 "2-year degree", 
                                 "4-year degree", 
                                 "graduate degree"), 10000, 
                               replace = T, prob = rep(1/6, 6)))

df_sim$fb_demog_income <- as.factor(sample(c("less than $30,000", 
                                                           "$30,000 - $49,999",
                                                           "$50,000 - $69,999", 
                                                           "$70,000 - $99,999", 
                                                           "$100,000 - $150,000", 
                                                           "more than $150,000"), 10000, 
                               replace = T, prob = rep(1/6, 6)))

df_sim <- df_sim %>%
  mutate(fb_demog_age_group = 
           as.factor(case_when(fb_demog_age <25 & fb_demog_age> 17 ~ "18-24", 
                     fb_demog_age <35 & fb_demog_age> 24 ~ "25-34", 
                     fb_demog_age <45 & fb_demog_age> 34 ~ "35-44", 
                     fb_demog_age <55 & fb_demog_age> 44 ~ "45-54", 
                     fb_demog_age <65 & fb_demog_age> 54 ~ "55-64",
                     fb_demog_age> 64 ~ "65+",
                     TRUE ~ NA_character_))
           )

df_sim <- as.data.frame(one_hot(as.data.table(df_sim), dropCols=FALSE))


ads[1:9, "ad_name"] <- unique(df_sim$ad)
# format numbers to percent
form_percent <- function(dec){
  if(is.na(dec)){
    return("-")
  } else {
    return(paste0(round(dec, 4) * 100, "%"))
  }
}

# format number dollar type
form_cost <- function(num){
  
  sapply(num, function(x) if (x == Inf| is.na(x)) {
    return("-")
  } else {
    return(paste0("$", round(x, 3)))
  })
  
}

# round number to 4 with trailing 0s
# converts to string
round_4 <- function(x){
  formatC(x, digits = 4, format = "f")
}

# format number with commas for 
# large numbers without decimals
form_num <- function(num){
  
  sapply(num, function(x) if (x == Inf| is.na(x)) {
    return("-")
  } else {
    return( formatC(x, big.mark = ","))
  })
  
}

# this puts sms to all caps and control to Control
label_fun <- function(x){
    ifelse(nchar(x) <4 & x != "all", toupper(x), str_to_title(x))
  }

1 Project Objectives

The goal of the Facebook Misinformation Project is to experimentally evaluate scaled versions of promising interventions to reduce misinformation spread on social media using on-platform behavioral data.

2 Pilot Objectives

Update for Pilot 2:

3 Purpose of Analysis Script

Update for Pilot 2: The analysis script cleans and analyzes pilot data from Ads Manager, Chatfuel, and chatbot responses. The output includes the following tables:

  • Descriptive Statistics

  • Funnel Statistics

  • Treatment Analysis

  • Subgroup Analysis

4 Samples

4.1 Targeting Strategy

Update for Pilot 2: Our targeting strategy is estimated to target 53 million FB users, which is 22% of total FB users in the US (240 million). The pilot target users (1) in the union of the top 10 states that are most conservative + top 20 most rural states (2) OR in the top 2,500 largest ZIP codes by population in other states that have high republican votes (# votes for Republican candidates / # total votes > 95%) or high rural population rate (% rural population > 85%) (3) Among users in (1) or (2), target users with age 35+.

4.2 Randomization Strategy

4.2.1 Ad-Level

Update for Pilot 2: The pilot includes samples to test 9 versions of ads - Outrage 1, Outrage 2, Attention 1, Attention 2, Real 1, Real 2, Fake News 1, Fake News 2, Fake.

4.2.1.1 Outrage

4.2.1.1.1 Ad 1

4.2.1.1.2 Ad 2

4.2.1.2 Attention

4.2.1.2.1 Ad 1

4.2.1.2.2 Ad 2

4.2.1.3 Real

4.2.1.3.1 Ad 1

4.2.1.3.2 Ad 2

4.2.1.4 Fake News

4.2.1.4.1 Ad 1

4.2.1.4.2 Ad 2

4.2.1.5 Fake

4.2.2 Chatbot-Level

After users click on the ads, they are randomized to be connected with one of the 4 chatbot conditions: two pre-consent chat conditions to test two different factoids (Factoid 1, Factoid 2), and two post-consent intro conditions to test the performance of the intro section with vs. without an additional motivator (Consent only, Motivator).

1a Consent only no factoid no motivator
1b Motivator no factoid motivator
2a Factoid 1 factoid 1 no motivator
2b Factoid 2 factoid 2 no motivator

4.2.3 Intervention-Level

After users consent to participate in the study, they are randomized into an intervention group and a control group. The intervention group is exposed to the SMS course, consequences nudge and content replacement; the control group is exposed to the facts baseline course only.

5 Main Takeaways

Will need to update for Pilot 2:

6 Data Dictionary

The data dictionary below outlines each variable used in analysis from the Chatfuel dataset. Group denotes the section of the chatbot. Derived denotes if this variable was created during the cleaning process.

# print data dictionary
data_dictionary %>%
  select(cols, description,values, group, type, derived) %>%
  datatable(rownames = F)

7 Descriptive Statistics

7.1 Numeric Variables

This table reports the mean, standard deviations, and total number of non-missing observations for numeric variables. Note that individuals without an arm assignment are not included in these tables. Participants who do not consent to the study do not receive a treatment assignment.

7.1.1 Demographics

Here we report the summary statistics for all FB demographics variables of interest by treatment group.

# summary for all treatments
df_sim %>%
  filter(!is.na(treatment)) %>%
  select(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog")]) %>%
  select(!fb_demog_gender & !fb_demog_race & !fb_demog_education & !fb_demog_income &!fb_demog_age_group ) %>%
  summarise_all(list(n= function(x) sum(!is.na(x)), 
                     mean = function(x) mean(x, na.rm = T), 
                     sd = function(x) sd(x, na.rm = T))) %>%
  mutate(treatment = "all treatments") %>%
  # summary by treatmet
  rbind(df_sim %>%
  filter(!is.na(treatment)) %>%
  select(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog")], treatment) %>%
  select(!fb_demog_gender & !fb_demog_race & !fb_demog_education & !fb_demog_income &!fb_demog_age_group ) %>%
  group_by(treatment) %>%
  summarise_all(list(n= function(x) sum(!is.na(x)), 
                     mean = function(x) mean(x, na.rm = T), 
                     sd = function(x) sd(x, na.rm = T))))%>%
      pivot_longer(cols = !treatment, 
                   names_to = "variable",
                   values_to = "value") %>% 
      separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
      pivot_wider(names_from = c("treatment", "measure"))%>%
  # round mean and sd to 4 digits but not n
  mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
  kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
  add_header_above(c(" " = 1, "All" = 3,"Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
All
Control
Game
SMS
Video
Variable N Mean SD N Mean SD N Mean SD N Mean SD N Mean SD
fb_demog_age 6857 58.2907 23.7599 4615 58.6540 23.7090 745 57.4188 24.0327 759 56.2648 23.7804 738 58.9824 23.6955
fb_demog_gender_man 6857 0.4279 0.4948 4615 0.4295 0.4951 745 0.3852 0.4870 759 0.4466 0.4975 738 0.4417 0.4969
fb_demog_gender_non-binary 6857 0.1429 0.3500 4615 0.1428 0.3499 745 0.1490 0.3563 759 0.1383 0.3455 738 0.1423 0.3496
fb_demog_gender_woman 6857 0.4292 0.4950 4615 0.4277 0.4948 745 0.4658 0.4992 759 0.4150 0.4931 738 0.4160 0.4932
fb_demog_education_< high school 6857 0.1776 0.3822 4615 0.1805 0.3846 745 0.1893 0.3920 759 0.1634 0.3699 738 0.1626 0.3693
fb_demog_education_2-year degree 6857 0.1663 0.3723 4615 0.1671 0.3731 745 0.1490 0.3563 759 0.1647 0.3711 738 0.1802 0.3846
fb_demog_education_4-year degree 6857 0.1712 0.3767 4615 0.1714 0.3769 745 0.1772 0.3821 759 0.1713 0.3770 738 0.1640 0.3705
fb_demog_education_graduate degree 6857 0.1600 0.3666 4615 0.1590 0.3658 745 0.1584 0.3654 759 0.1542 0.3613 738 0.1734 0.3789
fb_demog_education_high school 6857 0.1587 0.3654 4615 0.1567 0.3635 745 0.1611 0.3678 759 0.1726 0.3781 738 0.1545 0.3616
fb_demog_education_some college 6857 0.1663 0.3723 4615 0.1653 0.3715 745 0.1651 0.3715 759 0.1739 0.3793 738 0.1653 0.3717
fb_demog_income_$100,000 - $150,000 6857 0.1574 0.3642 4615 0.1536 0.3606 745 0.1517 0.3589 759 0.1647 0.3711 738 0.1789 0.3835
fb_demog_income_$30,000 - $49,999 6857 0.1759 0.3807 4615 0.1751 0.3801 745 0.1758 0.3809 759 0.1739 0.3793 738 0.1829 0.3869
fb_demog_income_$50,000 - $69,999 6857 0.1619 0.3684 4615 0.1655 0.3717 745 0.1490 0.3563 759 0.1634 0.3699 738 0.1504 0.3577
fb_demog_income_$70,000 - $99,999 6857 0.1639 0.3702 4615 0.1699 0.3756 745 0.1490 0.3563 759 0.1607 0.3675 738 0.1450 0.3523
fb_demog_income_less than $30,000 6857 0.1660 0.3721 4615 0.1653 0.3715 745 0.1852 0.3887 759 0.1713 0.3770 738 0.1450 0.3523
fb_demog_income_more than $150,000 6857 0.1750 0.3800 4615 0.1705 0.3761 745 0.1893 0.3920 759 0.1660 0.3723 738 0.1978 0.3986
fb_demog_age_group_18-24 6857 0.0887 0.2843 4615 0.0858 0.2801 745 0.0993 0.2993 759 0.1014 0.3021 738 0.0827 0.2755
fb_demog_age_group_25-34 6857 0.1228 0.3282 4615 0.1196 0.3245 745 0.1262 0.3323 759 0.1423 0.3496 738 0.1192 0.3243
fb_demog_age_group_35-44 6857 0.1245 0.3302 4615 0.1270 0.3330 745 0.1221 0.3277 759 0.1212 0.3266 738 0.1152 0.3195
fb_demog_age_group_45-54 6857 0.1237 0.3292 4615 0.1211 0.3263 745 0.1181 0.3230 759 0.1318 0.3384 738 0.1369 0.3439
fb_demog_age_group_55-64 6857 0.1168 0.3212 4615 0.1129 0.3165 745 0.1369 0.3440 759 0.1238 0.3296 738 0.1138 0.3178
fb_demog_age_group_65+ 6857 0.4235 0.4942 4615 0.4336 0.4956 745 0.3973 0.4897 759 0.3794 0.4856 738 0.4322 0.4957

7.1.2 Funnel Numeric

funnel_vars <- c("conversations_started", 
                 "consent", 
                 "intro_complete", 
                 "treatment_complete", 
                 "consequences_complete", 
                 "feedback_complete", 
                 "demographics_complete", 
                 "full_complete")

df_sim %>%
  filter(!is.na(treatment) ) %>%
  select(all_of(funnel_vars)) %>%
  summarise_all(list(n= length, mean = mean, sd =sd)) %>%
  mutate(treatment = "all") %>%
  rbind(df_sim %>%
  select(all_of(funnel_vars), treatment) %>%
    filter(!is.na(treatment) ) %>%
  group_by(treatment) %>%
  summarise_all(list(n= length, mean = mean, sd =sd))) %>%
  pivot_longer( cols = !treatment, 
                names_to = "variable",
                values_to = "value") %>% 
  separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
  pivot_wider(names_from = c("treatment", "measure")) %>%
    # round mean and sd to 4 digits but not n
  mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
  kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
  add_header_above(c(" " = 1, "All" = 3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
All
Control
Game
SMS
Video
Variable N Mean SD N Mean SD N Mean SD N Mean SD N Mean SD
conversations_started 6857 1.0000 0.0000 4615 1.0000 0.0000 745 1.0000 0.0000 759 1.0000 0.0000 738 1.0000 0.0000
consent 6857 1.0000 0.0000 4615 1.0000 0.0000 745 1.0000 0.0000 759 1.0000 0.0000 738 1.0000 0.0000
intro_complete 6857 1.0000 0.0000 4615 1.0000 0.0000 745 1.0000 0.0000 759 1.0000 0.0000 738 1.0000 0.0000
treatment_complete 6857 0.9262 0.2615 4615 1.0000 0.0000 745 0.7839 0.4119 759 0.6640 0.4726 738 0.8780 0.3275
consequences_complete 6857 0.7047 0.4562 4615 0.7569 0.4290 745 0.6000 0.4902 759 0.5204 0.4999 738 0.6734 0.4693
feedback_complete 6857 0.6358 0.4812 4615 0.6871 0.4637 745 0.5262 0.4996 759 0.4625 0.4989 738 0.6043 0.4893
demographics_complete 6857 0.3564 0.4790 4615 0.3844 0.4865 745 0.2966 0.4571 759 0.2675 0.4429 738 0.3333 0.4717
full_complete 6857 0.3429 0.4747 4615 0.3686 0.4825 745 0.2805 0.4496 759 0.2648 0.4415 738 0.3252 0.4688

7.1.3 Outcome

# all 
df_sim %>%
  filter(!is.na(treatment)) %>%
  select(fb_outcome_binary, fb_outcome_continuous) %>%
  summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))) %>%
  mutate(treatment = "all") %>%
  # by treatment
    rbind(df_sim %>%
    select(fb_outcome_binary, fb_outcome_continuous, treatment) %>%
    filter(!is.na(treatment)) %>%
    group_by(treatment) %>%
    summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T)))) %>%
  pivot_longer( cols = !treatment, 
                names_to = "variable",
                values_to = "value") %>% 
  separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
  pivot_wider(names_from = c("treatment", "measure")) %>%
  # round mean and sd to 4 digits but not n
  mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
  kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
  add_header_above(c(" " = 1, "All" = 3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
All
Control
Game
SMS
Video
Variable N Mean SD N Mean SD N Mean SD N Mean SD N Mean SD
fb_outcome_binary 6857 0.6025 0.4894 4615 0.6054 0.4888 745 0.6148 0.4870 759 0.6074 0.4887 738 0.5664 0.4959
fb_outcome_continuous 6857 0.5016 0.2860 4615 0.5007 0.2865 745 0.4971 0.2864 759 0.5120 0.2846 738 0.5003 0.2840

7.1.4 Other Numeric

# summary overall

df_sim %>%
  select(!contains(c("demog", "outcome", "complete", funnel_vars))) %>%
  filter(!is.na(treatment)) %>%
  select_if(is.numeric) %>%
  summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))) %>%
  mutate(treatment = "all") %>%
  rbind(df_sim %>%
  select(!contains(c("demog", "outcome", "complete", funnel_vars))) %>%
  group_by(treatment) %>%
  select_if(is.numeric) %>%
  filter(!is.na(treatment)) %>%
      group_by(treatment) %>%
      summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))))%>%
      pivot_longer( cols = !treatment, 
                    names_to = "variable",
                    values_to = "value") %>% 
      separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
      pivot_wider(names_from = c("treatment", "measure")) %>%
    # round mean and sd to 4 digits but not n
  mutate_at(vars(contains(c("mean", "sd"))), round_4)  %>% 
  # if NaN or NA, replace with "-"
  mutate_all(~ replace(., .=="  NaN" | . == "   NA", "-")) %>%
  kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
  add_header_above(c(" " = 1, "All" =3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
All
Control
Game
SMS
Video
Variable N Mean SD N Mean SD N Mean SD N Mean SD N Mean SD
intro_go_on 6857 1.0000 0.0000 4615 1.0000 0.0000 745 1.0000 0.0000 759 1.0000 0.0000 738 1.0000 0.0000
treatment_group 6857 0.3270 0.4691 4615 0.0000 0.0000 745 1.0000 0.0000 759 1.0000 0.0000 738 1.0000 0.0000
treatment_sms 5374 0.1412 0.3483 4615 0.0000 0.0000 0
759 1.0000 0.0000 0
treatment_game 5360 0.1390 0.3460 4615 0.0000 0.0000 745 1.0000 0.0000 0
0
treatment_video 5353 0.1379 0.3448 4615 0.0000 0.0000 0
0
738 1.0000 0.0000
score 584 2.5599 1.5001 0
584 2.5599 1.5001 0
0
video_1 738 0.9593 0.1976 0
0
0
738 0.9593 0.1976
video_2 708 0.9576 0.2016 0
0
0
708 0.9576 0.2016
video_3 678 0.9794 0.1423 0
0
0
678 0.9794 0.1423
video_4 664 0.9849 0.1219 0
0
0
664 0.9849 0.1219
video_5 654 0.9908 0.0954 0
0
0
654 0.9908 0.0954

7.2 Categorical Variables

This table reports the distribution of observations in each category for categorical variables. Note that individuals without an arm assignment are not included in these tables.

#function to calculate count and shares of categorical variables
cat_summary <- function(variable){
  
  var <- sym(variable)
  
  df_sim %>%
    filter(treatment != "no treatment") %>%
    select(!!var) %>%
    group_by(!!var) %>%
    count(name = "Count") %>%
    ungroup() %>%
    mutate(Share = round(Count/sum(Count), 2))%>%
    kable() %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
  
}

7.2.1 Arm

The arm variable denotes participants’ treatment assignment.

cat_summary("treatment")
treatment Count Share
control 4615 0.67
game 745 0.11
sms 759 0.11
video 738 0.11

7.2.2 Family Friend Check

Answer to the question: ❓How about you? If a family member or a friend posts something online, do you stop to check whether it’s misinformation?

A. I always trust my family and friends!

B. I always check whether something is misinformation.

cat_summary("family_friend_check")
family_friend_check Count Share
a 297 0.04
b 329 0.05
NA 6231 0.91

7.2.3 Fear Quiz

Answer to the question: ❓What motive might someone have for wanting you to believe the ingredients of another product are safer?

A. To sell their product.

B. Because they genuinely think the other product is dangerous.

C. All of the above.

cat_summary("fear_quiz")
fear_quiz Count Share
a 286 0.04
b 289 0.04
NA 6282 0.92

7.2.4 Anger Quiz

Answer to the question: Would you share this [anger inducing ad] on your social media feed?

A. Yes

B. No

cat_summary("anger_quiz")
anger_quiz Count Share
a 264 0.04
b 240 0.04
NA 6353 0.93

7.2.5 Per Unfollow

Answer to the question: What percentage of people do you think have unfollowed someone who shared misinformation?

10%

25%

50% or more

cat_summary("per_unfollow")
per_unfollow Count Share
a 2436 0.36
b 2396 0.35
NA 2025 0.30

7.3 Visualization

7.3.1 Demographics

demog_figures <- function(variable){
   
  variable <- enquo(variable)

  
  df_plot <- df_sim %>%
    select(treatment, !!variable) %>%
    filter(!is.na(treatment) & !is.na(!!variable)) %>%
    group_by(treatment, !!variable) %>%
    count()%>%
    group_by(treatment) %>%
    mutate(percent = paste(round(n/sum(n) * 100, 0), "%", sep = "")) %>%
    ungroup() %>%
    mutate(percent = case_when(!duplicated(!!variable) ~ 
       paste(percent, "\n", str_to_title(!!variable), sep = "")
      ,  
      TRUE ~ percent))
  
 x <- ggplot(df_plot, 
         aes(x = treatment, 
             y = n, 
             fill = !!variable)) +
    geom_bar(position = "fill", stat ="identity", width = .7, color = "white") +
    theme_sarah() +
   scale_x_discrete(labels = label_fun )+
    scale_fill_manual(values = c(cb_colors)) + 
    geom_text(aes( label = percent), 
              color = "white", 
              position = position_fill(vjust = 0.5), 
              size = 12/.pt, 
              fontface = "bold") + 
    labs(x = "", 
         y ="", 
         title = paste("Percentage of Participants by\n Treatment and", str_to_title(str_replace(str_remove(as_label(variable), "demog_"), "_", " ")), sep = " "), 
         caption = paste("Number of Observations:", scales::comma(sum(df_plot$n)))) +
    scale_y_continuous(breaks = c(0, 1), labels = c("0%", "100%"))  + 
    theme(legend.position = "none")
 
  print(x)
}

The next figures show the share of different demographic groups by treatment. Note that the following figures only include observations where the treatment and demographic variables are both non-missing.

demog_vars <- c("fb_demog_gender", "fb_demog_age_group", 
                "fb_demog_race", "fb_demog_education", "fb_demog_income")

for(i in demog_vars){
    
    cat("####", str_to_title(str_replace(str_remove(i, "demog_"), "_", " ")), "\n\n\n")
  
    demog_figures(!!sym(paste(i, sep = "")))
    
    cat("\n\n\n")
  }

7.3.1.1 Fb Gender

7.3.1.2 Fb Age_group

7.3.1.3 Fb Race

7.3.1.4 Fb Education

7.3.1.5 Fb Income

7.3.2 Ads Demographics

Facebook Ads Manager provides cost, impression, and conversation counts at the ad-age-gender level. Below, we show the cost per impression and cost per conversation by age or gender and ad or ad theme. Note that here, we use the Facebook Ads definition of conversations.

# figure is a function of the variable measure (impressions or conversations)
# group (ad theme group or individual ads)
# demographic variable (age or gender)

plot_cost_per <- function(variable, group, demo){
  
  # choose a group to sort bars descending by
  # plot colors
  # filter out unknown gender group since it is a small size
  if(demo == "age"){
    
    label_group <-"65+"
    plot_colors <- cb_colors[3:6]
    
  } else if(demo == "gender"){
    
    label_group <-"female"
    plot_colors <- cb_colors[1:2]
    ads_demo <- ads_demo %>%
      filter(gender != "unknown")
    
  }

  # set plot title and x axis label
  plot_title <- paste("Cost per", str_to_title(str_remove(variable, "s$")), "by", str_to_title(demo), sep =" ")
  x_title <- paste("Cost per", str_to_title(str_remove(variable, "s$")), sep = " ")
  
  group <- enquo(group)
  variable <- sym(variable)
  demo <- sym(demo)

  # calculate cost per measure by group
  plot_data <- ads_demo  %>%
    select(!!group, !!demo, !!variable, cost) %>%
    group_by(!!group, !!demo) %>%
    summarize_all(sum) %>%
    mutate(cost_var = cost/  !!variable, 
          cost_var = ifelse(is.infinite(cost_var), 0, cost_var)
    ) %>%
    group_by(!!demo) %>%
    # order for plotting
    mutate(ordervar = ifelse(!!demo == label_group, rank(cost_var), 10)) %>%
    ungroup() 
  
  # this finds which ad or ad theme has the max cost per measure for female or 65+. 
  # this is where the demographic group labels are plotted
  labels <- plot_data %>%
    filter(!!demo == label_group) %>%
    filter(cost_var ==max(cost_var)) %>%
    pull(!!group)
  
  # create labels for demographic groups
  plot_data <- plot_data %>%
    mutate(demo_label = ifelse(!!group== labels, paste(str_to_title(!!demo), " "), ""))
 
  # plot bars        
  ggplot(plot_data, 
         aes(y = reorder(!!group, ordervar),
             x = cost_var, 
             group = !!demo, 
             fill = !!demo)) + 
    geom_bar(stat = "identity", 
           position = "dodge", 
           color = "white") +
    theme_sarah() +
    theme(legend.position = "none", 
          panel.grid.major.y = element_blank()) +
    labs(y = "", 
     x = x_title, 
     title = plot_title) +
    # set colors
    scale_fill_manual(values = plot_colors) +
    # this plots bar demographic labels
    geom_text(aes(label = demo_label),
              position = position_dodge(0.9),
              color="white",
              vjust = 0.5,
              hjust = 1)
}

# only thing to add after running function is 
# the scale limits and breaks, which is done manually
# since its different for each combination

7.3.2.1 Cost per Impression

7.3.2.1.1 Gender-Ad

Note that Facebook Ads also has a gender category called unknown. Since the size of this category is so small, we do no included these observations below.

plot_cost_per("impressions", ad_name, "gender") +
scale_x_continuous(limits = c(0, 0.065), 
                   expand = c(0,0), 
                   breaks = seq(0, 0.06, 0.01))

7.3.2.1.2 Age-Ad
plot_cost_per("impressions", ad_name, "age") +
  scale_x_continuous(limits = c(0, 0.056), 
                     expand = c(0,0), 
                     breaks = seq(0, 0.055, 0.01)) 

7.3.2.2 Cost per Conversation

7.3.2.2.1 Gender-Ad

Note that Facebook Ads also has a gender category called unknown. Since the size of this category is so small, we do no included these observations below. We calculate cost per conversation = cost / #conversations. Missing bars indicates that we could not calculate cost per conversation because there were zero conversations for that subgroup.

plot_cost_per("conversations", ad_name, "gender") +
  scale_x_continuous(limits = c(0, 65), 
                     expand = c(0,0), 
                     breaks = seq(0, 60, 10)) 

7.3.2.2.2 Age-Ad

We calculate cost per conversation = cost / #conversations. Missing bars indicates that we could not calculate cost per conversation because there were zero conversations for that subgroup.

plot_cost_per("conversations", ad_name, "age") +
  scale_x_continuous(limits = c(0, 65),
                     expand = c(0,0),
                     breaks = seq(0, 65, 10)) 

8 Funnel Statistics

8.1 Table 1: Summary of Results

This table presents an overview of funnel statistics at the ad-, chatbot-, and treatment-level.

8.1.1 Ad-level

All responses, regardless of treatment-level, are included. Note that here, we use the Facebook Ads definition of conversations.

ads %>%
  mutate(cost_per_impression = form_cost(cost/impressions),
         cost_per_consent = form_cost(cost/consents), 
         cost_per_click = form_cost(cost/clicks),
         cost_per_completion = form_cost(cost/completions), 
         impressions = form_num(impressions), 
         reach = form_num(reach),
         clicks = form_num(clicks)) %>%
  select("Ad" = ad_name, 
         "Impressions" = impressions,
         "Reach" = reach,
         "Clicks" = clicks,  
         "Conversations" = conversations, 
         "Consents" = consents,
         "Completions" = completions, 
         "Cost per Impression" = cost_per_impression, 
         "Cost per Click" = cost_per_click,
         "Cost per Consent" = cost_per_consent, 
         "Cost per Completion" = cost_per_completion) %>%
    arrange(Ad) %>%
    kable(escape = F) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Ad Impressions Reach Clicks Conversations Consents Completions Cost per Impression Cost per Click Cost per Consent Cost per Completion
attention 1 4,274 3,499 231 9 7 3 $0.037 $0.677 $22.353 $52.157
fake 6,185 4,989 305 13 3 2 $0.029 $0.595 $60.443 $90.665
fake news 2 12,629 9,490 711 25 11 4 $0.043 $0.767 $49.576 $136.335
help friends 1,148 960 97 1 0 0 $0.025 $0.301
join 18,192 14,110 1,099 65 25 7 $0.042 $0.697 $30.644 $109.443
No Ad ID
5 4 2
outrage 1 2,954 2,284 254 4 2 1 $0.019 $0.222 $28.17 $56.34
protect 2,778 2,203 160 1 0 0 $0.03 $0.522
real 1 3,733 3,123 108 15 6 1 $0.027 $0.939 $16.908 $101.45
real 2 1,983 1,442 105 5 2 1 $0.04 $0.765 $40.155 $80.31

8.1.2 Treatment-level

All users who consented are included.

Cost per Consent = total experiment costs / consents. Cost per Completion = total experiment costs / completions.

total_cost <- sum(ads$cost, na.rm  =T)
# treatment level
df_sim %>%
  select(treatment, consent, full_complete) %>%
  filter(!is.na(treatment)) %>%
  group_by(treatment) %>%
  summarize(consents = sum(consent, na.rm  = T),
            completions = sum(full_complete, na.rm  = T)) %>%
  mutate(cost_per_consent = form_cost(total_cost/consents), 
         cost_per_completion = form_cost(total_cost/completions), 
         treatment = label_fun(treatment)) %>%
  # all treatments
  rbind(df_sim %>%
          filter(!is.na(treatment)) %>%
          select( consent, full_complete) %>%
          summarize(consents = sum(consent, na.rm  = T),
                    completions = sum(full_complete, na.rm  = T)) %>%
          mutate(cost_per_consent = form_cost(total_cost/consents), 
                 cost_per_completion = form_cost(total_cost/completions), 
                 treatment = "All")) %>%
  # rename column names to titles
  select("Treatment" = treatment,
         "Consents" = consents,
         "Completions" = completions, 
         "Cost per Consent" = cost_per_consent, 
         "Cost per Completion" = cost_per_completion) %>%
  kable(escape = F) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Treatment Consents Completions Cost per Consent Cost per Completion
Control 4615 1701 $0.433 $1.176
Game 745 209 $2.685 $9.569
SMS 759 201 $2.635 $9.95
Video 738 240 $2.71 $8.333
All 6857 2351 $0.292 $0.851

8.2 Table 2: Overall Funnel

This table shows the detailed funnel by all, treatment, and control participants. Here, “All” refers to all participants from the stage of ad impressions, including those who did not start a conversation or did not consent to participate in the study (these users were not assigned to an arm). “SMS participants” refer to consenting users who are assigned to the SMS intervention arm, and “Control participants” refer to consenting users who are assigned to the control arm.

Funnel Parameters

  1. Reach (%) = #reach / #impressions
  2. Ad clickthrough (%) = #clicks / #reach
  3. Conversation started (%) = #conversations / #clicks
  4. Consent obtained (%) = #consents / #conversations
  5. Treatment completed (%) = #forking section completed / # consents
  6. Outcome completed (%) = #outcome completed / #forking section completed
  7. Feedback completed (%) = #feedback section completed / #outcome completed
  8. Demographics Completed (%) = #demographic section completed / #feedback section completed
  9. Full survey completed (%) = #full survey completed / #demographic section completed
# funnel for overall and by add
get_funnel <- function(group, ad = NULL){
  
  # filter data for ad level analysis
  if(!is.null(ad)){
    df_temp <- df_sim[df_sim$ad_name == ad & !is.na(df_sim$ad_name), ]
    ads_temp <- ads[ads$ad_name == ad & !is.na(ads$ad_name), ]
  } else{
    df_temp <- df_sim
    ads_temp <- ads
  }
  
  # filter data for treatment level analysis
  if (group == "all"){
    df_temp <- df_temp
  } else {
     df_temp <- df_temp[df_temp$treatment == group & !is.na(df_temp$treatment), ]
  }
  
  # calculate total costs, impressions, clicks, from ads
  cost <- sum(ads_temp$cost, na.rm = T)
  impressions <- sum(ads_temp$impressions, na.rm = T)
  reach <-  sum(ads_temp$reach, na.rm = T)
  clicks <- sum(ads_temp$clicks, na.rm = T)
  
  # get rest of metrics from chatfuel
  conversations <- nrow(df_temp)
  consents <- sum(df_temp$consent, na.rm = T)
  treatment_complete <- sum(df_temp$treatment_complete) 
  feedback_complete <- sum(df_temp$feedback_complete)
  demog_complete <- sum(df_temp$demographics_complete)
  full_complete <- sum(df_temp$full_complete)
  
  # metric names
  metric = c(
    "Impressions", 
    "Reach",
    "Ad Clickthrough",
    "Conversation Started",
    "Consent Obtained", 
    "Treatment Completed",
    "Feedback Complete", 
    "Demographics Complete", 
    "Full Complete")
  
  cost_fun <- function(x){
    return(form_cost(cost/x))
  }
  
  metrics_list <- c(impressions, reach, clicks, conversations, consents, treatment_complete
         , feedback_complete, 
         demog_complete, full_complete)
  
  # cost of each metric
  costs <- sapply(metrics_list, cost_fun)
  
  # counts formatted with comma  
  counts <- sapply(metrics_list, scales::comma)

  
  percent_se <- function(numerator, denominator){
    p <- numerator/denominator
    se <- sqrt(p*(1-p)/denominator)
    p_se <- paste(form_percent(p), " (", form_percent(se), ")", sep = "")
    
    return(p_se)
  }
    
  # percent of previous
  percent <- mapply(function(x, y) percent_se(x, y), 
                    metrics_list[-1], 
                    metrics_list[-length(metrics_list)]
                    )
  percent <- c("-", percent)

 
  
  percent_of_impressions <- sapply(metrics_list, function(x) percent_se(x, impressions))
  percent_of_impressions[1] <- "-"

  
  # percent of consents
  percent_of_consents = sapply(metrics_list, function(x) percent_se(x, consents))
  percent_of_consents[1:4] <- "-"
  
     if(group == "all" & is.null(ad)){
          dropoff_1 <- data.frame(cbind(metric, counts, percent, percent_of_impressions, costs))
     } else {
       dropoff_1 <- data.frame(cbind(metric, counts, percent, percent_of_consents))
     }
       
  if(!is.null(ad)){
    if(ad == "No Ad ID"){
      dropoff_1[dropoff_1$metric == "Impressions", c("counts", "percent", "percent_of_consents") ] <- "-"
      dropoff_1[dropoff_1$metric == "Reach", c("counts", "percent", "percent_of_consents") ] <- "-"
      dropoff_1[dropoff_1$metric == "Ad Clickthrough", c("counts", "percent", "percent_of_consents")] <- "-"
      dropoff_1[dropoff_1$metric == "Conversation Started", "percent"] <- "-"
    }
  }
    
    if(group == "all" & is.null(ad)){

      
      colnames(dropoff_1) <- c("Metric", "Number of Obs.", 
                           "Percent of Previous", "Percent of Impressions", "Cost per Stage")
      
    } else {
      
       dropoff_1[dropoff_1$metric %in% c("Impressions", "Reach", "Ad Clickthrough", "Conversation Started"), c("counts", "percent")] <- "-"

     dropoff_1[dropoff_1$metric == "Consent Obtained", c("percent", "percent_of_consents")] <- "-"
     
     colnames(dropoff_1) <- c("Metric", "Number of Obs.", 
                           "Percent of Previous", "Percent of Consents")
      
      }
    
  
    print(kable(dropoff_1) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")))
    
}

8.2.1 All Participants

get_funnel(group = "all")
Metric Number of Obs. Percent of Previous Percent of Impressions Cost per Stage
Impressions 53,876
$0.037
Reach 42,100 78.14% (0.18%) 78.14% (0.18%) $0.048
Ad Clickthrough 3,070 7.29% (0.13%) 5.7% (0.1%) $0.651
Conversation Started 10,000 325.73% (-) 18.56% (0.17%) $0.2
Consent Obtained 6,999 69.99% (0.46%) 12.99% (0.14%) $0.286
Treatment Completed 6,351 90.74% (0.35%) 11.79% (0.14%) $0.315
Feedback Complete 4,360 68.65% (0.58%) 8.09% (0.12%) $0.459
Demographics Complete 2,444 56.06% (0.75%) 4.54% (0.09%) $0.818
Full Complete 2,351 96.19% (0.39%) 4.36% (0.09%) $0.851

8.2.2 Game Participants

get_funnel("game")
Metric Number of Obs. Percent of Previous Percent of Consents
Impressions
Reach
Ad Clickthrough
Conversation Started
Consent Obtained 745
Treatment Completed 584 78.39% (1.51%) 78.39% (1.51%)
Feedback Complete 392 67.12% (1.94%) 52.62% (1.83%)
Demographics Complete 221 56.38% (2.5%) 29.66% (1.67%)
Full Complete 209 94.57% (1.52%) 28.05% (1.65%)

8.2.3 SMS Participants

get_funnel("sms")
Metric Number of Obs. Percent of Previous Percent of Consents
Impressions
Reach
Ad Clickthrough
Conversation Started
Consent Obtained 759
Treatment Completed 504 66.4% (1.71%) 66.4% (1.71%)
Feedback Complete 351 69.64% (2.05%) 46.25% (1.81%)
Demographics Complete 203 57.83% (2.64%) 26.75% (1.61%)
Full Complete 201 99.01% (0.69%) 26.48% (1.6%)

8.2.4 Video Participants

get_funnel("video")
Metric Number of Obs. Percent of Previous Percent of Consents
Impressions
Reach
Ad Clickthrough
Conversation Started
Consent Obtained 738
Treatment Completed 648 87.8% (1.2%) 87.8% (1.2%)
Feedback Complete 446 68.83% (1.82%) 60.43% (1.8%)
Demographics Complete 246 55.16% (2.35%) 33.33% (1.74%)
Full Complete 240 97.56% (0.98%) 32.52% (1.72%)

8.2.5 Control Participants

get_funnel("control")
Metric Number of Obs. Percent of Previous Percent of Consents
Impressions
Reach
Ad Clickthrough
Conversation Started
Consent Obtained 4,615
Treatment Completed 4,615 100% (0%) 100% (0%)
Feedback Complete 3,171 68.71% (0.68%) 68.71% (0.68%)
Demographics Complete 1,774 55.94% (0.88%) 38.44% (0.72%)
Full Complete 1,701 95.89% (0.47%) 36.86% (0.71%)

8.3 Table 3: Ad Split

This table shows the detailed funnel by ad split for all, treatment, and control participants. Here, “All” refers to all participants from the stage of ad impressions, including those who did not start a conversation or did not consent to participate in the study (these users were not assigned to an arm).

# merge ad data
df_sim <- merge(df_sim, ads, by. ="ad", by.y =  "ad_name", all.x = T)

# loop over ads
for(i in sort(unique(df_sim$ad_name))) {
  cat("###",  "Ad", i, "{.tabset} \n")
  # loop over treatmnet
  for(j in c("all", "sms", "control")){
    
    cat("####", paste(label_fun(j), "Participants", sep = " "), " \n")
    print(get_funnel(ad = i, group = j))

    cat("  \n")
  }
  cat("  \n")
}

8.4 Treatment Questionwise Funnel

# function to compute completion for detailed funnel
detailed_funnel_fun <- function(x){
  # if any missing, count non-missing observations
  if(any(is.na(x))){
    sum(!is.na(x))
    # is there are no non-missings, 
    # and the variable is numeric, sum over the 
    # variable (these are binary),
    # else, sum non-missing
  } else {
    if(is.numeric(x)){
      sum(x)
    } else {
      sum(!is.na(x))
    }
  }
}

# loop over treatments
for(treat in c( "sms",  "control", "game", "video")){
   cat("###", paste(label_fun(treat), "{.tabset}", sep = " "), " \n")

    
    plot_df <-  df_sim %>%
      filter(  treatment == treat) %>%
      # select variables from detailed funnel correpsonding to the treatment-chatbot combo
      select(any_of(detailed_funnel[detailed_funnel$treatment == treat, "variable"] %>%pull())) %>%
      summarize_all(.funs =detailed_funnel_fun) %>%
      pivot_longer(cols = everything(), values_to = "count") %>%
      mutate(perc_of_consents = count/pull(.[.$name == "consent", "count"]))
  
    # get number of consents for plot title
    consents <- plot_df %>% filter(name == "consent") %>% pull(count)
  
    # figure
    gg <- plot_df %>%
      # order variables for plotting
      mutate(name = factor(name, levels = rev(name))) %>%
      # group = 1 adds the line through the dots
      ggplot(aes(x = name, y = perc_of_consents, group = 1)) + 
      geom_point(size = 3) +
      geom_line() +
      coord_flip() +
      theme_sarah() +
      labs(x= "", 
           y = "Percent of Conversations", 
           title = paste("Number of Consents:", consents)) +
      scale_y_continuous(limits = c(0, 1), 
                         breaks = seq(0, 1, by = 0.2),
                         labels = scales::percent_format(accuracy = 1)) 
    # table output
    plot_df %>%
      mutate(perc_of_consents = form_percent(perc_of_consents)) %>%
      kable(col.names = c("Variable", "Number. of Obs", "Percent of Consents")) %>%
      kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
      print()
    
    # plot output
    print(gg)
    

  cat("  \n\n\n")
}

8.4.1 SMS

Variable Number. of Obs Percent of Consents
consent 759 100%
intro_go_on 759 100%
sms_quiz 685 90.25%
family_friend_check 626 82.48%
fear_quiz 575 75.76%
anger_quiz 504 66.4%
headline1_answer 462 60.87%
headline2_answer 432 56.92%
per_unfollow 395 52.04%
enjoy_most 351 46.25%
enjoy_least 337 44.4%
demog_age 350 46.11%
demog_gender 336 44.27%
demog_ethnic 303 39.92%
demog_income 274 36.1%
demog_education 245 32.28%
demog_religion 220 28.99%
demog_political 203 26.75%
amazon_credit 201 26.48%

8.4.2 Control

Variable Number. of Obs Percent of Consents
consent 4615 100%
intro_go_on 4615 100%
headline1_answer 4134 89.58%
headline2_answer 3915 84.83%
per_unfollow 3493 75.69%
enjoy_most 3171 68.71%
enjoy_least 3020 65.44%
demog_age 3154 68.34%
demog_gender 3004 65.09%
demog_ethnic 2744 59.46%
demog_income 2471 53.54%
demog_education 2219 48.08%
demog_religion 1965 42.58%
demog_political 1774 38.44%
amazon_credit 1701 36.86%

8.4.3 Game

Variable Number. of Obs Percent of Consents
consent 745 100%
intro_go_on 745 100%
game_completed 599 80.4%
score 584 78.39%
headline1_answer 523 70.2%
headline2_answer 494 66.31%
per_unfollow 447 60%
enjoy_most 392 52.62%
enjoy_least 381 51.14%
demog_age 389 52.21%
demog_gender 375 50.34%
demog_ethnic 340 45.64%
demog_income 293 39.33%
demog_education 273 36.64%
demog_religion 245 32.89%
demog_political 221 29.66%
amazon_credit 209 28.05%

8.4.4 Video

Variable Number. of Obs Percent of Consents
consent 738 100%
intro_go_on 738 100%
video_1 708 95.93%
video_2 708 95.93%
video_3 678 91.87%
video_4 664 89.97%
video_5 654 88.62%
headline1_answer 589 79.81%
headline2_answer 556 75.34%
per_unfollow 497 67.34%
enjoy_most 446 60.43%
enjoy_least 422 57.18%
demog_age 440 59.62%
demog_gender 420 56.91%
demog_ethnic 379 51.36%
demog_income 339 45.93%
demog_education 314 42.55%
demog_religion 273 36.99%
demog_political 246 33.33%
amazon_credit 240 32.52%

9 Treatment Analysis

estimate_effect <- function(df, outcome, outcome_name){
  
  
  # get mean and std. errors by treatment for plotting
  eq <- as.formula(paste(outcome, " ~ 0+ treatment"))
  lm <- lm_robust(eq, data = df)
  
  # make df out of lm results
  plot <- tidy(lm) 
  
  gg <- ggplot(plot, aes(x = term, y = estimate))  + 
    geom_point(position = position_dodge(width = 0.5), size = 3) +
    geom_pointrange(
      aes(x = term,ymin = estimate - 1.96 * std.error,
          ymax = estimate + 1.96 * std.error),
      width = .1,
      position = position_dodge(width = 0.5)) + 
    # function to clean label names
    scale_x_discrete(labels = function(x){
      label <- str_remove(x, "treatment")
      label <- label_fun(label)
      return(label)
      }) + 
    labs(x = 'Treatment', 
         y = 'Estimate', 
         title = paste0('Treatment Effect Estimate on ', outcome_name)) +
    theme_sarah() +
    scale_y_continuous(limits = c(0, 1), 
                       breaks = seq(0, 1, 0.2))


  
  print(gg)
    
  # get difference between treatments estimates
  eq <- as.formula(paste(outcome, " ~  treatment"))
  lm2 <- lm_robust(eq, data = df)
  
  out <- tidy(lm2)

  video_n = sum(df$treatment == "video", na.rm = T)
  game_n = sum(df$treatment == "game", na.rm = T)
  sms_n = sum(df$treatment == "sms", na.rm = T)
  control_n = sum(df$treatment == "control", na.rm = T)

  plot %>%
    select(term, estimate_value  = estimate, std.error_value = std.error) %>%
    mutate(estimate_std_value = paste(round_4(estimate_value), "\n(", round_4(std.error_value), ")", sep = "")) %>%
    select(!estimate_value & !std.error_value) %>%
    merge(out[2:4, c("term", "estimate", 
                     "std.error", "p.value", 
                     "conf.low", "conf.high")], by = "term", all.x = T) %>%
    mutate(estimate_std_diff = paste(round_4(estimate), "\n(", round_4(std.error), ")", sep = "")) %>%
    select(!estimate & !std.error) %>%
    mutate(conf = paste("(", round_4(conf.low), ", ", round_4(conf.high), ")", sep = ""), 
           n =c(control_n, game_n, sms_n, video_n)) %>%
    select(!conf.low & !conf.high) %>%
    mutate(term = str_remove(term, "treatment"), 
           term = factor(term, c("game", "sms", "video", "control"))) %>%
    arrange(term) %>%
    mutate(p.value = ifelse(term == "control", "-", p.value), 
           estimate_std_diff = ifelse(term == "control", "-", estimate_std_diff), 
           conf = ifelse(term == "control", "-", conf)) %>%
    select("Treatment"= term,
           "Estimate" = estimate_std_value, 
           "N" = n, 
           "Difference from Control" = estimate_std_diff, 
           "95% Conf. Int." = conf, 
           "p-value" = p.value) %>%
  kable(row.names = F, escape = T) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  print()
}  

9.1 Outcome 1: Outcome Binary

estimate_effect(df_sim, "fb_outcome_binary", "Outcome Binary")  
Treatment Estimate N Difference from Control 95% Conf. Int. p-value
game 0.6148 (0.0178) 745 0.0093 (0.0192) (-0.0284, 0.0471) 0.627041234059019
sms 0.6074 (0.0177) 759 0.0020 (0.0191) (-0.0356, 0.0395) 0.918401926499294
video 0.5664 (0.0183) 738 -0.0390 (0.0196) (-0.0775, -0.0006) 0.0467737948032825
control 0.6054 (0.0072) 4615

9.2 Outcome 2: Outcome Continuous

estimate_effect(df_sim, "fb_outcome_continuous", "Outcome Continuous")  
Treatment Estimate N Difference from Control 95% Conf. Int. p-value
game 0.4971 (0.0105) 745 -0.0036 (0.0113) (-0.0258, 0.0186) 0.749572344614976
sms 0.5120 (0.0103) 759 0.0112 (0.0112) (-0.0107, 0.0331) 0.314803204222693
video 0.5003 (0.0105) 738 -0.0004 (0.0113) (-0.0225, 0.0217) 0.969258083314189
control 0.5007 (0.0042) 4615

10 Subgroup Analysis

10.1 Pre-defined

Age Group : 18-24, 25-34, 35-44, 45-54, 55-64, 65+

Education: High school or less; Some college; Bachelor’s degree; Graduate degree

Gender: Man, Woman, Non-binary

Political: Republic, Independent, Democrat

  # get mean and std. errors by treatment for plotting
tests <- combn(sort(unique(df_sim$treatment)), 2, simplify = F)

covariates <-  c("fb_demog_gender_man",  "fb_demog_age")

outcomes <- c("fb_outcome_binary", "fb_outcome_continuous")

subgroup_analysis <- function(covariates, outcomes, tests){

  outlist <- list()
  for(t in 1:length(tests)){
    
    for (cov in covariates){
      
      for(outcome in outcomes){
  
        test = tests[[t]]
        
        df_test <- df_sim[df_sim$treatment %in% test, c("treatment", cov, outcome)]
        form <- as.formula(paste(outcome, " ~ 0 +", " treatment * ", cov))
        model <- tidy(lm_robust(form, data = df_test))
          
        means <- model[1:2, "estimate"]
        names(means) <- test
        
        outlist[[paste(t,cov,outcome, sep = "")]] <- cbind(t(as.matrix(means)), model[3, c("outcome", "estimate", "std.error", "conf.low", "conf.high", "p.value")], nobs = nrow(df_test), covariate = cov)
      }
    }
  }
  
  final <- do.call("smartbind", outlist)
  
  rownames(final) <- NULL
  
  final_out <- final[, c("outcome", "covariate", "control", "game", "sms", "video", "estimate", "std.error", "conf.low", "conf.high", "p.value", "nobs")] %>%
    arrange(outcome, covariate)
  
  return(final_out)
  
}
    options(knitr.kable.NA = '')

subgroup <- subgroup_analysis(covariates, outcomes, tests) 

subgroup%>%
  kable(digits = 4)%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
outcome covariate control game sms video estimate std.error conf.low conf.high p.value nobs
fb_outcome_binary fb_demog_age 0.6042 0.6363 0.0000 0.0003 -0.0006 0.0006 0.9444 5360
fb_outcome_binary fb_demog_age 0.6042 0.6168 0.0000 0.0003 -0.0006 0.0006 0.9444 5374
fb_outcome_binary fb_demog_age 0.6042 0.6390 0.0000 0.0003 -0.0006 0.0006 0.9444 5353
fb_outcome_binary fb_demog_age 0.6363 0.6168 -0.0004 0.0007 -0.0018 0.0011 0.6132 1504
fb_outcome_binary fb_demog_age 0.6363 0.6390 -0.0004 0.0007 -0.0018 0.0011 0.6132 1483
fb_outcome_binary fb_demog_age 0.6168 0.6390 -0.0002 0.0007 -0.0016 0.0013 0.8216 1497
fb_outcome_binary fb_demog_gender_man 0.6111 0.6070 -0.0132 0.0145 -0.0417 0.0153 0.3640 5360
fb_outcome_binary fb_demog_gender_man 0.6111 0.6048 -0.0132 0.0145 -0.0417 0.0153 0.3640 5374
fb_outcome_binary fb_demog_gender_man 0.6111 0.5728 -0.0132 0.0145 -0.0417 0.0153 0.3640 5353
fb_outcome_binary fb_demog_gender_man 0.6070 0.6048 0.0202 0.0366 -0.0516 0.0920 0.5813 1504
fb_outcome_binary fb_demog_gender_man 0.6070 0.5728 0.0202 0.0366 -0.0516 0.0920 0.5813 1483
fb_outcome_binary fb_demog_gender_man 0.6048 0.5728 0.0059 0.0357 -0.0642 0.0759 0.8697 1497
fb_outcome_continuous fb_demog_age 0.5114 0.5101 -0.0002 0.0002 -0.0005 0.0002 0.3027 5360
fb_outcome_continuous fb_demog_age 0.5114 0.4743 -0.0002 0.0002 -0.0005 0.0002 0.3027 5374
fb_outcome_continuous fb_demog_age 0.5114 0.4967 -0.0002 0.0002 -0.0005 0.0002 0.3027 5353
fb_outcome_continuous fb_demog_age 0.5101 0.4743 -0.0002 0.0004 -0.0011 0.0006 0.6089 1504
fb_outcome_continuous fb_demog_age 0.5101 0.4967 -0.0002 0.0004 -0.0011 0.0006 0.6089 1483
fb_outcome_continuous fb_demog_age 0.4743 0.4967 0.0007 0.0004 -0.0002 0.0015 0.1310 1497
fb_outcome_continuous fb_demog_gender_man 0.4940 0.5073 0.0156 0.0085 -0.0011 0.0324 0.0674 5360
fb_outcome_continuous fb_demog_gender_man 0.4940 0.5069 0.0156 0.0085 -0.0011 0.0324 0.0674 5374
fb_outcome_continuous fb_demog_gender_man 0.4940 0.5133 0.0156 0.0085 -0.0011 0.0324 0.0674 5353
fb_outcome_continuous fb_demog_gender_man 0.5073 0.5069 -0.0264 0.0212 -0.0680 0.0153 0.2145 1504
fb_outcome_continuous fb_demog_gender_man 0.5073 0.5133 -0.0264 0.0212 -0.0680 0.0153 0.2145 1483
fb_outcome_continuous fb_demog_gender_man 0.5069 0.5133 0.0114 0.0207 -0.0293 0.0520 0.5832 1497

The following code prints the table/figures for gender. We can repeat it for other groups we want to look at.

df_test <- df_sim[df_sim$treatment %in% c("control", "game"), c("treatment", "fb_demog_gender_man", "fb_outcome_continuous")]
form <- as.formula(paste("fb_outcome_continuous", " ~ 0 +", " treatment * fb_demog_gender_man"))
model <- tidy(lm_robust(form, data = df_test))

df_sim %>%
  select(fb_demog_gender_man, treatment, fb_outcome_continuous) %>%
  filter(!is.na(treatment)) %>%
  group_by(treatment, fb_demog_gender_man) %>% 
  summarize_all(list(mean = mean, sd =sd, n = length)) %>%
  mutate(se = sd/sqrt(n))%>%
  ggplot(aes(group = treatment, color = as.factor(fb_demog_gender_man))) + 
  geom_segment(aes(x = mean -1.96 * se, xend = mean + 1.96 *se, y = fb_demog_gender_man, yend = fb_demog_gender_man)) + 
  geom_point(aes(x = mean, y = fb_demog_gender_man)) + 
  facet_grid(rows =vars(treatment), switch = "y") + 
  scale_y_continuous(breaks = c(0, 1), 
                     limits = c(-.25, 1.25)) + 
  theme_sarah() + 
  labs(y = "", x = "Continuous Outcome", title = "Continuous Outcome for Gender\nBy Treatment Group") + 
    theme(panel.grid.minor.y= element_blank(), 
          axis.text.y = element_blank(), 
          strip.text.y.left = element_text(angle = 0), 
          legend.position = "none")

subgroup %>%
  filter(outcome == "fb_outcome_continuous" & covariate == "fb_demog_gender_man") %>%
  kable(digits = 4)%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
outcome covariate control game sms video estimate std.error conf.low conf.high p.value nobs
fb_outcome_continuous fb_demog_gender_man 0.494 0.5073 0.0156 0.0085 -0.0011 0.0324 0.0674 5360
fb_outcome_continuous fb_demog_gender_man 0.494 0.5069 0.0156 0.0085 -0.0011 0.0324 0.0674 5374
fb_outcome_continuous fb_demog_gender_man 0.494 0.5133 0.0156 0.0085 -0.0011 0.0324 0.0674 5353
fb_outcome_continuous fb_demog_gender_man 0.5073 0.5069 -0.0264 0.0212 -0.0680 0.0153 0.2145 1504
fb_outcome_continuous fb_demog_gender_man 0.5073 0.5133 -0.0264 0.0212 -0.0680 0.0153 0.2145 1483
fb_outcome_continuous fb_demog_gender_man 0.5069 0.5133 0.0114 0.0207 -0.0293 0.0520 0.5832 1497

11 Appendix

11.1 Covariate Balance

# Make nice labels for plot
labels <- data.frame(covariate = c(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog_(gender|education|age_group)_.+")],
                                   "fb_demog_age"))
labels$covariate_label <- str_remove(labels$covariate, "fb_demog_")
labels$covariate_group <- str_to_title(str_extract(labels$covariate, "age$|age_group|education|gender"))

labels$covariate_label <- str_to_title(str_remove(labels$covariate_label, "(education|gender|age_group)_"))

labels$covariate_label <- factor(labels$covariate_label, levels = labels$covariate_label)

# standardized mean difference function
# https://statisticaloddsandends.wordpress.com/2021/10/31/standardized-mean-difference-smd-in-causal-inference/
# https://cran.r-project.org/web/packages/TOSTER/vignettes/SMD_calcs.html#:~:text=The%20SMD%20is%20then%20the,d%3D%CB%89xs
smd_fun <- function(mean_1, mean_0, var_1, var_0){
  
  smd <- (mean_1 - mean_0)/(sqrt((var_1 + var_0)/2))
  return(smd)
  
}

# standardized mean difference function by covaraite

smd_by_covariate <- function(data, groups, covs){

  smd_list <- list()
  
  # combinations of all covariate - groups to calculate means for
  combos <- expand.grid(covs, groups)
  
  for(i in 1:nrow(combos)){
    
    combo_i<- combos[i, ]
    
    cov_sym <- combo_i[[1]][[1]]
    cov <- enquo(cov_sym)
    
    group_sym <- combo_i[[2]][[1]]
    group <- enquo(group_sym)

    df_subset <- data %>%
      select(!!cov, !!group) %>%
      filter(!is.na(!!group)) %>%
      group_by(!!group) %>%
      summarize(mean = mean(!!cov), 
                var = var(!!cov)) 
    
    mean_1 = df_subset %>% filter(!!group == 1) %>% pull(mean) 
    mean_0 = df_subset %>% filter(!!group == 0) %>% pull(mean)
    var_1 = df_subset %>% filter(!!group == 1) %>% pull(var)
    var_0 = df_subset %>% filter(!!group == 0) %>% pull(var)
    
    smd_out <- smd_fun(mean_1, mean_0, var_1, var_0)
    
    smd_list[[i]] <- list(group = as_label(group), 
                          covariate = as_label(cov), 
                          smd = smd_out, 
                          mean_1 = mean_1, 
                          mean_0 = mean_0)

  }
  
  smd_list_out <-do.call("rbind.data.frame", smd_list)
  rownames(smd_list_out) <-NULL
  #return(smd_list)
  return(smd_list_out)
  
}



covs <- syms(c(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog_(gender|education|age_group)_.+")],
               "fb_demog_age"))




groups <- syms(c("full_complete", "treatment_group", "treatment_sms", "treatment_game","treatment_video"))


out <- smd_by_covariate(data = df_sim, groups = groups, covs = covs) %>%
  merge(labels, by = "covariate", all.x = T)

cat(paste("### By Completion"))

11.1.1 By Completion

out[out$group == "full_complete", ]  %>%
  ggplot(aes(x = smd, y = covariate_label))  +
  geom_vline(xintercept = 0.1, linetype = "dashed") + 
  geom_vline(xintercept = -0.1, linetype = "dashed") + 
  geom_vline(xintercept = 0) + 
  geom_point(size =3) + 
  labs(y = "", x = "Standardize Mean Difference", 
       title = "Difference in Samples by Survey Completion",
       subtitle ="Mean Completers - Mean Non-completers") + 
  theme_sarah()

cat("\n\n\n")
cat(paste("### By Treatment Group" ))

11.1.2 By Treatment Group

out[out$group == "treatment_group", ]  %>%
  ggplot(aes(x = smd, y = covariate_label))  + 
  geom_vline(xintercept = 0.1, linetype = "dashed") + 
  geom_vline(xintercept = -0.1, linetype = "dashed") + 
  geom_vline(xintercept = 0) + 
  geom_point(size =3) + 
  labs(y = "", x = "Standardize Mean Difference", 
       title = "Difference in Samples by Treatment Group",
       subtitle ="Mean Treated Group- Mean Control Group") + 
  theme_sarah()

cat("\n\n\n")
cat(paste("### By Treatment" ))

11.1.3 By Treatment

out[out$group %in% c("treatment_sms", "treatment_game", "treatment_video"), ]  %>%
  ggplot(aes(x = smd, y = covariate_label, color = group, shape = group))  + 
  geom_vline(xintercept = 0.1, linetype = "dashed") + 
  geom_vline(xintercept = -0.1, linetype = "dashed") + 
  geom_vline(xintercept = 0) + 
  geom_point(size =3) + 
  labs(y = "", x = "Standardize Mean Difference", 
       title = "Difference in Samples by Treatment",
       subtitle ="Mean Treatment - Mean Control") + 
  theme_sarah() 

cat("\n\n\n")

11.2 Distributions of Continuous Variables

I wrote two functions for distributions of continuous variables and factor variables at multiple treatment comparison levels.

Below I show them for continuous age and discrete gender, but we can apply these to pilot 2 variables as needed.

11.2.1 Age

continuous_plots <- function(variable, label){

  plots <- list()
  

  
  plots[["overall"]] <- ggplot(df_sim[!is.na(df_sim$treatment_group), ], aes(x = get(variable))) +
    geom_density(alpha= 0.1, fill = "black") +
    theme_sarah() + 
    scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) + 
    scale_y_continuous(limits = c(0, 0.035)) +   
    labs(x = label, y = "")
  
    plots[["treatment_group"]] <- ggplot(df_sim[!is.na(df_sim$treatment_group ),], aes(x =  get(variable), 
                                                         group = as.factor(treatment_group), 
                                                         color = as.factor(treatment_group),
                                                         fill = as.factor(treatment_group))) +
    geom_density(alpha= 0.1) +
    scale_fill_manual(values = cb_colors[c(7,9)], labels = c("Control", "Treatment")) +
    scale_color_manual(values = cb_colors[c(7, 9)], labels = c("Control", "Treatment")) +
    theme_sarah() + 
    scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) + 
    scale_y_continuous(limits = c(0, 0.035)) + 
    labs(x = label, y = "", fill = "Treatment Group", color  = "Treatment Group")
  

    plots[["treatment"]] <-ggplot(df_sim[!is.na(df_sim$treatment), ], aes(x =  get(variable), group = treatment, color = treatment, fill = treatment)) +
    geom_density(alpha= 0.1) + 
    scale_fill_manual(values = cb_colors[3:6]) +
    scale_color_manual(values = cb_colors[3:6]) +
    theme_sarah() + 
    scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) + 
    scale_y_continuous(limits = c(0, 0.035)) + 
    labs(x = label, 
         y = "", 
         fill = "Treatment", 
         color = "Treatment")
  
    return(plots)

}
cat("#### Overall")

11.2.1.1 Overall

continuous_plots("fb_demog_age", "Age")$overall

cat("\n\n\n")
cat("#### Treatment Group")

11.2.1.2 Treatment Group

continuous_plots("fb_demog_age", "Age")$treatment_group

cat("\n\n\n")
cat("#### Treatment")

11.2.1.3 Treatment

continuous_plots("fb_demog_age", "Age")$treatment

11.2.2 Gender

discrete_plots <- function(variable, label){

  plots <- list()
  

  
  plots[["overall"]] <- df_sim %>%
    filter(!is.na(variable)) %>%
    ggplot( 
       aes(x = get(variable), 
           fill = get(variable))) + 
  geom_bar(width = 0.75) + 
  labs(fill = label, y = "", x = label) + 
  theme_sarah() + 
  theme(legend.position = "none", 
        panel.grid.major.x = element_blank())



  plots[["treatment_group"]] <- df_sim %>%
  filter(!is.na(variable) & !is.na(treatment_group_label)) %>%
  group_by_at(c(variable, "treatment_group_label")) %>% 
  count() %>%
  group_by(treatment_group_label) %>%
  mutate(freq = n/sum(n))%>%
  ggplot(aes(x = treatment_group_label,
             y = freq,
           group = get(variable)))  + 
      geom_bar(aes(fill =treatment_group_label), stat = "identity") +
  facet_grid(~get(variable)) + 
  theme_sarah() + 
    scale_fill_manual(values = cb_colors[c(7,9)]) +
  scale_color_manual(values = cb_colors[c(7, 9)]) +
  theme(legend.position = "none", 
        panel.grid.major.x = element_blank()) + 
  labs(y = "", x= "") 
  

  
    plots[["treatment"]] <-df_sim %>%
      filter(!is.na(variable) & !is.na(treatment)) %>%
    group_by_at(c(variable, "treatment")) %>% 
    count() %>%
    group_by(treatment) %>%
    mutate(freq = n/sum(n))%>%
    ggplot(aes(x = treatment,
               y = freq,
             group = get(variable)))  + 
        geom_bar(aes(fill = treatment), stat = "identity") +
    facet_grid(~get(variable)) + 
    theme_sarah() + 
      scale_fill_manual(values = cb_colors[3:6]) +
    scale_color_manual(values = cb_colors[3:6]) +
    theme(legend.position = "none") + 
    labs(y = "", x= "")
    
    return(plots)
}
cat("#### Overall")

11.2.2.1 Overall

discrete_plots("fb_demog_gender", "Gender")$overall

cat("\n\n\n")
cat("#### Treatment Group")

11.2.2.2 Treatment Group

discrete_plots("fb_demog_gender", "Gender")$treatment_group

cat("\n\n\n")
cat("#### Treatment")

11.2.2.3 Treatment

discrete_plots("fb_demog_gender", "Gender")$treatment