# data dictionary 
data_dictionary <- read.csv("~/Documents/git/fb_misinfo_interventions/data/data_dictionary.csv")
# ads data by ad
ads <- read.csv("~/Documents/git/fb_misinfo_interventions/data/cleaned_ads.csv")
# ads data by age and gender
ads_demo <- read.csv("~/Documents/git/fb_misinfo_interventions/data/cleaned_ads_age_gender.csv")

# order of variables by chatbot-treatment combination

out <- list()
for(i in c("control", "game", "video", "sms")){
  out [[i]] <- read_sheet("https://docs.google.com/spreadsheets/d/1Kwjm6tEHycGuy_tnlRZURLb_rrGUoP8jpZNRcaeSyyc/edit#gid=0", sheet = i) 
}

detailed_funnel <- do.call("rbind", out)

# this has ggplot theme preset
source(paste0("https://raw.githubusercontent.com/scmcdonald/visualization/main/theme_sarah.R"))

# load function that defines factor levels so tables and figures have correct ordering
source("~/Documents/git/fb_misinfo_interventions/pilot2_analysis/factor_cols.R")


# define discrete colorblind palette
cb_colors <- brewer.pal(n = 8, name = "Dark2")
cb_colors_extra <- brewer.pal(n = 8, name = "Set2")
cb_colors <- c(cb_colors, cb_colors_extra)

### intro
df_sim <- data.frame(ad = sample(c("attention 1", "outrage 1", "real 1", "real 2", 
                "fake", "fake news 2", "help friends", "protect", "join"), 
                size = 10000, replace = T, prob = rep(1/9, 9)), 
                conversations_started = rep(1, 10000), 
                consent = sample(c(0, 1), size = 10000, replace = T, prob = c(0.3, 0.7))
                
                )


df_sim$intro_go_on <- ifelse(df_sim$consent == 0, NA, 
                                 sample(c(0, 1), 
                                        size = sum(df_sim$consent), 
                                        replace = T, prob = c(0.02, 0.98)))
 

df_sim$intro_complete  <- ifelse(df_sim$intro_go_on == 1 & !is.na(df_sim$intro_go_on ), 1, 0)
 

### treatment
df_sim$treatment <- ifelse(df_sim$intro_complete == 0 | is.na(df_sim$intro_complete), NA, 
                           
                           sample(c("control", "sms", "video", "game"), size = sum(df_sim$intro_complete, na.rm = T), replace = T, 
                                  prob = c(2/3, 1/3 * 1/3, 1/3 * 1/3,  1/3 * 1/3)))


df_sim$treatment_group <- ifelse(df_sim$treatment == "control", 0, 
                                 ifelse(is.na(df_sim$treatment), NA, 1))
df_sim$treatment_group_label <- ifelse(df_sim$treatment == "control", "control", 
                                 ifelse(is.na(df_sim$treatment), NA, "treatment"))

df_sim$treatment_sms <- ifelse(df_sim$treatment== "sms", 1, 
                               ifelse(df_sim$treatment== "control", 0, NA))

df_sim$treatment_game <- ifelse(df_sim$treatment== "game", 1, 
                               ifelse(df_sim$treatment== "control", 0, NA))
df_sim$treatment_video <- ifelse(df_sim$treatment== "video", 1, 
                                ifelse(df_sim$treatment== "control", 0, NA))


#### sms

df_sim$sms_quiz <-  ifelse(df_sim$treatment_sms== 1 & !is.na(df_sim$treatment_sms), 
                           sample(c("a", "b", NA_character_), 
                                  size = sum(df_sim$treatment_sms== 1, na.rm = T), replace = T, prob = c(.45, .45, .1)), NA)

df_sim$family_friend_check <-  ifelse(!is.na(df_sim$sms_quiz), 
                           sample(c("a", "b", NA_character_), 
                                  size = sum(!is.na(df_sim$sms_quiz)), replace = T, prob = c(.45, .45, .1)), NA)

df_sim$fear_quiz <-  ifelse(!is.na(df_sim$family_friend_check), 
                           sample(c("a", "b", NA_character_), 
                                  size = sum(!is.na(df_sim$family_friend_check)), replace = T, prob = c(.45, .45, .1)), NA)


 df_sim$anger_quiz <-  ifelse(!is.na(df_sim$fear_quiz), 
                           sample(c("a", "b", NA_character_), 
                                  size = sum(!is.na(df_sim$fear_quiz)), replace = T, prob = c(.45, .45, .1)), NA)  
 
 
 #### game
 
df_sim$game_completed <-  ifelse(df_sim$treatment_game== 1 & !is.na(df_sim$treatment_game), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$treatment_game== 1, na.rm = T), replace = T, prob = c(.8, .2)), NA)
 
 
df_sim$score <-  ifelse(df_sim$game_completed== 1 & !is.na(df_sim$game_completed), 
                           sample(c(1:5, NA_integer_), 
                                  size = sum(df_sim$game_completed== 1, na.rm = T), replace = T, prob = c(.3, .2, .15, .10, .2, .05)), NA)


#### video

df_sim$video_1 <-  ifelse(df_sim$treatment_video== 1 & !is.na(df_sim$treatment_video), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$treatment_video== 1, na.rm = T), replace = T, prob = c(.95, .05)), NA)
 

df_sim$video_2 <-  ifelse(df_sim$video_1== 1 & !is.na(df_sim$video_1), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$video_1== 1, na.rm = T), replace = T, prob = c(.96, .04)), NA)


df_sim$video_3 <-  ifelse(df_sim$video_2== 1 & !is.na(df_sim$video_2), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$video_2== 1, na.rm = T), replace = T, prob = c(.97, .03)), NA)

df_sim$video_4 <-  ifelse(df_sim$video_3== 1 & !is.na(df_sim$video_3), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$video_3== 1, na.rm = T), replace = T, prob = c(.98, .02)), NA)

df_sim$video_5 <-  ifelse(df_sim$video_4== 1 & !is.na(df_sim$video_4), 
                           sample(c(1, 0), 
                                  size = sum(df_sim$video_4== 1, na.rm = T), replace = T, prob = c(.98, .02)), NA)



df_sim$treatment_complete <- ifelse((df_sim$treatment == "control" & !is.na(df_sim$treatment))|
                                      df_sim$video_5 == 1 & !is.na(df_sim$video_5)|
                                      !is.na(df_sim$score) | !is.na(df_sim$anger_quiz)
                                      , 1, 0)
### misinfo quiz - attention check

df_sim$headline1_answer <- ifelse(df_sim$treatment_complete == 1,   sample(c("a", "b", NA_character_), 
                                  size = sum(df_sim$treatment_complete), replace = T, prob = c(.45, .45, .1)), NA) 


df_sim$headline2_answer <- ifelse(!is.na(df_sim$headline1_answer),   sample(c("a", "b", NA_character_), 
                                  size = sum(!is.na(df_sim$headline1_answer)), replace = T, prob = c(.45, .40, .05)), NA) 



df_sim$attention_complete <- ifelse(!is.na(df_sim$headline2_answer), 1, 0)

### consequences nudge


df_sim$per_unfollow <- ifelse(df_sim$attention_complete == 1,   sample(c("a", "b", NA_character_), 
                                  size = sum(df_sim$attention_complete), replace = T, prob = c(.45, .45, .1)), NA) 

df_sim$consequences_complete <-  ifelse(!is.na(df_sim$per_unfollow), 1, 0)


### feedback

df_sim$enjoy_most <- ifelse(df_sim$consequences_complete == 1,   sample(c("aaa", "bbb", "ccc", NA_character_), 
                                  size = sum(df_sim$consequences_complete), replace = T, prob = c(.3, .3, .3, .1)), NA) 

df_sim$enjoy_least <- ifelse(!is.na(df_sim$enjoy_most),   sample(c("aaa", "bbb", "ccc", NA_character_), 
                                  size = sum(!is.na(df_sim$enjoy_most)), replace = T, prob = c(.3, .3, .35, .05)), NA) 



df_sim$feedback_complete <- ifelse(!is.na(df_sim$enjoy_most), 1, 0)

### DEMOGRAPHICS


df_sim$demog_age <- ifelse(df_sim$feedback_complete == 1, sample(c(18:99, NA_integer_), size = sum(df_sim$feedback_complete == 1), replace = T, 
                        prob = rep(1/(length(18:99)+1), length(18:99)+1)), NA)
  
  
df_sim$demog_gender <- ifelse(!is.na(df_sim$demog_age), 
                        sample(c("woman", "man", "non-binary", NA_character_), size = sum(!is.na(df_sim$demog_age)), replace = T, 
                                  prob = c(0.85/2, 0.85/2, 0.10, 0.05))  , NA
                          
                          )

df_sim$demog_ethnic <- ifelse(!is.na(df_sim$demog_gender), 
                        sample(c("white or caucasian", "black or african", 
                                       "hispanic", 
                                       "asian or indian", 
                                       "native american", 
                                       "pacific islander", NA_character_), size = sum(!is.na(df_sim$demog_gender)), replace = T, 
                                  prob = c(rep(0.9/6, 6), 0.1))  , NA )


df_sim$demog_income <- ifelse(!is.na(df_sim$demog_ethnic), 
                        sample(c("less than $30,000", 
                                                           "$30,000 - $49,999",
                                                           "$50,000 - $69,999", 
                                                           "$70,000 - $99,999", 
                                                           "$100,000 - $150,000", 
                                                           "more than $150,000", NA_character_), size = sum(!is.na(df_sim$demog_ethnic)), replace = T, 
                                  prob = c(rep(0.9/6, 6), 0.1))  , NA )



df_sim$demog_education <- ifelse(!is.na(df_sim$demog_income), 
                        sample(c("less than $30,000", 
                                                           "$30,000 - $49,999",
                                                           "$50,000 - $69,999", 
                                                           "$70,000 - $99,999", 
                                                           "$100,000 - $150,000", 
                                                           "more than $150,000", NA_character_), size = sum(!is.na(df_sim$demog_income)), replace = T, 
                                  prob = c(rep(0.9/6, 6), 0.1))  , NA )


df_sim$demog_religion <- ifelse(!is.na(df_sim$demog_education), 
                        sample(c(1:6, NA_character_), size = sum(!is.na(df_sim$demog_education)), replace = T, 
                                  prob = c(rep(0.9/6, 6), 0.1))  , NA )

df_sim$demog_political <- ifelse(!is.na(df_sim$demog_religion), 
                        sample(c("democrat", "independent", "republican", NA_character_), size = sum(!is.na(df_sim$demog_religion)), replace = T, 
                                  prob = c(rep(0.95/3, 3), 0.1))  , NA )




df_sim$demographics_complete <- ifelse(!is.na(df_sim$demog_political), 1, 0)



### end

df_sim$amazon_credit <- ifelse(df_sim$demographics_complete  == 1, 
                               sample(c(letters, NA_character_), 
                                      replace =T, size = sum(df_sim$demographics_complete ), 
                                      prob = rep(1/27, 27)), NA)

df_sim$full_complete <- ifelse(!is.na(df_sim$amazon_credit), 1, 0)


# outcomes for everyone who started survey

df_sim$fb_outcome_binary <- sample(c(0, 1), size =10000, 
                                   replace = T, prob = c(0.4, 0.6))

df_sim$fb_outcome_continuous <- runif(10000, 0, 1)




df_sim$fb_demog_age <- sample(18:99, size = 10000, replace = T, 
                        prob = rep(1/length(18:99), length(18:99)))
                 


df_sim$fb_demog_gender <-as.factor(sample(c("woman", "man", "non-binary"), size = 10000, replace = T, 
                                  prob = c(0.85/2, 0.85/2, 0.15)))
                           


df_sim$fb_demog_race <- sample(c("white or caucasian", "black or african", 
                                       "hispanic", 
                                       "asian or indian", 
                                       "native american", 
                                       "pacific islander"), 10000, 
                            replace = T, prob = rep(1/6, 6))

df_sim$fb_demog_education <- as.factor(sample(c("< high school", 
                                 "high school", 
                                 "some college", 
                                 "2-year degree", 
                                 "4-year degree", 
                                 "graduate degree"), 10000, 
                               replace = T, prob = rep(1/6, 6)))

df_sim$fb_demog_income <- as.factor(sample(c("less than $30,000", 
                                                           "$30,000 - $49,999",
                                                           "$50,000 - $69,999", 
                                                           "$70,000 - $99,999", 
                                                           "$100,000 - $150,000", 
                                                           "more than $150,000"), 10000, 
                               replace = T, prob = rep(1/6, 6)))

df_sim <- df_sim %>%
  mutate(fb_demog_age_group = 
           as.factor(case_when(fb_demog_age <25 & fb_demog_age> 17 ~ "18-24", 
                     fb_demog_age <35 & fb_demog_age> 24 ~ "25-34", 
                     fb_demog_age <45 & fb_demog_age> 34 ~ "35-44", 
                     fb_demog_age <55 & fb_demog_age> 44 ~ "45-54", 
                     fb_demog_age <65 & fb_demog_age> 54 ~ "55-64",
                     fb_demog_age> 64 ~ "65+",
                     TRUE ~ NA_character_))
           )

df_sim <- as.data.frame(one_hot(as.data.table(df_sim), dropCols=FALSE))


ads[1:9, "ad_name"] <- unique(df_sim$ad)

# format numbers to percent
form_percent <- function(dec){
  if(is.na(dec)){
    return("-")
  } else {
    return(paste0(round(dec, 4) * 100, "%"))
  }
}

# format number dollar type
form_cost <- function(num){
  
  sapply(num, function(x) if (x == Inf| is.na(x)) {
    return("-")
  } else {
    return(paste0("$", round(x, 3)))
  })
  
}

# round number to 4 with trailing 0s
# converts to string
round_4 <- function(x){
  formatC(x, digits = 4, format = "f")
}

# format number with commas for 
# large numbers without decimals
form_num <- function(num){
  
  sapply(num, function(x) if (x == Inf| is.na(x)) {
    return("-")
  } else {
    return( formatC(x, big.mark = ","))
  })
  
}

# this puts sms to all caps and control to Control
label_fun <- function(x){
    ifelse(nchar(x) <4 & x != "all", toupper(x), str_to_title(x))
  }

1 Project Objectives

The goal of the Facebook Misinformation Project is to experimentally evaluate scaled versions of promising interventions to reduce misinformation spread on social media using on-platform behavioral data.

2 Pilot Objectives

Update for Pilot 2:

3 Purpose of Analysis Script

Update for Pilot 2: The analysis script cleans and analyzes pilot data from Ads Manager, Chatfuel, and chatbot responses. The output includes the following tables:

Descriptive Statistics
Funnel Statistics
Treatment Analysis
Subgroup Analysis

4 Samples

4.1 Targeting Strategy

Update for Pilot 2: Our targeting strategy is estimated to target 53 million FB users, which is 22% of total FB users in the US (240 million). The pilot target users (1) in the union of the top 10 states that are most conservative + top 20 most rural states (2) OR in the top 2,500 largest ZIP codes by population in other states that have high republican votes (# votes for Republican candidates / # total votes > 95%) or high rural population rate (% rural population > 85%) (3) Among users in (1) or (2), target users with age 35+.

4.2 Randomization Strategy

4.2.1 Ad-Level

Update for Pilot 2: The pilot includes samples to test 9 versions of ads - Outrage 1, Outrage 2, Attention 1, Attention 2, Real 1, Real 2, Fake News 1, Fake News 2, Fake.

4.2.1.1 Outrage

4.2.1.1.1 Ad 1

4.2.1.1.2 Ad 2

4.2.1.2 Attention

4.2.1.2.1 Ad 1

4.2.1.2.2 Ad 2

4.2.1.3 Real

4.2.1.3.1 Ad 1

4.2.1.3.2 Ad 2

4.2.1.4 Fake News

4.2.1.4.1 Ad 1

4.2.1.4.2 Ad 2

4.2.1.5 Fake

4.2.2 Chatbot-Level

After users click on the ads, they are randomized to be connected with one of the 4 chatbot conditions: two pre-consent chat conditions to test two different factoids (Factoid 1, Factoid 2), and two post-consent intro conditions to test the performance of the intro section with vs. without an additional motivator (Consent only, Motivator).

1a	Consent only	no factoid	no motivator
1b	Motivator	no factoid	motivator
2a	Factoid 1	factoid 1	no motivator
2b	Factoid 2	factoid 2	no motivator

4.2.3 Intervention-Level

After users consent to participate in the study, they are randomized into an intervention group and a control group. The intervention group is exposed to the SMS course, consequences nudge and content replacement; the control group is exposed to the facts baseline course only.

5 Main Takeaways

Will need to update for Pilot 2:

6 Data Dictionary

The data dictionary below outlines each variable used in analysis from the Chatfuel dataset. Group denotes the section of the chatbot. Derived denotes if this variable was created during the cleaning process.

# print data dictionary
data_dictionary %>%
  select(cols, description,values, group, type, derived) %>%
  datatable(rownames = F)

7 Descriptive Statistics

7.1 Numeric Variables

This table reports the mean, standard deviations, and total number of non-missing observations for numeric variables. Note that individuals without an arm assignment are not included in these tables. Participants who do not consent to the study do not receive a treatment assignment.

7.1.1 Demographics

Here we report the summary statistics for all FB demographics variables of interest by treatment group.

# summary for all treatments
df_sim %>%
  filter(!is.na(treatment)) %>%
  select(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog")]) %>%
  select(!fb_demog_gender & !fb_demog_race & !fb_demog_education & !fb_demog_income &!fb_demog_age_group ) %>%
  summarise_all(list(n= function(x) sum(!is.na(x)), 
                     mean = function(x) mean(x, na.rm = T), 
                     sd = function(x) sd(x, na.rm = T))) %>%
  mutate(treatment = "all treatments") %>%
  # summary by treatmet
  rbind(df_sim %>%
  filter(!is.na(treatment)) %>%
  select(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog")], treatment) %>%
  select(!fb_demog_gender & !fb_demog_race & !fb_demog_education & !fb_demog_income &!fb_demog_age_group ) %>%
  group_by(treatment) %>%
  summarise_all(list(n= function(x) sum(!is.na(x)), 
                     mean = function(x) mean(x, na.rm = T), 
                     sd = function(x) sd(x, na.rm = T))))%>%
      pivot_longer(cols = !treatment, 
                   names_to = "variable",
                   values_to = "value") %>% 
      separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
      pivot_wider(names_from = c("treatment", "measure"))%>%
  # round mean and sd to 4 digits but not n
  mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
  kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
  add_header_above(c(" " = 1, "All" = 3,"Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

	All			Control			Game			SMS			Video
Variable	N	Mean	SD	N	Mean	SD	N	Mean	SD	N	Mean	SD	N	Mean	SD
fb_demog_age	6857	58.2907	23.7599	4615	58.6540	23.7090	745	57.4188	24.0327	759	56.2648	23.7804	738	58.9824	23.6955
fb_demog_gender_man	6857	0.4279	0.4948	4615	0.4295	0.4951	745	0.3852	0.4870	759	0.4466	0.4975	738	0.4417	0.4969
fb_demog_gender_non-binary	6857	0.1429	0.3500	4615	0.1428	0.3499	745	0.1490	0.3563	759	0.1383	0.3455	738	0.1423	0.3496
fb_demog_gender_woman	6857	0.4292	0.4950	4615	0.4277	0.4948	745	0.4658	0.4992	759	0.4150	0.4931	738	0.4160	0.4932
fb_demog_education_< high school	6857	0.1776	0.3822	4615	0.1805	0.3846	745	0.1893	0.3920	759	0.1634	0.3699	738	0.1626	0.3693
fb_demog_education_2-year degree	6857	0.1663	0.3723	4615	0.1671	0.3731	745	0.1490	0.3563	759	0.1647	0.3711	738	0.1802	0.3846
fb_demog_education_4-year degree	6857	0.1712	0.3767	4615	0.1714	0.3769	745	0.1772	0.3821	759	0.1713	0.3770	738	0.1640	0.3705
fb_demog_education_graduate degree	6857	0.1600	0.3666	4615	0.1590	0.3658	745	0.1584	0.3654	759	0.1542	0.3613	738	0.1734	0.3789
fb_demog_education_high school	6857	0.1587	0.3654	4615	0.1567	0.3635	745	0.1611	0.3678	759	0.1726	0.3781	738	0.1545	0.3616
fb_demog_education_some college	6857	0.1663	0.3723	4615	0.1653	0.3715	745	0.1651	0.3715	759	0.1739	0.3793	738	0.1653	0.3717
fb_demog_income_$100,000 - $150,000	6857	0.1574	0.3642	4615	0.1536	0.3606	745	0.1517	0.3589	759	0.1647	0.3711	738	0.1789	0.3835
fb_demog_income_$30,000 - $49,999	6857	0.1759	0.3807	4615	0.1751	0.3801	745	0.1758	0.3809	759	0.1739	0.3793	738	0.1829	0.3869
fb_demog_income_$50,000 - $69,999	6857	0.1619	0.3684	4615	0.1655	0.3717	745	0.1490	0.3563	759	0.1634	0.3699	738	0.1504	0.3577
fb_demog_income_$70,000 - $99,999	6857	0.1639	0.3702	4615	0.1699	0.3756	745	0.1490	0.3563	759	0.1607	0.3675	738	0.1450	0.3523
fb_demog_income_less than $30,000	6857	0.1660	0.3721	4615	0.1653	0.3715	745	0.1852	0.3887	759	0.1713	0.3770	738	0.1450	0.3523
fb_demog_income_more than $150,000	6857	0.1750	0.3800	4615	0.1705	0.3761	745	0.1893	0.3920	759	0.1660	0.3723	738	0.1978	0.3986
fb_demog_age_group_18-24	6857	0.0887	0.2843	4615	0.0858	0.2801	745	0.0993	0.2993	759	0.1014	0.3021	738	0.0827	0.2755
fb_demog_age_group_25-34	6857	0.1228	0.3282	4615	0.1196	0.3245	745	0.1262	0.3323	759	0.1423	0.3496	738	0.1192	0.3243
fb_demog_age_group_35-44	6857	0.1245	0.3302	4615	0.1270	0.3330	745	0.1221	0.3277	759	0.1212	0.3266	738	0.1152	0.3195
fb_demog_age_group_45-54	6857	0.1237	0.3292	4615	0.1211	0.3263	745	0.1181	0.3230	759	0.1318	0.3384	738	0.1369	0.3439
fb_demog_age_group_55-64	6857	0.1168	0.3212	4615	0.1129	0.3165	745	0.1369	0.3440	759	0.1238	0.3296	738	0.1138	0.3178
fb_demog_age_group_65+	6857	0.4235	0.4942	4615	0.4336	0.4956	745	0.3973	0.4897	759	0.3794	0.4856	738	0.4322	0.4957

7.1.2 Funnel Numeric

funnel_vars <- c("conversations_started", 
                 "consent", 
                 "intro_complete", 
                 "treatment_complete", 
                 "consequences_complete", 
                 "feedback_complete", 
                 "demographics_complete", 
                 "full_complete")

df_sim %>%
  filter(!is.na(treatment) ) %>%
  select(all_of(funnel_vars)) %>%
  summarise_all(list(n= length, mean = mean, sd =sd)) %>%
  mutate(treatment = "all") %>%
  rbind(df_sim %>%
  select(all_of(funnel_vars), treatment) %>%
    filter(!is.na(treatment) ) %>%
  group_by(treatment) %>%
  summarise_all(list(n= length, mean = mean, sd =sd))) %>%
  pivot_longer( cols = !treatment, 
                names_to = "variable",
                values_to = "value") %>% 
  separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
  pivot_wider(names_from = c("treatment", "measure")) %>%
    # round mean and sd to 4 digits but not n
  mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
  kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
  add_header_above(c(" " = 1, "All" = 3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

	All			Control			Game			SMS			Video
Variable	N	Mean	SD	N	Mean	SD	N	Mean	SD	N	Mean	SD	N	Mean	SD
conversations_started	6857	1.0000	0.0000	4615	1.0000	0.0000	745	1.0000	0.0000	759	1.0000	0.0000	738	1.0000	0.0000
consent	6857	1.0000	0.0000	4615	1.0000	0.0000	745	1.0000	0.0000	759	1.0000	0.0000	738	1.0000	0.0000
intro_complete	6857	1.0000	0.0000	4615	1.0000	0.0000	745	1.0000	0.0000	759	1.0000	0.0000	738	1.0000	0.0000
treatment_complete	6857	0.9262	0.2615	4615	1.0000	0.0000	745	0.7839	0.4119	759	0.6640	0.4726	738	0.8780	0.3275
consequences_complete	6857	0.7047	0.4562	4615	0.7569	0.4290	745	0.6000	0.4902	759	0.5204	0.4999	738	0.6734	0.4693
feedback_complete	6857	0.6358	0.4812	4615	0.6871	0.4637	745	0.5262	0.4996	759	0.4625	0.4989	738	0.6043	0.4893
demographics_complete	6857	0.3564	0.4790	4615	0.3844	0.4865	745	0.2966	0.4571	759	0.2675	0.4429	738	0.3333	0.4717
full_complete	6857	0.3429	0.4747	4615	0.3686	0.4825	745	0.2805	0.4496	759	0.2648	0.4415	738	0.3252	0.4688

7.1.3 Outcome

# all 
df_sim %>%
  filter(!is.na(treatment)) %>%
  select(fb_outcome_binary, fb_outcome_continuous) %>%
  summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))) %>%
  mutate(treatment = "all") %>%
  # by treatment
    rbind(df_sim %>%
    select(fb_outcome_binary, fb_outcome_continuous, treatment) %>%
    filter(!is.na(treatment)) %>%
    group_by(treatment) %>%
    summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T)))) %>%
  pivot_longer( cols = !treatment, 
                names_to = "variable",
                values_to = "value") %>% 
  separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
  pivot_wider(names_from = c("treatment", "measure")) %>%
  # round mean and sd to 4 digits but not n
  mutate_at(vars(contains(c("mean", "sd"))), round_4) %>%
  kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
  add_header_above(c(" " = 1, "All" = 3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

	All			Control			Game			SMS			Video
Variable	N	Mean	SD	N	Mean	SD	N	Mean	SD	N	Mean	SD	N	Mean	SD
fb_outcome_binary	6857	0.6025	0.4894	4615	0.6054	0.4888	745	0.6148	0.4870	759	0.6074	0.4887	738	0.5664	0.4959
fb_outcome_continuous	6857	0.5016	0.2860	4615	0.5007	0.2865	745	0.4971	0.2864	759	0.5120	0.2846	738	0.5003	0.2840

7.1.4 Other Numeric

# summary overall

df_sim %>%
  select(!contains(c("demog", "outcome", "complete", funnel_vars))) %>%
  filter(!is.na(treatment)) %>%
  select_if(is.numeric) %>%
  summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))) %>%
  mutate(treatment = "all") %>%
  rbind(df_sim %>%
  select(!contains(c("demog", "outcome", "complete", funnel_vars))) %>%
  group_by(treatment) %>%
  select_if(is.numeric) %>%
  filter(!is.na(treatment)) %>%
      group_by(treatment) %>%
      summarise_all(list(n= function(x)sum(!is.na(x)), mean = function(x)mean(x, na.rm = T), sd = function(x)sd(x, na.rm = T))))%>%
      pivot_longer( cols = !treatment, 
                    names_to = "variable",
                    values_to = "value") %>% 
      separate(variable, into = c("variable", "measure"), sep="_(?=[^_]+$)") %>%
      pivot_wider(names_from = c("treatment", "measure")) %>%
    # round mean and sd to 4 digits but not n
  mutate_at(vars(contains(c("mean", "sd"))), round_4)  %>% 
  # if NaN or NA, replace with "-"
  mutate_all(~ replace(., .=="  NaN" | . == "   NA", "-")) %>%
  kable(col.names = c("Variable", rep(c("N", "Mean", "SD"), 5))) %>%
  add_header_above(c(" " = 1, "All" =3, "Control" = 3, "Game" = 3, "SMS" = 3, "Video" = 3))%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

	All			Control			Game			SMS			Video
Variable	N	Mean	SD	N	Mean	SD	N	Mean	SD	N	Mean	SD	N	Mean	SD
intro_go_on	6857	1.0000	0.0000	4615	1.0000	0.0000	745	1.0000	0.0000	759	1.0000	0.0000	738	1.0000	0.0000
treatment_group	6857	0.3270	0.4691	4615	0.0000	0.0000	745	1.0000	0.0000	759	1.0000	0.0000	738	1.0000	0.0000
treatment_sms	5374	0.1412	0.3483	4615	0.0000	0.0000	0			759	1.0000	0.0000	0
treatment_game	5360	0.1390	0.3460	4615	0.0000	0.0000	745	1.0000	0.0000	0			0
treatment_video	5353	0.1379	0.3448	4615	0.0000	0.0000	0			0			738	1.0000	0.0000
score	584	2.5599	1.5001	0			584	2.5599	1.5001	0			0
video_1	738	0.9593	0.1976	0			0			0			738	0.9593	0.1976
video_2	708	0.9576	0.2016	0			0			0			708	0.9576	0.2016
video_3	678	0.9794	0.1423	0			0			0			678	0.9794	0.1423
video_4	664	0.9849	0.1219	0			0			0			664	0.9849	0.1219
video_5	654	0.9908	0.0954	0			0			0			654	0.9908	0.0954

7.2 Categorical Variables

This table reports the distribution of observations in each category for categorical variables. Note that individuals without an arm assignment are not included in these tables.

#function to calculate count and shares of categorical variables
cat_summary <- function(variable){
  
  var <- sym(variable)
  
  df_sim %>%
    filter(treatment != "no treatment") %>%
    select(!!var) %>%
    group_by(!!var) %>%
    count(name = "Count") %>%
    ungroup() %>%
    mutate(Share = round(Count/sum(Count), 2))%>%
    kable() %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
  
}

7.2.1 Arm

The arm variable denotes participants’ treatment assignment.

cat_summary("treatment")

treatment	Count	Share
control	4615	0.67
game	745	0.11
sms	759	0.11
video	738	0.11

7.2.2 Family Friend Check

Answer to the question: ❓How about you? If a family member or a friend posts something online, do you stop to check whether it’s misinformation?

A. I always trust my family and friends!

B. I always check whether something is misinformation.

cat_summary("family_friend_check")

family_friend_check	Count	Share
a	297	0.04
b	329	0.05
NA	6231	0.91

7.2.3 Fear Quiz

Answer to the question: ❓What motive might someone have for wanting you to believe the ingredients of another product are safer?

A. To sell their product.

B. Because they genuinely think the other product is dangerous.

C. All of the above.

cat_summary("fear_quiz")

fear_quiz	Count	Share
a	286	0.04
b	289	0.04
NA	6282	0.92

7.2.4 Anger Quiz

Answer to the question: Would you share this [anger inducing ad] on your social media feed?

A. Yes

B. No

cat_summary("anger_quiz")

anger_quiz	Count	Share
a	264	0.04
b	240	0.04
NA	6353	0.93

7.2.5 Per Unfollow

Answer to the question: What percentage of people do you think have unfollowed someone who shared misinformation?

10%

25%

50% or more

cat_summary("per_unfollow")

per_unfollow	Count	Share
a	2436	0.36
b	2396	0.35
NA	2025	0.30

7.3 Visualization

7.3.1 Demographics

demog_figures <- function(variable){
   
  variable <- enquo(variable)

  
  df_plot <- df_sim %>%
    select(treatment, !!variable) %>%
    filter(!is.na(treatment) & !is.na(!!variable)) %>%
    group_by(treatment, !!variable) %>%
    count()%>%
    group_by(treatment) %>%
    mutate(percent = paste(round(n/sum(n) * 100, 0), "%", sep = "")) %>%
    ungroup() %>%
    mutate(percent = case_when(!duplicated(!!variable) ~ 
       paste(percent, "\n", str_to_title(!!variable), sep = "")
      ,  
      TRUE ~ percent))
  
 x <- ggplot(df_plot, 
         aes(x = treatment, 
             y = n, 
             fill = !!variable)) +
    geom_bar(position = "fill", stat ="identity", width = .7, color = "white") +
    theme_sarah() +
   scale_x_discrete(labels = label_fun )+
    scale_fill_manual(values = c(cb_colors)) + 
    geom_text(aes( label = percent), 
              color = "white", 
              position = position_fill(vjust = 0.5), 
              size = 12/.pt, 
              fontface = "bold") + 
    labs(x = "", 
         y ="", 
         title = paste("Percentage of Participants by\n Treatment and", str_to_title(str_replace(str_remove(as_label(variable), "demog_"), "_", " ")), sep = " "), 
         caption = paste("Number of Observations:", scales::comma(sum(df_plot$n)))) +
    scale_y_continuous(breaks = c(0, 1), labels = c("0%", "100%"))  + 
    theme(legend.position = "none")
 
  print(x)
}

The next figures show the share of different demographic groups by treatment. Note that the following figures only include observations where the treatment and demographic variables are both non-missing.

demog_vars <- c("fb_demog_gender", "fb_demog_age_group", 
                "fb_demog_race", "fb_demog_education", "fb_demog_income")

for(i in demog_vars){
    
    cat("####", str_to_title(str_replace(str_remove(i, "demog_"), "_", " ")), "\n\n\n")
  
    demog_figures(!!sym(paste(i, sep = "")))
    
    cat("\n\n\n")
  }

7.3.1.1 Fb Gender

7.3.1.2 Fb Age_group

7.3.1.3 Fb Race

7.3.1.4 Fb Education

7.3.1.5 Fb Income

7.3.2 Ads Demographics

Facebook Ads Manager provides cost, impression, and conversation counts at the ad-age-gender level. Below, we show the cost per impression and cost per conversation by age or gender and ad or ad theme. Note that here, we use the Facebook Ads definition of conversations.

# figure is a function of the variable measure (impressions or conversations)
# group (ad theme group or individual ads)
# demographic variable (age or gender)

plot_cost_per <- function(variable, group, demo){
  
  # choose a group to sort bars descending by
  # plot colors
  # filter out unknown gender group since it is a small size
  if(demo == "age"){
    
    label_group <-"65+"
    plot_colors <- cb_colors[3:6]
    
  } else if(demo == "gender"){
    
    label_group <-"female"
    plot_colors <- cb_colors[1:2]
    ads_demo <- ads_demo %>%
      filter(gender != "unknown")
    
  }

  # set plot title and x axis label
  plot_title <- paste("Cost per", str_to_title(str_remove(variable, "s$")), "by", str_to_title(demo), sep =" ")
  x_title <- paste("Cost per", str_to_title(str_remove(variable, "s$")), sep = " ")
  
  group <- enquo(group)
  variable <- sym(variable)
  demo <- sym(demo)

  # calculate cost per measure by group
  plot_data <- ads_demo  %>%
    select(!!group, !!demo, !!variable, cost) %>%
    group_by(!!group, !!demo) %>%
    summarize_all(sum) %>%
    mutate(cost_var = cost/  !!variable, 
          cost_var = ifelse(is.infinite(cost_var), 0, cost_var)
    ) %>%
    group_by(!!demo) %>%
    # order for plotting
    mutate(ordervar = ifelse(!!demo == label_group, rank(cost_var), 10)) %>%
    ungroup() 
  
  # this finds which ad or ad theme has the max cost per measure for female or 65+. 
  # this is where the demographic group labels are plotted
  labels <- plot_data %>%
    filter(!!demo == label_group) %>%
    filter(cost_var ==max(cost_var)) %>%
    pull(!!group)
  
  # create labels for demographic groups
  plot_data <- plot_data %>%
    mutate(demo_label = ifelse(!!group== labels, paste(str_to_title(!!demo), " "), ""))
 
  # plot bars        
  ggplot(plot_data, 
         aes(y = reorder(!!group, ordervar),
             x = cost_var, 
             group = !!demo, 
             fill = !!demo)) + 
    geom_bar(stat = "identity", 
           position = "dodge", 
           color = "white") +
    theme_sarah() +
    theme(legend.position = "none", 
          panel.grid.major.y = element_blank()) +
    labs(y = "", 
     x = x_title, 
     title = plot_title) +
    # set colors
    scale_fill_manual(values = plot_colors) +
    # this plots bar demographic labels
    geom_text(aes(label = demo_label),
              position = position_dodge(0.9),
              color="white",
              vjust = 0.5,
              hjust = 1)
}

# only thing to add after running function is 
# the scale limits and breaks, which is done manually
# since its different for each combination

7.3.2.1 Cost per Impression

7.3.2.1.1 Gender-Ad

Note that Facebook Ads also has a gender category called unknown. Since the size of this category is so small, we do no included these observations below.

plot_cost_per("impressions", ad_name, "gender") +
scale_x_continuous(limits = c(0, 0.065), 
                   expand = c(0,0), 
                   breaks = seq(0, 0.06, 0.01))

7.3.2.1.2 Age-Ad

plot_cost_per("impressions", ad_name, "age") +
  scale_x_continuous(limits = c(0, 0.056), 
                     expand = c(0,0), 
                     breaks = seq(0, 0.055, 0.01))

7.3.2.2 Cost per Conversation

7.3.2.2.1 Gender-Ad

Note that Facebook Ads also has a gender category called unknown. Since the size of this category is so small, we do no included these observations below. We calculate cost per conversation = cost / #conversations. Missing bars indicates that we could not calculate cost per conversation because there were zero conversations for that subgroup.

plot_cost_per("conversations", ad_name, "gender") +
  scale_x_continuous(limits = c(0, 65), 
                     expand = c(0,0), 
                     breaks = seq(0, 60, 10))

7.3.2.2.2 Age-Ad

We calculate cost per conversation = cost / #conversations. Missing bars indicates that we could not calculate cost per conversation because there were zero conversations for that subgroup.

plot_cost_per("conversations", ad_name, "age") +
  scale_x_continuous(limits = c(0, 65),
                     expand = c(0,0),
                     breaks = seq(0, 65, 10))

8 Funnel Statistics

8.1 Table 1: Summary of Results

This table presents an overview of funnel statistics at the ad-, chatbot-, and treatment-level.

8.1.1 Ad-level

All responses, regardless of treatment-level, are included. Note that here, we use the Facebook Ads definition of conversations.

ads %>%
  mutate(cost_per_impression = form_cost(cost/impressions),
         cost_per_consent = form_cost(cost/consents), 
         cost_per_click = form_cost(cost/clicks),
         cost_per_completion = form_cost(cost/completions), 
         impressions = form_num(impressions), 
         reach = form_num(reach),
         clicks = form_num(clicks)) %>%
  select("Ad" = ad_name, 
         "Impressions" = impressions,
         "Reach" = reach,
         "Clicks" = clicks,  
         "Conversations" = conversations, 
         "Consents" = consents,
         "Completions" = completions, 
         "Cost per Impression" = cost_per_impression, 
         "Cost per Click" = cost_per_click,
         "Cost per Consent" = cost_per_consent, 
         "Cost per Completion" = cost_per_completion) %>%
    arrange(Ad) %>%
    kable(escape = F) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Ad	Impressions	Reach	Clicks	Conversations	Consents	Completions	Cost per Impression	Cost per Click	Cost per Consent	Cost per Completion
attention 1	4,274	3,499	231	9	7	3	$0.037	$0.677	$22.353	$52.157
fake	6,185	4,989	305	13	3	2	$0.029	$0.595	$60.443	$90.665
fake news 2	12,629	9,490	711	25	11	4	$0.043	$0.767	$49.576	$136.335
help friends	1,148	960	97	1	0	0	$0.025	$0.301
join	18,192	14,110	1,099	65	25	7	$0.042	$0.697	$30.644	$109.443
No Ad ID				5	4	2
outrage 1	2,954	2,284	254	4	2	1	$0.019	$0.222	$28.17	$56.34
protect	2,778	2,203	160	1	0	0	$0.03	$0.522
real 1	3,733	3,123	108	15	6	1	$0.027	$0.939	$16.908	$101.45
real 2	1,983	1,442	105	5	2	1	$0.04	$0.765	$40.155	$80.31

8.1.2 Treatment-level

All users who consented are included.

Cost per Consent = total experiment costs / consents. Cost per Completion = total experiment costs / completions.

total_cost <- sum(ads$cost, na.rm  =T)
# treatment level
df_sim %>%
  select(treatment, consent, full_complete) %>%
  filter(!is.na(treatment)) %>%
  group_by(treatment) %>%
  summarize(consents = sum(consent, na.rm  = T),
            completions = sum(full_complete, na.rm  = T)) %>%
  mutate(cost_per_consent = form_cost(total_cost/consents), 
         cost_per_completion = form_cost(total_cost/completions), 
         treatment = label_fun(treatment)) %>%
  # all treatments
  rbind(df_sim %>%
          filter(!is.na(treatment)) %>%
          select( consent, full_complete) %>%
          summarize(consents = sum(consent, na.rm  = T),
                    completions = sum(full_complete, na.rm  = T)) %>%
          mutate(cost_per_consent = form_cost(total_cost/consents), 
                 cost_per_completion = form_cost(total_cost/completions), 
                 treatment = "All")) %>%
  # rename column names to titles
  select("Treatment" = treatment,
         "Consents" = consents,
         "Completions" = completions, 
         "Cost per Consent" = cost_per_consent, 
         "Cost per Completion" = cost_per_completion) %>%
  kable(escape = F) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Treatment	Consents	Completions	Cost per Consent	Cost per Completion
Control	4615	1701	$0.433	$1.176
Game	745	209	$2.685	$9.569
SMS	759	201	$2.635	$9.95
Video	738	240	$2.71	$8.333
All	6857	2351	$0.292	$0.851

8.2 Table 2: Overall Funnel

This table shows the detailed funnel by all, treatment, and control participants. Here, “All” refers to all participants from the stage of ad impressions, including those who did not start a conversation or did not consent to participate in the study (these users were not assigned to an arm). “SMS participants” refer to consenting users who are assigned to the SMS intervention arm, and “Control participants” refer to consenting users who are assigned to the control arm.

Funnel Parameters

Reach (%) = #reach / #impressions
Ad clickthrough (%) = #clicks / #reach
Conversation started (%) = #conversations / #clicks
Consent obtained (%) = #consents / #conversations
Treatment completed (%) = #forking section completed / # consents
Outcome completed (%) = #outcome completed / #forking section completed
Feedback completed (%) = #feedback section completed / #outcome completed
Demographics Completed (%) = #demographic section completed / #feedback section completed
Full survey completed (%) = #full survey completed / #demographic section completed

# funnel for overall and by add
get_funnel <- function(group, ad = NULL){
  
  # filter data for ad level analysis
  if(!is.null(ad)){
    df_temp <- df_sim[df_sim$ad_name == ad & !is.na(df_sim$ad_name), ]
    ads_temp <- ads[ads$ad_name == ad & !is.na(ads$ad_name), ]
  } else{
    df_temp <- df_sim
    ads_temp <- ads
  }
  
  # filter data for treatment level analysis
  if (group == "all"){
    df_temp <- df_temp
  } else {
     df_temp <- df_temp[df_temp$treatment == group & !is.na(df_temp$treatment), ]
  }
  
  # calculate total costs, impressions, clicks, from ads
  cost <- sum(ads_temp$cost, na.rm = T)
  impressions <- sum(ads_temp$impressions, na.rm = T)
  reach <-  sum(ads_temp$reach, na.rm = T)
  clicks <- sum(ads_temp$clicks, na.rm = T)
  
  # get rest of metrics from chatfuel
  conversations <- nrow(df_temp)
  consents <- sum(df_temp$consent, na.rm = T)
  treatment_complete <- sum(df_temp$treatment_complete) 
  feedback_complete <- sum(df_temp$feedback_complete)
  demog_complete <- sum(df_temp$demographics_complete)
  full_complete <- sum(df_temp$full_complete)
  
  # metric names
  metric = c(
    "Impressions", 
    "Reach",
    "Ad Clickthrough",
    "Conversation Started",
    "Consent Obtained", 
    "Treatment Completed",
    "Feedback Complete", 
    "Demographics Complete", 
    "Full Complete")
  
  cost_fun <- function(x){
    return(form_cost(cost/x))
  }
  
  metrics_list <- c(impressions, reach, clicks, conversations, consents, treatment_complete
         , feedback_complete, 
         demog_complete, full_complete)
  
  # cost of each metric
  costs <- sapply(metrics_list, cost_fun)
  
  # counts formatted with comma  
  counts <- sapply(metrics_list, scales::comma)

  
  percent_se <- function(numerator, denominator){
    p <- numerator/denominator
    se <- sqrt(p*(1-p)/denominator)
    p_se <- paste(form_percent(p), " (", form_percent(se), ")", sep = "")
    
    return(p_se)
  }
    
  # percent of previous
  percent <- mapply(function(x, y) percent_se(x, y), 
                    metrics_list[-1], 
                    metrics_list[-length(metrics_list)]
                    )
  percent <- c("-", percent)

 
  
  percent_of_impressions <- sapply(metrics_list, function(x) percent_se(x, impressions))
  percent_of_impressions[1] <- "-"

  
  # percent of consents
  percent_of_consents = sapply(metrics_list, function(x) percent_se(x, consents))
  percent_of_consents[1:4] <- "-"
  
     if(group == "all" & is.null(ad)){
          dropoff_1 <- data.frame(cbind(metric, counts, percent, percent_of_impressions, costs))
     } else {
       dropoff_1 <- data.frame(cbind(metric, counts, percent, percent_of_consents))
     }
       
  if(!is.null(ad)){
    if(ad == "No Ad ID"){
      dropoff_1[dropoff_1$metric == "Impressions", c("counts", "percent", "percent_of_consents") ] <- "-"
      dropoff_1[dropoff_1$metric == "Reach", c("counts", "percent", "percent_of_consents") ] <- "-"
      dropoff_1[dropoff_1$metric == "Ad Clickthrough", c("counts", "percent", "percent_of_consents")] <- "-"
      dropoff_1[dropoff_1$metric == "Conversation Started", "percent"] <- "-"
    }
  }
    
    if(group == "all" & is.null(ad)){

      
      colnames(dropoff_1) <- c("Metric", "Number of Obs.", 
                           "Percent of Previous", "Percent of Impressions", "Cost per Stage")
      
    } else {
      
       dropoff_1[dropoff_1$metric %in% c("Impressions", "Reach", "Ad Clickthrough", "Conversation Started"), c("counts", "percent")] <- "-"

     dropoff_1[dropoff_1$metric == "Consent Obtained", c("percent", "percent_of_consents")] <- "-"
     
     colnames(dropoff_1) <- c("Metric", "Number of Obs.", 
                           "Percent of Previous", "Percent of Consents")
      
      }
    
  
    print(kable(dropoff_1) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")))
    
}

8.2.1 All Participants

get_funnel(group = "all")

Metric	Number of Obs.	Percent of Previous	Percent of Impressions	Cost per Stage
Impressions	53,876			$0.037
Reach	42,100	78.14% (0.18%)	78.14% (0.18%)	$0.048
Ad Clickthrough	3,070	7.29% (0.13%)	5.7% (0.1%)	$0.651
Conversation Started	10,000	325.73% (-)	18.56% (0.17%)	$0.2
Consent Obtained	6,999	69.99% (0.46%)	12.99% (0.14%)	$0.286
Treatment Completed	6,351	90.74% (0.35%)	11.79% (0.14%)	$0.315
Feedback Complete	4,360	68.65% (0.58%)	8.09% (0.12%)	$0.459
Demographics Complete	2,444	56.06% (0.75%)	4.54% (0.09%)	$0.818
Full Complete	2,351	96.19% (0.39%)	4.36% (0.09%)	$0.851

8.2.2 Game Participants

get_funnel("game")

Metric	Number of Obs.	Percent of Previous	Percent of Consents
Impressions
Reach
Ad Clickthrough
Conversation Started
Consent Obtained	745
Treatment Completed	584	78.39% (1.51%)	78.39% (1.51%)
Feedback Complete	392	67.12% (1.94%)	52.62% (1.83%)
Demographics Complete	221	56.38% (2.5%)	29.66% (1.67%)
Full Complete	209	94.57% (1.52%)	28.05% (1.65%)

8.2.3 SMS Participants

get_funnel("sms")

Metric	Number of Obs.	Percent of Previous	Percent of Consents
Impressions
Reach
Ad Clickthrough
Conversation Started
Consent Obtained	759
Treatment Completed	504	66.4% (1.71%)	66.4% (1.71%)
Feedback Complete	351	69.64% (2.05%)	46.25% (1.81%)
Demographics Complete	203	57.83% (2.64%)	26.75% (1.61%)
Full Complete	201	99.01% (0.69%)	26.48% (1.6%)

8.2.4 Video Participants

get_funnel("video")

Metric	Number of Obs.	Percent of Previous	Percent of Consents
Impressions
Reach
Ad Clickthrough
Conversation Started
Consent Obtained	738
Treatment Completed	648	87.8% (1.2%)	87.8% (1.2%)
Feedback Complete	446	68.83% (1.82%)	60.43% (1.8%)
Demographics Complete	246	55.16% (2.35%)	33.33% (1.74%)
Full Complete	240	97.56% (0.98%)	32.52% (1.72%)

8.2.5 Control Participants

get_funnel("control")

Metric	Number of Obs.	Percent of Previous	Percent of Consents
Impressions
Reach
Ad Clickthrough
Conversation Started
Consent Obtained	4,615
Treatment Completed	4,615	100% (0%)	100% (0%)
Feedback Complete	3,171	68.71% (0.68%)	68.71% (0.68%)
Demographics Complete	1,774	55.94% (0.88%)	38.44% (0.72%)
Full Complete	1,701	95.89% (0.47%)	36.86% (0.71%)

8.3 Table 3: Ad Split

This table shows the detailed funnel by ad split for all, treatment, and control participants. Here, “All” refers to all participants from the stage of ad impressions, including those who did not start a conversation or did not consent to participate in the study (these users were not assigned to an arm).

# merge ad data
df_sim <- merge(df_sim, ads, by. ="ad", by.y =  "ad_name", all.x = T)

# loop over ads
for(i in sort(unique(df_sim$ad_name))) {
  cat("###",  "Ad", i, "{.tabset} \n")
  # loop over treatmnet
  for(j in c("all", "sms", "control")){
    
    cat("####", paste(label_fun(j), "Participants", sep = " "), " \n")
    print(get_funnel(ad = i, group = j))

    cat("  \n")
  }
  cat("  \n")
}

8.4 Treatment Questionwise Funnel

# function to compute completion for detailed funnel
detailed_funnel_fun <- function(x){
  # if any missing, count non-missing observations
  if(any(is.na(x))){
    sum(!is.na(x))
    # is there are no non-missings, 
    # and the variable is numeric, sum over the 
    # variable (these are binary),
    # else, sum non-missing
  } else {
    if(is.numeric(x)){
      sum(x)
    } else {
      sum(!is.na(x))
    }
  }
}

# loop over treatments
for(treat in c( "sms",  "control", "game", "video")){
   cat("###", paste(label_fun(treat), "{.tabset}", sep = " "), " \n")

    
    plot_df <-  df_sim %>%
      filter(  treatment == treat) %>%
      # select variables from detailed funnel correpsonding to the treatment-chatbot combo
      select(any_of(detailed_funnel[detailed_funnel$treatment == treat, "variable"] %>%pull())) %>%
      summarize_all(.funs =detailed_funnel_fun) %>%
      pivot_longer(cols = everything(), values_to = "count") %>%
      mutate(perc_of_consents = count/pull(.[.$name == "consent", "count"]))
  
    # get number of consents for plot title
    consents <- plot_df %>% filter(name == "consent") %>% pull(count)
  
    # figure
    gg <- plot_df %>%
      # order variables for plotting
      mutate(name = factor(name, levels = rev(name))) %>%
      # group = 1 adds the line through the dots
      ggplot(aes(x = name, y = perc_of_consents, group = 1)) + 
      geom_point(size = 3) +
      geom_line() +
      coord_flip() +
      theme_sarah() +
      labs(x= "", 
           y = "Percent of Conversations", 
           title = paste("Number of Consents:", consents)) +
      scale_y_continuous(limits = c(0, 1), 
                         breaks = seq(0, 1, by = 0.2),
                         labels = scales::percent_format(accuracy = 1)) 
    # table output
    plot_df %>%
      mutate(perc_of_consents = form_percent(perc_of_consents)) %>%
      kable(col.names = c("Variable", "Number. of Obs", "Percent of Consents")) %>%
      kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
      print()
    
    # plot output
    print(gg)
    

  cat("  \n\n\n")
}

8.4.1 SMS

Variable	Number. of Obs	Percent of Consents
consent	759	100%
intro_go_on	759	100%
sms_quiz	685	90.25%
family_friend_check	626	82.48%
fear_quiz	575	75.76%
anger_quiz	504	66.4%
headline1_answer	462	60.87%
headline2_answer	432	56.92%
per_unfollow	395	52.04%
enjoy_most	351	46.25%
enjoy_least	337	44.4%
demog_age	350	46.11%
demog_gender	336	44.27%
demog_ethnic	303	39.92%
demog_income	274	36.1%
demog_education	245	32.28%
demog_religion	220	28.99%
demog_political	203	26.75%
amazon_credit	201	26.48%

8.4.2 Control

Variable	Number. of Obs	Percent of Consents
consent	4615	100%
intro_go_on	4615	100%
headline1_answer	4134	89.58%
headline2_answer	3915	84.83%
per_unfollow	3493	75.69%
enjoy_most	3171	68.71%
enjoy_least	3020	65.44%
demog_age	3154	68.34%
demog_gender	3004	65.09%
demog_ethnic	2744	59.46%
demog_income	2471	53.54%
demog_education	2219	48.08%
demog_religion	1965	42.58%
demog_political	1774	38.44%
amazon_credit	1701	36.86%

8.4.3 Game

Variable	Number. of Obs	Percent of Consents
consent	745	100%
intro_go_on	745	100%
game_completed	599	80.4%
score	584	78.39%
headline1_answer	523	70.2%
headline2_answer	494	66.31%
per_unfollow	447	60%
enjoy_most	392	52.62%
enjoy_least	381	51.14%
demog_age	389	52.21%
demog_gender	375	50.34%
demog_ethnic	340	45.64%
demog_income	293	39.33%
demog_education	273	36.64%
demog_religion	245	32.89%
demog_political	221	29.66%
amazon_credit	209	28.05%

8.4.4 Video

Variable	Number. of Obs	Percent of Consents
consent	738	100%
intro_go_on	738	100%
video_1	708	95.93%
video_2	708	95.93%
video_3	678	91.87%
video_4	664	89.97%
video_5	654	88.62%
headline1_answer	589	79.81%
headline2_answer	556	75.34%
per_unfollow	497	67.34%
enjoy_most	446	60.43%
enjoy_least	422	57.18%
demog_age	440	59.62%
demog_gender	420	56.91%
demog_ethnic	379	51.36%
demog_income	339	45.93%
demog_education	314	42.55%
demog_religion	273	36.99%
demog_political	246	33.33%
amazon_credit	240	32.52%

9 Treatment Analysis

estimate_effect <- function(df, outcome, outcome_name){
  
  
  # get mean and std. errors by treatment for plotting
  eq <- as.formula(paste(outcome, " ~ 0+ treatment"))
  lm <- lm_robust(eq, data = df)
  
  # make df out of lm results
  plot <- tidy(lm) 
  
  gg <- ggplot(plot, aes(x = term, y = estimate))  + 
    geom_point(position = position_dodge(width = 0.5), size = 3) +
    geom_pointrange(
      aes(x = term,ymin = estimate - 1.96 * std.error,
          ymax = estimate + 1.96 * std.error),
      width = .1,
      position = position_dodge(width = 0.5)) + 
    # function to clean label names
    scale_x_discrete(labels = function(x){
      label <- str_remove(x, "treatment")
      label <- label_fun(label)
      return(label)
      }) + 
    labs(x = 'Treatment', 
         y = 'Estimate', 
         title = paste0('Treatment Effect Estimate on ', outcome_name)) +
    theme_sarah() +
    scale_y_continuous(limits = c(0, 1), 
                       breaks = seq(0, 1, 0.2))


  
  print(gg)
    
  # get difference between treatments estimates
  eq <- as.formula(paste(outcome, " ~  treatment"))
  lm2 <- lm_robust(eq, data = df)
  
  out <- tidy(lm2)

  video_n = sum(df$treatment == "video", na.rm = T)
  game_n = sum(df$treatment == "game", na.rm = T)
  sms_n = sum(df$treatment == "sms", na.rm = T)
  control_n = sum(df$treatment == "control", na.rm = T)

  plot %>%
    select(term, estimate_value  = estimate, std.error_value = std.error) %>%
    mutate(estimate_std_value = paste(round_4(estimate_value), "\n(", round_4(std.error_value), ")", sep = "")) %>%
    select(!estimate_value & !std.error_value) %>%
    merge(out[2:4, c("term", "estimate", 
                     "std.error", "p.value", 
                     "conf.low", "conf.high")], by = "term", all.x = T) %>%
    mutate(estimate_std_diff = paste(round_4(estimate), "\n(", round_4(std.error), ")", sep = "")) %>%
    select(!estimate & !std.error) %>%
    mutate(conf = paste("(", round_4(conf.low), ", ", round_4(conf.high), ")", sep = ""), 
           n =c(control_n, game_n, sms_n, video_n)) %>%
    select(!conf.low & !conf.high) %>%
    mutate(term = str_remove(term, "treatment"), 
           term = factor(term, c("game", "sms", "video", "control"))) %>%
    arrange(term) %>%
    mutate(p.value = ifelse(term == "control", "-", p.value), 
           estimate_std_diff = ifelse(term == "control", "-", estimate_std_diff), 
           conf = ifelse(term == "control", "-", conf)) %>%
    select("Treatment"= term,
           "Estimate" = estimate_std_value, 
           "N" = n, 
           "Difference from Control" = estimate_std_diff, 
           "95% Conf. Int." = conf, 
           "p-value" = p.value) %>%
  kable(row.names = F, escape = T) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  print()
}

9.1 Outcome 1: Outcome Binary

estimate_effect(df_sim, "fb_outcome_binary", "Outcome Binary")

Treatment	Estimate	N	Difference from Control	95% Conf. Int.	p-value
game	0.6148 (0.0178)	745	0.0093 (0.0192)	(-0.0284, 0.0471)	0.627041234059019
sms	0.6074 (0.0177)	759	0.0020 (0.0191)	(-0.0356, 0.0395)	0.918401926499294
video	0.5664 (0.0183)	738	-0.0390 (0.0196)	(-0.0775, -0.0006)	0.0467737948032825
control	0.6054 (0.0072)	4615

9.2 Outcome 2: Outcome Continuous

estimate_effect(df_sim, "fb_outcome_continuous", "Outcome Continuous")

Treatment	Estimate	N	Difference from Control	95% Conf. Int.	p-value
game	0.4971 (0.0105)	745	-0.0036 (0.0113)	(-0.0258, 0.0186)	0.749572344614976
sms	0.5120 (0.0103)	759	0.0112 (0.0112)	(-0.0107, 0.0331)	0.314803204222693
video	0.5003 (0.0105)	738	-0.0004 (0.0113)	(-0.0225, 0.0217)	0.969258083314189
control	0.5007 (0.0042)	4615

10 Subgroup Analysis

10.1 Pre-defined

Age Group : 18-24, 25-34, 35-44, 45-54, 55-64, 65+

Education: High school or less; Some college; Bachelor’s degree; Graduate degree

Gender: Man, Woman, Non-binary

Political: Republic, Independent, Democrat

  # get mean and std. errors by treatment for plotting
tests <- combn(sort(unique(df_sim$treatment)), 2, simplify = F)

covariates <-  c("fb_demog_gender_man",  "fb_demog_age")

outcomes <- c("fb_outcome_binary", "fb_outcome_continuous")

subgroup_analysis <- function(covariates, outcomes, tests){

  outlist <- list()
  for(t in 1:length(tests)){
    
    for (cov in covariates){
      
      for(outcome in outcomes){
  
        test = tests[[t]]
        
        df_test <- df_sim[df_sim$treatment %in% test, c("treatment", cov, outcome)]
        form <- as.formula(paste(outcome, " ~ 0 +", " treatment * ", cov))
        model <- tidy(lm_robust(form, data = df_test))
          
        means <- model[1:2, "estimate"]
        names(means) <- test
        
        outlist[[paste(t,cov,outcome, sep = "")]] <- cbind(t(as.matrix(means)), model[3, c("outcome", "estimate", "std.error", "conf.low", "conf.high", "p.value")], nobs = nrow(df_test), covariate = cov)
      }
    }
  }
  
  final <- do.call("smartbind", outlist)
  
  rownames(final) <- NULL
  
  final_out <- final[, c("outcome", "covariate", "control", "game", "sms", "video", "estimate", "std.error", "conf.low", "conf.high", "p.value", "nobs")] %>%
    arrange(outcome, covariate)
  
  return(final_out)
  
}
    options(knitr.kable.NA = '')

subgroup <- subgroup_analysis(covariates, outcomes, tests) 

subgroup%>%
  kable(digits = 4)%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

outcome	covariate	control	game	sms	video	estimate	std.error	conf.low	conf.high	p.value	nobs
fb_outcome_binary	fb_demog_age	0.6042	0.6363			0.0000	0.0003	-0.0006	0.0006	0.9444	5360
fb_outcome_binary	fb_demog_age	0.6042		0.6168		0.0000	0.0003	-0.0006	0.0006	0.9444	5374
fb_outcome_binary	fb_demog_age	0.6042			0.6390	0.0000	0.0003	-0.0006	0.0006	0.9444	5353
fb_outcome_binary	fb_demog_age		0.6363	0.6168		-0.0004	0.0007	-0.0018	0.0011	0.6132	1504
fb_outcome_binary	fb_demog_age		0.6363		0.6390	-0.0004	0.0007	-0.0018	0.0011	0.6132	1483
fb_outcome_binary	fb_demog_age			0.6168	0.6390	-0.0002	0.0007	-0.0016	0.0013	0.8216	1497
fb_outcome_binary	fb_demog_gender_man	0.6111	0.6070			-0.0132	0.0145	-0.0417	0.0153	0.3640	5360
fb_outcome_binary	fb_demog_gender_man	0.6111		0.6048		-0.0132	0.0145	-0.0417	0.0153	0.3640	5374
fb_outcome_binary	fb_demog_gender_man	0.6111			0.5728	-0.0132	0.0145	-0.0417	0.0153	0.3640	5353
fb_outcome_binary	fb_demog_gender_man		0.6070	0.6048		0.0202	0.0366	-0.0516	0.0920	0.5813	1504
fb_outcome_binary	fb_demog_gender_man		0.6070		0.5728	0.0202	0.0366	-0.0516	0.0920	0.5813	1483
fb_outcome_binary	fb_demog_gender_man			0.6048	0.5728	0.0059	0.0357	-0.0642	0.0759	0.8697	1497
fb_outcome_continuous	fb_demog_age	0.5114	0.5101			-0.0002	0.0002	-0.0005	0.0002	0.3027	5360
fb_outcome_continuous	fb_demog_age	0.5114		0.4743		-0.0002	0.0002	-0.0005	0.0002	0.3027	5374
fb_outcome_continuous	fb_demog_age	0.5114			0.4967	-0.0002	0.0002	-0.0005	0.0002	0.3027	5353
fb_outcome_continuous	fb_demog_age		0.5101	0.4743		-0.0002	0.0004	-0.0011	0.0006	0.6089	1504
fb_outcome_continuous	fb_demog_age		0.5101		0.4967	-0.0002	0.0004	-0.0011	0.0006	0.6089	1483
fb_outcome_continuous	fb_demog_age			0.4743	0.4967	0.0007	0.0004	-0.0002	0.0015	0.1310	1497
fb_outcome_continuous	fb_demog_gender_man	0.4940	0.5073			0.0156	0.0085	-0.0011	0.0324	0.0674	5360
fb_outcome_continuous	fb_demog_gender_man	0.4940		0.5069		0.0156	0.0085	-0.0011	0.0324	0.0674	5374
fb_outcome_continuous	fb_demog_gender_man	0.4940			0.5133	0.0156	0.0085	-0.0011	0.0324	0.0674	5353
fb_outcome_continuous	fb_demog_gender_man		0.5073	0.5069		-0.0264	0.0212	-0.0680	0.0153	0.2145	1504
fb_outcome_continuous	fb_demog_gender_man		0.5073		0.5133	-0.0264	0.0212	-0.0680	0.0153	0.2145	1483
fb_outcome_continuous	fb_demog_gender_man			0.5069	0.5133	0.0114	0.0207	-0.0293	0.0520	0.5832	1497

The following code prints the table/figures for gender. We can repeat it for other groups we want to look at.

df_test <- df_sim[df_sim$treatment %in% c("control", "game"), c("treatment", "fb_demog_gender_man", "fb_outcome_continuous")]
form <- as.formula(paste("fb_outcome_continuous", " ~ 0 +", " treatment * fb_demog_gender_man"))
model <- tidy(lm_robust(form, data = df_test))

df_sim %>%
  select(fb_demog_gender_man, treatment, fb_outcome_continuous) %>%
  filter(!is.na(treatment)) %>%
  group_by(treatment, fb_demog_gender_man) %>% 
  summarize_all(list(mean = mean, sd =sd, n = length)) %>%
  mutate(se = sd/sqrt(n))%>%
  ggplot(aes(group = treatment, color = as.factor(fb_demog_gender_man))) + 
  geom_segment(aes(x = mean -1.96 * se, xend = mean + 1.96 *se, y = fb_demog_gender_man, yend = fb_demog_gender_man)) + 
  geom_point(aes(x = mean, y = fb_demog_gender_man)) + 
  facet_grid(rows =vars(treatment), switch = "y") + 
  scale_y_continuous(breaks = c(0, 1), 
                     limits = c(-.25, 1.25)) + 
  theme_sarah() + 
  labs(y = "", x = "Continuous Outcome", title = "Continuous Outcome for Gender\nBy Treatment Group") + 
    theme(panel.grid.minor.y= element_blank(), 
          axis.text.y = element_blank(), 
          strip.text.y.left = element_text(angle = 0), 
          legend.position = "none")

subgroup %>%
  filter(outcome == "fb_outcome_continuous" & covariate == "fb_demog_gender_man") %>%
  kable(digits = 4)%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

outcome	covariate	control	game	sms	video	estimate	std.error	conf.low	conf.high	p.value	nobs
fb_outcome_continuous	fb_demog_gender_man	0.494	0.5073			0.0156	0.0085	-0.0011	0.0324	0.0674	5360
fb_outcome_continuous	fb_demog_gender_man	0.494		0.5069		0.0156	0.0085	-0.0011	0.0324	0.0674	5374
fb_outcome_continuous	fb_demog_gender_man	0.494			0.5133	0.0156	0.0085	-0.0011	0.0324	0.0674	5353
fb_outcome_continuous	fb_demog_gender_man		0.5073	0.5069		-0.0264	0.0212	-0.0680	0.0153	0.2145	1504
fb_outcome_continuous	fb_demog_gender_man		0.5073		0.5133	-0.0264	0.0212	-0.0680	0.0153	0.2145	1483
fb_outcome_continuous	fb_demog_gender_man			0.5069	0.5133	0.0114	0.0207	-0.0293	0.0520	0.5832	1497

11 Appendix

11.1 Covariate Balance

# Make nice labels for plot
labels <- data.frame(covariate = c(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog_(gender|education|age_group)_.+")],
                                   "fb_demog_age"))
labels$covariate_label <- str_remove(labels$covariate, "fb_demog_")
labels$covariate_group <- str_to_title(str_extract(labels$covariate, "age$|age_group|education|gender"))

labels$covariate_label <- str_to_title(str_remove(labels$covariate_label, "(education|gender|age_group)_"))

labels$covariate_label <- factor(labels$covariate_label, levels = labels$covariate_label)

# standardized mean difference function
# https://statisticaloddsandends.wordpress.com/2021/10/31/standardized-mean-difference-smd-in-causal-inference/
# https://cran.r-project.org/web/packages/TOSTER/vignettes/SMD_calcs.html#:~:text=The%20SMD%20is%20then%20the,d%3D%CB%89xs
smd_fun <- function(mean_1, mean_0, var_1, var_0){
  
  smd <- (mean_1 - mean_0)/(sqrt((var_1 + var_0)/2))
  return(smd)
  
}

# standardized mean difference function by covaraite

smd_by_covariate <- function(data, groups, covs){

  smd_list <- list()
  
  # combinations of all covariate - groups to calculate means for
  combos <- expand.grid(covs, groups)
  
  for(i in 1:nrow(combos)){
    
    combo_i<- combos[i, ]
    
    cov_sym <- combo_i[[1]][[1]]
    cov <- enquo(cov_sym)
    
    group_sym <- combo_i[[2]][[1]]
    group <- enquo(group_sym)

    df_subset <- data %>%
      select(!!cov, !!group) %>%
      filter(!is.na(!!group)) %>%
      group_by(!!group) %>%
      summarize(mean = mean(!!cov), 
                var = var(!!cov)) 
    
    mean_1 = df_subset %>% filter(!!group == 1) %>% pull(mean) 
    mean_0 = df_subset %>% filter(!!group == 0) %>% pull(mean)
    var_1 = df_subset %>% filter(!!group == 1) %>% pull(var)
    var_0 = df_subset %>% filter(!!group == 0) %>% pull(var)
    
    smd_out <- smd_fun(mean_1, mean_0, var_1, var_0)
    
    smd_list[[i]] <- list(group = as_label(group), 
                          covariate = as_label(cov), 
                          smd = smd_out, 
                          mean_1 = mean_1, 
                          mean_0 = mean_0)

  }
  
  smd_list_out <-do.call("rbind.data.frame", smd_list)
  rownames(smd_list_out) <-NULL
  #return(smd_list)
  return(smd_list_out)
  
}



covs <- syms(c(colnames(df_sim)[str_detect(colnames(df_sim), "fb_demog_(gender|education|age_group)_.+")],
               "fb_demog_age"))




groups <- syms(c("full_complete", "treatment_group", "treatment_sms", "treatment_game","treatment_video"))


out <- smd_by_covariate(data = df_sim, groups = groups, covs = covs) %>%
  merge(labels, by = "covariate", all.x = T)

cat(paste("### By Completion"))

11.1.1 By Completion

out[out$group == "full_complete", ]  %>%
  ggplot(aes(x = smd, y = covariate_label))  +
  geom_vline(xintercept = 0.1, linetype = "dashed") + 
  geom_vline(xintercept = -0.1, linetype = "dashed") + 
  geom_vline(xintercept = 0) + 
  geom_point(size =3) + 
  labs(y = "", x = "Standardize Mean Difference", 
       title = "Difference in Samples by Survey Completion",
       subtitle ="Mean Completers - Mean Non-completers") + 
  theme_sarah()

cat("\n\n\n")

cat(paste("### By Treatment Group" ))

11.1.2 By Treatment Group

out[out$group == "treatment_group", ]  %>%
  ggplot(aes(x = smd, y = covariate_label))  + 
  geom_vline(xintercept = 0.1, linetype = "dashed") + 
  geom_vline(xintercept = -0.1, linetype = "dashed") + 
  geom_vline(xintercept = 0) + 
  geom_point(size =3) + 
  labs(y = "", x = "Standardize Mean Difference", 
       title = "Difference in Samples by Treatment Group",
       subtitle ="Mean Treated Group- Mean Control Group") + 
  theme_sarah()

cat("\n\n\n")

cat(paste("### By Treatment" ))

11.1.3 By Treatment

out[out$group %in% c("treatment_sms", "treatment_game", "treatment_video"), ]  %>%
  ggplot(aes(x = smd, y = covariate_label, color = group, shape = group))  + 
  geom_vline(xintercept = 0.1, linetype = "dashed") + 
  geom_vline(xintercept = -0.1, linetype = "dashed") + 
  geom_vline(xintercept = 0) + 
  geom_point(size =3) + 
  labs(y = "", x = "Standardize Mean Difference", 
       title = "Difference in Samples by Treatment",
       subtitle ="Mean Treatment - Mean Control") + 
  theme_sarah()

cat("\n\n\n")

11.2 Distributions of Continuous Variables

I wrote two functions for distributions of continuous variables and factor variables at multiple treatment comparison levels.

Below I show them for continuous age and discrete gender, but we can apply these to pilot 2 variables as needed.

11.2.1 Age

continuous_plots <- function(variable, label){

  plots <- list()
  

  
  plots[["overall"]] <- ggplot(df_sim[!is.na(df_sim$treatment_group), ], aes(x = get(variable))) +
    geom_density(alpha= 0.1, fill = "black") +
    theme_sarah() + 
    scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) + 
    scale_y_continuous(limits = c(0, 0.035)) +   
    labs(x = label, y = "")
  
    plots[["treatment_group"]] <- ggplot(df_sim[!is.na(df_sim$treatment_group ),], aes(x =  get(variable), 
                                                         group = as.factor(treatment_group), 
                                                         color = as.factor(treatment_group),
                                                         fill = as.factor(treatment_group))) +
    geom_density(alpha= 0.1) +
    scale_fill_manual(values = cb_colors[c(7,9)], labels = c("Control", "Treatment")) +
    scale_color_manual(values = cb_colors[c(7, 9)], labels = c("Control", "Treatment")) +
    theme_sarah() + 
    scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) + 
    scale_y_continuous(limits = c(0, 0.035)) + 
    labs(x = label, y = "", fill = "Treatment Group", color  = "Treatment Group")
  

    plots[["treatment"]] <-ggplot(df_sim[!is.na(df_sim$treatment), ], aes(x =  get(variable), group = treatment, color = treatment, fill = treatment)) +
    geom_density(alpha= 0.1) + 
    scale_fill_manual(values = cb_colors[3:6]) +
    scale_color_manual(values = cb_colors[3:6]) +
    theme_sarah() + 
    scale_x_continuous(limits = c(18, 99), breaks = seq(20, 100, 20)) + 
    scale_y_continuous(limits = c(0, 0.035)) + 
    labs(x = label, 
         y = "", 
         fill = "Treatment", 
         color = "Treatment")
  
    return(plots)

}

cat("#### Overall")

11.2.1.1 Overall

continuous_plots("fb_demog_age", "Age")$overall

cat("\n\n\n")

cat("#### Treatment Group")

11.2.1.2 Treatment Group

continuous_plots("fb_demog_age", "Age")$treatment_group

cat("\n\n\n")

cat("#### Treatment")

11.2.1.3 Treatment

continuous_plots("fb_demog_age", "Age")$treatment

11.2.2 Gender

discrete_plots <- function(variable, label){

  plots <- list()
  

  
  plots[["overall"]] <- df_sim %>%
    filter(!is.na(variable)) %>%
    ggplot( 
       aes(x = get(variable), 
           fill = get(variable))) + 
  geom_bar(width = 0.75) + 
  labs(fill = label, y = "", x = label) + 
  theme_sarah() + 
  theme(legend.position = "none", 
        panel.grid.major.x = element_blank())



  plots[["treatment_group"]] <- df_sim %>%
  filter(!is.na(variable) & !is.na(treatment_group_label)) %>%
  group_by_at(c(variable, "treatment_group_label")) %>% 
  count() %>%
  group_by(treatment_group_label) %>%
  mutate(freq = n/sum(n))%>%
  ggplot(aes(x = treatment_group_label,
             y = freq,
           group = get(variable)))  + 
      geom_bar(aes(fill =treatment_group_label), stat = "identity") +
  facet_grid(~get(variable)) + 
  theme_sarah() + 
    scale_fill_manual(values = cb_colors[c(7,9)]) +
  scale_color_manual(values = cb_colors[c(7, 9)]) +
  theme(legend.position = "none", 
        panel.grid.major.x = element_blank()) + 
  labs(y = "", x= "") 
  

  
    plots[["treatment"]] <-df_sim %>%
      filter(!is.na(variable) & !is.na(treatment)) %>%
    group_by_at(c(variable, "treatment")) %>% 
    count() %>%
    group_by(treatment) %>%
    mutate(freq = n/sum(n))%>%
    ggplot(aes(x = treatment,
               y = freq,
             group = get(variable)))  + 
        geom_bar(aes(fill = treatment), stat = "identity") +
    facet_grid(~get(variable)) + 
    theme_sarah() + 
      scale_fill_manual(values = cb_colors[3:6]) +
    scale_color_manual(values = cb_colors[3:6]) +
    theme(legend.position = "none") + 
    labs(y = "", x= "")
    
    return(plots)
}

cat("#### Overall")

11.2.2.1 Overall

discrete_plots("fb_demog_gender", "Gender")$overall

cat("\n\n\n")

cat("#### Treatment Group")

11.2.2.2 Treatment Group

discrete_plots("fb_demog_gender", "Gender")$treatment_group

cat("\n\n\n")

cat("#### Treatment")

11.2.2.3 Treatment

discrete_plots("fb_demog_gender", "Gender")$treatment

Pilot 2 Analysis

Sarah McDonald

Updated: February 22, 2023