Heterogeneous Policy Analysis

Data

Load Library

library(dplyr)
library(knitr)
library(IRdisplay)
library(kableExtra)
library(vtable)
library(tidyverse)
library(reshape2)
library(ggplot2)
library(softImpute)
library(mltools)
library(data.table)
library(stargazer)
library(factoextra)
library(cluster)
library(estimatr)
library(grf)

Load Data

# Load the data #
main_data = readRDS("df_final.rds")
#main_data = main_data[main_data$att_check == "1",]
main_data$user = as.integer(main_data$user)
main_data$order = as.integer(as.character(main_data$order))
main_data$type = recode(main_data$type,"base"="Base posts",
                                            "tactics" = "Info posts",
                                            "emotion" = "Emotions posts",
                                            "combo" = "Combo posts")
main_data$share = as.numeric(main_data$share == "Yes")
main_data$accuracy = recode(main_data$accuracy,"end"="No nudge","inter"="Nudge")
main_data$treatment = recode(main_data$treatment,"reminder"="Facts baseline",
                                                 "control"="No-course baseline",
                                                 "tactics" = "Info",
                                                 "emotion" = "Emotions",
                                                 "combo" = "Combo")
main_data$treatment = factor(main_data$treatment, levels = c("No-course baseline","Facts baseline","Info","Emotions","Combo"))
main_data$course = recode(main_data$treatment, "No-course baseline" ="Combo")
main_data$course = factor(main_data$course, levels = c("Facts baseline","Info","Emotions","Combo"))
main_data = main_data[with(main_data, order(user, order)), ]
main_data <- main_data %>% mutate(perceived_accuracy = case_when(perceived_accuracy == "Not at all accurate" ~ "0",
                                                    perceived_accuracy == "Not very accurate" ~ "1",
                                                    perceived_accuracy == "Somewhat accurate" ~ "2",
                                                    perceived_accuracy == "Very accurate" ~ "3",
                                                    TRUE ~ as.character(perceived_accuracy)),
                     perceived_accuracy = as.integer(perceived_accuracy))

main_data$social_media_hours = ifelse(is.na(main_data$social_media_hours),"0",main_data$social_media_hours)
main_data$social_media_share = ifelse(is.na(main_data$social_media_share),"0-20%",main_data$social_media_share)

pre_processing = function(data){  
    
# Family 1 subsets to posts that were shared in the pre, then looks at the misinfo counterparts in post #
family_1_data = data[data$type == "Base posts" & data$pre_post == "pre",]
family_1_data$share_post = NA
family_1_data$type_post = NA    

# Identify who shared 0, 1, 2 or 3 base posts in the pre #
groups = aggregate(share~user,family_1_data,sum)
colnames(groups) = c("user","group")
family_1_data = left_join(family_1_data,groups,by="user")
    
# # Followup preprocessing #
# family_1_data_followup = data[!is.na(data$att_check_followup) & data$type == "Base posts" & data$pre_post == "post",]
# family_1_data_followup$share_followup = NA
# family_1_data_followup$type_followup = NA
# 
# # Identify who shared 0, 1, 2 or 3 base posts in the post #
# groups_followup = aggregate(share~user,family_1_data_followup,sum)
# colnames(groups_followup) = c("user","group")
# family_1_data_followup = left_join(family_1_data_followup,groups_followup,by="user")    

# For each user #
for (u in unique(family_1_data$user)){
    # Take the vector of facts that were seen in the pre as base # 
    F = family_1_data$fact[family_1_data$user == u]
    # For each of these facts #
    for (f in F){
        # Take the sharing decision of the misinfo counterpart in the post # 
        family_1_data$share_post[family_1_data$user == u & family_1_data$fact == f] = data$share[data$user == u & data$fact == f & data$pre_post == "post"]
        # Take the post type of the misinfo counterpart in the post #
        family_1_data$type_post[family_1_data$user == u & family_1_data$fact == f] = data$type[data$user == u & data$fact == f & data$pre_post == "post"]}
    
    # F = family_1_data_followup$fact[family_1_data_followup$user == u]
    # # For each of these facts #
    # for (f in F){
    #     # Take the sharing decision of the misinfo counterpart in the followup # 
    #     family_1_data_followup$share_followup[family_1_data_followup$user == u & family_1_data_followup$fact == f] = data$share[data$user == u & data$fact == f & data$pre_post == "followup"]
    #     # Take the post type of the misinfo counterpart in the post #
    #     family_1_data_followup$type_followup[family_1_data_followup$user == u & family_1_data_followup$fact == f] = data$type[data$user == u & data$fact == f & data$pre_post == "followup"]}
}

    
# Subset to those facts that were shared in the pre (as base) #
family_1_data = family_1_data[family_1_data$share == 1,]
# Subset to those facts that were shared in the post (as base) #
# family_1_data_followup = family_1_data_followup[family_1_data_followup$share == 1,]

# Aggregate at the user level by computing averages #
family_1_data_final = aggregate(share_post~user*group*treatment*accuracy*att_check_pre*att_check_post,family_1_data,mean)
# family_1_het_data_final = aggregate(share_post~user*treatment*type_post*att_check_pre*att_check_post,family_1_data,mean)
# family_1_data_followup_final = aggregate(share_followup~user*group*treatment*priming*att_check_pre*att_check_post*att_check_followup,family_1_data_followup,mean)
# family_1_het_data_followup_final = aggregate(share_followup~user*treatment*type_followup*att_check_pre*att_check_post*att_check_followup,family_1_data_followup,mean)
    
family_1_data_final$course = ifelse(family_1_data_final$treatment == "No-course baseline","control",
                             ifelse(family_1_data_final$treatment %in% c("Info","Emotions","Combo"),"treatment",NA))    
    
# Family 2 looks at all misinformation posts #

# # Aggregate at the user level the sharing decisions in the misinfo posts in both #  
# family_2_data_final = aggregate(share~user*treatment*accuracy*pre_post*att_check_pre*att_check_post,data[data$type!="Base posts",],mean)
# # Aggregate at the user level the sharing decisions in the misinfo posts separately in pre and post, and take the difference #  
# family_2_data_final_pre = aggregate(share~user*treatment*accuracy*att_check_pre*att_check_post,data[data$type!="Base posts" & data$pre_post=="pre"& data$treatment %in% c("Info","Emotions","Combo"),],mean)
# family_2_data_final_post = aggregate(share~user*treatment*accuracy*att_check_pre*att_check_post,data[data$type!="Base posts" & data$pre_post=="post" & data$treatment %in% c("Info","Emotions","Combo"),],mean)
# family_2_data_final_post$share_diff = family_2_data_final_post$share - family_2_data_final_pre$share

    
return(family_1_data_final)
}

data_final <- pre_processing(main_data)

all_covs = c("age","gender","education","marital","employment","location","religion","religiosity","social_media_bin","social_media_hours","social_media_share")
categorical_covs = c("gender","education","marital","employment","location","religion","religiosity","social_media_bin","social_media_share")

data_covs = data.frame(lapply(main_data[main_data$order==1,c("user", all_covs)], as.factor))
data_covs$age = as.numeric(as.character(data_covs$age))
data_covs$social_media_hours = as.numeric(as.character(data_covs$social_media_hours))
data_covs$share_pre_all = aggregate(share~user,main_data[main_data$pre_post=="pre",],mean)[,2]
data_covs$share_pre_base = aggregate(share~user,main_data[main_data$pre_post=="pre" & main_data$type == "Base posts",],mean)[,2]
data_covs$share_pre_misinfo = aggregate(share~user,main_data[main_data$pre_post=="pre" & main_data$type != "Base posts",],mean)[,2]
data_covs$accuracy_pre_all = aggregate(perceived_accuracy~user,main_data[main_data$pre_post=="pre",],mean)[,2]
data_covs$accuracy_pre_base = aggregate(perceived_accuracy~user,main_data[main_data$pre_post=="pre" & main_data$type == "Base posts",],mean)[,2]
data_covs$accuracy_pre_misinfo = aggregate(perceived_accuracy~user,main_data[main_data$pre_post=="pre" & main_data$type != "Base posts",],mean)[,2]
data_covs$treatment = main_data[main_data$order==1,"treatment"]

data_covs$education = ifelse(data_covs$education %in% c("Less than a high school diploma",
                             "High school degree or equivalent"),"High school or less",
                      ifelse(data_covs$education %in% c("Some college, no degree","Associate degree (e.g. AA, AS)"),
                             "Some college",
                      ifelse(data_covs$education %in% c("Master\'s degree (e.g. MA, MS, MEd)",
                                                        "Doctorate or professional degree (e.g. MD, DDS, PhD)"),
                             "Graduate degree","Bachelor\'s degree")))
data_covs$education = factor(data_covs$education,levels=c("High school or less",
                                                          "Some college","Bachelor\'s degree",
                                                          "Graduate degree"))

data_covs$employment = ifelse(data_covs$employment %in% c("Unemployed not currently looking for work",
                             "Unemployed and currently looking for work","Homemaker","Unable to work","Retired"),"Unemployed",
                      ifelse(data_covs$employment %in% c("Employed part time (up to 29 hours per week)",
                            "Employed full time (30 or more hours per week)","Self-employed"),"Employed","Student"))

data_covs$employment = factor(data_covs$employment,levels=c("Employed","Unemployed","Student"))

data_covs$religiosity = ifelse(data_covs$religiosity %in% c("Less than once a month",
                                  "One to three times per month",
                                  "Once a week",
                                  "More than once a week but less than daily",
                                  "Daily"),"Attends","Does not attend")

data_covs$religiosity = factor(data_covs$religiosity,levels=c("Attends","Does not attend"))

all_covs = c(all_covs,"share_pre_all","share_pre_base","share_pre_misinfo", "accuracy_pre_all","accuracy_pre_base","accuracy_pre_misinfo")
all_covs_fancy = c(""," ","Educational Attainment","  ","Employment Status","Neighborhood","   ",
                  "    ","     ","      ","Prop. of content shared", "Pre-survey Sharing",
                   "Pre-survey Sharing","Pre-survey Sharing", "Pre-survey Accuracy", "Pre-survey Accuracy", "Pre-survey Accuracy")

# Defines names and display names of variables #
covariates_continuous = c("age","social_media_hours","share_pre_all","share_pre_base","share_pre_misinfo", "accuracy_pre_all","accuracy_pre_base","accuracy_pre_misinfo")
                          
covariates_continuous_fancy = c("Age","Hours per day spent in social media",
                                "All posts (Sharing)",
                                "Non-misinformation posts (Sharing)",
                                "Misinformation posts (Sharing)",
                                "All posts (Accuracy)",
                                "Non-misinformation posts (Accuracy)",
                                "Misinformation posts (Accuracy)")

covariates_binary = c("gender_Man",
                    paste("education",c("High school or less","Some college","Bachelor\'s degree","Graduate degree"),sep="_"),
                    "marital_Married, or in a domestic partnership",
                    paste("employment",c("Unemployed","Employed","Student"),sep="_"),
                    paste("location",c("Mostly urban","Suburban","Mostly rural"),sep="_"),
                    "religion_Christian",
                    "religiosity_Attends",
                    "social_media_bin_Yes",
                    paste("social_media_share",c("0-20%","20-40%","40-60%","60-80%","80-100%"),sep="_"))


covariates_binary_fancy = c("Man",
                            "High school or less",
                            "Some college",
                            "Bachelor\'s degree",
                            "Graduate degree",
                            "Married",
                            "Unemployed",
                            "Employed",
                            "Student",
                            "Mostly urban","Suburban","Mostly rural",
                            "Christian",
                            "Attends religious services",
                            "Uses social media",
                            "0-20%","20-40%","40-60%","60-80%","80-100%")

covariates_all = c("age",covariates_binary[!startsWith(covariates_binary,"social_media_share")],"social_media_hours",
                   covariates_binary[startsWith(covariates_binary,"social_media_share")],
                   covariates_continuous[3:length(covariates_continuous)])
covariates_all_fancy = c("Age",covariates_binary_fancy[!endsWith(covariates_binary_fancy,"%")],"Hours per day spent in social media",
                         covariates_binary_fancy[endsWith(covariates_binary_fancy,"%")],
                         covariates_continuous_fancy[3:length(covariates_continuous_fancy)])

# Adds the level 'missing' to factor variables #
addmissing = function(x){
  if(is.factor(x)) return(factor(x, levels=c(levels(x), "missing")))
  return(x)}

# Add missing category to factors #
data_covs = as.data.frame(lapply(data_covs, addmissing))
data_covs = data_covs %>%
mutate_if(is.factor, ~replace_na(., "missing"))
data_covs = droplevels(data_covs)

# One-hot encoding the factor covariates #
setDT(data_covs)
data_covs = one_hot(data_covs, cols = c(categorical_covs))
setDF(data_covs)
data_covs = subset(data_covs, select=c("user", covariates_binary,covariates_continuous,
                                       "treatment"))

cov_groups_num = c()
for (cov in all_covs){
    cov_groups_num = c(cov_groups_num,sum(startsWith(colnames(data_covs),cov)))}

cov_groups = rep(all_covs_fancy,cov_groups_num)

data_final$user <- as.factor(data_final$user)

data <- left_join(data_final, data_covs, by = c("user", "treatment"))

Emotions v. Info

Outcome is the POST-survey average sharing of misinformation posts.

get_benefit_table = function(pi.hat, Y, W, counter_treatment_name){
    
    A = pi.hat == 1 
    
    # Group assigned to Emotions
    freq = mean(pi.hat==0)*100
    targeted_policy.est = mean(Y[W==0 & pi.hat==0])
    targeted_policy.stderr = sqrt(var(Y[W==0 & pi.hat==0])/sum(W==0 & pi.hat==0))

    t3_policy.est = mean(Y[W==0 & pi.hat==0])
    t3_policy.stderr = sqrt(var(Y[W==0 & pi.hat==0])/sum(W==0 & pi.hat==0))
    benefit_over_t3 = targeted_policy.est-t3_policy.est
    benefit_over_t3.stderr = c("")
    benefit_over_t3.pvalue = c("")

    t2_policy.est = mean(Y[W==1 & pi.hat==0])
    t2_policy.stderr = sqrt(var(Y[W==1 & pi.hat==0])/sum(W==1 & pi.hat==0))
    benefit_over_t2 = (-mean(Y[!A & (W==1)])+mean(Y[!A & (W==0)]))
    benefit_over_t2.stderr = round(sqrt(var(Y[A & (W==1)]) /sum(A & (W==1)) + var(Y[A & (W==0)]) / sum(A & W==0)),3)
    benefit_over_t2.pvalue = round(2*pnorm(-abs(benefit_over_t2)/benefit_over_t2.stderr),3)

    benefit_pi.hat.0 = data.frame(freq, targeted_policy.est, targeted_policy.stderr, t3_policy.est, t3_policy.stderr, benefit_over_t3, benefit_over_t3.stderr, benefit_over_t3.pvalue, t2_policy.est, t2_policy.stderr, benefit_over_t2, benefit_over_t2.stderr, benefit_over_t2.pvalue)
    rownames(benefit_pi.hat.0) = c("Group assigned to Emotions")

    # Group assigned to Non-Emotions
    freq = mean(pi.hat==1)*100
    targeted_policy.est = mean(Y[W==1 & pi.hat==1])
    targeted_policy.stderr = sqrt(var(Y[W==1 & pi.hat==1])/sum(W==1 & pi.hat==1))

    t3_policy.est = mean(Y[W==0 & pi.hat==1])
    t3_policy.stderr = sqrt(var(Y[W==0 & pi.hat==1])/sum(W==0 & pi.hat==1))
    benefit_over_t3 =  (mean(Y[A & (W==1)]) - mean(Y[A & (W==0)]))
    benefit_over_t3.stderr = round(sqrt(var(Y[A & (W==1)]) / sum(A & (W==1)) + var(Y[A & (W==0)]) / sum(A & W==0)),3)
    benefit_over_t3.pvalue = round(2*pnorm(-abs(benefit_over_t3)/benefit_over_t3.stderr),3)

    t2_policy.est = mean(Y[W==1 & pi.hat==1])
    t2_policy.stderr = sqrt(var(Y[W==1 & pi.hat==1])/sum(W==1 & pi.hat==1))
    benefit_over_t2 =  targeted_policy.est-t2_policy.est
    benefit_over_t2.stderr = c("") 
    benefit_over_t2.pvalue = c("") 

    benefit_pi.hat.1 = data.frame(freq, targeted_policy.est, targeted_policy.stderr, t3_policy.est, t3_policy.stderr, benefit_over_t3, benefit_over_t3.stderr, benefit_over_t3.pvalue, t2_policy.est, t2_policy.stderr, benefit_over_t2, benefit_over_t2.stderr, benefit_over_t2.pvalue)
    rownames(benefit_pi.hat.1) = c(paste0("Group assigned to ", counter_treatment_name))

    # ALL
    freq = 100
    targeted_policy.est = mean(Y[A & (W==1)]) * mean(A) + mean(Y[!A & (W==0)]) * mean(!A)
    targeted_policy.stderr = sqrt(var(Y[A & (W==1)]) / sum(A & (W==1)) * mean(A)^2 + var(Y[!A & (W==0)]) / sum(!A & W==0) * mean(!A)^2)

    t3_policy.est = mean(Y[W==0])
    t3_policy.stderr = sqrt(var(Y[W==0])/sum(W==0))
    benefit_over_t3 =  (mean(Y[A & (W==1)]) - mean(Y[A & (W==0)])) * mean(A)
    benefit_over_t3.stderr = round(sqrt(var(Y[A & (W==1)]) / sum(A & (W==1)) + var(Y[A & (W==0)]) / sum(A & W==0)) * mean(A)^2,3)
    benefit_over_t3.pvalue = round(2*pnorm(-abs(benefit_over_t3)/benefit_over_t3.stderr),3)

    t2_policy.est = mean(Y[W==1])
    t2_policy.stderr = sqrt(var(Y[W==1])/sum(W==1))
    benefit_over_t2 =  (-mean(Y[!A & (W==1)]) +mean(Y[!A & (W==0)])) * mean(!A)
    benefit_over_t2.stderr = round(sqrt(var(Y[!A & (W==1)]) / sum(!A & (W==1)) + var(Y[!A & (W==0)]) / sum(!A & W==0)) * mean(!A)^2,3)
    benefit_over_t2.pvalue = round(2*pnorm(-abs(benefit_over_t2)/benefit_over_t2.stderr),3)

    benefit_all = data.frame(freq, targeted_policy.est, targeted_policy.stderr, t3_policy.est, t3_policy.stderr, benefit_over_t3, benefit_over_t3.stderr, benefit_over_t3.pvalue, t2_policy.est, t2_policy.stderr, benefit_over_t2, benefit_over_t2.stderr, benefit_over_t2.pvalue)
    rownames(benefit_all) = c("All")

    benefit_table = rbind(benefit_pi.hat.0, benefit_pi.hat.1, benefit_all)

    return(benefit_table)
}

data_info <- data %>% filter(treatment == "Emotions" | treatment == "Info")
# Define binary treatment where Info is 1 and Emotions is 0
data_info$W <- ifelse(data_info$treatment=="Emotions", 0, 1)

# Define outcome
outcome = c("share_post")

# Define covariates to consider
covariates = c(covariates_binary,covariates_continuous)

# Define number of observations
n = nrow(data_info)

RATE

# Setup data
X= data_info[,covariates]
Y= data_info[,outcome]
W = data_info[,"W"]

# For cross-fitting
num.folds = 10
folds = sort(seq(n) %% num.folds) + 1

# Train a causal forest to estimate a CATE based priority ranking
train = sample(1:nrow(X), nrow(X) / 2)
cf.priority = causal_forest(X[train, ], Y[train], W[train], W.hat=0.5, tune.parameters = "all")

# Compute a prioritization based on estimated treatment effects.
priority.cate = predict(cf.priority, X[-train, ])$predictions

# Estimate AUTOC on held out data.
cf.eval = causal_forest(X[-train, ], Y[-train], W[-train], W.hat=0.5, tune.parameters = "all")
rate = rank_average_treatment_effect(cf.eval, priority.cate)
rate

##   estimate    std.err             target
##  0.0368327 0.01752265 priorities | AUTOC

plot(rate)

Nonparametric Policy

forest = causal_forest(X, Y, W, W.hat=.5,  tune.parameters = "all", clusters=folds, min.node.size = 200)

calibration.table = test_calibration(forest)
calibration.table

## 
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
## 
##                                Estimate Std. Error t value    Pr(>t)    
## mean.forest.prediction          1.00397    0.13478  7.4490 6.057e-14 ***
## differential.forest.prediction -4.96773    1.19641 -4.1522         1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

tuning_output = forest$tuning.output 
tuning_output

## Tuning status: tuned.
## This indicates tuning found parameters that are expected to perform better than default. 
## 
## Predicted debiased error: 0.143946972094571
## 
## Tuned parameters: 
## sample.fraction: 0.264950318855699
##  mtry: 2
##  min.node.size: 1
##  honesty.fraction: 0.524321214878
##  honesty.prune.leaves: 0
##  alpha: 0.211763351166155
##  imbalance.penalty: 1.32448013451888
## 
## Average error by 5-quantile:
## 
##  sample.fraction     error
##     [0.05,0.142] 0.1442010
##    (0.142,0.225] 0.1441933
##     (0.225,0.32] 0.1441838
##     (0.32,0.417] 0.1442817
##      (0.417,0.5] 0.1442756
## 
##     mtry     error
##    [1,5] 0.1440616
##   (5,10] 0.1441385
##  (10,14] 0.1442478
##  (14,20] 0.1443417
##  (20,26] 0.1443808
## 
##  min.node.size     error
##          [1,2] 0.1444373
##          (2,7] 0.1443864
##         (7,22] 0.1442472
##        (22,67] 0.1440582
##       (67,194] 0.1439849
## 
##  honesty.fraction     error
##       [0.5,0.558] 0.1441100
##     (0.558,0.619] 0.1441702
##     (0.619,0.683] 0.1442282
##     (0.683,0.742] 0.1442572
##       (0.742,0.8] 0.1443697
## 
##  honesty.prune.leaves     error
##                     0 0.1442337
##                     1 0.1442198
## 
##              alpha     error
##  [9.01e-05,0.0484] 0.1442593
##     (0.0484,0.103] 0.1442496
##       (0.103,0.15] 0.1442512
##       (0.15,0.198] 0.1441921
##       (0.198,0.25] 0.1441832
## 
##  imbalance.penalty     error
##   [0.000323,0.242] 0.1444127
##      (0.242,0.542] 0.1443363
##      (0.542,0.982] 0.1442277
##       (0.982,1.64] 0.1441084
##        (1.64,6.51] 0.1440503

# Get "out-of-bag" predictions
tau.hat = predict(forest)$predictions

# Nonparametric policy: CATE>0 --> assign to treatment
pi.hat = as.numeric(tau.hat < 0)

# We can use the entire data because predictions are out-of-bag
A = pi.hat == 1

# Retrieve relevant quantities to compute AIPW scores
e.hat = forest$W.hat # P[W=1|X]
mu.hat.1 = forest$Y.hat + (1 - e.hat) * tau.hat  # E[Y|X,W=1] = E[Y|X] + (1 - e(X)) * tau(X)
mu.hat.0 = forest$Y.hat - e.hat * tau.hat        # E[Y|X,W=0] = E[Y|X] - e(X) * tau(X)

# Compute AIPW scores.
gamma.hat.1 = mu.hat.1 + W / e.hat * (Y - mu.hat.1)
gamma.hat.0 = mu.hat.0 + (1-W) / (1-e.hat) * (Y - mu.hat.0)
gamma.hat.pi = pi.hat * gamma.hat.1 + (1 - pi.hat) * gamma.hat.0

benefit_table_nonparametric_forest = get_benefit_table(pi.hat, Y, W, "Info")
benefit_table_nonparametric_forest %>% 
  kable("html",digits=3,
        caption = "Benefit of targeted policy (nonparametric policy via causal forests) over uniform policies",
        escape = FALSE,
        col.names = c("% of users", "Policy value ", "std.err", "Policy value", "std.err",  "Benefit of targeting", "std.err", "p-value", 
                       "Policy value", " std.err", "Benefit of targeting", "std.err", "p-value"),
       row.names= TRUE) %>%
  add_header_above(c(" " = 2, "Assigned policy" =2, "Uniform Emotions policy" = 5, "Uniform Info policy" = 5)) %>%
  column_spec(4,border_right = TRUE) %>%
  column_spec(9,border_right = TRUE) %>%
  kable_styling(bootstrap_options=c("condensed", "responsive"), full_width=FALSE)

Benefit of targeted policy (nonparametric policy via causal forests) over uniform policies
		Assigned policy		Uniform Emotions policy					Uniform Info policy
	% of users	Policy value	std.err	Policy value	std.err	Benefit of targeting	std.err	p-value	Policy value	std.err	Benefit of targeting	std.err	p-value
Group assigned to Emotions	100	0.43	0.01	0.43	0.01	0			0.496	0.01	-0.066	NA	NA
Group assigned to Info	0	NaN	NA	NaN	NA	NaN	NA	NaN	NaN	NA	NaN
All	100	NaN	NA	0.43	0.01	NaN	NA	NaN	0.496	0.01	-0.066	0.014	0

Emotions v. Combo

Outcome is the POST-survey average sharing of misinformation posts.

data_combo <- data %>% filter(treatment == "Emotions" | treatment == "Combo")
# Define binary treatment where Info is 1 and Emotions is 0
data_combo$W <- ifelse(data_combo$treatment=="Emotions", 0, 1)

# Define outcome
outcome = c("share_post")

# Define covariates to consider
covariates = c(covariates_binary,covariates_continuous)

# Define number of observations
n = nrow(data_combo)

RATE

# Setup data
X= data_combo[,covariates]
Y= data_combo[,outcome]
W = data_combo[,"W"]

# For cross-fitting
num.folds = 10
folds = sort(seq(n) %% num.folds) + 1

# Train a causal forest to estimate a CATE based priority ranking
train = sample(1:nrow(X), nrow(X) / 2)
cf.priority = causal_forest(X[train, ], Y[train], W[train], W.hat=0.5, tune.parameters = "all")

# Compute a prioritization based on estimated treatment effects.
priority.cate = predict(cf.priority, X[-train, ])$predictions

# Estimate AUTOC on held out data.
cf.eval = causal_forest(X[-train, ], Y[-train], W[-train], W.hat=0.5, tune.parameters = "all")
rate = rank_average_treatment_effect(cf.eval, priority.cate)
rate

##    estimate    std.err             target
##  0.01811178 0.01923528 priorities | AUTOC

plot(rate)

Nonparametric Policy

forest = causal_forest(X, Y, W, W.hat=.5,  tune.parameters = "all", clusters=folds, min.node.size = 200)

calibration.table = test_calibration(forest)
calibration.table

## 
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
## 
##                                Estimate Std. Error t value Pr(>t)
## mean.forest.prediction          0.97319    0.80998  1.2015 0.1148
## differential.forest.prediction -0.19728    0.92406 -0.2135 0.5845

tuning_output = forest$tuning.output 
tuning_output

## Tuning status: tuned.
## This indicates tuning found parameters that are expected to perform better than default. 
## 
## Predicted debiased error: 0.145235254416613
## 
## Tuned parameters: 
## sample.fraction: 0.497420803946443
##  mtry: 14
##  min.node.size: 39
##  honesty.fraction: 0.530276759853587
##  honesty.prune.leaves: 1
##  alpha: 0.185503153130412
##  imbalance.penalty: 0.863741314375048
## 
## Average error by 5-quantile:
## 
##  sample.fraction     error
##   [0.0502,0.139] 0.1454069
##    (0.139,0.234] 0.1453804
##    (0.234,0.312] 0.1453618
##    (0.312,0.405] 0.1453130
##      (0.405,0.5] 0.1452764
## 
##     mtry     error
##    [1,6] 0.1452629
##   (6,11] 0.1452862
##  (11,16] 0.1453278
##  (16,21] 0.1454054
##  (21,26] 0.1454749
## 
##  min.node.size     error
##          [1,2] 0.1454607
##          (2,7] 0.1454025
##         (7,21] 0.1453287
##        (21,62] 0.1452732
##       (62,195] 0.1452639
## 
##  honesty.fraction     error
##       [0.5,0.555] 0.1452930
##      (0.555,0.61] 0.1453072
##      (0.61,0.676] 0.1453291
##     (0.676,0.739] 0.1453944
##       (0.739,0.8] 0.1454148
## 
##  honesty.prune.leaves     error
##                     0 0.1453673
##                     1 0.1453286
## 
##              alpha     error
##  [0.000168,0.0478] 0.1453516
##    (0.0478,0.0958] 0.1453346
##     (0.0958,0.151] 0.1453435
##      (0.151,0.202] 0.1453535
##       (0.202,0.25] 0.1453554
## 
##  imbalance.penalty     error
##   [0.000327,0.186] 0.1453892
##      (0.186,0.429] 0.1453687
##      (0.429,0.868] 0.1453691
##       (0.868,1.59] 0.1453204
##        (1.59,6.69] 0.1452911

# Get "out-of-bag" predictions
tau.hat = predict(forest)$predictions

# Nonparametric policy: CATE>0 --> assign to treatment
pi.hat = as.numeric(tau.hat < 0)

# We can use the entire data because predictions are out-of-bag
A = pi.hat == 1

# Retrieve relevant quantities to compute AIPW scores
e.hat = forest$W.hat # P[W=1|X]
mu.hat.1 = forest$Y.hat + (1 - e.hat) * tau.hat  # E[Y|X,W=1] = E[Y|X] + (1 - e(X)) * tau(X)
mu.hat.0 = forest$Y.hat - e.hat * tau.hat        # E[Y|X,W=0] = E[Y|X] - e(X) * tau(X)

# Compute AIPW scores.
gamma.hat.1 = mu.hat.1 + W / e.hat * (Y - mu.hat.1)
gamma.hat.0 = mu.hat.0 + (1-W) / (1-e.hat) * (Y - mu.hat.0)
gamma.hat.pi = pi.hat * gamma.hat.1 + (1 - pi.hat) * gamma.hat.0

benefit_table_nonparametric_forest = get_benefit_table(pi.hat, Y, W, "Combo")
benefit_table_nonparametric_forest %>% 
  kable("html",digits=3,
        caption = "Benefit of targeted policy (nonparametric policy via causal forests) over uniform policies",
        escape = FALSE,
        col.names = c("% of users", "Policy value ", "std.err", "Policy value", "std.err",  "Benefit of targeting", "std.err", "p-value", 
                       "Policy value", " std.err", "Benefit of targeting", "std.err", "p-value"),
       row.names= TRUE) %>%
  add_header_above(c(" " = 2, "Assigned policy" =2, "Uniform Emotions policy" = 5, "Uniform Combo policy" = 5)) %>%
  column_spec(4,border_right = TRUE) %>%
  column_spec(9,border_right = TRUE) %>%
  kable_styling(bootstrap_options=c("condensed", "responsive"), full_width=FALSE)

Benefit of targeted policy (nonparametric policy via causal forests) over uniform policies
		Assigned policy		Uniform Emotions policy					Uniform Combo policy
	% of users	Policy value	std.err	Policy value	std.err	Benefit of targeting	std.err	p-value	Policy value	std.err	Benefit of targeting	std.err	p-value
Group assigned to Emotions	90.401	0.445	0.010	0.445	0.010	0.000			0.476	0.011	-0.031	0.042	0.46
Group assigned to Combo	9.599	0.227	0.029	0.298	0.031	-0.070	0.042	0.093	0.227	0.029	0.000
All	100.000	0.424	0.010	0.430	0.010	-0.007	0	0	0.454	0.010	-0.028	0.012	0.019