Load Library
library(dplyr)
library(knitr)
library(IRdisplay)
library(kableExtra)
library(vtable)
library(tidyverse)
library(reshape2)
library(ggplot2)
library(softImpute)
library(mltools)
library(data.table)
library(stargazer)
library(factoextra)
library(cluster)
library(estimatr)
library(grf)Load Data
# Load the data #
main_data = readRDS("df_final.rds")
#main_data = main_data[main_data$att_check == "1",]
main_data$user = as.integer(main_data$user)
main_data$order = as.integer(as.character(main_data$order))
main_data$type = recode(main_data$type,"base"="Base posts",
"tactics" = "Info posts",
"emotion" = "Emotions posts",
"combo" = "Combo posts")
main_data$share = as.numeric(main_data$share == "Yes")
main_data$accuracy = recode(main_data$accuracy,"end"="No nudge","inter"="Nudge")
main_data$treatment = recode(main_data$treatment,"reminder"="Facts baseline",
"control"="No-course baseline",
"tactics" = "Info",
"emotion" = "Emotions",
"combo" = "Combo")
main_data$treatment = factor(main_data$treatment, levels = c("No-course baseline","Facts baseline","Info","Emotions","Combo"))
main_data$course = recode(main_data$treatment, "No-course baseline" ="Combo")
main_data$course = factor(main_data$course, levels = c("Facts baseline","Info","Emotions","Combo"))
main_data = main_data[with(main_data, order(user, order)), ]
main_data <- main_data %>% mutate(perceived_accuracy = case_when(perceived_accuracy == "Not at all accurate" ~ "0",
perceived_accuracy == "Not very accurate" ~ "1",
perceived_accuracy == "Somewhat accurate" ~ "2",
perceived_accuracy == "Very accurate" ~ "3",
TRUE ~ as.character(perceived_accuracy)),
perceived_accuracy = as.integer(perceived_accuracy))main_data$social_media_hours = ifelse(is.na(main_data$social_media_hours),"0",main_data$social_media_hours)
main_data$social_media_share = ifelse(is.na(main_data$social_media_share),"0-20%",main_data$social_media_share)pre_processing = function(data){
# Family 1 subsets to posts that were shared in the pre, then looks at the misinfo counterparts in post #
family_1_data = data[data$type == "Base posts" & data$pre_post == "pre",]
family_1_data$share_post = NA
family_1_data$type_post = NA
# Identify who shared 0, 1, 2 or 3 base posts in the pre #
groups = aggregate(share~user,family_1_data,sum)
colnames(groups) = c("user","group")
family_1_data = left_join(family_1_data,groups,by="user")
# # Followup preprocessing #
# family_1_data_followup = data[!is.na(data$att_check_followup) & data$type == "Base posts" & data$pre_post == "post",]
# family_1_data_followup$share_followup = NA
# family_1_data_followup$type_followup = NA
#
# # Identify who shared 0, 1, 2 or 3 base posts in the post #
# groups_followup = aggregate(share~user,family_1_data_followup,sum)
# colnames(groups_followup) = c("user","group")
# family_1_data_followup = left_join(family_1_data_followup,groups_followup,by="user")
# For each user #
for (u in unique(family_1_data$user)){
# Take the vector of facts that were seen in the pre as base #
F = family_1_data$fact[family_1_data$user == u]
# For each of these facts #
for (f in F){
# Take the sharing decision of the misinfo counterpart in the post #
family_1_data$share_post[family_1_data$user == u & family_1_data$fact == f] = data$share[data$user == u & data$fact == f & data$pre_post == "post"]
# Take the post type of the misinfo counterpart in the post #
family_1_data$type_post[family_1_data$user == u & family_1_data$fact == f] = data$type[data$user == u & data$fact == f & data$pre_post == "post"]}
# F = family_1_data_followup$fact[family_1_data_followup$user == u]
# # For each of these facts #
# for (f in F){
# # Take the sharing decision of the misinfo counterpart in the followup #
# family_1_data_followup$share_followup[family_1_data_followup$user == u & family_1_data_followup$fact == f] = data$share[data$user == u & data$fact == f & data$pre_post == "followup"]
# # Take the post type of the misinfo counterpart in the post #
# family_1_data_followup$type_followup[family_1_data_followup$user == u & family_1_data_followup$fact == f] = data$type[data$user == u & data$fact == f & data$pre_post == "followup"]}
}
# Subset to those facts that were shared in the pre (as base) #
family_1_data = family_1_data[family_1_data$share == 1,]
# Subset to those facts that were shared in the post (as base) #
# family_1_data_followup = family_1_data_followup[family_1_data_followup$share == 1,]
# Aggregate at the user level by computing averages #
family_1_data_final = aggregate(share_post~user*group*treatment*accuracy*att_check_pre*att_check_post,family_1_data,mean)
# family_1_het_data_final = aggregate(share_post~user*treatment*type_post*att_check_pre*att_check_post,family_1_data,mean)
# family_1_data_followup_final = aggregate(share_followup~user*group*treatment*priming*att_check_pre*att_check_post*att_check_followup,family_1_data_followup,mean)
# family_1_het_data_followup_final = aggregate(share_followup~user*treatment*type_followup*att_check_pre*att_check_post*att_check_followup,family_1_data_followup,mean)
family_1_data_final$course = ifelse(family_1_data_final$treatment == "No-course baseline","control",
ifelse(family_1_data_final$treatment %in% c("Info","Emotions","Combo"),"treatment",NA))
# Family 2 looks at all misinformation posts #
# # Aggregate at the user level the sharing decisions in the misinfo posts in both #
# family_2_data_final = aggregate(share~user*treatment*accuracy*pre_post*att_check_pre*att_check_post,data[data$type!="Base posts",],mean)
# # Aggregate at the user level the sharing decisions in the misinfo posts separately in pre and post, and take the difference #
# family_2_data_final_pre = aggregate(share~user*treatment*accuracy*att_check_pre*att_check_post,data[data$type!="Base posts" & data$pre_post=="pre"& data$treatment %in% c("Info","Emotions","Combo"),],mean)
# family_2_data_final_post = aggregate(share~user*treatment*accuracy*att_check_pre*att_check_post,data[data$type!="Base posts" & data$pre_post=="post" & data$treatment %in% c("Info","Emotions","Combo"),],mean)
# family_2_data_final_post$share_diff = family_2_data_final_post$share - family_2_data_final_pre$share
return(family_1_data_final)
}data_final <- pre_processing(main_data)all_covs = c("age","gender","education","marital","employment","location","religion","religiosity","social_media_bin","social_media_hours","social_media_share")
categorical_covs = c("gender","education","marital","employment","location","religion","religiosity","social_media_bin","social_media_share")data_covs = data.frame(lapply(main_data[main_data$order==1,c("user", all_covs)], as.factor))
data_covs$age = as.numeric(as.character(data_covs$age))
data_covs$social_media_hours = as.numeric(as.character(data_covs$social_media_hours))
data_covs$share_pre_all = aggregate(share~user,main_data[main_data$pre_post=="pre",],mean)[,2]
data_covs$share_pre_base = aggregate(share~user,main_data[main_data$pre_post=="pre" & main_data$type == "Base posts",],mean)[,2]
data_covs$share_pre_misinfo = aggregate(share~user,main_data[main_data$pre_post=="pre" & main_data$type != "Base posts",],mean)[,2]
data_covs$accuracy_pre_all = aggregate(perceived_accuracy~user,main_data[main_data$pre_post=="pre",],mean)[,2]
data_covs$accuracy_pre_base = aggregate(perceived_accuracy~user,main_data[main_data$pre_post=="pre" & main_data$type == "Base posts",],mean)[,2]
data_covs$accuracy_pre_misinfo = aggregate(perceived_accuracy~user,main_data[main_data$pre_post=="pre" & main_data$type != "Base posts",],mean)[,2]
data_covs$treatment = main_data[main_data$order==1,"treatment"]data_covs$education = ifelse(data_covs$education %in% c("Less than a high school diploma",
"High school degree or equivalent"),"High school or less",
ifelse(data_covs$education %in% c("Some college, no degree","Associate degree (e.g. AA, AS)"),
"Some college",
ifelse(data_covs$education %in% c("Master\'s degree (e.g. MA, MS, MEd)",
"Doctorate or professional degree (e.g. MD, DDS, PhD)"),
"Graduate degree","Bachelor\'s degree")))
data_covs$education = factor(data_covs$education,levels=c("High school or less",
"Some college","Bachelor\'s degree",
"Graduate degree"))
data_covs$employment = ifelse(data_covs$employment %in% c("Unemployed not currently looking for work",
"Unemployed and currently looking for work","Homemaker","Unable to work","Retired"),"Unemployed",
ifelse(data_covs$employment %in% c("Employed part time (up to 29 hours per week)",
"Employed full time (30 or more hours per week)","Self-employed"),"Employed","Student"))
data_covs$employment = factor(data_covs$employment,levels=c("Employed","Unemployed","Student"))
data_covs$religiosity = ifelse(data_covs$religiosity %in% c("Less than once a month",
"One to three times per month",
"Once a week",
"More than once a week but less than daily",
"Daily"),"Attends","Does not attend")
data_covs$religiosity = factor(data_covs$religiosity,levels=c("Attends","Does not attend"))all_covs = c(all_covs,"share_pre_all","share_pre_base","share_pre_misinfo", "accuracy_pre_all","accuracy_pre_base","accuracy_pre_misinfo")
all_covs_fancy = c(""," ","Educational Attainment"," ","Employment Status","Neighborhood"," ",
" "," "," ","Prop. of content shared", "Pre-survey Sharing",
"Pre-survey Sharing","Pre-survey Sharing", "Pre-survey Accuracy", "Pre-survey Accuracy", "Pre-survey Accuracy")# Defines names and display names of variables #
covariates_continuous = c("age","social_media_hours","share_pre_all","share_pre_base","share_pre_misinfo", "accuracy_pre_all","accuracy_pre_base","accuracy_pre_misinfo")
covariates_continuous_fancy = c("Age","Hours per day spent in social media",
"All posts (Sharing)",
"Non-misinformation posts (Sharing)",
"Misinformation posts (Sharing)",
"All posts (Accuracy)",
"Non-misinformation posts (Accuracy)",
"Misinformation posts (Accuracy)")
covariates_binary = c("gender_Man",
paste("education",c("High school or less","Some college","Bachelor\'s degree","Graduate degree"),sep="_"),
"marital_Married, or in a domestic partnership",
paste("employment",c("Unemployed","Employed","Student"),sep="_"),
paste("location",c("Mostly urban","Suburban","Mostly rural"),sep="_"),
"religion_Christian",
"religiosity_Attends",
"social_media_bin_Yes",
paste("social_media_share",c("0-20%","20-40%","40-60%","60-80%","80-100%"),sep="_"))
covariates_binary_fancy = c("Man",
"High school or less",
"Some college",
"Bachelor\'s degree",
"Graduate degree",
"Married",
"Unemployed",
"Employed",
"Student",
"Mostly urban","Suburban","Mostly rural",
"Christian",
"Attends religious services",
"Uses social media",
"0-20%","20-40%","40-60%","60-80%","80-100%")
covariates_all = c("age",covariates_binary[!startsWith(covariates_binary,"social_media_share")],"social_media_hours",
covariates_binary[startsWith(covariates_binary,"social_media_share")],
covariates_continuous[3:length(covariates_continuous)])
covariates_all_fancy = c("Age",covariates_binary_fancy[!endsWith(covariates_binary_fancy,"%")],"Hours per day spent in social media",
covariates_binary_fancy[endsWith(covariates_binary_fancy,"%")],
covariates_continuous_fancy[3:length(covariates_continuous_fancy)])# Adds the level 'missing' to factor variables #
addmissing = function(x){
if(is.factor(x)) return(factor(x, levels=c(levels(x), "missing")))
return(x)}# Add missing category to factors #
data_covs = as.data.frame(lapply(data_covs, addmissing))
data_covs = data_covs %>%
mutate_if(is.factor, ~replace_na(., "missing"))
data_covs = droplevels(data_covs)# One-hot encoding the factor covariates #
setDT(data_covs)
data_covs = one_hot(data_covs, cols = c(categorical_covs))
setDF(data_covs)
data_covs = subset(data_covs, select=c("user", covariates_binary,covariates_continuous,
"treatment"))cov_groups_num = c()
for (cov in all_covs){
cov_groups_num = c(cov_groups_num,sum(startsWith(colnames(data_covs),cov)))}cov_groups = rep(all_covs_fancy,cov_groups_num)data_final$user <- as.factor(data_final$user)data <- left_join(data_final, data_covs, by = c("user", "treatment"))Outcome is the POST-survey average sharing of misinformation posts.
get_benefit_table = function(pi.hat, Y, W, counter_treatment_name){
A = pi.hat == 1
# Group assigned to Emotions
freq = mean(pi.hat==0)*100
targeted_policy.est = mean(Y[W==0 & pi.hat==0])
targeted_policy.stderr = sqrt(var(Y[W==0 & pi.hat==0])/sum(W==0 & pi.hat==0))
t3_policy.est = mean(Y[W==0 & pi.hat==0])
t3_policy.stderr = sqrt(var(Y[W==0 & pi.hat==0])/sum(W==0 & pi.hat==0))
benefit_over_t3 = targeted_policy.est-t3_policy.est
benefit_over_t3.stderr = c("")
benefit_over_t3.pvalue = c("")
t2_policy.est = mean(Y[W==1 & pi.hat==0])
t2_policy.stderr = sqrt(var(Y[W==1 & pi.hat==0])/sum(W==1 & pi.hat==0))
benefit_over_t2 = (-mean(Y[!A & (W==1)])+mean(Y[!A & (W==0)]))
benefit_over_t2.stderr = round(sqrt(var(Y[A & (W==1)]) /sum(A & (W==1)) + var(Y[A & (W==0)]) / sum(A & W==0)),3)
benefit_over_t2.pvalue = round(2*pnorm(-abs(benefit_over_t2)/benefit_over_t2.stderr),3)
benefit_pi.hat.0 = data.frame(freq, targeted_policy.est, targeted_policy.stderr, t3_policy.est, t3_policy.stderr, benefit_over_t3, benefit_over_t3.stderr, benefit_over_t3.pvalue, t2_policy.est, t2_policy.stderr, benefit_over_t2, benefit_over_t2.stderr, benefit_over_t2.pvalue)
rownames(benefit_pi.hat.0) = c("Group assigned to Emotions")
# Group assigned to Non-Emotions
freq = mean(pi.hat==1)*100
targeted_policy.est = mean(Y[W==1 & pi.hat==1])
targeted_policy.stderr = sqrt(var(Y[W==1 & pi.hat==1])/sum(W==1 & pi.hat==1))
t3_policy.est = mean(Y[W==0 & pi.hat==1])
t3_policy.stderr = sqrt(var(Y[W==0 & pi.hat==1])/sum(W==0 & pi.hat==1))
benefit_over_t3 = (mean(Y[A & (W==1)]) - mean(Y[A & (W==0)]))
benefit_over_t3.stderr = round(sqrt(var(Y[A & (W==1)]) / sum(A & (W==1)) + var(Y[A & (W==0)]) / sum(A & W==0)),3)
benefit_over_t3.pvalue = round(2*pnorm(-abs(benefit_over_t3)/benefit_over_t3.stderr),3)
t2_policy.est = mean(Y[W==1 & pi.hat==1])
t2_policy.stderr = sqrt(var(Y[W==1 & pi.hat==1])/sum(W==1 & pi.hat==1))
benefit_over_t2 = targeted_policy.est-t2_policy.est
benefit_over_t2.stderr = c("")
benefit_over_t2.pvalue = c("")
benefit_pi.hat.1 = data.frame(freq, targeted_policy.est, targeted_policy.stderr, t3_policy.est, t3_policy.stderr, benefit_over_t3, benefit_over_t3.stderr, benefit_over_t3.pvalue, t2_policy.est, t2_policy.stderr, benefit_over_t2, benefit_over_t2.stderr, benefit_over_t2.pvalue)
rownames(benefit_pi.hat.1) = c(paste0("Group assigned to ", counter_treatment_name))
# ALL
freq = 100
targeted_policy.est = mean(Y[A & (W==1)]) * mean(A) + mean(Y[!A & (W==0)]) * mean(!A)
targeted_policy.stderr = sqrt(var(Y[A & (W==1)]) / sum(A & (W==1)) * mean(A)^2 + var(Y[!A & (W==0)]) / sum(!A & W==0) * mean(!A)^2)
t3_policy.est = mean(Y[W==0])
t3_policy.stderr = sqrt(var(Y[W==0])/sum(W==0))
benefit_over_t3 = (mean(Y[A & (W==1)]) - mean(Y[A & (W==0)])) * mean(A)
benefit_over_t3.stderr = round(sqrt(var(Y[A & (W==1)]) / sum(A & (W==1)) + var(Y[A & (W==0)]) / sum(A & W==0)) * mean(A)^2,3)
benefit_over_t3.pvalue = round(2*pnorm(-abs(benefit_over_t3)/benefit_over_t3.stderr),3)
t2_policy.est = mean(Y[W==1])
t2_policy.stderr = sqrt(var(Y[W==1])/sum(W==1))
benefit_over_t2 = (-mean(Y[!A & (W==1)]) +mean(Y[!A & (W==0)])) * mean(!A)
benefit_over_t2.stderr = round(sqrt(var(Y[!A & (W==1)]) / sum(!A & (W==1)) + var(Y[!A & (W==0)]) / sum(!A & W==0)) * mean(!A)^2,3)
benefit_over_t2.pvalue = round(2*pnorm(-abs(benefit_over_t2)/benefit_over_t2.stderr),3)
benefit_all = data.frame(freq, targeted_policy.est, targeted_policy.stderr, t3_policy.est, t3_policy.stderr, benefit_over_t3, benefit_over_t3.stderr, benefit_over_t3.pvalue, t2_policy.est, t2_policy.stderr, benefit_over_t2, benefit_over_t2.stderr, benefit_over_t2.pvalue)
rownames(benefit_all) = c("All")
benefit_table = rbind(benefit_pi.hat.0, benefit_pi.hat.1, benefit_all)
return(benefit_table)
}data_info <- data %>% filter(treatment == "Emotions" | treatment == "Info")
# Define binary treatment where Info is 1 and Emotions is 0
data_info$W <- ifelse(data_info$treatment=="Emotions", 0, 1)
# Define outcome
outcome = c("share_post")
# Define covariates to consider
covariates = c(covariates_binary,covariates_continuous)
# Define number of observations
n = nrow(data_info)# Setup data
X= data_info[,covariates]
Y= data_info[,outcome]
W = data_info[,"W"]
# For cross-fitting
num.folds = 10
folds = sort(seq(n) %% num.folds) + 1# Train a causal forest to estimate a CATE based priority ranking
train = sample(1:nrow(X), nrow(X) / 2)
cf.priority = causal_forest(X[train, ], Y[train], W[train], W.hat=0.5, tune.parameters = "all")
# Compute a prioritization based on estimated treatment effects.
priority.cate = predict(cf.priority, X[-train, ])$predictions
# Estimate AUTOC on held out data.
cf.eval = causal_forest(X[-train, ], Y[-train], W[-train], W.hat=0.5, tune.parameters = "all")
rate = rank_average_treatment_effect(cf.eval, priority.cate)
rate## estimate std.err target
## 0.0368327 0.01752265 priorities | AUTOC
plot(rate)forest = causal_forest(X, Y, W, W.hat=.5, tune.parameters = "all", clusters=folds, min.node.size = 200)calibration.table = test_calibration(forest)
calibration.table##
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
##
## Estimate Std. Error t value Pr(>t)
## mean.forest.prediction 1.00397 0.13478 7.4490 6.057e-14 ***
## differential.forest.prediction -4.96773 1.19641 -4.1522 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tuning_output = forest$tuning.output
tuning_output## Tuning status: tuned.
## This indicates tuning found parameters that are expected to perform better than default.
##
## Predicted debiased error: 0.143946972094571
##
## Tuned parameters:
## sample.fraction: 0.264950318855699
## mtry: 2
## min.node.size: 1
## honesty.fraction: 0.524321214878
## honesty.prune.leaves: 0
## alpha: 0.211763351166155
## imbalance.penalty: 1.32448013451888
##
## Average error by 5-quantile:
##
## sample.fraction error
## [0.05,0.142] 0.1442010
## (0.142,0.225] 0.1441933
## (0.225,0.32] 0.1441838
## (0.32,0.417] 0.1442817
## (0.417,0.5] 0.1442756
##
## mtry error
## [1,5] 0.1440616
## (5,10] 0.1441385
## (10,14] 0.1442478
## (14,20] 0.1443417
## (20,26] 0.1443808
##
## min.node.size error
## [1,2] 0.1444373
## (2,7] 0.1443864
## (7,22] 0.1442472
## (22,67] 0.1440582
## (67,194] 0.1439849
##
## honesty.fraction error
## [0.5,0.558] 0.1441100
## (0.558,0.619] 0.1441702
## (0.619,0.683] 0.1442282
## (0.683,0.742] 0.1442572
## (0.742,0.8] 0.1443697
##
## honesty.prune.leaves error
## 0 0.1442337
## 1 0.1442198
##
## alpha error
## [9.01e-05,0.0484] 0.1442593
## (0.0484,0.103] 0.1442496
## (0.103,0.15] 0.1442512
## (0.15,0.198] 0.1441921
## (0.198,0.25] 0.1441832
##
## imbalance.penalty error
## [0.000323,0.242] 0.1444127
## (0.242,0.542] 0.1443363
## (0.542,0.982] 0.1442277
## (0.982,1.64] 0.1441084
## (1.64,6.51] 0.1440503
# Get "out-of-bag" predictions
tau.hat = predict(forest)$predictions
# Nonparametric policy: CATE>0 --> assign to treatment
pi.hat = as.numeric(tau.hat < 0)
# We can use the entire data because predictions are out-of-bag
A = pi.hat == 1
# Retrieve relevant quantities to compute AIPW scores
e.hat = forest$W.hat # P[W=1|X]
mu.hat.1 = forest$Y.hat + (1 - e.hat) * tau.hat # E[Y|X,W=1] = E[Y|X] + (1 - e(X)) * tau(X)
mu.hat.0 = forest$Y.hat - e.hat * tau.hat # E[Y|X,W=0] = E[Y|X] - e(X) * tau(X)
# Compute AIPW scores.
gamma.hat.1 = mu.hat.1 + W / e.hat * (Y - mu.hat.1)
gamma.hat.0 = mu.hat.0 + (1-W) / (1-e.hat) * (Y - mu.hat.0)
gamma.hat.pi = pi.hat * gamma.hat.1 + (1 - pi.hat) * gamma.hat.0benefit_table_nonparametric_forest = get_benefit_table(pi.hat, Y, W, "Info")
benefit_table_nonparametric_forest %>%
kable("html",digits=3,
caption = "Benefit of targeted policy (nonparametric policy via causal forests) over uniform policies",
escape = FALSE,
col.names = c("% of users", "Policy value ", "std.err", "Policy value", "std.err", "Benefit of targeting", "std.err", "p-value",
"Policy value", " std.err", "Benefit of targeting", "std.err", "p-value"),
row.names= TRUE) %>%
add_header_above(c(" " = 2, "Assigned policy" =2, "Uniform Emotions policy" = 5, "Uniform Info policy" = 5)) %>%
column_spec(4,border_right = TRUE) %>%
column_spec(9,border_right = TRUE) %>%
kable_styling(bootstrap_options=c("condensed", "responsive"), full_width=FALSE) | % of users | Policy value | std.err | Policy value | std.err | Benefit of targeting | std.err | p-value | Policy value | std.err | Benefit of targeting | std.err | p-value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Group assigned to Emotions | 100 | 0.43 | 0.01 | 0.43 | 0.01 | 0 | 0.496 | 0.01 | -0.066 | NA | NA | ||
| Group assigned to Info | 0 | NaN | NA | NaN | NA | NaN | NA | NaN | NaN | NA | NaN | ||
| All | 100 | NaN | NA | 0.43 | 0.01 | NaN | NA | NaN | 0.496 | 0.01 | -0.066 | 0.014 | 0 |
Outcome is the POST-survey average sharing of misinformation posts.
data_combo <- data %>% filter(treatment == "Emotions" | treatment == "Combo")
# Define binary treatment where Info is 1 and Emotions is 0
data_combo$W <- ifelse(data_combo$treatment=="Emotions", 0, 1)
# Define outcome
outcome = c("share_post")
# Define covariates to consider
covariates = c(covariates_binary,covariates_continuous)
# Define number of observations
n = nrow(data_combo)# Setup data
X= data_combo[,covariates]
Y= data_combo[,outcome]
W = data_combo[,"W"]
# For cross-fitting
num.folds = 10
folds = sort(seq(n) %% num.folds) + 1# Train a causal forest to estimate a CATE based priority ranking
train = sample(1:nrow(X), nrow(X) / 2)
cf.priority = causal_forest(X[train, ], Y[train], W[train], W.hat=0.5, tune.parameters = "all")
# Compute a prioritization based on estimated treatment effects.
priority.cate = predict(cf.priority, X[-train, ])$predictions
# Estimate AUTOC on held out data.
cf.eval = causal_forest(X[-train, ], Y[-train], W[-train], W.hat=0.5, tune.parameters = "all")
rate = rank_average_treatment_effect(cf.eval, priority.cate)
rate## estimate std.err target
## 0.01811178 0.01923528 priorities | AUTOC
plot(rate)forest = causal_forest(X, Y, W, W.hat=.5, tune.parameters = "all", clusters=folds, min.node.size = 200)calibration.table = test_calibration(forest)
calibration.table##
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
##
## Estimate Std. Error t value Pr(>t)
## mean.forest.prediction 0.97319 0.80998 1.2015 0.1148
## differential.forest.prediction -0.19728 0.92406 -0.2135 0.5845
tuning_output = forest$tuning.output
tuning_output## Tuning status: tuned.
## This indicates tuning found parameters that are expected to perform better than default.
##
## Predicted debiased error: 0.145235254416613
##
## Tuned parameters:
## sample.fraction: 0.497420803946443
## mtry: 14
## min.node.size: 39
## honesty.fraction: 0.530276759853587
## honesty.prune.leaves: 1
## alpha: 0.185503153130412
## imbalance.penalty: 0.863741314375048
##
## Average error by 5-quantile:
##
## sample.fraction error
## [0.0502,0.139] 0.1454069
## (0.139,0.234] 0.1453804
## (0.234,0.312] 0.1453618
## (0.312,0.405] 0.1453130
## (0.405,0.5] 0.1452764
##
## mtry error
## [1,6] 0.1452629
## (6,11] 0.1452862
## (11,16] 0.1453278
## (16,21] 0.1454054
## (21,26] 0.1454749
##
## min.node.size error
## [1,2] 0.1454607
## (2,7] 0.1454025
## (7,21] 0.1453287
## (21,62] 0.1452732
## (62,195] 0.1452639
##
## honesty.fraction error
## [0.5,0.555] 0.1452930
## (0.555,0.61] 0.1453072
## (0.61,0.676] 0.1453291
## (0.676,0.739] 0.1453944
## (0.739,0.8] 0.1454148
##
## honesty.prune.leaves error
## 0 0.1453673
## 1 0.1453286
##
## alpha error
## [0.000168,0.0478] 0.1453516
## (0.0478,0.0958] 0.1453346
## (0.0958,0.151] 0.1453435
## (0.151,0.202] 0.1453535
## (0.202,0.25] 0.1453554
##
## imbalance.penalty error
## [0.000327,0.186] 0.1453892
## (0.186,0.429] 0.1453687
## (0.429,0.868] 0.1453691
## (0.868,1.59] 0.1453204
## (1.59,6.69] 0.1452911
# Get "out-of-bag" predictions
tau.hat = predict(forest)$predictions
# Nonparametric policy: CATE>0 --> assign to treatment
pi.hat = as.numeric(tau.hat < 0)
# We can use the entire data because predictions are out-of-bag
A = pi.hat == 1
# Retrieve relevant quantities to compute AIPW scores
e.hat = forest$W.hat # P[W=1|X]
mu.hat.1 = forest$Y.hat + (1 - e.hat) * tau.hat # E[Y|X,W=1] = E[Y|X] + (1 - e(X)) * tau(X)
mu.hat.0 = forest$Y.hat - e.hat * tau.hat # E[Y|X,W=0] = E[Y|X] - e(X) * tau(X)
# Compute AIPW scores.
gamma.hat.1 = mu.hat.1 + W / e.hat * (Y - mu.hat.1)
gamma.hat.0 = mu.hat.0 + (1-W) / (1-e.hat) * (Y - mu.hat.0)
gamma.hat.pi = pi.hat * gamma.hat.1 + (1 - pi.hat) * gamma.hat.0benefit_table_nonparametric_forest = get_benefit_table(pi.hat, Y, W, "Combo")
benefit_table_nonparametric_forest %>%
kable("html",digits=3,
caption = "Benefit of targeted policy (nonparametric policy via causal forests) over uniform policies",
escape = FALSE,
col.names = c("% of users", "Policy value ", "std.err", "Policy value", "std.err", "Benefit of targeting", "std.err", "p-value",
"Policy value", " std.err", "Benefit of targeting", "std.err", "p-value"),
row.names= TRUE) %>%
add_header_above(c(" " = 2, "Assigned policy" =2, "Uniform Emotions policy" = 5, "Uniform Combo policy" = 5)) %>%
column_spec(4,border_right = TRUE) %>%
column_spec(9,border_right = TRUE) %>%
kable_styling(bootstrap_options=c("condensed", "responsive"), full_width=FALSE)| % of users | Policy value | std.err | Policy value | std.err | Benefit of targeting | std.err | p-value | Policy value | std.err | Benefit of targeting | std.err | p-value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Group assigned to Emotions | 90.401 | 0.445 | 0.010 | 0.445 | 0.010 | 0.000 | 0.476 | 0.011 | -0.031 | 0.042 | 0.46 | ||
| Group assigned to Combo | 9.599 | 0.227 | 0.029 | 0.298 | 0.031 | -0.070 | 0.042 | 0.093 | 0.227 | 0.029 | 0.000 | ||
| All | 100.000 | 0.424 | 0.010 | 0.430 | 0.010 | -0.007 | 0 | 0 | 0.454 | 0.010 | -0.028 | 0.012 | 0.019 |