This file constructs the functions used to perform permutation-based inference for a state-based policy intervention.
Our first step is to simulate some data. We will draw data based on the following model: \[ Y_{is} = 2 + 0.25 \cdot \textrm{Tx}_s + \epsilon_{is} \] for individual \(i\) in state \(s\). To approximate the SCCS we will have a total of 12 states, with 3 “treatment” states. We’ll assume we are analyzing a dataset with 10,000 observations.
set.seed(3449453)
N <- 10000 # Total Sample Size
N_States <- 12 # Number of States
N_Tx_States <- 3 # Number of Treated States
Tx_States <- sample(1:N_States,size=N_Tx_States,replace=FALSE) # Treated States
S_i <- sample(1:N_States,prob=rep(1/N_States,N_States),size=N,replace=TRUE) # Simulate individual i's state.
Z <- as.integer(S_i %in% Tx_States) # Treatment indicator
Y <- 2 + 0.25*Z + rnorm(n=N) # Outcome
XX <- data.frame(Y=Y,Z=Z,C=S_i) # Data Frame
We will first fit a simple linear model and obtain standard errors and 95% confidence intervals using standard inference methods.
m.fit <- with(XX,lm(Y~Z))
observed_test_statistic <- coef(m.fit)[2]
f0 <- tidy(m.fit) %>% filter(term=="Z") %>%
mutate(model="Standard Inference",
ci_lo = confint(m.fit)[2,1],
ci_hi = confint(m.fit)[2,2]) %>%
select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>%
mutate_each(funs(round(.,4)),c(2,3,5,6)) %>%
mutate(p.value=fixp(p.value))
Dependent variable: | |
Y | |
Z | 0.282*** |
(0.023) | |
Constant | 2.005*** |
(0.012) | |
Observations | 10,000 |
Note: | p<0.1; p<0.05; p<0.01 |
Next we’ll obtain bootstrapped standard errors and 95% confidence intervals using 1,000 bootstrapped samples (with replacement) of our original data.
N_bootstraps = 1000
f1 <- broom::bootstrap(XX, N_bootstraps) %>% do(fit = lm(Y~Z,data=.)) %>% mutate(coef = summary(fit)$coeff[2]) %>%
select(-fit) %>% ungroup() %>%
summarise(estimate=observed_test_statistic,std.error=sd(coef),ci_lo = quantile(coef,0.025),ci_hi=quantile(coef,0.975)) %>%
mutate(model=paste0("Bootstrap (S=",N_bootstraps,")"),
p.value = NA) %>%
select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>%
mutate_each(funs(round(.,4)),c(2,3,5,6))
We’ll now turn to a randomization inference method where we permute treatment status among individuals. The functions below are designed to facilitate treatment status permutation within a given dataset. However, in order to get the process to work correctly a “stacked” version of the dataset is needed. This stacked dataset is essentially two copies of the same dataset; the only difference is that in the first instance, every observation receives a treatment indicator of 1, while in the second instance everyone receives a 0. The permutation function first (randomly) selects an index of treated observations (matching the number of originally treated observations) from the first instance of the data in the stack, and then selects the control index as the complementary set of individuals from the second instance of the data.
XX_1 <- XX %>% mutate(Z_Obs = Z) %>% mutate(Z=1)
XX_0 <- XX %>% mutate(Z_Obs = Z) %>% mutate(Z=0)
XX_Stacked <- rbind(XX_1,XX_0)
# This function randomly permutes indiviudal-level treatment.
permute <- function (df, m, tx="Z",tx_orig = "Z_Obs")
{
n <- nrow(df)/2
n_times_2 <- n*2
n_tx <- df %>% filter(row_number() %in% 1:n) %>% count_(tx_orig) %>% collect() %>% .[["n"]] %>% nth(2)
sampled.index <- function(N=n,N_tx=n_tx,N_Times_2 = n_times_2)
{
sampled_tx_observations <- sample(1:N,N_tx,replace=FALSE)
sampled_cx_observations <- seq(N+1,N_Times_2,1)[-sampled_tx_observations]
out <- c(sampled_tx_observations,sampled_cx_observations)
out
}
attr(df, "indices") <- replicate(m, sampled.index() - 1, simplify = FALSE)
attr(df, "drop") <- TRUE
attr(df, "group_sizes") <- rep(n, m)
attr(df, "biggest_group_size") <- n
attr(df, "labels") <- data.frame(replicate = 1:m)
attr(df, "vars") <- list(quote(replicate))
class(df) <- c("grouped_df", "tbl_df", "tbl", "data.frame")
df
}
# This function permutes treatment based on assignment via a cluster (i.e., state).
permute.cluster <- function (df, m, tx="Z",tx_orig = "Z_Obs", c = "C")
{
n <- nrow(df)/2
n_times_2 <- n*2
df$temp_Tx = df[,tx_orig]
df$temp_C = df[,c]
n_tx <- df %>% filter(row_number() %in% 1:n) %>% filter(temp_Tx==1) %>%
count(temp_C) %>% collect() %>% .[["n"]] %>% length()
cc <- unique(df[,c])
sampled.index <- function(N=n,N_tx=n_tx,N_Times_2 = n_times_2,CC=cc)
{
sampled_clusters <- sample(CC,N_tx,replace=FALSE)
sampled_tx_observations <- df %>% filter(row_number() %in% 1:N) %>%
mutate(sampled_obs=row_number()) %>%
filter(temp_C %in% sampled_clusters) %>% collect() %>% .[["sampled_obs"]]
sampled_cx_observations <- seq(N+1,N_Times_2,1)[-sampled_tx_observations]
out <- c(sampled_tx_observations,sampled_cx_observations)
out
}
attr(df, "indices") <- replicate(m, sampled.index() - 1, simplify = FALSE)
attr(df, "drop") <- TRUE
attr(df, "group_sizes") <- rep(n, m)
attr(df, "biggest_group_size") <- n
attr(df, "labels") <- data.frame(replicate = 1:m)
attr(df, "vars") <- list(quote(replicate))
class(df) <- c("grouped_df", "tbl_df", "tbl", "data.frame")
df
}
N_permutations = 1000
f2 <- permute(XX_Stacked, N_permutations) %>% do(fit = lm(Y~Z,data=.)) %>% mutate(coef = summary(fit)$coeff[2]) %>%
select(-fit) %>% mutate(p.value = as.integer(abs(observed_test_statistic)<=abs(coef))) %>% ungroup() %>%
summarise(std.error = sd(coef),p.value = mean(p.value) ) %>%
mutate(model = "Standard Permutation Inference")
# Invert the Test to Obtain the Confidence INterval
alpha = 0.05
ci <- round(confint(m.fit)[2,],2)
Tau <- c(seq(confint(m.fit)[2,1]-f2$std.error,confint(m.fit)[2,1]+f2$std.error,.001),
seq(confint(m.fit)[2,2]-f2$std.error,confint(m.fit)[2,2]+f2$std.error,.001))
registerDoParallel()
ci.permute <- foreach(ss=1:length(Tau),.combine=rbind) %dopar%
{
require(dplyr)
require(foreach)
tau <- Tau[ss]
observed_test_statistic_s <- XX %>% tbl_df() %>% mutate(foo=1) %>% group_by(foo) %>%
mutate(y_h0 = Z*(Y-tau)+(1-Z)*Y) %>%
do(fit = lm(y_h0~Z,data=.)) %>%
mutate(coef = summary(fit)$coeff[2]) %>%
select(-fit) %>% collect() %>% .[["coef"]]
empirical_distribution_under_null <- XX_Stacked %>%
mutate(y_h0 = Z_Obs*(Y-tau)+(1-Z_Obs)*Y) %>%
permute(N_permutations) %>%
do(fit = lm(y_h0~Z,data=.)) %>%
mutate(coef = summary(fit)$coeff[2]) %>%
select(-fit) %>% collect() %>% .[["coef"]]
p_value = 1 - prop.table(table(abs(observed_test_statistic_s)<=abs(empirical_distribution_under_null)))[1]
reject <- as.integer(p_value<alpha)
out <- data.frame(tau=tau,reject=reject)
}
test1 <- ci.permute %>% filter(reject==0) %>% summarise(ci_lo=min(tau),ci_hi=max(tau))
f2 <- f2 %>% mutate(estimate = observed_test_statistic,ci_lo=test1$ci_lo,ci_hi=test1$ci_hi) %>%
select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>%
mutate_each(funs(round(.,4)),c(2,3,5,6)) %>%
mutate(p.value=fixp(p.value))
N_permutations = 1000
f3 <- permute.cluster(XX_Stacked, N_permutations) %>% do(fit = lm(Y~Z,data=.)) %>% mutate(coef = summary(fit)$coeff[2]) %>%
select(-fit) %>% mutate(p.value = as.integer(abs(observed_test_statistic)<=abs(coef))) %>% ungroup() %>%
summarise(std.error = sd(coef),p.value = mean(p.value) ) %>%
mutate(model = "State Cluster-Based Permutation Inference")
# Invert the Test to Obtain the Confidence INterval
alpha = 0.05
ci <- round(confint(m.fit)[2,],2)
Tau <- c(seq(confint(m.fit)[2,1]-f3$std.error,confint(m.fit)[2,1]+f3$std.error,.001),
seq(confint(m.fit)[2,2]-f3$std.error,confint(m.fit)[2,2]+f3$std.error,.001))
registerDoParallel()
ci.permute.cluster <- foreach(ss=1:length(Tau),.combine=rbind) %dopar%
{
require(dplyr)
require(foreach)
tau <- Tau[ss]
observed_test_statistic_s <- XX %>% tbl_df() %>% mutate(foo=1) %>% group_by(foo) %>%
mutate(y_h0 = Z*(Y-tau)+(1-Z)*Y) %>%
do(fit = lm(y_h0~Z,data=.)) %>%
mutate(coef = summary(fit)$coeff[2]) %>%
select(-fit) %>% collect() %>% .[["coef"]]
empirical_distribution_under_null <- XX_Stacked %>%
mutate(y_h0 = Z_Obs*(Y-tau)+(1-Z_Obs)*Y) %>%
permute.cluster(N_permutations) %>%
do(fit = lm(y_h0~Z,data=.)) %>%
mutate(coef = summary(fit)$coeff[2]) %>%
select(-fit) %>% collect() %>% .[["coef"]]
p_value = 1 - prop.table(table(abs(observed_test_statistic_s)<=abs(empirical_distribution_under_null)))[1]
reject <- as.integer(p_value<alpha)
out <- data.frame(tau=tau,reject=reject)
}
test <- ci.permute.cluster %>% filter(reject==0) %>% summarise(ci_lo=min(tau),ci_hi=max(tau))
f3 <- f3 %>% mutate(estimate = observed_test_statistic,ci_lo=test$ci_lo,ci_hi=test$ci_hi) %>%
select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>%
mutate_each(funs(round(.,4)),c(2,3,5,6)) %>%
mutate(p.value=fixp(p.value))
# Add in clustered standard errors
source("~/Dropbox/Projects/network-quality/analysis/sub-files/cluster-se.r")
library(sandwich,quietly = T)
library(lmtest,quietly = T)
m.fit <- with(XX,lm(Y~Z))
m.vcov <- get_CL_vcov(model=m.fit,cluster=XX$C)
f4 <- tidy(coeftest.cluster(data=XX,fm=m.fit,cluster1="C")) %>% filter(term=="Z") %>%
mutate(model="State Cluster Robust SEs",
ci_lo = get_confint(model=m.fit,vcovCL = m.vcov)[2,2],
ci_hi = get_confint(model=m.fit,vcovCL = m.vcov)[2,3]) %>%
select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>%
mutate_each(funs(round(.,4)),c(2,3,5,6)) %>%
mutate(p.value=fixp(p.value))
knitr::kable(rbind(f0,f4,f1,f2,f3))
model | estimate | std.error | p.value | ci_lo | ci_hi |
---|---|---|---|---|---|
Standard Inference | 0.282 | 0.0232 | < .001 | 0.2365 | 0.3275 |
State Cluster Robust SEs | 0.282 | 0.0170 | < .001 | 0.2487 | 0.3154 |
Bootstrap (S=1000) | 0.282 | 0.0231 | NA | 0.2368 | 0.3241 |
Standard Permutation Inference | 0.282 | 0.0233 | < .001 | 0.2362 | 0.3272 |
State Cluster-Based Permutation Inference | 0.282 | 0.0875 | < .001 | 0.2251 | 0.3411 |