This file constructs the functions used to perform permutation-based inference for a state-based policy intervention.

Simulate the Data

Our first step is to simulate some data. We will draw data based on the following model: \[ Y_{is} = 2 + 0.25 \cdot \textrm{Tx}_s + \epsilon_{is} \] for individual \(i\) in state \(s\). To approximate the SCCS we will have a total of 12 states, with 3 “treatment” states. We’ll assume we are analyzing a dataset with 10,000 observations.

set.seed(3449453)

N <- 10000          # Total Sample Size
N_States <- 12      # Number of States
N_Tx_States <- 3   # Number of Treated States
Tx_States <- sample(1:N_States,size=N_Tx_States,replace=FALSE) # Treated States

S_i <- sample(1:N_States,prob=rep(1/N_States,N_States),size=N,replace=TRUE) # Simulate individual i's state.
Z <- as.integer(S_i %in% Tx_States)  # Treatment indicator
Y <- 2 + 0.25*Z + rnorm(n=N) # Outcome
XX <- data.frame(Y=Y,Z=Z,C=S_i)    # Data Frame

Estimate the Model

We will first fit a simple linear model and obtain standard errors and 95% confidence intervals using standard inference methods.

m.fit <- with(XX,lm(Y~Z))
observed_test_statistic <- coef(m.fit)[2]
f0 <- tidy(m.fit) %>% filter(term=="Z") %>% 
  mutate(model="Standard Inference",
         ci_lo = confint(m.fit)[2,1],
         ci_hi = confint(m.fit)[2,2]) %>% 
  select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>% 
  mutate_each(funs(round(.,4)),c(2,3,5,6)) %>% 
  mutate(p.value=fixp(p.value)) 
Fitted Model
Dependent variable:
Y
Z 0.282***
(0.023)
Constant 2.005***
(0.012)
Observations 10,000
Note: p<0.1; p<0.05; p<0.01

Obtain Standard Bootstrapped Values

Next we’ll obtain bootstrapped standard errors and 95% confidence intervals using 1,000 bootstrapped samples (with replacement) of our original data.

N_bootstraps = 1000

f1 <- broom::bootstrap(XX, N_bootstraps) %>% do(fit = lm(Y~Z,data=.)) %>% mutate(coef = summary(fit)$coeff[2]) %>% 
  select(-fit) %>% ungroup() %>% 
 
  summarise(estimate=observed_test_statistic,std.error=sd(coef),ci_lo = quantile(coef,0.025),ci_hi=quantile(coef,0.975)) %>% 
   mutate(model=paste0("Bootstrap (S=",N_bootstraps,")"),
         p.value = NA) %>% 
  select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>% 
  mutate_each(funs(round(.,4)),c(2,3,5,6)) 

Obtain Inferences via Permuted Individual Treatment Status

We’ll now turn to a randomization inference method where we permute treatment status among individuals. The functions below are designed to facilitate treatment status permutation within a given dataset. However, in order to get the process to work correctly a “stacked” version of the dataset is needed. This stacked dataset is essentially two copies of the same dataset; the only difference is that in the first instance, every observation receives a treatment indicator of 1, while in the second instance everyone receives a 0. The permutation function first (randomly) selects an index of treated observations (matching the number of originally treated observations) from the first instance of the data in the stack, and then selects the control index as the complementary set of individuals from the second instance of the data.

XX_1 <- XX %>% mutate(Z_Obs = Z) %>% mutate(Z=1)
XX_0 <- XX %>% mutate(Z_Obs = Z) %>% mutate(Z=0)
XX_Stacked <- rbind(XX_1,XX_0)
# This function randomly permutes indiviudal-level treatment.
permute <- function (df, m, tx="Z",tx_orig = "Z_Obs") 
{
    n <- nrow(df)/2
    n_times_2 <- n*2
    n_tx <- df %>% filter(row_number() %in% 1:n) %>% count_(tx_orig) %>%  collect() %>% .[["n"]] %>% nth(2)
    sampled.index <- function(N=n,N_tx=n_tx,N_Times_2 = n_times_2)
    {
      sampled_tx_observations <- sample(1:N,N_tx,replace=FALSE)
      sampled_cx_observations <- seq(N+1,N_Times_2,1)[-sampled_tx_observations] 
      out <- c(sampled_tx_observations,sampled_cx_observations)
      out
    }
    attr(df, "indices") <- replicate(m, sampled.index() - 1, simplify = FALSE)
    attr(df, "drop") <- TRUE
    attr(df, "group_sizes") <- rep(n, m)
    attr(df, "biggest_group_size") <- n
    attr(df, "labels") <- data.frame(replicate = 1:m)
    attr(df, "vars") <- list(quote(replicate))
    class(df) <- c("grouped_df", "tbl_df", "tbl", "data.frame")
    df
}

# This function permutes treatment based on assignment via a cluster (i.e., state).
permute.cluster <- function (df, m, tx="Z",tx_orig = "Z_Obs", c = "C") 
{
    n <- nrow(df)/2
    n_times_2 <- n*2
    df$temp_Tx = df[,tx_orig]
    df$temp_C = df[,c]
    n_tx <- df %>% filter(row_number() %in% 1:n) %>% filter(temp_Tx==1) %>% 
      count(temp_C) %>% collect() %>% .[["n"]] %>% length()
    
    cc <- unique(df[,c])
    
    sampled.index <- function(N=n,N_tx=n_tx,N_Times_2 = n_times_2,CC=cc)
    {
      sampled_clusters <- sample(CC,N_tx,replace=FALSE)
      sampled_tx_observations <- df %>% filter(row_number() %in% 1:N) %>% 
        mutate(sampled_obs=row_number()) %>% 
        filter(temp_C %in% sampled_clusters) %>% collect() %>% .[["sampled_obs"]]
      sampled_cx_observations <- seq(N+1,N_Times_2,1)[-sampled_tx_observations] 
      out <- c(sampled_tx_observations,sampled_cx_observations)
      out
    }
    attr(df, "indices") <- replicate(m, sampled.index() - 1, simplify = FALSE)
    attr(df, "drop") <- TRUE
    attr(df, "group_sizes") <- rep(n, m)
    attr(df, "biggest_group_size") <- n
    attr(df, "labels") <- data.frame(replicate = 1:m)
    attr(df, "vars") <- list(quote(replicate))
    class(df) <- c("grouped_df", "tbl_df", "tbl", "data.frame")
    df
}
N_permutations = 1000

f2 <- permute(XX_Stacked, N_permutations) %>% do(fit = lm(Y~Z,data=.)) %>% mutate(coef = summary(fit)$coeff[2]) %>% 
  select(-fit)  %>% mutate(p.value = as.integer(abs(observed_test_statistic)<=abs(coef))) %>% ungroup() %>% 
  summarise(std.error = sd(coef),p.value = mean(p.value) ) %>% 
  mutate(model = "Standard Permutation Inference") 

# Invert the Test to Obtain the Confidence INterval
alpha = 0.05
ci <- round(confint(m.fit)[2,],2)
Tau <- c(seq(confint(m.fit)[2,1]-f2$std.error,confint(m.fit)[2,1]+f2$std.error,.001),
         seq(confint(m.fit)[2,2]-f2$std.error,confint(m.fit)[2,2]+f2$std.error,.001))


registerDoParallel()  

ci.permute <- foreach(ss=1:length(Tau),.combine=rbind) %dopar%
{
  require(dplyr)
  require(foreach)
  tau <- Tau[ss]
  
  observed_test_statistic_s <- XX %>% tbl_df() %>% mutate(foo=1) %>% group_by(foo) %>% 
    mutate(y_h0 = Z*(Y-tau)+(1-Z)*Y) %>%
    do(fit = lm(y_h0~Z,data=.)) %>% 
    mutate(coef = summary(fit)$coeff[2]) %>% 
    select(-fit) %>% collect() %>% .[["coef"]]

  
  empirical_distribution_under_null <- XX_Stacked %>%
    mutate(y_h0 = Z_Obs*(Y-tau)+(1-Z_Obs)*Y) %>%
    permute(N_permutations) %>%
    do(fit = lm(y_h0~Z,data=.)) %>%
    mutate(coef = summary(fit)$coeff[2]) %>%
    select(-fit) %>% collect() %>% .[["coef"]]

  p_value = 1 - prop.table(table(abs(observed_test_statistic_s)<=abs(empirical_distribution_under_null)))[1]
  
  reject <- as.integer(p_value<alpha)
  out <- data.frame(tau=tau,reject=reject)
}

test1 <- ci.permute %>% filter(reject==0) %>% summarise(ci_lo=min(tau),ci_hi=max(tau))
f2 <- f2 %>% mutate(estimate = observed_test_statistic,ci_lo=test1$ci_lo,ci_hi=test1$ci_hi) %>% 
  select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>% 
  mutate_each(funs(round(.,4)),c(2,3,5,6)) %>% 
  mutate(p.value=fixp(p.value)) 

State-Clustering Based Randomization Inference

N_permutations = 1000

f3 <- permute.cluster(XX_Stacked, N_permutations) %>% do(fit = lm(Y~Z,data=.)) %>% mutate(coef = summary(fit)$coeff[2]) %>% 
  select(-fit)  %>% mutate(p.value = as.integer(abs(observed_test_statistic)<=abs(coef))) %>% ungroup() %>% 
  summarise(std.error = sd(coef),p.value = mean(p.value) ) %>% 
  mutate(model = "State Cluster-Based Permutation Inference") 

# Invert the Test to Obtain the Confidence INterval
alpha = 0.05
ci <- round(confint(m.fit)[2,],2)
Tau <- c(seq(confint(m.fit)[2,1]-f3$std.error,confint(m.fit)[2,1]+f3$std.error,.001),
         seq(confint(m.fit)[2,2]-f3$std.error,confint(m.fit)[2,2]+f3$std.error,.001))

registerDoParallel()  
ci.permute.cluster <- foreach(ss=1:length(Tau),.combine=rbind) %dopar%
{
  require(dplyr)
  require(foreach)
  tau <- Tau[ss]
  
  observed_test_statistic_s <- XX %>% tbl_df() %>% mutate(foo=1) %>% group_by(foo) %>% 
    mutate(y_h0 = Z*(Y-tau)+(1-Z)*Y) %>%
    do(fit = lm(y_h0~Z,data=.)) %>% 
    mutate(coef = summary(fit)$coeff[2]) %>% 
    select(-fit) %>% collect() %>% .[["coef"]]

  
  empirical_distribution_under_null <- XX_Stacked %>%
    mutate(y_h0 = Z_Obs*(Y-tau)+(1-Z_Obs)*Y) %>%
    permute.cluster(N_permutations) %>%
    do(fit = lm(y_h0~Z,data=.)) %>%
    mutate(coef = summary(fit)$coeff[2]) %>%
    select(-fit) %>% collect() %>% .[["coef"]]

  p_value = 1 - prop.table(table(abs(observed_test_statistic_s)<=abs(empirical_distribution_under_null)))[1]
  
  reject <- as.integer(p_value<alpha)
  out <- data.frame(tau=tau,reject=reject)
}

test <- ci.permute.cluster %>% filter(reject==0) %>% summarise(ci_lo=min(tau),ci_hi=max(tau))

f3 <- f3 %>% mutate(estimate = observed_test_statistic,ci_lo=test$ci_lo,ci_hi=test$ci_hi) %>% 
  select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>% 
  mutate_each(funs(round(.,4)),c(2,3,5,6)) %>% 
  mutate(p.value=fixp(p.value)) 
# Add in clustered standard errors
source("~/Dropbox/Projects/network-quality/analysis/sub-files/cluster-se.r")
library(sandwich,quietly = T)
library(lmtest,quietly = T)
m.fit <- with(XX,lm(Y~Z))
m.vcov <- get_CL_vcov(model=m.fit,cluster=XX$C)

f4 <- tidy(coeftest.cluster(data=XX,fm=m.fit,cluster1="C")) %>% filter(term=="Z") %>% 
  mutate(model="State Cluster Robust SEs",
         ci_lo = get_confint(model=m.fit,vcovCL = m.vcov)[2,2],
         ci_hi = get_confint(model=m.fit,vcovCL = m.vcov)[2,3]) %>% 
  select(model,estimate,std.error,p.value,ci_lo,ci_hi) %>% 
  mutate_each(funs(round(.,4)),c(2,3,5,6)) %>% 
  mutate(p.value=fixp(p.value)) 
knitr::kable(rbind(f0,f4,f1,f2,f3))
model estimate std.error p.value ci_lo ci_hi
Standard Inference 0.282 0.0232 < .001 0.2365 0.3275
State Cluster Robust SEs 0.282 0.0170 < .001 0.2487 0.3154
Bootstrap (S=1000) 0.282 0.0231 NA 0.2368 0.3241
Standard Permutation Inference 0.282 0.0233 < .001 0.2362 0.3272
State Cluster-Based Permutation Inference 0.282 0.0875 < .001 0.2251 0.3411