Question 1

# Define parameters
n <- 1000 # Number of observations
runs <- 500 # Number of simulations

# Initialize a matrix to store simulation results
results <- matrix(NA, nrow = runs, ncol = 3, dimnames = list(NULL, c("Control_F", "Control_AB", "IV_C")))

for(i in 1:runs) {
  # Simulate common unobserved variable and independent noise
  common_unobserved <- rnorm(n)
  noise_A <- rnorm(n, 0, 1)
  noise_B <- rnorm(n, 0, 1)
  noise_F <- rnorm(n, 0, 1)
  noise_Y <- rnorm(n, 0, 1)
  
  # Generate variables A, B, C, F based on common unobserved variable and noise
  A <- 0.5 * common_unobserved + noise_A
  B <- 0.5 * common_unobserved + noise_B
  C <- 0.3 * A + 0.3 * B + rnorm(n, 0, 1) # Assuming C is influenced by A and B plus its own noise
  F <- 0.5 * common_unobserved + noise_F
  Y <- 0.2 * A + 0.2 * B + 0.4 * F + noise_Y # Outcome variable Y influenced by A, B, F, and its own noise
  

    
  
  # Causal identification strategies
  # Control for F
  model_F <- lm(Y ~ F)
  coef_F <- coef(model_F)["F"]
  
  # Control for A and B
  model_AB <- lm(Y ~ A + B)
  coef_AB <- mean(coef(model_AB)[c("A", "B")])
  
  # Use C as an IV (simplified approach for demonstration)
  model_C <- lm(Y ~ C)
  coef_C <- coef(model_C)["C"]
  
  # Store results
  results[i, ] <- c(coef_F, coef_AB, coef_C)
}

# Calculate averages of coefficients across simulations
avg_results <- colMeans(results, na.rm = TRUE)
print(avg_results)

##  Control_F Control_AB       IV_C 
##  0.4789568  0.2672551  0.1873438

Question 2

set.seed(123) # For reproducibility

library(MASS) # For simulating correlated data
library(lmtest) # For regression diagnostics

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(sandwich)

# Define parameters
n <- 1000 # Number of observations
runs <- 500 # Number of simulations

# Initialize a list to store simulation results for each strategy
results <- list(control_strategy = numeric(runs),
                overcontrol_strategy = numeric(runs))

for(i in 1:runs) {
  # Simulate variables based on a simplified version of DAG 1.2
  common_unobserved <- rnorm(n)
  noise_X <- rnorm(n, 0, 1)
  noise_Y <- rnorm(n, 0, 1)
  X <- 0.5 * common_unobserved + noise_X
  Y <- 0.3 * X + 0.5 * common_unobserved + noise_Y
  
  # Additional variable for overcontrol
  Z <- 0.2 * X + rnorm(n, 0, 1)
  
  # Control strategy: Y ~ X
  model_control <- lm(Y ~ X)
  coef_control <- coef(model_control)["X"]
  
  # Overcontrol strategy: Y ~ X + Z (including an unnecessary regressor)
  model_overcontrol <- lm(Y ~ X + Z)
  coef_overcontrol <- coef(model_overcontrol)["X"]
  
  # Store results
  results$control_strategy[i] <- coef_control
  results$overcontrol_strategy[i] <- coef_overcontrol
}

# Calculate averages of coefficients across simulations
avg_results_control <- mean(results$control_strategy)
avg_results_overcontrol <- mean(results$overcontrol_strategy)

cat("Average coefficient for X under control strategy:", avg_results_control, "\n")

## Average coefficient for X under control strategy: 0.4985256

cat("Average coefficient for X under overcontrol strategy:", avg_results_overcontrol, "\n")

## Average coefficient for X under overcontrol strategy: 0.4983825

Question 3

set.seed(123) # For reproducibility

n <- 1000 # Number of observations
p_treated <- 0.5 # Proportion of units receiving treatment

# Simulate baseline outcomes
Y_baseline <- rnorm(n)

# Simulate treatment assignments
T <- rbinom(n, 1, p_treated)

# Introduce a negative impact on control group's outcomes due to treatment in treatment group
# Assuming each treated unit reduces the outcome of control units by a small amount
negative_impact_per_treated <- 0.05 # This value is arbitrary; adjust based on scenario
total_negative_impact <- sum(T) * negative_impact_per_treated # Total negative impact distributed across control units
Y_control_affected <- Y_baseline - (1 - T) * (total_negative_impact / sum(1 - T))

# Simulate treatment effect for treated units
treatment_effect <- 1.5 # This value is arbitrary; adjust based on scenario
Y_treated <- Y_baseline + T * treatment_effect

# Final outcomes, combining both effects
Y_final <- Y_treated - (1 - T) * (total_negative_impact / sum(1 - T))

# Analysis without considering the negative impact
model_without_considering_negative_impact <- lm(Y_baseline ~ T)
coef_without_considering_negative_impact <- coef(model_without_considering_negative_impact)["T"]

# Analysis considering the negative impact
model_considering_negative_impact <- lm(Y_final ~ T)
coef_considering_negative_impact <- coef(model_considering_negative_impact)["T"]

print(paste("Estimated treatment effect without considering negative impact:", coef_without_considering_negative_impact))

## [1] "Estimated treatment effect without considering negative impact: 0.0644382390963502"

print(paste("Estimated treatment effect considering negative impact:", coef_considering_negative_impact))

## [1] "Estimated treatment effect considering negative impact: 1.61325246834538"

Haishan Code

Question 1

Question 2

Question 3