Question 1
# Define parameters
n <- 1000 # Number of observations
runs <- 500 # Number of simulations
# Initialize a matrix to store simulation results
results <- matrix(NA, nrow = runs, ncol = 3, dimnames = list(NULL, c("Control_F", "Control_AB", "IV_C")))
for(i in 1:runs) {
# Simulate common unobserved variable and independent noise
common_unobserved <- rnorm(n)
noise_A <- rnorm(n, 0, 1)
noise_B <- rnorm(n, 0, 1)
noise_F <- rnorm(n, 0, 1)
noise_Y <- rnorm(n, 0, 1)
# Generate variables A, B, C, F based on common unobserved variable and noise
A <- 0.5 * common_unobserved + noise_A
B <- 0.5 * common_unobserved + noise_B
C <- 0.3 * A + 0.3 * B + rnorm(n, 0, 1) # Assuming C is influenced by A and B plus its own noise
F <- 0.5 * common_unobserved + noise_F
Y <- 0.2 * A + 0.2 * B + 0.4 * F + noise_Y # Outcome variable Y influenced by A, B, F, and its own noise
# Causal identification strategies
# Control for F
model_F <- lm(Y ~ F)
coef_F <- coef(model_F)["F"]
# Control for A and B
model_AB <- lm(Y ~ A + B)
coef_AB <- mean(coef(model_AB)[c("A", "B")])
# Use C as an IV (simplified approach for demonstration)
model_C <- lm(Y ~ C)
coef_C <- coef(model_C)["C"]
# Store results
results[i, ] <- c(coef_F, coef_AB, coef_C)
}
# Calculate averages of coefficients across simulations
avg_results <- colMeans(results, na.rm = TRUE)
print(avg_results)
## Control_F Control_AB IV_C
## 0.4789568 0.2672551 0.1873438
Question 2
set.seed(123) # For reproducibility
library(MASS) # For simulating correlated data
library(lmtest) # For regression diagnostics
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
# Define parameters
n <- 1000 # Number of observations
runs <- 500 # Number of simulations
# Initialize a list to store simulation results for each strategy
results <- list(control_strategy = numeric(runs),
overcontrol_strategy = numeric(runs))
for(i in 1:runs) {
# Simulate variables based on a simplified version of DAG 1.2
common_unobserved <- rnorm(n)
noise_X <- rnorm(n, 0, 1)
noise_Y <- rnorm(n, 0, 1)
X <- 0.5 * common_unobserved + noise_X
Y <- 0.3 * X + 0.5 * common_unobserved + noise_Y
# Additional variable for overcontrol
Z <- 0.2 * X + rnorm(n, 0, 1)
# Control strategy: Y ~ X
model_control <- lm(Y ~ X)
coef_control <- coef(model_control)["X"]
# Overcontrol strategy: Y ~ X + Z (including an unnecessary regressor)
model_overcontrol <- lm(Y ~ X + Z)
coef_overcontrol <- coef(model_overcontrol)["X"]
# Store results
results$control_strategy[i] <- coef_control
results$overcontrol_strategy[i] <- coef_overcontrol
}
# Calculate averages of coefficients across simulations
avg_results_control <- mean(results$control_strategy)
avg_results_overcontrol <- mean(results$overcontrol_strategy)
cat("Average coefficient for X under control strategy:", avg_results_control, "\n")
## Average coefficient for X under control strategy: 0.4985256
cat("Average coefficient for X under overcontrol strategy:", avg_results_overcontrol, "\n")
## Average coefficient for X under overcontrol strategy: 0.4983825
Question 3
set.seed(123) # For reproducibility
n <- 1000 # Number of observations
p_treated <- 0.5 # Proportion of units receiving treatment
# Simulate baseline outcomes
Y_baseline <- rnorm(n)
# Simulate treatment assignments
T <- rbinom(n, 1, p_treated)
# Introduce a negative impact on control group's outcomes due to treatment in treatment group
# Assuming each treated unit reduces the outcome of control units by a small amount
negative_impact_per_treated <- 0.05 # This value is arbitrary; adjust based on scenario
total_negative_impact <- sum(T) * negative_impact_per_treated # Total negative impact distributed across control units
Y_control_affected <- Y_baseline - (1 - T) * (total_negative_impact / sum(1 - T))
# Simulate treatment effect for treated units
treatment_effect <- 1.5 # This value is arbitrary; adjust based on scenario
Y_treated <- Y_baseline + T * treatment_effect
# Final outcomes, combining both effects
Y_final <- Y_treated - (1 - T) * (total_negative_impact / sum(1 - T))
# Analysis without considering the negative impact
model_without_considering_negative_impact <- lm(Y_baseline ~ T)
coef_without_considering_negative_impact <- coef(model_without_considering_negative_impact)["T"]
# Analysis considering the negative impact
model_considering_negative_impact <- lm(Y_final ~ T)
coef_considering_negative_impact <- coef(model_considering_negative_impact)["T"]
print(paste("Estimated treatment effect without considering negative impact:", coef_without_considering_negative_impact))
## [1] "Estimated treatment effect without considering negative impact: 0.0644382390963502"
print(paste("Estimated treatment effect considering negative impact:", coef_considering_negative_impact))
## [1] "Estimated treatment effect considering negative impact: 1.61325246834538"