Executive Summary

This analysis evaluates the causal impact of the M7 metro line opening (October 2020) on air quality in Istanbul using a difference-in-differences (DiD) design.

Treatment Groups: - Treated Districts (with M7 stations): Eyüpsultan, Kağıthane, Bağcılar, Esenler - Control Districts: Avcılar, Esenyurt

Key Outcomes: - NO₂ mean concentration (µg/m³) - PM2.5 mean concentration (µg/m³)

Analysis Period: January 2019 - January 2025


1. Setup and Data Loading

1.1 Load Required Packages

# Install packages if needed
packages <- c("tidyverse", "readxl", "lubridate", "data.table", 
              "fixest", "broom", "modelsummary", "ggplot2", 
              "patchwork", "scales", "knitr", "kableExtra")

for (pkg in packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

1.2 Load Raw Data

# File path
file_path <- "m7.xlsx"

# Check if file exists
if (!file.exists(file_path)) {
  stop("Error: m7.xlsx file not found in working directory. Current directory: ", getwd())
}

# Read the actual data starting from row 4
df_raw <- read_excel(
  file_path, 
  sheet = 1,
  skip = 3,
  col_types = c("date", "text", rep("numeric", 14))
)

cat("Data loaded successfully!\n")
## Data loaded successfully!
cat("Dimensions:", nrow(df_raw), "rows x", ncol(df_raw), "columns\n")
## Dimensions: 52609 rows x 16 columns

1.3 Data Preprocessing

# Create proper column names based on the data structure
# Columns: datetime, hour, then pairs of NO2/PM25 for each station
colnames(df_raw) <- c(
  "datetime", "hour",
  "Alibeyköy_NO2", "Alibeyköy_PM25",
  "Avcılar_NO2", "Avcılar_PM25",
  "Bağcılar_NO2", "Bağcılar_PM25",
  "Esenler_NO2", "Esenler_PM25",
  "Esenyurt_NO2", "Esenyurt_PM25",
  "Kağıthane_1_NO2", "Kağıthane_1_PM25",
  "Kağıthane_2_NO2", "Kağıthane_2_PM25"
)

# Convert datetime and create date variables
df_clean <- df_raw %>%
  mutate(
    datetime = as.POSIXct(datetime),
    date = as.Date(datetime),
    year = year(date),
    month = month(date),
    day = day(date),
    year_month = format(date, "%Y-%m")
  ) %>%
  filter(!is.na(datetime)) %>%
  arrange(datetime)

cat("Date range:", min(df_clean$date), "to", max(df_clean$date), "\n")
## Date range: 17897 to 20089
cat("Total observations:", nrow(df_clean), "\n")
## Total observations: 52609

2. District Assignment and Treatment Definition

2.1 Define District Mappings

# Station-to-District mapping based on your specifications
station_to_district <- tribble(
  ~station, ~district, ~treated,
  "Alibeyköy", "Eyüpsultan", 1,
  "Kağıthane_1", "Kağıthane", 1,
  "Kağıthane_2", "Kağıthane", 1,
  "Bağcılar", "Bağcılar", 1,
  "Esenler", "Esenler", 1,
  "Avcılar", "Avcılar", 0,
  "Esenyurt", "Esenyurt", 0
)

cat("Station to District Mapping:\n")
## Station to District Mapping:
station_to_district %>% 
  arrange(treated, district) %>%
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover"))
station district treated
Avcılar Avcılar 0
Esenyurt Esenyurt 0
Bağcılar Bağcılar 1
Esenler Esenler 1
Alibeyköy Eyüpsultan 1
Kağıthane_1 Kağıthane 1
Kağıthane_2 Kağıthane 1

2.2 Treatment Definition

# Treatment timing: M7 opened in October 2020
# post = 1 for months AFTER October 2020 (November 2020 onwards)
# post = 0 for months up to and including October 2020

treatment_date <- as.Date("2020-10-31")

cat("Treatment implementation date:", format(treatment_date, "%B %d, %Y"), "\n")
## Treatment implementation date: October 31, 2020
cat("Pre-period: Up to and including October 2020\n")
## Pre-period: Up to and including October 2020
cat("Post-period: November 2020 onwards\n")
## Post-period: November 2020 onwards

3. Outcome Variable Construction

Following the three-step aggregation procedure specified in the research design.

3.1 Step 1: Station-Day Means

Rule: Compute station-day means only if there are ≥18 valid hourly readings that day.

# Reshape data to long format
df_long <- df_clean %>%
  select(datetime, date, year, month, day, year_month, ends_with("_NO2"), ends_with("_PM25")) %>%
  pivot_longer(
    cols = -c(datetime, date, year, month, day, year_month),
    names_to = "station_pollutant",
    values_to = "value"
  ) %>%
  separate(station_pollutant, into = c("station_temp", "pollutant"), sep = "_(?=[^_]+$)") %>%
  mutate(
    # Handle Kağıthane_1 and Kağıthane_2 naming
    station = str_replace(station_temp, "^(.+)_([12])$", "\\1_\\2")
  ) %>%
  select(-station_temp) %>%
  filter(!is.na(value))

# Count valid hourly readings per station-pollutant-day
station_day_counts <- df_long %>%
  group_by(station, pollutant, date) %>%
  summarise(
    n_hours = n(),
    mean_value = mean(value, na.rm = TRUE),
    .groups = "drop"
  )

# Keep only station-days with ≥18 valid hours
station_day_valid <- station_day_counts %>%
  filter(n_hours >= 18) %>%
  mutate(
    year = year(date),
    month = month(date),
    year_month = format(date, "%Y-%m")
  )

# Summary statistics
cat("\n=== STEP 1: Station-Day Summary ===\n")
## 
## === STEP 1: Station-Day Summary ===
station_day_valid %>%
  group_by(pollutant) %>%
  summarise(
    n_station_days = n(),
    n_stations = n_distinct(station),
    min_hours = min(n_hours),
    max_hours = max(n_hours),
    mean_hours = round(mean(n_hours), 1)
  ) %>%
  kable(caption = "Step 1: Station-Day Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Step 1: Station-Day Summary Statistics
pollutant n_station_days n_stations min_hours max_hours mean_hours
NO2 14134 7 18 24 23.7
PM25 12233 6 18 24 23.7

3.2 Step 2: Station-Month Means

Rule: Compute station-month means only if ≥75% of possible days in that month have valid station-day means.

# Calculate total days per month for each year
days_in_month <- station_day_valid %>%
  mutate(month_date = floor_date(date, "month")) %>%
  group_by(year, month) %>%
  summarise(
    total_days = days_in_month(first(month_date)),
    .groups = "drop"
  )

# Count valid days per station-month
station_month_counts <- station_day_valid %>%
  group_by(station, pollutant, year, month) %>%
  summarise(
    n_valid_days = n(),
    mean_value = mean(mean_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  left_join(days_in_month, by = c("year", "month")) %>%
  mutate(
    pct_valid_days = n_valid_days / total_days,
    year_month = sprintf("%04d-%02d", year, month)
  )

# Keep only station-months with ≥75% valid days
station_month_valid <- station_month_counts %>%
  filter(pct_valid_days >= 0.75)

# Summary statistics
cat("\n=== STEP 2: Station-Month Summary ===\n")
## 
## === STEP 2: Station-Month Summary ===
station_month_valid %>%
  group_by(pollutant) %>%
  summarise(
    n_station_months = n(),
    n_stations = n_distinct(station),
    min_pct_valid = round(min(pct_valid_days) * 100, 1),
    mean_pct_valid = round(mean(pct_valid_days) * 100, 1),
    max_pct_valid = round(max(pct_valid_days) * 100, 1)
  ) %>%
  kable(caption = "Step 2: Station-Month Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Step 2: Station-Month Summary Statistics
pollutant n_station_months n_stations min_pct_valid mean_pct_valid max_pct_valid
NO2 461 7 75.0 96.5 100
PM25 404 6 76.7 96.6 100

3.3 Step 3: District-Month Means

Rule: Simple average of all valid station-month means within each district.

Note: Kağıthane_1 and Kağıthane_2 are first aggregated to represent Kağıthane district.

# Join with district mapping
district_month <- station_month_valid %>%
  left_join(station_to_district, by = "station") %>%
  filter(!is.na(district))

# Compute district-month means as simple average across all stations in that district
outcome_data <- district_month %>%
  group_by(district, treated, year, month, year_month, pollutant) %>%
  summarise(
    n_stations = n(),
    mean_value = mean(mean_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Create post treatment indicator
  mutate(
    date = as.Date(paste0(year_month, "-01")),
    post = as.integer(date > treatment_date)
  )

# Pivot wider to have separate columns for each pollutant
outcome_wide <- outcome_data %>%
  select(district, treated, year, month, year_month, date, post, pollutant, mean_value) %>%
  pivot_wider(
    names_from = pollutant,
    values_from = mean_value,
    names_prefix = ""
  ) %>%
  rename(
    NO2_mean = NO2,
    PM25_mean = PM25
  )

# Display summary
cat("\n=== STEP 3: District-Month Summary ===\n")
## 
## === STEP 3: District-Month Summary ===
outcome_wide %>%
  group_by(treated, post) %>%
  summarise(
    n_obs = n(),
    n_districts = n_distinct(district),
    mean_NO2 = round(mean(NO2_mean, na.rm = TRUE), 2),
    sd_NO2 = round(sd(NO2_mean, na.rm = TRUE), 2),
    mean_PM25 = round(mean(PM25_mean, na.rm = TRUE), 2),
    sd_PM25 = round(sd(PM25_mean, na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  mutate(
    group = case_when(
      treated == 1 & post == 0 ~ "Treated, Pre",
      treated == 1 & post == 1 ~ "Treated, Post",
      treated == 0 & post == 0 ~ "Control, Pre",
      treated == 0 & post == 1 ~ "Control, Post"
    )
  ) %>%
  select(group, everything(), -treated, -post) %>%
  kable(caption = "Step 3: District-Month Summary by Treatment Group") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Step 3: District-Month Summary by Treatment Group
group n_obs n_districts mean_NO2 sd_NO2 mean_PM25 sd_PM25
Control, Pre 42 2 28.40 12.30 21.42 3.62
Control, Post 91 2 36.05 12.03 18.00 3.87
Treated, Pre 86 4 44.78 16.67 20.70 6.65
Treated, Post 199 4 42.88 11.37 20.05 4.86

4. Descriptive Statistics

4.1 Sample Composition

cat("\n=== Sample Composition ===\n")
## 
## === Sample Composition ===
outcome_wide %>%
  group_by(district, treated) %>%
  summarise(
    n_months = n(),
    date_start = min(date),
    date_end = max(date),
    .groups = "drop"
  ) %>%
  mutate(
    group = ifelse(treated == 1, "Treated", "Control")
  ) %>%
  arrange(group, district) %>%
  kable(caption = "Sample Composition by District") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Sample Composition by District
district treated n_months date_start date_end group
Avcılar 0 69 2019-01-01 2024-12-01 Control
Esenyurt 0 64 2019-01-01 2024-09-01 Control
Bağcılar 1 72 2019-01-01 2024-12-01 Treated
Esenler 1 72 2019-01-01 2024-12-01 Treated
Eyüpsultan 1 69 2019-01-01 2024-12-01 Treated
Kağıthane 1 72 2019-01-01 2024-12-01 Treated

4.2 Pre-Treatment Balance

# Pre-treatment period balance (up to October 2020)
balance_data <- outcome_wide %>%
  filter(post == 0) %>%
  group_by(district, treated) %>%
  summarise(
    mean_NO2 = round(mean(NO2_mean, na.rm = TRUE), 2),
    sd_NO2 = round(sd(NO2_mean, na.rm = TRUE), 2),
    mean_PM25 = round(mean(PM25_mean, na.rm = TRUE), 2),
    sd_PM25 = round(sd(PM25_mean, na.rm = TRUE), 2),
    n_months = n(),
    .groups = "drop"
  ) %>%
  mutate(
    group = ifelse(treated == 1, "Treated", "Control")
  ) %>%
  arrange(group, district)

cat("\n=== Pre-Treatment Balance ===\n")
## 
## === Pre-Treatment Balance ===
balance_data %>%
  kable(caption = "Pre-Treatment Period Statistics (Up to October 2020)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Pre-Treatment Period Statistics (Up to October 2020)
district treated mean_NO2 sd_NO2 mean_PM25 sd_PM25 n_months group
Avcılar 0 35.07 11.25 21.42 3.62 21 Control
Esenyurt 0 22.03 9.77 NaN NA 21 Control
Bağcılar 1 48.08 13.89 21.32 7.24 22 Treated
Esenler 1 56.15 17.77 21.39 5.75 22 Treated
Eyüpsultan 1 42.80 8.20 18.05 6.28 20 Treated
Kağıthane 1 31.75 16.04 21.85 6.98 22 Treated

5. Difference-in-Differences Analysis

5.1 Prepare Analysis Dataset

# Convert categorical variables to factors
analysis_data <- outcome_wide %>%
  mutate(
    district = factor(district),
    year_month = factor(year_month),
    treated = as.integer(treated),
    post = as.integer(post),
    treated_x_post = treated * post  # Interaction term (main DiD coefficient)
  ) %>%
  arrange(district, year_month)

# Summary of analysis dataset
cat("\n=== Analysis Dataset Summary ===\n")
## 
## === Analysis Dataset Summary ===
analysis_data %>%
  group_by(treated, post) %>%
  summarise(
    n_districts = n_distinct(district),
    n_obs = n(),
    .groups = "drop"
  ) %>%
  mutate(
    period = case_when(
      treated == 1 & post == 0 ~ "Treated, Pre",
      treated == 1 & post == 1 ~ "Treated, Post",
      treated == 0 & post == 0 ~ "Control, Pre",
      treated == 0 & post == 1 ~ "Control, Post"
    )
  ) %>%
  select(period, n_districts, n_obs) %>%
  kable(caption = "Analysis Dataset Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Analysis Dataset Summary
period n_districts n_obs
Control, Pre 2 42
Control, Post 2 91
Treated, Pre 4 86
Treated, Post 4 199

5.2 DiD Regression Models

The model specification: \[Y_{it} = \beta_0 + \beta_1 (Treated_i \times Post_t) + \gamma_i + \delta_t + \epsilon_{it}\]

Where: - \(Y_{it}\) = Outcome (NO₂ or PM2.5) for district \(i\) in month \(t\) - \(Treated_i \times Post_t\) = Interaction term (main parameter of interest) - \(\gamma_i\) = District fixed effects - \(\delta_t\) = Year-month fixed effects - Standard errors clustered at district level

5.2.1 NO₂ Model

# DiD regression with district and time fixed effects
model_no2 <- feols(
  NO2_mean ~ treated_x_post | district + year_month,
  data = analysis_data,
  cluster = ~district
)

# Display results
cat("\n=== NO₂ DiD Results ===\n")
## 
## === NO₂ DiD Results ===
summary(model_no2)
## OLS estimation, Dep. Var.: NO2_mean
## Observations: 402
## Fixed-effects: district: 6,  year_month: 72
## Standard-errors: Clustered (district) 
##                Estimate Std. Error t value Pr(>|t|) 
## treated_x_post -10.2164    8.41267 -1.2144  0.27882 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 8.79384     Adj. R2: 0.493608
##                 Within R2: 0.058004
# Create tidy output
tidy_no2 <- tidy(model_no2, conf.int = TRUE) %>%
  mutate(outcome = "NO₂")

5.2.2 PM2.5 Model

# DiD regression with district and time fixed effects
model_pm25 <- feols(
  PM25_mean ~ treated_x_post | district + year_month,
  data = analysis_data,
  cluster = ~district
)

# Display results
cat("\n=== PM2.5 DiD Results ===\n")
## 
## === PM2.5 DiD Results ===
summary(model_pm25)
## OLS estimation, Dep. Var.: PM25_mean
## Observations: 345
## Fixed-effects: district: 5,  year_month: 72
## Standard-errors: Clustered (district) 
##                Estimate Std. Error t value Pr(>|t|)    
## treated_x_post  2.48174   0.805034 3.08278 0.036832 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 2.35581     Adj. R2: 0.740338
##                 Within R2: 0.030848
# Create tidy output
tidy_pm25 <- tidy(model_pm25, conf.int = TRUE) %>%
  mutate(outcome = "PM2.5")

5.3 Combined Results Table

# Publication-ready table
modelsummary(
  list("NO₂" = model_no2, "PM2.5" = model_pm25),
  stars = c('*' = 0.1, '**' = 0.05, '***' = 0.01),
  coef_rename = c("treated_x_post" = "Treated × Post"),
  gof_map = c("nobs", "r.squared", "adj.r.squared", "FE: district", "FE: year_month"),
  title = "Difference-in-Differences Estimates: Impact of M7 Metro Line on Air Quality",
  notes = list(
    "Standard errors clustered at district level in parentheses.",
    "Models include district and year-month fixed effects.",
    "Treatment date: October 2020 (M7 metro line opening).",
    "Treated districts: Eyüpsultan, Kağıthane, Bağcılar, Esenler.",
    "Control districts: Avcılar, Esenyurt."
  )
)
Difference-in-Differences Estimates: Impact of M7 Metro Line on Air Quality
NO₂ PM2.5
* p < 0.1, ** p < 0.05, *** p < 0.01
Standard errors clustered at district level in parentheses.
Models include district and year-month fixed effects.
Treatment date: October 2020 (M7 metro line opening).
Treated districts: Eyüpsultan, Kağıthane, Bağcılar, Esenler.
Control districts: Avcılar, Esenyurt.
Treated × Post -10.216 2.482**
(8.413) (0.805)
Num.Obs. 402 345
R2 0.591 0.798
R2 Adj. 0.494 0.740
FE: district X X
FE: year_month X X

6. Robustness Checks

6.1 Event Study (Dynamic Treatment Effects)

Tests for pre-treatment parallel trends and post-treatment dynamics.

# Create relative time periods (quarters relative to treatment)
analysis_data_event <- analysis_data %>%
  mutate(
    quarters_since_treatment = ifelse(
      date <= treatment_date,
      floor(as.numeric(difftime(date, treatment_date, units = "days")) / 90),
      ceiling(as.numeric(difftime(date, treatment_date, units = "days")) / 90)
    )
  ) %>%
  filter(quarters_since_treatment >= -8 & quarters_since_treatment <= 16) %>%
  mutate(
    rel_time = factor(quarters_since_treatment),
    rel_time = relevel(rel_time, ref = as.character(-1))  # Quarter before treatment as reference
  )

# Event study for NO2
event_no2 <- feols(
  NO2_mean ~ i(rel_time, treated, ref = -1) | district + year_month,
  data = analysis_data_event,
  cluster = ~district
)

# Event study for PM25
event_pm25 <- feols(
  PM25_mean ~ i(rel_time, treated, ref = -1) | district + year_month,
  data = analysis_data_event,
  cluster = ~district
)

# Plot event study
iplot(
  list("NO₂" = event_no2, "PM2.5" = event_pm25),
  main = "Event Study: Dynamic Treatment Effects",
  xlab = "Quarters Relative to M7 Opening (Quarter -1 = Reference)",
  ylab = "Estimated Coefficient (µg/m³)",
  ref.line = -1
)

6.2 Placebo Test (Fake Treatment Date)

Tests whether we detect false effects when using a fake treatment date in the pre-period.

# Use October 2019 as fake treatment date (one year before actual)
fake_treatment_date <- as.Date("2019-10-31")

placebo_data <- outcome_wide %>%
  filter(date <= treatment_date) %>%  # Only pre-treatment period
  mutate(
    fake_post = as.integer(date > fake_treatment_date),
    treated_x_fake_post = treated * fake_post,
    district = factor(district),
    year_month = factor(year_month)
  )

# Placebo NO2
placebo_no2 <- feols(
  NO2_mean ~ treated_x_fake_post | district + year_month,
  data = placebo_data,
  cluster = ~district
)

# Placebo PM25
placebo_pm25 <- feols(
  PM25_mean ~ treated_x_fake_post | district + year_month,
  data = placebo_data,
  cluster = ~district
)

# Results table
cat("\n=== Placebo Test Results ===\n")
## 
## === Placebo Test Results ===
modelsummary(
  list("NO₂ (Placebo)" = placebo_no2, "PM2.5 (Placebo)" = placebo_pm25),
  stars = c('*' = 0.1, '**' = 0.05, '***' = 0.01),
  coef_rename = c("treated_x_fake_post" = "Treated × Fake Post"),
  gof_map = c("nobs", "r.squared"),
  title = "Placebo Test: Fake Treatment Date (October 2019)",
  notes = "If parallel trends assumption holds, coefficients should be insignificant."
)
Placebo Test: Fake Treatment Date (October 2019)
NO₂ (Placebo) PM2.5 (Placebo)
* p < 0.1, ** p < 0.05, *** p < 0.01
If parallel trends assumption holds, coefficients should be insignificant.
Treated × Fake Post 7.461** -1.550
(2.817) (1.191)
Num.Obs. 123 101
R2 0.743 0.868

7. Summary and Interpretation

7.1 Main Results Summary

# Extract coefficients and create summary table
did_results <- tibble(
  Outcome = c("NO₂", "PM2.5"),
  Coefficient = c(
    coef(model_no2)["treated_x_post"], 
    coef(model_pm25)["treated_x_post"]
  ),
  SE = c(
    se(model_no2)["treated_x_post"], 
    se(model_pm25)["treated_x_post"]
  )
) %>%
  mutate(
    CI_Lower = Coefficient - 1.96 * SE,
    CI_Upper = Coefficient + 1.96 * SE,
    P_Value = 2 * (1 - pnorm(abs(Coefficient / SE))),
    Significant = case_when(
      P_Value < 0.01 ~ "***",
      P_Value < 0.05 ~ "**",
      P_Value < 0.1 ~ "*",
      TRUE ~ ""
    )
  )

cat("\n=== Main DiD Results ===\n")
## 
## === Main DiD Results ===
did_results %>%
  mutate(across(where(is.numeric), ~round(., 3))) %>%
  kable(caption = "DiD Treatment Effect Estimates") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
DiD Treatment Effect Estimates
Outcome Coefficient SE CI_Lower CI_Upper P_Value Significant
NO₂ -10.216 8.413 -26.705 6.272 0.225
PM2.5 2.482 0.805 0.904 4.060 0.002 ***

7.2 Coefficient Plot

ggplot(did_results, aes(x = Outcome, y = Coefficient)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", linewidth = 1) +
  geom_point(size = 5, color = "#E41A1C") +
  geom_errorbar(
    aes(ymin = CI_Lower, ymax = CI_Upper),
    width = 0.2,
    linewidth = 1,
    color = "#E41A1C"
  ) +
  labs(
    title = "DiD Treatment Effects: Impact of M7 Metro Line on Air Quality",
    subtitle = "Point estimates with 95% confidence intervals",
    x = "Outcome Variable",
    y = "Treatment Effect (µg/m³)",
    caption = "Standard errors clustered at district level\nNegative values indicate pollution reduction"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 12),
    axis.text = element_text(size = 12),
    panel.grid.minor = element_blank()
  )

7.3 Interpretation

cat("\n=== Interpretation ===\n\n")
## 
## === Interpretation ===
# NO2 interpretation
no2_effect <- did_results$Coefficient[1]
no2_pval <- did_results$P_Value[1]
no2_sig <- did_results$Significant[1]

cat("NO₂ Results:\n")
## NO₂ Results:
cat(sprintf("  - Estimated effect: %.3f µg/m³ %s\n", no2_effect, no2_sig))
##   - Estimated effect: -10.216 µg/m³
cat(sprintf("  - P-value: %.4f\n", no2_pval))
##   - P-value: 0.2246
if (no2_pval < 0.05) {
  if (no2_effect < 0) {
    cat(sprintf("  - The M7 metro line is associated with a statistically significant REDUCTION\n"))
    cat(sprintf("    of %.2f µg/m³ in NO₂ concentration in treated districts.\n", abs(no2_effect)))
  } else {
    cat(sprintf("  - The M7 metro line is associated with a statistically significant INCREASE\n"))
    cat(sprintf("    of %.2f µg/m³ in NO₂ concentration in treated districts.\n", no2_effect))
  }
} else {
  cat("  - No statistically significant effect detected.\n")
}
##   - No statistically significant effect detected.
cat("\n")
# PM2.5 interpretation
pm25_effect <- did_results$Coefficient[2]
pm25_pval <- did_results$P_Value[2]
pm25_sig <- did_results$Significant[2]

cat("PM2.5 Results:\n")
## PM2.5 Results:
cat(sprintf("  - Estimated effect: %.3f µg/m³ %s\n", pm25_effect, pm25_sig))
##   - Estimated effect: 2.482 µg/m³ ***
cat(sprintf("  - P-value: %.4f\n", pm25_pval))
##   - P-value: 0.0021
if (pm25_pval < 0.05) {
  if (pm25_effect < 0) {
    cat(sprintf("  - The M7 metro line is associated with a statistically significant REDUCTION\n"))
    cat(sprintf("    of %.2f µg/m³ in PM2.5 concentration in treated districts.\n", abs(pm25_effect)))
  } else {
    cat(sprintf("  - The M7 metro line is associated with a statistically significant INCREASE\n"))
    cat(sprintf("    of %.2f µg/m³ in PM2.5 concentration in treated districts.\n", pm25_effect))
  }
} else {
  cat("  - No statistically significant effect detected.\n")
}
##   - The M7 metro line is associated with a statistically significant INCREASE
##     of 2.48 µg/m³ in PM2.5 concentration in treated districts.

8. Data Export

# Export final analysis dataset
write_csv(
  analysis_data,
  "analysis_dataset_district_month.csv"
)

# Export district-level aggregated trends
write_csv(
  trends_data,
  "district_trends_by_treatment.csv"
)

# Export regression results
write_csv(
  bind_rows(tidy_no2, tidy_pm25),
  "regression_results.csv"
)

# Export summary statistics
write_csv(
  did_results,
  "did_summary_results.csv"
)

cat("\n=== Data Export Complete ===\n")
## 
## === Data Export Complete ===
cat("Files created:\n")
## Files created:
cat("  - analysis_dataset_district_month.csv\n")
##   - analysis_dataset_district_month.csv
cat("  - district_trends_by_treatment.csv\n")
##   - district_trends_by_treatment.csv
cat("  - regression_results.csv\n")
##   - regression_results.csv
cat("  - did_summary_results.csv\n")
##   - did_summary_results.csv

9. Session Information

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0   knitr_1.51         scales_1.4.0       patchwork_1.3.2   
##  [5] modelsummary_2.5.0 broom_1.0.11       fixest_0.13.2      data.table_1.18.0 
##  [9] readxl_1.4.5       lubridate_1.9.4    forcats_1.0.1      stringr_1.6.0     
## [13] dplyr_1.1.4        purrr_1.2.0        readr_2.1.6        tidyr_1.3.2       
## [17] tibble_3.3.0       ggplot2_4.0.1      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6        bayestestR_0.17.0   xfun_0.55          
##  [4] bslib_0.9.0         insight_1.4.4       lattice_0.22-7     
##  [7] tzdb_0.5.0          numDeriv_2016.8-1.1 vctrs_0.6.5        
## [10] tools_4.5.2         generics_0.1.4      parallel_4.5.2     
## [13] datawizard_1.3.0    sandwich_3.1-1      pkgconfig_2.0.3    
## [16] tinytable_0.15.2    checkmate_2.3.3     RColorBrewer_1.1-3 
## [19] S7_0.2.1            stringmagic_1.2.0   lifecycle_1.0.4    
## [22] compiler_4.5.2      farver_2.1.2        textshaping_1.0.4  
## [25] htmltools_0.5.9     sass_0.4.10         yaml_2.3.12        
## [28] Formula_1.2-5       crayon_1.5.3        pillar_1.11.1      
## [31] jquerylib_0.1.4     cachem_1.1.0        nlme_3.1-168       
## [34] tidyselect_1.2.1    digest_0.6.39       performance_0.15.3 
## [37] stringi_1.8.7       labeling_0.4.3      fastmap_1.2.0      
## [40] grid_4.5.2          cli_3.6.5           magrittr_2.0.4     
## [43] withr_3.0.2         dreamerr_1.5.0      backports_1.5.0    
## [46] bit64_4.6.0-1       timechange_0.3.0    rmarkdown_2.30     
## [49] bit_4.6.0           cellranger_1.1.0    zoo_1.8-15         
## [52] hms_1.1.4           evaluate_1.0.5      parameters_0.28.3  
## [55] viridisLite_0.4.2   rlang_1.1.6         Rcpp_1.1.0         
## [58] glue_1.8.0          xml2_1.5.1          vroom_1.6.7        
## [61] svglite_2.2.2       rstudioapi_0.17.1   jsonlite_2.0.0     
## [64] R6_2.6.1            tables_0.9.33       systemfonts_1.3.1

Appendix: Data Quality Checks

A.1 Missing Data Pattern

cat("\n=== Missing Data Summary ===\n")
## 
## === Missing Data Summary ===
# Check missing data in raw input
missing_summary <- df_clean %>%
  select(ends_with("_NO2"), ends_with("_PM25")) %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  mutate(
    pct_missing = round(n_missing / nrow(df_clean) * 100, 2)
  ) %>%
  arrange(desc(pct_missing))

missing_summary %>%
  kable(caption = "Missing Data by Station-Pollutant") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Missing Data by Station-Pollutant
variable n_missing pct_missing
Esenyurt_PM25 52609 100.00
Kağıthane_1_NO2 6791 12.91
Kağıthane_1_PM25 6218 11.82
Esenyurt_NO2 5494 10.44
Avcılar_PM25 5242 9.96
Alibeyköy_NO2 4447 8.45
Kağıthane_2_NO2 3815 7.25
Alibeyköy_PM25 3565 6.78
Avcılar_NO2 2849 5.42
Kağıthane_2_PM25 2754 5.23
Esenler_NO2 1992 3.79
Bağcılar_PM25 1717 3.26
Esenler_PM25 1050 2.00
Bağcılar_NO2 715 1.36

A.2 District-Month Coverage

cat("\n=== District-Month Coverage ===\n")
## 
## === District-Month Coverage ===
# Create complete grid of district-months
all_months <- outcome_wide %>%
  expand(district, year_month = unique(year_month))

# Check which district-months we have data for
coverage <- all_months %>%
  left_join(
    outcome_wide %>% select(district, year_month) %>% mutate(has_data = 1),
    by = c("district", "year_month")
  ) %>%
  mutate(has_data = ifelse(is.na(has_data), 0, 1))

coverage_summary <- coverage %>%
  group_by(district) %>%
  summarise(
    total_months = n(),
    months_with_data = sum(has_data),
    pct_coverage = round(months_with_data / total_months * 100, 1)
  )

coverage_summary %>%
  kable(caption = "Data Coverage by District") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Data Coverage by District
district total_months months_with_data pct_coverage
Avcılar 72 69 95.8
Bağcılar 72 72 100.0
Esenler 72 72 100.0
Esenyurt 72 64 88.9
Eyüpsultan 72 69 95.8
Kağıthane 72 72 100.0

End of Analysis