Executive Summary

This analysis evaluates the causal impact of the M7 metro line opening (October 2020) on air quality in Istanbul using a difference-in-differences (DiD) design.

Treatment Groups: - Treated Districts (with M7 stations): Eyüpsultan, Kağıthane, Bağcılar, Esenler - Control Districts: Avcılar, Esenyurt

Key Outcomes: - NO₂ mean concentration (µg/m³) - PM2.5 mean concentration (µg/m³)

Analysis Period: January 2019 - January 2025

1. Setup and Data Loading

1.1 Load Required Packages

# Install packages if needed
packages <- c("tidyverse", "readxl", "lubridate", "data.table", 
              "fixest", "broom", "modelsummary", "ggplot2", 
              "patchwork", "scales", "knitr", "kableExtra")

for (pkg in packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

1.2 Load Raw Data

# File path
file_path <- "m7.xlsx"

# Check if file exists
if (!file.exists(file_path)) {
  stop("Error: m7.xlsx file not found in working directory. Current directory: ", getwd())
}

# Read the actual data starting from row 4
df_raw <- read_excel(
  file_path, 
  sheet = 1,
  skip = 3,
  col_types = c("date", "text", rep("numeric", 14))
)

cat("Data loaded successfully!\n")

## Data loaded successfully!

cat("Dimensions:", nrow(df_raw), "rows x", ncol(df_raw), "columns\n")

## Dimensions: 52609 rows x 16 columns

1.3 Data Preprocessing

# Create proper column names based on the data structure
# Columns: datetime, hour, then pairs of NO2/PM25 for each station
colnames(df_raw) <- c(
  "datetime", "hour",
  "Alibeyköy_NO2", "Alibeyköy_PM25",
  "Avcılar_NO2", "Avcılar_PM25",
  "Bağcılar_NO2", "Bağcılar_PM25",
  "Esenler_NO2", "Esenler_PM25",
  "Esenyurt_NO2", "Esenyurt_PM25",
  "Kağıthane_1_NO2", "Kağıthane_1_PM25",
  "Kağıthane_2_NO2", "Kağıthane_2_PM25"
)

# Convert datetime and create date variables
df_clean <- df_raw %>%
  mutate(
    datetime = as.POSIXct(datetime),
    date = as.Date(datetime),
    year = year(date),
    month = month(date),
    day = day(date),
    year_month = format(date, "%Y-%m")
  ) %>%
  filter(!is.na(datetime)) %>%
  arrange(datetime)

cat("Date range:", min(df_clean$date), "to", max(df_clean$date), "\n")

## Date range: 17897 to 20089

cat("Total observations:", nrow(df_clean), "\n")

## Total observations: 52609

2. District Assignment and Treatment Definition

2.1 Define District Mappings

# Station-to-District mapping based on your specifications
station_to_district <- tribble(
  ~station, ~district, ~treated,
  "Alibeyköy", "Eyüpsultan", 1,
  "Kağıthane_1", "Kağıthane", 1,
  "Kağıthane_2", "Kağıthane", 1,
  "Bağcılar", "Bağcılar", 1,
  "Esenler", "Esenler", 1,
  "Avcılar", "Avcılar", 0,
  "Esenyurt", "Esenyurt", 0
)

cat("Station to District Mapping:\n")

## Station to District Mapping:

station_to_district %>% 
  arrange(treated, district) %>%
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover"))

station	district	treated
Avcılar	Avcılar	0
Esenyurt	Esenyurt	0
Bağcılar	Bağcılar	1
Esenler	Esenler	1
Alibeyköy	Eyüpsultan	1
Kağıthane_1	Kağıthane	1
Kağıthane_2	Kağıthane	1

2.2 Treatment Definition

# Treatment timing: M7 opened in October 2020
# post = 1 for months AFTER October 2020 (November 2020 onwards)
# post = 0 for months up to and including October 2020

treatment_date <- as.Date("2020-10-31")

cat("Treatment implementation date:", format(treatment_date, "%B %d, %Y"), "\n")

## Treatment implementation date: October 31, 2020

cat("Pre-period: Up to and including October 2020\n")

## Pre-period: Up to and including October 2020

cat("Post-period: November 2020 onwards\n")

## Post-period: November 2020 onwards

3. Outcome Variable Construction

Following the three-step aggregation procedure specified in the research design.

3.1 Step 1: Station-Day Means

Rule: Compute station-day means only if there are ≥18 valid hourly readings that day.

# Reshape data to long format
df_long <- df_clean %>%
  select(datetime, date, year, month, day, year_month, ends_with("_NO2"), ends_with("_PM25")) %>%
  pivot_longer(
    cols = -c(datetime, date, year, month, day, year_month),
    names_to = "station_pollutant",
    values_to = "value"
  ) %>%
  separate(station_pollutant, into = c("station_temp", "pollutant"), sep = "_(?=[^_]+$)") %>%
  mutate(
    # Handle Kağıthane_1 and Kağıthane_2 naming
    station = str_replace(station_temp, "^(.+)_([12])$", "\\1_\\2")
  ) %>%
  select(-station_temp) %>%
  filter(!is.na(value))

# Count valid hourly readings per station-pollutant-day
station_day_counts <- df_long %>%
  group_by(station, pollutant, date) %>%
  summarise(
    n_hours = n(),
    mean_value = mean(value, na.rm = TRUE),
    .groups = "drop"
  )

# Keep only station-days with ≥18 valid hours
station_day_valid <- station_day_counts %>%
  filter(n_hours >= 18) %>%
  mutate(
    year = year(date),
    month = month(date),
    year_month = format(date, "%Y-%m")
  )

# Summary statistics
cat("\n=== STEP 1: Station-Day Summary ===\n")

## 
## === STEP 1: Station-Day Summary ===

station_day_valid %>%
  group_by(pollutant) %>%
  summarise(
    n_station_days = n(),
    n_stations = n_distinct(station),
    min_hours = min(n_hours),
    max_hours = max(n_hours),
    mean_hours = round(mean(n_hours), 1)
  ) %>%
  kable(caption = "Step 1: Station-Day Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Step 1: Station-Day Summary Statistics
pollutant	n_station_days	n_stations	min_hours	max_hours	mean_hours
NO2	14134	7	18	24	23.7
PM25	12233	6	18	24	23.7

3.2 Step 2: Station-Month Means

Rule: Compute station-month means only if ≥75% of possible days in that month have valid station-day means.

# Calculate total days per month for each year
days_in_month <- station_day_valid %>%
  mutate(month_date = floor_date(date, "month")) %>%
  group_by(year, month) %>%
  summarise(
    total_days = days_in_month(first(month_date)),
    .groups = "drop"
  )

# Count valid days per station-month
station_month_counts <- station_day_valid %>%
  group_by(station, pollutant, year, month) %>%
  summarise(
    n_valid_days = n(),
    mean_value = mean(mean_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  left_join(days_in_month, by = c("year", "month")) %>%
  mutate(
    pct_valid_days = n_valid_days / total_days,
    year_month = sprintf("%04d-%02d", year, month)
  )

# Keep only station-months with ≥75% valid days
station_month_valid <- station_month_counts %>%
  filter(pct_valid_days >= 0.75)

# Summary statistics
cat("\n=== STEP 2: Station-Month Summary ===\n")

## 
## === STEP 2: Station-Month Summary ===

station_month_valid %>%
  group_by(pollutant) %>%
  summarise(
    n_station_months = n(),
    n_stations = n_distinct(station),
    min_pct_valid = round(min(pct_valid_days) * 100, 1),
    mean_pct_valid = round(mean(pct_valid_days) * 100, 1),
    max_pct_valid = round(max(pct_valid_days) * 100, 1)
  ) %>%
  kable(caption = "Step 2: Station-Month Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Step 2: Station-Month Summary Statistics
pollutant	n_station_months	n_stations	min_pct_valid	mean_pct_valid	max_pct_valid
NO2	461	7	75.0	96.5	100
PM25	404	6	76.7	96.6	100

3.3 Step 3: District-Month Means

Rule: Simple average of all valid station-month means within each district.

Note: Kağıthane_1 and Kağıthane_2 are first aggregated to represent Kağıthane district.

# Join with district mapping
district_month <- station_month_valid %>%
  left_join(station_to_district, by = "station") %>%
  filter(!is.na(district))

# Compute district-month means as simple average across all stations in that district
outcome_data <- district_month %>%
  group_by(district, treated, year, month, year_month, pollutant) %>%
  summarise(
    n_stations = n(),
    mean_value = mean(mean_value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Create post treatment indicator
  mutate(
    date = as.Date(paste0(year_month, "-01")),
    post = as.integer(date > treatment_date)
  )

# Pivot wider to have separate columns for each pollutant
outcome_wide <- outcome_data %>%
  select(district, treated, year, month, year_month, date, post, pollutant, mean_value) %>%
  pivot_wider(
    names_from = pollutant,
    values_from = mean_value,
    names_prefix = ""
  ) %>%
  rename(
    NO2_mean = NO2,
    PM25_mean = PM25
  )

# Display summary
cat("\n=== STEP 3: District-Month Summary ===\n")

## 
## === STEP 3: District-Month Summary ===

outcome_wide %>%
  group_by(treated, post) %>%
  summarise(
    n_obs = n(),
    n_districts = n_distinct(district),
    mean_NO2 = round(mean(NO2_mean, na.rm = TRUE), 2),
    sd_NO2 = round(sd(NO2_mean, na.rm = TRUE), 2),
    mean_PM25 = round(mean(PM25_mean, na.rm = TRUE), 2),
    sd_PM25 = round(sd(PM25_mean, na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  mutate(
    group = case_when(
      treated == 1 & post == 0 ~ "Treated, Pre",
      treated == 1 & post == 1 ~ "Treated, Post",
      treated == 0 & post == 0 ~ "Control, Pre",
      treated == 0 & post == 1 ~ "Control, Post"
    )
  ) %>%
  select(group, everything(), -treated, -post) %>%
  kable(caption = "Step 3: District-Month Summary by Treatment Group") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Step 3: District-Month Summary by Treatment Group
group	n_obs	n_districts	mean_NO2	sd_NO2	mean_PM25	sd_PM25
Control, Pre	42	2	28.40	12.30	21.42	3.62
Control, Post	91	2	36.05	12.03	18.00	3.87
Treated, Pre	86	4	44.78	16.67	20.70	6.65
Treated, Post	199	4	42.88	11.37	20.05	4.86

4. Descriptive Statistics

4.1 Sample Composition

cat("\n=== Sample Composition ===\n")

## 
## === Sample Composition ===

outcome_wide %>%
  group_by(district, treated) %>%
  summarise(
    n_months = n(),
    date_start = min(date),
    date_end = max(date),
    .groups = "drop"
  ) %>%
  mutate(
    group = ifelse(treated == 1, "Treated", "Control")
  ) %>%
  arrange(group, district) %>%
  kable(caption = "Sample Composition by District") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Sample Composition by District
district	treated	n_months	date_start	date_end	group
Avcılar	0	69	2019-01-01	2024-12-01	Control
Esenyurt	0	64	2019-01-01	2024-09-01	Control
Bağcılar	1	72	2019-01-01	2024-12-01	Treated
Esenler	1	72	2019-01-01	2024-12-01	Treated
Eyüpsultan	1	69	2019-01-01	2024-12-01	Treated
Kağıthane	1	72	2019-01-01	2024-12-01	Treated

4.2 Pre-Treatment Balance

# Pre-treatment period balance (up to October 2020)
balance_data <- outcome_wide %>%
  filter(post == 0) %>%
  group_by(district, treated) %>%
  summarise(
    mean_NO2 = round(mean(NO2_mean, na.rm = TRUE), 2),
    sd_NO2 = round(sd(NO2_mean, na.rm = TRUE), 2),
    mean_PM25 = round(mean(PM25_mean, na.rm = TRUE), 2),
    sd_PM25 = round(sd(PM25_mean, na.rm = TRUE), 2),
    n_months = n(),
    .groups = "drop"
  ) %>%
  mutate(
    group = ifelse(treated == 1, "Treated", "Control")
  ) %>%
  arrange(group, district)

cat("\n=== Pre-Treatment Balance ===\n")

## 
## === Pre-Treatment Balance ===

balance_data %>%
  kable(caption = "Pre-Treatment Period Statistics (Up to October 2020)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Pre-Treatment Period Statistics (Up to October 2020)
district	treated	mean_NO2	sd_NO2	mean_PM25	sd_PM25	n_months	group
Avcılar	0	35.07	11.25	21.42	3.62	21	Control
Esenyurt	0	22.03	9.77	NaN	NA	21	Control
Bağcılar	1	48.08	13.89	21.32	7.24	22	Treated
Esenler	1	56.15	17.77	21.39	5.75	22	Treated
Eyüpsultan	1	42.80	8.20	18.05	6.28	20	Treated
Kağıthane	1	31.75	16.04	21.85	6.98	22	Treated

4.3 Parallel Trends Visualization

# Aggregate to treatment group level by month
trends_data <- outcome_wide %>%
  group_by(year_month, date, treated, post) %>%
  summarise(
    NO2_mean = mean(NO2_mean, na.rm = TRUE),
    PM25_mean = mean(PM25_mean, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    group = ifelse(treated == 1, "Treated Districts", "Control Districts")
  )

# NO2 trends
p1 <- ggplot(trends_data, aes(x = date, y = NO2_mean, color = group, group = group)) +
  geom_line(linewidth = 1) +
  geom_vline(xintercept = treatment_date, linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = treatment_date, y = max(trends_data$NO2_mean, na.rm = TRUE), 
           label = "M7 Opening\n(Oct 2020)", hjust = -0.1, color = "red", size = 3.5) +
  scale_color_manual(values = c("Treated Districts" = "#E41A1C", 
                                  "Control Districts" = "#377EB8")) +
  labs(
    title = "NO₂ Concentration Over Time",
    subtitle = "Treated vs Control Districts",
    x = "Date",
    y = "NO₂ (µg/m³)",
    color = "Group"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold")
  )

# PM2.5 trends
p2 <- ggplot(trends_data, aes(x = date, y = PM25_mean, color = group, group = group)) +
  geom_line(linewidth = 1) +
  geom_vline(xintercept = treatment_date, linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = treatment_date, y = max(trends_data$PM25_mean, na.rm = TRUE), 
           label = "M7 Opening\n(Oct 2020)", hjust = -0.1, color = "red", size = 3.5) +
  scale_color_manual(values = c("Treated Districts" = "#E41A1C", 
                                  "Control Districts" = "#377EB8")) +
  labs(
    title = "PM2.5 Concentration Over Time",
    subtitle = "Treated vs Control Districts",
    x = "Date",
    y = "PM2.5 (µg/m³)",
    color = "Group"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold")
  )

p1 / p2

5. Difference-in-Differences Analysis

5.1 Prepare Analysis Dataset

# Convert categorical variables to factors
analysis_data <- outcome_wide %>%
  mutate(
    district = factor(district),
    year_month = factor(year_month),
    treated = as.integer(treated),
    post = as.integer(post),
    treated_x_post = treated * post  # Interaction term (main DiD coefficient)
  ) %>%
  arrange(district, year_month)

# Summary of analysis dataset
cat("\n=== Analysis Dataset Summary ===\n")

## 
## === Analysis Dataset Summary ===

analysis_data %>%
  group_by(treated, post) %>%
  summarise(
    n_districts = n_distinct(district),
    n_obs = n(),
    .groups = "drop"
  ) %>%
  mutate(
    period = case_when(
      treated == 1 & post == 0 ~ "Treated, Pre",
      treated == 1 & post == 1 ~ "Treated, Post",
      treated == 0 & post == 0 ~ "Control, Pre",
      treated == 0 & post == 1 ~ "Control, Post"
    )
  ) %>%
  select(period, n_districts, n_obs) %>%
  kable(caption = "Analysis Dataset Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Analysis Dataset Summary
period	n_districts	n_obs
Control, Pre	2	42
Control, Post	2	91
Treated, Pre	4	86
Treated, Post	4	199

5.2 DiD Regression Models

The model specification: \[Y_{it} = \beta_0 + \beta_1 (Treated_i \times Post_t) + \gamma_i + \delta_t + \epsilon_{it}\]

Where: - \(Y_{it}\) = Outcome (NO₂ or PM2.5) for district \(i\) in month \(t\) - \(Treated_i \times Post_t\) = Interaction term (main parameter of interest) - \(\gamma_i\) = District fixed effects - \(\delta_t\) = Year-month fixed effects - Standard errors clustered at district level

5.2.1 NO₂ Model

# DiD regression with district and time fixed effects
model_no2 <- feols(
  NO2_mean ~ treated_x_post | district + year_month,
  data = analysis_data,
  cluster = ~district
)

# Display results
cat("\n=== NO₂ DiD Results ===\n")

## 
## === NO₂ DiD Results ===

summary(model_no2)

## OLS estimation, Dep. Var.: NO2_mean
## Observations: 402
## Fixed-effects: district: 6,  year_month: 72
## Standard-errors: Clustered (district) 
##                Estimate Std. Error t value Pr(>|t|) 
## treated_x_post -10.2164    8.41267 -1.2144  0.27882 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 8.79384     Adj. R2: 0.493608
##                 Within R2: 0.058004

# Create tidy output
tidy_no2 <- tidy(model_no2, conf.int = TRUE) %>%
  mutate(outcome = "NO₂")

5.2.2 PM2.5 Model

# DiD regression with district and time fixed effects
model_pm25 <- feols(
  PM25_mean ~ treated_x_post | district + year_month,
  data = analysis_data,
  cluster = ~district
)

# Display results
cat("\n=== PM2.5 DiD Results ===\n")

## 
## === PM2.5 DiD Results ===

summary(model_pm25)

## OLS estimation, Dep. Var.: PM25_mean
## Observations: 345
## Fixed-effects: district: 5,  year_month: 72
## Standard-errors: Clustered (district) 
##                Estimate Std. Error t value Pr(>|t|)    
## treated_x_post  2.48174   0.805034 3.08278 0.036832 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 2.35581     Adj. R2: 0.740338
##                 Within R2: 0.030848

# Create tidy output
tidy_pm25 <- tidy(model_pm25, conf.int = TRUE) %>%
  mutate(outcome = "PM2.5")

5.3 Combined Results Table

# Publication-ready table
modelsummary(
  list("NO₂" = model_no2, "PM2.5" = model_pm25),
  stars = c('*' = 0.1, '**' = 0.05, '***' = 0.01),
  coef_rename = c("treated_x_post" = "Treated × Post"),
  gof_map = c("nobs", "r.squared", "adj.r.squared", "FE: district", "FE: year_month"),
  title = "Difference-in-Differences Estimates: Impact of M7 Metro Line on Air Quality",
  notes = list(
    "Standard errors clustered at district level in parentheses.",
    "Models include district and year-month fixed effects.",
    "Treatment date: October 2020 (M7 metro line opening).",
    "Treated districts: Eyüpsultan, Kağıthane, Bağcılar, Esenler.",
    "Control districts: Avcılar, Esenyurt."
  )
)

Difference-in-Differences Estimates: Impact of M7 Metro Line on Air Quality
	NO₂	PM2.5
* p < 0.1, p < 0.05, * p < 0.01
Standard errors clustered at district level in parentheses.
Models include district and year-month fixed effects.
Treatment date: October 2020 (M7 metro line opening).
Treated districts: Eyüpsultan, Kağıthane, Bağcılar, Esenler.
Control districts: Avcılar, Esenyurt.
Treated × Post	-10.216	2.482**
	(8.413)	(0.805)
Num.Obs.	402	345
R2	0.591	0.798
R2 Adj.	0.494	0.740
FE: district	X	X
FE: year_month	X	X

6. Robustness Checks

6.1 Event Study (Dynamic Treatment Effects)

Tests for pre-treatment parallel trends and post-treatment dynamics.

# Create relative time periods (quarters relative to treatment)
analysis_data_event <- analysis_data %>%
  mutate(
    quarters_since_treatment = ifelse(
      date <= treatment_date,
      floor(as.numeric(difftime(date, treatment_date, units = "days")) / 90),
      ceiling(as.numeric(difftime(date, treatment_date, units = "days")) / 90)
    )
  ) %>%
  filter(quarters_since_treatment >= -8 & quarters_since_treatment <= 16) %>%
  mutate(
    rel_time = factor(quarters_since_treatment),
    rel_time = relevel(rel_time, ref = as.character(-1))  # Quarter before treatment as reference
  )

# Event study for NO2
event_no2 <- feols(
  NO2_mean ~ i(rel_time, treated, ref = -1) | district + year_month,
  data = analysis_data_event,
  cluster = ~district
)

# Event study for PM25
event_pm25 <- feols(
  PM25_mean ~ i(rel_time, treated, ref = -1) | district + year_month,
  data = analysis_data_event,
  cluster = ~district
)

# Plot event study
iplot(
  list("NO₂" = event_no2, "PM2.5" = event_pm25),
  main = "Event Study: Dynamic Treatment Effects",
  xlab = "Quarters Relative to M7 Opening (Quarter -1 = Reference)",
  ylab = "Estimated Coefficient (µg/m³)",
  ref.line = -1
)

6.2 Placebo Test (Fake Treatment Date)

Tests whether we detect false effects when using a fake treatment date in the pre-period.

# Use October 2019 as fake treatment date (one year before actual)
fake_treatment_date <- as.Date("2019-10-31")

placebo_data <- outcome_wide %>%
  filter(date <= treatment_date) %>%  # Only pre-treatment period
  mutate(
    fake_post = as.integer(date > fake_treatment_date),
    treated_x_fake_post = treated * fake_post,
    district = factor(district),
    year_month = factor(year_month)
  )

# Placebo NO2
placebo_no2 <- feols(
  NO2_mean ~ treated_x_fake_post | district + year_month,
  data = placebo_data,
  cluster = ~district
)

# Placebo PM25
placebo_pm25 <- feols(
  PM25_mean ~ treated_x_fake_post | district + year_month,
  data = placebo_data,
  cluster = ~district
)

# Results table
cat("\n=== Placebo Test Results ===\n")

## 
## === Placebo Test Results ===

modelsummary(
  list("NO₂ (Placebo)" = placebo_no2, "PM2.5 (Placebo)" = placebo_pm25),
  stars = c('*' = 0.1, '**' = 0.05, '***' = 0.01),
  coef_rename = c("treated_x_fake_post" = "Treated × Fake Post"),
  gof_map = c("nobs", "r.squared"),
  title = "Placebo Test: Fake Treatment Date (October 2019)",
  notes = "If parallel trends assumption holds, coefficients should be insignificant."
)

Placebo Test: Fake Treatment Date (October 2019)
	NO₂ (Placebo)	PM2.5 (Placebo)
* p < 0.1, p < 0.05, * p < 0.01
If parallel trends assumption holds, coefficients should be insignificant.
Treated × Fake Post	7.461**	-1.550
	(2.817)	(1.191)
Num.Obs.	123	101
R2	0.743	0.868

7. Summary and Interpretation

7.1 Main Results Summary

# Extract coefficients and create summary table
did_results <- tibble(
  Outcome = c("NO₂", "PM2.5"),
  Coefficient = c(
    coef(model_no2)["treated_x_post"], 
    coef(model_pm25)["treated_x_post"]
  ),
  SE = c(
    se(model_no2)["treated_x_post"], 
    se(model_pm25)["treated_x_post"]
  )
) %>%
  mutate(
    CI_Lower = Coefficient - 1.96 * SE,
    CI_Upper = Coefficient + 1.96 * SE,
    P_Value = 2 * (1 - pnorm(abs(Coefficient / SE))),
    Significant = case_when(
      P_Value < 0.01 ~ "***",
      P_Value < 0.05 ~ "**",
      P_Value < 0.1 ~ "*",
      TRUE ~ ""
    )
  )

cat("\n=== Main DiD Results ===\n")

## 
## === Main DiD Results ===

did_results %>%
  mutate(across(where(is.numeric), ~round(., 3))) %>%
  kable(caption = "DiD Treatment Effect Estimates") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

DiD Treatment Effect Estimates
Outcome	Coefficient	SE	CI_Lower	CI_Upper	P_Value	Significant
NO₂	-10.216	8.413	-26.705	6.272	0.225
PM2.5	2.482	0.805	0.904	4.060	0.002	***

7.2 Coefficient Plot

ggplot(did_results, aes(x = Outcome, y = Coefficient)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", linewidth = 1) +
  geom_point(size = 5, color = "#E41A1C") +
  geom_errorbar(
    aes(ymin = CI_Lower, ymax = CI_Upper),
    width = 0.2,
    linewidth = 1,
    color = "#E41A1C"
  ) +
  labs(
    title = "DiD Treatment Effects: Impact of M7 Metro Line on Air Quality",
    subtitle = "Point estimates with 95% confidence intervals",
    x = "Outcome Variable",
    y = "Treatment Effect (µg/m³)",
    caption = "Standard errors clustered at district level\nNegative values indicate pollution reduction"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 12),
    axis.text = element_text(size = 12),
    panel.grid.minor = element_blank()
  )

7.3 Interpretation

cat("\n=== Interpretation ===\n\n")

## 
## === Interpretation ===

# NO2 interpretation
no2_effect <- did_results$Coefficient[1]
no2_pval <- did_results$P_Value[1]
no2_sig <- did_results$Significant[1]

cat("NO₂ Results:\n")

## NO₂ Results:

cat(sprintf("  - Estimated effect: %.3f µg/m³ %s\n", no2_effect, no2_sig))

##   - Estimated effect: -10.216 µg/m³

cat(sprintf("  - P-value: %.4f\n", no2_pval))

##   - P-value: 0.2246

if (no2_pval < 0.05) {
  if (no2_effect < 0) {
    cat(sprintf("  - The M7 metro line is associated with a statistically significant REDUCTION\n"))
    cat(sprintf("    of %.2f µg/m³ in NO₂ concentration in treated districts.\n", abs(no2_effect)))
  } else {
    cat(sprintf("  - The M7 metro line is associated with a statistically significant INCREASE\n"))
    cat(sprintf("    of %.2f µg/m³ in NO₂ concentration in treated districts.\n", no2_effect))
  }
} else {
  cat("  - No statistically significant effect detected.\n")
}

##   - No statistically significant effect detected.

cat("\n")

# PM2.5 interpretation
pm25_effect <- did_results$Coefficient[2]
pm25_pval <- did_results$P_Value[2]
pm25_sig <- did_results$Significant[2]

cat("PM2.5 Results:\n")

## PM2.5 Results:

cat(sprintf("  - Estimated effect: %.3f µg/m³ %s\n", pm25_effect, pm25_sig))

##   - Estimated effect: 2.482 µg/m³ ***

cat(sprintf("  - P-value: %.4f\n", pm25_pval))

##   - P-value: 0.0021

if (pm25_pval < 0.05) {
  if (pm25_effect < 0) {
    cat(sprintf("  - The M7 metro line is associated with a statistically significant REDUCTION\n"))
    cat(sprintf("    of %.2f µg/m³ in PM2.5 concentration in treated districts.\n", abs(pm25_effect)))
  } else {
    cat(sprintf("  - The M7 metro line is associated with a statistically significant INCREASE\n"))
    cat(sprintf("    of %.2f µg/m³ in PM2.5 concentration in treated districts.\n", pm25_effect))
  }
} else {
  cat("  - No statistically significant effect detected.\n")
}

##   - The M7 metro line is associated with a statistically significant INCREASE
##     of 2.48 µg/m³ in PM2.5 concentration in treated districts.

8. Data Export

# Export final analysis dataset
write_csv(
  analysis_data,
  "analysis_dataset_district_month.csv"
)

# Export district-level aggregated trends
write_csv(
  trends_data,
  "district_trends_by_treatment.csv"
)

# Export regression results
write_csv(
  bind_rows(tidy_no2, tidy_pm25),
  "regression_results.csv"
)

# Export summary statistics
write_csv(
  did_results,
  "did_summary_results.csv"
)

cat("\n=== Data Export Complete ===\n")

## 
## === Data Export Complete ===

cat("Files created:\n")

## Files created:

cat("  - analysis_dataset_district_month.csv\n")

##   - analysis_dataset_district_month.csv

cat("  - district_trends_by_treatment.csv\n")

##   - district_trends_by_treatment.csv

cat("  - regression_results.csv\n")

##   - regression_results.csv

cat("  - did_summary_results.csv\n")

##   - did_summary_results.csv

9. Session Information

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0   knitr_1.51         scales_1.4.0       patchwork_1.3.2   
##  [5] modelsummary_2.5.0 broom_1.0.11       fixest_0.13.2      data.table_1.18.0 
##  [9] readxl_1.4.5       lubridate_1.9.4    forcats_1.0.1      stringr_1.6.0     
## [13] dplyr_1.1.4        purrr_1.2.0        readr_2.1.6        tidyr_1.3.2       
## [17] tibble_3.3.0       ggplot2_4.0.1      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6        bayestestR_0.17.0   xfun_0.55          
##  [4] bslib_0.9.0         insight_1.4.4       lattice_0.22-7     
##  [7] tzdb_0.5.0          numDeriv_2016.8-1.1 vctrs_0.6.5        
## [10] tools_4.5.2         generics_0.1.4      parallel_4.5.2     
## [13] datawizard_1.3.0    sandwich_3.1-1      pkgconfig_2.0.3    
## [16] tinytable_0.15.2    checkmate_2.3.3     RColorBrewer_1.1-3 
## [19] S7_0.2.1            stringmagic_1.2.0   lifecycle_1.0.4    
## [22] compiler_4.5.2      farver_2.1.2        textshaping_1.0.4  
## [25] htmltools_0.5.9     sass_0.4.10         yaml_2.3.12        
## [28] Formula_1.2-5       crayon_1.5.3        pillar_1.11.1      
## [31] jquerylib_0.1.4     cachem_1.1.0        nlme_3.1-168       
## [34] tidyselect_1.2.1    digest_0.6.39       performance_0.15.3 
## [37] stringi_1.8.7       labeling_0.4.3      fastmap_1.2.0      
## [40] grid_4.5.2          cli_3.6.5           magrittr_2.0.4     
## [43] withr_3.0.2         dreamerr_1.5.0      backports_1.5.0    
## [46] bit64_4.6.0-1       timechange_0.3.0    rmarkdown_2.30     
## [49] bit_4.6.0           cellranger_1.1.0    zoo_1.8-15         
## [52] hms_1.1.4           evaluate_1.0.5      parameters_0.28.3  
## [55] viridisLite_0.4.2   rlang_1.1.6         Rcpp_1.1.0         
## [58] glue_1.8.0          xml2_1.5.1          vroom_1.6.7        
## [61] svglite_2.2.2       rstudioapi_0.17.1   jsonlite_2.0.0     
## [64] R6_2.6.1            tables_0.9.33       systemfonts_1.3.1

Appendix: Data Quality Checks

A.1 Missing Data Pattern

cat("\n=== Missing Data Summary ===\n")

## 
## === Missing Data Summary ===

# Check missing data in raw input
missing_summary <- df_clean %>%
  select(ends_with("_NO2"), ends_with("_PM25")) %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  mutate(
    pct_missing = round(n_missing / nrow(df_clean) * 100, 2)
  ) %>%
  arrange(desc(pct_missing))

missing_summary %>%
  kable(caption = "Missing Data by Station-Pollutant") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Missing Data by Station-Pollutant
variable	n_missing	pct_missing
Esenyurt_PM25	52609	100.00
Kağıthane_1_NO2	6791	12.91
Kağıthane_1_PM25	6218	11.82
Esenyurt_NO2	5494	10.44
Avcılar_PM25	5242	9.96
Alibeyköy_NO2	4447	8.45
Kağıthane_2_NO2	3815	7.25
Alibeyköy_PM25	3565	6.78
Avcılar_NO2	2849	5.42
Kağıthane_2_PM25	2754	5.23
Esenler_NO2	1992	3.79
Bağcılar_PM25	1717	3.26
Esenler_PM25	1050	2.00
Bağcılar_NO2	715	1.36

A.2 District-Month Coverage

cat("\n=== District-Month Coverage ===\n")

## 
## === District-Month Coverage ===

# Create complete grid of district-months
all_months <- outcome_wide %>%
  expand(district, year_month = unique(year_month))

# Check which district-months we have data for
coverage <- all_months %>%
  left_join(
    outcome_wide %>% select(district, year_month) %>% mutate(has_data = 1),
    by = c("district", "year_month")
  ) %>%
  mutate(has_data = ifelse(is.na(has_data), 0, 1))

coverage_summary <- coverage %>%
  group_by(district) %>%
  summarise(
    total_months = n(),
    months_with_data = sum(has_data),
    pct_coverage = round(months_with_data / total_months * 100, 1)
  )

coverage_summary %>%
  kable(caption = "Data Coverage by District") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Data Coverage by District
district	total_months	months_with_data	pct_coverage
Avcılar	72	69	95.8
Bağcılar	72	72	100.0
Esenler	72	72	100.0
Esenyurt	72	64	88.9
Eyüpsultan	72	69	95.8
Kağıthane	72	72	100.0

End of Analysis

Impact of M7 Metro Line on Air Quality: Difference-in-Differences Analysis

Air Quality Analysis

2026-01-31