Caso 2: Preparar la base para Diferencias en Diferencias

Este documento muestra cómo reorganizar la base de datos data_full de formato largo a formato ancho (una fila por hogar) para combinar el emparejamiento por puntaje de propensión con la metodología de Diferencias en Diferencias (DiD).

1. Cargar paquetes necesarios

library(tidyverse)
library(haven)
library(MatchIt)
library(modelr)
library(magrittr)

# NOTE(review): machine-specific absolute path; a project-relative path
# (e.g. an RStudio project or here::here()) would make this portable.
setwd("C:/Users/mosia/Desktop/Curso de especializacion/control de lectura en Rstudio")

# Long-format data: one row per household and survey round.
data_full <- haven::read_dta("evaluation.dta")

# Reshape to wide format: each listed variable becomes a pair of columns
# suffixed with the round value (`_0`, `_1`).
#
# `hospital` is pivoted together with the other round-specific variables.
# Every column NOT listed in `values_from` is used by pivot_wider() as a row
# identifier, so a household whose `hospital` value differs between rounds
# would otherwise be split into two wide rows; the filter below would then
# keep the row whose `_1` columns are all NA, and those households would be
# silently dropped from any model on `health_expenditures_1`.
# NOTE(review): assumes `hospital` is the only remaining column that can vary
# by round -- confirm against the codebook.
data <- data_full %>%
  tidyr::pivot_wider(
    names_from = round,
    values_from = c(
      "health_expenditures", "poverty_index", "age_hh", "age_sp",
      "educ_hh", "educ_sp", "female_hh", "indigenous", "hhsize",
      "dirtfloor", "bathroom", "land", "hospital_distance", "hospital"
    )
  ) %>%
  # Keep only rows that have a round-0 expenditure measurement.
  dplyr::filter(!base::is.na(health_expenditures_0))

# Check the result: number of rows and columns of the wide data set
dim(data)       
## [1] 9913   34
# Preview the first 10 rows of the wide data set
head(data, 10) 
## # A tibble: 10 × 34
##    locality_identifier household_identifier treatment_locality
##                  <dbl>                <dbl>              <dbl>
##  1                  26                    5                  1
##  2                  26                   11                  1
##  3                  26                   13                  1
##  4                  26                   16                  1
##  5                  26                   21                  1
##  6                  26                   22                  1
##  7                  26                   23                  1
##  8                  26                   25                  1
##  9                  26                   27                  1
## 10                  26                   28                  1
## # ℹ 31 more variables: promotion_locality <dbl>, eligible <dbl>,
## #   enrolled <dbl>, enrolled_rp <dbl>, hospital <dbl>,
## #   health_expenditures_0 <dbl>, health_expenditures_1 <dbl>,
## #   poverty_index_0 <dbl>, poverty_index_1 <dbl>, age_hh_0 <dbl>,
## #   age_hh_1 <dbl>, age_sp_0 <dbl>, age_sp_1 <dbl>, educ_hh_0 <dbl>,
## #   educ_hh_1 <dbl>, educ_sp_0 <dbl>, educ_sp_1 <dbl>, female_hh_0 <dbl>,
## #   female_hh_1 <dbl>, indigenous_0 <dbl>, indigenous_1 <dbl>, …
# Number of rows kept in the wide data set
nrow(data)
## [1] 9913

Pregunta 3

# Propensity-score matching: a logistic regression of `enrolled` on the
# round-0 household covariates provides the distance, and each treated unit
# is paired with its nearest control (1:1, without replacement).
model_lg2 <- MatchIt::matchit(
  formula = enrolled ~
    age_hh_0 + educ_hh_0 + age_sp_0 + educ_sp_0 + female_hh_0 +
    indigenous_0 + hhsize_0 + dirtfloor_0 + bathroom_0 + hospital_distance_0,
  data = data,
  method = "nearest",
  distance = "glm",
  link = "logit",
  ratio = 1,
  replace = FALSE
)

# Print the matching summary (method, distance, sample sizes, covariates)
print(model_lg2)
## A `matchit` object
##  - method: 1:1 nearest neighbor matching without replacement
##  - distance: Propensity score
##              - estimated with logistic regression
##  - number of obs.: 9913 (original), 5928 (matched)
##  - target estimand: ATT
##  - covariates: age_hh_0, educ_hh_0, age_sp_0, educ_sp_0, female_hh_0, indigenous_0, hhsize_0, dirtfloor_0, bathroom_0, hospital_distance_0

Pregunta 4

# Density of the predicted values from the model stored in the matchit
# object, computed on the full wide data set and split by enrollment status.
data %>%
  dplyr::mutate(
    enrolled_lab = base::ifelse(enrolled == 1, "Sí", "No")
  ) %>%
  modelr::add_predictions(
    model = model_lg2$model,
    type = "response",
    var = "pred"
  ) %>%
  ggplot2::ggplot(
    ggplot2::aes(x = pred, fill = enrolled_lab, colour = enrolled_lab)
  ) +
  ggplot2::geom_density(alpha = 1 / 3)

Pregunta 5

# Extract the matched sample from the matchit object.
data_ps2 <- MatchIt::match.data(model_lg2)

# Density of the `distance` column in the matched sample, split by
# enrollment status.
ggplot2::ggplot(
  data = dplyr::mutate(
    data_ps2,
    enrolled_lab = base::ifelse(enrolled == 1, "Sí", "No")
  ),
  mapping = ggplot2::aes(
    x = distance,
    fill = enrolled_lab,
    colour = enrolled_lab
  )
) +
  ggplot2::geom_density(alpha = 1 / 3)

Pregunta 6

# Weighted linear regression on the matched sample: round-1 health
# expenditures on the enrollment indicator, using the weights column of
# the matched data.
model_ps2 <- stats::lm(
  formula = health_expenditures_1 ~ enrolled,
  data = data_ps2,
  weights = data_ps2$weights
)

# Coefficient table and fit statistics
summary(model_ps2)
## 
## Call:
## stats::lm(formula = health_expenditures_1 ~ enrolled, data = data_ps2, 
##     weights = data_ps2$weights)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.658 -5.545 -2.068  2.913 80.228 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.8652     0.1555  114.90   <2e-16 ***
## enrolled    -10.0166     0.2223  -45.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.21 on 5456 degrees of freedom
##   (470 observations deleted due to missingness)
## Multiple R-squared:  0.2712, Adjusted R-squared:  0.271 
## F-statistic:  2030 on 1 and 5456 DF,  p-value: < 2.2e-16

Pregunta 7

# Long-format panel restricted to the matched households: the matched ids
# (with their matching weights) are joined back onto the original long data,
# so each matched household contributes one row per round present in
# `data_full`.
# The join key is stated explicitly instead of relying on a natural join, so
# that a column added to either table cannot silently change the key.
data_ps_dd2 <- dplyr::left_join(
  x = data_ps2 %>% dplyr::select(household_identifier, weights),
  y = data_full,
  by = "household_identifier"
)
## Joining with `by = join_by(household_identifier)`
# Difference-in-differences regression on the matched households (long
# format): the `enrolled:round` interaction is the DiD estimate.
# The matching weights carried over by the join are applied here, consistent
# with the cross-sectional model fitted on the matched sample; without them
# the estimate would ignore the matching design whenever weights differ
# from 1 (e.g. matching with replacement or with ratio > 1).
model_ps_dd2 <- stats::lm(
  formula = health_expenditures ~ enrolled * round,
  data = data_ps_dd2,
  weights = weights
)

# Coefficient table and fit statistics
summary(model_ps_dd2)
## 
## Call:
## stats::lm(formula = health_expenditures ~ enrolled * round, data = data_ps_dd2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.581 -4.461 -1.049  2.946 80.306 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     15.0616     0.1217 123.800  < 2e-16 ***
## enrolled        -0.5719     0.1721  -3.324  0.00089 ***
## round            2.7264     0.1721  15.846  < 2e-16 ***
## enrolled:round  -9.3763     0.2433 -38.535  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.624 on 11852 degrees of freedom
## Multiple R-squared:  0.2337, Adjusted R-squared:  0.2335 
## F-statistic:  1205 on 3 and 11852 DF,  p-value: < 2.2e-16