Data Preparation

# Load libraries
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(ggplot2)
library(plotly)

## Warning: package 'plotly' was built under R version 4.4.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Read and subset data
df <- read.csv("PROGRESA.csv", header = TRUE, sep = ",")
df$treat <- df$progresa1 
df$girl  <- df$sex1 
df <- subset(df, wave %in% c(1, 4), 
             select = c(sooloca, sooind_id, age1, hgc1, school, treat, girl, wave))
balanced_ids <- intersect(df[df$wave == 4, "sooind_id"],
                          df[df$wave == 1 & df$age1 >= 6 & df$age1 <= 16, "sooind_id"])
df2 <- df[df$sooind_id %in% balanced_ids, ]
df2$post <- ifelse(df2$wave == 4, 1, 0)
dfPre  <- subset(df2, post == 0)
dfPost <- subset(df2, post == 1)

Baseline Balance and Plots

return_mean_by_treatment <- function(x) {
  c(tapply(x, dfPre$treat, mean), tapply(x, dfPre$treat, var))
}
vars <- c("girl", "age1", "hgc1", "school")
output <- sapply(dfPre[vars], return_mean_by_treatment)
means <- output[1:2, ]
diffs <- means[2, ] - means[1, ]
N1 <- sum(dfPre$treat)
N0 <- sum(1 - dfPre$treat)
pooled.sd <- sqrt(((N0 - 1) * output[3, ] + (N1 - 1) * output[4, ]) / (N0 + N1 - 2))
std.diffs <- diffs / pooled.sd
results0 <- t(rbind(means, diffs, std.diffs))
colnames(results0) <- c("Control", "Treated", "Diff.", "Std Diff")
rownames(results0) <- c("Girl", "Age", "Highest Grade", "Enrolled")
stargazer(results0, type = "HTML", digits = 2, title = "Balance at Baseline")

Baseline Enrollment Plots by Age

Boys

MeanSchC.b_age <- with(subset(dfPre, treat == 0 & girl == 0), tapply(school, age1, mean))
MeanSchT.b_age <- with(subset(dfPre, treat == 1 & girl == 0), tapply(school, age1, mean))
tab.b_age <- data.frame(
  Age = rep(names(MeanSchC.b_age), 2),
  Group = rep(c("Control", "Treated"), each = length(MeanSchC.b_age)),
  MeanSchool = c(MeanSchC.b_age, MeanSchT.b_age)
)
plotPre.b_age <- ggplot(tab.b_age, aes(x = Age, y = MeanSchool, fill = Group)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  xlab("Age") + ylab("Mean School Enrollment") +
  ggtitle("Boys: School Enrollment by Age, Pre Period")
ggplotly(plotPre.b_age, tooltip = "y")

Girls

MeanSchC.g_age <- with(subset(dfPre, treat == 0 & girl == 1), tapply(school, age1, mean))
MeanSchT.g_age <- with(subset(dfPre, treat == 1 & girl == 1), tapply(school, age1, mean))
tab.g_age <- data.frame(
  Age = rep(names(MeanSchC.g_age), 2),
  Group = rep(c("Control", "Treated"), each = length(MeanSchC.g_age)),
  MeanSchool = c(MeanSchC.g_age, MeanSchT.g_age)
)
plotPre.g_age <- ggplot(tab.g_age, aes(x = Age, y = MeanSchool, fill = Group)) +
  geom_col(position = position_dodge(0.8), width = 0.7) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  xlab("Age") + ylab("Mean School Enrollment") +
  ggtitle("Girls: School Enrollment by Age, Pre Period")
ggplotly(plotPre.g_age, tooltip = "y")

Treatment Effects by Age and Sex (Post-Treatment)

MeansC_age <- with(dfPost, tapply(school, list(treat, girl, age1), mean))
dimnames(MeansC_age)[[1]] <- c("Control", "Treated")
dimnames(MeansC_age)[[2]] <- c("Boys", "Girls")
EffSchBoy_age <- MeansC_age[2, 1, ] - MeansC_age[1, 1, ]
EffSchGirl_age <- MeansC_age[2, 2, ] - MeansC_age[1, 2, ]
AgeLevels <- as.factor(names(EffSchBoy_age))
tabEff_age <- data.frame(
  Age = rep(AgeLevels, 2),
  sex = c(rep("Boy", length(EffSchBoy_age)), rep("Girl", length(EffSchGirl_age))),
  EffSch = c(EffSchBoy_age, EffSchGirl_age)
)
plotEff_age <- ggplot(tabEff_age, aes(x = Age, y = EffSch, fill = sex)) +
  geom_col(width = 0.7, position = position_dodge(0.8)) +
  theme_bw(base_size = 10) + 
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_y_continuous(breaks = seq(-0.1, 0.1, 0.05)) +
  xlab("Age") + ylab("Estimated Treatment Effect on Enrollment") +
  ggtitle("Estimated Treatment Effects on Enrollment by Age and Sex")
ggplotly(plotEff_age, tooltip = "y")

Observation Counts by Age, Sex, and Treatment (Post-Treatment)

# Count unique children
person.counts <- with(dfPost, 
                      tapply(sooind_id, list(treat, girl, age1), function(x) length(unique(x)))
)
dimnames(person.counts)[[1]] <- c("Control", "Treated")
dimnames(person.counts)[[2]] <- c("Boys", "Girls")
ages <- sort(unique(dfPost$age1))
Age <- as.factor(rep(ages, 2))
Group <- c(rep("Control", length(ages)), rep("Treated", length(ages)))
numbers.boys <- matrix(c(person.counts[1, 1, ], person.counts[2, 1, ]),
                        nrow = length(ages) * 2, ncol = 1)
tab.boys <- data.frame(Age, Group, numbers.boys)
plot.n.boys <- ggplot(tab.boys, aes(x = Age, y = numbers.boys, fill = Group)) +
  geom_col(width = 0.7, position = position_dodge(0.8)) +
  theme_bw(base_size = 11) + 
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_y_continuous(limits = c(0, 1100), breaks = seq(0, 1100, 100)) +
  xlab("Age") + ylab("Number of Boy Observations") +
  ggtitle("Number of Boy Observations, Treated and Control")

numbers.girls <- matrix(c(person.counts[1, 2, ], person.counts[2, 2, ]),
                         nrow = length(ages) * 2, ncol = 1)
tab.girls <- data.frame(Age, Group, numbers.girls)
plot.n.girls <- ggplot(tab.girls, aes(x = Age, y = numbers.girls, fill = Group)) +
  geom_col(width = 0.7, position = position_dodge(0.8)) +
  theme_bw(base_size = 11) + 
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_y_continuous(limits = c(0, 1100), breaks = seq(0, 1100, 100)) +
  xlab("Age") + ylab("Number of Girl Observations") +
  ggtitle("Number of Girl Observations, Treated and Control")

# Count unique villages
village.counts <- with(dfPost, 
                       tapply(sooloca, list(treat, girl, age1), function(x) length(unique(x)))
)
dimnames(village.counts)[[1]] <- c("Control", "Treated")
dimnames(village.counts)[[2]] <- c("Boys", "Girls")
numbers.boys.v <- matrix(c(village.counts[1, 1, ], village.counts[2, 1, ]),
                          nrow = length(ages) * 2, ncol = 1)
tab.boys.v <- data.frame(Age, Group, numbers.boys.v)
plot.n.boys.v <- ggplot(tab.boys.v, aes(x = Age, y = numbers.boys.v, fill = Group)) +
  geom_col(width = 0.7, position = position_dodge(0.8)) +
  theme_bw(base_size = 11) + 
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_y_continuous(limits = c(0, 1100), breaks = seq(0, 1100, 100)) +
  xlab("Age") + ylab("Number of Villages with Boy Observations") +
  ggtitle("Number of Villages with Boy Observations")

numbers.girls.v <- matrix(c(village.counts[1, 2, ], village.counts[2, 2, ]),
                           nrow = length(ages) * 2, ncol = 1)
tab.girls.v <- data.frame(Age, Group, numbers.girls.v)
plot.n.girls.v <- ggplot(tab.girls.v, aes(x = Age, y = numbers.girls.v, fill = Group)) +
  geom_col(width = 0.7, position = position_dodge(0.8)) +
  theme_bw(base_size = 11) + 
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_y_continuous(limits = c(0, 1100), breaks = seq(0, 1100, 100)) +
  xlab("Age") + ylab("Number of Villages with Girl Observations") +
  ggtitle("Number of Villages with Girl Observations")

# Display interactive plots for counts
ggplotly(plot.n.boys, tooltip = "y")

ggplotly(plot.n.girls, tooltip = "y")

ggplotly(plot.n.boys.v, tooltip = "y")

ggplotly(plot.n.girls.v, tooltip = "y")

# Additional Code for Discussion (Part e)

# 1. Baseline Enrollment Means for Age 12
baseline_boys_age12 <- with(subset(dfPre, age1 == 12 & girl == 0), tapply(school, treat, mean))
baseline_girls_age12 <- with(subset(dfPre, age1 == 12 & girl == 1), tapply(school, treat, mean))
cat("Baseline Enrollment for Boys at Age 12 (Control vs. Treated):\n")

## Baseline Enrollment for Boys at Age 12 (Control vs. Treated):

print(baseline_boys_age12)

##         0         1 
## 0.9125874 0.8905263

cat("Baseline Enrollment for Girls at Age 12 (Control vs. Treated):\n")

## Baseline Enrollment for Girls at Age 12 (Control vs. Treated):

print(baseline_girls_age12)

##         0         1 
## 0.8326693 0.8000000

# 2. Treatment Effects for Age 12 (Post-Treatment)
post_boys_age12 <- with(subset(dfPost, age1 == 12 & girl == 0), tapply(school, treat, mean))
post_girls_age12 <- with(subset(dfPost, age1 == 12 & girl == 1), tapply(school, treat, mean))
treatment_effect_boys_age12 <- post_boys_age12["1"] - post_boys_age12["0"]
treatment_effect_girls_age12 <- post_girls_age12["1"] - post_girls_age12["0"]
cat("Estimated Treatment Effect for Boys at Age 12:\n")

## Estimated Treatment Effect for Boys at Age 12:

print(treatment_effect_boys_age12)

##          1 
## 0.06456323

cat("Estimated Treatment Effect for Girls at Age 12:\n")

## Estimated Treatment Effect for Girls at Age 12:

print(treatment_effect_girls_age12)

##          1 
## 0.04970651

# 3. Average Estimated Treatment Effects Across All Ages
avg_eff_boys <- mean(EffSchBoy_age, na.rm = TRUE)
avg_eff_girls <- mean(EffSchGirl_age, na.rm = TRUE)
cat("Average Estimated Treatment Effect (Boys):\n")

## Average Estimated Treatment Effect (Boys):

print(avg_eff_boys)

## [1] 0.00340268

cat("Average Estimated Treatment Effect (Girls):\n")

## Average Estimated Treatment Effect (Girls):

print(avg_eff_girls)

## [1] -0.03398594

# 4. Count Observations and Villages for Age 12 (Post-Treatment)
# Unique children counts
count_boys_age12 <- with(subset(dfPost, age1 == 12 & girl == 0), length(unique(sooind_id)))
count_girls_age12 <- with(subset(dfPost, age1 == 12 & girl == 1), length(unique(sooind_id)))
cat("Number of Boys at Age 12 (Post-Treatment):\n")

## Number of Boys at Age 12 (Post-Treatment):

print(count_boys_age12)

## [1] 856

cat("Number of Girls at Age 12 (Post-Treatment):\n")

## Number of Girls at Age 12 (Post-Treatment):

print(count_girls_age12)

## [1] 864

# Unique villages counts
village_boys_age12 <- with(subset(dfPost, age1 == 12 & girl == 0), length(unique(sooloca)))
village_girls_age12 <- with(subset(dfPost, age1 == 12 & girl == 1), length(unique(sooloca)))
cat("Number of Villages for Boys at Age 12 (Post-Treatment):\n")

## Number of Villages for Boys at Age 12 (Post-Treatment):

print(village_boys_age12)

## [1] 355

cat("Number of Villages for Girls at Age 12 (Post-Treatment):\n")

## Number of Villages for Girls at Age 12 (Post-Treatment):

print(village_girls_age12)

## [1] 358

Discussion of Results

Baseline Balance:

Between baseline 12-year-old boys, the two groups are quite similar. The control group boys had a 91.3% enrollment rate, compared to 89.1% in the treated group. The girls’ rates were 83.3% in the control group and 80.0% in the treated group. These small discrepancies tell us that our treatment allocation worked—both groups are balanced across observable characteristics before Progresa starts.

Treatment Impacts at Age 12:

After the program intervention, results show a positive impact on the age of 12. Treated boys show an improvement of approximately 6.5 percentage points in attendance compared to control boys, and treated girls have a 5.0 percentage point increase compared to control girls. This indicates that Progresa seems to enhance school attendance at age 12.

Average Treatment Impacts Across Ages:

When we take the treatment effect across all ages and average it, the narrative changes. For boys, the average is barely any increase at all (only a 0.34 percentage point change), and for girls, it is low at -3.4 percentage points or so. This means that while the positive effect at age 12 is evident, the effect in general across ages is much less, highlighting that Progresa’s effect could also vary depending on the age group.

Observations and Sample Sizes:

The 12-year-old model has large sample sizes—856 boys and 864 girls from 355 and 358 distinct villages, respectively. These numbers lead us to believe that the estimates at this age are trustworthy. The count plots, however, show that most of the data are concentrated in the mid-range ages with few data points toward the tails. This means that estimates from low-sample age groups should be taken with a pinch of salt.

Visual Summary:

Baseline results: Baseline charts of the control and treated groups show high levels of enrollment at younger ages with a consistent decline as age progresses. The negligible group differences between control and treated confirm that the groups were equivalent before the intervention.
Treatment Effect Graph: The treatment effect graph by age and sex is strongly positive at age 12, although the average effect across all ages is zero for boys and negative for girls.
Count Plots: The village and observation count plots show that most of the observations are concentrated at age 12 with fewer at the extreme ages, favoring conservative interpretation for less-represented ages.

Conclusion:

More broadly, the findings suggest that Progresa has a positive impact on enrollment at critical ages—namely at age 12, when treated boys and girls experience increases of about 6.5 and 5.0 percentage points, respectively. But taking a broader view across all ages, the treatment effect is small for boys and slightly negative for girls. This indicates that the impact of the program is not uniform and perhaps warrants specially aimed interventions across different age groups. The large sample sizes at age 12 provide a confidence in such estimates, though smaller cell numbers at the extremes of ages do warn against making inference to the peripheral portion of the study group outside the central ages.

Econ 136 PSET 3

Juan Carlos Gonzalez

2025-03-06