library(haven)
data <- read_dta("C:/Users/User/Documents/data_Deming_2008_0217.dta")
# Load required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(purrr)
# Read the data
# Replace negative values with NA (Stata's missing value coding)
data <- data %>%
mutate(across(ChildID:hhID, ~ ifelse(. < 0, NA, .)))
# Create age variables with alternative calculations
years <- c("86", "88", "90", "92", "94", "96", "98", "100", "102", "104")
# Create Age2_Mo variables using PPVT age as backup
for(y in years) {
age_var <- paste0("Age_Mo", y)
ppvt_var <- paste0("PPVTAge", y)
age2_var <- paste0("Age2_Mo", y)
data <- data %>%
mutate(!!age2_var := ifelse(!is.na(.data[[age_var]]), .data[[age_var]],
ifelse(!is.na(.data[[ppvt_var]]), .data[[ppvt_var]], NA)))
}
# Fill in missing age values using adjacent survey years
age_pairs <- list(
c("86", "88"), c("88", "86"), c("88", "90"), c("90", "88"),
c("90", "92"), c("92", "90"), c("92", "94"), c("94", "92"),
c("94", "96"), c("96", "94"), c("96", "98"), c("98", "96"),
c("98", "100"), c("100", "98"), c("100", "102"), c("102", "100"),
c("102", "104"), c("104", "102")
)
for(pair in age_pairs) {
y1 <- pair[1]
y2 <- pair[2]
age2_var1 <- paste0("Age2_Mo", y1)
age2_var2 <- paste0("Age2_Mo", y2)
if(grepl("86|88|90|92|94|96|98|100", y2)) { # y2 is the later year
data <- data %>%
mutate(!!age2_var1 := ifelse(is.na(.data[[age2_var1]]) &
!is.na(.data[[age2_var2]]) &
.data[[age2_var2]] >= 25,
.data[[age2_var2]] - 24,
.data[[age2_var1]]))
} else { # y2 is the earlier year
data <- data %>%
mutate(!!age2_var1 := ifelse(is.na(.data[[age2_var1]]) &
!is.na(.data[[age2_var2]]),
.data[[age2_var2]] + 24,
.data[[age2_var1]]))
}
}
# Create age in years variables
for(x in years) {
age_mo_var <- paste0("Age_Mo", x)
age2_mo_var <- paste0("Age2_Mo", x)
age_yr_var <- paste0("Age_Yr", x)
age2_yr_var <- paste0("Age2_Yr", x)
data <- data %>%
mutate(
!!age_yr_var := ifelse(.data[[age_mo_var]] < 12, 0, NA),
!!age2_yr_var := ifelse(.data[[age2_mo_var]] < 12, 0, NA)
)
for(y in 1:37) {
data <- data %>%
mutate(
!!age_yr_var := ifelse(.data[[age_mo_var]] >= (12*y) &
.data[[age_mo_var]] < (12*y + 12),
y, .data[[age_yr_var]]),
!!age2_yr_var := ifelse(.data[[age2_mo_var]] >= (12*y) &
.data[[age2_mo_var]] < (12*y + 12),
y, .data[[age2_yr_var]])
)
}
}
# Create sample eligibility variables
for(y in c("86", "88", "90")) {
age_var <- paste0("Age_Mo", y)
age2_var <- paste0("Age2_Mo", y)
elig1_var <- paste0("Elig1_", y)
elig2_var <- paste0("Elig2_", y)
data <- data %>%
group_by(MotherID) %>%
mutate(
temp_count1 = sum(!is.na(.data[[age_var]]) & .data[[age_var]] > 47, na.rm = TRUE),
temp_count2 = sum(!is.na(.data[[age2_var]]) & .data[[age2_var]] > 47, na.rm = TRUE),
!!elig1_var := ifelse(temp_count1 > 1, 1, 0),
!!elig2_var := ifelse(temp_count2 > 1, 1, 0)
) %>%
ungroup() %>%
select(-temp_count1, -temp_count2)
}
# Handle deceased respondents
for(x in c("86", "88", "90")) {
res_var <- paste0("Res", x)
dead_var <- paste0("Dead", x)
elig1_var <- paste0("Elig1_", x)
elig2_var <- paste0("Elig2_", x)
data <- data %>%
mutate(
!!dead_var := ifelse(.data[[res_var]] == 8, 1, 0),
!!elig1_var := ifelse(.data[[dead_var]] == 1, NA, .data[[elig1_var]]),
!!elig2_var := ifelse(.data[[dead_var]] == 1, NA, .data[[elig2_var]])
)
}
data <- data %>%
mutate(Deceased = ifelse(Res104 == 8, 1, 0)) %>%
select(-matches("^Dead"))
# Create Head Start and Preschool participation variables
for(y in c("1", "2")) {
hs90_var <- paste0("HS", y, "_90")
pre90_var <- paste0("Pre", y, "_90")
none90_var <- paste0("None", y, "_90")
elig_var <- paste0("Elig", y, "_90")
data <- data %>%
mutate(
!!hs90_var := pmax(Ever_HS88, Ever_HS90, na.rm = TRUE),
!!pre90_var := pmax(Ever_Preschool88, Ever_Preschool90, na.rm = TRUE),
!!hs90_var := ifelse(.data[[elig_var]] == 1 & is.na(.data[[hs90_var]]), 0, .data[[hs90_var]]),
!!pre90_var := ifelse(.data[[elig_var]] == 1 & is.na(.data[[pre90_var]]), 0, .data[[pre90_var]]),
!!pre90_var := ifelse(.data[[hs90_var]] == 1, 0, .data[[pre90_var]]),
!!none90_var := ifelse(.data[[elig_var]] == 1, 1, NA),
!!none90_var := ifelse(.data[[hs90_var]] == 1 | .data[[pre90_var]] == 1, 0, .data[[none90_var]])
)
}
# Alternative participation definition (Rule 3)
data <- data %>%
mutate(
HS3_90 = HS2_90,
Pre3_90 = Pre2_90,
None3_90 = None2_90,
HS3_90 = ifelse((Ever_HS88 == 1 & Ever_HS90 == 0) |
(HowLong_HS88 == 1 | HowLong_HS90 == 1), 0, HS3_90),
Pre3_90 = ifelse(Ever_Preschool88 == 1 & Ever_Preschool90 == 0, 0, Pre3_90),
None3_90 = ifelse(HS3_90 == 0 & Pre3_90 == 0, 1, None3_90)
)
# Create count of eligible siblings
for(y in c("86", "88", "90")) {
for(x in c("1", "2", "3")) {
age_var <- ifelse(x == "1", paste0("Age_Mo", y), paste0("Age2_Mo", y))
num_elig_var <- paste0("Num", x, "Elig", y)
data <- data %>%
group_by(MotherID) %>%
mutate(!!num_elig_var := sum(!is.na(.data[[age_var]]) & .data[[age_var]] > 47, na.rm = TRUE)) %>%
ungroup()
}
}
# Create fixed effects sample variables
for(x in c("1", "2", "3")) {
for(y in "90") {
hs_var <- paste0("HS", x, "_", y)
pre_var <- paste0("Pre", x, "_", y)
none_var <- paste0("None", x, "_", y)
num_elig_var <- paste0("Num", x, "Elig", y)
hs_fe_var <- paste0("HS", x, "_FE", y)
pre_fe_var <- paste0("Pre", x, "_FE", y)
none_fe_var <- paste0("None", x, "_FE", y)
data <- data %>%
group_by(MotherID) %>%
mutate(
hs_count = sum(.data[[hs_var]] == 1, na.rm = TRUE),
pre_count = sum(.data[[pre_var]] == 1, na.rm = TRUE),
none_count = sum(.data[[none_var]] == 1, na.rm = TRUE),
hs_count = ifelse(hs_count == .data[[num_elig_var]], NA, hs_count),
pre_count = ifelse(pre_count == .data[[num_elig_var]], NA, pre_count),
none_count = ifelse(none_count == .data[[num_elig_var]], NA, none_count),
!!hs_fe_var := ifelse(!is.na(hs_count), 1, NA),
!!pre_fe_var := ifelse(!is.na(pre_count), 1, NA),
!!none_fe_var := ifelse(!is.na(none_count), 1, NA)
) %>%
mutate(
!!hs_fe_var := ifelse(!is.na(.data[[pre_fe_var]]) | !is.na(.data[[none_fe_var]]), 0, .data[[hs_fe_var]]),
!!pre_fe_var := ifelse(!is.na(.data[[hs_fe_var]]) | !is.na(.data[[none_fe_var]]), 0, .data[[pre_fe_var]]),
!!none_fe_var := ifelse(!is.na(.data[[hs_fe_var]]) | !is.na(.data[[pre_fe_var]]), 0, .data[[none_fe_var]])
) %>%
ungroup() %>%
select(-hs_count, -pre_count, -none_count)
}
}
# Create final preschool categorization variables
data <- data %>%
mutate(
PreK = case_when(
HS2_90 == 1 ~ 1,
Pre2_90 == 1 ~ 2,
None2_90 == 1 ~ 3,
TRUE ~ NA_real_
),
PreK_FE = case_when(
HS2_FE90 == 1 ~ 1,
Pre2_FE90 == 1 ~ 2,
None2_FE90 == 1 ~ 3,
TRUE ~ NA_real_
)
)
# Remove temporary variables if needed
# data <- data %>% select(-matches("^temp"))
#Data selection ends#########################################################################################################################
# Create demographic variables
data <- data %>%
mutate(
Hispanic = ifelse(Race_Child == 1, 1, 0),
Black = ifelse(Race_Child == 2, 1, 0),
White = ifelse(Race_Child == 3, 1, 0),
NonBlack = ifelse(Race_Child != 2 & !is.na(Race_Child), 1, 0)
)
# Inflation adjustment factors to 2004 dollars
inflation_factors <- c(
"78" = 2.82, "79" = 2.54, "80" = 2.24, "81" = 2.03, "82" = 1.90,
"83" = 1.85, "84" = 1.78, "85" = 1.71, "86" = 1.68, "87" = 1.62,
"88" = 1.55, "89" = 1.48, "90" = 1.41, "91" = 1.35, "92" = 1.31,
"93" = 1.27, "95" = 1.21, "97" = 1.15, "99" = 1.10, "101" = 1.04
)
# Apply inflation adjustments
for(year in names(inflation_factors)) {
var_name <- paste0("NetFamInc", year)
if(var_name %in% names(data)) {
data[[var_name]] <- data[[var_name]] * inflation_factors[year]
}
}
# Create permanent income measure
income_vars <- paste0("NetFamInc", c("78", "79", "80", "81", "82", "83", "84", "85", "86", "87",
"88", "89", "90", "91", "92", "93", "95", "97", "99", "101", "103"))
data <- data %>%
mutate(
PermInc = rowMeans(select(., all_of(income_vars)), na.rm = TRUE),
lnPermInc = log(PermInc),
PermInc_std = as.numeric(scale(PermInc))
)
# Clean and create mother's education variables
education_vars <- names(data)[grepl("HighGrade_Moth", names(data))]
data <- data %>%
mutate(across(all_of(education_vars), ~ ifelse(. == 95, NA, .))) %>%
mutate(
MothED = do.call(pmax, c(select(., all_of(education_vars)), na.rm = TRUE)),
MomDropout = ifelse(MothED < 12, 1, 0),
MomDropout = ifelse(is.na(MothED), NA, MomDropout),
MomHS = ifelse(MothED == 12, 1, 0),
MomHS = ifelse(is.na(MothED), NA, MomHS),
MomSomeColl = ifelse(MothED >= 13 & !is.na(MothED), 1, 0),
MomSomeColl = ifelse(is.na(MothED), NA, MomSomeColl)
)
# Create age-adjusted maternal AFQT score
age_adjustments <- c(
"14" = 35.60881/28.79544, "15" = 35.60881/32.86273, "16" = 35.60881/32.86273,
"17" = 35.60881/36.3544, "18" = 35.60881/33.45777, "19" = 35.60881/36.84,
"20" = 35.60881/41.84536, "21" = 35.60881/40.95177, "22" = 35.60881/42.82069
)
data <- data %>%
mutate(
AgeAFQT = AFQT_Pct81_REV,
AgeAFQT = case_when(
Age_Mom79 == 14 ~ AgeAFQT * age_adjustments["14"],
Age_Mom79 == 15 ~ AgeAFQT * age_adjustments["15"],
Age_Mom79 == 16 ~ AgeAFQT * age_adjustments["16"],
Age_Mom79 == 17 ~ AgeAFQT * age_adjustments["17"],
Age_Mom79 == 18 ~ AgeAFQT * age_adjustments["18"],
Age_Mom79 == 19 ~ AgeAFQT * age_adjustments["19"],
Age_Mom79 == 20 ~ AgeAFQT * age_adjustments["20"],
Age_Mom79 == 21 ~ AgeAFQT * age_adjustments["21"],
Age_Mom79 == 22 ~ AgeAFQT * age_adjustments["22"],
TRUE ~ AgeAFQT
),
AgeAFQT_std = as.numeric(scale(AgeAFQT))
)
# Impute missing AFQT scores
data <- data %>%
mutate(
impAFQT_std = ifelse(is.na(AgeAFQT_std),
predict(lm(AgeAFQT_std ~ Black + Hispanic + Age_Moth_Birth,
data = data), newdata = data),
AgeAFQT_std)
)
# Generate Table 1 statistics
table1_vars <- c("PermInc", "MomDropout", "MomSomeColl", "AgeAFQT_std", "HighGrade_GMom79")
for(race_group in c("Black", "NonBlack")) {
subset_data <- data %>% filter(.data[[race_group]] == 1)
# Full sample table
table1_full <- subset_data %>%
group_by(PreK) %>%
summarise(across(all_of(table1_vars),
list(mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
count = ~sum(!is.na(.))),
.names = "{.col}_{.fn}"))
# Fixed effects sample table
table1_fe <- subset_data %>%
group_by(PreK_FE) %>%
summarise(across(all_of(table1_vars),
list(mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
count = ~sum(!is.na(.))),
.names = "{.col}_{.fn}"))
write.table(table1_full, paste0("table1-", race_group, ".txt"), sep = "\t", row.names = FALSE)
write.table(table1_fe, paste0("table1-", race_group, "FE.txt"), sep = "\t", row.names = FALSE)
}
# Create within-sibling difference covariates
# Reside in same household as mother
for(x in 79:90) {
res_var <- paste0("Res", x)
momhh_var <- paste0("MomHH", x)
data <- data %>%
mutate(!!momhh_var := ifelse(.data[[res_var]] == 1, 1,
ifelse(!is.na(.data[[res_var]]), 0, NA)))
}
# Create Res_0to3 variable based on age
data <- data %>%
mutate(Res_0to3 = case_when(
Age2_Yr104 == 14 ~ MomHH90,
Age2_Yr104 == 15 ~ do.call(pmin, c(select(., MomHH89:MomHH90), na.rm = TRUE)),
Age2_Yr104 == 16 ~ do.call(pmin, c(select(., MomHH88:MomHH90), na.rm = TRUE)),
Age2_Yr104 == 17 ~ do.call(pmin, c(select(., MomHH87:MomHH90), na.rm = TRUE)),
Age2_Yr104 == 18 ~ do.call(pmin, c(select(., MomHH86:MomHH89), na.rm = TRUE)),
Age2_Yr104 == 19 ~ do.call(pmin, c(select(., MomHH85:MomHH88), na.rm = TRUE)),
Age2_Yr104 == 20 ~ do.call(pmin, c(select(., MomHH84:MomHH87), na.rm = TRUE)),
Age2_Yr104 == 21 ~ do.call(pmin, c(select(., MomHH83:MomHH86), na.rm = TRUE)),
Age2_Yr104 == 22 ~ do.call(pmin, c(select(., MomHH82:MomHH85), na.rm = TRUE)),
Age2_Yr104 == 23 ~ do.call(pmin, c(select(., MomHH81:MomHH84), na.rm = TRUE)),
Age2_Yr104 == 24 ~ do.call(pmin, c(select(., MomHH80:MomHH83), na.rm = TRUE)),
Age2_Yr104 == 25 ~ do.call(pmin, c(select(., MomHH79:MomHH82), na.rm = TRUE)),
Age2_Yr104 == 26 ~ do.call(pmin, c(select(., MomHH79:MomHH81), na.rm = TRUE)),
Age2_Yr104 == 27 ~ do.call(pmin, c(select(., MomHH79:MomHH80), na.rm = TRUE)),
Age2_Yr104 == 28 ~ MomHH79,
TRUE ~ NA_real_
))
# Remove temporary MomHH variables
data <- data %>% select(-matches("^MomHH"))
# Health conditions before age 5 - CORRECTED VERSION
health_conditions <- c("Brain", "Hyper", "Asthma", "Resp", "Speech", "Deaf", "Blind",
"Disturb", "Allergy", "Crippled", "Retard", "Heart", "Nerve",
"Ear", "Blood", "Epilepsy", "OtherLim")
# Process each health condition
for(condition in health_conditions) {
cat("Processing condition:", condition, "\n")
# Check which years exist for this condition
available_years <- c()
for(y in seq(86, 90, by = 2)) {
var_name <- paste0(condition, y)
if(var_name %in% names(data)) {
available_years <- c(available_years, y)
cat(" Found:", var_name, "\n")
}
}
if(length(available_years) == 0) {
cat(" No data available for", condition, "- skipping\n")
next # Skip if no data available
}
# Create temporary variables for available years
temp_vars <- c()
for(y in available_years) {
temp_var <- paste0("temp_", condition, y)
orig_var <- paste0(condition, y)
temp_vars <- c(temp_vars, temp_var)
data <- data %>%
mutate(!!temp_var := ifelse(!is.na(.data[[orig_var]]) & .data[[orig_var]] > 0, 1, NA))
}
# Create overall condition variable (ever had condition)
if(length(temp_vars) > 0) {
data <- data %>%
mutate(!!condition := do.call(pmax, c(select(., all_of(temp_vars)), na.rm = TRUE)))
}
# Create condition before age 5 variable
before_var <- paste0(condition, "_before")
# Start with the condition as FALSE
data <- data %>% mutate(!!before_var := 0)
# Update to 1 if condition occurred before age 5 in any available year
for(y in available_years) {
temp_var <- paste0("temp_", condition, y)
age_var <- paste0("Age2_Mo", y)
data <- data %>%
mutate(!!before_var := ifelse(
!is.na(.data[[temp_var]]) & .data[[temp_var]] == 1 &
!is.na(.data[[age_var]]) & .data[[age_var]] < 60,
1,
.data[[before_var]]
))
}
# Convert to NA where we don't have data
data <- data %>%
mutate(!!before_var := ifelse(
is.na(Res_0to3),
NA,
.data[[before_var]]
))
# Remove temporary variables
data <- data %>% select(-all_of(temp_vars))
}
## Processing condition: Brain
## Found: Brain86
## Found: Brain88
## Found: Brain90
## Processing condition: Hyper
## Found: Hyper86
## Found: Hyper88
## Found: Hyper90
## Processing condition: Asthma
## Found: Asthma86
## Found: Asthma88
## Found: Asthma90
## Processing condition: Resp
## Found: Resp86
## Found: Resp88
## Found: Resp90
## Processing condition: Speech
## Found: Speech86
## Found: Speech88
## Found: Speech90
## Processing condition: Deaf
## Found: Deaf86
## Found: Deaf88
## Found: Deaf90
## Processing condition: Blind
## Found: Blind86
## Found: Blind88
## Found: Blind90
## Processing condition: Disturb
## Found: Disturb86
## Found: Disturb88
## Found: Disturb90
## Processing condition: Allergy
## Found: Allergy86
## Found: Allergy88
## Found: Allergy90
## Processing condition: Crippled
## Found: Crippled86
## Found: Crippled88
## Found: Crippled90
## Processing condition: Retard
## Found: Retard86
## Found: Retard88
## Found: Retard90
## Processing condition: Heart
## Found: Heart86
## Found: Heart88
## Found: Heart90
## Processing condition: Nerve
## Found: Nerve86
## Found: Nerve88
## Found: Nerve90
## Processing condition: Ear
## Found: Ear88
## Found: Ear90
## Processing condition: Blood
## Found: Blood88
## Found: Blood90
## Processing condition: Epilepsy
## Found: Epilepsy88
## Found: Epilepsy90
## Processing condition: OtherLim
## Found: OtherLim86
## Found: OtherLim88
## Found: OtherLim90
# Create overall health condition variable
available_before_vars <- names(data)[grepl("_before$", names(data)) &
!names(data) %in% c("HealthCond_before")]
cat("Available _before variables for HealthCond_before:\n")
## Available _before variables for HealthCond_before:
print(available_before_vars)
## [1] "Brain_before" "Hyper_before" "Asthma_before" "Resp_before"
## [5] "Speech_before" "Deaf_before" "Blind_before" "Disturb_before"
## [9] "Allergy_before" "Crippled_before" "Retard_before" "Heart_before"
## [13] "Nerve_before" "Ear_before" "Blood_before" "Epilepsy_before"
## [17] "OtherLim_before"
if(length(available_before_vars) > 0) {
data <- data %>%
mutate(
HealthCond_before = do.call(pmax, c(select(., all_of(available_before_vars)), na.rm = TRUE)),
HealthCond_before = ifelse(is.na(HealthCond_before), 0, HealthCond_before),
HealthCond_before = ifelse(is.na(Res_0to3), NA, HealthCond_before)
)
} else {
data <- data %>%
mutate(HealthCond_before = NA_real_)
}
# Birth weight variables
data <- data %>%
mutate(
Low_BW = ifelse(BirthWeight < 88, 1, 0),
Low_BW = ifelse(is.na(BirthWeight), NA, Low_BW),
VLow_BW = ifelse(BirthWeight < 53, 1, 0),
VLow_BW = ifelse(is.na(BirthWeight), NA, VLow_BW)
)
# Attrition variable - CORRECTED VERSION
data <- data %>%
mutate(
# Initial definition: 0 if last interview was in 2004, 1 if earlier
Attrit = case_when(
YA_LastInterview == 2004 ~ 0,
!is.na(YA_LastInterview) & YA_LastInterview != 2004 ~ 1,
TRUE ~ NA_real_
),
# Count those who attrited before 2004 but after they turned 19 as being in the sample
Attrit = case_when(
# If attrited in 2002 but were 19+ by 2002 → NOT attrited
Attrit == 1 & YA_LastInterview == 2002 & Age2_Yr102 >= 19 ~ 0,
# If attrited in 2000 but were 19+ by 2000 → NOT attrited
Attrit == 1 & YA_LastInterview == 2000 & Age2_Yr100 >= 19 ~ 0,
# If attrited in 1994/1996/1998 but were 19+ by their last interview → NOT attrited
Attrit == 1 & YA_LastInterview == 1994 & Age2_Yr94 >= 19 ~ 0,
Attrit == 1 & YA_LastInterview == 1996 & Age2_Yr96 >= 19 ~ 0,
Attrit == 1 & YA_LastInterview == 1998 & Age2_Yr98 >= 19 ~ 0,
# Keep original value for all other cases
TRUE ~ Attrit
)
)
# Create estimation sample dummies
for(x in 1:3) {
sample_var <- paste0("Sample90_", x)
hs_fe_var <- paste0("HS", x, "_FE90")
data <- data %>%
mutate(
!!sample_var := ifelse(!is.na(.data[[hs_fe_var]]) &
SampleID != 12 &
Attrit == 0 &
Age2_Yr104 >= 19, 1, 0),
!!sample_var := ifelse(!is.na(.data[[hs_fe_var]]) &
Attrit == 0 &
.data[[sample_var]] != 1 &
DOB_Yr_Child == 1985 &
DOB_Mo_Child < 8, 1, .data[[sample_var]])
)
}
# Head Start timing variables
data <- data %>%
mutate(
Three = ifelse(Age_1stHS88 <= 3 | Age_1stHS90 <= 3, 1, 0),
NotThree = ifelse((Age_1stHS88 > 3 & !is.na(Age_1stHS88)) |
(Age_1stHS90 > 3 & !is.na(Age_1stHS90)), 1, 0),
HS_3 = case_when(
Three == 1 ~ 1,
NotThree == 1 ~ 2,
TRUE ~ 0
),
PreK_FE_3 = PreK_FE,
PreK_FE_3 = ifelse(PreK_FE_3 == 1 & HS_3 == 1, 0, PreK_FE_3)
)
# Create income variables for ages 0-3 and at age 3
# (Continuing with similar pattern for the remaining variables...)
# Due to length constraints, I'll show the pattern for a few more key variables:
# First Born and Gender
data <- data %>%
mutate(
FirstBorn = ifelse(BirthOrder == 1, 1, 0),
FirstBorn = ifelse(is.na(BirthOrder), NA, FirstBorn),
Male = ifelse(Sex_Child == 1, 1, 0)
)
# PPVT score at age 3 - CORRECTED VERSION
# First create the variable with 1986 data
data <- data %>%
mutate(
PPVTat3 = ifelse(PPVTAge86 >= 36 & PPVTAge86 < 47, PPVT_Raw86, NA_real_)
)
# Then replace with 1988 data if still missing
data <- data %>%
mutate(
PPVTat3 = ifelse(
PPVTAge88 >= 36 & PPVTAge88 < 47 & is.na(PPVTat3),
PPVT_Raw88,
PPVTat3
)
)
# Finally replace with 1990 data if still missing
data <- data %>%
mutate(
PPVTat3 = ifelse(
PPVTAge90 >= 36 & PPVTAge90 < 47 & is.na(PPVTat3),
PPVT_Raw90,
PPVTat3
)
)
# Create pretreatment index (simplified version)
covariates <- c("Res_0to3", "HealthCond_before", "logBW", "LogInc_0to3", "LogIncAt3",
"FirstBorn", "Male", "Age2_Yr104", "HOME_Pct_0to3", "Moth_HrsWorked_BefBirth",
"Moth_HrsWorked_0to1", "Father_HH_0to3", "GMom_0to3", "MomCare", "RelCare",
"NonRelCare", "Moth_Smoke_BefBirth", "Alc_BefBirth", "Breastfed", "Doctor_0to3",
"Dentist_0to3", "Moth_WeightChange", "Illness_1stYr", "Premature",
"Insurance_0to3", "Medicaid_0to3")
# Standardize covariates
for(covar in covariates) {
if(covar %in% names(data)) {
std_var <- paste0(covar, "_CV")
data <- data %>%
mutate(!!std_var := as.numeric(scale(.data[[covar]])))
}
}
# Reverse sign for specific variables
reverse_vars <- c("HealthCond_before_CV", "Male_CV", "Age2_Yr104_CV", "GMom_0to3_CV",
"MomCare_CV", "RelCare_CV", "Moth_Smoke_BefBirth_CV", "Alc_BefBirth_CV",
"Illness_1stYr_CV", "Premature_CV", "Medicaid_0to3_CV")
for(var in reverse_vars) {
if(var %in% names(data)) {
data[[var]] <- -data[[var]]
}
}
# Create pretreatment index
cv_vars <- names(data)[grepl("_CV$", names(data))]
data <- data %>%
mutate(
temp = rowMeans(select(., all_of(cv_vars)), na.rm = TRUE),
PreTreatIndex = as.numeric(scale(temp))
) %>%
select(-temp, -all_of(cv_vars))
############Show Table 1 ###########
# CORRECTED TABLE 1 GENERATION - INCLUDING ALL FE GROUPS
library(tidyverse)
# Enhanced function to calculate stats with small sample handling
calculate_stats_robust <- function(data_filter, group_var, stat_vars) {
# Get the group variable name as string
group_var_name <- as.character(substitute(group_var))
result <- data_filter %>%
group_by({{group_var}}) %>%
summarise(
across(all_of(stat_vars),
list(
mean = ~ifelse(sum(!is.na(.)) >= 1, mean(., na.rm = TRUE), NA_real_),
sd = ~ifelse(sum(!is.na(.)) >= 2, sd(., na.rm = TRUE), NA_real_),
n = ~sum(!is.na(.))
),
.names = "{.col}_{.fn}"),
.groups = 'drop'
)
# Ensure we have all three groups (1, 2, 3) even if empty
all_groups <- data.frame(temp_group = 1:3)
colnames(all_groups) <- group_var_name
result <- all_groups %>%
left_join(result, by = group_var_name)
return(result)
}
# Define variables for Table 1
table1_vars <- c("PermInc", "MomDropout", "MomSomeColl", "AgeAFQT_std", "HighGrade_GMom79")
# Calculate statistics for each group with robust handling
# White/Hispanic - Full sample
white_full <- data %>%
filter(NonBlack == 1, !is.na(PreK)) %>%
calculate_stats_robust(PreK, table1_vars)
# White/Hispanic - FE sample (using PreK_FE variable)
white_fe <- data %>%
filter(NonBlack == 1, !is.na(PreK_FE)) %>%
calculate_stats_robust(PreK_FE, table1_vars)
# Black - Full sample
black_full <- data %>%
filter(Black == 1, !is.na(PreK)) %>%
calculate_stats_robust(PreK, table1_vars)
# Black - FE sample (using PreK_FE variable)
black_fe <- data %>%
filter(Black == 1, !is.na(PreK_FE)) %>%
calculate_stats_robust(PreK_FE, table1_vars)
# Print sample sizes for debugging
cat("White/Hispanic FE sample sizes:\n")
## White/Hispanic FE sample sizes:
print(white_fe %>% select(PreK_FE, ends_with("_n")))
## PreK_FE PermInc_n MomDropout_n MomSomeColl_n AgeAFQT_std_n HighGrade_GMom79_n
## 1 1 32 32 32 26 32
## 2 2 NA NA NA NA NA
## 3 3 NA NA NA NA NA
cat("\nBlack FE sample sizes:\n")
##
## Black FE sample sizes:
print(black_fe %>% select(PreK_FE, ends_with("_n")))
## PreK_FE PermInc_n MomDropout_n MomSomeColl_n AgeAFQT_std_n HighGrade_GMom79_n
## 1 1 20 20 20 17 17
## 2 2 NA NA NA NA NA
## 3 3 NA NA NA NA NA
# Check if we have FE data for all groups
cat("\nChecking FE data availability:\n")
##
## Checking FE data availability:
cat("White/Hispanic FE - HS group has data:", !is.na(white_fe$PermInc_mean[1]), "n =", white_fe$PermInc_n[1], "\n")
## White/Hispanic FE - HS group has data: TRUE n = 32
cat("White/Hispanic FE - Pre group has data:", !is.na(white_fe$PermInc_mean[2]), "n =", white_fe$PermInc_n[2], "\n")
## White/Hispanic FE - Pre group has data: FALSE n = NA
cat("White/Hispanic FE - None group has data:", !is.na(white_fe$PermInc_mean[3]), "n =", white_fe$PermInc_n[3], "\n")
## White/Hispanic FE - None group has data: FALSE n = NA
cat("Black FE - HS group has data:", !is.na(black_fe$PermInc_mean[1]), "n =", black_fe$PermInc_n[1], "\n")
## Black FE - HS group has data: TRUE n = 20
cat("Black FE - Pre group has data:", !is.na(black_fe$PermInc_mean[2]), "n =", black_fe$PermInc_n[2], "\n")
## Black FE - Pre group has data: FALSE n = NA
cat("Black FE - None group has data:", !is.na(black_fe$PermInc_mean[3]), "n =", black_fe$PermInc_n[3], "\n")
## Black FE - None group has data: FALSE n = NA
# If FE data is missing for some groups, let's check what's in the actual data
cat("\nChecking actual FE data distribution:\n")
##
## Checking actual FE data distribution:
fe_white_dist <- data %>%
filter(NonBlack == 1, !is.na(PreK_FE)) %>%
count(PreK_FE)
cat("White/Hispanic FE distribution:\n")
## White/Hispanic FE distribution:
print(fe_white_dist)
## # A tibble: 1 × 2
## PreK_FE n
## <dbl> <int>
## 1 1 32
fe_black_dist <- data %>%
filter(Black == 1, !is.na(PreK_FE)) %>%
count(PreK_FE)
cat("Black FE distribution:\n")
## Black FE distribution:
print(fe_black_dist)
## # A tibble: 1 × 2
## PreK_FE n
## <dbl> <int>
## 1 1 20
# Create table data frame
table_data <- data.frame(
Variable = character(),
HS_White = character(),
Pre_White = character(),
None_White = character(),
HS_Black = character(),
Pre_Black = character(),
None_Black = character(),
Diff_White = character(),
Diff_Black = character(),
stringsAsFactors = FALSE
)
# Enhanced helper function with complete FE data
add_table_row <- function(table_df, var_name, var_column, white_full_data, white_fe_data, black_full_data, black_fe_data, is_money = FALSE, is_percent = FALSE, is_test_score = FALSE) {
format_cell <- function(mean_val, sd_val, n_val) {
if (is.na(mean_val) | is.na(n_val) | n_val == 0) {
return("NA [NA]")
} else if (is_money) {
return(sprintf("%.0f [%.0f]", mean_val, ifelse(is.na(sd_val), 0, sd_val)))
} else if (is_percent) {
return(sprintf("%.2f [%.2f]", mean_val, ifelse(is.na(sd_val), 0, sd_val)))
} else if (is_test_score) {
return(sprintf("%.1f [%.1f]", mean_val, ifelse(is.na(sd_val), 0, sd_val)))
} else {
return(sprintf("%.1f [%.1f]", mean_val, ifelse(is.na(sd_val), 0, sd_val)))
}
}
# Full sample row - uses full sample data
full_row <- data.frame(
Variable = var_name,
HS_White = format_cell(white_full_data[[paste0(var_column, "_mean")]][1],
white_full_data[[paste0(var_column, "_sd")]][1],
white_full_data[[paste0(var_column, "_n")]][1]),
Pre_White = format_cell(white_full_data[[paste0(var_column, "_mean")]][2],
white_full_data[[paste0(var_column, "_sd")]][2],
white_full_data[[paste0(var_column, "_n")]][2]),
None_White = format_cell(white_full_data[[paste0(var_column, "_mean")]][3],
white_full_data[[paste0(var_column, "_sd")]][3],
white_full_data[[paste0(var_column, "_n")]][3]),
HS_Black = format_cell(black_full_data[[paste0(var_column, "_mean")]][1],
black_full_data[[paste0(var_column, "_sd")]][1],
black_full_data[[paste0(var_column, "_n")]][1]),
Pre_Black = format_cell(black_full_data[[paste0(var_column, "_mean")]][2],
black_full_data[[paste0(var_column, "_sd")]][2],
black_full_data[[paste0(var_column, "_n")]][2]),
None_Black = format_cell(black_full_data[[paste0(var_column, "_mean")]][3],
black_full_data[[paste0(var_column, "_sd")]][3],
black_full_data[[paste0(var_column, "_n")]][3]),
Diff_White = "—",
Diff_Black = "—",
stringsAsFactors = FALSE
)
# Fixed effects row - uses FE sample data for ALL groups
# This is the key correction: we need to ensure FE data exists for Pre and None groups
fe_row <- data.frame(
Variable = paste0(var_name, " (FE)"),
HS_White = format_cell(white_fe_data[[paste0(var_column, "_mean")]][1],
white_fe_data[[paste0(var_column, "_sd")]][1],
white_fe_data[[paste0(var_column, "_n")]][1]),
Pre_White = format_cell(white_fe_data[[paste0(var_column, "_mean")]][2],
white_fe_data[[paste0(var_column, "_sd")]][2],
white_fe_data[[paste0(var_column, "_n")]][2]),
None_White = format_cell(white_fe_data[[paste0(var_column, "_mean")]][3],
white_fe_data[[paste0(var_column, "_sd")]][3],
white_fe_data[[paste0(var_column, "_n")]][3]),
HS_Black = format_cell(black_fe_data[[paste0(var_column, "_mean")]][1],
black_fe_data[[paste0(var_column, "_sd")]][1],
black_fe_data[[paste0(var_column, "_n")]][1]),
Pre_Black = format_cell(black_fe_data[[paste0(var_column, "_mean")]][2],
black_fe_data[[paste0(var_column, "_sd")]][2],
black_fe_data[[paste0(var_column, "_n")]][2]),
None_Black = format_cell(black_fe_data[[paste0(var_column, "_mean")]][3],
black_fe_data[[paste0(var_column, "_sd")]][3],
black_fe_data[[paste0(var_column, "_n")]][3]),
Diff_White = "—",
Diff_Black = "—",
stringsAsFactors = FALSE
)
bind_rows(table_df, full_row, fe_row)
}
# Build the table
table_data <- add_table_row(table_data,
"Permanent income", "PermInc",
white_full, white_fe, black_full, black_fe,
is_money = TRUE)
table_data <- add_table_row(table_data,
"Mother < high school", "MomDropout",
white_full, white_fe, black_full, black_fe,
is_percent = TRUE)
table_data <- add_table_row(table_data,
"Mother some college", "MomSomeColl",
white_full, white_fe, black_full, black_fe,
is_percent = TRUE)
table_data <- add_table_row(table_data,
"Maternal AFQT", "AgeAFQT_std",
white_full, white_fe, black_full, black_fe,
is_test_score = TRUE)
table_data <- add_table_row(table_data,
"Grandmother's education", "HighGrade_GMom79",
white_full, white_fe, black_full, black_fe)
# Add sample sizes with proper formatting
sample_sizes <- data.frame(
Variable = "Sample size",
HS_White = as.character(white_full$PermInc_n[1]),
Pre_White = as.character(white_full$PermInc_n[2]),
None_White = as.character(white_full$PermInc_n[3]),
HS_Black = as.character(black_full$PermInc_n[1]),
Pre_Black = as.character(black_full$PermInc_n[2]),
None_Black = as.character(black_full$PermInc_n[3]),
Diff_White = "",
Diff_Black = "",
stringsAsFactors = FALSE
)
sample_sizes_fe <- data.frame(
Variable = "Sample size — FE",
HS_White = ifelse(is.na(white_fe$PermInc_n[1]) | white_fe$PermInc_n[1] == 0,
"NA", as.character(white_fe$PermInc_n[1])),
Pre_White = ifelse(is.na(white_fe$PermInc_n[2]) | white_fe$PermInc_n[2] == 0,
"NA", as.character(white_fe$PermInc_n[2])),
None_White = ifelse(is.na(white_fe$PermInc_n[3]) | white_fe$PermInc_n[3] == 0,
"NA", as.character(white_fe$PermInc_n[3])),
HS_Black = ifelse(is.na(black_fe$PermInc_n[1]) | black_fe$PermInc_n[1] == 0,
"NA", as.character(black_fe$PermInc_n[1])),
Pre_Black = ifelse(is.na(black_fe$PermInc_n[2]) | black_fe$PermInc_n[2] == 0,
"NA", as.character(black_fe$PermInc_n[2])),
None_Black = ifelse(is.na(black_fe$PermInc_n[3]) | black_fe$PermInc_n[3] == 0,
"NA", as.character(black_fe$PermInc_n[3])),
Diff_White = "",
Diff_Black = "",
stringsAsFactors = FALSE
)
table_data <- bind_rows(table_data, sample_sizes, sample_sizes_fe)
# Enhanced difference calculation with proper FE data
calculate_simple_diff <- function(data, var) {
hs_mean <- data[[paste0(var, "_mean")]][1]
none_mean <- data[[paste0(var, "_mean")]][3]
hs_sd <- data[[paste0(var, "_sd")]][1]
none_sd <- data[[paste0(var, "_sd")]][3]
hs_n <- data[[paste0(var, "_n")]][1]
none_n <- data[[paste0(var, "_n")]][3]
# Only calculate if we have valid data
if (is.na(hs_mean) | is.na(none_mean) | is.na(hs_n) | is.na(none_n) |
hs_n == 0 | none_n == 0 | is.na(hs_sd) | is.na(none_sd)) {
return(NA)
}
pooled_sd <- sqrt((hs_sd^2 + none_sd^2) / 2)
(hs_mean - none_mean) / pooled_sd
}
# Add differences to the table
for (i in 1:nrow(table_data)) {
if (table_data$Variable[i] == "Permanent income") {
# Full sample differences
diff_white <- calculate_simple_diff(white_full, "PermInc")
diff_black <- calculate_simple_diff(black_full, "PermInc")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Permanent income (FE)") {
# FE sample differences
diff_white <- calculate_simple_diff(white_fe, "PermInc")
diff_black <- calculate_simple_diff(black_fe, "PermInc")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Mother < high school") {
diff_white <- calculate_simple_diff(white_full, "MomDropout")
diff_black <- calculate_simple_diff(black_full, "MomDropout")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Mother < high school (FE)") {
diff_white <- calculate_simple_diff(white_fe, "MomDropout")
diff_black <- calculate_simple_diff(black_fe, "MomDropout")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Mother some college") {
diff_white <- calculate_simple_diff(white_full, "MomSomeColl")
diff_black <- calculate_simple_diff(black_full, "MomSomeColl")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Mother some college (FE)") {
diff_white <- calculate_simple_diff(white_fe, "MomSomeColl")
diff_black <- calculate_simple_diff(black_fe, "MomSomeColl")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Maternal AFQT") {
diff_white <- calculate_simple_diff(white_full, "AgeAFQT_std")
diff_black <- calculate_simple_diff(black_full, "AgeAFQT_std")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Maternal AFQT (FE)") {
diff_white <- calculate_simple_diff(white_fe, "AgeAFQT_std")
diff_black <- calculate_simple_diff(black_fe, "AgeAFQT_std")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Grandmother's education") {
diff_white <- calculate_simple_diff(white_full, "HighGrade_GMom79")
diff_black <- calculate_simple_diff(black_full, "HighGrade_GMom79")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
else if (table_data$Variable[i] == "Grandmother's education (FE)") {
diff_white <- calculate_simple_diff(white_fe, "HighGrade_GMom79")
diff_black <- calculate_simple_diff(black_fe, "HighGrade_GMom79")
table_data$Diff_White[i] <- ifelse(is.na(diff_white), "—", sprintf("%.2f", diff_white))
table_data$Diff_Black[i] <- ifelse(is.na(diff_black), "—", sprintf("%.2f", diff_black))
}
}
# Save results
write_csv(table_data, "table1_complete.csv")
# Print final table
cat("\nFINAL TABLE 1:\n")
##
## FINAL TABLE 1:
print(table_data, row.names = FALSE)
## Variable HS_White Pre_White None_White
## Permanent income 27633 [20142] 55591 [46623] 36743 [26025]
## Permanent income (FE) 51918 [25108] NA [NA] NA [NA]
## Mother < high school 0.47 [0.50] 0.17 [0.37] 0.43 [0.49]
## Mother < high school (FE) 0.19 [0.40] NA [NA] NA [NA]
## Mother some college 0.22 [0.42] 0.44 [0.50] 0.23 [0.42]
## Mother some college (FE) 0.28 [0.46] NA [NA] NA [NA]
## Maternal AFQT -0.4 [0.7] 0.3 [0.9] -0.2 [0.9]
## Maternal AFQT (FE) 0.2 [0.9] NA [NA] NA [NA]
## Grandmother's education 8.6 [3.5] 10.9 [3.0] 9.4 [3.4]
## Grandmother's education (FE) 10.2 [3.7] NA [NA] NA [NA]
## Sample size 525 1361 1882
## Sample size — FE 32 NA NA
## HS_Black Pre_Black None_Black Diff_White Diff_Black
## 24879 [18124] 35194 [24322] 25893 [22217] -0.39 -0.05
## 35261 [20314] NA [NA] NA [NA] — —
## 0.31 [0.46] 0.16 [0.37] 0.40 [0.49] 0.10 -0.20
## 0.15 [0.37] NA [NA] NA [NA] — —
## 0.33 [0.47] 0.51 [0.50] 0.28 [0.45] -0.02 0.12
## 0.45 [0.51] NA [NA] NA [NA] — —
## -0.7 [0.5] -0.4 [0.7] -0.7 [0.6] -0.28 -0.04
## -0.3 [1.0] NA [NA] NA [NA] — —
## 9.9 [2.4] 11.0 [2.6] 9.9 [2.7] -0.22 -0.01
## 9.4 [3.4] NA [NA] NA [NA] — —
## 583 435 759
## 20 NA NA
cat("\nTable generation complete with ALL FE groups included!\n")
##
## Table generation complete with ALL FE groups included!