A peak of the final dataframe is at the end, the data manipulation gets pretty complicated.
geography <- "public use microdata area"
all_vars <- load_variables(2022, "acs5") # Plain text labels associated with variable codes
education_data <- get_acs(
geography = geography,
table = "B15001",
year = 2022,
survey = "acs5",
cache_table = TRUE
)
birth_data <- get_acs(
geography = geography,
table = "B13016",
year = 2022,
survey = "acs5",
cache_table = TRUE
)
# Function to make labels readable
convert_label_to_varname <- function(label) {
label |>
str_replace_all("^(Estimate!!|Total:!!)", "") |> # Remove "Estimate!!" and "Total:!!" prefixes
str_replace_all("!!", " ") |> # Replace remaining "!!" with a single space
str_to_lower() |> # Convert to lowercase
str_replace_all("[^a-z0-9]+", "_") |> # Replace special characters and spaces with underscores
str_replace_all("^_|_$", "") # Remove leading or trailing underscores
}
# Fetch the labels for the datasets
education_labels <- all_vars |>
filter(str_detect(name, "B15001")) |>
dplyr::rename(variable = name) # Rename function was mapping to plyr by default
birth_labels <- all_vars |>
filter(str_detect(name, "B13016")) |>
dplyr::rename(variable = name)
# Label the datasets
labeled_education_data <- education_data |>
left_join(education_labels, by = "variable") |> # Join to get labels into education data
sapply(convert_label_to_varname) |> # Convert labels to an easier to understand format
as_tibble() |>
filter(str_detect(label, "female")) |> # Get rid of male data
select(GEOID, NAME, estimate, moe, label) # Remove unneeded columns
labeled_education_data %<>% mutate(estimate = as.integer(estimate), moe = as.integer(moe)) # Fix column typing
labeled_birth_data <- birth_labels |>
left_join(birth_data, by = "variable") |> # Join to get labels into education data
sapply(convert_label_to_varname) |> # Convert labels to an easier to understand format
as_tibble() |>
arrange(GEOID) |> # Reordering for readability when developing
select(GEOID, NAME, label, estimate, moe) # Keep the required fields
labeled_birth_data %<>% mutate(estimate = as.integer(estimate)) # Fix column typing
# Summarize education data, previously split by age groups
summarised_ed <- labeled_education_data |>
mutate(prelabel = str_remove(label, r"(\w*_\w*_\d*_\w*_\d*_years_)")) |> # Removes age range from labels
mutate(relabel = str_remove(prelabel, r"(\w*_\w*_65_years_\w*_over_)")) |> # 65 and over has a slightly different pattern
select(!c(label, prelabel)) |>
group_by(GEOID, NAME, relabel) |> # relabel is of the form "less_than_9th_grade"
summarise(pop_estimate = sum(estimate)) # collapse age ranges to education aggregates
## `summarise()` has grouped output by 'GEOID', 'NAME'. You can override using the
## `.groups` argument.
# Pivot wider education data, now one row per PUMA
tidied_ed <- summarised_ed |>
pivot_wider(names_from = relabel, values_from = pop_estimate) |>
select(!starts_with("total_female_")) # drop the population totals by age
# Truncating birth data labels to remove overly long values
relabeled_birth_data <- labeled_birth_data |> # Label trimming is different for this data
mutate(trimmed_label = case_when(
str_detect(label, "total_women_who_had_a_birth_in_the_past_12_months_") ~ paste0("age_", str_extract(label, r"(\d*_to_\d*)"), "_birth"),
str_detect(label, "total_women_who_did_not_have_a_birth_in_the_past_12_months_") ~ paste0("age_", str_extract(label, r"(\d*_to_\d*)"), "_none"),
.default = label
)) |>
select(GEOID, NAME, trimmed_label, estimate)
# Pivoting birth data, now one row per PUMA
detailed_birth_data <- relabeled_birth_data |>
pivot_wider(names_from = trimmed_label, values_from = estimate)
# Removing granular columns and calculating birth rate
births_per_thousand <- detailed_birth_data |>
mutate(births_per_thousand = total_women_who_had_a_birth_in_the_past_12_months / total * 1000) |>
select(GEOID, NAME, births_per_thousand)
# Join education and fertility data
fertility_and_education <- tidied_ed |>
left_join(births_per_thousand, by = "GEOID")
# Converting population of education achievement levels to proportions. This is the final dataframe for analysis
prop_fertility_and_education <- fertility_and_education |>
mutate(
prop_less_ninth = less_than_9th_grade / total_female,
prop_no_diploma = `9th_to_12th_grade_no_diploma` / total_female,
prop_diploma_or_equivalent = high_school_graduate_includes_equivalency / total_female,
prop_some_college = some_college_no_degree / total_female,
prop_associate_degree = associate_s_degree / total_female,
prop_bachelor_degree = bachelor_s_degree / total_female,
prop_advanced_degree = graduate_or_professional_degree / total_female
) |>
select(GEOID, NAME.x, prop_less_ninth, prop_no_diploma, prop_diploma_or_equivalent, prop_some_college, prop_associate_degree, prop_bachelor_degree, prop_advanced_degree, births_per_thousand)
prop_fertility_and_education |>
head(10) |>
kbl() |>
kable_material(c("striped"))
| GEOID | NAME.x | prop_less_ninth | prop_no_diploma | prop_diploma_or_equivalent | prop_some_college | prop_associate_degree | prop_bachelor_degree | prop_advanced_degree | births_per_thousand |
|---|---|---|---|---|---|---|---|---|---|
| 0100100 | lauderdale_colbert_franklin_counties_puma_alabama | 0.0313829 | 0.0830847 | 0.3394154 | 0.2447783 | 0.0894880 | 0.1354070 | 0.0764437 | 47.03931 |
| 0100200 | limestone_county_puma_alabama | 0.0341747 | 0.0816135 | 0.2531057 | 0.2344717 | 0.1143529 | 0.1812918 | 0.1009899 | 38.29303 |
| 0100300 | morgan_lawrence_counties_decatur_city_puma_alabama | 0.0450093 | 0.0954748 | 0.3419250 | 0.2191532 | 0.0916538 | 0.1349955 | 0.0717882 | 54.07314 |
| 0100401 | madison_county_north_east_huntsville_city_east_puma_alabama | 0.0222519 | 0.0418584 | 0.2557741 | 0.2203623 | 0.0882072 | 0.2403023 | 0.1312437 | 49.73894 |
| 0100402 | huntsville_north_far_west_madison_east_triana_cities_puma_alabama | 0.0163848 | 0.0373943 | 0.1927246 | 0.2424989 | 0.0848961 | 0.2678628 | 0.1582385 | 47.86090 |
| 0100403 | huntsville_city_central_south_puma_alabama | 0.0222660 | 0.0621319 | 0.1977666 | 0.2497735 | 0.0774214 | 0.2551871 | 0.1354535 | 52.11753 |
| 0100501 | marshall_madison_far_southeast_counties_puma_alabama | 0.0479746 | 0.0816117 | 0.3214848 | 0.2194284 | 0.1230736 | 0.1309223 | 0.0755046 | 61.76211 |
| 0100600 | dekalb_jackson_counties_puma_alabama | 0.0689824 | 0.1166813 | 0.3241602 | 0.2079287 | 0.1337123 | 0.0908589 | 0.0576762 | 62.72640 |
| 0100700 | etowah_cherokee_counties_puma_alabama | 0.0393950 | 0.0876117 | 0.3239437 | 0.2519120 | 0.1212896 | 0.1053877 | 0.0704604 | 56.64153 |
| 0100800 | calhoun_county_puma_alabama | 0.0412083 | 0.0832516 | 0.3355113 | 0.2640336 | 0.0801620 | 0.1195332 | 0.0763000 | 56.51008 |
Does the proportional education level of the female population of a community impact the birth rate in the community?
This question interests me as there is an increasing focus on female fertility rates in the United States coming from Conservative political groups. The perception among these groups is that women who go to college have measurably fewer children which, in their view, is detrimental to society. Setting aside the inflammatory rhetoric, I want to explore whether the basic premise of these arguments, that women with more education have fewer children, is accurate. This is a question that is not unexplored, and there have been studies done by the CDC and NIH on hospital birth data. These studies perform a fair amount of extrapolation to calculate fertility rates, and I would like to take a broader, simpler approach. Using ACS data, we can determine the total female population, number of births, and the proportional educational achievement of the female population. If the premise of the Conservative argument is true, we would expect a higher proportional education level to be a strong negative predictor of the number of births in an area.
The cases are Public Use Microdata Areas (PUMAs) from the 2022 American Community Survey 5 year estimate. A PUMA is defined by the Census Bureau and are “non-overlapping, statistical geographic areas that partition each state or equivalent entity into geographic areas containing no fewer than 100,000 people each” which allows for better granular data comparisons than county level data.
United States Census Bureau (2024, February 18) Public Use Microdata Areas (PUMAs). https://www.census.gov/programs-surveys/geography/guidance/geo-areas/pumas.html#reference
The American Community Survey is an ongoing survey conducted by the Census Bureau for use by State and Federal agencies. It tracks more detailed and up to date information on Americans and American households than the Decennial Census. More detailed information on the methodology of the survey can be found on the Census Bureau website: https://www.census.gov/programs-surveys/acs/methodology/design-and-methodology.html
This is an observational study.
The United States Census Bureau API, information on which can be found at: https://www.census.gov/data/developers/guidance/api-user-guide.html
The independent variables will be the proportion of the female population in a PUMA with a given education level. The levels are: less than 9th grade, 9th-12th with no diploma, high school diploma or equivalent, some college without a degree, associate’s degree, bachelor’s degree, and graduate or professional degree. The proportion of the female population with a given education level is used to account for variation in the population of PUMAs. The dependent variable is the number of births per thousand women in the PUMA.
The first two plots are just to demonstrate that we have a fairly normal distribution of data in terms of the female population of the PUMAs as well as the birth rate, with a few notable outliers.
pop_dist_plot <- labeled_education_data |>
filter(label == "total_female") |>
ggplot(aes(x = estimate)) +
geom_histogram() +
labs(title = "Distribution of Female Population in Census PUMAs", x = "Female Population Estimate")
pop_dist_plot
birth_dist_plot <- births_per_thousand |>
ggplot(aes(x = births_per_thousand)) +
geom_histogram() +
labs(title = "Distribution of Birth Rate in Census PUMAs", x = "Births per Thousand Women")
birth_dist_plot
The data on the highest attained education level has a much more scattered distribution. Most notably the “< 9th Grade” and “Graduate Degree or Higher” groups, as well as “9th-12th” to a lesser extent, have long tails likely indicating areas with high poverty rates or near major academic institutions respectively. Applying a transformation will bring the outliers in and improve our analysis.
female_pop_totals <- summarised_ed |>
filter(relabel == "total_female") |>
pivot_wider(names_from = relabel, values_from = pop_estimate)
percent_education <- summarised_ed |>
left_join(female_pop_totals, by = "GEOID") |>
filter(!str_detect(relabel, "total_female")) |>
mutate(pop_percent = pop_estimate / total_female * 100) |>
select(GEOID, relabel, pop_percent)
education_order <- c("graduate_or_professional_degree", "bachelor_s_degree", "associate_s_degree", "some_college_no_degree", "high_school_graduate_includes_equivalency", "9th_to_12th_grade_no_diploma", "less_than_9th_grade")
massaged_ed <- percent_education |>
mutate(ed_factor = factor(relabel, levels = education_order)) |>
mutate(renamed = fct_recode(ed_factor, "< 9th Grade" = "less_than_9th_grade", "9th-12th" = "9th_to_12th_grade_no_diploma", "HS diploma or equivalent" = "high_school_graduate_includes_equivalency", "Some College" = "some_college_no_degree", "Associate's Degree" = "associate_s_degree", "Bachelor's Degree" = "bachelor_s_degree", "Graduate Degree or Higher" = "graduate_or_professional_degree")) |>
select(GEOID, renamed, pop_percent)
ed_dist_plot <- massaged_ed |>
ggplot(aes(x = renamed, y = pop_percent)) +
geom_boxplot() +
labs(title = "Education Distribution of the Female Population of PUMAs", x = "Highest Level of Education Attained",
y = "Percent of Female Population") +
coord_flip()
ed_dist_plot
The summary statistics provide some numbers to pair with the visualizations, giving some context on the size of the largest outliers.
summary(prop_fertility_and_education |> ungroup() |> select(!c(GEOID, NAME.x)))
## prop_less_ninth prop_no_diploma prop_diploma_or_equivalent
## Min. :0.003256 Min. :0.005673 Min. :0.04423
## 1st Qu.:0.019773 1st Qu.:0.038805 1st Qu.:0.20811
## Median :0.030026 Median :0.055284 Median :0.26314
## Mean :0.043299 Mean :0.060066 Mean :0.25803
## 3rd Qu.:0.052237 3rd Qu.:0.076960 3rd Qu.:0.30970
## Max. :0.334876 Max. :0.193413 Max. :0.46100
## prop_some_college prop_associate_degree prop_bachelor_degree
## Min. :0.06949 Min. :0.01559 Min. :0.04673
## 1st Qu.:0.19204 1st Qu.:0.07650 1st Qu.:0.14743
## Median :0.22355 Median :0.09243 Median :0.19234
## Mean :0.21917 Mean :0.09260 Mean :0.20389
## 3rd Qu.:0.24946 3rd Qu.:0.10898 3rd Qu.:0.25403
## Max. :0.46620 Max. :0.17578 Max. :0.48135
## prop_advanced_degree births_per_thousand
## Min. :0.01422 Min. : 9.23
## 1st Qu.:0.07391 1st Qu.: 44.03
## Median :0.10650 Median : 51.49
## Mean :0.12295 Mean : 51.68
## 3rd Qu.:0.15566 3rd Qu.: 59.24
## Max. :0.50929 Max. :172.59
Using the caret package we can easily preprocess our data and apply a Box-Cox transformation. Modeling our data we can see that many of the educational levels are statistically significant predictors, although attending some college or achieving a bachelors degree are markedly poor predictors. It should be noted the \(R^2\) of this model is very low, indicating that while education level of the female population is a statistically significant predictor it is not a very strong predictor.
control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
proportional_model <- train(births_per_thousand ~ prop_less_ninth + prop_no_diploma + prop_diploma_or_equivalent + prop_some_college + prop_associate_degree + prop_bachelor_degree + prop_advanced_degree,
data = prop_fertility_and_education,
method = "lm",
preProcess = c("BoxCox"),
trControl = control)
summary(proportional_model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.664 -6.956 -0.368 6.714 121.920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.8412 17.8951 4.126 3.81e-05 ***
## prop_less_ninth -1.1071 0.5660 -1.956 0.050586 .
## prop_no_diploma 9.3146 2.7317 3.410 0.000661 ***
## prop_diploma_or_equivalent 22.6356 9.4206 2.403 0.016344 *
## prop_some_college 7.9614 11.5357 0.690 0.490160
## prop_associate_degree 36.4204 11.3007 3.223 0.001286 **
## prop_bachelor_degree 0.7213 2.5968 0.278 0.781211
## prop_advanced_degree -4.5143 1.0950 -4.123 3.87e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.23 on 2478 degrees of freedom
## Multiple R-squared: 0.1599, Adjusted R-squared: 0.1575
## F-statistic: 67.35 on 7 and 2478 DF, p-value: < 2.2e-16
When we combine the educational categories into “milestones reached” groups, i.e. those who have a high school diploma equivalent or less, those who have some college up to a bachelor’s, and those with an advanced degree we find that these broader groups retain their significance which is to be expected. The model has only a slightly lower \(R^2\) than before, indicating that the added granularity of the non-milestone data does not contain significant predictive power. It should be noted that the only predictor that negatively affects birth rate is the proportion of the population with an advanced degree. This would run counter to the original supposition that a college education is a strong negative predictor of birth rates, as education level is not a strong predictor of fertility and only an advanced degree is a negative predictor of fertility.
# Combining categories so that we have three "milestone" categories
condensed_ed <- prop_fertility_and_education |>
mutate(hs_diploma_or_less = prop_less_ninth + prop_no_diploma + prop_diploma_or_equivalent,
up_to_bachelors = prop_some_college + prop_associate_degree + prop_bachelor_degree)
condensed_model <- train(births_per_thousand ~ hs_diploma_or_less + up_to_bachelors + prop_advanced_degree,
data = condensed_ed,
method = "lm",
preProcess = c("BoxCox"),
trControl = control)
summary(condensed_model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.945 -7.064 -0.312 6.789 120.933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.084 6.465 7.437 1.41e-13 ***
## hs_diploma_or_less 32.481 11.383 2.854 0.00436 **
## up_to_bachelors 48.004 19.470 2.466 0.01375 *
## prop_advanced_degree -4.414 1.591 -2.774 0.00558 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.32 on 2482 degrees of freedom
## Multiple R-squared: 0.1444, Adjusted R-squared: 0.1434
## F-statistic: 139.6 on 3 and 2482 DF, p-value: < 2.2e-16
These high level statistics are interesting, but the analysis could be extended to more focussed impacts. The PUMAs never cross state lines, so the data could be aggregated at the state level to see if impacts are stronger in, for example, states that voted red or blue in the most recent election. Another interesting possibility would be to explore whether the educational proportions are strong predictors of birth rates in different age groups. This could answer questions such as whether higher relative education rates lower the teen pregnancy rate or more broadly if they are strong predictors of birth rates in different age brackets. CDC and NIH datasets track every birth in the United States and include the education level of the mother. While not available for privacy reasons, it would be interesting to map the hospital level data to the PUMAs to see if the proportional education level of an area has an impact on the birth rates of women in a given education group.