DATA 606 Data Project Proposal

Data Preparation

A peak of the final dataframe is at the end, the data manipulation gets pretty complicated.

geography <- "public use microdata area"

all_vars <- load_variables(2022, "acs5") # Plain text labels associated with variable codes

education_data <- get_acs( 
  geography = geography,
  table = "B15001",
  year = 2022,
  survey = "acs5",
  cache_table = TRUE
)

birth_data <- get_acs(
  geography = geography,
  table = "B13016",
  year = 2022,
  survey = "acs5",
  cache_table = TRUE
)

# Function to make labels readable
convert_label_to_varname <- function(label) {
  label |>
    str_replace_all("^(Estimate!!|Total:!!)", "") |> # Remove "Estimate!!" and "Total:!!" prefixes
    str_replace_all("!!", " ") |>                    # Replace remaining "!!" with a single space
    str_to_lower() |>                                # Convert to lowercase
    str_replace_all("[^a-z0-9]+", "_") |>            # Replace special characters and spaces with underscores
    str_replace_all("^_|_$", "")                     # Remove leading or trailing underscores
}

# Fetch the labels for the datasets
education_labels <- all_vars |>
  filter(str_detect(name, "B15001")) |>
  dplyr::rename(variable = name) # Rename function was mapping to plyr by default

birth_labels <- all_vars |>
  filter(str_detect(name, "B13016")) |>
  dplyr::rename(variable = name)

# Label the datasets
labeled_education_data <- education_data |>
  left_join(education_labels, by = "variable") |> # Join to get labels into education data
  sapply(convert_label_to_varname) |>             # Convert labels to an easier to understand format
  as_tibble() |>
  filter(str_detect(label, "female")) |>          # Get rid of male data
  select(GEOID, NAME, estimate, moe, label)       # Remove unneeded columns
labeled_education_data %<>% mutate(estimate = as.integer(estimate), moe = as.integer(moe)) # Fix column typing

labeled_birth_data <- birth_labels |>
  left_join(birth_data, by = "variable") |> # Join to get labels into education data
  sapply(convert_label_to_varname) |>       # Convert labels to an easier to understand format
  as_tibble() |>
  arrange(GEOID) |>                         # Reordering for readability when developing
  select(GEOID, NAME, label, estimate, moe) # Keep the required fields
labeled_birth_data %<>% mutate(estimate = as.integer(estimate)) # Fix column typing

# Summarize education data, previously split by age groups
summarised_ed <- labeled_education_data |>
  mutate(prelabel = str_remove(label, r"(\w*_\w*_\d*_\w*_\d*_years_)")) |>   # Removes age range from labels
  mutate(relabel = str_remove(prelabel, r"(\w*_\w*_65_years_\w*_over_)")) |> # 65 and over has a slightly different pattern
  select(!c(label, prelabel)) |>
  group_by(GEOID, NAME, relabel) |>       # relabel is of the form "less_than_9th_grade"
  summarise(pop_estimate = sum(estimate)) # collapse age ranges to education aggregates

## `summarise()` has grouped output by 'GEOID', 'NAME'. You can override using the
## `.groups` argument.

# Pivot wider education data, now one row per PUMA
tidied_ed <- summarised_ed |>
  pivot_wider(names_from = relabel, values_from = pop_estimate) |>
  select(!starts_with("total_female_")) # drop the population totals by age

# Truncating birth data labels to remove overly long values
relabeled_birth_data <- labeled_birth_data |> # Label trimming is different for this data
  mutate(trimmed_label = case_when(
    str_detect(label, "total_women_who_had_a_birth_in_the_past_12_months_") ~ paste0("age_", str_extract(label, r"(\d*_to_\d*)"), "_birth"),
    str_detect(label, "total_women_who_did_not_have_a_birth_in_the_past_12_months_") ~ paste0("age_", str_extract(label, r"(\d*_to_\d*)"), "_none"),
    .default = label
  )) |>
  select(GEOID, NAME, trimmed_label, estimate)

# Pivoting birth data, now one row per PUMA
detailed_birth_data <- relabeled_birth_data |>
  pivot_wider(names_from = trimmed_label, values_from = estimate)

# Removing granular columns and calculating birth rate
births_per_thousand <- detailed_birth_data |>
  mutate(births_per_thousand = total_women_who_had_a_birth_in_the_past_12_months / total * 1000) |>
  select(GEOID, NAME, births_per_thousand)

# Join education and fertility data
fertility_and_education <- tidied_ed |>
  left_join(births_per_thousand, by = "GEOID")

# Converting population of education achievement levels to proportions. This is the final dataframe for analysis
prop_fertility_and_education <- fertility_and_education |>
  mutate(
    prop_less_ninth = less_than_9th_grade / total_female,
    prop_no_diploma = `9th_to_12th_grade_no_diploma` / total_female,
    prop_diploma_or_equivalent = high_school_graduate_includes_equivalency / total_female,
    prop_some_college = some_college_no_degree / total_female,
    prop_associate_degree = associate_s_degree / total_female,
    prop_bachelor_degree = bachelor_s_degree / total_female,
    prop_advanced_degree = graduate_or_professional_degree / total_female
   ) |>
  select(GEOID, NAME.x, prop_less_ninth, prop_no_diploma, prop_diploma_or_equivalent, prop_some_college, prop_associate_degree, prop_bachelor_degree, prop_advanced_degree, births_per_thousand)

prop_fertility_and_education |>
  head(10) |>
  kbl() |>
  kable_material(c("striped"))

GEOID	NAME.x	prop_less_ninth	prop_no_diploma	prop_diploma_or_equivalent	prop_some_college	prop_associate_degree	prop_bachelor_degree	prop_advanced_degree	births_per_thousand
0100100	lauderdale_colbert_franklin_counties_puma_alabama	0.0313829	0.0830847	0.3394154	0.2447783	0.0894880	0.1354070	0.0764437	47.03931
0100200	limestone_county_puma_alabama	0.0341747	0.0816135	0.2531057	0.2344717	0.1143529	0.1812918	0.1009899	38.29303
0100300	morgan_lawrence_counties_decatur_city_puma_alabama	0.0450093	0.0954748	0.3419250	0.2191532	0.0916538	0.1349955	0.0717882	54.07314
0100401	madison_county_north_east_huntsville_city_east_puma_alabama	0.0222519	0.0418584	0.2557741	0.2203623	0.0882072	0.2403023	0.1312437	49.73894
0100402	huntsville_north_far_west_madison_east_triana_cities_puma_alabama	0.0163848	0.0373943	0.1927246	0.2424989	0.0848961	0.2678628	0.1582385	47.86090
0100403	huntsville_city_central_south_puma_alabama	0.0222660	0.0621319	0.1977666	0.2497735	0.0774214	0.2551871	0.1354535	52.11753
0100501	marshall_madison_far_southeast_counties_puma_alabama	0.0479746	0.0816117	0.3214848	0.2194284	0.1230736	0.1309223	0.0755046	61.76211
0100600	dekalb_jackson_counties_puma_alabama	0.0689824	0.1166813	0.3241602	0.2079287	0.1337123	0.0908589	0.0576762	62.72640
0100700	etowah_cherokee_counties_puma_alabama	0.0393950	0.0876117	0.3239437	0.2519120	0.1212896	0.1053877	0.0704604	56.64153
0100800	calhoun_county_puma_alabama	0.0412083	0.0832516	0.3355113	0.2640336	0.0801620	0.1195332	0.0763000	56.51008

Research question

Does the proportional education level of the female population of a community impact the birth rate in the community?

This question interests me as there is an increasing focus on female fertility rates in the United States coming from Conservative political groups. The perception among these groups is that women who go to college have measurably fewer children which, in their view, is detrimental to society. Setting aside the inflammatory rhetoric, I want to explore whether the basic premise of these arguments, that women with more education have fewer children, is accurate. This is a question that is not unexplored, and there have been studies done by the CDC and NIH on hospital birth data. These studies perform a fair amount of extrapolation to calculate fertility rates, and I would like to take a broader, simpler approach. Using ACS data, we can determine the total female population, number of births, and the proportional educational achievement of the female population. If the premise of the Conservative argument is true, we would expect a higher proportional education level to be a strong negative predictor of the number of births in an area.

Cases

The cases are Public Use Microdata Areas (PUMAs) from the 2022 American Community Survey 5 year estimate. A PUMA is defined by the Census Bureau and are “non-overlapping, statistical geographic areas that partition each state or equivalent entity into geographic areas containing no fewer than 100,000 people each” which allows for better granular data comparisons than county level data.

United States Census Bureau (2024, February 18) Public Use Microdata Areas (PUMAs). https://www.census.gov/programs-surveys/geography/guidance/geo-areas/pumas.html#reference

Data collection

The American Community Survey is an ongoing survey conducted by the Census Bureau for use by State and Federal agencies. It tracks more detailed and up to date information on Americans and American households than the Decennial Census. More detailed information on the methodology of the survey can be found on the Census Bureau website: https://www.census.gov/programs-surveys/acs/methodology/design-and-methodology.html

Type of study

This is an observational study.

Data Source

The United States Census Bureau API, information on which can be found at: https://www.census.gov/data/developers/guidance/api-user-guide.html

Describe your variables?

The independent variables will be the proportion of the female population in a PUMA with a given education level. The levels are: less than 9th grade, 9th-12th with no diploma, high school diploma or equivalent, some college without a degree, associate’s degree, bachelor’s degree, and graduate or professional degree. The proportion of the female population with a given education level is used to account for variation in the population of PUMAs. The dependent variable is the number of births per thousand women in the PUMA.

Relevant summary statistics

The first two plots are just to demonstrate that we have a fairly normal distribution of data in terms of the female population of the PUMAs as well as the birth rate, with a few notable outliers.

pop_dist_plot <- labeled_education_data |>
  filter(label == "total_female") |>
  ggplot(aes(x = estimate)) +
  geom_histogram() +
  labs(title = "Distribution of Female Population in Census PUMAs", x = "Female Population Estimate") 
pop_dist_plot

birth_dist_plot <- births_per_thousand |>
  ggplot(aes(x = births_per_thousand)) +
  geom_histogram() +
  labs(title = "Distribution of Birth Rate in Census PUMAs", x = "Births per Thousand Women")
birth_dist_plot

The data on the highest attained education level has a much more scattered distribution. Most notably the “< 9th Grade” and “Graduate Degree or Higher” groups, as well as “9th-12th” to a lesser extent, have long tails likely indicating areas with high poverty rates or near major academic institutions respectively. Applying a transformation will bring the outliers in and improve our analysis.

female_pop_totals <- summarised_ed |>
  filter(relabel == "total_female") |>
  pivot_wider(names_from = relabel, values_from = pop_estimate)
  
percent_education <- summarised_ed |>
  left_join(female_pop_totals, by = "GEOID") |>
  filter(!str_detect(relabel, "total_female")) |>
  mutate(pop_percent = pop_estimate / total_female * 100) |>
  select(GEOID, relabel, pop_percent)

education_order <- c("graduate_or_professional_degree", "bachelor_s_degree", "associate_s_degree", "some_college_no_degree", "high_school_graduate_includes_equivalency",  "9th_to_12th_grade_no_diploma", "less_than_9th_grade")

massaged_ed <- percent_education |>
  mutate(ed_factor = factor(relabel, levels = education_order)) |>
  mutate(renamed = fct_recode(ed_factor, "< 9th Grade" = "less_than_9th_grade", "9th-12th" = "9th_to_12th_grade_no_diploma", "HS diploma or equivalent" = "high_school_graduate_includes_equivalency", "Some College" = "some_college_no_degree", "Associate's Degree" = "associate_s_degree", "Bachelor's Degree" = "bachelor_s_degree", "Graduate Degree or Higher" = "graduate_or_professional_degree")) |>
  select(GEOID, renamed, pop_percent)

ed_dist_plot <- massaged_ed |>
  ggplot(aes(x = renamed, y = pop_percent)) + 
  geom_boxplot() +
  labs(title = "Education Distribution of the Female Population of PUMAs", x = "Highest Level of Education Attained",
       y = "Percent of Female Population") +
  coord_flip()
ed_dist_plot

The summary statistics provide some numbers to pair with the visualizations, giving some context on the size of the largest outliers.

summary(prop_fertility_and_education |> ungroup() |> select(!c(GEOID, NAME.x)))

##  prop_less_ninth    prop_no_diploma    prop_diploma_or_equivalent
##  Min.   :0.003256   Min.   :0.005673   Min.   :0.04423           
##  1st Qu.:0.019773   1st Qu.:0.038805   1st Qu.:0.20811           
##  Median :0.030026   Median :0.055284   Median :0.26314           
##  Mean   :0.043299   Mean   :0.060066   Mean   :0.25803           
##  3rd Qu.:0.052237   3rd Qu.:0.076960   3rd Qu.:0.30970           
##  Max.   :0.334876   Max.   :0.193413   Max.   :0.46100           
##  prop_some_college prop_associate_degree prop_bachelor_degree
##  Min.   :0.06949   Min.   :0.01559       Min.   :0.04673     
##  1st Qu.:0.19204   1st Qu.:0.07650       1st Qu.:0.14743     
##  Median :0.22355   Median :0.09243       Median :0.19234     
##  Mean   :0.21917   Mean   :0.09260       Mean   :0.20389     
##  3rd Qu.:0.24946   3rd Qu.:0.10898       3rd Qu.:0.25403     
##  Max.   :0.46620   Max.   :0.17578       Max.   :0.48135     
##  prop_advanced_degree births_per_thousand
##  Min.   :0.01422      Min.   :  9.23     
##  1st Qu.:0.07391      1st Qu.: 44.03     
##  Median :0.10650      Median : 51.49     
##  Mean   :0.12295      Mean   : 51.68     
##  3rd Qu.:0.15566      3rd Qu.: 59.24     
##  Max.   :0.50929      Max.   :172.59

Using the caret package we can easily preprocess our data and apply a Box-Cox transformation. Modeling our data we can see that many of the educational levels are statistically significant predictors, although attending some college or achieving a bachelors degree are markedly poor predictors. It should be noted the \(R^2\) of this model is very low, indicating that while education level of the female population is a statistically significant predictor it is not a very strong predictor.

control <- trainControl(method = "repeatedcv", 
                        number = 10, 
                        repeats = 3)

proportional_model <- train(births_per_thousand ~ prop_less_ninth + prop_no_diploma + prop_diploma_or_equivalent + prop_some_college + prop_associate_degree + prop_bachelor_degree + prop_advanced_degree,
                    data = prop_fertility_and_education,
                    method = "lm",
                    preProcess = c("BoxCox"),
                    trControl = control)
summary(proportional_model)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.664  -6.956  -0.368   6.714 121.920 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 73.8412    17.8951   4.126 3.81e-05 ***
## prop_less_ninth             -1.1071     0.5660  -1.956 0.050586 .  
## prop_no_diploma              9.3146     2.7317   3.410 0.000661 ***
## prop_diploma_or_equivalent  22.6356     9.4206   2.403 0.016344 *  
## prop_some_college            7.9614    11.5357   0.690 0.490160    
## prop_associate_degree       36.4204    11.3007   3.223 0.001286 ** 
## prop_bachelor_degree         0.7213     2.5968   0.278 0.781211    
## prop_advanced_degree        -4.5143     1.0950  -4.123 3.87e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.23 on 2478 degrees of freedom
## Multiple R-squared:  0.1599, Adjusted R-squared:  0.1575 
## F-statistic: 67.35 on 7 and 2478 DF,  p-value: < 2.2e-16

When we combine the educational categories into “milestones reached” groups, i.e. those who have a high school diploma equivalent or less, those who have some college up to a bachelor’s, and those with an advanced degree we find that these broader groups retain their significance which is to be expected. The model has only a slightly lower \(R^2\) than before, indicating that the added granularity of the non-milestone data does not contain significant predictive power. It should be noted that the only predictor that negatively affects birth rate is the proportion of the population with an advanced degree. This would run counter to the original supposition that a college education is a strong negative predictor of birth rates, as education level is not a strong predictor of fertility and only an advanced degree is a negative predictor of fertility.

# Combining categories so that we have three "milestone" categories
condensed_ed <- prop_fertility_and_education |>
  mutate(hs_diploma_or_less = prop_less_ninth + prop_no_diploma + prop_diploma_or_equivalent,
         up_to_bachelors = prop_some_college + prop_associate_degree + prop_bachelor_degree)

condensed_model <- train(births_per_thousand ~ hs_diploma_or_less + up_to_bachelors + prop_advanced_degree,
                    data = condensed_ed,
                    method = "lm",
                    preProcess = c("BoxCox"),
                    trControl = control)
summary(condensed_model)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.945  -7.064  -0.312   6.789 120.933 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            48.084      6.465   7.437 1.41e-13 ***
## hs_diploma_or_less     32.481     11.383   2.854  0.00436 ** 
## up_to_bachelors        48.004     19.470   2.466  0.01375 *  
## prop_advanced_degree   -4.414      1.591  -2.774  0.00558 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.32 on 2482 degrees of freedom
## Multiple R-squared:  0.1444, Adjusted R-squared:  0.1434 
## F-statistic: 139.6 on 3 and 2482 DF,  p-value: < 2.2e-16

These high level statistics are interesting, but the analysis could be extended to more focussed impacts. The PUMAs never cross state lines, so the data could be aggregated at the state level to see if impacts are stronger in, for example, states that voted red or blue in the most recent election. Another interesting possibility would be to explore whether the educational proportions are strong predictors of birth rates in different age groups. This could answer questions such as whether higher relative education rates lower the teen pregnancy rate or more broadly if they are strong predictors of birth rates in different age brackets. CDC and NIH datasets track every birth in the United States and include the education level of the mother. While not available for privacy reasons, it would be interesting to map the hospital level data to the PUMAs to see if the proportional education level of an area has an impact on the birth rates of women in a given education group.