TWF Dems

Author

CRG

TWF Project: Demographic Data

Demographic Data for the Texas Women’s Foundation Project.

I am using IPUMS 5-Year Estimates for 2022 and 2014: Steven Ruggles, Sarah Flood, Matthew Sobek, Daniel Backman, Annie Chen, Grace Cooper, Stephanie Richards, Renae Rodgers, and Megan Schouweiler. IPUMS USA: Version 15.0 [dataset]. Minneapolis, MN: IPUMS, 2024. https://doi.org/10.18128/D010.V15.0

1. Pull in Data from Ipums

options(warn=-1)

suppressPackageStartupMessages({
library(dplyr)
library(writexl)
library(ipumsr) })


usa_00071<- read_ipums_ddi("usa_00071.xml")
acs_00071 <- read_ipums_micro(usa_00071, data_file = ("usa_00071.dat.gz"), verbose = FALSE)

names(acs_00071) <- tolower(names(acs_00071))

head(acs_00071)
# A tibble: 6 × 27
   year multyear sample            serial cbserial  hhwt cluster statefip strata
  <int>    <dbl> <int+lbl>          <dbl>    <dbl> <dbl>   <dbl> <int+lb>  <dbl>
1  2012     2010 201203 [2010-201… 3.36e6  2.01e12    77 2.01e12 48 [Tex… 350148
2  2012     2010 201203 [2010-201… 3.36e6  2.01e12    77 2.01e12 48 [Tex… 350148
3  2012     2010 201203 [2010-201… 3.36e6  2.01e12    77 2.01e12 48 [Tex… 350148
4  2012     2010 201203 [2010-201… 3.36e6  2.01e12    77 2.01e12 48 [Tex… 350148
5  2012     2010 201203 [2010-201… 3.36e6  2.01e12    77 2.01e12 48 [Tex… 350148
6  2012     2010 201203 [2010-201… 3.36e6  2.01e12    20 2.01e12 48 [Tex… 460148
# ℹ 18 more variables: gq <int+lbl>, pernum <dbl>, perwt <dbl>, sex <int+lbl>,
#   age <int+lbl>, marst <int+lbl>, race <int+lbl>, raced <int+lbl>,
#   hispan <int+lbl>, hispand <int+lbl>, educ <int+lbl>, educd <int+lbl>,
#   degfield <int+lbl>, degfieldd <int+lbl>, degfield2 <int+lbl>,
#   degfield2d <int+lbl>, inctot <dbl+lbl>, poverty <dbl+lbl>
txw_data <- acs_00071 

# str(txw_data)

names(txw_data)
 [1] "year"       "multyear"   "sample"     "serial"     "cbserial"  
 [6] "hhwt"       "cluster"    "statefip"   "strata"     "gq"        
[11] "pernum"     "perwt"      "sex"        "age"        "marst"     
[16] "race"       "raced"      "hispan"     "hispand"    "educ"      
[21] "educd"      "degfield"   "degfieldd"  "degfield2"  "degfield2d"
[26] "inctot"     "poverty"   

2. Recodes

library(dplyr)

txw_data1 <- txw_data %>%
  filter(sex == 2)%>%
  filter(!is.na (educd) & !is.na(hispan) & !is.na(race) & !is.na(age) & !is.na(marst)) %>% 
  mutate(
    educ_level = case_when(
      educd %in% c(0, 1) ~ NA_character_,  # N/A 
      educd == 2 ~ "None",                  # No school
      educd %in% c(10:17, 20:26, 30, 40, 50, 61) ~ "LT_HS",  # Nursery school to grade 4, Grade 5, 6, 7, or 8, Grade 9, Grade 10, Grade 11, 12th grade, no diploma
      educd %in% c(62:64) ~ "HSD_GED",      # High school graduate or GED, Regular high school diploma, GED or alternative credential
      educd %in% c(65, 70:71, 80, 90, 100) ~ "SomeColl",  # Some college, but less than 1 year, 1 year of college, 1 or more years of college credit, no degree, 2 years of college, 3 years of college, 4 years of college
      educd %in% c(81:83) ~ "AssocDegree",  # Associate's degree, type not specified, Associate's degree, occupational program, Associate's degree, academic program
      educd == 101 ~ "BachelorDegree",       # Bachelor's degree
      educd %in% c(110:116) ~ "GradDegree",  # 5+ years of college, 6 years of college (6+ in 1960-1970), 7 years of college, 8+ years of college, Master's degree, Professional degree beyond a bachelor's degree, Doctoral degree
      TRUE ~ NA_character_                  # Other cases
    ),
   race_ethnicity = case_when(
      hispan %in% 1:4 ~ "Hispanic",
      race == 1 & hispan == 0 ~ "NH White",
      race == 2 & hispan == 0 ~ "NH Black",
      race == 3 & hispan == 0 ~ "NH AIAN",
      race %in% c(4, 5, 6) & hispan == 0 ~ "NH Asian",
      race %in% c(7, 8, 9) & hispan == 0 ~ "NH Other Race",
      TRUE ~ "Unknown"
    ), 
   age_group = case_when(
      age <= 17 ~ "0-17",
      age <= 24 & age >= 18 ~ "18-24",
      age <= 64 & age >= 25 ~ "25-64",
      age >= 65 ~ "65+",
      TRUE ~ "Unknown"
    ),
   marst_recode = ifelse(marst %in% c(1, 2), "Married", 
                          ifelse(marst %in% c(3, 4, 5), "Separated/Divorced/Widowed", 
                                 ifelse(marst == 6, "Never married/single", "Missing/Blank"))))


# str(txw_data1)

Age Breakdown

library(dplyr)
library(tidyr)
library(survey)
Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Loading required package: survival

Attaching package: 'survey'
The following object is masked from 'package:graphics':

    dotchart
library(writexl)

txw_design <- svydesign(ids = ~1, data = txw_data1, weights = ~perwt)


# Age Group Summary
age_summary <- svytable(~ year + age_group, design = txw_design)

# Marital Status Summary
marital_status_summary <- svytable(~ year + marst_recode, design = txw_design)

# Combined Age Group and Marital Status Summary
combined_summary <- svytable(~ year + age_group + marst_recode, design = txw_design)

# Convert svytable to data frame
age_summary_df <- as.data.frame(age_summary)
marital_status_summary_df <- as.data.frame(marital_status_summary)
combined_summary_df <- as.data.frame(combined_summary)

# Write to Excel
write_xlsx(list(age_summary = age_summary_df, 
                marital_status_summary = marital_status_summary_df, 
                combined_summary = combined_summary_df), 
           "age_summary.xlsx")

By Race/Ethnicity

library(dplyr)
library(survey)
library(writexl)


txw_design <- svydesign(ids = ~1, data = txw_data1, weights = ~perwt)

# Summarize data to get the count of women by race_ethnicity and year
race_ethnicity_summary <- svytable(~ year + race_ethnicity, design = txw_design)

# Convert to a data frame
race_ethnicity_df <- as.data.frame(race_ethnicity_summary)

# Calculate the total women per year for normalization
total_women_per_year <- aggregate(Freq ~ year, data = race_ethnicity_df, FUN = sum)

# Merge to calculate percentages
race_ethnicity_df <- merge(race_ethnicity_df, total_women_per_year, by = "year", suffixes = c("", "_total"))

# Calculate %
race_ethnicity_df$percentage <- (race_ethnicity_df$Freq / race_ethnicity_df$Freq_total) * 100

# clean up
race_ethnicity_df <- race_ethnicity_df %>%
  select(year, race_ethnicity, percentage) %>%
  arrange(year, race_ethnicity)

print(race_ethnicity_df)
   year race_ethnicity percentage
1  2012       Hispanic  37.415109
2  2012        NH AIAN   0.255130
3  2012       NH Asian   4.017370
4  2012       NH Black  11.867968
5  2012  NH Other Race   1.519564
6  2012       NH White  44.924859
7  2022       Hispanic  39.404874
8  2022        NH AIAN   0.173619
9  2022       NH Asian   5.250103
10 2022       NH Black  12.126077
11 2022  NH Other Race   2.863694
12 2022       NH White  40.181632
library(openxlsx)
write_xlsx(race_ethnicity_df, "women_by_race_ethnicity_2012_2022.xlsx")

Women in Texas by Race/Ethnicity

Includes MOEs, MOERs, and CIs

# 
# # Filter for women aged 18-64
# women_filtered <- txw_data1 %>%
#   filter(age >= 18 & age <= 64, sex == 2) 
# 
# # For 2022 data
# w_2022 <- women_filtered %>%
#   filter(year == 2022) %>%
#   mutate(year = "2022")
# 
# # For 2014 data
# w_2014 <- women_filtered %>%
#   filter(year == 2014) %>%
#   mutate(year = "2014")
# 
# # Combine 
# combined_table2 <- bind_rows(w_2022, w_2014) %>%
#   group_by(year, race_ethnicity) %>%
#   summarize(total_women = sum(perwt),
#             .groups = "drop") %>%
#   mutate(margin_of_error = sqrt(total_women),
#          MOER = margin_of_error / total_women,
#          # Calculate confidence intervals
#          ci = paste0("(", round(total_women - margin_of_error, 2), 
#                      ", ", round(total_women + margin_of_error, 2), ")")) %>%
#   ungroup() %>%
#   group_by(race_ethnicity) %>%
#   mutate(
#     percent_of_total = total_women / sum(total_women) * 100,
#     percent_change = (total_women[year == "2022"] - total_women[year == "2014"]) / total_women[year == "2014"] * 100
#   ) %>%
#   arrange(year, desc(percent_change))
# 
# print(combined_table2)
# 
# write_xlsx(combined_table2, "combined_table4.xlsx")

Visualizing the above

# library(ggplot2)
# library(patchwork)
# 
# # Function to create pie charts
# create_pie_chart <- function(data, year) {
#   ggplot(combined_table2, aes(x = "", y = total_women, fill = race_ethnicity)) +
#     geom_bar(stat = "identity", width = 1) +
#     coord_polar("y", start = 0) +
#     labs(title = paste("Race/Ethnicity Distribution of Texas Women -", year),
#          x = NULL, y = NULL, fill = "Race/Ethnicity") +
#     theme_void() +
#     theme(legend.position = "right")
# }
# 
# pie_2014 <- create_pie_chart(w_2014, "2014")
# pie_2022 <- create_pie_chart(w_2022, "2022")
# 
# 
# combined_pie <- pie_2014 + pie_2022
# 
# print(combined_pie)

——-Tidycensus stuff

# library(tidycensus)
# library(tidyverse)
# 
# # Define the variables for women's population by race/ethnicity
# vars_reth <- c(
#   "B01001_017E" = "Total", # Total female population
#   "B01001H_017E" = "NH White", # NH White female
#   "B01001B_017E" = "Black", # Black female
#   "B01001D_017E" = "Asian", # Asian female
#   "B01001E_017E" = "NHPI", # NHPI female
#   "B01001I_017E" = "Hispanic", # Hispanic female
#   "B01001F_017E" = "Other"  # Other female
# )
# 
# # Define the years of interest
# years <- c(2012:2019, 2021:2022)
# names(years) <- years
# 
# # Fetch population estimates for each race/ethnicity group for Texas for each year
# texas_population <- map_dfr(years, ~{
#   get_acs(
#     geography = "state",
#     variables = vars_reth,
#     state = "TX",
#     year = .x,
#     survey = "acs1"
#   )
# }, .id = "year")
# 
# # Reshape the data to long format
# texas_population_long <- texas_population %>%
#   pivot_longer(cols = starts_with("B01001"), 
#                names_to = "ethnicity_code", 
#                values_to = "population") %>%
#   mutate(ethnicity = case_when(
#     ethnicity_code == "B01001_017E" ~ "Total",
#     ethnicity_code == "B01001H_017E" ~ "NH White",
#     ethnicity_code == "B01001B_017E" ~ "Black",
#     ethnicity_code == "B01001D_017E" ~ "Asian",
#     ethnicity_code == "B01001E_017E" ~ "NHPI",
#     ethnicity_code == "B01001I_017E" ~ "Hispanic",
#     ethnicity_code == "B01001F_017E" ~ "Other"
#   ))
# 
# # Visualize the data using ggplot2
# ggplot(texas_population_long, aes(x = year, y = population, color = ethnicity)) +
#   geom_line() +
#   geom_point() +
#   labs(x = "Year", y = "Population", color = "Race/Ethnicity", 
#        title = "Texas Women Population by Race/Ethnicity Over Time")