Loading Data

# Raw ACS S2404 export; clean_names() turns the headers into snake_case
# column names that the selectors further down rely on.
raw_counties <- read.csv("ACSST5Y2022.S2404-2024-11-12T172440.csv")
county_data <- clean_names(raw_counties)
rm(raw_counties)

# County-to-region lookup (columns: Region, County).
regions_data <- read_csv(file = "County_12_Regions.csv")
## Rows: 255 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Region, County
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Cleaning Data

# Build the industry-by-county table of total and female employment estimates.
#
# Steps, in order:
#   1. keep the label column plus the "total" and "female" estimate columns;
#   2. normalise whitespace in the labels so they can be matched exactly;
#   3. keep only the 13 top-level industry rows;
#   4. strip thousands separators and convert the estimates to numeric.
#
# Selecting and filtering happen BEFORE the numeric conversion so that only
# the cells that are actually kept get coerced. Converting every column of the
# raw table first produced well over a thousand "NAs introduced by coercion"
# warnings from columns that were thrown away immediately afterwards; any
# warning that still appears now points at a value in the retained data.
industry_female_2022 <- county_data |>
  select(
    label_grouping,
    contains("texas_total_estimate"),
    contains("texas_female_estimate")
  ) |>
  mutate(label_grouping = str_squish(label_grouping)) |>
  filter(
    label_grouping %in% c(
      "Agriculture, forestry, fishing and hunting, and mining:",
      "Construction",
      "Professional, scientific, and management, and administrative and waste management services:",
      "Public administration",
      "Transportation and warehousing, and utilities:",
      "Manufacturing",
      "Wholesale trade",
      "Retail trade",
      "Other services, except public administration",
      "Educational services, and health care and social assistance:",
      "Finance and insurance, and real estate and rental and leasing:",
      "Information",
      "Arts, entertainment, and recreation, and accommodation and food services:"
    )
  ) |>
  mutate(
    across(-label_grouping, \(x) as.numeric(gsub(",", "", x, fixed = TRUE)))
  )
## Warning: There were 1785 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(-label_grouping, ~as.numeric(gsub(",", "", .)))`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1784 remaining warnings.
# Save the cleaned industry table (total and female estimates) to disk.
write.csv(
  industry_female_2022,
  file = "industry_totals_and_females.csv"
)

REGIONS

# Select `label_grouping` plus every estimate column that belongs to one of
# the named counties.
#
# Why not contains("<county>")?
#   * Column names were snake_cased by clean_names(), so a pattern with a
#     space ("san saba", "el paso", ...) never matches and the county is
#     silently dropped from its region.
#   * contains() is a substring test, so "hill" also picks up hemphill,
#     "roberts" picks up robertson, "ward" picks up edwards and howard,
#     "harris" picks up harrison, and so on, leaking counties into the wrong
#     region.
#
# Matching is therefore done on the whole county name, anchored at the start
# of the column name. Underscores are ignored on both sides because
# clean_names() may or may not split names such as "DeWitt" / "McLennan"
# into "de_witt" / "mc_lennan".
#
# NOTE(review): assumes column names look like
# "<county>[_county]_texas_<measure>_estimate" -- confirm against
# names(industry_female_2022).
#
# data:     data frame with a `label_grouping` column and per-county columns.
# counties: character vector of county names (case and spacing are ignored).
# Returns `data` restricted to `label_grouping` and the matching columns, in
# the order the counties are listed. Warns (does not stop) when a county
# matches no column, so a typo cannot silently shrink a region.
select_region_counties <- function(data, counties) {
  column_keys <- gsub("_", "", names(data), fixed = TRUE)
  county_keys <- gsub("[^a-z]", "", tolower(counties))

  columns_by_county <- lapply(county_keys, function(key) {
    names(data)[grepl(paste0("^", key, "(county)?texas"), column_keys)]
  })

  unmatched <- counties[lengths(columns_by_county) == 0L]
  if (length(unmatched) > 0L) {
    warning(
      "No columns found for county: ",
      paste(unmatched, collapse = ", "),
      call. = FALSE
    )
  }

  wanted <- as.character(unlist(columns_by_county, use.names = FALSE))
  select(data, label_grouping, all_of(wanted))
}

alamo_region <- select_region_counties(industry_female_2022, c(
  "atascosa", "bandera", "bexar", "calhoun", "comal", "dewitt", "frio",
  "gillespie", "goliad", "gonzales", "guadalupe", "jackson", "karnes",
  "kendall", "kerr", "lavaca", "medina", "victoria", "wilson"
))

capitol_region <- select_region_counties(industry_female_2022, c(
  "bastrop", "blanco", "burnet", "caldwell", "fayette", "hays", "lee",
  "llano", "travis", "williamson"
))

central_texas_region <- select_region_counties(industry_female_2022, c(
  "bell", "bosque", "brazos", "burleson", "coryell", "falls", "freestone",
  "grimes", "hamilton", "hill", "lampasas", "leon", "limestone", "madison",
  "mclennan", "milam", "mills", "robertson", "san saba", "washington"
))

gulf_coast_region <- select_region_counties(industry_female_2022, c(
  "austin", "brazoria", "chambers", "colorado", "fort bend", "galveston",
  "harris", "liberty", "matagorda", "montgomery", "walker", "waller",
  "wharton"
))

high_plains_region <- select_region_counties(industry_female_2022, c(
  "armstrong", "bailey", "briscoe", "carson", "castro", "childress",
  "cochran", "collingsworth", "crosby", "dallam", "deaf smith", "dickens",
  "donley", "floyd", "garza", "gray", "hale", "hall", "hansford", "hartley",
  "hemphill", "hockley", "hutchinson", "king", "lamb", "lipscomb", "lubbock",
  "lynn", "moore", "motley", "ochiltree", "oldham", "parmer", "potter",
  "randall", "roberts", "sherman", "swisher", "terry", "wheeler", "yoakum"
))

metroplex_region <- select_region_counties(industry_female_2022, c(
  "collin", "cooke", "dallas", "denton", "ellis", "erath", "fannin",
  "grayson", "hood", "hunt", "johnson", "kaufman", "navarro", "palo pinto",
  "parker", "rockwall", "somervell", "tarrant", "wise"
))

northwest_region <- select_region_counties(industry_female_2022, c(
  "archer", "baylor", "brown", "callahan", "clay", "coleman", "comanche",
  "cottle", "eastland", "fisher", "foard", "hardeman", "haskell", "jack",
  "jones", "kent", "knox", "mitchell", "montague", "nolan", "runnels",
  "scurry", "shackelford", "stephens", "stonewall", "taylor", "throckmorton",
  "wichita", "wilbarger", "young"
))

south_texas_region <- select_region_counties(industry_female_2022, c(
  "aransas", "bee", "brooks", "cameron", "dimmit", "duval", "edwards",
  "hidalgo", "jim hogg", "jim wells", "kenedy", "kinney", "kleberg",
  "la salle", "live oak", "maverick", "mcmullen", "nueces", "real",
  "refugio", "san patricio", "starr", "uvalde", "val verde", "webb",
  "willacy", "zapata", "zavala"
))

southeast_region <- select_region_counties(industry_female_2022, c(
  "angelina", "hardin", "houston", "jasper", "jefferson", "nacogdoches",
  "newton", "orange", "polk", "sabine", "san augustine", "san jacinto",
  "shelby", "trinity", "tyler"
))

upper_east_region <- select_region_counties(industry_female_2022, c(
  "anderson", "bowie", "camp", "cass", "cherokee", "delta", "franklin",
  "gregg", "harrison", "henderson", "hopkins", "lamar", "marion", "morris",
  "panola", "rains", "red river", "rusk", "smith", "titus", "upshur",
  "van zandt", "wood"
))

upper_rio_grande <- select_region_counties(industry_female_2022, c(
  "brewster", "culberson", "el paso", "hudspeth", "jeff davis", "presidio"
))

west_texas_region <- select_region_counties(industry_female_2022, c(
  "andrews", "borden", "coke", "concho", "crane", "crockett", "dawson",
  "ector", "gaines", "glasscock", "howard", "irion", "kimble", "loving",
  "martin", "mason", "mcculloch", "menard", "midland", "pecos", "reagan",
  "reeves", "schleicher", "sterling", "sutton", "terrell", "tom green",
  "upton", "ward", "winkler"
))

# Export one region as a sample of the per-region table layout.
write.csv(file = "region_example.csv", alamo_region)

Functions applied to each regional data set to compute the female share of employment in each industry

#' Flip a label-by-location table into a location-by-label table
#'
#' @param data A data frame with a `label_grouping` column; every other
#'   column is treated as one location.
#' @return A data frame with one row per original column (its name stored in
#'   `location`) and one column per `label_grouping` value.
transpose_label_grouping <- function(data) {
  # Stack every non-label column into (location, value) pairs ...
  long_form <- pivot_longer(
    data,
    cols = -label_grouping,
    names_to = "location",
    values_to = "value"
  )

  # ... then spread the labels back out as the new column headers.
  pivot_wider(
    long_form,
    names_from = label_grouping,
    values_from = value
  )
}
# Worked example for a single region (Alamo).

# Female workers per industry, summed across the region's counties;
# counties with a missing estimate contribute nothing to the sum.
alamo_females <- alamo_region |>
  select(c(label_grouping, contains("female_estimate"))) |>
  mutate(female_sum = rowSums(across(!label_grouping), na.rm = TRUE)) |>
  select(c(label_grouping, female_sum))

# All workers per industry, summed the same way.
alamo_total <- alamo_region |>
  select(c(label_grouping, contains("texas_total"))) |>
  mutate(total_sum = rowSums(across(!label_grouping), na.rm = TRUE)) |>
  select(c(label_grouping, total_sum))

# Female share of each industry's workforce in the region.
alamo_percentage <- alamo_females |>
  left_join(alamo_total, by = "label_grouping") |>
  mutate(industry_comp = female_sum / total_sum)
library(dplyr)

#' Female share of employment per industry for one region
#'
#' @param region_data A data frame with a `label_grouping` column plus
#'   per-county columns whose names contain "female_estimate" and
#'   "texas_total".
#' @return A data frame with one row per `label_grouping` and the columns
#'   `female_sum`, `total_sum` and `industry_comp`
#'   (`female_sum / total_sum`). Missing county estimates are ignored when
#'   summing.
calculate_industry_percentage <- function(region_data) {
  # Female workers per industry, added up over the region's counties.
  female_counts <- region_data |>
    select(c(label_grouping, contains("female_estimate"))) |>
    mutate(female_sum = rowSums(across(!label_grouping), na.rm = TRUE)) |>
    select(c(label_grouping, female_sum))

  # All workers per industry, added up over the same counties.
  overall_counts <- region_data |>
    select(c(label_grouping, contains("texas_total"))) |>
    mutate(total_sum = rowSums(across(!label_grouping), na.rm = TRUE)) |>
    select(c(label_grouping, total_sum))

  female_counts |>
    left_join(overall_counts, by = "label_grouping") |>
    mutate(industry_comp = female_sum / total_sum)
}

ALL REGIONS

# One two-column table per region: the industry label and that region's
# female share of employment, stored under the region's display name.
per_alamo <- alamo_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Alamo = industry_comp)

per_capitol <- capitol_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Capitol = industry_comp)

per_central_texas <- central_texas_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `Central Texas` = industry_comp)

per_gulf_coast <- gulf_coast_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `Gulf Coast` = industry_comp)

per_high_plains <- high_plains_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `High Plains` = industry_comp)

per_metroplex <- metroplex_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Metroplex = industry_comp)

per_northwest <- northwest_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Northwest = industry_comp)

per_south_texas <- south_texas_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `South Texas` = industry_comp)

per_southeast <- southeast_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Southeast = industry_comp)

per_upper_east <- upper_east_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `Upper East` = industry_comp)

per_upper_rio_grande <- upper_rio_grande |>
  calculate_industry_percentage() |>
  select(label_grouping, `Upper Rio Grande` = industry_comp)

per_west_texas <- west_texas_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `West Texas` = industry_comp)
# Collect the per-region tables in the order their columns should appear.
datasets <- list(
  per_alamo,
  per_capitol,
  per_central_texas,
  per_gulf_coast,
  per_high_plains,
  per_metroplex,
  per_northwest,
  per_south_texas,
  per_southeast,
  per_upper_east,
  per_upper_rio_grande,
  per_west_texas
)

# Fold the list into one wide table: one row per industry, one column per
# region, joined left-to-right on the industry label.
all_regions_industry_comp <- datasets |>
  reduce(\(joined, region) left_join(joined, region, by = "label_grouping"))

write.csv(
  all_regions_industry_comp,
  file = "all_texas_regions_female_industry_comp.csv"
)