Loading Data
# Read the ACSST5Y2022 S2404 CSV export and normalise its column names.
county_data <- clean_names(
  read.csv("ACSST5Y2022.S2404-2024-11-12T172440.csv")
)
# Region/county lookup table (two character columns: Region, County).
regions_data <- read_csv(file = "County_12_Regions.csv")
## Rows: 255 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Region, County
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Cleaning data
# Build the industry-by-county table of total and female employment estimates.
# Only the "total" and "female" estimate columns are kept, and only the
# industry rows listed below.
# The columns are selected BEFORE the numeric conversion so that columns that
# are discarded anyway are not coerced (the original order converted every
# column and reported 1785 "NAs introduced by coercion" warnings).
industry_female_2022 <- county_data |>
  select(
    label_grouping,
    contains("texas_total_estimate"),
    contains("texas_female_estimate")
  ) |>
  # Strip thousands separators, then convert; entries that are not numbers
  # become NA (as.numeric() warns when that happens).
  mutate(across(-label_grouping, \(x) as.numeric(gsub(",", "", x)))) |>
  # Collapse repeated/leading/trailing whitespace so labels compare exactly.
  mutate(label_grouping = str_squish(label_grouping)) |>
  filter(
    label_grouping %in% c(
      "Agriculture, forestry, fishing and hunting, and mining:",
      "Construction",
      "Professional, scientific, and management, and administrative and waste management services:",
      "Public administration",
      "Transportation and warehousing, and utilities:",
      "Manufacturing",
      "Wholesale trade",
      "Retail trade",
      "Other services, except public administration",
      "Educational services, and health care and social assistance:",
      "Finance and insurance, and real estate and rental and leasing:",
      "Information",
      "Arts, entertainment, and recreation, and accommodation and food services:"
    )
  )
## Warning: There were 1785 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(-label_grouping, ~as.numeric(gsub(",", "", .)))`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1784 remaining warnings.
# Save the cleaned industry table (write.csv defaults, row names included).
write.csv(industry_female_2022, file = "industry_totals_and_females.csv")
REGIONS
# Alamo region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains(),
# which matches anywhere in a column name and is therefore prone to picking up
# other counties whose names contain the same letters.
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
# NOTE(review): clean_names() may split "DeWitt" into "de_witt"; both spellings
# are listed so the county is kept either way -- confirm and drop the unused one.
alamo_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "atascosa", "bandera", "bexar", "calhoun", "comal",
        "dewitt", "de_witt", "frio", "gillespie", "goliad",
        "gonzales", "guadalupe", "jackson", "karnes", "kendall",
        "kerr", "lavaca", "medina", "victoria", "wilson"
      ),
      "_county_"
    ))
  )
# Capitol region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains(),
# because contains() matches anywhere in a column name (a short pattern such
# as "lee" would match any column that merely contains those letters).
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
capitol_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "bastrop", "blanco", "burnet", "caldwell", "fayette",
        "hays", "lee", "llano", "travis", "williamson"
      ),
      "_county_"
    ))
  )
# Central Texas region: label_grouping plus the estimate columns of its
# counties.
# Columns are matched on the "<county>_county_" prefix instead of contains():
#   * contains("hill") also matched the hemphill columns;
#   * contains("san saba") contains a space and so could never match the
#     underscore-separated names produced by clean_names().
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
# NOTE(review): clean_names() may split "McLennan" into "mc_lennan"; both
# spellings are listed so the county is kept either way -- confirm and drop the
# unused one.
central_texas_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "bell", "bosque", "brazos", "burleson", "coryell",
        "falls", "freestone", "grimes", "hamilton", "hill",
        "lampasas", "leon", "limestone", "madison",
        "mclennan", "mc_lennan", "milam", "mills", "robertson",
        "san_saba", "washington"
      ),
      "_county_"
    ))
  )
# Gulf Coast region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains():
#   * contains("harris") also matched the harrison columns;
#   * contains("fort bend") contains a space and so could never match the
#     underscore-separated names produced by clean_names().
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
gulf_coast_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "austin", "brazoria", "chambers", "colorado", "fort_bend",
        "galveston", "harris", "liberty", "matagorda", "montgomery",
        "walker", "waller", "wharton"
      ),
      "_county_"
    ))
  )
# High Plains region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains():
#   * contains("gray") also matched grayson and contains("roberts") also
#     matched robertson;
#   * contains("deaf smith") contains a space and so could never match the
#     underscore-separated names produced by clean_names().
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
high_plains_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "armstrong", "bailey", "briscoe", "carson", "castro",
        "childress", "cochran", "collingsworth", "crosby", "dallam",
        "deaf_smith", "dickens", "donley", "floyd", "garza",
        "gray", "hale", "hall", "hansford", "hartley",
        "hemphill", "hockley", "hutchinson", "king", "lamb",
        "lipscomb", "lubbock", "lynn", "moore", "motley",
        "ochiltree", "oldham", "parmer", "potter", "randall",
        "roberts", "sherman", "swisher", "terry", "wheeler",
        "yoakum"
      ),
      "_county_"
    ))
  )
# Metroplex region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains():
#   * contains("collin") also matched the collingsworth columns;
#   * contains("palo pinto") contains a space and so could never match the
#     underscore-separated names produced by clean_names().
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
metroplex_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "collin", "cooke", "dallas", "denton", "ellis",
        "erath", "fannin", "grayson", "hood", "hunt",
        "johnson", "kaufman", "navarro", "palo_pinto", "parker",
        "rockwall", "somervell", "tarrant", "wise"
      ),
      "_county_"
    ))
  )
# Northwest region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains(),
# because contains() matches anywhere in a column name: contains("jack") also
# matched the jackson columns.
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
northwest_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "archer", "baylor", "brown", "callahan", "clay",
        "coleman", "comanche", "cottle", "eastland", "fisher",
        "foard", "hardeman", "haskell", "jack", "jones",
        "kent", "knox", "mitchell", "montague", "nolan",
        "runnels", "scurry", "shackelford", "stephens", "stonewall",
        "taylor", "throckmorton", "wichita", "wilbarger", "young"
      ),
      "_county_"
    ))
  )
# South Texas region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains().
# The multi-word patterns ("jim hogg", "jim wells", "la salle", "live oak",
# "san patricio", "val verde") contained spaces and so could never match the
# underscore-separated names produced by clean_names(); those counties were
# silently left out.
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
# NOTE(review): clean_names() may split "McMullen" into "mc_mullen"; both
# spellings are listed so the county is kept either way -- confirm and drop the
# unused one.
south_texas_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "aransas", "bee", "brooks", "cameron", "dimmit",
        "duval", "edwards", "hidalgo", "jim_hogg", "jim_wells",
        "kenedy", "kinney", "kleberg", "la_salle", "live_oak",
        "maverick", "mcmullen", "mc_mullen", "nueces", "real",
        "refugio", "san_patricio", "starr", "uvalde", "val_verde",
        "webb", "willacy", "zapata", "zavala"
      ),
      "_county_"
    ))
  )
# Southeast region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains().
# contains("san augustine") and contains("san jacinto") contained spaces and so
# could never match the underscore-separated names produced by clean_names();
# those counties were silently left out.
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
southeast_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "angelina", "hardin", "houston", "jasper", "jefferson",
        "nacogdoches", "newton", "orange", "polk", "sabine",
        "san_augustine", "san_jacinto", "shelby", "trinity", "tyler"
      ),
      "_county_"
    ))
  )
# Upper East region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains():
#   * contains("smith") also matched the deaf smith columns;
#   * contains("red river") and contains("van zandt") contained spaces and so
#     could never match the underscore-separated names produced by
#     clean_names().
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
upper_east_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "anderson", "bowie", "camp", "cass", "cherokee",
        "delta", "franklin", "gregg", "harrison", "henderson",
        "hopkins", "lamar", "marion", "morris", "panola",
        "rains", "red_river", "rusk", "smith", "titus",
        "upshur", "van_zandt", "wood"
      ),
      "_county_"
    ))
  )
# Upper Rio Grande region: label_grouping plus the estimate columns of its
# counties.
# Columns are matched on the "<county>_county_" prefix instead of contains().
# contains("el paso") and contains("jeff davis") contained spaces and so could
# never match the underscore-separated names produced by clean_names(); those
# counties were silently left out.
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
upper_rio_grande <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "brewster", "culberson", "el_paso", "hudspeth", "jeff_davis",
        "presidio"
      ),
      "_county_"
    ))
  )
# West Texas region: label_grouping plus the estimate columns of its counties.
# Columns are matched on the "<county>_county_" prefix instead of contains():
#   * contains("ward") also matched the edwards columns (and howard);
#   * contains("tom green") contains a space and so could never match the
#     underscore-separated names produced by clean_names().
# NOTE(review): assumes cleaned column names look like
# "<county>_county_texas_total_estimate" -- confirm with
# names(industry_female_2022).
# NOTE(review): clean_names() may split "McCulloch" into "mc_culloch"; both
# spellings are listed so the county is kept either way -- confirm and drop the
# unused one.
west_texas_region <- industry_female_2022 |>
  select(
    label_grouping,
    starts_with(paste0(
      c(
        "andrews", "borden", "coke", "concho", "crane",
        "crockett", "dawson", "ector", "gaines", "glasscock",
        "howard", "irion", "kimble", "loving", "martin",
        "mason", "mcculloch", "mc_culloch", "menard", "midland",
        "pecos", "reagan", "reeves", "schleicher", "sterling",
        "sutton", "terrell", "tom_green", "upton", "ward",
        "winkler"
      ),
      "_county_"
    ))
  )
# Save the Alamo table as an example of a per-region extract.
write.csv(alamo_region, file = "region_example.csv")
Functions that compute, for each regional data set, the female share of employment in each industry
# Transpose a table keyed by `label_grouping`.
#
# Every column other than `label_grouping` becomes one row (its name is stored
# in `location`), and every `label_grouping` value becomes a column.
#
# data: a data frame with a `label_grouping` column.
# Returns the reshaped table.
transpose_label_grouping <- function(data) {
  # Stack all non-label columns into (location, value) pairs.
  long_form <- pivot_longer(
    data,
    cols = -label_grouping,
    names_to = "location",
    values_to = "value"
  )
  # Spread the labels back out as columns.
  pivot_wider(
    long_form,
    names_from = label_grouping,
    values_from = value
  )
}
# Female employment per industry, summed over the Alamo counties
# (missing values are ignored in the sum).
alamo_females <- select(alamo_region, label_grouping, contains("female_estimate"))
alamo_females <- mutate(
  alamo_females,
  female_sum = rowSums(across(-label_grouping), na.rm = TRUE)
)
alamo_females <- select(alamo_females, label_grouping, female_sum)

# Total employment per industry, summed the same way.
alamo_total <- select(alamo_region, label_grouping, contains("texas_total"))
alamo_total <- mutate(
  alamo_total,
  total_sum = rowSums(across(-label_grouping), na.rm = TRUE)
)
alamo_total <- select(alamo_total, label_grouping, total_sum)

# Female share of each industry: female_sum / total_sum.
alamo_percentage <- alamo_females |>
  left_join(alamo_total, by = "label_grouping") |>
  mutate(industry_comp = female_sum / total_sum)
library(dplyr)
# Compute the female share of employment per industry for one region.
#
# region_data: a data frame with a `label_grouping` column plus numeric columns
#   whose names contain "female_estimate" and "texas_total".
# Returns a data frame with columns label_grouping, female_sum, total_sum and
#   industry_comp (= female_sum / total_sum).
calculate_industry_percentage <- function(region_data) {
  # Row-wise sum of the female estimate columns; NA values are skipped.
  female_part <- select(region_data, label_grouping, contains("female_estimate"))
  female_part <- mutate(
    female_part,
    female_sum = rowSums(across(-label_grouping), na.rm = TRUE)
  )
  female_part <- select(female_part, label_grouping, female_sum)

  # Row-wise sum of the total estimate columns; NA values are skipped.
  total_part <- select(region_data, label_grouping, contains("texas_total"))
  total_part <- mutate(
    total_part,
    total_sum = rowSums(across(-label_grouping), na.rm = TRUE)
  )
  total_part <- select(total_part, label_grouping, total_sum)

  # Pair the two sums by industry label and take the ratio.
  combined <- left_join(female_part, total_part, by = "label_grouping")
  mutate(combined, industry_comp = female_sum / total_sum)
}
ALL REGIONS
# One two-column table per region: label_grouping plus the region's female
# share of each industry, with the share column named after the region.
per_alamo <- alamo_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Alamo = industry_comp)

per_capitol <- capitol_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Capitol = industry_comp)

per_central_texas <- central_texas_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `Central Texas` = industry_comp)

per_gulf_coast <- gulf_coast_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `Gulf Coast` = industry_comp)

per_high_plains <- high_plains_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `High Plains` = industry_comp)

per_metroplex <- metroplex_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Metroplex = industry_comp)

per_northwest <- northwest_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Northwest = industry_comp)

per_south_texas <- south_texas_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `South Texas` = industry_comp)

per_southeast <- southeast_region |>
  calculate_industry_percentage() |>
  select(label_grouping, Southeast = industry_comp)

per_upper_east <- upper_east_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `Upper East` = industry_comp)

per_upper_rio_grande <- upper_rio_grande |>
  calculate_industry_percentage() |>
  select(label_grouping, `Upper Rio Grande` = industry_comp)

per_west_texas <- west_texas_region |>
  calculate_industry_percentage() |>
  select(label_grouping, `West Texas` = industry_comp)
# Collect the per-region tables and join them side by side on the industry
# label, giving one row per industry and one column per region.
datasets <- list(
  per_alamo,
  per_capitol,
  per_central_texas,
  per_gulf_coast,
  per_high_plains,
  per_metroplex,
  per_northwest,
  per_south_texas,
  per_southeast,
  per_upper_east,
  per_upper_rio_grande,
  per_west_texas
)
all_regions_industry_comp <- datasets |>
  reduce(left_join, by = "label_grouping")
# Save the combined region-by-industry table of female shares.
write.csv(
  all_regions_industry_comp,
  file = "all_texas_regions_female_industry_comp.csv"
)