Loading in Data

industry_data_2012 <- read.csv("2012_2022 5 YEAR by industry S2404(2012).csv")
industry_data_2022 <- read.csv("2012_2022 5 YEAR by industry S2404(2022).csv")
rucc_data <- read.csv("Ruralurbancontinuumcodes2023.csv")
locations <- read.csv("location_countyNames.csv")

parsing rural urban continuum codes

urban_rural <- rucc_data |>
  mutate(Rural_Urban_Stat = ifelse(grepl("nonmetro", Description, ignore.case = TRUE), "Rural",
               ifelse(grepl("metro", Description, ignore.case = TRUE), "Urban", NA)))
table(urban_rural$Rural_Urban_Stat)
## 
## Rural Urban 
##   168    86
table(rucc_data$Description)
## 
##               Metro - Counties in metro areas of 1 million population or more 
##                                                                            34 
##            Metro - Counties in metro areas of 250,000 to 1 million population 
##                                                                            35 
##              Metro - Counties in metro areas of fewer than 250,000 population 
##                                                                            17 
##       Nonmetro - Urban population of 20,000 or more, adjacent to a metro area 
##                                                                            10 
##   Nonmetro - Urban population of 20,000 or more, not adjacent to a metro area 
##                                                                             6 
##      Nonmetro - Urban population of 5,000 to 20,000, adjacent to a metro area 
##                                                                            42 
##  Nonmetro - Urban population of 5,000 to 20,000, not adjacent to a metro area 
##                                                                            18 
##     Nonmetro - Urban population of fewer than 5,000, adjacent to a metro area 
##                                                                            46 
## Nonmetro - Urban population of fewer than 5,000, not adjacent to a metro area 
##                                                                            46

^^ 86 urban and 168 rural, all checks out!

PIVOTING DATA

names <- locations |>
  select(Location) |>
  pull()
# # Pivoting the data
# clean_names <- names(industry_data_2012) |>
#   str_replace_all(" County, Texas!!Female!!Estimate", "") |>
#     str_replace_all("\\.\\.", " ") |>
#     str_replace_all("!!Female!!Estimate", "") |>
#     str_replace_all("\\.County Texas Female Estimate", "") |>
#     str_replace_all("\\.", " ")
# 
# names(industry_data_2012) <- clean_names
# # Create a pattern to match county columns
# pattern <- paste0("^(", paste(names, collapse = "|"), ")")
# # Pivot the data
# indust_2012_long <- industry_data_2012 |>
#   pivot_longer(
#     cols = matches(pattern),
#     names_to = "County",
#     values_to = "Estimate"
#   )

ADDING RURAL/URBAN

# classification <- urban_rural |>
#   select(County_Name, Rural_Urban_Stat) |>
#   mutate(County_Name = str_replace_all(County_Name, " County", "")) 
# 
# classification <- classification|>
#   add_row(County_Name = "Texas Female Estimate
# ", Rural_Urban_Stat = "State")
# classification <- classification |>
#   rename(County = County_Name)
# final_2012_indust <- left_join(classification, indust_2012_long, by = "County")
#2012 dataset
#clean the names of the dataset
cleaned_names_2012 <- names(industry_data_2012) |>
  str_replace_all(" County, Texas!!Female!!Estimate", "") |>
    str_replace_all("\\.\\.", " ") |>
    str_replace_all("!!Female!!Estimate", "") |>
    str_replace_all("\\.County Texas Female Estimate", "") |>
    str_replace_all("\\.", " ")

    names(industry_data_2012) <- cleaned_names_2012
#2022 dataset
#clean the names of the dataset
cleaned_names_2022 <- names(industry_data_2022) |>
  str_replace_all(" County, Texas!!Female!!Percent!!Estimate", "") |>
    str_replace_all("\\.\\.", " ") |>
    str_replace_all("\\.County Texas Percent\\.Female Estimate", "") |>
    str_replace_all("\\.", " ") 

    names(industry_data_2022) <- cleaned_names_2022

    industry_data_2022 <- industry_data_2022 |>
      rename(`Texas Female Estimate` = `Texas Percent Female Estimate`)
#method to pivot dataset and add urban/rural classification
make_workable_dataset <- function(dataset, classification_data) {

  #create a pattern to match county columns
  pattern <- paste0("^(", paste(names, collapse = "|"), ")")
  
  # pivot the data
  dataset_long <- dataset |>
  pivot_longer(
    cols = matches(pattern),
    names_to = "County",
    values_to = "Estimate"
  ) |>
    slice(1:6912)
  
  #adding classification 
  classification <- classification_data |>
    select(County_Name, Rural_Urban_Stat) |>
    mutate(County_Name = str_replace_all(County_Name, " County", "")) 

  classification <- classification|>
    add_row(County_Name = "Texas Female Estimate", Rural_Urban_Stat = "State") |>
    rename(County = County_Name)
  
  final_indust_data <- left_join(classification, dataset_long, by = "County")
  
  final_indust_data <- final_indust_data |>
    rename(Label = `Label Grouping `) |>
    mutate(Label =  str_replace_all(Label, "����", ""))
   
  
  return(final_indust_data)
}
final_2012_indust <- make_workable_dataset(industry_data_2012, urban_rural)
final_2022_indust <- make_workable_dataset(industry_data_2022, urban_rural)
write.csv(file = "Industry_2012_ACS_5_YEAR.csv", final_2012_indust)
write.csv(file = "Industry_2022_ACS_5_YEAR.csv", final_2022_indust)