# Load necessary libraries
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(writexl)

# Define file paths
file_append <- "C:/Users/bryan/OneDrive/Desktop/Qcew_Report_midland_2002_21_append.xlsx"
file_5d <- "C:/Users/bryan/OneDrive/Desktop/Qcew_Report_2002_21_5d.xlsx"

# Read the Excel files
Qcew_Report_midland_2002_21_append <- read_excel(file_append)
Qcew_Report_2002_21_5d <- read_excel(file_5d)

# Convert column names to lower case and replace spaces with underscores for easier handling
names(Qcew_Report_midland_2002_21_append) <- tolower(gsub(" ", "_", names(Qcew_Report_midland_2002_21_append)))
names(Qcew_Report_2002_21_5d) <- tolower(gsub(" ", "_", names(Qcew_Report_2002_21_5d)))

# Rename columns to match the Stata code's naming conventions
Qcew_Report_midland_2002_21_append <- Qcew_Report_midland_2002_21_append %>%
  rename(indcode = industry_code, employment = average_employment, county = area)

Qcew_Report_2002_21_5d <- Qcew_Report_2002_21_5d %>%
  rename(indcode = industry_code, employment = average_employment, county = area)
# Function to clean and process the data
clean_data <- function(data) {
  data <- data %>%
    mutate(all = ifelse(ownership == "total all", 1, 0)) %>%
    group_by(year, county, indcode) %>%
    mutate(total_all = sum(all)) %>%
    filter(total_all == 0) %>%
    mutate(record = row_number()) %>%
    ungroup()
  return(data)
}

# Clean the append data
Qcew_Report_midland_2002_21_append <- clean_data(Qcew_Report_midland_2002_21_append)
Qcew_Report_2002_21_5d <- clean_data(Qcew_Report_2002_21_5d)
# Generate 2-digit NAICS codes
generate_2d_naics <- function(data) {
  data <- data %>%
    mutate(indcode_2d = case_when(
      indcode >= 1100 & indcode < 1200 ~ "11",
      indcode >= 2100 & indcode < 2200 ~ "21",
      indcode >= 2200 & indcode < 2300 ~ "22",
      indcode >= 2300 & indcode < 2400 ~ "23",
      indcode >= 3100 & indcode < 3400 ~ "31-33",
      indcode >= 4200 & indcode < 4300 ~ "42",
      indcode >= 4400 & indcode < 4600 ~ "44-45",
      indcode >= 4800 & indcode < 5000 ~ "48-49",
      indcode >= 5100 & indcode < 5200 | indcode >= 51200 & indcode < 51300 ~ "51",
      indcode >= 5200 & indcode < 5300 ~ "52",
      indcode >= 5300 & indcode < 5400 ~ "53",
      indcode >= 5400 & indcode < 5500 ~ "54",
      indcode >= 5500 & indcode < 5600 ~ "55",
      indcode >= 5600 & indcode < 5700 ~ "56",
      indcode >= 6100 & indcode < 6200 ~ "61",
      indcode >= 6200 & indcode < 6300 ~ "62",
      indcode >= 7100 & indcode < 7200 ~ "71",
      indcode >= 7200 & indcode < 7300 ~ "72",
      indcode >= 8100 & indcode < 8200 ~ "81",
      indcode >= 9200 & indcode < 9300 ~ "92",
      indcode >= 9900 & indcode < 10000 ~ "99",
      TRUE ~ NA_character_
    ))
}

# Apply the function to both datasets
Qcew_Report_midland_2002_21_append <- generate_2d_naics(Qcew_Report_midland_2002_21_append)
Qcew_Report_2002_21_5d <- generate_2d_naics(Qcew_Report_2002_21_5d)

# Verify that indcode_2d has been created
print(head(Qcew_Report_midland_2002_21_append))
## # A tibble: 6 × 21
##   year  period county  area_number ownership indcode industry              level
##   <chr> <chr>  <chr>   <chr>       <chr>     <chr>   <chr>                 <chr>
## 1 2020  04     Midland 000329      Private   1113    Fruit and Tree Nut F… 4    
## 2 2020  04     Midland 000329      Total All 1113    Fruit and Tree Nut F… 4    
## 3 2002  04     Midland 000329      Private   1119    Other Crop Farming    4    
## 4 2002  04     Midland 000329      Total All 1119    Other Crop Farming    4    
## 5 2003  04     Midland 000329      Private   1119    Other Crop Farming    4    
## 6 2003  04     Midland 000329      Total All 1119    Other Crop Farming    4    
## # ℹ 13 more variables: establishments <dbl>, firms <dbl>, employment <dbl>,
## #   average_weekly_wage <dbl>, total_wages <dbl>, month_1_employment <dbl>,
## #   month_2_employment <dbl>, month_3_employment <dbl>,
## #   hierarchical_ordering <dbl>, all <dbl>, total_all <dbl>, record <int>,
## #   indcode_2d <chr>
print(head(Qcew_Report_2002_21_5d))
## # A tibble: 6 × 21
##   year  period county  area_number ownership indcode industry              level
##   <chr> <chr>  <chr>   <chr>       <chr>     <chr>   <chr>                 <chr>
## 1 2005  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 2 2006  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 3 2007  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 4 2008  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 5 2009  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 6 2010  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## # ℹ 13 more variables: establishments <dbl>, firms <dbl>, employment <dbl>,
## #   average_weekly_wage <dbl>, total_wages <dbl>, month_1_employment <dbl>,
## #   month_2_employment <dbl>, month_3_employment <dbl>,
## #   hierarchical_ordering <dbl>, all <dbl>, total_all <dbl>, record <int>,
## #   indcode_2d <chr>
# Verify that `indcode_2d` exists
print("Columns in Qcew_Report_midland_2002_21_append before aggregation:")
## [1] "Columns in Qcew_Report_midland_2002_21_append before aggregation:"
print(colnames(Qcew_Report_midland_2002_21_append))
##  [1] "year"                  "period"                "county"               
##  [4] "area_number"           "ownership"             "indcode"              
##  [7] "industry"              "level"                 "establishments"       
## [10] "firms"                 "employment"            "average_weekly_wage"  
## [13] "total_wages"           "month_1_employment"    "month_2_employment"   
## [16] "month_3_employment"    "hierarchical_ordering" "all"                  
## [19] "total_all"             "record"                "indcode_2d"
print("Columns in Qcew_Report_2002_21_5d before aggregation:")
## [1] "Columns in Qcew_Report_2002_21_5d before aggregation:"
print(colnames(Qcew_Report_2002_21_5d))
##  [1] "year"                  "period"                "county"               
##  [4] "area_number"           "ownership"             "indcode"              
##  [7] "industry"              "level"                 "establishments"       
## [10] "firms"                 "employment"            "average_weekly_wage"  
## [13] "total_wages"           "month_1_employment"    "month_2_employment"   
## [16] "month_3_employment"    "hierarchical_ordering" "all"                  
## [19] "total_all"             "record"                "indcode_2d"
# Create the basic, retail, service, and education columns
create_employment_categories <- function(data) {
  data <- data %>%
    mutate(
      basic = case_when(
        indcode >= 1100 & indcode < 1200 ~ employment,
        indcode >= 2100 & indcode < 2400 ~ employment,
        indcode >= 3100 & indcode < 3400 ~ employment,
        indcode >= 4200 & indcode < 4300 ~ employment,
        indcode >= 4800 & indcode < 4910 ~ employment,
        indcode >= 4920 & indcode < 5000 ~ employment,
        indcode >= 5100 & indcode < 5160 ~ employment,
        indcode >= 5174 & indcode <= 5175 ~ employment,
        TRUE ~ 0
      ),
      retail = case_when(
        indcode >= 4400 & indcode < 4600 ~ employment,
        indcode >= 4910 & indcode < 4920 ~ employment,
        indcode >= 7100 & indcode < 7200 ~ employment,
        indcode >= 7220 & indcode < 7230 ~ employment,
        indcode == 51213 ~ employment,
        TRUE ~ 0
      ),
      service = case_when(
        indcode >= 5160 & indcode < 5170 ~ employment,
        indcode >= 5171 & indcode < 5174 ~ employment,
        indcode == 5179 ~ employment,
        indcode >= 5180 & indcode < 5200 ~ employment,
        indcode >= 5200 & indcode < 5700 ~ employment,
        indcode >= 6114 & indcode <= 6117 ~ employment,
        indcode >= 6200 & indcode < 6300 ~ employment,
        indcode >= 7210 & indcode < 7220 ~ employment,
        indcode >= 8100 & indcode < 8200 ~ employment,
        indcode >= 9200 & indcode < 9300 ~ employment,
        indcode == 9999 ~ employment,
        TRUE ~ 0
      ),
      education = case_when(
        indcode >= 6111 & indcode <= 6113 ~ employment,
        TRUE ~ 0
      )
    )
  return(data)
}

# Apply the function to both datasets
Qcew_Report_midland_2002_21_append <- create_employment_categories(Qcew_Report_midland_2002_21_append)
Qcew_Report_2002_21_5d <- create_employment_categories(Qcew_Report_2002_21_5d)
# Aggregating data
aggregate_data <- function(data) {
  if (!"indcode_2d" %in% colnames(data)) {
    stop("Column `indcode_2d` does not exist in the data frame.")
  }
  
  data <- data %>%
    group_by(year, county, indcode, indcode_2d) %>%
    summarize(across(c(establishments:employment, total_wages:month_3_employment, basic, retail, service, education), ~ sum(.x, na.rm = TRUE)), .groups = 'drop')
  
  print("Columns after first aggregation:")
  print(colnames(data))
  
  data <- data %>%
    group_by(year, county, indcode_2d) %>%
    summarize(totalemp_2d = sum(employment, na.rm = TRUE), 
              tot_basic = sum(basic, na.rm = TRUE), 
              tot_retail = sum(retail, na.rm = TRUE), 
              tot_service = sum(service, na.rm = TRUE), 
              tot_education = sum(education, na.rm = TRUE),
              pct_basic = tot_basic / totalemp_2d * 100,
              pct_retail = tot_retail / totalemp_2d * 100,
              pct_service = tot_service / totalemp_2d * 100,
              pct_education = tot_education / totalemp_2d * 100,
              .groups = 'drop')
  
  return(data)
}

# Debugging statements to inspect data before aggregation
print("Before aggregation, Qcew_Report_midland_2002_21_append:")
## [1] "Before aggregation, Qcew_Report_midland_2002_21_append:"
print(head(Qcew_Report_midland_2002_21_append))
## # A tibble: 6 × 25
##   year  period county  area_number ownership indcode industry              level
##   <chr> <chr>  <chr>   <chr>       <chr>     <chr>   <chr>                 <chr>
## 1 2020  04     Midland 000329      Private   1113    Fruit and Tree Nut F… 4    
## 2 2020  04     Midland 000329      Total All 1113    Fruit and Tree Nut F… 4    
## 3 2002  04     Midland 000329      Private   1119    Other Crop Farming    4    
## 4 2002  04     Midland 000329      Total All 1119    Other Crop Farming    4    
## 5 2003  04     Midland 000329      Private   1119    Other Crop Farming    4    
## 6 2003  04     Midland 000329      Total All 1119    Other Crop Farming    4    
## # ℹ 17 more variables: establishments <dbl>, firms <dbl>, employment <dbl>,
## #   average_weekly_wage <dbl>, total_wages <dbl>, month_1_employment <dbl>,
## #   month_2_employment <dbl>, month_3_employment <dbl>,
## #   hierarchical_ordering <dbl>, all <dbl>, total_all <dbl>, record <int>,
## #   indcode_2d <chr>, basic <dbl>, retail <dbl>, service <dbl>, education <dbl>
print("Before aggregation, Qcew_Report_2002_21_5d:")
## [1] "Before aggregation, Qcew_Report_2002_21_5d:"
print(head(Qcew_Report_2002_21_5d))
## # A tibble: 6 × 25
##   year  period county  area_number ownership indcode industry              level
##   <chr> <chr>  <chr>   <chr>       <chr>     <chr>   <chr>                 <chr>
## 1 2005  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 2 2006  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 3 2007  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 4 2008  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 5 2009  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## 6 2010  04     Midland 000329      Total All 51213   Motion Picture and V… 5    
## # ℹ 17 more variables: establishments <dbl>, firms <dbl>, employment <dbl>,
## #   average_weekly_wage <dbl>, total_wages <dbl>, month_1_employment <dbl>,
## #   month_2_employment <dbl>, month_3_employment <dbl>,
## #   hierarchical_ordering <dbl>, all <dbl>, total_all <dbl>, record <int>,
## #   indcode_2d <chr>, basic <dbl>, retail <dbl>, service <dbl>, education <dbl>
# Apply the aggregation
Qcew_Report_midland_2002_21_append <- aggregate_data(Qcew_Report_midland_2002_21_append)
## [1] "Columns after first aggregation:"
##  [1] "year"               "county"             "indcode"           
##  [4] "indcode_2d"         "establishments"     "firms"             
##  [7] "employment"         "total_wages"        "month_1_employment"
## [10] "month_2_employment" "month_3_employment" "basic"             
## [13] "retail"             "service"            "education"
Qcew_Report_2002_21_5d <- aggregate_data(Qcew_Report_2002_21_5d)
## [1] "Columns after first aggregation:"
##  [1] "year"               "county"             "indcode"           
##  [4] "indcode_2d"         "establishments"     "firms"             
##  [7] "employment"         "total_wages"        "month_1_employment"
## [10] "month_2_employment" "month_3_employment" "basic"             
## [13] "retail"             "service"            "education"
# Debugging statements to inspect data after aggregation
print("After aggregation, Qcew_Report_midland_2002_21_append:")
## [1] "After aggregation, Qcew_Report_midland_2002_21_append:"
print(head(Qcew_Report_midland_2002_21_append))
## # A tibble: 6 × 12
##   year  county  indcode_2d totalemp_2d tot_basic tot_retail tot_service
##   <chr> <chr>   <chr>            <dbl>     <dbl>      <dbl>       <dbl>
## 1 2002  Midland 11                 342       342          0           0
## 2 2002  Midland 21               14996     14996          0           0
## 3 2002  Midland 22                 176       176          0           0
## 4 2002  Midland 23                5290      5290          0           0
## 5 2002  Midland 31-33             2604      2604          0           0
## 6 2002  Midland 42                4378      4378          0           0
## # ℹ 5 more variables: tot_education <dbl>, pct_basic <dbl>, pct_retail <dbl>,
## #   pct_service <dbl>, pct_education <dbl>
print("After aggregation, Qcew_Report_2002_21_5d:")
## [1] "After aggregation, Qcew_Report_2002_21_5d:"
print(head(Qcew_Report_2002_21_5d))
## # A tibble: 6 × 12
##   year  county  indcode_2d totalemp_2d tot_basic tot_retail tot_service
##   <chr> <chr>   <chr>            <dbl>     <dbl>      <dbl>       <dbl>
## 1 2005  Midland 51                 104       104        104           0
## 2 2006  Midland 51                  85        85         85           0
## 3 2007  Midland 51                  90        90         90           0
## 4 2008  Midland 51                  74        74         74           0
## 5 2009  Midland 51                  84        84         84           0
## 6 2010  Midland 51                  96        96         96           0
## # ℹ 5 more variables: tot_education <dbl>, pct_basic <dbl>, pct_retail <dbl>,
## #   pct_service <dbl>, pct_education <dbl>
# Save to Excel
write_xlsx(Qcew_Report_midland_2002_21_append, "C:/Users/bryan/OneDrive/Desktop/Processed_Qcew_Report_midland_2002_21_append.xlsx")
write_xlsx(Qcew_Report_2002_21_5d, "C:/Users/bryan/OneDrive/Desktop/Processed_Qcew_Report_2002_21_5d.xlsx")
# Inspect column names and data types
str(Qcew_Report_midland_2002_21_append)
## tibble [418 × 12] (S3: tbl_df/tbl/data.frame)
##  $ year         : chr [1:418] "2002" "2002" "2002" "2002" ...
##  $ county       : chr [1:418] "Midland" "Midland" "Midland" "Midland" ...
##  $ indcode_2d   : chr [1:418] "11" "21" "22" "23" ...
##  $ totalemp_2d  : num [1:418] 342 14996 176 5290 2604 ...
##  $ tot_basic    : num [1:418] 342 14996 176 5290 2604 ...
##  $ tot_retail   : num [1:418] 0 0 0 0 0 ...
##  $ tot_service  : num [1:418] 0 0 0 0 0 0 0 0 863 3740 ...
##  $ tot_education: num [1:418] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pct_basic    : num [1:418] 100 100 100 100 100 ...
##  $ pct_retail   : num [1:418] 0 0 0 0 0 0 100 0 0 0 ...
##  $ pct_service  : num [1:418] 0 0 0 0 0 ...
##  $ pct_education: num [1:418] 0 0 0 0 0 0 0 0 0 0 ...
str(Qcew_Report_2002_21_5d)
## tibble [14 × 12] (S3: tbl_df/tbl/data.frame)
##  $ year         : chr [1:14] "2005" "2006" "2007" "2008" ...
##  $ county       : chr [1:14] "Midland" "Midland" "Midland" "Midland" ...
##  $ indcode_2d   : chr [1:14] "51" "51" "51" "51" ...
##  $ totalemp_2d  : num [1:14] 104 85 90 74 84 96 90 97 90 199 ...
##  $ tot_basic    : num [1:14] 104 85 90 74 84 96 90 97 90 199 ...
##  $ tot_retail   : num [1:14] 104 85 90 74 84 96 90 97 90 199 ...
##  $ tot_service  : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tot_education: num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pct_basic    : num [1:14] 100 100 100 100 100 100 100 100 100 100 ...
##  $ pct_retail   : num [1:14] 100 100 100 100 100 100 100 100 100 100 ...
##  $ pct_service  : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pct_education: num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
# Print a summary of the data
summary(Qcew_Report_midland_2002_21_append)
##      year              county           indcode_2d         totalemp_2d   
##  Length:418         Length:418         Length:418         Min.   :   30  
##  Class :character   Class :character   Class :character   1st Qu.: 1932  
##  Mode  :character   Mode  :character   Mode  :character   Median : 4528  
##                                                           Mean   : 6543  
##                                                           3rd Qu.: 8661  
##                                                           Max.   :62572  
##    tot_basic       tot_retail     tot_service    tot_education   
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :   0.0  
##  1st Qu.:    0   1st Qu.:    0   1st Qu.:    0   1st Qu.:   0.0  
##  Median :    0   Median :    0   Median :  229   Median :   0.0  
##  Mean   : 2987   Mean   : 1354   Mean   : 2059   Mean   : 143.2  
##  3rd Qu.: 1098   3rd Qu.:    0   3rd Qu.: 3650   3rd Qu.:   0.0  
##  Max.   :62572   Max.   :20088   Max.   :11103   Max.   :5023.0  
##    pct_basic        pct_retail      pct_service     pct_education   
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   : 0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.: 0.000  
##  Median :  0.00   Median :  0.00   Median : 11.85   Median : 0.000  
##  Mean   : 35.81   Mean   : 13.87   Mean   : 46.08   Mean   : 4.249  
##  3rd Qu.:100.00   3rd Qu.:  0.00   3rd Qu.:100.00   3rd Qu.: 0.000  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :96.090
summary(Qcew_Report_2002_21_5d)
##      year              county           indcode_2d         totalemp_2d   
##  Length:14          Length:14          Length:14          Min.   : 74.0  
##  Class :character   Class :character   Class :character   1st Qu.: 90.0  
##  Mode  :character   Mode  :character   Mode  :character   Median : 96.5  
##                                                           Mean   :124.7  
##                                                           3rd Qu.:165.0  
##                                                           Max.   :206.0  
##    tot_basic       tot_retail     tot_service tot_education   pct_basic  
##  Min.   : 74.0   Min.   : 74.0   Min.   :0    Min.   :0     Min.   :100  
##  1st Qu.: 90.0   1st Qu.: 90.0   1st Qu.:0    1st Qu.:0     1st Qu.:100  
##  Median : 96.5   Median : 96.5   Median :0    Median :0     Median :100  
##  Mean   :124.7   Mean   :124.7   Mean   :0    Mean   :0     Mean   :100  
##  3rd Qu.:165.0   3rd Qu.:165.0   3rd Qu.:0    3rd Qu.:0     3rd Qu.:100  
##  Max.   :206.0   Max.   :206.0   Max.   :0    Max.   :0     Max.   :100  
##    pct_retail   pct_service pct_education
##  Min.   :100   Min.   :0    Min.   :0    
##  1st Qu.:100   1st Qu.:0    1st Qu.:0    
##  Median :100   Median :0    Median :0    
##  Mean   :100   Mean   :0    Mean   :0    
##  3rd Qu.:100   3rd Qu.:0    3rd Qu.:0    
##  Max.   :100   Max.   :0    Max.   :0