# Load necessary libraries
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(writexl)
# Define file paths
file_append <- "C:/Users/bryan/OneDrive/Desktop/Qcew_Report_midland_2002_21_append.xlsx"
file_5d <- "C:/Users/bryan/OneDrive/Desktop/Qcew_Report_2002_21_5d.xlsx"
# Read the Excel files
Qcew_Report_midland_2002_21_append <- read_excel(file_append)
Qcew_Report_2002_21_5d <- read_excel(file_5d)
# Convert column names to lower case and replace spaces with underscores for easier handling
names(Qcew_Report_midland_2002_21_append) <- tolower(gsub(" ", "_", names(Qcew_Report_midland_2002_21_append)))
names(Qcew_Report_2002_21_5d) <- tolower(gsub(" ", "_", names(Qcew_Report_2002_21_5d)))
# Rename columns to match the Stata code's naming conventions
Qcew_Report_midland_2002_21_append <- Qcew_Report_midland_2002_21_append %>%
rename(indcode = industry_code, employment = average_employment, county = area)
Qcew_Report_2002_21_5d <- Qcew_Report_2002_21_5d %>%
rename(indcode = industry_code, employment = average_employment, county = area)
# Function to clean and process the data
clean_data <- function(data) {
data <- data %>%
mutate(all = ifelse(ownership == "total all", 1, 0)) %>%
group_by(year, county, indcode) %>%
mutate(total_all = sum(all)) %>%
filter(total_all == 0) %>%
mutate(record = row_number()) %>%
ungroup()
return(data)
}
# Clean the append data
Qcew_Report_midland_2002_21_append <- clean_data(Qcew_Report_midland_2002_21_append)
Qcew_Report_2002_21_5d <- clean_data(Qcew_Report_2002_21_5d)
# Generate 2-digit NAICS codes
generate_2d_naics <- function(data) {
data <- data %>%
mutate(indcode_2d = case_when(
indcode >= 1100 & indcode < 1200 ~ "11",
indcode >= 2100 & indcode < 2200 ~ "21",
indcode >= 2200 & indcode < 2300 ~ "22",
indcode >= 2300 & indcode < 2400 ~ "23",
indcode >= 3100 & indcode < 3400 ~ "31-33",
indcode >= 4200 & indcode < 4300 ~ "42",
indcode >= 4400 & indcode < 4600 ~ "44-45",
indcode >= 4800 & indcode < 5000 ~ "48-49",
indcode >= 5100 & indcode < 5200 | indcode >= 51200 & indcode < 51300 ~ "51",
indcode >= 5200 & indcode < 5300 ~ "52",
indcode >= 5300 & indcode < 5400 ~ "53",
indcode >= 5400 & indcode < 5500 ~ "54",
indcode >= 5500 & indcode < 5600 ~ "55",
indcode >= 5600 & indcode < 5700 ~ "56",
indcode >= 6100 & indcode < 6200 ~ "61",
indcode >= 6200 & indcode < 6300 ~ "62",
indcode >= 7100 & indcode < 7200 ~ "71",
indcode >= 7200 & indcode < 7300 ~ "72",
indcode >= 8100 & indcode < 8200 ~ "81",
indcode >= 9200 & indcode < 9300 ~ "92",
indcode >= 9900 & indcode < 10000 ~ "99",
TRUE ~ NA_character_
))
}
# Apply the function to both datasets
Qcew_Report_midland_2002_21_append <- generate_2d_naics(Qcew_Report_midland_2002_21_append)
Qcew_Report_2002_21_5d <- generate_2d_naics(Qcew_Report_2002_21_5d)
# Verify that indcode_2d has been created
print(head(Qcew_Report_midland_2002_21_append))
## # A tibble: 6 × 21
## year period county area_number ownership indcode industry level
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2020 04 Midland 000329 Private 1113 Fruit and Tree Nut F… 4
## 2 2020 04 Midland 000329 Total All 1113 Fruit and Tree Nut F… 4
## 3 2002 04 Midland 000329 Private 1119 Other Crop Farming 4
## 4 2002 04 Midland 000329 Total All 1119 Other Crop Farming 4
## 5 2003 04 Midland 000329 Private 1119 Other Crop Farming 4
## 6 2003 04 Midland 000329 Total All 1119 Other Crop Farming 4
## # ℹ 13 more variables: establishments <dbl>, firms <dbl>, employment <dbl>,
## # average_weekly_wage <dbl>, total_wages <dbl>, month_1_employment <dbl>,
## # month_2_employment <dbl>, month_3_employment <dbl>,
## # hierarchical_ordering <dbl>, all <dbl>, total_all <dbl>, record <int>,
## # indcode_2d <chr>
print(head(Qcew_Report_2002_21_5d))
## # A tibble: 6 × 21
## year period county area_number ownership indcode industry level
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2005 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 2 2006 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 3 2007 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 4 2008 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 5 2009 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 6 2010 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## # ℹ 13 more variables: establishments <dbl>, firms <dbl>, employment <dbl>,
## # average_weekly_wage <dbl>, total_wages <dbl>, month_1_employment <dbl>,
## # month_2_employment <dbl>, month_3_employment <dbl>,
## # hierarchical_ordering <dbl>, all <dbl>, total_all <dbl>, record <int>,
## # indcode_2d <chr>
# Verify that `indcode_2d` exists
print("Columns in Qcew_Report_midland_2002_21_append before aggregation:")
## [1] "Columns in Qcew_Report_midland_2002_21_append before aggregation:"
print(colnames(Qcew_Report_midland_2002_21_append))
## [1] "year" "period" "county"
## [4] "area_number" "ownership" "indcode"
## [7] "industry" "level" "establishments"
## [10] "firms" "employment" "average_weekly_wage"
## [13] "total_wages" "month_1_employment" "month_2_employment"
## [16] "month_3_employment" "hierarchical_ordering" "all"
## [19] "total_all" "record" "indcode_2d"
print("Columns in Qcew_Report_2002_21_5d before aggregation:")
## [1] "Columns in Qcew_Report_2002_21_5d before aggregation:"
print(colnames(Qcew_Report_2002_21_5d))
## [1] "year" "period" "county"
## [4] "area_number" "ownership" "indcode"
## [7] "industry" "level" "establishments"
## [10] "firms" "employment" "average_weekly_wage"
## [13] "total_wages" "month_1_employment" "month_2_employment"
## [16] "month_3_employment" "hierarchical_ordering" "all"
## [19] "total_all" "record" "indcode_2d"
# Create the basic, retail, service, and education columns
create_employment_categories <- function(data) {
data <- data %>%
mutate(
basic = case_when(
indcode >= 1100 & indcode < 1200 ~ employment,
indcode >= 2100 & indcode < 2400 ~ employment,
indcode >= 3100 & indcode < 3400 ~ employment,
indcode >= 4200 & indcode < 4300 ~ employment,
indcode >= 4800 & indcode < 4910 ~ employment,
indcode >= 4920 & indcode < 5000 ~ employment,
indcode >= 5100 & indcode < 5160 ~ employment,
indcode >= 5174 & indcode <= 5175 ~ employment,
TRUE ~ 0
),
retail = case_when(
indcode >= 4400 & indcode < 4600 ~ employment,
indcode >= 4910 & indcode < 4920 ~ employment,
indcode >= 7100 & indcode < 7200 ~ employment,
indcode >= 7220 & indcode < 7230 ~ employment,
indcode == 51213 ~ employment,
TRUE ~ 0
),
service = case_when(
indcode >= 5160 & indcode < 5170 ~ employment,
indcode >= 5171 & indcode < 5174 ~ employment,
indcode == 5179 ~ employment,
indcode >= 5180 & indcode < 5200 ~ employment,
indcode >= 5200 & indcode < 5700 ~ employment,
indcode >= 6114 & indcode <= 6117 ~ employment,
indcode >= 6200 & indcode < 6300 ~ employment,
indcode >= 7210 & indcode < 7220 ~ employment,
indcode >= 8100 & indcode < 8200 ~ employment,
indcode >= 9200 & indcode < 9300 ~ employment,
indcode == 9999 ~ employment,
TRUE ~ 0
),
education = case_when(
indcode >= 6111 & indcode <= 6113 ~ employment,
TRUE ~ 0
)
)
return(data)
}
# Apply the function to both datasets
Qcew_Report_midland_2002_21_append <- create_employment_categories(Qcew_Report_midland_2002_21_append)
Qcew_Report_2002_21_5d <- create_employment_categories(Qcew_Report_2002_21_5d)
# Aggregating data
aggregate_data <- function(data) {
if (!"indcode_2d" %in% colnames(data)) {
stop("Column `indcode_2d` does not exist in the data frame.")
}
data <- data %>%
group_by(year, county, indcode, indcode_2d) %>%
summarize(across(c(establishments:employment, total_wages:month_3_employment, basic, retail, service, education), ~ sum(.x, na.rm = TRUE)), .groups = 'drop')
print("Columns after first aggregation:")
print(colnames(data))
data <- data %>%
group_by(year, county, indcode_2d) %>%
summarize(totalemp_2d = sum(employment, na.rm = TRUE),
tot_basic = sum(basic, na.rm = TRUE),
tot_retail = sum(retail, na.rm = TRUE),
tot_service = sum(service, na.rm = TRUE),
tot_education = sum(education, na.rm = TRUE),
pct_basic = tot_basic / totalemp_2d * 100,
pct_retail = tot_retail / totalemp_2d * 100,
pct_service = tot_service / totalemp_2d * 100,
pct_education = tot_education / totalemp_2d * 100,
.groups = 'drop')
return(data)
}
# Debugging statements to inspect data before aggregation
print("Before aggregation, Qcew_Report_midland_2002_21_append:")
## [1] "Before aggregation, Qcew_Report_midland_2002_21_append:"
print(head(Qcew_Report_midland_2002_21_append))
## # A tibble: 6 × 25
## year period county area_number ownership indcode industry level
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2020 04 Midland 000329 Private 1113 Fruit and Tree Nut F… 4
## 2 2020 04 Midland 000329 Total All 1113 Fruit and Tree Nut F… 4
## 3 2002 04 Midland 000329 Private 1119 Other Crop Farming 4
## 4 2002 04 Midland 000329 Total All 1119 Other Crop Farming 4
## 5 2003 04 Midland 000329 Private 1119 Other Crop Farming 4
## 6 2003 04 Midland 000329 Total All 1119 Other Crop Farming 4
## # ℹ 17 more variables: establishments <dbl>, firms <dbl>, employment <dbl>,
## # average_weekly_wage <dbl>, total_wages <dbl>, month_1_employment <dbl>,
## # month_2_employment <dbl>, month_3_employment <dbl>,
## # hierarchical_ordering <dbl>, all <dbl>, total_all <dbl>, record <int>,
## # indcode_2d <chr>, basic <dbl>, retail <dbl>, service <dbl>, education <dbl>
print("Before aggregation, Qcew_Report_2002_21_5d:")
## [1] "Before aggregation, Qcew_Report_2002_21_5d:"
print(head(Qcew_Report_2002_21_5d))
## # A tibble: 6 × 25
## year period county area_number ownership indcode industry level
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2005 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 2 2006 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 3 2007 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 4 2008 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 5 2009 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## 6 2010 04 Midland 000329 Total All 51213 Motion Picture and V… 5
## # ℹ 17 more variables: establishments <dbl>, firms <dbl>, employment <dbl>,
## # average_weekly_wage <dbl>, total_wages <dbl>, month_1_employment <dbl>,
## # month_2_employment <dbl>, month_3_employment <dbl>,
## # hierarchical_ordering <dbl>, all <dbl>, total_all <dbl>, record <int>,
## # indcode_2d <chr>, basic <dbl>, retail <dbl>, service <dbl>, education <dbl>
# Apply the aggregation
Qcew_Report_midland_2002_21_append <- aggregate_data(Qcew_Report_midland_2002_21_append)
## [1] "Columns after first aggregation:"
## [1] "year" "county" "indcode"
## [4] "indcode_2d" "establishments" "firms"
## [7] "employment" "total_wages" "month_1_employment"
## [10] "month_2_employment" "month_3_employment" "basic"
## [13] "retail" "service" "education"
Qcew_Report_2002_21_5d <- aggregate_data(Qcew_Report_2002_21_5d)
## [1] "Columns after first aggregation:"
## [1] "year" "county" "indcode"
## [4] "indcode_2d" "establishments" "firms"
## [7] "employment" "total_wages" "month_1_employment"
## [10] "month_2_employment" "month_3_employment" "basic"
## [13] "retail" "service" "education"
# Debugging statements to inspect data after aggregation
print("After aggregation, Qcew_Report_midland_2002_21_append:")
## [1] "After aggregation, Qcew_Report_midland_2002_21_append:"
print(head(Qcew_Report_midland_2002_21_append))
## # A tibble: 6 × 12
## year county indcode_2d totalemp_2d tot_basic tot_retail tot_service
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2002 Midland 11 342 342 0 0
## 2 2002 Midland 21 14996 14996 0 0
## 3 2002 Midland 22 176 176 0 0
## 4 2002 Midland 23 5290 5290 0 0
## 5 2002 Midland 31-33 2604 2604 0 0
## 6 2002 Midland 42 4378 4378 0 0
## # ℹ 5 more variables: tot_education <dbl>, pct_basic <dbl>, pct_retail <dbl>,
## # pct_service <dbl>, pct_education <dbl>
print("After aggregation, Qcew_Report_2002_21_5d:")
## [1] "After aggregation, Qcew_Report_2002_21_5d:"
print(head(Qcew_Report_2002_21_5d))
## # A tibble: 6 × 12
## year county indcode_2d totalemp_2d tot_basic tot_retail tot_service
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2005 Midland 51 104 104 104 0
## 2 2006 Midland 51 85 85 85 0
## 3 2007 Midland 51 90 90 90 0
## 4 2008 Midland 51 74 74 74 0
## 5 2009 Midland 51 84 84 84 0
## 6 2010 Midland 51 96 96 96 0
## # ℹ 5 more variables: tot_education <dbl>, pct_basic <dbl>, pct_retail <dbl>,
## # pct_service <dbl>, pct_education <dbl>
# Save to Excel
write_xlsx(Qcew_Report_midland_2002_21_append, "C:/Users/bryan/OneDrive/Desktop/Processed_Qcew_Report_midland_2002_21_append.xlsx")
write_xlsx(Qcew_Report_2002_21_5d, "C:/Users/bryan/OneDrive/Desktop/Processed_Qcew_Report_2002_21_5d.xlsx")
# Inspect column names and data types
str(Qcew_Report_midland_2002_21_append)
## tibble [418 × 12] (S3: tbl_df/tbl/data.frame)
## $ year : chr [1:418] "2002" "2002" "2002" "2002" ...
## $ county : chr [1:418] "Midland" "Midland" "Midland" "Midland" ...
## $ indcode_2d : chr [1:418] "11" "21" "22" "23" ...
## $ totalemp_2d : num [1:418] 342 14996 176 5290 2604 ...
## $ tot_basic : num [1:418] 342 14996 176 5290 2604 ...
## $ tot_retail : num [1:418] 0 0 0 0 0 ...
## $ tot_service : num [1:418] 0 0 0 0 0 0 0 0 863 3740 ...
## $ tot_education: num [1:418] 0 0 0 0 0 0 0 0 0 0 ...
## $ pct_basic : num [1:418] 100 100 100 100 100 ...
## $ pct_retail : num [1:418] 0 0 0 0 0 0 100 0 0 0 ...
## $ pct_service : num [1:418] 0 0 0 0 0 ...
## $ pct_education: num [1:418] 0 0 0 0 0 0 0 0 0 0 ...
str(Qcew_Report_2002_21_5d)
## tibble [14 × 12] (S3: tbl_df/tbl/data.frame)
## $ year : chr [1:14] "2005" "2006" "2007" "2008" ...
## $ county : chr [1:14] "Midland" "Midland" "Midland" "Midland" ...
## $ indcode_2d : chr [1:14] "51" "51" "51" "51" ...
## $ totalemp_2d : num [1:14] 104 85 90 74 84 96 90 97 90 199 ...
## $ tot_basic : num [1:14] 104 85 90 74 84 96 90 97 90 199 ...
## $ tot_retail : num [1:14] 104 85 90 74 84 96 90 97 90 199 ...
## $ tot_service : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
## $ tot_education: num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
## $ pct_basic : num [1:14] 100 100 100 100 100 100 100 100 100 100 ...
## $ pct_retail : num [1:14] 100 100 100 100 100 100 100 100 100 100 ...
## $ pct_service : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
## $ pct_education: num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
# Print a summary of the data
summary(Qcew_Report_midland_2002_21_append)
## year county indcode_2d totalemp_2d
## Length:418 Length:418 Length:418 Min. : 30
## Class :character Class :character Class :character 1st Qu.: 1932
## Mode :character Mode :character Mode :character Median : 4528
## Mean : 6543
## 3rd Qu.: 8661
## Max. :62572
## tot_basic tot_retail tot_service tot_education
## Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0.0
## Median : 0 Median : 0 Median : 229 Median : 0.0
## Mean : 2987 Mean : 1354 Mean : 2059 Mean : 143.2
## 3rd Qu.: 1098 3rd Qu.: 0 3rd Qu.: 3650 3rd Qu.: 0.0
## Max. :62572 Max. :20088 Max. :11103 Max. :5023.0
## pct_basic pct_retail pct_service pct_education
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 11.85 Median : 0.000
## Mean : 35.81 Mean : 13.87 Mean : 46.08 Mean : 4.249
## 3rd Qu.:100.00 3rd Qu.: 0.00 3rd Qu.:100.00 3rd Qu.: 0.000
## Max. :100.00 Max. :100.00 Max. :100.00 Max. :96.090
summary(Qcew_Report_2002_21_5d)
## year county indcode_2d totalemp_2d
## Length:14 Length:14 Length:14 Min. : 74.0
## Class :character Class :character Class :character 1st Qu.: 90.0
## Mode :character Mode :character Mode :character Median : 96.5
## Mean :124.7
## 3rd Qu.:165.0
## Max. :206.0
## tot_basic tot_retail tot_service tot_education pct_basic
## Min. : 74.0 Min. : 74.0 Min. :0 Min. :0 Min. :100
## 1st Qu.: 90.0 1st Qu.: 90.0 1st Qu.:0 1st Qu.:0 1st Qu.:100
## Median : 96.5 Median : 96.5 Median :0 Median :0 Median :100
## Mean :124.7 Mean :124.7 Mean :0 Mean :0 Mean :100
## 3rd Qu.:165.0 3rd Qu.:165.0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:100
## Max. :206.0 Max. :206.0 Max. :0 Max. :0 Max. :100
## pct_retail pct_service pct_education
## Min. :100 Min. :0 Min. :0
## 1st Qu.:100 1st Qu.:0 1st Qu.:0
## Median :100 Median :0 Median :0
## Mean :100 Mean :0 Mean :0
## 3rd Qu.:100 3rd Qu.:0 3rd Qu.:0
## Max. :100 Max. :0 Max. :0