22 raw excel data were downloaded and iterated through using a “for loop” funnction with monthly Employment values extracted.
The data was converted from a long format to a wide format.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readxl)
# Set working directory for to the folders containing all quarterly data
#setwd("/Users/owad/Downloads/All Data")
# Get the list of files in the working directory
files <- list.files(pattern = "\\.xlsx$")
# Create a dataframe to append each transpose data for each quarterly raw data
main_df <- data.frame()
Each of the raw quarterly data was iterated through to extract required values and transpose to a time-series panel data.
# Iterate through each file
for (file in files) {
df <- read_excel(file)
# Changing the column names format
colnames(df) <- gsub(" ", "_", colnames(df))
# Removing leading numbers from observation starting with numbers
df$Industry <- ifelse(grepl("^\\d", df$Industry),
gsub("^\\d+", "", df$Industry), df$Industry)
# Remove whitespace at the beginning of the new industry observation
df$Industry <- trimws(df$Industry, "left")
# Reshaping data to a time series panel data
df1 <- df %>%
filter(Area_Type == "MSA") %>%
mutate(AreaName = Area) %>%
separate(Area, into = c("Area", "State"), sep = ",") %>%
mutate(State = substr(trimws(State), 1, 2),
Own_indust = paste0(Ownership, " ", Industry))
df_long <- pivot_longer(df1, cols = ends_with("Employment"),
names_to = "Month", values_to = "Employment") %>%
separate(Month, into = c("Month", "Stat"), sep = "_") %>%
mutate(Month = match(Month, month.name),
time = paste0(Year, "-", Month)) %>%
select(Year, Month, time, State, AreaName, Own_indust, Employment) %>%
pivot_wider(names_from = Own_indust,
values_from = Employment)
# Appending each transpose data for each quarter to one dataframe
main_df <- bind_rows(main_df, df_long)
}
head(main_df)
## Year Month time State AreaName Total Covered Total, all industries
## 1 2000 1 2000-1 TX Abilene, TX MSA 59968
## 2 2000 2 2000-2 TX Abilene, TX MSA 60275
## 3 2000 3 2000-3 TX Abilene, TX MSA 60837
## 4 2000 1 2000-1 OH Akron, OH MSA 311730
## 5 2000 2 2000-2 OH Akron, OH MSA 312147
## 6 2000 3 2000-3 OH Akron, OH MSA 315534
## Federal Government Total, all industries Private Total, all industries
## 1 1618 46662
## 2 1625 46817
## 3 1635 47258
## 4 2924 270180
## 5 2944 269622
## 6 2987 272837
## Private Goods-producing Private Natural resources and mining
## 1 7503 1387
## 2 7466 1371
## 3 7607 1537
## 4 74736 765
## 5 74705 771
## 6 75356 823
## Private Construction Private Manufacturing Private Service-providing
## 1 2526 3590 39159
## 2 2541 3554 39351
## 3 2537 3533 39651
## 4 12185 61786 195444
## 5 12185 61749 194917
## 6 12937 61596 197481
## Private Trade, transportation, and utilities Private Financial activities
## 1 12313 2893
## 2 12244 2892
## 3 12157 2928
## 4 65153 13324
## 5 64526 13342
## 6 64846 13539
## Private Professional and business services
## 1 3971
## 2 4145
## 3 4225
## 4 35999
## 5 35779
## 6 36672
## Private Education and health services Private Leisure and hospitality
## 1 10622 5974
## 2 10628 6012
## 3 10731 6168
## 4 37280 28249
## 5 37441 28246
## 6 37789 28940
## State Government Total, all industries Local Government Total, all industries
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 7613 31013
## 5 8344 31237
## 6 8409 31301
## Private Information Private Other services Private Unclassified
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 4854 10436 NA
## 5 4907 10502 NA
## 6 4919 10592 NA