#library(googlesheets4)
library(readxl)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# get access to google drive
#gs4_auth()
# load the crosswalk sheet
Crosswalk99 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/Additional_Crosswalks_V3_forRDC.xlsx",
sheet = "99 Crosswalk for ONET"
)
Crosswalk0 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/Additional_Crosswalks_V3_forRDC.xlsx",
sheet = "0 Crosswalk for ONET", col_names = TRUE, skip = 1
)
ACScodes <- read_excel("/Users/linxinyu/Desktop/Crosswalk/occsoc_crosswalk_2000_onward.xlsx",
sheet = "Codes without Descriptions")
## New names:
## • `` -> `...1`
head(Crosswalk99)
head(Crosswalk0)
# Fill down
Crosswalk99_filled <- Crosswalk99 %>%
fill(`...to create this`, .direction = "down")
Crosswalk0_filled <- Crosswalk0 %>%
fill(`...to create this`, .direction = "down")
# filter the data so that we can have the elements we want
filter_work <- function(data, element_ids) {
data %>%
dplyr::filter(`Element ID` %in% element_ids)
}
values_of_interest <- c("1.B.2.a", "1.B.2.b", "1.B.2.c", "1.B.2.d", "1.B.2.e", "1.B.2.f")
styles_of_interest <- c("1.C.4.b")
context_of_interest <- c("4.C.2.a.1.b", "4.C.2.a.1.c", "4.C.2.b.1.a", "4.C.2.b.1.b", "4.C.2.b.1.c", "4.C.2.b.1.d", "4.C.2.b.1.e", "4.C.2.b.1.f", "4.C.2.c.1.d", "4.C.2.c.1.e", "4.C.3.a.1", "4.C.3.a.2.a", "4.C.3.a.2.b", "4.C.3.a.4", "4.C.3.b.8", "4.C.3.d.1", "4.C.3.d.3", "4.C.3.d.4", "4.C.3.d.8")
activities_of_interest <- c("4.A.3.a.1", "4.A.3.a.2")
work_styles_2018 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/db_23_1_excel/Work Styles.xlsx")
work_values_2018 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/db_23_1_excel/Work Values.xlsx")
work_context_2018 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/db_23_1_excel/Work Context.xlsx")
work_activities_2018 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/db_23_1_excel/Work Activities.xlsx")
# Create a mapping table
mapping_table_99 <- Crosswalk99_filled %>%
# Select and rename columns
select(
join_key = `Average these…`,
ASCCODE = `...to create this`
) %>%
filter(!is.na(ASCCODE))%>%
mutate(join_key = paste0(join_key, ".00"))
mapping_table_0 <- Crosswalk0_filled %>%
select(
join_key = `Average these…`,
ASCCODE = `...to create this`
) %>%
filter(!is.na(ASCCODE))%>%
mutate(join_key = paste0(join_key, ".00"))
head(mapping_table_99)
# map the Occupation title
attach_acs_title <- function(mapping_df,
acs_df,
mapping_code_col = "ASCCODE",
acs_code_col = "2010-2012 ACS/PRCS OCCSOC",
acs_title_col = "Occupation title",
out_title_col = "ACS-Occupation title",
pad_width = NA )
{
norm_digits <- function(x) {
x <- as.character(x)
x <- trimws(x)
x <- sub("\\.00$", "", x)
x <- gsub("[^0-9]", "", x)
if (!is.na(pad_width)) x <- sprintf(paste0("%0", pad_width, "d"), as.integer(x))
x
}
keep_digits_X <- function(x) {
x <- as.character(x)
x <- trimws(x)
x <- toupper(x)
x <- gsub("[^0-9X]", "", x)
x
}
map_code <- norm_digits(mapping_df[[mapping_code_col]])
acs_code_raw <- keep_digits_X(acs_df[[acs_code_col]])
acs_exact_idx <- which(!is.na(acs_code_raw) & !grepl("X", acs_code_raw, fixed = TRUE))
acs_wild_idx <- which(!is.na(acs_code_raw) & grepl("X", acs_code_raw, fixed = TRUE))
acs_exact_codes <- gsub("[^0-9]", "", acs_code_raw[acs_exact_idx])
if (!is.na(pad_width)) {
acs_exact_codes <- sprintf(paste0("%0", pad_width, "d"), as.integer(acs_exact_codes))
}
acs_exact_titles <- acs_df[[acs_title_col]][acs_exact_idx]
title_out <- acs_exact_titles[ match(map_code, acs_exact_codes) ]
if (length(acs_wild_idx) > 0) {
acs_wild_codes <- acs_code_raw[acs_wild_idx]
acs_wild_prefix <- sub("X+$", "", acs_wild_codes)
acs_wild_titles <- acs_df[[acs_title_col]][acs_wild_idx]
ord <- order(nchar(acs_wild_prefix), decreasing = TRUE)
need <- which(is.na(title_out) & nzchar(map_code))
if (length(need) > 0) {
for (i in need) {
code_i <- map_code[i]
if (!nzchar(code_i)) next
hit <- NA_integer_
for (j in ord) {
pref <- acs_wild_prefix[j]
if (!nzchar(pref)) next
if (startsWith(code_i, pref)) { hit <- j; break }
}
if (!is.na(hit)) title_out[i] <- acs_wild_titles[hit]
}
}
}
out <- mapping_df
out[[out_title_col]] <- title_out
out
}
mapping_99 <- attach_acs_title(mapping_table_99, ACScodes)
mapping_0 <- attach_acs_title(mapping_table_0, ACScodes)
fill_from_mapping <- function(main_df,
mapping_df,
main_key = "O*NET-SOC Code",
mapping_key= "join_key",
cols = NULL) {
if (is.null(cols)) cols <- setdiff(names(mapping_df), mapping_key)
map2 <- mapping_df %>%
rename_with(~ paste0(.x, "__new"), all_of(cols))
main_df[[main_key]] <- as.character(main_df[[main_key]])
map2[[mapping_key]] <- as.character(map2[[mapping_key]])
out <- left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")),
by = setNames(mapping_key, main_key))
for (c in cols) {
newc <- paste0(c, "__new")
if (newc %in% names(out)) {
out[[c]] <- coalesce(out[[c]], out[[newc]])
out[[newc]] <- NULL
}
}
out
}
# left join the data
# crosswalk99
work_styles_2018 <- fill_from_mapping(
main_df = work_styles_2018,
mapping_df = mapping_99,
cols = c("ASCCODE", "ACS-Occupation title")
)
work_values_2018 <- fill_from_mapping(
main_df = work_values_2018,
mapping_df = mapping_99,
cols = c("ASCCODE", "ACS-Occupation title")
)
work_context_2018 <- fill_from_mapping(
main_df = work_context_2018,
mapping_df = mapping_99,
cols = c("ASCCODE", "ACS-Occupation title")
)
work_activities_2018 <- fill_from_mapping(
main_df = work_activities_2018,
mapping_df = mapping_99,
cols = c("ASCCODE", "ACS-Occupation title")
)
# crosswalk00
work_styles_2018 <- fill_from_mapping(
main_df = work_styles_2018,
mapping_df = mapping_0,
cols = c("ASCCODE", "ACS-Occupation title")
)
## Warning in left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 7729 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
work_values_2018 <- fill_from_mapping(
main_df = work_values_2018,
mapping_df = mapping_0,
cols = c("ASCCODE", "ACS-Occupation title")
)
## Warning in left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4411 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
work_context_2018 <- fill_from_mapping(
main_df = work_context_2018,
mapping_df = mapping_0,
cols = c("ASCCODE", "ACS-Occupation title")
)
## Warning in left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 163293 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
work_activities_2018 <- fill_from_mapping(
main_df = work_activities_2018,
mapping_df = mapping_0,
cols = c("ASCCODE", "ACS-Occupation title")
)
## Warning in left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 39689 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
head(filter(work_styles_2018, !is.na(ASCCODE)))
head(filter(work_activities_2018, !is.na(ASCCODE)))
head(filter(work_context_2018, !is.na(ASCCODE)))
head(filter(work_values_2018, !is.na(ASCCODE)))