#library(googlesheets4)
library(readxl)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# get access to google drive
#gs4_auth()
# load the crosswalk sheet
Crosswalk99 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/Additional_Crosswalks_V3_forRDC.xlsx",
    sheet = "99 Crosswalk for ONET"
)
Crosswalk0 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/Additional_Crosswalks_V3_forRDC.xlsx",
    sheet = "0 Crosswalk for ONET", col_names = TRUE, skip = 1
)
ACScodes <- read_excel("/Users/linxinyu/Desktop/Crosswalk/occsoc_crosswalk_2000_onward.xlsx",
    sheet = "Codes without Descriptions")
## New names:
## • `` -> `...1`
head(Crosswalk99)
head(Crosswalk0)
# Fill down
Crosswalk99_filled <- Crosswalk99 %>%
  fill(`...to create this`, .direction = "down") 
Crosswalk0_filled <- Crosswalk0 %>%
  fill(`...to create this`, .direction = "down") 
# filter the data so that we can have the elements we want
filter_work <- function(data, element_ids) {
  data %>%
    dplyr::filter(`Element ID` %in% element_ids)
}
values_of_interest <- c("1.B.2.a", "1.B.2.b", "1.B.2.c", "1.B.2.d", "1.B.2.e", "1.B.2.f")
styles_of_interest <- c("1.C.4.b")
context_of_interest <- c("4.C.2.a.1.b", "4.C.2.a.1.c", "4.C.2.b.1.a", "4.C.2.b.1.b", "4.C.2.b.1.c", "4.C.2.b.1.d", "4.C.2.b.1.e", "4.C.2.b.1.f", "4.C.2.c.1.d", "4.C.2.c.1.e", "4.C.3.a.1", "4.C.3.a.2.a", "4.C.3.a.2.b", "4.C.3.a.4", "4.C.3.b.8", "4.C.3.d.1", "4.C.3.d.3", "4.C.3.d.4", "4.C.3.d.8")
activities_of_interest <- c("4.A.3.a.1", "4.A.3.a.2")
work_styles_2018 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/db_23_1_excel/Work Styles.xlsx")
work_values_2018 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/db_23_1_excel/Work Values.xlsx")
work_context_2018 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/db_23_1_excel/Work Context.xlsx")
work_activities_2018 <- read_excel("/Users/linxinyu/Desktop/Crosswalk/db_23_1_excel/Work Activities.xlsx")
# Create a mapping table
mapping_table_99 <- Crosswalk99_filled %>%
  # Select and rename columns
  select(
    join_key = `Average these…`, 
    ASCCODE = `...to create this`
  ) %>%
  filter(!is.na(ASCCODE))%>%
           mutate(join_key = paste0(join_key, ".00")) 

mapping_table_0 <- Crosswalk0_filled %>%
  select(
    join_key = `Average these…`, 
    ASCCODE = `...to create this`
  ) %>%
  filter(!is.na(ASCCODE))%>%
           mutate(join_key = paste0(join_key, ".00")) 
head(mapping_table_99)
# map the Occupation title
attach_acs_title <- function(mapping_df,
                                   acs_df,
                                   mapping_code_col = "ASCCODE",
                                   acs_code_col     = "2010-2012 ACS/PRCS OCCSOC",
                                   acs_title_col    = "Occupation title",
                                   out_title_col    = "ACS-Occupation title",
                                   pad_width        = NA )
  {
  norm_digits <- function(x) {
    x <- as.character(x)
    x <- trimws(x)
    x <- sub("\\.00$", "", x)    
    x <- gsub("[^0-9]", "", x)   
    if (!is.na(pad_width)) x <- sprintf(paste0("%0", pad_width, "d"), as.integer(x))
    x
  }
  keep_digits_X <- function(x) {
    x <- as.character(x)
    x <- trimws(x)
    x <- toupper(x)
    x <- gsub("[^0-9X]", "", x)
    x
  }

  map_code <- norm_digits(mapping_df[[mapping_code_col]])
  acs_code_raw <- keep_digits_X(acs_df[[acs_code_col]])

  acs_exact_idx <- which(!is.na(acs_code_raw) & !grepl("X", acs_code_raw, fixed = TRUE))
  acs_wild_idx  <- which(!is.na(acs_code_raw) &  grepl("X", acs_code_raw, fixed = TRUE))

  acs_exact_codes  <- gsub("[^0-9]", "", acs_code_raw[acs_exact_idx])
  if (!is.na(pad_width)) {
    acs_exact_codes <- sprintf(paste0("%0", pad_width, "d"), as.integer(acs_exact_codes))
  }
  acs_exact_titles <- acs_df[[acs_title_col]][acs_exact_idx]

  title_out <- acs_exact_titles[ match(map_code, acs_exact_codes) ]

  if (length(acs_wild_idx) > 0) {
    acs_wild_codes <- acs_code_raw[acs_wild_idx]
    acs_wild_prefix <- sub("X+$", "", acs_wild_codes) 
    acs_wild_titles <- acs_df[[acs_title_col]][acs_wild_idx]

    ord <- order(nchar(acs_wild_prefix), decreasing = TRUE)

    need <- which(is.na(title_out) & nzchar(map_code)) 
    if (length(need) > 0) {
      for (i in need) {
        code_i <- map_code[i]
        if (!nzchar(code_i)) next
        hit <- NA_integer_
        for (j in ord) {
          pref <- acs_wild_prefix[j]
          if (!nzchar(pref)) next
          if (startsWith(code_i, pref)) { hit <- j; break }
        }
        if (!is.na(hit)) title_out[i] <- acs_wild_titles[hit]
      }
    }
  }

  out <- mapping_df
  out[[out_title_col]] <- title_out
  out
}
mapping_99 <- attach_acs_title(mapping_table_99, ACScodes)
mapping_0 <- attach_acs_title(mapping_table_0, ACScodes)
fill_from_mapping <- function(main_df,
                              mapping_df,
                              main_key   = "O*NET-SOC Code",
                              mapping_key= "join_key",
                              cols = NULL) {
  if (is.null(cols)) cols <- setdiff(names(mapping_df), mapping_key)

  map2 <- mapping_df %>%
    rename_with(~ paste0(.x, "__new"), all_of(cols))

  main_df[[main_key]]  <- as.character(main_df[[main_key]])
  map2[[mapping_key]]  <- as.character(map2[[mapping_key]])

  out <- left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")),
                   by = setNames(mapping_key, main_key))

  for (c in cols) {
    newc <- paste0(c, "__new")
    if (newc %in% names(out)) {
      out[[c]] <- coalesce(out[[c]], out[[newc]])
      out[[newc]] <- NULL
    }
  }
  out
}
# left join the data
# crosswalk99
work_styles_2018 <- fill_from_mapping(
  main_df = work_styles_2018,
  mapping_df = mapping_99,
  cols = c("ASCCODE", "ACS-Occupation title")
)
work_values_2018 <- fill_from_mapping(
  main_df = work_values_2018,
  mapping_df = mapping_99,
  cols = c("ASCCODE", "ACS-Occupation title")
)
work_context_2018 <- fill_from_mapping(
  main_df = work_context_2018,
  mapping_df = mapping_99,
  cols = c("ASCCODE", "ACS-Occupation title")
)
work_activities_2018 <- fill_from_mapping(
  main_df = work_activities_2018,
  mapping_df = mapping_99,
  cols = c("ASCCODE", "ACS-Occupation title")
)
# crosswalk00
work_styles_2018 <- fill_from_mapping(
  main_df = work_styles_2018,
  mapping_df = mapping_0,
  cols = c("ASCCODE", "ACS-Occupation title")
)
## Warning in left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 7729 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
work_values_2018 <- fill_from_mapping(
  main_df = work_values_2018,
  mapping_df = mapping_0,
  cols = c("ASCCODE", "ACS-Occupation title")
)
## Warning in left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4411 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
work_context_2018 <- fill_from_mapping(
  main_df = work_context_2018,
  mapping_df = mapping_0,
  cols = c("ASCCODE", "ACS-Occupation title")
)
## Warning in left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 163293 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
work_activities_2018 <- fill_from_mapping(
  main_df = work_activities_2018,
  mapping_df = mapping_0,
  cols = c("ASCCODE", "ACS-Occupation title")
)
## Warning in left_join(main_df, select(map2, all_of(mapping_key), ends_with("__new")), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 39689 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
head(filter(work_styles_2018, !is.na(ASCCODE)))
head(filter(work_activities_2018, !is.na(ASCCODE)))
head(filter(work_context_2018, !is.na(ASCCODE)))
head(filter(work_values_2018, !is.na(ASCCODE)))