library(tidyverse)
library(caret)
library(ranger)
library(rmarkdown)

Overview

In this document, we will demonstrate the full process for preparing the data, generating models, and predicting 1976 codes on data from 2003. This general process can be applied to many other longitudinal datasets. All final predictions from this process can be found at https://doi.org/10.18130/V3/6RUBPV. For a guide on utilizing those predictions, please refer to https://rpubs.com/mmc4cv/utilizing-resources_rf_preds.

Creating predictions requires a two-stage modeling process. The stages are denoted "Assembly" and "Rollout."

Note that in our project, two to three of these models, trained on different years within a schema, were combined into the final predictions for increased robustness. If you would like to see this process formatted to run in a loop, with dynamic variables based on year, view our original documentation at https://github.com/IndOcc/IndOcc/blob/main/01_BuildRandomForestsIND.R and at https://github.com/IndOcc/IndOcc/blob/main/03_PredictionIND.R

Actual forest models are hosted at https://doi.org/10.18130/V3/6RUBPV.

Data Preparation

Read in data

We use sample data for purposes of illustration. Models and predictions are created using only variables available in every year and month of the CPS.

df_76 <- read.csv("https://raw.githubusercontent.com/IndOcc/IndOcc/main/vingette_sample_76.csv")
paged_table(head(df_76))
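As a quick sanity check (our own addition; the variable list below is inferred from the columns used later in this document), we can confirm the expected CPS variables are present:

# Check that the always-available CPS variables are in the sample
required_vars <- c("CPSIDP", "YEAR", "MONTH", "IND", "OCC", "AGE", "SEX", "EDUC", "CLASSWKR")
stopifnot(all(required_vars %in% names(df_76)))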

Set year of interest

In this case, we will be training models to translate data from other years into the 1976 schema.

year_of_interest <- 1976

Create Flags

Create a flag to mark what schema the ‘year_of_interest’ is in. These results will be used later in the crosswalk and modeling process.

year_ranges <- list(1971:1982, #1
                    1983:1991, #2
                    1992:2002, #3
                    2003:2008, #4
                    2009:2013, #5
                    2014:2019, #6
                    2020:2022) #7
# One flag per schema; flip on the one containing the year of interest
in_year_range <- rep(FALSE, length(year_ranges))

for (x in seq_along(year_ranges)) {
  if (year_of_interest %in% year_ranges[[x]]) {
    in_year_range[x] <- TRUE
  }
}
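For 1976, this flips the first flag, since 1976 falls in the 1971-1982 schema:

which(in_year_range)
## [1] 1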

Prepare Crosswalk & Merge with Data

We prepared the industry crosswalk in advance; more information is available at https://github.com/IndOcc/CPScrosswalks.

Load crosswalk

Load, clean, and display the crosswalk:

xwalk_name<- "https://raw.githubusercontent.com/IndOcc/CPScrosswalks/main/IND_crosswalk_FULL.csv"
crosswalk <- read.csv(xwalk_name)
crosswalk<- janitor::clean_names(crosswalk)
crosswalk<- crosswalk[-1]
paged_table(crosswalk)

Set “from”

Here 'from' is the year schema that 'year_of_interest' belongs to. It should follow the naming convention used in the crosswalk (i.e., ind/occ_BEGINNINGYEAR_ENDINGYEAR).

from <- '1971_1982'
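Since in_year_range already flags the matching schema, the label could equivalently be derived from year_ranges instead of hard-coded. A minimal sketch using the objects defined above (from_derived is our own illustrative name):

# Derive the schema label from the flagged year range
rng <- year_ranges[[which(in_year_range)]]
from_derived <- paste0(min(rng), "_", max(rng))
print(from_derived)
## [1] "1971_1982"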

Convert crosswalk to one-hot-encoding

This will convert the crosswalk between 1976 and 2003 industry codes into one that pairs each 1976 code with one-hot-encoded 2003 industry codes.
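To make the transformation concrete, here is a toy version of the spread()-based encoding used below, on made-up codes (the column names and values are purely illustrative):

toy <- data.frame(code_76 = c(1, 1, 2), code_03 = c(10, 20, 10))
toy %>% distinct() %>% mutate(value = 1) %>% spread(2, value, fill = 0, sep = "_")
##   code_76 code_03_10 code_03_20
## 1       1          1          1
## 2       2          1          0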

explode_column <- c()
for (x in 1:7) {
  if (in_year_range[x]) {
    map_column <- crosswalk[x]   # the schema we are translating into (1971-1982)
  }
  if (!in_year_range[x]) {
    explode_column <- append(explode_column, crosswalk[x])
  }
}
# explode_column now holds the six non-matching schemas; index 3 is the 2003-2008 schema
x <- 3
used_crosswalk <- data.frame(map_column, explode_column[x])
crosswalk_era <- names(explode_column[x])

# One-hot-encode the 2003-2008 codes (one indicator column per code)
used_crosswalk <- used_crosswalk %>% distinct() %>% mutate(value = 1) %>% spread(2, value, fill = 0, sep = "_")
# Tidy the crosswalk column names: strip leading digits and replace spaces with underscores
for (col in 1:ncol(crosswalk)) {
  colnames(crosswalk)[col] <- sub("[[:digit:]]+ ", "", colnames(crosswalk)[col])
  colnames(crosswalk)[col] <- sub(" ", "_", colnames(crosswalk)[col])
}

View one-hot-encoded industries

paged_table(used_crosswalk)

Merge this one-hot-encoded crosswalk back with the original data.

In this case, we merge on the 1976 industry code columns ('IND' in the data and 'ind_1971_1982' in the crosswalk).

init_merged <- merge(used_crosswalk, df_76, by.x = "ind_1971_1982", by.y = "IND")
names(init_merged)[1] <- "IND"
# Convert the target and predictors to factors; EDUC is ordered by education level
init_merged$IND <- as.factor(init_merged$IND)
init_merged$SEX <- as.factor(init_merged$SEX)
init_merged$EDUC <- factor(init_merged$EDUC, order = TRUE, levels = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120))
init_merged$CLASSWKR <- as.factor(init_merged$CLASSWKR)

Remove non-predictors

init_merged <- init_merged %>% select(-c('CPSIDP', 'YEAR', 'MONTH', 'OCC'))

View final dataframe

paged_table(head(init_merged))

Assembling Models

Train/Test Split

We used an 80/20 train/test split.

set.seed(18) # For reproducibility
nums <- sample(seq_len(nrow(init_merged)), size = floor(.8 * nrow(init_merged)))

train <- init_merged[nums, ]
trainx <- train[-1]          # predictors only; IND (the target) is column 1
test <- init_merged[-nums, ]
testx <- test[-1]
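As a quick check (our own addition, not part of the original pipeline), we can inspect the realized proportions; note that floor() makes the training share at most 80%:

# Verify the realized split proportions
c(train = nrow(train), test = nrow(test)) / nrow(init_merged)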

Train model and generate test predictions

# Fit a 500-tree classification forest on complete cases
ranger_forest <- ranger(IND ~ ., data = train[complete.cases(train), ], num.trees = 500)
## Growing trees.. Progress: 9%. Estimated remaining time: 5 minutes, 29 seconds.
## Growing trees.. Progress: 20%. Estimated remaining time: 4 minutes, 8 seconds.
## Growing trees.. Progress: 32%. Estimated remaining time: 3 minutes, 17 seconds.
## Growing trees.. Progress: 44%. Estimated remaining time: 2 minutes, 41 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 2 minutes, 6 seconds.
## Growing trees.. Progress: 67%. Estimated remaining time: 1 minute, 32 seconds.
## Growing trees.. Progress: 79%. Estimated remaining time: 59 seconds.
## Growing trees.. Progress: 90%. Estimated remaining time: 26 seconds.
ranger_pred <- predict(ranger_forest, data = testx)

Create confusion matrix

confusion <- confusionMatrix(ranger_pred$predictions, test$IND)
confusiontable <- as.data.frame(as.matrix(confusion))
by_class <- as.data.frame(confusion$byClass)
paged_table(confusiontable)

Create complexity measure

The definition of complexity can be found in the readme on our Libra repository, https://dataverse.lib.virginia.edu/file.xhtml?fileId=57998&version=2.0. Roughly, it captures how many-to-many the crosswalk mapping is for a given code: codes whose matches in the other schema are shared by many codes are harder to translate.

# all_of() avoids tidyselect's ambiguous external-vector warning
crosswalk2 <- crosswalk %>% select(c(paste0("ind_", from), all_of(crosswalk_era)))
crosswalk2 <- distinct(crosswalk2)
# For each code in the other schema, count how many codes of interest map onto it
one_one_c2 <- crosswalk2 %>% group_by(across(2)) %>% count()
counts_other_year <- merge(crosswalk2, one_one_c2, all = TRUE)

factor_levels <- levels(train$IND)

check<- as.data.frame(table(true = test$IND, predicted = ranger_pred$predictions))
accuracy_by_gp<- check %>%
  group_by(true) %>%
  summarise(accuracy = Freq[true==predicted] / sum(Freq))

max_value <- as.data.frame(rep(0, length(factor_levels)))
names(max_value)[1] <- "Complexity"

# For each code, complexity is the largest number of codes of interest that
# share one of its linked codes in the other schema
for (i in seq_along(factor_levels)) {
  val <- as.integer(as.character(accuracy_by_gp$true[i]))
  single_thing <- counts_other_year %>% filter(counts_other_year[[2]] == val)
  max_value[i, ] <- max(single_thing$n)
}

comparison_table <- tibble(factor_levels, by_class, max_value)

Example comparison table

paged_table(comparison_table)

Example results by group

A results-by-group table is produced as well.

paged_table(check)

Save outputs

save(ranger_forest, file = paste0("forest_IND", year_of_interest,"from_",crosswalk_era,".RData"))
write.csv(comparison_table, paste0("results_table_", year_of_interest,"from_",crosswalk_era,"_IND.csv"), row.names = FALSE)
write.csv(check, paste0("groupresults_", year_of_interest,"from_",crosswalk_era,"_IND.csv"), row.names=FALSE)
write.csv(confusiontable, paste0("confusion_", year_of_interest,"from_",crosswalk_era,"_IND.csv"), row.names=FALSE)

Rolling out Models

This process follows much of the same format as the previous section, with slight modifications to account for the different data.

Read in data & filter data

In this case we want to predict values for data from 2003. We will read in the 2003 data and filter it to only those respondents who have an industry code provided.

df_03 <- read.csv("https://raw.githubusercontent.com/IndOcc/IndOcc/main/vingette_sample_03.csv", colClasses= c("CPSIDP" = 'character'))
ind <- df_03 %>% filter(IND != 0)
paged_table(head(ind))

Select start & end years

The start year is the year you have data for. The end year is a year in the schema you want your data to be translated into (in this case, 2003 into 1976).

start_year <- 2003
end_year <- 1976

Load, clean and display crosswalk

xwalk_name<- "https://raw.githubusercontent.com/IndOcc/CPScrosswalks/main/IND_crosswalk_FULL.csv"
crosswalk <- read.csv(xwalk_name)
crosswalk<- janitor::clean_names(crosswalk)
crosswalk<- crosswalk[-1]
paged_table(crosswalk)

Create duplicate IND field

This duplicate is used in the later merging stage.

# Keep a copy of IND so the original code survives the merge below
dupe_IND <- ind %>% select(IND)
colnames(dupe_IND) <- c("dupe_IND")
ind <- cbind(ind, dupe_IND)

Get variables we’ll need

In this case, three years are defined in order to create more robust predictions. All three model years (1976, 1979, and 1982) come from the 1971-1982 schema.

from <- '2003_2008'   # schema of the data we are translating from
# These are the three years we'll pull models for
first <- 1976
second <- 1979
third <- 1982
set <- "1976.1982"    # label spanning the model years; used in the output column names
into <- '2003_2008'   # appears in the model filenames (models trained from this schema)

Create one-hot-encoding

from1 <- paste0("ind_", from)
crosswalk_subset <- crosswalk[c(from1)]

# Duplicate the code column so one copy survives the spread() as a merge key
dupe_col <- crosswalk_subset[1]
names(dupe_col)[1] <- 'shadow'
crosswalk_subset <- tibble(crosswalk_subset[1], dupe_col)
used_crosswalk <- crosswalk_subset %>% distinct() %>% mutate(value = 1) %>% spread(1, value, fill = 0, sep = "_")

paged_table(used_crosswalk)

Merge one-hot-encoding with original data

init_merged <- merge(used_crosswalk, ind, by.x = 'shadow', by.y = "IND")
init_merged <- init_merged[-c(1)]   # drop the 'shadow' merge key
paged_table(head(init_merged))

Create UniqueID Scaffold

Together, these three variables comprise a unique response ID. They will be used to match predictions to individuals.

filtered_key <- tibble(init_merged$CPSIDP, init_merged$YEAR, init_merged$MONTH)
colnames(filtered_key) <- c("CPSIDP", "YEAR", "MONTH")
paged_table(head(filtered_key))

Create copy of original data

orig <- tibble(init_merged$YEAR, 
               init_merged$MONTH, 
               init_merged$CPSIDP, 
               init_merged$dupe_IND, 
               init_merged$OCC,
               init_merged$AGE, 
               init_merged$SEX, 
               init_merged$EDUC, 
               init_merged$CLASSWKR)
colnames(orig) <- c("YEAR", "MONTH", "CPSIDP", "Original.IND", "Original.OCC", "AGE", "SEX", "EDUC.recoded", "CLASSWKR.recoded")
paged_table(head(orig))

Generate predictions

Load prefit models

These are hosted at https://doi.org/10.18130/V3/6RUBPV. The naming convention is a little confusing, but we can get the names of the models we want by following this format:

model1 <- paste0("forest_IND", first, "from_ind_", into, ".RData")
model2 <- paste0("forest_IND", second, "from_ind_", into, ".RData")
model3 <- paste0("forest_IND", third, "from_ind_", into, ".RData")
print(model1)
## [1] "forest_IND1976from_ind_2003_2008.RData"
print(model2)
## [1] "forest_IND1979from_ind_2003_2008.RData"
print(model3)
## [1] "forest_IND1982from_ind_2003_2008.RData"
load(model1)
forest1 <- ranger_forest

load(model2)
forest2 <- ranger_forest

load(model3)
forest3 <- ranger_forest

Do some data prep

Get the factor levels that will be predicted, convert our data to factors, and remove unused columns from the data.

first_fact <- ind %>% filter(IND != 0)
first_fact$IND <- as.factor(first_fact$IND)
# Lookup from factor index to industry code, used to decode tree votes later
ind_factors <- data.frame(levels(first_fact$IND))
ind_factors['factor_number'] <- seq.int(nrow(ind_factors))

init_merged$SEX<- as.factor(init_merged$SEX)
init_merged$EDUC<- as.factor(init_merged$EDUC)
init_merged$CLASSWKR<- as.factor(init_merged$CLASSWKR)

x_vals <- init_merged %>% select(-c('CPSIDP', 'YEAR', 'MONTH', 'OCC'))

Generate predictions

forest1_preds <- predict(forest1, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 45%. Estimated remaining time: 37 seconds.
## Predicting.. Progress: 91%. Estimated remaining time: 6 seconds.
forest2_preds <- predict(forest2, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 44%. Estimated remaining time: 39 seconds.
## Predicting.. Progress: 89%. Estimated remaining time: 7 seconds.
forest3_preds <- predict(forest3, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 47%. Estimated remaining time: 34 seconds.
## Predicting.. Progress: 96%. Estimated remaining time: 2 seconds.

Extract predictions

Merge them into one dataframe

pred_iters1 <- forest1_preds[["predictions"]]
pred_iters2 <- forest2_preds[["predictions"]]
pred_iters3 <- forest3_preds[["predictions"]]
all_preds <- cbind(pred_iters1, pred_iters2)
all_preds <- cbind(all_preds, pred_iters3)
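Because the forests were called with predict.all = TRUE, each prediction matrix holds one column per tree, with each vote stored as the integer index of the predicted factor level (which is why the indices are mapped back to industry codes below). all_preds therefore has one row per observation and 3 × 500 columns (assuming each hosted forest was grown with 500 trees, as in the Assembly stage). A minimal sketch of the voting logic on one made-up row of votes:

votes <- c(2, 2, 5, 2, 5)   # hypothetical per-tree votes (factor-level indices)
table(votes)
## votes
## 2 5 
## 3 2
# The modal vote (2) wins; its certainty score is 3/5 = 0.6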

Generate final predictions & certainty scores

# Mode helper: returns the distinct votes ordered by frequency, most common first
f <- function(x) with(rle(sort(x)), values[order(lengths, decreasing = TRUE)])
fn_test <- apply(all_preds, 1, f)   # one list element of ordered votes per row

# The first element of each list entry is the modal (most frequent) vote
best_pred <- data.frame(unlist(map(fn_test, 1)))
# Map the winning factor index back to the original industry code
best_pred$predicted_classification <- factor(best_pred$unlist.map.fn_test..1.., levels = ind_factors$factor_number, labels = ind_factors$levels.first_fact.IND.)

winner <- best_pred$predicted_classification
# Count how many trees voted for the winning code in each row
ct <- rowSums(all_preds == rep(best_pred$unlist.map.fn_test..1.., ncol(all_preds)))
pred_perc <- data.frame(winning_val = winner,
                        ct = ct)

# Generate certainty scores
pred_perc$certainty_score <- round(pred_perc$ct / ncol(all_preds), 4)   # share of trees agreeing with the winner
winner_colname <- paste0("IND.Predicted.Value.", set)
certainty_colname <- paste0("IND.Prediction.Certainty.Score.", set)

reduce <- tibble(pred_perc$winning_val, pred_perc$certainty_score)
reduce <- data.frame(lapply(reduce, as.character), stringsAsFactors = FALSE)
colnames(reduce) <- c(winner_colname, certainty_colname)

filtered_key <- cbind(filtered_key, reduce)

paged_table(head(filtered_key))

Merge with original data

merged <- cbind(orig, filtered_key)
# Drop the duplicated CPSIDP/YEAR/MONTH key columns introduced by the cbind
merged <- merged[-c(10, 11, 12)]

Final Output

paged_table(head(merged))

As you can see, each prediction in the 1976 schema, along with its certainty score, has now been appended to the original 2003 data.