library(tidyverse)
library(ranger)
library(rmarkdown)

Overview

In this document, we will show the process for predicting 1976 codes on 2003 data. If you would like to see this process formatted to run in a loop, with dynamic variables based on year, view our original documentation at https://github.com/IndOcc/IndOcc/blob/main/03_PredictionIND.R.

An important note: These models require data to be in the same format with the same parameters as defined in our process and will not work when applied to other data that deviates from this. To see a tutorial detailing the process of creating these forests, please refer to this document: https://rpubs.com/mmc4cv/appendix

Read in data & filter data

Because we want to predict values for data from 2003, we will read in the 2003 data and filter it to only those who have an industry provided.

df_03 <- read.csv("https://raw.githubusercontent.com/IndOcc/IndOcc/main/vingette_sample_03.csv", colClasses= c("CPSIDP" = 'character'))
ind <- df_03 %>% filter(IND != 0)
paged_table(head(ind))

Select start & end years

The start year is the year you have data for. The end year is a year in the schema you want your data to be translated into.

start_year <- 2003
end_year <- 1976

Crosswalk Process

Load, clean and display crosswalk

xwalk_name<- "https://raw.githubusercontent.com/IndOcc/CPScrosswalks/main/IND_crosswalk_FULL.csv"
crosswalk <- read.csv(xwalk_name)
crosswalk<- janitor::clean_names(crosswalk)
crosswalk<- crosswalk[-1]
paged_table(crosswalk)

Create duplicate IND field

For use later

dupe_IND <- ind %>% select(IND)
colnames(dupe_IND) <- c("dupe_IND")
ind <- cbind(ind, dupe_IND)

Get variables we’ll need

from <- '2003_2008'
# These are the three years we'll pull models for
first <-  1976
second <- 1979
third <- 1982
set <- "1976.1982"
into <- '2003_2008'

Create one-hot-encoding

from1 <- paste0("ind_", from)
crosswalk_subset <- crosswalk[c(from1)]

dupe_col <- crosswalk_subset[1]
names(dupe_col)[1] <- 'shadow'
crosswalk_subset <- tibble(crosswalk_subset[1], dupe_col)
used_crosswalk<- crosswalk_subset  %>% distinct %>% mutate(value = 1)  %>% spread(1, value,  fill = 0, sep="_")

paged_table(used_crosswalk)

Merge one-hot-encoding with original data

init_merged<- merge(used_crosswalk, ind, by.x='shadow', by.y = "IND") 
init_merged <- init_merged[-c(1)]
paged_table(head(init_merged))

Create UniqueID Scaffold

Together, these three variables comprise a unique response id. They will be used to to match predictions to individuals.

filtered_key <- tibble(init_merged$CPSIDP, init_merged$YEAR, init_merged$MONTH)
colnames(filtered_key) <- c("CPSIDP", "YEAR", "MONTH")
paged_table(head(filtered_key))

Create copy of original data

orig <- tibble(init_merged$YEAR, 
               init_merged$MONTH, 
               init_merged$CPSIDP, 
               init_merged$dupe_IND, 
               init_merged$OCC,
               init_merged$AGE, 
               init_merged$SEX, 
               init_merged$EDUC, 
               init_merged$CLASSWKR)
colnames(orig) <- c("YEAR", "MONTH", "CPSIDP", "Original.IND", "Original.OCC", "AGE", "SEX", "EDUC.recoded", "CLASSWKR.recoded")
paged_table(head(orig))

Generate predictions

Load prefit models

These are hosted at https://doi.org/10.18130/V3/6RUBPV. The naming convention is a little confusing, but we can get the names of the models we want by following this format:

model1 <- paste0("forest_IND", first, "from_ind_", into, ".RData")
model2 <- paste0("forest_IND", second, "from_ind_", into, ".RData")
model3 <- paste0("forest_IND", third, "from_ind_", into, ".RData")
print(model1)
## [1] "forest_IND1976from_ind_2003_2008.RData"
print(model2)
## [1] "forest_IND1979from_ind_2003_2008.RData"
print(model3)
## [1] "forest_IND1982from_ind_2003_2008.RData"
load(model1)
forest1 <- ranger_forest

load(model2)
forest2 <- ranger_forest

load(model3)
forest3 <- ranger_forest

Do some data prep

Get the factor levels that will be predicted, convert our data to factors, and remove unused columns from the data.

first_fact <- ind %>% filter(IND != 0)
first_fact$IND<- as.factor(first_fact$IND)
ind_factors <- data.frame(levels(first_fact$IND))
ind_factors['factor_number'] <- seq.int(nrow(ind_factors))

init_merged$SEX<- as.factor(init_merged$SEX)
init_merged$EDUC<- as.factor(init_merged$EDUC)
init_merged$CLASSWKR<- as.factor(init_merged$CLASSWKR)

x_vals <- init_merged %>% select(-c('CPSIDP', 'YEAR', 'MONTH', 'OCC'))

Generate predictions

Now, we will generate predictions

forest1_preds <- predict(forest1, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 44%. Estimated remaining time: 39 seconds.
## Predicting.. Progress: 88%. Estimated remaining time: 8 seconds.
forest2_preds <- predict(forest2, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 48%. Estimated remaining time: 33 seconds.
## Predicting.. Progress: 98%. Estimated remaining time: 1 seconds.
forest3_preds <- predict(forest3, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 43%. Estimated remaining time: 40 seconds.
## Predicting.. Progress: 89%. Estimated remaining time: 7 seconds.

Extract predictions

And merge them into one dataframe

pred_iters1 <- forest1_preds[["predictions"]]
pred_iters2 <- forest2_preds[["predictions"]]
pred_iters3 <- forest3_preds[["predictions"]]
all_preds <- cbind(pred_iters1, pred_iters2)
all_preds <- cbind(all_preds, pred_iters3)

Generate final predictions & certainty scores

f <- function(x) with(rle(sort(x)), values[order(lengths, decreasing = TRUE)])
fn_test <- apply(all_preds, 1, f)

best_pred <- data.frame(unlist(map(fn_test, 1)))
best_pred$predicted_classification <- factor(best_pred$unlist.map.fn_test..1.., levels = ind_factors$factor_number, labels = ind_factors$levels.first_fact.IND.)

winner <- best_pred$predicted_classification
ct <- rowSums(all_preds == rep(best_pred$unlist.map.fn_test..1.., ncol(all_preds)))
pred_perc <- data.frame(winning_val = winner,
                        ct = ct)

# Generate certainty scores
pred_perc$certainty_score <- lapply(pred_perc$ct, function(x) round((x/ncol(all_preds)),4))
winner_colname <- paste0("IND.Predicted.Value.", set)
certainty_colname <- paste0("IND.Prediction.Certainty.Score.", set)

reduce <- tibble(pred_perc$winning_val, pred_perc$certainty_score)
reduce <- data.frame(lapply(reduce, as.character), stringsAsFactors = FALSE)
colnames(reduce) <- c(winner_colname, certainty_colname)

filtered_key <- cbind(filtered_key, reduce)

paged_table(head(filtered_key))

Merge with original data

merged <- cbind(orig, filtered_key)
merged <- merged[-c(10,11,12)]

Final Output

paged_table(head(merged))

As you can see, each prediction with certainty score in the 1976 schema has now been appended to the original 2003 data.