library(tidyverse)
library(ranger)
library(rmarkdown)
In this document, we will show the process for predicting 1976 codes on 2003 data. If you would like to see this process formatted to run in a loop, with dynamic variables based on year, view our original documentation at https://github.com/IndOcc/IndOcc/blob/main/03_PredictionIND.R.
An important note: These models require data to be in the same format with the same parameters as defined in our process and will not work when applied to other data that deviates from this. To see a tutorial detailing the process of creating these forests, please refer to this document: https://rpubs.com/mmc4cv/appendix
Because we want to predict values for data from 2003, we will read in the 2003 data and filter it to only those who have an industry provided.
df_03 <- read.csv("https://raw.githubusercontent.com/IndOcc/IndOcc/main/vingette_sample_03.csv", colClasses= c("CPSIDP" = 'character'))
ind <- df_03 %>% filter(IND != 0)
paged_table(head(ind))
The start year is the year you have data for. The end year is a year in the schema you want your data to be translated into.
start_year <- 2003
end_year <- 1976
xwalk_name<- "https://raw.githubusercontent.com/IndOcc/CPScrosswalks/main/IND_crosswalk_FULL.csv"
crosswalk <- read.csv(xwalk_name)
crosswalk<- janitor::clean_names(crosswalk)
crosswalk<- crosswalk[-1]
paged_table(crosswalk)
For use later
dupe_IND <- ind %>% select(IND)
colnames(dupe_IND) <- c("dupe_IND")
ind <- cbind(ind, dupe_IND)
from <- '2003_2008'
# These are the three years we'll pull models for
first <- 1976
second <- 1979
third <- 1982
set <- "1976.1982"
into <- '2003_2008'
from1 <- paste0("ind_", from)
crosswalk_subset <- crosswalk[c(from1)]
dupe_col <- crosswalk_subset[1]
names(dupe_col)[1] <- 'shadow'
crosswalk_subset <- tibble(crosswalk_subset[1], dupe_col)
used_crosswalk<- crosswalk_subset %>% distinct %>% mutate(value = 1) %>% spread(1, value, fill = 0, sep="_")
paged_table(used_crosswalk)
init_merged<- merge(used_crosswalk, ind, by.x='shadow', by.y = "IND")
init_merged <- init_merged[-c(1)]
paged_table(head(init_merged))
Together, these three variables comprise a unique response id. They will be used to to match predictions to individuals.
filtered_key <- tibble(init_merged$CPSIDP, init_merged$YEAR, init_merged$MONTH)
colnames(filtered_key) <- c("CPSIDP", "YEAR", "MONTH")
paged_table(head(filtered_key))
orig <- tibble(init_merged$YEAR,
init_merged$MONTH,
init_merged$CPSIDP,
init_merged$dupe_IND,
init_merged$OCC,
init_merged$AGE,
init_merged$SEX,
init_merged$EDUC,
init_merged$CLASSWKR)
colnames(orig) <- c("YEAR", "MONTH", "CPSIDP", "Original.IND", "Original.OCC", "AGE", "SEX", "EDUC.recoded", "CLASSWKR.recoded")
paged_table(head(orig))
These are hosted at https://doi.org/10.18130/V3/6RUBPV. The naming convention is a little confusing, but we can get the names of the models we want by following this format:
model1 <- paste0("forest_IND", first, "from_ind_", into, ".RData")
model2 <- paste0("forest_IND", second, "from_ind_", into, ".RData")
model3 <- paste0("forest_IND", third, "from_ind_", into, ".RData")
print(model1)
## [1] "forest_IND1976from_ind_2003_2008.RData"
print(model2)
## [1] "forest_IND1979from_ind_2003_2008.RData"
print(model3)
## [1] "forest_IND1982from_ind_2003_2008.RData"
load(model1)
forest1 <- ranger_forest
load(model2)
forest2 <- ranger_forest
load(model3)
forest3 <- ranger_forest
Get the factor levels that will be predicted, convert our data to factors, and remove unused columns from the data.
first_fact <- ind %>% filter(IND != 0)
first_fact$IND<- as.factor(first_fact$IND)
ind_factors <- data.frame(levels(first_fact$IND))
ind_factors['factor_number'] <- seq.int(nrow(ind_factors))
init_merged$SEX<- as.factor(init_merged$SEX)
init_merged$EDUC<- as.factor(init_merged$EDUC)
init_merged$CLASSWKR<- as.factor(init_merged$CLASSWKR)
x_vals <- init_merged %>% select(-c('CPSIDP', 'YEAR', 'MONTH', 'OCC'))
Now, we will generate predictions
forest1_preds <- predict(forest1, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 44%. Estimated remaining time: 39 seconds.
## Predicting.. Progress: 88%. Estimated remaining time: 8 seconds.
forest2_preds <- predict(forest2, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 48%. Estimated remaining time: 33 seconds.
## Predicting.. Progress: 98%. Estimated remaining time: 1 seconds.
forest3_preds <- predict(forest3, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 43%. Estimated remaining time: 40 seconds.
## Predicting.. Progress: 89%. Estimated remaining time: 7 seconds.
And merge them into one dataframe
pred_iters1 <- forest1_preds[["predictions"]]
pred_iters2 <- forest2_preds[["predictions"]]
pred_iters3 <- forest3_preds[["predictions"]]
all_preds <- cbind(pred_iters1, pred_iters2)
all_preds <- cbind(all_preds, pred_iters3)
f <- function(x) with(rle(sort(x)), values[order(lengths, decreasing = TRUE)])
fn_test <- apply(all_preds, 1, f)
best_pred <- data.frame(unlist(map(fn_test, 1)))
best_pred$predicted_classification <- factor(best_pred$unlist.map.fn_test..1.., levels = ind_factors$factor_number, labels = ind_factors$levels.first_fact.IND.)
winner <- best_pred$predicted_classification
ct <- rowSums(all_preds == rep(best_pred$unlist.map.fn_test..1.., ncol(all_preds)))
pred_perc <- data.frame(winning_val = winner,
ct = ct)
# Generate certainty scores
pred_perc$certainty_score <- lapply(pred_perc$ct, function(x) round((x/ncol(all_preds)),4))
winner_colname <- paste0("IND.Predicted.Value.", set)
certainty_colname <- paste0("IND.Prediction.Certainty.Score.", set)
reduce <- tibble(pred_perc$winning_val, pred_perc$certainty_score)
reduce <- data.frame(lapply(reduce, as.character), stringsAsFactors = FALSE)
colnames(reduce) <- c(winner_colname, certainty_colname)
filtered_key <- cbind(filtered_key, reduce)
paged_table(head(filtered_key))
Merge with original data
merged <- cbind(orig, filtered_key)
merged <- merged[-c(10,11,12)]
paged_table(head(merged))
As you can see, each prediction with certainty score in the 1976 schema has now been appended to the original 2003 data.