library(tidyverse)
library(caret)
library(ranger)
library(rmarkdown)
In this document, we will demonstrate the full process for preparing the data, generating models, and predicting 1976 codes on data from 2003. This general process can be applied to many other longitudinal datasets. All final predictions from this process can be found at https://doi.org/10.18130/V3/6RUBPV. For a guide on utilizing those predictions, please refer to https://rpubs.com/mmc4cv/utilizing-resources_rf_preds.
Creating the predictions requires a two-stage modeling process. The stages are denoted “Assembly” and “Rollout.”
Note that in our project, two to three of these models, trained on different years within a schema, were combined for the final predictions for increased robustness. If you would like to see this process formatted to run in a loop, with dynamic variables based on year, view our original documentation at https://github.com/IndOcc/IndOcc/blob/main/01_BuildRandomForestsIND.R and at https://github.com/IndOcc/IndOcc/blob/main/03_PredictionIND.R
Actual forest models are hosted at https://doi.org/10.18130/V3/6RUBPV.
Sample data for the purposes of illustration. Models and predictions are created using only variables available in every year and month of the CPS.
df_76 <- read.csv("https://raw.githubusercontent.com/IndOcc/IndOcc/main/vingette_sample_76.csv")
paged_table(head(df_76))
In this case, we will be training models to translate data from other years into 1976.
year_of_interest <- 1976
Create a flag vector marking which schema the ‘year_of_interest’ falls in. This will be used later in the crosswalk and modeling process.
year_ranges<- list(1971:1982, #1
                   1983:1991, #2
                   1992:2002, #3
                   2003:2008, #4
                   2009:2013, #5
                   2014:2019, #6
                   2020:2022) #7
in_year_range<- rep(FALSE, length(year_ranges))
for (x in seq_along(year_ranges)) {
  if (year_of_interest %in% year_ranges[[x]]){
    in_year_range[x]<- TRUE
  }
}
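The same flag vector can also be built in one line; a minimal equivalent sketch:
# TRUE only for the schema that contains year_of_interest
in_year_range <- vapply(year_ranges, function(r) year_of_interest %in% r, logical(1))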
The industry crosswalk was previously prepared by us. More information is available at https://github.com/IndOcc/CPScrosswalks.

### Load crosswalk

Load, clean, and display the crosswalk:
xwalk_name<- "https://raw.githubusercontent.com/IndOcc/CPScrosswalks/main/IND_crosswalk_FULL.csv"
crosswalk <- read.csv(xwalk_name)
crosswalk<- janitor::clean_names(crosswalk)
crosswalk<- crosswalk[-1]
paged_table(crosswalk)
Here, ‘from’ is the year schema that the ‘year_of_interest’ belongs to. It should follow the naming convention used in the crosswalk (i.e., ind/occ_BEGINNINGYEAR_ENDINGYEAR).
from <- '1971_1982'
This converts the crosswalk between 1976 and 2003 industry codes into a crosswalk with 1976 codes and one-hot-encoded 2003 industry codes.
explode_column<- c()
for (x in 1:7){
  if(in_year_range[x]){
    map_column<- crosswalk[x] # the schema column containing year_of_interest
  }
  if(!in_year_range[x]){
    explode_column<- append(explode_column, crosswalk[x]) # all other schema columns
  }
}
x <- 3 # explode_column's third entry is the 2003-2008 schema column
used_crosswalk<- data.frame(map_column, explode_column[x])
crosswalk_era<- names(explode_column[x])
used_crosswalk<- used_crosswalk %>% distinct() %>% mutate(value = 1) %>% spread(2, value, fill = 0, sep="_")
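Note that spread() is superseded in current tidyr; the one-hot step above could equivalently be written with pivot_wider() (a sketch, assuming the same column layout; not run):
used_crosswalk <- used_crosswalk %>%
  distinct() %>%
  mutate(value = 1) %>%
  pivot_wider(names_from = 2, values_from = value, values_fill = 0,
              names_prefix = paste0(crosswalk_era, "_"), names_sort = TRUE)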
# Tidy crosswalk column names: drop the first digit run followed by a space,
# then swap the first remaining space for an underscore
for (col in 1:ncol(crosswalk)){
  colnames(crosswalk)[col] <- sub("[[:digit:]]+ ", "", colnames(crosswalk)[col])
  colnames(crosswalk)[col] <- sub(" ", "_", colnames(crosswalk)[col])
}
paged_table(used_crosswalk)
In this case, we merge on the columns holding the 1976 industry code (‘IND’ in the sample data and ‘ind_1971_1982’ in the crosswalk):
init_merged<- merge(used_crosswalk, df_76, by.x="ind_1971_1982", by.y = "IND")
names(init_merged)[1]<-"IND"
init_merged$IND<- as.factor(init_merged$IND)
init_merged$SEX<- as.factor(init_merged$SEX)
init_merged$EDUC<- factor(init_merged$EDUC, order=TRUE, levels=c(0, 10,20,30,40,50,60,70,80,90,100,110,120))
init_merged$CLASSWKR<- as.factor(init_merged$CLASSWKR)
init_merged <- init_merged %>% select(-c('CPSIDP', 'YEAR', 'MONTH', 'OCC')) # drop identifiers and OCC, which are not used as predictors
paged_table(head(init_merged))
We used an 80/20 train/test split:
set.seed(18) #For reproducibility
nums <- sample(seq_len(nrow(init_merged)), size = floor(.8 * nrow(init_merged)))
train <- init_merged[nums,]
trainx <- train[-1]
test <- init_merged[-nums,]
testx <- test[-1]
ranger_forest <- ranger(IND~., data = train[complete.cases(train),], num.trees = 500)
## Growing trees.. Progress: 9%. Estimated remaining time: 5 minutes, 29 seconds.
## Growing trees.. Progress: 20%. Estimated remaining time: 4 minutes, 8 seconds.
## Growing trees.. Progress: 32%. Estimated remaining time: 3 minutes, 17 seconds.
## Growing trees.. Progress: 44%. Estimated remaining time: 2 minutes, 41 seconds.
## Growing trees.. Progress: 55%. Estimated remaining time: 2 minutes, 6 seconds.
## Growing trees.. Progress: 67%. Estimated remaining time: 1 minute, 32 seconds.
## Growing trees.. Progress: 79%. Estimated remaining time: 59 seconds.
## Growing trees.. Progress: 90%. Estimated remaining time: 26 seconds.
ranger_pred <- predict(ranger_forest, data = testx)
confusion <- confusionMatrix(ranger_pred$predictions, test$IND)
confusiontable <- as.data.frame(as.matrix(confusion))
by_class <- as.data.frame(confusion$byClass)
paged_table(confusiontable)
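For a quick overall read before the per-class results, caret’s confusionMatrix object also stores summary statistics:
# Overall accuracy and its 95% confidence interval
confusion$overall[c("Accuracy", "AccuracyLower", "AccuracyUpper")]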
The definition of ‘complexity’ can be found in the readme on our Libra repository, https://dataverse.lib.virginia.edu/file.xhtml?fileId=57998&version=2.0.
crosswalk2 <- crosswalk %>% select(all_of(c(paste0("ind_", from), crosswalk_era)))
crosswalk2 <- distinct(crosswalk2)
one_one_c2 <- crosswalk2 %>% group_by(across(2)) %>% count() # n = how many year_of_interest codes map to each other-era code
counts_other_year <- merge(crosswalk2, one_one_c2, all=TRUE)
factor_levels <- levels(train$IND)
check<- as.data.frame(table(true = test$IND, predicted = ranger_pred$predictions))
accuracy_by_gp<- check %>%
group_by(true) %>%
summarise(accuracy = Freq[true==predicted] / sum(Freq))
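Because this per-class accuracy conditions on the true class, it matches what caret reports as Sensitivity (recall). A quick cross-check, assuming both tables follow the same factor-level ordering:
# Per-class accuracy should line up with caret's Sensitivity column
head(cbind(accuracy_by_gp, caret_sensitivity = by_class$Sensitivity))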
max_value <- as.data.frame(rep(0, length = length(factor_levels))) # one slot per predicted class
names(max_value)[1] <- "Complexity"
for (i in seq_along(factor_levels)){
  val <- as.integer(paste(accuracy_by_gp$true[i]))
  single_thing <- counts_other_year %>% filter(counts_other_year[2] == val)
  max_value[i,] <- max(single_thing$n) # max n among matching rows (the complexity for this class)
}
comparison_table <- tibble(factor_levels, by_class, max_value)
paged_table(comparison_table)
A by-group results table is produced as well:
paged_table(check)
save(ranger_forest, file = paste0("forest_IND", year_of_interest,"from_",crosswalk_era,".RData"))
write.csv(comparison_table, paste0("results_table_", year_of_interest,"from_",crosswalk_era,"_IND.csv"), row.names = FALSE)
write.csv(check, paste0("groupresults_", year_of_interest,"from_",crosswalk_era,"_IND.csv"), row.names=FALSE)
write.csv(confusiontable, paste0("confusion_", year_of_interest,"from_",crosswalk_era,"_IND.csv"), row.names=FALSE)
This process follows much of the same format as the previous section, with slight modifications to account for the different data.
In this case we want to predict values for data from 2003. We will read in the 2003 data and filter it to only those responses that have an industry code provided.
df_03 <- read.csv("https://raw.githubusercontent.com/IndOcc/IndOcc/main/vingette_sample_03.csv", colClasses= c("CPSIDP" = 'character'))
ind <- df_03 %>% filter(IND != 0)
paged_table(head(ind))
The start year is the year you have data for. The end year is a year in the schema you want your data to be translated into (in this case, 2003 into 1976).
start_year <- 2003
end_year <- 1976
xwalk_name<- "https://raw.githubusercontent.com/IndOcc/CPScrosswalks/main/IND_crosswalk_FULL.csv"
crosswalk <- read.csv(xwalk_name)
crosswalk<- janitor::clean_names(crosswalk)
crosswalk<- crosswalk[-1] # drop the unneeded first column
paged_table(crosswalk)
Duplicate the industry column for use in the later merging stage:
dupe_IND <- ind %>% select(IND)
colnames(dupe_IND) <- c("dupe_IND")
ind <- cbind(ind, dupe_IND)
In this case, three years are defined in order to create more robust predictions. All three fall within the 1971-1982 schema.
from <- '2003_2008'
# These are the three years we'll pull models for
first <- 1976
second <- 1979
third <- 1982
set <- "1976.1982"
into <- '2003_2008'
from1 <- paste0("ind_", from)
crosswalk_subset <- crosswalk[c(from1)]
dupe_col <- crosswalk_subset[1]
names(dupe_col)[1] <- 'shadow' # duplicate of the source-schema codes, kept as the merge key
crosswalk_subset <- tibble(crosswalk_subset[1], dupe_col)
used_crosswalk<- crosswalk_subset %>% distinct %>% mutate(value = 1) %>% spread(1, value, fill = 0, sep="_")
paged_table(used_crosswalk)
init_merged<- merge(used_crosswalk, ind, by.x='shadow', by.y = "IND")
init_merged <- init_merged[-c(1)]
paged_table(head(init_merged))
Together, these three variables comprise a unique response ID. They will be used to match predictions to individuals.
filtered_key <- tibble(init_merged$CPSIDP, init_merged$YEAR, init_merged$MONTH)
colnames(filtered_key) <- c("CPSIDP", "YEAR", "MONTH")
paged_table(head(filtered_key))
orig <- tibble(init_merged$YEAR,
init_merged$MONTH,
init_merged$CPSIDP,
init_merged$dupe_IND,
init_merged$OCC,
init_merged$AGE,
init_merged$SEX,
init_merged$EDUC,
init_merged$CLASSWKR)
colnames(orig) <- c("YEAR", "MONTH", "CPSIDP", "Original.IND", "Original.OCC", "AGE", "SEX", "EDUC.recoded", "CLASSWKR.recoded")
paged_table(head(orig))
The models are hosted at https://doi.org/10.18130/V3/6RUBPV. The naming convention is a little confusing, but we can construct the names of the models we want by following this format:
model1 <- paste0("forest_IND", first, "from_ind_", into, ".RData")
model2 <- paste0("forest_IND", second, "from_ind_", into, ".RData")
model3 <- paste0("forest_IND", third, "from_ind_", into, ".RData")
print(model1)
## [1] "forest_IND1976from_ind_2003_2008.RData"
print(model2)
## [1] "forest_IND1979from_ind_2003_2008.RData"
print(model3)
## [1] "forest_IND1982from_ind_2003_2008.RData"
load(model1)
forest1 <- ranger_forest
load(model2)
forest2 <- ranger_forest
load(model3)
forest3 <- ranger_forest
Get the factor levels that will be predicted, convert our data to factors, and remove unused columns from the data.
first_fact <- ind %>% filter(IND != 0)
first_fact$IND<- as.factor(first_fact$IND)
ind_factors <- data.frame(levels(first_fact$IND))
ind_factors['factor_number'] <- seq.int(nrow(ind_factors))
init_merged$SEX<- as.factor(init_merged$SEX)
init_merged$EDUC<- as.factor(init_merged$EDUC)
init_merged$CLASSWKR<- as.factor(init_merged$CLASSWKR)
x_vals <- init_merged %>% select(-c('CPSIDP', 'YEAR', 'MONTH', 'OCC'))
forest1_preds <- predict(forest1, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 45%. Estimated remaining time: 37 seconds.
## Predicting.. Progress: 91%. Estimated remaining time: 6 seconds.
forest2_preds <- predict(forest2, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 44%. Estimated remaining time: 39 seconds.
## Predicting.. Progress: 89%. Estimated remaining time: 7 seconds.
forest3_preds <- predict(forest3, data = x_vals, predict.all = TRUE)
## Predicting.. Progress: 47%. Estimated remaining time: 34 seconds.
## Predicting.. Progress: 96%. Estimated remaining time: 2 seconds.
Merge the three sets of per-tree predictions into one dataframe:
pred_iters1 <- forest1_preds[["predictions"]]
pred_iters2 <- forest2_preds[["predictions"]]
pred_iters3 <- forest3_preds[["predictions"]]
all_preds <- cbind(pred_iters1, pred_iters2)
all_preds <- cbind(all_preds, pred_iters3)
# Order each response's predicted codes by vote count across all trees (majority vote)
f <- function(x) with(rle(sort(x)), values[order(lengths, decreasing = TRUE)])
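# Toy illustration (hypothetical input): f(c(2, 5, 5, 7)) returns c(5, 2, 7),
# i.e., values ordered by vote count, so the first element is the modal prediction.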
fn_test <- apply(all_preds, 1, f)
best_pred <- data.frame(unlist(map(fn_test, 1))) # keep only the modal code per response
best_pred$predicted_classification <- factor(best_pred$unlist.map.fn_test..1.., levels = ind_factors$factor_number, labels = ind_factors$levels.first_fact.IND.)
winner <- best_pred$predicted_classification
ct <- rowSums(all_preds == rep(best_pred$unlist.map.fn_test..1.., ncol(all_preds))) # number of trees that voted for the winner
pred_perc <- data.frame(winning_val = winner,
ct = ct)
# Generate certainty scores: the share of trees, across all three forests, that voted for the winner
pred_perc$certainty_score <- lapply(pred_perc$ct, function(x) round((x/ncol(all_preds)),4))
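As a quick sanity check (a sketch; ranger stores the tree count on each fitted object as ‘num.trees’), the denominator of the certainty score should equal the total number of trees across the three forests:
stopifnot(ncol(all_preds) == forest1$num.trees + forest2$num.trees + forest3$num.trees)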
winner_colname <- paste0("IND.Predicted.Value.", set)
certainty_colname <- paste0("IND.Prediction.Certainty.Score.", set)
reduce <- tibble(pred_perc$winning_val, pred_perc$certainty_score)
reduce <- data.frame(lapply(reduce, as.character), stringsAsFactors = FALSE)
colnames(reduce) <- c(winner_colname, certainty_colname)
filtered_key <- cbind(filtered_key, reduce)
paged_table(head(filtered_key))
Merge with the original data:
merged <- cbind(orig, filtered_key)
merged <- merged[-c(10,11,12)] # drop the duplicated key columns (CPSIDP, YEAR, MONTH) from filtered_key
paged_table(head(merged))
As you can see, each prediction in the 1976 schema, along with its certainty score, has now been appended to the original 2003 data.