# dat data
# col.from index of the first level column
# col.bin name of the first binary column
hier_props <- function(dat, col.from=2, col.bin) {
## Processes one hierarchical level
# d data without the already processed columns
# prev.categs categories from previous levels in this branch
# col.i current column's index
process_level <- function(d, prev.categs, col.i) {
categ.tab <- table(d[, 1])
remove.others <- F
# (possibly) introduce 'Others' for binary levels
if (col.i >= col.bin) {
# add zero for Others when only one category is present
if (length(categ.tab) == 1) {
categ.tab <- c(categ.tab, Others = 0)
remove.others <- T # flag Others for future removal
# keep only the most represented category and merged Others
} else if (length(categ.tab) > 2) {
i.max <- which.max(categ.tab)
categ.tab <- c(categ.tab[i.max], Others = sum(categ.tab[-i.max]))
d[d[, 1] != names(categ.tab)[1], 1] <- 'Others'
}
}
categ.tab <- round(categ.tab / nrow(d) * 100) # percentages
# create a list of list representations of rows for the current categories
now <- mapply(c,
list(as.list(prev.categs)),
names(categ.tab),
list(rep(NA, ncol(d)-1)),
categ.tab,
SIMPLIFY=F)
# change lists to data frames, set names and save them to the 'out' variable
# stored within parent function
out <<- c(out, lapply(now, \(x) setNames(as.data.frame(x),
c(names(dat), 'Perc'))))
# remove Others when only one category was present so that it is not used
# in the lower hierarchical level
if (remove.others) categ.tab <- categ.tab[names(categ.tab) != 'Others']
# go one hierarchicel level deeper
if (ncol(d) > 1) {
for (categ in names(categ.tab)) {
d.categ <- d[d[, 1]==categ, -1, drop=F]
process_level(d.categ, c(prev.categs, categ), col.i+1)
}
}
}
out <- list()
# keep only level columns
dat <- dat[, seq(col.from, ncol(dat))]
# name of the first 'binary' column
col.bin <- which(names(dat) == col.bin)
# start processing from the first column
process_level(dat, NULL, 1)
out <- do.call(rbind, out)
out <- out[order(rowSums(is.na(out)), decreasing=T), ]
rownames(out) <- seq_len(nrow(out))
out
}
### source
## https://stackoverflow.com/questions/75849228/proportion-distributions-at-different-hierarchical-levels