Proportions Down

Data

setwd("C:/Users/mvx13/Downloads")
dat= read.csv("moto2.csv")
dim(dat)
## [1] 8715   10
library(DT)
## Warning: package 'DT' was built under R version 4.2.2
datatable(
  dat, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel')
  )
)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Function

#  dat       data
#  col.from  index of the first level column
#  col.bin   name of the first binary column
hier_props <- function(dat, col.from=2, col.bin) {
  
  ## Processes one hierarchical level
  #  d            data without the already processed columns
  #  prev.categs  categories from previous levels in this branch
  #  col.i        current column's index
  process_level <- function(d, prev.categs, col.i) {
    
    categ.tab <- table(d[, 1])
    remove.others <- F
    
    # (possibly) introduce 'Others' for binary levels
    if (col.i >= col.bin) {
      
      # add zero for Others when only one category is present
      if (length(categ.tab) == 1) {
        
        categ.tab <- c(categ.tab, Others = 0)
        remove.others <- T   # flag Others for future removal
        
        # keep only the most represented category and merged Others
      } else if (length(categ.tab) > 2) {
        
        i.max <- which.max(categ.tab)
        categ.tab <- c(categ.tab[i.max], Others = sum(categ.tab[-i.max]))
        d[d[, 1] != names(categ.tab)[1], 1] <- 'Others'
      }
    }
    
    categ.tab <- round(categ.tab / nrow(d) * 100)  # percentages
    # create a list of list representations of rows for the current categories
    now <- mapply(c, 
                  list(as.list(prev.categs)), 
                  names(categ.tab), 
                  list(rep(NA, ncol(d)-1)),
                  categ.tab, 
                  SIMPLIFY=F)
    # change lists to data frames, set names and save them to the 'out' variable
    #   stored within parent function
    out <<- c(out, lapply(now, \(x) setNames(as.data.frame(x), 
                                             c(names(dat), 'Perc'))))
    # remove Others when only one category was present so that it is not used
    #   in the lower hierarchical level
    if (remove.others) categ.tab <- categ.tab[names(categ.tab) != 'Others']
    
    # go one hierarchicel level deeper
    if (ncol(d) > 1) {
      for (categ in names(categ.tab)) {
        
        d.categ <- d[d[, 1]==categ, -1, drop=F]
        process_level(d.categ, c(prev.categs, categ), col.i+1)
      }
    }
  }
  
  out <- list()
  # keep only level columns
  dat <- dat[, seq(col.from, ncol(dat))]
  # name of the first 'binary' column
  col.bin <- which(names(dat) == col.bin)
  # start processing from the first column
  process_level(dat, NULL, 1)
  
  out <- do.call(rbind, out)
  out <- out[order(rowSums(is.na(out)), decreasing=T), ]
  rownames(out) <- seq_len(nrow(out))
  out
}

### source
## https://stackoverflow.com/questions/75849228/proportion-distributions-at-different-hierarchical-levels

Output

out= hier_props(dat, col.bin='FHE')

datatable(
  out, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel')
  )
)