Description

This document performs exploratory data analysis on the radiomic features extracted from our three ultra-high-resolution claustrum datasets (BigBrain, Edlow and Lusebrink), each with two hemispheres (left and right) and three image types (original, Laplacian of Gaussian (σ = 0.5 mm), and Laplacian of Gaussian (σ = 1.0 mm).

In preparation for our end goal of PCA followed by spatially-constrained clustering, this exploration is designed to:

  1. Wrangle with the big datasets
  2. Summarise the data “feature space” (classes, counts, missingness)
  3. Flag outlier voxels using IQR-based detection
  4. Examine the correlation structure across feature classes and image types

Section 1 — Data wrangling

Load libraries

#list all the packages we will need
packages <- c(
  "tidyverse",    # data wrangling + ggplot2
  "corrplot",     # correlation matrices
  "ggcorrplot",   # ggplot2-style correlation matrices
  "scales",       # axis formatting
  "knitr",        # table rendering
  "kableExtra",   # styled tables
  "DT",           # interactive tables
  "RColorBrewer", # colour palettes
  "patchwork",    # combine ggplots
  "glue",         # string interpolation
  "plotly"        # hover ggplots 
)

#check what's installed
installed  <- rownames(installed.packages())

#find what needs to be installed
to_install <- setdiff(packages, installed)

#install missing packages
if (length(to_install) > 0) {
  message("Installing missing packages: ", paste(to_install, collapse = ", "))
  install.packages(to_install, repos = "https://cloud.r-project.org")
} else {
  message("All packages already installed.")
}

#load all packages
invisible(lapply(packages, library, character.only = TRUE))

#clean up env
rm(packages, installed, to_install)

Set file paths

#set main data directory (change to 'claustrum' in lab)
DATA_DIR <- "/Users/navona/OneDrive - UHN/projects/project10_claustrumParcellation_OS/output/1_radiomics_features/250um"

#define datasets and hemispheres
DATASETS <- c("bigbrain", "edlow", "lusebrink")
HEMIS    <- c("left", "right")

#create a named list of file paths, based on naming convention
files <- setNames(
  object = as.list(
    outer(DATASETS, HEMIS, function(ds, hemi)
      file.path(DATA_DIR, paste0(ds, "_", hemi, "_features_250um.csv"))
    )
  ),
  nm = as.vector(outer(DATASETS, HEMIS, paste, sep = "_"))
)

Parse the data

#identify meta-data, ie vars not Pyradomics features
META_COLS <- c("x", "y", "z", "y_global", "dist_to_boundary")

#identify feaeture classes of Pyradiomics features
FEATURE_CLASSES <- c("firstorder", "glcm", "glrlm", "glszm", "ngtdm", "gldm")

#specify 3 image types
IMAGE_TYPES <- c(
  "original"  = "", #no prefix
  "log05"     = "log-sigma-0-5-mm-3D_",
  "log10"     = "log-sigma-1-0-mm-3D_"
)

#colour scheme for datasets
DATASET_COLOURS <- c(
  "Bigbrain"   = "darkgreen",
  "Edlow"      = "blue",
  "Lusebrink"  = "purple"
)

#helper function for feature class from col names
get_feature_class <- function(col_name) {
  case_when(
    str_detect(col_name, "firstorder") ~ "firstorder",
    str_detect(col_name, "glcm")       ~ "glcm",
    str_detect(col_name, "glrlm")      ~ "glrlm",
    str_detect(col_name, "glszm")      ~ "glszm",
    str_detect(col_name, "ngtdm")      ~ "ngtdm",
    str_detect(col_name, "gldm")       ~ "gldm",
    TRUE                               ~ "metadata"
  )
}

#helper function to get image type from col names
get_image_type <- function(col_name) {
  case_when(
    str_detect(col_name, "log-sigma-0-5") ~ "log05",
    str_detect(col_name, "log-sigma-1-0") ~ "log10",
    TRUE                                  ~ "original"
  )
}

Load data

#read 6 csvs and tag each with dataset and hemi labels 
df_list <- imap(files, function(path, key) {
  parts <- str_split(key, "_")[[1]]
  read_csv(path, show_col_types = FALSE) |>
    mutate(
      dataset    = parts[1],
      hemisphere = parts[2],
      .before    = everything()
    )
})

#OPTIONAL: MAKE SMALL FOR TESTING SO FAST - MAKE SURE TO COMMENT OUT IN FINAL
#df_list <- map(df_list, ~ slice_sample(.x, n = 500))

#also combine 6 dfs into one long df, useful for cross-dataset comparisons
df_combined <- bind_rows(df_list)

Section 2 — Data Summary

Goal: Understand the structure of our Pyradomics data, i.e., the “feature space”. This is so that we can make smart decisions for our analysis, and also make sure that nothing has gone wrong with the data extraction. The tabs below show many features we have per class and image type, how many voxels per dataset, and how much missingness exists.

Next step: Check out the variable(s) with large proportions of missing data to understand what they are (namely, JointAverage).

Overview Table

#count voxels in each dataset
voxel_counts <- imap_dfr(df_list, function(df, key) { #like lapply, from purrr
  parts <- str_split(key, "_")[[1]]
  tibble(
    Dataset    = str_to_title(parts[1]), #make capital
    Hemisphere = str_to_title(parts[2]),
    Voxels     = nrow(df),
    Features   = ncol(df) - length(META_COLS) - 2  #subtract meta and dataset/hemi cols
  )
})

#make a pretty table
voxel_counts |>
  kbl(caption = "Voxel and feature counts per file") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) |>
  column_spec(3:4, bold = TRUE)
Voxel and feature counts per file
Dataset Hemisphere Voxels Features
Bigbrain Left 84466 276
Edlow Left 90943 276
Lusebrink Left 107879 276
Bigbrain Right 76505 276
Edlow Right 92731 276
Lusebrink Right 94890 276

Feature Class Breakdown

#build a feature inventory from the colnames of first file (all files share the same structure)
feature_cols <- names(df_list[[1]]) |>
  setdiff(c(META_COLS, "dataset", "hemisphere"))

#pull out image type and feature class
feature_inventory <- tibble(column = feature_cols) |>
  mutate(
    image_type = get_image_type(column),
    feature_class = get_feature_class(column)
  )

#make summary table of features by class and image type 
class_summary <- feature_inventory |>
  count(image_type, feature_class) |>
  pivot_wider(names_from = image_type, values_from = n, values_fill = 0) |>
  select(feature_class, original, log05, log10) |>
  mutate(feature_class = factor(feature_class, 
                                levels = unique(feature_inventory$feature_class))) |>
  arrange(feature_class)

#make table have nice formatting
class_summary |>
  rename(
    `Feature Class` = feature_class,
    `Original`      = original,
    `LoG σ=0.5mm`  = log05,
    `LoG σ=1.0mm`  = log10
  ) |>
  kbl(caption = "Number of features per class and image type") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) |>
  add_header_above(c(" " = 1, "Image Type" = 3)) 
Number of features per class and image type
Image Type
Feature Class Original LoG σ=0.5mm LoG σ=1.0mm
firstorder 18 18 18
glcm 23 23 23
glrlm 16 16 16
glszm 16 16 16
ngtdm 5 5 5
gldm 14 14 14

Missing Values

#get NA counts per feature class and dataset
na_summary <- imap_dfr(df_list, function(df, key) {
  parts   <- str_split(key, "_")[[1]]
  feat_df <- df |> select(all_of(feature_cols))

  feat_df |>
    summarise(across(everything(), ~ sum(is.na(.)))) |>
    pivot_longer(everything(), names_to = "column", values_to = "n_na") |>
    left_join(feature_inventory, by = "column") |>
    group_by(image_type, feature_class) |>
    summarise(
      n_features_with_na = sum(n_na > 0),
      total_na_cells      = sum(n_na),
      .groups = "drop"
    ) |>
    mutate(
      dataset    = str_to_title(parts[1]),
      hemisphere = str_to_title(parts[2])
    )
})

#make a summary table - SORTABLE!
na_summary |>
  filter(total_na_cells > 0) |>
  arrange(dataset, hemisphere, image_type, feature_class) |>
  select(Dataset = dataset, Hemisphere = hemisphere,
         `Image Type` = image_type, `Feature Class` = feature_class,
         `Features with NAs` = n_features_with_na,
         `Total NA Cells` = total_na_cells) |>
  datatable(
    caption  = "Missing values by dataset, class, and image type (non-zero only)",
    options  = list(pageLength = 30, scrollX = TRUE),
    rownames = FALSE
  )
#calculate NA proportion per feature (pooled across datasets) 
na_per_feature <- imap_dfr(df_list, function(df, key) {
  parts <- str_split(key, "_")[[1]]
  df |>
    select(all_of(feature_cols)) |>
    summarise(across(everything(), ~ mean(is.na(.)))) |>
    pivot_longer(everything(), names_to = "column", values_to = "prop_na") |>
    mutate(
      dataset    = str_to_title(parts[1]),
      hemisphere = str_to_title(parts[2])
    )
}) |>
  left_join(feature_inventory, by = "column") |>
  group_by(column, dataset, image_type, feature_class) |>
  summarise(prop_na = mean(prop_na), .groups = "drop")  # average across hemispheres

#interactive heatmap with plotly - 
#show any variables that has any amount of NA
na_per_feature |>
  #filter(prop_na > 0) |>
  filter(column %in% na_per_feature$column[na_per_feature$prop_na > 0]) |> #keep rows constant
  mutate(
    image_type    = factor(image_type,
                           levels = c("original", "log05", "log10"),
                           labels = c("Original", "LoG σ=0.5mm", "LoG σ=1.0mm")),
    feature_short = str_remove(column, "^log-sigma-[0-9]-[0-9]-mm-3D_"),  # shorter label
    hover_text = glue::glue("{column}<br>Dataset: {dataset}<br>% Missing: {scales::percent(prop_na, accuracy = 0.1)}")
  ) |>
  plot_ly(
    x    = ~ dataset, #image_type
    y    = ~ column,
    z    = ~ prop_na * 100,
    text = ~ hover_text,
    type        = "heatmap",
    hoverinfo   = "text",
    colorscale  = list(c(0, "#fffde7"), c(1, "#d32f2f")),
    colorbar    = list(title = "% Missing")
  ) |>
  layout(
    title  = "Proportion of missing values per feature",
    xaxis = list(title = "Dataset"),
    yaxis  = list(title = "", ticks = "",  showticklabels = FALSE)
  )

Section 3 — Outlier Detection

Goal: Here, we want to identify “outliers”, i.e., voxels with extreme feature values that could distort PCA. Features are Z-scored (mean=0, SD=1) within each dataset before plotting, putting all features on a comparable scale. Outliers are shown in red. We use the IQR (boxplot) method for outlier detection, i.e., a value is flagged as an outlier if it falls below Q1 − 1.5×IQR or above Q3 + 1.5×IQR (put otherwise, a value is an outlier if it falls beyond the whiskers of the boxplot). This method is quite conservative for our feature space, so it’s just for visulization purposes: it’s not clear yet if we want to remove these outliers, as outliers in feature space almost always reflect real image properties, not measurement error - the idea is just to understand how many outliers we have.

Next step: Identify the features, image types, and datasets that have only a small number of wildly abnormal outliers. Compare across datasets and image types to determine if this ought to be ‘expected’. Understand what these outliers ‘mean’ for a given feature. Determine if there is a spatial pattern to the outliers (e.g., are they all at edges, etc). Make a short list of voxels to remove from the analysis of certain features; we can run a ‘sensitivity analysis’, i.e., our PCA and clustering with both these outlying voxels include and excluded.

# z-score all features within each dataset
df_scaled <- imap_dfr(df_list, function(df, key) {
  parts <- str_split(key, "_")[[1]]
  df |>
    select(all_of(META_COLS), all_of(feature_cols)) |>
    mutate(across(all_of(feature_cols), ~ scale(.x)[,1])) |>
    mutate(
      dataset    = str_to_title(parts[1]),
      hemisphere = str_to_title(parts[2])
    )
})
#helper: ggplot boxplot for one feature class and image type
plot_feature_boxplots <- function(feat_class, img_type, img_label) {
  
  cols_to_plot <- feature_inventory |>
    filter(feature_class == feat_class, image_type == img_type) |>
    pull(column)
  
  df_scaled |>
    select(dataset, hemisphere, all_of(cols_to_plot)) |>
    pivot_longer(all_of(cols_to_plot), names_to = "column", values_to = "value") |>
    mutate(
      feature_short = str_remove(column, "^log-sigma-[0-9]-[0-9]-mm-3D_") |>
                      str_remove(paste0(feat_class, "_")),
      hemisphere    = factor(hemisphere, levels = c("Left", "Right")),
      dataset       = factor(dataset, levels = names(DATASET_COLOURS))
    ) |>
    ggplot(aes(x = feature_short, y = value, fill = dataset)) +
    geom_boxplot(outlier.size = 1, outlier.alpha = 0.3, linewidth = 0.3, outlier.colour = "red") +
    facet_grid(hemisphere ~ .) +
    scale_fill_manual(values = DATASET_COLOURS, name = "Dataset") +
    labs(
      title = glue::glue("{feat_class} — {img_label}"),
      x     = NULL,
      y     = "Z-score"
    ) +
    theme_minimal(base_size = 11) +
    theme(
      axis.text.x     = element_text(angle = 45, hjust = 1, size = 8),
      strip.text      = element_text(face = "bold"),
      legend.position = "top"
    )
}

firstorder

Original

plot_feature_boxplots("firstorder", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("firstorder", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("firstorder", "log10", "LoG σ=1.0mm")

glcm

Original

plot_feature_boxplots("glcm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("glcm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("glcm", "log10", "LoG σ=1.0mm")

glrlm

Original

plot_feature_boxplots("glrlm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("glrlm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("glrlm", "log10", "LoG σ=1.0mm")

glszm

Original

plot_feature_boxplots("glszm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("glszm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("glszm", "log10", "LoG σ=1.0mm")

ngtdm

Original

plot_feature_boxplots("ngtdm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("ngtdm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("ngtdm", "log10", "LoG σ=1.0mm")

gldm

Original

plot_feature_boxplots("gldm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("gldm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("gldm", "log10", "LoG σ=1.0mm")


Section 4 — Correlation Structure

Goal: Examine the correlation structure across features to understand redundancy before PCA. We compute Spearman rank correlation — robust to the non-normal, right-skewed distributions seen in Section 3. A random sample of 2000 voxels per dataset is used for speed (compare to the total voxel totals in Section 1); however, correlation estimates are stable at this sample size. High correlation (|r| > 0.9) between two features means they carry nearly identical information. This will not cause us to drop features before PCA — but it will help interpret PCA loadings later, as correlated features will load onto the same principal components.

Next step: Understand if feature classes are highly correlated among themselves. Understand what features / feature classes are negatively correlated. Examine if the image types appear to have similar correlation patterns.

#compute spearman correlation matrix on a random sample of voxels
compute_corr <- function(df, cols, n_sample = NULL, seed = 123) {
  set.seed(seed)
  df_sub <- df |>
    select(all_of(cols)) |>
    select(where(~ var(.x, na.rm = TRUE) > 0))
  if (!is.null(n_sample)) {
    df_sub <- slice_sample(df_sub, n = min(n_sample, nrow(df_sub)))
  }
  cor(df_sub, use = "pairwise.complete.obs", method = "spearman")
}

#plot correlation matrix as heatmap
plot_corr <- function(corr_mat, title) {
  
  # get feature order from inventory, preserving class grouping
  feature_order <- feature_inventory |>
    filter(column %in% colnames(corr_mat)) |>
    arrange(feature_class) |>
    mutate(f_short = str_remove(column, "^log-sigma-[0-9]-[0-9]-mm-3D_"))
  
  corr_df <- corr_mat |>
    as.data.frame() |>
    rownames_to_column("feature1") |>
    pivot_longer(-feature1, names_to = "feature2", values_to = "r") |>
    mutate(
      f1_short = str_remove(feature1, "^log-sigma-[0-9]-[0-9]-mm-3D_"),
      f2_short = str_remove(feature2, "^log-sigma-[0-9]-[0-9]-mm-3D_"),
      f1_short = factor(f1_short, levels = feature_order$f_short),
      f2_short = factor(f2_short, levels = feature_order$f_short)
    )
  
  #heatmap
  p_main <- corr_df |>
    ggplot(aes(x = f2_short, y = f1_short, fill = r)) +
    geom_tile() +
    scale_fill_gradient2(
      low      = "#d73027",
      mid      = "white",
      high     = "#1a9850",
      midpoint = 0,
      limits   = c(-1, 1),
      name     = "Spearman r"
    ) +
    labs(title = title, x = NULL, y = NULL) +
    theme_minimal(base_size = 10) +
    theme(
      axis.text  = element_blank(),
      axis.ticks = element_blank()
    )
  
  #feature class strip along x axis
  strip_df <- feature_order |>
    mutate(f_short = factor(f_short, levels = feature_order$f_short))
  
  p_strip_x <- strip_df |>
    ggplot(aes(x = f_short, y = 1, fill = feature_class)) +
    geom_tile() +
    scale_fill_brewer(palette = "Set2", name = "Feature Class") +
    theme_void() +
    theme(
      legend.position = "bottom",
      legend.title    = element_text(size = 9),
      legend.text     = element_text(size = 8)
    )
  
  #feature class strip along y axis
  p_strip_y <- strip_df |>
    ggplot(aes(y = f_short, x = 1, fill = feature_class)) +
    geom_tile() +
    scale_fill_brewer(palette = "Set2", guide = "none") +
    theme_void()
  
  #combine with patchwork
  top_row    <- (p_strip_y | p_main)   + plot_layout(widths  = c(1, 40))
  bottom_row <- (plot_spacer() | p_strip_x) + plot_layout(widths  = c(1, 40))
  top_row / bottom_row + plot_layout(heights = c(20, 1))
}
#pre-compute all correlation matrices once
#one matrix per dataset x hemisphere x image type
#note this is run on a fraction of the data, 2000 voxels, just for visualization
#ie correlation values won't be exact but likely close enough
corr_matrices <- imap(df_list, function(df, key) {
  map(c("original", "log05", "log10"), function(img_type) {
    cols <- feature_inventory |>
      filter(image_type == img_type) |>
      pull(column)
    compute_corr(df, cols, n_sample=2000)
  }) |> set_names(c("original", "log05", "log10"))
})

BigBrain

Left

Original

plot_corr(corr_matrices[["bigbrain_left"]][["original"]],
          "BigBrain Left — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["bigbrain_left"]][["log05"]],
          "BigBrain Left — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["bigbrain_left"]][["log10"]],
          "BigBrain Left — LoG σ=1.0mm (Spearman, n=500)")

Edlow

Left

Original

plot_corr(corr_matrices[["edlow_left"]][["original"]],
          "Edlow Left — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["edlow_left"]][["log05"]],
          "Edlow Left — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["edlow_left"]][["log10"]],
          "Edlow Left — LoG σ=1.0mm (Spearman, n=500)")

Right

Original

plot_corr(corr_matrices[["edlow_right"]][["original"]],
          "Edlow Right — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["edlow_right"]][["log05"]],
          "Edlow Right — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["edlow_right"]][["log10"]],
          "Edlow Right — LoG σ=1.0mm (Spearman, n=500)")

Lüsebrink

Left

Original

plot_corr(corr_matrices[["lusebrink_left"]][["original"]],
          "Lüsebrink Left — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["lusebrink_left"]][["log05"]],
          "Lüsebrink Left — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["lusebrink_left"]][["log10"]],
          "Lüsebrink Left — LoG σ=1.0mm (Spearman, n=500)")

Right

Original

plot_corr(corr_matrices[["lusebrink_right"]][["original"]],
          "Lüsebrink Right — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["lusebrink_right"]][["log05"]],
          "Lüsebrink Right — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["lusebrink_right"]][["log10"]],
          "Lüsebrink Right — LoG σ=1.0mm (Spearman, n=500)")


Section 5 — NIfTI Export

Goal: Write out two voxel-level .csvs for downstream .nii reconstruction in Python: (1) a missingness map showing the count of NA features (out of all 278) at each voxel, and (2) an outlier map showing the count of IQR-flagged outlier features at each voxel (maximum possible 278. Note: both .csvs also carry a dominant_class column (the feature class contributing the most NAs/outliers) as a secondary diagnostic, in case a separate class-coded .nii is useful later, but the primary NIfTI value for both maps is the count.

#flag whether a value is an outlier by IQR rule
is_iqr_outlier <- function(x) {
  q   <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  iqr <- q[2] - q[1]
  x < (q[1] - 1.5 * iqr) | x > (q[2] + 1.5 * iqr)
}
#assign an integer code to each feature class, used to colour the outlier map
#e.g. firstorder = 1, glcm = 2, glrlm = 3, glszm = 4, ngtdm = 5, gldm = 6
class_codes <- setNames(seq_along(FEATURE_CLASSES), FEATURE_CLASSES)

#class_codes |>
  #enframe(name = "feature_class", value = "code") |>
  #kbl(caption = "If used later: feature class integer code (for NIfTI colour-coding)") |>
  #kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Missingness Map

#missingness map: count of how many features (out of all 278) are NA at each voxel
#dominant_class = feature class contributing the most NAs (secondary diagnostic)
missing_map <- imap_dfr(df_list, function(df, key) {
  parts   <- str_split(key, "_")[[1]]
  feat_df <- df |> select(all_of(feature_cols))
  na_flags <- map_dfc(feat_df, is.na)

  #count NAs per voxel, per feature class
  class_counts <- map_dfc(FEATURE_CLASSES, function(cls) {
    cls_cols <- feature_inventory |> filter(feature_class == cls) |> pull(column)
    tibble("{cls}" := rowSums(na_flags[, cls_cols], na.rm = TRUE))
  })

  #dominant class = highest NA count; 0 if no NAs at all; ties go to first listed in FEATURE_CLASSES
  dominant_class <- apply(class_counts, 1, function(row) {
    if (sum(row) == 0) return(0)
    class_codes[[FEATURE_CLASSES[which.max(row)]]]
  })

  tibble(
    x                   = df$x,
    y                   = df$y,
    z                   = df$z,
    y_global            = df$y_global,
    dataset             = str_to_title(parts[1]),
    hemisphere          = str_to_title(parts[2]),
    n_missing_features  = rowSums(na_flags, na.rm = TRUE),
    dominant_class      = dominant_class
  )
})

#write out, datestamped
EXPORT_DIR <- "/Users/navona/Library/CloudStorage/OneDrive-UHN/projects/project10_claustrumParcellation_OS/output/2_preprocessing"
missing_map_path <- file.path(EXPORT_DIR, glue::glue("missing_map_{format(Sys.Date(), '%Y%m%d')}.csv"))
write_csv(missing_map, missing_map_path)

#quick summary so we can sanity check before handing off to python
missing_map |>
  group_by(dataset, hemisphere) |>
  summarise(
    voxels             = n(),
    voxels_w_any_na    = sum(n_missing_features > 0),
    pct_voxels_w_na    = scales::percent(mean(n_missing_features > 0), accuracy = 0.1),
    mean_n_missing     = round(mean(n_missing_features), 2),
    max_n_missing      = max(n_missing_features),
    .groups = "drop"
  ) |>
  kbl(caption = glue::glue("Missingness summary (written to {basename(missing_map_path)})")) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Missingness summary (written to missing_map_20260620.csv)
dataset hemisphere voxels voxels_w_any_na pct_voxels_w_na mean_n_missing max_n_missing
Bigbrain Left 84466 7414 8.8% 0.33 120
Bigbrain Right 76505 7660 10.0% 0.35 120
Edlow Left 90943 924 1.0% 0.03 3
Edlow Right 92731 881 1.0% 0.03 3
Lusebrink Left 107879 742 0.7% 0.02 3
Lusebrink Right 94890 548 0.6% 0.02 3
#secondary diagnostic: which feature class is driving the missingness, per dataset/hemisphere
missing_map |>
  mutate(dominant_class_name = c("none", FEATURE_CLASSES)[dominant_class + 1]) |>
  count(dataset, hemisphere, dominant_class_name) |>
  pivot_wider(names_from = dominant_class_name, values_from = n, values_fill = 0) |>
  kbl(caption = "Dominant missingness class counts (secondary diagnostic)") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Dominant missingness class counts (secondary diagnostic)
dataset hemisphere firstorder glcm none
Bigbrain Left 22 7392 77052
Bigbrain Right 24 7636 68845
Edlow Left 0 924 90019
Edlow Right 0 881 91850
Lusebrink Left 0 742 107137
Lusebrink Right 0 548 94342

Outlier Map

#outlier map: dominant feature class per voxel, by IQR-flagged count
outlier_map <- imap_dfr(df_list, function(df, key) {
  parts   <- str_split(key, "_")[[1]]
  feat_df <- df |> select(all_of(feature_cols))
  flags   <- map_dfc(feat_df, is_iqr_outlier)

  #count outliers per voxel, per feature class
  class_counts <- map_dfc(FEATURE_CLASSES, function(cls) {
    cls_cols <- feature_inventory |> filter(feature_class == cls) |> pull(column)
    tibble("{cls}" := rowSums(flags[, cls_cols], na.rm = TRUE))
  })

  #dominant class = highest count; 0 if no outliers at all; ties go to first listed in FEATURE_CLASSES
  dominant_class <- apply(class_counts, 1, function(row) {
    if (sum(row) == 0) return(0)
    class_codes[[FEATURE_CLASSES[which.max(row)]]]
  })

  tibble(
    x                   = df$x,
    y                   = df$y,
    z                   = df$z,
    y_global            = df$y_global,
    dataset             = str_to_title(parts[1]),
    hemisphere          = str_to_title(parts[2]),
    n_outlier_features  = rowSums(flags, na.rm = TRUE),
    dominant_class      = dominant_class
  )
})

#write out, datestamped
outlier_map_path <- file.path(EXPORT_DIR, glue::glue("outlier_map_{format(Sys.Date(), '%Y%m%d')}.csv"))
write_csv(outlier_map, outlier_map_path)

#quick summary so we can sanity check before handing off to python
outlier_map |>
  group_by(dataset, hemisphere) |>
  summarise(
    voxels             = n(),
    voxels_w_any_outlier = sum(n_outlier_features > 0),
    pct_voxels_w_outlier = scales::percent(mean(n_outlier_features > 0), accuracy = 0.1),
    mean_n_outlier     = round(mean(n_outlier_features), 2),
    max_n_outlier      = max(n_outlier_features),
    .groups = "drop"
  ) |>
  kbl(caption = glue::glue("Outlier summary (written to {basename(outlier_map_path)})")) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Outlier summary (written to outlier_map_20260620.csv)
dataset hemisphere voxels voxels_w_any_outlier pct_voxels_w_outlier mean_n_outlier max_n_outlier
Bigbrain Left 84466 74103 87.7% 10.32 107
Bigbrain Right 76505 65684 85.9% 11.03 115
Edlow Left 90943 79995 88.0% 10.16 108
Edlow Right 92731 81290 87.7% 10.54 88
Lusebrink Left 107879 91961 85.2% 9.11 93
Lusebrink Right 94890 79448 83.7% 8.80 88
#secondary diagnostic: which feature class is driving the outliers, per dataset/hemisphere
outlier_map |>
  mutate(dominant_class_name = c("none", FEATURE_CLASSES)[dominant_class + 1]) |>
  count(dataset, hemisphere, dominant_class_name) |>
  pivot_wider(names_from = dominant_class_name, values_from = n, values_fill = 0) |>
  kbl(caption = "Dominant outlier class counts (secondary diagnostic)") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Dominant outlier class counts (secondary diagnostic)
dataset hemisphere firstorder glcm gldm glrlm glszm ngtdm none
Bigbrain Left 12627 26031 4296 12290 16419 2440 10363
Bigbrain Right 11174 25821 3803 9161 13691 2034 10821
Edlow Left 13907 30872 3911 11000 17718 2587 10948
Edlow Right 9064 34822 4001 13215 17702 2486 11441
Lusebrink Left 8985 36680 6346 12930 24342 2678 15918
Lusebrink Right 8099 32204 5590 10346 20779 2430 15442