Description

This document performs exploratory data analysis on the radiomic features extracted from our three ultra-high-resolution claustrum datasets (BigBrain, Edlow and Lusebrink), each with two hemispheres (left and right) and three image types (original, Laplacian of Gaussian (σ = 0.5 mm), and Laplacian of Gaussian (σ = 1.0 mm).

In preparation for our end goal of PCA followed by spatially-constrained clustering, this exploration is designed to:

Wrangle with the big datasets
Summarise the data “feature space” (classes, counts, missingness)
Flag outlier voxels using IQR-based detection
Examine the correlation structure across feature classes and image types

Section 1 — Data wrangling

Load libraries

#list all the packages we will need
packages <- c(
  "tidyverse",    # data wrangling + ggplot2
  "corrplot",     # correlation matrices
  "ggcorrplot",   # ggplot2-style correlation matrices
  "scales",       # axis formatting
  "knitr",        # table rendering
  "kableExtra",   # styled tables
  "DT",           # interactive tables
  "RColorBrewer", # colour palettes
  "patchwork",    # combine ggplots
  "glue",         # string interpolation
  "plotly"        # hover ggplots 
)

#check what's installed
installed  <- rownames(installed.packages())

#find what needs to be installed
to_install <- setdiff(packages, installed)

#install missing packages
if (length(to_install) > 0) {
  message("Installing missing packages: ", paste(to_install, collapse = ", "))
  install.packages(to_install, repos = "https://cloud.r-project.org")
} else {
  message("All packages already installed.")
}

#load all packages
invisible(lapply(packages, library, character.only = TRUE))

#clean up env
rm(packages, installed, to_install)

Set file paths

#set main data directory (change to 'claustrum' in lab)
DATA_DIR <- "/Users/navona/OneDrive - UHN/projects/project10_claustrumParcellation_OS/output/1_radiomics_features/250um"

#define datasets and hemispheres
DATASETS <- c("bigbrain", "edlow", "lusebrink")
HEMIS    <- c("left", "right")

#create a named list of file paths, based on naming convention
files <- setNames(
  object = as.list(
    outer(DATASETS, HEMIS, function(ds, hemi)
      file.path(DATA_DIR, paste0(ds, "_", hemi, "_features_250um.csv"))
    )
  ),
  nm = as.vector(outer(DATASETS, HEMIS, paste, sep = "_"))
)

Parse the data

#identify meta-data, ie vars not Pyradomics features
META_COLS <- c("x", "y", "z", "y_global", "dist_to_boundary")

#identify feaeture classes of Pyradiomics features
FEATURE_CLASSES <- c("firstorder", "glcm", "glrlm", "glszm", "ngtdm", "gldm")

#specify 3 image types
IMAGE_TYPES <- c(
  "original"  = "", #no prefix
  "log05"     = "log-sigma-0-5-mm-3D_",
  "log10"     = "log-sigma-1-0-mm-3D_"
)

#colour scheme for datasets
DATASET_COLOURS <- c(
  "Bigbrain"   = "darkgreen",
  "Edlow"      = "blue",
  "Lusebrink"  = "purple"
)

#helper function for feature class from col names
get_feature_class <- function(col_name) {
  case_when(
    str_detect(col_name, "firstorder") ~ "firstorder",
    str_detect(col_name, "glcm")       ~ "glcm",
    str_detect(col_name, "glrlm")      ~ "glrlm",
    str_detect(col_name, "glszm")      ~ "glszm",
    str_detect(col_name, "ngtdm")      ~ "ngtdm",
    str_detect(col_name, "gldm")       ~ "gldm",
    TRUE                               ~ "metadata"
  )
}

#helper function to get image type from col names
get_image_type <- function(col_name) {
  case_when(
    str_detect(col_name, "log-sigma-0-5") ~ "log05",
    str_detect(col_name, "log-sigma-1-0") ~ "log10",
    TRUE                                  ~ "original"
  )
}

Load data

#read 6 csvs and tag each with dataset and hemi labels 
df_list <- imap(files, function(path, key) {
  parts <- str_split(key, "_")[[1]]
  read_csv(path, show_col_types = FALSE) |>
    mutate(
      dataset    = parts[1],
      hemisphere = parts[2],
      .before    = everything()
    )
})

#OPTIONAL: MAKE SMALL FOR TESTING SO FAST - MAKE SURE TO COMMENT OUT IN FINAL
#df_list <- map(df_list, ~ slice_sample(.x, n = 500))

#also combine 6 dfs into one long df, useful for cross-dataset comparisons
df_combined <- bind_rows(df_list)

Section 2 — Data Summary

Goal: Understand the structure of our Pyradomics data, i.e., the “feature space”. This is so that we can make smart decisions for our analysis, and also make sure that nothing has gone wrong with the data extraction. The tabs below show many features we have per class and image type, how many voxels per dataset, and how much missingness exists.

Next step: Check out the variable(s) with large proportions of missing data to understand what they are (namely, JointAverage).

Overview Table

#count voxels in each dataset
voxel_counts <- imap_dfr(df_list, function(df, key) { #like lapply, from purrr
  parts <- str_split(key, "_")[[1]]
  tibble(
    Dataset    = str_to_title(parts[1]), #make capital
    Hemisphere = str_to_title(parts[2]),
    Voxels     = nrow(df),
    Features   = ncol(df) - length(META_COLS) - 2  #subtract meta and dataset/hemi cols
  )
})

#make a pretty table
voxel_counts |>
  kbl(caption = "Voxel and feature counts per file") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) |>
  column_spec(3:4, bold = TRUE)

Voxel and feature counts per file
Dataset	Hemisphere	Voxels	Features
Bigbrain	Left	84466	276
Edlow	Left	90943	276
Lusebrink	Left	107879	276
Bigbrain	Right	76505	276
Edlow	Right	92731	276
Lusebrink	Right	94890	276

Feature Class Breakdown

#build a feature inventory from the colnames of first file (all files share the same structure)
feature_cols <- names(df_list[[1]]) |>
  setdiff(c(META_COLS, "dataset", "hemisphere"))

#pull out image type and feature class
feature_inventory <- tibble(column = feature_cols) |>
  mutate(
    image_type = get_image_type(column),
    feature_class = get_feature_class(column)
  )

#make summary table of features by class and image type 
class_summary <- feature_inventory |>
  count(image_type, feature_class) |>
  pivot_wider(names_from = image_type, values_from = n, values_fill = 0) |>
  select(feature_class, original, log05, log10) |>
  mutate(feature_class = factor(feature_class, 
                                levels = unique(feature_inventory$feature_class))) |>
  arrange(feature_class)

#make table have nice formatting
class_summary |>
  rename(
    `Feature Class` = feature_class,
    `Original`      = original,
    `LoG σ=0.5mm`  = log05,
    `LoG σ=1.0mm`  = log10
  ) |>
  kbl(caption = "Number of features per class and image type") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) |>
  add_header_above(c(" " = 1, "Image Type" = 3))

Number of features per class and image type
	Image Type
Feature Class	Original	LoG σ=0.5mm	LoG σ=1.0mm
firstorder	18	18	18
glcm	23	23	23
glrlm	16	16	16
glszm	16	16	16
ngtdm	5	5	5
gldm	14	14	14

Missing Values

#get NA counts per feature class and dataset
na_summary <- imap_dfr(df_list, function(df, key) {
  parts   <- str_split(key, "_")[[1]]
  feat_df <- df |> select(all_of(feature_cols))

  feat_df |>
    summarise(across(everything(), ~ sum(is.na(.)))) |>
    pivot_longer(everything(), names_to = "column", values_to = "n_na") |>
    left_join(feature_inventory, by = "column") |>
    group_by(image_type, feature_class) |>
    summarise(
      n_features_with_na = sum(n_na > 0),
      total_na_cells      = sum(n_na),
      .groups = "drop"
    ) |>
    mutate(
      dataset    = str_to_title(parts[1]),
      hemisphere = str_to_title(parts[2])
    )
})

#make a summary table - SORTABLE!
na_summary |>
  filter(total_na_cells > 0) |>
  arrange(dataset, hemisphere, image_type, feature_class) |>
  select(Dataset = dataset, Hemisphere = hemisphere,
         `Image Type` = image_type, `Feature Class` = feature_class,
         `Features with NAs` = n_features_with_na,
         `Total NA Cells` = total_na_cells) |>
  datatable(
    caption  = "Missing values by dataset, class, and image type (non-zero only)",
    options  = list(pageLength = 30, scrollX = TRUE),
    rownames = FALSE
  )

#calculate NA proportion per feature (pooled across datasets) 
na_per_feature <- imap_dfr(df_list, function(df, key) {
  parts <- str_split(key, "_")[[1]]
  df |>
    select(all_of(feature_cols)) |>
    summarise(across(everything(), ~ mean(is.na(.)))) |>
    pivot_longer(everything(), names_to = "column", values_to = "prop_na") |>
    mutate(
      dataset    = str_to_title(parts[1]),
      hemisphere = str_to_title(parts[2])
    )
}) |>
  left_join(feature_inventory, by = "column") |>
  group_by(column, dataset, image_type, feature_class) |>
  summarise(prop_na = mean(prop_na), .groups = "drop")  # average across hemispheres

#interactive heatmap with plotly - 
#show any variables that has any amount of NA
na_per_feature |>
  #filter(prop_na > 0) |>
  filter(column %in% na_per_feature$column[na_per_feature$prop_na > 0]) |> #keep rows constant
  mutate(
    image_type    = factor(image_type,
                           levels = c("original", "log05", "log10"),
                           labels = c("Original", "LoG σ=0.5mm", "LoG σ=1.0mm")),
    feature_short = str_remove(column, "^log-sigma-[0-9]-[0-9]-mm-3D_"),  # shorter label
    hover_text = glue::glue("{column}<br>Dataset: {dataset}<br>% Missing: {scales::percent(prop_na, accuracy = 0.1)}")
  ) |>
  plot_ly(
    x    = ~ dataset, #image_type
    y    = ~ column,
    z    = ~ prop_na * 100,
    text = ~ hover_text,
    type        = "heatmap",
    hoverinfo   = "text",
    colorscale  = list(c(0, "#fffde7"), c(1, "#d32f2f")),
    colorbar    = list(title = "% Missing")
  ) |>
  layout(
    title  = "Proportion of missing values per feature",
    xaxis = list(title = "Dataset"),
    yaxis  = list(title = "", ticks = "",  showticklabels = FALSE)
  )

Section 3 — Outlier Detection

Goal: Here, we want to identify “outliers”, i.e., voxels with extreme feature values that could distort PCA. Features are Z-scored (mean=0, SD=1) within each dataset before plotting, putting all features on a comparable scale. Outliers are shown in red. We use the IQR (boxplot) method for outlier detection, i.e., a value is flagged as an outlier if it falls below Q1 − 1.5×IQR or above Q3 + 1.5×IQR (put otherwise, a value is an outlier if it falls beyond the whiskers of the boxplot). This method is quite conservative for our feature space, so it’s just for visulization purposes: it’s not clear yet if we want to remove these outliers, as outliers in feature space almost always reflect real image properties, not measurement error - the idea is just to understand how many outliers we have.

Next step: Identify the features, image types, and datasets that have only a small number of wildly abnormal outliers. Compare across datasets and image types to determine if this ought to be ‘expected’. Understand what these outliers ‘mean’ for a given feature. Determine if there is a spatial pattern to the outliers (e.g., are they all at edges, etc). Make a short list of voxels to remove from the analysis of certain features; we can run a ‘sensitivity analysis’, i.e., our PCA and clustering with both these outlying voxels include and excluded.

# z-score all features within each dataset
df_scaled <- imap_dfr(df_list, function(df, key) {
  parts <- str_split(key, "_")[[1]]
  df |>
    select(all_of(META_COLS), all_of(feature_cols)) |>
    mutate(across(all_of(feature_cols), ~ scale(.x)[,1])) |>
    mutate(
      dataset    = str_to_title(parts[1]),
      hemisphere = str_to_title(parts[2])
    )
})

#helper: ggplot boxplot for one feature class and image type
plot_feature_boxplots <- function(feat_class, img_type, img_label) {
  
  cols_to_plot <- feature_inventory |>
    filter(feature_class == feat_class, image_type == img_type) |>
    pull(column)
  
  df_scaled |>
    select(dataset, hemisphere, all_of(cols_to_plot)) |>
    pivot_longer(all_of(cols_to_plot), names_to = "column", values_to = "value") |>
    mutate(
      feature_short = str_remove(column, "^log-sigma-[0-9]-[0-9]-mm-3D_") |>
                      str_remove(paste0(feat_class, "_")),
      hemisphere    = factor(hemisphere, levels = c("Left", "Right")),
      dataset       = factor(dataset, levels = names(DATASET_COLOURS))
    ) |>
    ggplot(aes(x = feature_short, y = value, fill = dataset)) +
    geom_boxplot(outlier.size = 1, outlier.alpha = 0.3, linewidth = 0.3, outlier.colour = "red") +
    facet_grid(hemisphere ~ .) +
    scale_fill_manual(values = DATASET_COLOURS, name = "Dataset") +
    labs(
      title = glue::glue("{feat_class} — {img_label}"),
      x     = NULL,
      y     = "Z-score"
    ) +
    theme_minimal(base_size = 11) +
    theme(
      axis.text.x     = element_text(angle = 45, hjust = 1, size = 8),
      strip.text      = element_text(face = "bold"),
      legend.position = "top"
    )
}

firstorder

Original

plot_feature_boxplots("firstorder", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("firstorder", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("firstorder", "log10", "LoG σ=1.0mm")

glcm

Original

plot_feature_boxplots("glcm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("glcm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("glcm", "log10", "LoG σ=1.0mm")

glrlm

Original

plot_feature_boxplots("glrlm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("glrlm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("glrlm", "log10", "LoG σ=1.0mm")

glszm

Original

plot_feature_boxplots("glszm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("glszm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("glszm", "log10", "LoG σ=1.0mm")

ngtdm

Original

plot_feature_boxplots("ngtdm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("ngtdm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("ngtdm", "log10", "LoG σ=1.0mm")

gldm

Original

plot_feature_boxplots("gldm", "original", "Original")

LoG σ=0.5mm

plot_feature_boxplots("gldm", "log05", "LoG σ=0.5mm")

LoG σ=1.0mm

plot_feature_boxplots("gldm", "log10", "LoG σ=1.0mm")

Section 4 — Correlation Structure

Goal: Examine the correlation structure across features to understand redundancy before PCA. We compute Spearman rank correlation — robust to the non-normal, right-skewed distributions seen in Section 3. A random sample of 2000 voxels per dataset is used for speed (compare to the total voxel totals in Section 1); however, correlation estimates are stable at this sample size. High correlation (|r| > 0.9) between two features means they carry nearly identical information. This will not cause us to drop features before PCA — but it will help interpret PCA loadings later, as correlated features will load onto the same principal components.

Next step: Understand if feature classes are highly correlated among themselves. Understand what features / feature classes are negatively correlated. Examine if the image types appear to have similar correlation patterns.

#compute spearman correlation matrix on a random sample of voxels
compute_corr <- function(df, cols, n_sample = NULL, seed = 123) {
  set.seed(seed)
  df_sub <- df |>
    select(all_of(cols)) |>
    select(where(~ var(.x, na.rm = TRUE) > 0))
  if (!is.null(n_sample)) {
    df_sub <- slice_sample(df_sub, n = min(n_sample, nrow(df_sub)))
  }
  cor(df_sub, use = "pairwise.complete.obs", method = "spearman")
}

#plot correlation matrix as heatmap
plot_corr <- function(corr_mat, title) {
  
  # get feature order from inventory, preserving class grouping
  feature_order <- feature_inventory |>
    filter(column %in% colnames(corr_mat)) |>
    arrange(feature_class) |>
    mutate(f_short = str_remove(column, "^log-sigma-[0-9]-[0-9]-mm-3D_"))
  
  corr_df <- corr_mat |>
    as.data.frame() |>
    rownames_to_column("feature1") |>
    pivot_longer(-feature1, names_to = "feature2", values_to = "r") |>
    mutate(
      f1_short = str_remove(feature1, "^log-sigma-[0-9]-[0-9]-mm-3D_"),
      f2_short = str_remove(feature2, "^log-sigma-[0-9]-[0-9]-mm-3D_"),
      f1_short = factor(f1_short, levels = feature_order$f_short),
      f2_short = factor(f2_short, levels = feature_order$f_short)
    )
  
  #heatmap
  p_main <- corr_df |>
    ggplot(aes(x = f2_short, y = f1_short, fill = r)) +
    geom_tile() +
    scale_fill_gradient2(
      low      = "#d73027",
      mid      = "white",
      high     = "#1a9850",
      midpoint = 0,
      limits   = c(-1, 1),
      name     = "Spearman r"
    ) +
    labs(title = title, x = NULL, y = NULL) +
    theme_minimal(base_size = 10) +
    theme(
      axis.text  = element_blank(),
      axis.ticks = element_blank()
    )
  
  #feature class strip along x axis
  strip_df <- feature_order |>
    mutate(f_short = factor(f_short, levels = feature_order$f_short))
  
  p_strip_x <- strip_df |>
    ggplot(aes(x = f_short, y = 1, fill = feature_class)) +
    geom_tile() +
    scale_fill_brewer(palette = "Set2", name = "Feature Class") +
    theme_void() +
    theme(
      legend.position = "bottom",
      legend.title    = element_text(size = 9),
      legend.text     = element_text(size = 8)
    )
  
  #feature class strip along y axis
  p_strip_y <- strip_df |>
    ggplot(aes(y = f_short, x = 1, fill = feature_class)) +
    geom_tile() +
    scale_fill_brewer(palette = "Set2", guide = "none") +
    theme_void()
  
  #combine with patchwork
  top_row    <- (p_strip_y | p_main)   + plot_layout(widths  = c(1, 40))
  bottom_row <- (plot_spacer() | p_strip_x) + plot_layout(widths  = c(1, 40))
  top_row / bottom_row + plot_layout(heights = c(20, 1))
}

#pre-compute all correlation matrices once
#one matrix per dataset x hemisphere x image type
#note this is run on a fraction of the data, 2000 voxels, just for visualization
#ie correlation values won't be exact but likely close enough
corr_matrices <- imap(df_list, function(df, key) {
  map(c("original", "log05", "log10"), function(img_type) {
    cols <- feature_inventory |>
      filter(image_type == img_type) |>
      pull(column)
    compute_corr(df, cols, n_sample=2000)
  }) |> set_names(c("original", "log05", "log10"))
})

BigBrain

Left

Original

plot_corr(corr_matrices[["bigbrain_left"]][["original"]],
          "BigBrain Left — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["bigbrain_left"]][["log05"]],
          "BigBrain Left — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["bigbrain_left"]][["log10"]],
          "BigBrain Left — LoG σ=1.0mm (Spearman, n=500)")

Right

Original

plot_corr(corr_matrices[["bigbrain_right"]][["original"]],
          "BigBrain Right — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["bigbrain_right"]][["log05"]],
          "BigBrain Right — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["bigbrain_right"]][["log10"]],
          "BigBrain Right — LoG σ=1.0mm (Spearman, n=500)")

Edlow

Left

Original

plot_corr(corr_matrices[["edlow_left"]][["original"]],
          "Edlow Left — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["edlow_left"]][["log05"]],
          "Edlow Left — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["edlow_left"]][["log10"]],
          "Edlow Left — LoG σ=1.0mm (Spearman, n=500)")

Right

Original

plot_corr(corr_matrices[["edlow_right"]][["original"]],
          "Edlow Right — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["edlow_right"]][["log05"]],
          "Edlow Right — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["edlow_right"]][["log10"]],
          "Edlow Right — LoG σ=1.0mm (Spearman, n=500)")

Lüsebrink

Left

Original

plot_corr(corr_matrices[["lusebrink_left"]][["original"]],
          "Lüsebrink Left — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["lusebrink_left"]][["log05"]],
          "Lüsebrink Left — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["lusebrink_left"]][["log10"]],
          "Lüsebrink Left — LoG σ=1.0mm (Spearman, n=500)")

Right

Original

plot_corr(corr_matrices[["lusebrink_right"]][["original"]],
          "Lüsebrink Right — Original (Spearman, n=500)")

LoG σ=0.5mm

plot_corr(corr_matrices[["lusebrink_right"]][["log05"]],
          "Lüsebrink Right — LoG σ=0.5mm (Spearman, n=500)")

LoG σ=1.0mm

plot_corr(corr_matrices[["lusebrink_right"]][["log10"]],
          "Lüsebrink Right — LoG σ=1.0mm (Spearman, n=500)")

Section 5 — NIfTI Export

Goal: Write out two voxel-level .csvs for downstream .nii reconstruction in Python: (1) a missingness map showing the count of NA features (out of all 278) at each voxel, and (2) an outlier map showing the count of IQR-flagged outlier features at each voxel (maximum possible 278. Note: both .csvs also carry a dominant_class column (the feature class contributing the most NAs/outliers) as a secondary diagnostic, in case a separate class-coded .nii is useful later, but the primary NIfTI value for both maps is the count.

#flag whether a value is an outlier by IQR rule
is_iqr_outlier <- function(x) {
  q   <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  iqr <- q[2] - q[1]
  x < (q[1] - 1.5 * iqr) | x > (q[2] + 1.5 * iqr)
}

#assign an integer code to each feature class, used to colour the outlier map
#e.g. firstorder = 1, glcm = 2, glrlm = 3, glszm = 4, ngtdm = 5, gldm = 6
class_codes <- setNames(seq_along(FEATURE_CLASSES), FEATURE_CLASSES)

#class_codes |>
  #enframe(name = "feature_class", value = "code") |>
  #kbl(caption = "If used later: feature class integer code (for NIfTI colour-coding)") |>
  #kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Missingness Map

#missingness map: count of how many features (out of all 278) are NA at each voxel
#dominant_class = feature class contributing the most NAs (secondary diagnostic)
missing_map <- imap_dfr(df_list, function(df, key) {
  parts   <- str_split(key, "_")[[1]]
  feat_df <- df |> select(all_of(feature_cols))
  na_flags <- map_dfc(feat_df, is.na)

  #count NAs per voxel, per feature class
  class_counts <- map_dfc(FEATURE_CLASSES, function(cls) {
    cls_cols <- feature_inventory |> filter(feature_class == cls) |> pull(column)
    tibble("{cls}" := rowSums(na_flags[, cls_cols], na.rm = TRUE))
  })

  #dominant class = highest NA count; 0 if no NAs at all; ties go to first listed in FEATURE_CLASSES
  dominant_class <- apply(class_counts, 1, function(row) {
    if (sum(row) == 0) return(0)
    class_codes[[FEATURE_CLASSES[which.max(row)]]]
  })

  tibble(
    x                   = df$x,
    y                   = df$y,
    z                   = df$z,
    y_global            = df$y_global,
    dataset             = str_to_title(parts[1]),
    hemisphere          = str_to_title(parts[2]),
    n_missing_features  = rowSums(na_flags, na.rm = TRUE),
    dominant_class      = dominant_class
  )
})

#write out, datestamped
EXPORT_DIR <- "/Users/navona/Library/CloudStorage/OneDrive-UHN/projects/project10_claustrumParcellation_OS/output/2_preprocessing"
missing_map_path <- file.path(EXPORT_DIR, glue::glue("missing_map_{format(Sys.Date(), '%Y%m%d')}.csv"))
write_csv(missing_map, missing_map_path)

#quick summary so we can sanity check before handing off to python
missing_map |>
  group_by(dataset, hemisphere) |>
  summarise(
    voxels             = n(),
    voxels_w_any_na    = sum(n_missing_features > 0),
    pct_voxels_w_na    = scales::percent(mean(n_missing_features > 0), accuracy = 0.1),
    mean_n_missing     = round(mean(n_missing_features), 2),
    max_n_missing      = max(n_missing_features),
    .groups = "drop"
  ) |>
  kbl(caption = glue::glue("Missingness summary (written to {basename(missing_map_path)})")) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Missingness summary (written to missing_map_20260620.csv)
dataset	hemisphere	voxels	voxels_w_any_na	pct_voxels_w_na	mean_n_missing	max_n_missing
Bigbrain	Left	84466	7414	8.8%	0.33	120
Bigbrain	Right	76505	7660	10.0%	0.35	120
Edlow	Left	90943	924	1.0%	0.03	3
Edlow	Right	92731	881	1.0%	0.03	3
Lusebrink	Left	107879	742	0.7%	0.02	3
Lusebrink	Right	94890	548	0.6%	0.02	3

#secondary diagnostic: which feature class is driving the missingness, per dataset/hemisphere
missing_map |>
  mutate(dominant_class_name = c("none", FEATURE_CLASSES)[dominant_class + 1]) |>
  count(dataset, hemisphere, dominant_class_name) |>
  pivot_wider(names_from = dominant_class_name, values_from = n, values_fill = 0) |>
  kbl(caption = "Dominant missingness class counts (secondary diagnostic)") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Dominant missingness class counts (secondary diagnostic)
dataset	hemisphere	firstorder	glcm	none
Bigbrain	Left	22	7392	77052
Bigbrain	Right	24	7636	68845
Edlow	Left	0	924	90019
Edlow	Right	0	881	91850
Lusebrink	Left	0	742	107137
Lusebrink	Right	0	548	94342

Outlier Map

#outlier map: dominant feature class per voxel, by IQR-flagged count
outlier_map <- imap_dfr(df_list, function(df, key) {
  parts   <- str_split(key, "_")[[1]]
  feat_df <- df |> select(all_of(feature_cols))
  flags   <- map_dfc(feat_df, is_iqr_outlier)

  #count outliers per voxel, per feature class
  class_counts <- map_dfc(FEATURE_CLASSES, function(cls) {
    cls_cols <- feature_inventory |> filter(feature_class == cls) |> pull(column)
    tibble("{cls}" := rowSums(flags[, cls_cols], na.rm = TRUE))
  })

  #dominant class = highest count; 0 if no outliers at all; ties go to first listed in FEATURE_CLASSES
  dominant_class <- apply(class_counts, 1, function(row) {
    if (sum(row) == 0) return(0)
    class_codes[[FEATURE_CLASSES[which.max(row)]]]
  })

  tibble(
    x                   = df$x,
    y                   = df$y,
    z                   = df$z,
    y_global            = df$y_global,
    dataset             = str_to_title(parts[1]),
    hemisphere          = str_to_title(parts[2]),
    n_outlier_features  = rowSums(flags, na.rm = TRUE),
    dominant_class      = dominant_class
  )
})

#write out, datestamped
outlier_map_path <- file.path(EXPORT_DIR, glue::glue("outlier_map_{format(Sys.Date(), '%Y%m%d')}.csv"))
write_csv(outlier_map, outlier_map_path)

#quick summary so we can sanity check before handing off to python
outlier_map |>
  group_by(dataset, hemisphere) |>
  summarise(
    voxels             = n(),
    voxels_w_any_outlier = sum(n_outlier_features > 0),
    pct_voxels_w_outlier = scales::percent(mean(n_outlier_features > 0), accuracy = 0.1),
    mean_n_outlier     = round(mean(n_outlier_features), 2),
    max_n_outlier      = max(n_outlier_features),
    .groups = "drop"
  ) |>
  kbl(caption = glue::glue("Outlier summary (written to {basename(outlier_map_path)})")) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Outlier summary (written to outlier_map_20260620.csv)
dataset	hemisphere	voxels	voxels_w_any_outlier	pct_voxels_w_outlier	mean_n_outlier	max_n_outlier
Bigbrain	Left	84466	74103	87.7%	10.32	107
Bigbrain	Right	76505	65684	85.9%	11.03	115
Edlow	Left	90943	79995	88.0%	10.16	108
Edlow	Right	92731	81290	87.7%	10.54	88
Lusebrink	Left	107879	91961	85.2%	9.11	93
Lusebrink	Right	94890	79448	83.7%	8.80	88

#secondary diagnostic: which feature class is driving the outliers, per dataset/hemisphere
outlier_map |>
  mutate(dominant_class_name = c("none", FEATURE_CLASSES)[dominant_class + 1]) |>
  count(dataset, hemisphere, dominant_class_name) |>
  pivot_wider(names_from = dominant_class_name, values_from = n, values_fill = 0) |>
  kbl(caption = "Dominant outlier class counts (secondary diagnostic)") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Dominant outlier class counts (secondary diagnostic)
dataset	hemisphere	firstorder	glcm	gldm	glrlm	glszm	ngtdm	none
Bigbrain	Left	12627	26031	4296	12290	16419	2440	10363
Bigbrain	Right	11174	25821	3803	9161	13691	2034	10821
Edlow	Left	13907	30872	3911	11000	17718	2587	10948
Edlow	Right	9064	34822	4001	13215	17702	2486	11441
Lusebrink	Left	8985	36680	6346	12930	24342	2678	15918
Lusebrink	Right	8099	32204	5590	10346	20779	2430	15442

Radiomic feature exploration

2026-06-20

Section 1 — Data wrangling

Section 2 — Data Summary

Overview Table

Feature Class Breakdown

Missing Values

Section 3 — Outlier Detection

firstorder

Original

LoG σ=0.5mm

LoG σ=1.0mm

glcm

Original

LoG σ=0.5mm

LoG σ=1.0mm

glrlm

Original

LoG σ=0.5mm

LoG σ=1.0mm

glszm

Original

LoG σ=0.5mm

LoG σ=1.0mm

ngtdm

Original

LoG σ=0.5mm

LoG σ=1.0mm

gldm

Original

LoG σ=0.5mm

LoG σ=1.0mm

Section 4 — Correlation Structure

BigBrain

Left

Original

LoG σ=0.5mm

LoG σ=1.0mm

Right

Original

LoG σ=0.5mm

LoG σ=1.0mm

Edlow

Left

Original

LoG σ=0.5mm

LoG σ=1.0mm

Right

Original

LoG σ=0.5mm

LoG σ=1.0mm

Lüsebrink

Left

Original

LoG σ=0.5mm

LoG σ=1.0mm

Right

Original

LoG σ=0.5mm

LoG σ=1.0mm

Section 5 — NIfTI Export

Missingness Map

Outlier Map