CRC Image Feature Analysis

Image Analysis: TCGAData

Setting up environment

renv::init()
renv::snapshot()
renv::status()

Setting up libraries

library(BiocManager)
library(MOFA2)
library(remotes)
library(SingleCellExperiment)
library(SpatialExperiment)
library(SpatialFeatureExperiment)
library(MultiAssayExperiment)
library(SummarizedExperiment)
library(curatedTCGAData)
library(dplyr)
library(spatstat)
library(moments)
library(pracma)
library(HistoImageR)
library(devtools)
library(ggplot2)
library(TCGAutils)
library(rmarkdown)
library(knitr)
library(psych)

prov-gigaPath COAD

getwd()
[1] "/Users/eslamabousamra/Cancer Image Analysis/vignettes"
dir <- "../TCGA_COAD_READ_provgigapath"
fnames <- list.files(dir, pattern = "\\.csv$", full.names = TRUE)
length(fnames)  # how many slides
[1] 623

build embedding matrix

# Load helper utils fns
devtools::load_all()
# Creating matrix for all the images
tensor_matrix <- matrix(nrow = length(fnames), ncol = 768)
slide_ids <- sapply(strsplit(basename(fnames), "\\.csv"), `[`, 1)
rownames(tensor_matrix) <- slide_ids

for (i in seq_along(fnames)) {
  tensor_matrix[i,] <- getEmbeddingLayer(fnames[i])
}

tensor_matrix_full <- na.omit(tensor_matrix)
# Verify the structure matched layer 0 in csv
structure_of_matrix = as.data.frame(tensor_matrix_full)

Load in TCGA CRC assay experiment data

# List available Assays
curatedTCGAData("COAD", "*",version = "2.1.1", dry.run = TRUE) 
curatedTCGAData("READ", "*",version = "2.1.1", dry.run = TRUE) 


# Load COAD data with multiple assays
coad_mae_full <- curatedTCGAData(
    diseaseCode = "COAD",
    assays = c("COAD_RNASeq2GeneNorm-20160128", "mRNAArray", "miRNASeqGene"),
    version = "2.1.1",
    dry.run = FALSE
)

# Load READ data (rectal)
read_mae_full <- curatedTCGAData(
    diseaseCode = "READ",
    assays = c("READ_RNASeq2GeneNorm-20160128", "mRNAArray", "miRNASeqGene"),
    version = "2.1.1",
    dry.run = FALSE
)

Read Clinical Metadata data and subset

# ---- Extract sample-level metadata for READ ----- 
read_clinical <- colData(read_mae_full)

# Check completeness of metadata attributes
completeness <- colSums(!is.na(read_clinical)) / nrow(read_clinical) * 100
summary(completeness)

# Subset to attributes with >95% completeness and remove technical fields
read_clinical_sub <- read_clinical[, which(completeness > 95)] %>%
  as.data.frame() %>%
  select(-matches("aliquot")) %>%
  select(-matches("portion"))


# ------ Extract sample-level metadata for COAD ------
coad_clinical <- colData(coad_mae_full)

# Check completeness of metadata attributes
completeness <- colSums(!is.na(coad_clinical)) / nrow(coad_clinical) * 100

coad_clinical_sub <- coad_clinical[, which(completeness > 95)] %>%
  as.data.frame() %>%
  select(-matches("aliquot")) %>%
  select(-matches("portion"))

Matching embedding with clinical metadata

# Merge COAD + READ clinical metadata
all_cols <- union(colnames(coad_clinical_sub), colnames(read_clinical_sub))

coad_clinical_full <- coad_clinical_sub
for (col in setdiff(all_cols, colnames(coad_clinical_sub))) coad_clinical_full[[col]] <- NA

read_clinical_full <- read_clinical_sub
for (col in setdiff(all_cols, colnames(read_clinical_sub))) read_clinical_full[[col]] <- NA

clinical_combined <- rbind(coad_clinical_full, read_clinical_full)

# Keep only patients with embeddings
# Convert embedding matrix rownames to patient IDs (first 3 components)
patient_ids_from_embedding <- sapply(strsplit(rownames(tensor_matrix_full), "-"), 
                                    function(x) paste(x[1:3], collapse = "-"))
rownames(tensor_matrix_full) <- patient_ids_from_embedding

# Convert clinical data rownames to patient IDs (first 3 components)
clinical_patient_ids <- sapply(strsplit(rownames(clinical_combined), "-"), 
                              function(x) paste(x[1:3], collapse = "-"))
rownames(clinical_combined) <- clinical_patient_ids

# Remove duplicate patients from clinical data (keep first occurrence)
clinical_combined <- clinical_combined[!duplicated(rownames(clinical_combined)), ]

# Keep only patients with embeddings
patients_with_embed <- rownames(tensor_matrix_full)
clinical_combined <- clinical_combined[rownames(clinical_combined) %in% patients_with_embed, ]

# Factorize categorical variables with 2-10 unique values
numLv <- apply(clinical_combined, 2, function(x) length(unique(na.omit(x))))
slideAttrNames <- names(numLv[numLv >= 2 & numLv <= 10])

for(attr in slideAttrNames){
  clinical_combined[[attr]] <- as.factor(clinical_combined[[attr]])
}

# Create final metadata dataframe
metadata_embed_df <- clinical_combined[slideAttrNames]

# Remove any rows with all NA values
metadata_embed_df <- metadata_embed_df[rowSums(is.na(metadata_embed_df)) < ncol(metadata_embed_df), ]

# Subset embedding matrix to match metadata patients
embedding_embed_mat <- tensor_matrix_full[rownames(tensor_matrix_full) %in% rownames(metadata_embed_df), ]
embedding_embed_mat <- embedding_embed_mat[!duplicated(rownames(embedding_embed_mat)), ]

# Ensure exact same patients in both datasets
common_patients <- intersect(rownames(embedding_embed_mat), rownames(metadata_embed_df))
embedding_embed_mat <- embedding_embed_mat[common_patients, ]
metadata_embed_df <- metadata_embed_df[common_patients, ]

# Run HistoImageR embedding separation analysis
res_embed <- HistoImageR:::analyze_embedding_separation(embedding_embed_mat, metadata_embed_df)

# View results
res_embed$plots

Matching embedding and assay data COAD + READ

#------------------------------------------
# 1. Load COAD + READ RNASeq assays
#------------------------------------------
# Load COAD
coad_mae <- curatedTCGAData(
  diseaseCode = "COAD",
  assays = "RNASeq2GeneNorm",
  version = "2.1.1",
  dry.run = FALSE
)
Querying and downloading: COAD_RNASeq2GeneNorm-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: COAD_colData-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: COAD_sampleMap-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: COAD_metadata-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
harmonizing input:
  removing 5606 sampleMap rows not in names(experiments)
  removing 2 colData rownames not in sampleMap 'primary'
# Load READ
read_mae <- curatedTCGAData(
  diseaseCode = "READ",
  assays = "RNASeq2GeneNorm",
  version = "2.1.1",
  dry.run = FALSE
)
Querying and downloading: READ_RNASeq2GeneNorm-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: READ_colData-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: READ_sampleMap-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: READ_metadata-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
harmonizing input:
  removing 2078 sampleMap rows not in names(experiments)
  removing 2 colData rownames not in sampleMap 'primary'
rna_coad <- coad_mae[["COAD_RNASeq2GeneNorm-20160128"]]
rna_read <- read_mae[["READ_RNASeq2GeneNorm-20160128"]]

Find common genes

#------------------------------------------
# 2. Ensure consistent genes
#------------------------------------------
common_genes <- intersect(rownames(rna_coad), rownames(rna_read))
rna_coad_bulk <- rna_coad[common_genes, ]
rna_read_bulk <- rna_read[common_genes, ]

# Merge RNASeq matrices
rna_combined <- cbind(rna_coad_bulk, rna_read_bulk)

# Standardize patient IDs (first 3 TCGA fields)
rna_patients <- sapply(strsplit(colnames(rna_combined), "-"), function(x) paste(x[1:3], collapse = "-"))
colnames(rna_combined) <- make.unique(rna_patients)

#------------------------------------------
# 3. Prepare embedding matrix
#------------------------------------------
tensor_matrix_unique <- tensor_matrix_full[!duplicated(rownames(tensor_matrix_full)), ]
embedding_patients <- sapply(strsplit(rownames(tensor_matrix_unique), "-"), function(x) paste(x[1:3], collapse = "-"))
rownames(tensor_matrix_unique) <- make.unique(embedding_patients)

#------------------------------------------
# 4. Prepare clinical metadata
#------------------------------------------
# Subset COAD clinical metadata
coad_clinical <- colData(coad_mae)
completeness <- colSums(!is.na(coad_clinical)) / nrow(coad_clinical) * 100
coad_clinical_sub <- coad_clinical[, which(completeness > 95)] %>%
  as.data.frame() %>%
  select(-matches("aliquot"), -matches("portion"))

# Subset READ clinical metadata
read_clinical <- colData(read_mae)
completeness <- colSums(!is.na(read_clinical)) / nrow(read_clinical) * 100
read_clinical_sub <- read_clinical[, which(completeness > 95)] %>%
  as.data.frame() %>%
  select(-matches("aliquot"), -matches("portion"))

# Ensure all columns exist in both datasets
all_cols <- union(colnames(coad_clinical_sub), colnames(read_clinical_sub))

# Fill missing columns with NA
coad_clinical_full <- coad_clinical_sub
for (col in setdiff(all_cols, colnames(coad_clinical_sub))) coad_clinical_full[[col]] <- NA

read_clinical_full <- read_clinical_sub
for (col in setdiff(all_cols, colnames(read_clinical_sub))) read_clinical_full[[col]] <- NA

# Merge COAD + READ clinical metadata
clinical_sub <- rbind(coad_clinical_full, read_clinical_full)

# Keep only patients with embeddings
patients_with_embed <- rownames(tensor_matrix_unique)
clinical_sub <- clinical_sub[rownames(clinical_sub) %in% patients_with_embed, ]

# Factorize categorical variables with 2–10 levels
numLv <- apply(clinical_sub, 2, function(x) length(unique(x)))
slideAttrNames <- names(numLv[numLv >= 2 & numLv <= 10])
for (slideAttrName in slideAttrNames) {
  clinical_sub[[slideAttrName]] <- as.factor(clinical_sub[[slideAttrName]])
}

metadata_df <- clinical_sub[slideAttrNames]

#------------------------------------------
# 5. Subset RNASeq to patients with embeddings
#------------------------------------------
rna_matched <- rna_combined[, colnames(rna_combined) %in% patients_with_embed, drop = FALSE]

# Remove genes with Inf/-Inf
rna_matrix <- assay(rna_matched)
rna_keep <- apply(rna_matrix, 1, function(x) all(is.finite(x)))
rna_matched_clean <- rna_matched[rna_keep, ]

# Subset embedding to match RNA columns
embedding_matched_clean <- tensor_matrix_unique[colnames(rna_matched_clean), , drop = FALSE]

#------------------------------------------
# 6. Build SummarizedExperiment objects
#------------------------------------------
rna_se <- SummarizedExperiment(
  assays = list(counts = assay(rna_matched_clean)),
  colData = DataFrame(patientID = colnames(rna_matched_clean))
)

embedding_se <- SummarizedExperiment(
  assays = list(embedding = t(embedding_matched_clean)),
  colData = metadata_df[colnames(rna_matched_clean), , drop = FALSE]
)
rownames(embedding_se) <- paste0("embed_", seq_len(nrow(embedding_se)))

#------------------------------------------
# 7. Build MultiAssayExperiment
#------------------------------------------
crc_mae <- MultiAssayExperiment(
  experiments = list(
    # labeing this contains both COAD + READ RNASeq data
    RNASeq = rna_se,
    Embedding_layer0 = embedding_se
  ),
  colData = metadata_df[colnames(rna_matched_clean), , drop = FALSE]
)

Running MOFA2 analysis

# Create MOFA object
mofa_object <- create_mofa_from_MultiAssayExperiment(crc_mae)

# Define model options
model_opts <- get_default_model_options(mofa_object)
model_opts$num_factors <- 15

# Define training options
train_opts <- get_default_training_options(mofa_object)
train_opts$convergence_mode <- "slow"
train_opts$seed <- 42

# Prepare and run MOFA
mofa_object <- prepare_mofa(
  object = mofa_object,
  model_options = model_opts,
  training_options = train_opts
)
Warning in prepare_mofa(object = mofa_object, model_options = model_opts, :
Some view(s) have a lot of features, it is recommended to perform a more
stringent feature selection before creating the MOFA object....
Checking data options...
No data options specified, using default...
Checking training options...
Checking model options...
# Run/train the model 

#mofa_object.trained <- run_mofa(
#  mofa_object,
#  outfile = "mofa_model_crc.hdf5",
#  use_basilisk = TRUE
#)

Analyze MOFA model

# Load trained MOFA model
mofa_object.trained <- load_model("mofa_model_crc.hdf5")


# Keep only patients in MOFA object
mofa_samples <- unlist(samples_names(mofa_object.trained))
metadata_matched <- metadata_df[mofa_samples, , drop = FALSE]

# For the metadata data frame
metadata_df$sample <- rownames(metadata_df)

# adding the metadata to the data object as post processing
samples_metadata(mofa_object.trained) <- metadata_df

# visualize the model views and samples
plot_data_overview(mofa_object.trained)

# Plot factor correlations to check for interdependence issues with the model
plot_factor_cor(mofa_object.trained)

Initial visualizations

# Plot variance explained
plot_variance_explained(mofa_object.trained, plot_total = TRUE)
[[1]]


[[2]]

# Plot data heatmaps
# using rna seq view
plot_data_heatmap(mofa_object.trained,
                   scale = "row",
                   cluster_cols = FALSE,
                   cluster_rows = FALSE,
                   show_colnames = FALSE,
                   denoise = TRUE,
                   features = 10,
                   view = "RNASeq",
                   factor=1)

# Using embedding view
plot_data_heatmap(mofa_object.trained,
                  factor = 1,
                  view = "Embedding_layer0",
                  features = 10,
                  cluster_rows = FALSE,
                  cluster_cols = FALSE,
                  show_colnames = FALSE)

# Plot top weights 
plot_top_weights(mofa_object.trained,
                 view = "RNASeq",
                 factor = 1,
                 nfeatures = 10)

# Plot factor colored by pathology stage
plot_factor(mofa_object.trained, 
            factor = 1, 
            add_violin = T,
            dodge = TRUE,
            color_by = "MYH11")

# Plot factor scatter
plot_factor(mofa_object.trained, 
            factor = c(1, 2), 
            color_by = "pathology_T_stage")

plot_top_weights(mofa_object.trained,
                 view = "Embedding_layer0",
                 factor = 1,
                 nfeatures = 10)

# Correlate factors with clinical variables
correlate_factors_with_covariates(mofa_object.trained, 
                      covariates = c("vital_status.x","pathology_M_stage","tumor_tissue_site"), 
                      plot = "log_pval" )