renv::init()
renv::snapshot()
renv::status()CRC Image Feature Analysis
Image Analysis: TCGAData
Setting up environment
Setting up libraries
library(BiocManager)
library(MOFA2)
library(remotes)
library(SingleCellExperiment)
library(SpatialExperiment)
library(SpatialFeatureExperiment)
library(MultiAssayExperiment)
library(SummarizedExperiment)
library(curatedTCGAData)
library(dplyr)
library(spatstat)
library(moments)
library(pracma)
library(HistoImageR)
library(devtools)
library(ggplot2)
library(TCGAutils)
library(rmarkdown)
library(knitr)
library(psych)prov-gigaPath COAD
getwd()[1] "/Users/eslamabousamra/Cancer Image Analysis/vignettes"
dir <- "../TCGA_COAD_READ_provgigapath"
fnames <- list.files(dir, pattern = "\\.csv$", full.names = TRUE)
length(fnames) # how many slides[1] 623
build embedding matrix
# Load helper utils fns
devtools::load_all()
# Creating matrix for all the images
tensor_matrix <- matrix(nrow = length(fnames), ncol = 768)
slide_ids <- sapply(strsplit(basename(fnames), "\\.csv"), `[`, 1)
rownames(tensor_matrix) <- slide_ids
for (i in seq_along(fnames)) {
tensor_matrix[i,] <- getEmbeddingLayer(fnames[i])
}
tensor_matrix_full <- na.omit(tensor_matrix)
# Verify the structure matched layer 0 in csv
structure_of_matrix = as.data.frame(tensor_matrix_full)Load in TCGA CRC assay experiment data
# List available Assays
curatedTCGAData("COAD", "*",version = "2.1.1", dry.run = TRUE)
curatedTCGAData("READ", "*",version = "2.1.1", dry.run = TRUE)
# Load COAD data with multiple assays
coad_mae_full <- curatedTCGAData(
diseaseCode = "COAD",
assays = c("COAD_RNASeq2GeneNorm-20160128", "mRNAArray", "miRNASeqGene"),
version = "2.1.1",
dry.run = FALSE
)
# Load READ data (rectal)
read_mae_full <- curatedTCGAData(
diseaseCode = "READ",
assays = c("READ_RNASeq2GeneNorm-20160128", "mRNAArray", "miRNASeqGene"),
version = "2.1.1",
dry.run = FALSE
)Read Clinical Metadata data and subset
# ---- Extract sample-level metadata for READ -----
read_clinical <- colData(read_mae_full)
# Check completeness of metadata attributes
completeness <- colSums(!is.na(read_clinical)) / nrow(read_clinical) * 100
summary(completeness)
# Subset to attributes with >95% completeness and remove technical fields
read_clinical_sub <- read_clinical[, which(completeness > 95)] %>%
as.data.frame() %>%
select(-matches("aliquot")) %>%
select(-matches("portion"))
# ------ Extract sample-level metadata for COAD ------
coad_clinical <- colData(coad_mae_full)
# Check completeness of metadata attributes
completeness <- colSums(!is.na(coad_clinical)) / nrow(coad_clinical) * 100
coad_clinical_sub <- coad_clinical[, which(completeness > 95)] %>%
as.data.frame() %>%
select(-matches("aliquot")) %>%
select(-matches("portion"))Matching embedding with clinical metadata
# Merge COAD + READ clinical metadata
all_cols <- union(colnames(coad_clinical_sub), colnames(read_clinical_sub))
coad_clinical_full <- coad_clinical_sub
for (col in setdiff(all_cols, colnames(coad_clinical_sub))) coad_clinical_full[[col]] <- NA
read_clinical_full <- read_clinical_sub
for (col in setdiff(all_cols, colnames(read_clinical_sub))) read_clinical_full[[col]] <- NA
clinical_combined <- rbind(coad_clinical_full, read_clinical_full)
# Keep only patients with embeddings
# Convert embedding matrix rownames to patient IDs (first 3 components)
patient_ids_from_embedding <- sapply(strsplit(rownames(tensor_matrix_full), "-"),
function(x) paste(x[1:3], collapse = "-"))
rownames(tensor_matrix_full) <- patient_ids_from_embedding
# Convert clinical data rownames to patient IDs (first 3 components)
clinical_patient_ids <- sapply(strsplit(rownames(clinical_combined), "-"),
function(x) paste(x[1:3], collapse = "-"))
rownames(clinical_combined) <- clinical_patient_ids
# Remove duplicate patients from clinical data (keep first occurrence)
clinical_combined <- clinical_combined[!duplicated(rownames(clinical_combined)), ]
# Keep only patients with embeddings
patients_with_embed <- rownames(tensor_matrix_full)
clinical_combined <- clinical_combined[rownames(clinical_combined) %in% patients_with_embed, ]
# Factorize categorical variables with 2-10 unique values
numLv <- apply(clinical_combined, 2, function(x) length(unique(na.omit(x))))
slideAttrNames <- names(numLv[numLv >= 2 & numLv <= 10])
for(attr in slideAttrNames){
clinical_combined[[attr]] <- as.factor(clinical_combined[[attr]])
}
# Create final metadata dataframe
metadata_embed_df <- clinical_combined[slideAttrNames]
# Remove any rows with all NA values
metadata_embed_df <- metadata_embed_df[rowSums(is.na(metadata_embed_df)) < ncol(metadata_embed_df), ]
# Subset embedding matrix to match metadata patients
embedding_embed_mat <- tensor_matrix_full[rownames(tensor_matrix_full) %in% rownames(metadata_embed_df), ]
embedding_embed_mat <- embedding_embed_mat[!duplicated(rownames(embedding_embed_mat)), ]
# Ensure exact same patients in both datasets
common_patients <- intersect(rownames(embedding_embed_mat), rownames(metadata_embed_df))
embedding_embed_mat <- embedding_embed_mat[common_patients, ]
metadata_embed_df <- metadata_embed_df[common_patients, ]
# Run HistoImageR embedding separation analysis
res_embed <- HistoImageR:::analyze_embedding_separation(embedding_embed_mat, metadata_embed_df)
# View results
res_embed$plotsMatching embedding and assay data COAD + READ
#------------------------------------------
# 1. Load COAD + READ RNASeq assays
#------------------------------------------
# Load COAD
coad_mae <- curatedTCGAData(
diseaseCode = "COAD",
assays = "RNASeq2GeneNorm",
version = "2.1.1",
dry.run = FALSE
)Querying and downloading: COAD_RNASeq2GeneNorm-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: COAD_colData-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: COAD_sampleMap-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: COAD_metadata-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
harmonizing input:
removing 5606 sampleMap rows not in names(experiments)
removing 2 colData rownames not in sampleMap 'primary'
# Load READ
read_mae <- curatedTCGAData(
diseaseCode = "READ",
assays = "RNASeq2GeneNorm",
version = "2.1.1",
dry.run = FALSE
)Querying and downloading: READ_RNASeq2GeneNorm-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: READ_colData-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: READ_sampleMap-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Querying and downloading: READ_metadata-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
harmonizing input:
removing 2078 sampleMap rows not in names(experiments)
removing 2 colData rownames not in sampleMap 'primary'
rna_coad <- coad_mae[["COAD_RNASeq2GeneNorm-20160128"]]
rna_read <- read_mae[["READ_RNASeq2GeneNorm-20160128"]]Find common genes
#------------------------------------------
# 2. Ensure consistent genes
#------------------------------------------
common_genes <- intersect(rownames(rna_coad), rownames(rna_read))
rna_coad_bulk <- rna_coad[common_genes, ]
rna_read_bulk <- rna_read[common_genes, ]
# Merge RNASeq matrices
rna_combined <- cbind(rna_coad_bulk, rna_read_bulk)
# Standardize patient IDs (first 3 TCGA fields)
rna_patients <- sapply(strsplit(colnames(rna_combined), "-"), function(x) paste(x[1:3], collapse = "-"))
colnames(rna_combined) <- make.unique(rna_patients)
#------------------------------------------
# 3. Prepare embedding matrix
#------------------------------------------
tensor_matrix_unique <- tensor_matrix_full[!duplicated(rownames(tensor_matrix_full)), ]
embedding_patients <- sapply(strsplit(rownames(tensor_matrix_unique), "-"), function(x) paste(x[1:3], collapse = "-"))
rownames(tensor_matrix_unique) <- make.unique(embedding_patients)
#------------------------------------------
# 4. Prepare clinical metadata
#------------------------------------------
# Subset COAD clinical metadata
coad_clinical <- colData(coad_mae)
completeness <- colSums(!is.na(coad_clinical)) / nrow(coad_clinical) * 100
coad_clinical_sub <- coad_clinical[, which(completeness > 95)] %>%
as.data.frame() %>%
select(-matches("aliquot"), -matches("portion"))
# Subset READ clinical metadata
read_clinical <- colData(read_mae)
completeness <- colSums(!is.na(read_clinical)) / nrow(read_clinical) * 100
read_clinical_sub <- read_clinical[, which(completeness > 95)] %>%
as.data.frame() %>%
select(-matches("aliquot"), -matches("portion"))
# Ensure all columns exist in both datasets
all_cols <- union(colnames(coad_clinical_sub), colnames(read_clinical_sub))
# Fill missing columns with NA
coad_clinical_full <- coad_clinical_sub
for (col in setdiff(all_cols, colnames(coad_clinical_sub))) coad_clinical_full[[col]] <- NA
read_clinical_full <- read_clinical_sub
for (col in setdiff(all_cols, colnames(read_clinical_sub))) read_clinical_full[[col]] <- NA
# Merge COAD + READ clinical metadata
clinical_sub <- rbind(coad_clinical_full, read_clinical_full)
# Keep only patients with embeddings
patients_with_embed <- rownames(tensor_matrix_unique)
clinical_sub <- clinical_sub[rownames(clinical_sub) %in% patients_with_embed, ]
# Factorize categorical variables with 2–10 levels
numLv <- apply(clinical_sub, 2, function(x) length(unique(x)))
slideAttrNames <- names(numLv[numLv >= 2 & numLv <= 10])
for (slideAttrName in slideAttrNames) {
clinical_sub[[slideAttrName]] <- as.factor(clinical_sub[[slideAttrName]])
}
metadata_df <- clinical_sub[slideAttrNames]
#------------------------------------------
# 5. Subset RNASeq to patients with embeddings
#------------------------------------------
rna_matched <- rna_combined[, colnames(rna_combined) %in% patients_with_embed, drop = FALSE]
# Remove genes with Inf/-Inf
rna_matrix <- assay(rna_matched)
rna_keep <- apply(rna_matrix, 1, function(x) all(is.finite(x)))
rna_matched_clean <- rna_matched[rna_keep, ]
# Subset embedding to match RNA columns
embedding_matched_clean <- tensor_matrix_unique[colnames(rna_matched_clean), , drop = FALSE]
#------------------------------------------
# 6. Build SummarizedExperiment objects
#------------------------------------------
rna_se <- SummarizedExperiment(
assays = list(counts = assay(rna_matched_clean)),
colData = DataFrame(patientID = colnames(rna_matched_clean))
)
embedding_se <- SummarizedExperiment(
assays = list(embedding = t(embedding_matched_clean)),
colData = metadata_df[colnames(rna_matched_clean), , drop = FALSE]
)
rownames(embedding_se) <- paste0("embed_", seq_len(nrow(embedding_se)))
#------------------------------------------
# 7. Build MultiAssayExperiment
#------------------------------------------
crc_mae <- MultiAssayExperiment(
experiments = list(
# labeing this contains both COAD + READ RNASeq data
RNASeq = rna_se,
Embedding_layer0 = embedding_se
),
colData = metadata_df[colnames(rna_matched_clean), , drop = FALSE]
)Running MOFA2 analysis
# Create MOFA object
mofa_object <- create_mofa_from_MultiAssayExperiment(crc_mae)
# Define model options
model_opts <- get_default_model_options(mofa_object)
model_opts$num_factors <- 15
# Define training options
train_opts <- get_default_training_options(mofa_object)
train_opts$convergence_mode <- "slow"
train_opts$seed <- 42
# Prepare and run MOFA
mofa_object <- prepare_mofa(
object = mofa_object,
model_options = model_opts,
training_options = train_opts
)Warning in prepare_mofa(object = mofa_object, model_options = model_opts, :
Some view(s) have a lot of features, it is recommended to perform a more
stringent feature selection before creating the MOFA object....
Checking data options...
No data options specified, using default...
Checking training options...
Checking model options...
# Run/train the model
#mofa_object.trained <- run_mofa(
# mofa_object,
# outfile = "mofa_model_crc.hdf5",
# use_basilisk = TRUE
#)Analyze MOFA model
# Load trained MOFA model
mofa_object.trained <- load_model("mofa_model_crc.hdf5")
# Keep only patients in MOFA object
mofa_samples <- unlist(samples_names(mofa_object.trained))
metadata_matched <- metadata_df[mofa_samples, , drop = FALSE]
# For the metadata data frame
metadata_df$sample <- rownames(metadata_df)
# adding the metadata to the data object as post processing
samples_metadata(mofa_object.trained) <- metadata_df
# visualize the model views and samples
plot_data_overview(mofa_object.trained)# Plot factor correlations to check for interdependence issues with the model
plot_factor_cor(mofa_object.trained)Initial visualizations
# Plot variance explained
plot_variance_explained(mofa_object.trained, plot_total = TRUE)[[1]]
[[2]]
# Plot data heatmaps
# using rna seq view
plot_data_heatmap(mofa_object.trained,
scale = "row",
cluster_cols = FALSE,
cluster_rows = FALSE,
show_colnames = FALSE,
denoise = TRUE,
features = 10,
view = "RNASeq",
factor=1)# Using embedding view
plot_data_heatmap(mofa_object.trained,
factor = 1,
view = "Embedding_layer0",
features = 10,
cluster_rows = FALSE,
cluster_cols = FALSE,
show_colnames = FALSE)# Plot top weights
plot_top_weights(mofa_object.trained,
view = "RNASeq",
factor = 1,
nfeatures = 10)# Plot factor colored by pathology stage
plot_factor(mofa_object.trained,
factor = 1,
add_violin = T,
dodge = TRUE,
color_by = "MYH11")# Plot factor scatter
plot_factor(mofa_object.trained,
factor = c(1, 2),
color_by = "pathology_T_stage")plot_top_weights(mofa_object.trained,
view = "Embedding_layer0",
factor = 1,
nfeatures = 10)# Correlate factors with clinical variables
correlate_factors_with_covariates(mofa_object.trained,
covariates = c("vital_status.x","pathology_M_stage","tumor_tissue_site"),
plot = "log_pval" )