Cada equipo o estudiante eligió un FOV al azar del experimento de MERFISH en células U-2 OS y entregó su objeto Seurat. No se repitieron los FOVs!. Acá los cargamos todos, hacemos control de calidad, los llevamos a un panel común, los integramos con Harmony e hicimos un análisis sencillo de covariación genética.
# accesor a matriz de conteos
get_counts <- function(obj, assay = DefaultAssay(obj)) {
cm <- tryCatch(SeuratObject::LayerData(obj, assay = assay, layer = "counts"),
error = function(e) NULL)
if (is.null(cm) || prod(dim(cm)) == 0)
cm <- tryCatch(SeuratObject::GetAssayData(obj, assay = assay, slot = "counts"),
error = function(e) NULL)
cm
}
# etiqueta por FOV sin nombres de equipos
fov_id <- function(f) {
if (grepl("fov_?[0-9]+", f, ignore.case = TRUE))
paste0("fov_", sub(".*?fov_?([0-9]+).*", "\\1", f, ignore.case = TRUE))
else sub("\\.rds$", "", f, ignore.case = TRUE)
}
files <- list.files(pattern = "\\.rds$", ignore.case = TRUE)
objs <- list(); qc <- list()
for (f in files) {
obj <- tryCatch(readRDS(f), error = function(e) NULL)
if (!inherits(obj, "Seurat")) next
s <- fov_id(f)
cm <- get_counts(obj)
obj$sample <- s
objs[[s]] <- obj
qc[[s]] <- data.frame(sample = s, genes = nrow(obj), cells = ncol(obj),
has_counts = !is.null(cm),
total_counts = if (!is.null(cm)) sum(cm) else NA_real_)
}
qc_tbl <- bind_rows(qc) %>% arrange(sample)
qc_tbl
| sample | genes | cells | has_counts | total_counts |
|---|---|---|---|---|
| fov_047 | 140 | 89 | TRUE | 281893 |
| fov_057 | 140 | 27 | TRUE | 91186 |
| fov_068 | 140 | 54 | TRUE | 214741 |
| fov_111 | 140 | 87 | TRUE | 378174 |
| fov_125 | 140 | 59 | TRUE | 185479 |
| fov_279 | 140 | 31 | TRUE | 116244 |
| fov_291 | 140 | 65 | TRUE | 333306 |
| fov_333 | 140 | 67 | TRUE | 266848 |
| fov_379 | 140 | 32 | TRUE | 217681 |
| fov_403 | 140 | 20 | TRUE | 167223 |
| fov_456 | 64 | 46 | TRUE | 12317 |
| fov_457 | 140 | 33 | TRUE | 122654 |
Los objetos no traen exactamente el mismo set de genes (un equipo parece haber filtrado genes poco detectados).Rellenamos con ceros los genes en donde falten.
panel <- rownames(objs[[1]])
#Cero en los que faltan genes
pad_to_panel <- function(cm, panel) {
miss <- setdiff(panel, rownames(cm))
if (length(miss))
cm <- rbind(cm, Matrix::Matrix(0, length(miss), ncol(cm), sparse = TRUE,
dimnames = list(miss, colnames(cm))))
cm[panel, , drop = FALSE]
}
clean <- list()
for (s in names(objs)) {
cm <- pad_to_panel(get_counts(objs[[s]]), panel)
o <- CreateSeuratObject(counts = cm, assay = "MERFISH", project = s)
o$sample <- s
clean[[s]] <- o
}
merged <- merge(clean[[1]], y = clean[-1], add.cell.ids = names(clean))
# con features idénticos, merge ya deja una capa por muestra; unir y re-dividir
# garantiza un split limpio por FOV sin importar el estado en que quede merge
merged <- JoinLayers(merged)
La fracción de cuentas que cae en barcodes “blank” estima la tasa de mala identificación de la decodificación. Reportamos el valor global y el rango entre FOVs.
blanks <- grep("BLANK|Blank|NegPrb|^Neg", rownames(merged), value = TRUE)
if (length(blanks)) {
cm0 <- LayerData(merged, assay = "MERFISH", layer = "counts")
fpr <- data.frame(sample = merged$sample,
blank = Matrix::colSums(cm0[blanks, , drop = FALSE]),
total = Matrix::colSums(cm0)) |>
dplyr::group_by(sample) |>
dplyr::summarise(blank_rate = sum(blank) / sum(total), .groups = "drop")
cat(sprintf("Tasa global de blanks: %.2f%%\n", 100 * sum(fpr$blank) / sum(fpr$total)))
print(summary(fpr$blank_rate))
} else {
message("No se conservaron barcodes blank; haría falta la decodificación cruda.")
}
## Tasa global de blanks: NaN%
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001461 0.001613 0.002689 0.031687 0.044594 0.185074
merged[["MERFISH"]] <- split(merged[["MERFISH"]], f = merged$sample) # una capa por FOV
keep_genes <- rownames(merged)[!grepl("^BLANK|Blank|NegPrb|^Neg", rownames(merged))]
merged <- subset(merged, features = keep_genes)
merged <- NormalizeData(merged)
VariableFeatures(merged) <- rownames(merged) # panel chico: usamos todos los genes
merged <- ScaleData(merged)
merged <- RunPCA(merged, npcs = n_pcs, verbose = FALSE)
merged <- IntegrateLayers(
merged, method = HarmonyIntegration,
orig.reduction = "pca", new.reduction = "harmony", verbose = FALSE)
merged <- FindNeighbors(merged, reduction = "harmony", dims = 1:n_pcs)
merged <- FindClusters(merged, resolution = 0.25)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 610
## Number of edges: 39682
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8971
## Number of communities: 3
## Elapsed time: 0 seconds
merged <- RunUMAP(merged, reduction = "harmony", dims = 1:n_pcs,
reduction.name = "umap.integrated")
merged <- JoinLayers(merged) # colapsa capas para el análisis posterior
Como son una sola línea celular, lo esperado es que los FOVs se mezclen tras la integración; aunque al ser fovs pequeños, estamos sujetos a ruido. Si un solo fov (o fovs de equipo) conformara un clúster, podríamos pensar que es efecto de lote, pero puesto que cada quien hizo el procesamiento independientemente, puede ser también variación biológica.
df <- as.data.frame(Embeddings(merged, "umap.integrated"))
colnames(df) <- c("UMAP_1", "UMAP_2")
df$sample <- merged$sample
df$cluster <- merged$seurat_clusters
bg <- df[, c("UMAP_1", "UMAP_2")] # sin 'sample' -> se dibuja en TODOS los paneles
ggplot() +
geom_point(data = bg, aes(UMAP_1, UMAP_2), color = "grey85", size = 0.4) +
geom_point(data = df, aes(UMAP_1, UMAP_2, color = cluster), size = 0.8) +
facet_wrap(~ sample, ncol = 4) +
guides(color = guide_legend(override.aes = list(size = 3))) +
labs(title = "UMAP por FOV (resto de las células en gris)") +
theme_minimal()
Comparamos la abundancia por gen (sumada sobre células de cada FOV) contra el benchmark publicado de MERFISH. Mostramos el resultado agregado de todos los FOVs con un solo valor de Spearman.
bench <- read.csv("https://d2nhj9g34unfro.cloudfront.net/MERFISH/benchmark_results.csv",
colClasses = c("barcode" = "character"))
bench_counts <- bench %>% group_by(gene) %>% summarise(benchmark = n(), .groups = "drop")
per_sample <- bind_rows(lapply(names(objs), function(s) {
g <- Matrix::rowSums(get_counts(objs[[s]]))
data.frame(gene = names(g), ours = as.numeric(g), sample = s)
})) %>%
filter(!grepl("^BLANK|^Blank|^NegPrb|^Neg_", gene))
comp <- inner_join(per_sample, bench_counts, by = "gene")
pooled <- comp %>% group_by(gene) %>%
summarise(benchmark = first(benchmark), ours = mean(ours), .groups = "drop")
rho <- cor(pooled$benchmark, pooled$ours, method = "spearman")
ggplot(pooled, aes(benchmark, ours)) +
geom_point(alpha = 0.5, size = 1.2, color = "#00BFC4") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
scale_x_log10() + scale_y_log10() +
labs(x = "Copias por gen (benchmark publicado)",
y = "Cuentas por gen (promedio entre FOVs)",
title = "Abundancia génica vs benchmark de MERFISH",
subtitle = sprintf("FOVs agrupados \u2014 Spearman \u03c1 = %.2f", rho)) +
theme_minimal()
U-2 OS es una línea clonal: no hay tipos celulares. La estructura interesante es la covariación entre genes, que refleja programas regulatorios compartidos (matriz extracelular, motilidad/citoesqueleto, proliferación).
ecm_genes <- c("FBN1","FBN2","COL5A1","COL7A1","TNC","VCAN","THBS1",
"KIAA1199","CEMIP","PAPPA","LRP1")
mot_genes <- c("MYH10","FLNA","FLNC","SPTAN1","SPTBN1","AFAP1","TLN1",
"CKAP5","IQGAP1","DYNC1H1","DOCK7","ROCK2","PRKCA","AMOTL1")
ecm_in <- intersect(ecm_genes, rownames(merged))
mot_in <- intersect(mot_genes, rownames(merged))
cat("ECM presentes: ", paste(ecm_in, collapse = ", "), "\n",
"Motilidad presentes:", paste(mot_in, collapse = ", "), "\n", sep = " ")
## ECM presentes: FBN1, FBN2, COL5A1, THBS1, CEMIP, PAPPA, LRP1
## Motilidad presentes: MYH10, SPTBN1, AFAP1, TLN1, CKAP5, DYNC1H1, PRKCA, AMOTL1
# matriz normalizada estilo Chen: cuentas totales, sin log, sin blanks
cm <- LayerData(merged, assay = "MERFISH", layer = "counts")
cm <- cm[!grepl("^Blank|BLANK|NegPrb|^Neg", rownames(cm)), ]
Xn <- sweep(as.matrix(cm), 2, Matrix::colSums(cm), "/")
Covariación restringida a los genes de motilidad y matriz extracelular
genes_use <- c(ecm_in, mot_in)
genes_use <- genes_use[apply(Xn[genes_use, , drop = FALSE], 1, var) > 0]
R <- cor(t(Xn[genes_use, , drop = FALSE]), method = "pearson")
anno <- data.frame(grupo = ifelse(genes_use %in% ecm_in, "ECM", "Motilidad"),
row.names = genes_use)
pheatmap(R, annotation_row = anno, annotation_col = anno,
color = colorRampPalette(c("#2166AC", "white", "#B2182B"))(100),
breaks = seq(-1, 1, length.out = 101),
main = "Correlación gen-gen: ECM vs citoesqueleto/motilidad")
El clúster 1 es medio dudoso o quizàs una transiciòn entre el 0 y el 2
Idents(merged) <- "seurat_clusters"
markers <- FindAllMarkers(merged, only.pos = TRUE,
min.pct = 0.1, logfc.threshold = 0.25)
top <- markers %>%
group_by(cluster) %>%
slice_max(avg_log2FC, n = 5) %>%
ungroup()
print(top[, c("cluster", "gene", "avg_log2FC", "pct.1", "pct.2", "p_val_adj")], n = Inf)
## # A tibble: 15 × 6
## cluster gene avg_log2FC pct.1 pct.2 p_val_adj
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 0 XDH 2.02 0.492 0.211 1.84e-12
## 2 0 UBR5 2.01 0.961 0.343 5.58e-57
## 3 0 NOTCH2 2.00 0.996 0.43 1.45e-64
## 4 0 USP9X 1.98 0.992 0.365 1.26e-62
## 5 0 ZNF592 1.98 0.992 0.351 2.87e-58
## 6 1 ANKH 1.51 0.451 0.862 5.87e- 8
## 7 1 PRDM2 0.894 0.614 0.911 4.40e- 3
## 8 1 VPS13D 0.776 0.38 0.627 9.83e- 3
## 9 1 AGO3 0.341 0.315 0.714 8.09e-11
## 10 1 PROSER1 0.306 0.543 0.878 7.81e- 3
## 11 2 DOPEY1 5.10 0.738 0.445 5.73e-19
## 12 2 DIEXF 3.87 0.953 0.571 1.75e-56
## 13 2 IL17RA 3.61 0.884 0.5 1.41e-31
## 14 2 MCF2L 3.52 0.692 0.253 4.84e-33
## 15 2 KIAA1462 3.45 0.942 0.47 1.60e-56
# make sure the chosen genes are in scale.data (after JoinLayers), then heatmap
merged <- ScaleData(merged, features = unique(top$gene), verbose = FALSE)
DoHeatmap(merged, features = unique(top$gene)) +
scale_fill_gradientn(colours = c("#2166AC", "white", "#B2182B")) +
theme(axis.text.y = element_text(size = 8))
## Marcadores sobre el UMAP integrado
FeaturePlot(merged, features = c("UBR5", "NOTCH2", "ANKH", "AGO3"), reduction = "umap.integrated",
order = TRUE)
## R version 4.6.0 (2026-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=German_Germany.utf8 LC_CTYPE=German_Germany.utf8
## [3] LC_MONETARY=German_Germany.utf8 LC_NUMERIC=C
## [5] LC_TIME=German_Germany.utf8
##
## time zone: America/Mexico_City
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] future_1.70.0 harmony_2.0.4 Rcpp_1.1.1-1.1 pheatmap_1.0.13
## [5] patchwork_1.3.2 ggplot2_4.0.3 dplyr_1.2.1 Matrix_1.7-5
## [9] Seurat_5.5.0 SeuratObject_5.4.0 sp_2.2-1
##
## loaded via a namespace (and not attached):
## [1] deldir_2.0-4 pbapply_1.7-4 gridExtra_2.3
## [4] rlang_1.2.0 magrittr_2.0.5 RcppAnnoy_0.0.23
## [7] otel_0.2.0 spatstat.geom_3.7-3 matrixStats_1.5.0
## [10] ggridges_0.5.7 compiler_4.6.0 png_0.1-9
## [13] vctrs_0.7.3 reshape2_1.4.5 stringr_1.6.0
## [16] pkgconfig_2.0.3 fastmap_1.2.0 labeling_0.4.3
## [19] utf8_1.2.6 promises_1.5.0 rmarkdown_2.31
## [22] purrr_1.2.2 xfun_0.57 cachem_1.1.0
## [25] jsonlite_2.0.0 goftest_1.2-3 later_1.4.8
## [28] spatstat.utils_3.2-2 irlba_2.3.7 parallel_4.6.0
## [31] cluster_2.1.8.2 R6_2.6.1 ica_1.0-3
## [34] spatstat.data_3.1-9 bslib_0.10.0 stringi_1.8.7
## [37] RColorBrewer_1.1-3 limma_3.67.3 reticulate_1.46.0
## [40] spatstat.univar_3.1-7 parallelly_1.47.0 lmtest_0.9-40
## [43] jquerylib_0.1.4 scattermore_1.2 knitr_1.51
## [46] tensor_1.5.1 future.apply_1.20.2 zoo_1.8-15
## [49] sctransform_0.4.3 httpuv_1.6.17 splines_4.6.0
## [52] igraph_2.3.1 tidyselect_1.2.1 abind_1.4-8
## [55] rstudioapi_0.18.0 dichromat_2.0-0.1 yaml_2.3.12
## [58] spatstat.random_3.4-5 codetools_0.2-20 miniUI_0.1.2
## [61] spatstat.explore_3.8-0 listenv_0.10.1 lattice_0.22-9
## [64] tibble_3.3.1 plyr_1.8.9 withr_3.0.2
## [67] shiny_1.13.0 S7_0.2.2 ROCR_1.0-12
## [70] evaluate_1.0.5 Rtsne_0.17 fastDummies_1.7.6
## [73] survival_3.8-6 polyclip_1.10-7 fitdistrplus_1.2-6
## [76] pillar_1.11.1 KernSmooth_2.23-26 plotly_4.12.0
## [79] generics_0.1.4 RcppHNSW_0.6.0 scales_1.4.0
## [82] globals_0.19.1 xtable_1.8-8 RhpcBLASctl_0.23-42
## [85] glue_1.8.1 lazyeval_0.2.3 tools_4.6.0
## [88] data.table_1.18.2.1 RSpectra_0.16-2 RANN_2.6.2
## [91] dotCall64_1.2 cowplot_1.2.0 grid_4.6.0
## [94] tidyr_1.3.2 nlme_3.1-169 presto_1.0.0
## [97] cli_3.6.6 spatstat.sparse_3.1-0 spam_2.11-3
## [100] viridisLite_0.4.3 uwot_0.2.4 gtable_0.3.6
## [103] sass_0.4.10 digest_0.6.39 progressr_0.19.0
## [106] ggrepel_0.9.8 htmlwidgets_1.6.4 farver_2.1.2
## [109] htmltools_0.5.9 lifecycle_1.0.5 httr_1.4.8
## [112] statmod_1.5.1 mime_0.13 MASS_7.3-65