Cada equipo o estudiante eligió un FOV al azar del experimento de MERFISH en células U-2 OS y entregó su objeto Seurat. No se repitieron los FOVs!. Acá los cargamos todos, hacemos control de calidad, los llevamos a un panel común, los integramos con Harmony e hicimos un análisis sencillo de covariación genética.

Carga y QC de las entregas

# accesor a matriz de conteos
get_counts <- function(obj, assay = DefaultAssay(obj)) {
  cm <- tryCatch(SeuratObject::LayerData(obj, assay = assay, layer = "counts"),
                 error = function(e) NULL)
  if (is.null(cm) || prod(dim(cm)) == 0)
    cm <- tryCatch(SeuratObject::GetAssayData(obj, assay = assay, slot = "counts"),
                   error = function(e) NULL)
  cm
}

# etiqueta por FOV sin nombres de equipos
fov_id <- function(f) {
  if (grepl("fov_?[0-9]+", f, ignore.case = TRUE))
    paste0("fov_", sub(".*?fov_?([0-9]+).*", "\\1", f, ignore.case = TRUE))
  else sub("\\.rds$", "", f, ignore.case = TRUE)
}

files <- list.files(pattern = "\\.rds$", ignore.case = TRUE)

objs <- list(); qc <- list()
for (f in files) {
  obj <- tryCatch(readRDS(f), error = function(e) NULL)
  if (!inherits(obj, "Seurat")) next
  s  <- fov_id(f)
  cm <- get_counts(obj)
  obj$sample <- s
  objs[[s]] <- obj
  qc[[s]] <- data.frame(sample = s, genes = nrow(obj), cells = ncol(obj),
                        has_counts = !is.null(cm),
                        total_counts = if (!is.null(cm)) sum(cm) else NA_real_)
}

qc_tbl <- bind_rows(qc) %>% arrange(sample)
qc_tbl
sample genes cells has_counts total_counts
fov_047 140 89 TRUE 281893
fov_057 140 27 TRUE 91186
fov_068 140 54 TRUE 214741
fov_111 140 87 TRUE 378174
fov_125 140 59 TRUE 185479
fov_279 140 31 TRUE 116244
fov_291 140 65 TRUE 333306
fov_333 140 67 TRUE 266848
fov_379 140 32 TRUE 217681
fov_403 140 20 TRUE 167223
fov_456 64 46 TRUE 12317
fov_457 140 33 TRUE 122654

Panel común e integración

Los objetos no traen exactamente el mismo set de genes (un equipo parece haber filtrado genes poco detectados).Rellenamos con ceros los genes en donde falten.

panel <- rownames(objs[[1]])

#Cero en los que faltan genes
pad_to_panel <- function(cm, panel) {
  miss <- setdiff(panel, rownames(cm))
  if (length(miss))
    cm <- rbind(cm, Matrix::Matrix(0, length(miss), ncol(cm), sparse = TRUE,
                                   dimnames = list(miss, colnames(cm))))
  cm[panel, , drop = FALSE]
}

clean <- list()
for (s in names(objs)) {
  cm <- pad_to_panel(get_counts(objs[[s]]), panel)
  o <- CreateSeuratObject(counts = cm, assay = "MERFISH", project = s)
  o$sample <- s
  clean[[s]] <- o
}

merged <- merge(clean[[1]], y = clean[-1], add.cell.ids = names(clean))
# con features idénticos, merge ya deja una capa por muestra; unir y re-dividir
# garantiza un split limpio por FOV sin importar el estado en que quede merge
merged <- JoinLayers(merged)

QC de decodificación: tasa de barcodes control

La fracción de cuentas que cae en barcodes “blank” estima la tasa de mala identificación de la decodificación. Reportamos el valor global y el rango entre FOVs.

blanks <- grep("BLANK|Blank|NegPrb|^Neg", rownames(merged), value = TRUE)
if (length(blanks)) {
  cm0 <- LayerData(merged, assay = "MERFISH", layer = "counts")
  fpr <- data.frame(sample = merged$sample,
                    blank  = Matrix::colSums(cm0[blanks, , drop = FALSE]),
                    total  = Matrix::colSums(cm0)) |>
    dplyr::group_by(sample) |>
    dplyr::summarise(blank_rate = sum(blank) / sum(total), .groups = "drop")

  cat(sprintf("Tasa global de blanks: %.2f%%\n", 100 * sum(fpr$blank) / sum(fpr$total)))
  print(summary(fpr$blank_rate))
} else {
  message("No se conservaron barcodes blank; haría falta la decodificación cruda.")
}
## Tasa global de blanks: NaN%
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001461 0.001613 0.002689 0.031687 0.044594 0.185074
merged[["MERFISH"]] <- split(merged[["MERFISH"]], f = merged$sample)  # una capa por FOV
keep_genes <- rownames(merged)[!grepl("^BLANK|Blank|NegPrb|^Neg", rownames(merged))]
merged <- subset(merged, features = keep_genes)
merged <- NormalizeData(merged)
VariableFeatures(merged) <- rownames(merged)   # panel chico: usamos todos los genes
merged <- ScaleData(merged)
merged <- RunPCA(merged, npcs = n_pcs, verbose = FALSE)

merged <- IntegrateLayers(
  merged, method = HarmonyIntegration,
  orig.reduction = "pca", new.reduction = "harmony", verbose = FALSE)

merged <- FindNeighbors(merged, reduction = "harmony", dims = 1:n_pcs)
merged <- FindClusters(merged, resolution = 0.25)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 610
## Number of edges: 39682
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8971
## Number of communities: 3
## Elapsed time: 0 seconds
merged <- RunUMAP(merged, reduction = "harmony", dims = 1:n_pcs,
                  reduction.name = "umap.integrated")
merged <- JoinLayers(merged)   # colapsa capas para el análisis posterior

Como son una sola línea celular, lo esperado es que los FOVs se mezclen tras la integración; aunque al ser fovs pequeños, estamos sujetos a ruido. Si un solo fov (o fovs de equipo) conformara un clúster, podríamos pensar que es efecto de lote, pero puesto que cada quien hizo el procesamiento independientemente, puede ser también variación biológica.

df <- as.data.frame(Embeddings(merged, "umap.integrated"))
colnames(df) <- c("UMAP_1", "UMAP_2")
df$sample  <- merged$sample
df$cluster <- merged$seurat_clusters

bg <- df[, c("UMAP_1", "UMAP_2")]   # sin 'sample' -> se dibuja en TODOS los paneles

ggplot() +
  geom_point(data = bg, aes(UMAP_1, UMAP_2), color = "grey85", size = 0.4) +
  geom_point(data = df, aes(UMAP_1, UMAP_2, color = cluster), size = 0.8) +
  facet_wrap(~ sample, ncol = 4) +
  guides(color = guide_legend(override.aes = list(size = 3))) +
  labs(title = "UMAP por FOV (resto de las células en gris)") +
  theme_minimal()

Abundancia génica vs benchmark publicado

Comparamos la abundancia por gen (sumada sobre células de cada FOV) contra el benchmark publicado de MERFISH. Mostramos el resultado agregado de todos los FOVs con un solo valor de Spearman.

bench <- read.csv("https://d2nhj9g34unfro.cloudfront.net/MERFISH/benchmark_results.csv",
                  colClasses = c("barcode" = "character"))
bench_counts <- bench %>% group_by(gene) %>% summarise(benchmark = n(), .groups = "drop")

per_sample <- bind_rows(lapply(names(objs), function(s) {
  g <- Matrix::rowSums(get_counts(objs[[s]]))
  data.frame(gene = names(g), ours = as.numeric(g), sample = s)
})) %>%
  filter(!grepl("^BLANK|^Blank|^NegPrb|^Neg_", gene))

comp   <- inner_join(per_sample, bench_counts, by = "gene")
pooled <- comp %>% group_by(gene) %>%
  summarise(benchmark = first(benchmark), ours = mean(ours), .groups = "drop")
rho <- cor(pooled$benchmark, pooled$ours, method = "spearman")

ggplot(pooled, aes(benchmark, ours)) +
  geom_point(alpha = 0.5, size = 1.2, color = "#00BFC4") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
  scale_x_log10() + scale_y_log10() +
  labs(x = "Copias por gen (benchmark publicado)",
       y = "Cuentas por gen (promedio entre FOVs)",
       title = "Abundancia génica vs benchmark de MERFISH",
       subtitle = sprintf("FOVs agrupados \u2014 Spearman \u03c1 = %.2f", rho)) +
  theme_minimal()

Covariación génica

U-2 OS es una línea clonal: no hay tipos celulares. La estructura interesante es la covariación entre genes, que refleja programas regulatorios compartidos (matriz extracelular, motilidad/citoesqueleto, proliferación).

Genes del panel presentes

ecm_genes <- c("FBN1","FBN2","COL5A1","COL7A1","TNC","VCAN","THBS1",
               "KIAA1199","CEMIP","PAPPA","LRP1")
mot_genes <- c("MYH10","FLNA","FLNC","SPTAN1","SPTBN1","AFAP1","TLN1",
               "CKAP5","IQGAP1","DYNC1H1","DOCK7","ROCK2","PRKCA","AMOTL1")

ecm_in <- intersect(ecm_genes, rownames(merged))
mot_in <- intersect(mot_genes, rownames(merged))
cat("ECM presentes:     ", paste(ecm_in, collapse = ", "), "\n",
    "Motilidad presentes:", paste(mot_in, collapse = ", "), "\n", sep = " ")
## ECM presentes:      FBN1, FBN2, COL5A1, THBS1, CEMIP, PAPPA, LRP1 
##  Motilidad presentes: MYH10, SPTBN1, AFAP1, TLN1, CKAP5, DYNC1H1, PRKCA, AMOTL1
# matriz normalizada estilo Chen: cuentas totales, sin log, sin blanks
cm <- LayerData(merged, assay = "MERFISH", layer = "counts")
cm <- cm[!grepl("^Blank|BLANK|NegPrb|^Neg", rownames(cm)), ]
Xn <- sweep(as.matrix(cm), 2, Matrix::colSums(cm), "/")

Covariación restringida a los genes de motilidad y matriz extracelular

genes_use <- c(ecm_in, mot_in)
genes_use <- genes_use[apply(Xn[genes_use, , drop = FALSE], 1, var) > 0]
R <- cor(t(Xn[genes_use, , drop = FALSE]), method = "pearson")
anno <- data.frame(grupo = ifelse(genes_use %in% ecm_in, "ECM", "Motilidad"),
                   row.names = genes_use)

pheatmap(R, annotation_row = anno, annotation_col = anno,
         color  = colorRampPalette(c("#2166AC", "white", "#B2182B"))(100),
         breaks = seq(-1, 1, length.out = 101),
         main   = "Correlación gen-gen: ECM vs citoesqueleto/motilidad")

Heatmap de marcadores

El clúster 1 es medio dudoso o quizàs una transiciòn entre el 0 y el 2

Idents(merged) <- "seurat_clusters"

markers <- FindAllMarkers(merged, only.pos = TRUE,
                          min.pct = 0.1, logfc.threshold = 0.25)

top <- markers %>%
  group_by(cluster) %>%
  slice_max(avg_log2FC, n = 5) %>%
  ungroup()
print(top[, c("cluster", "gene", "avg_log2FC", "pct.1", "pct.2", "p_val_adj")], n = Inf)
## # A tibble: 15 × 6
##    cluster gene     avg_log2FC pct.1 pct.2 p_val_adj
##    <fct>   <chr>         <dbl> <dbl> <dbl>     <dbl>
##  1 0       XDH           2.02  0.492 0.211  1.84e-12
##  2 0       UBR5          2.01  0.961 0.343  5.58e-57
##  3 0       NOTCH2        2.00  0.996 0.43   1.45e-64
##  4 0       USP9X         1.98  0.992 0.365  1.26e-62
##  5 0       ZNF592        1.98  0.992 0.351  2.87e-58
##  6 1       ANKH          1.51  0.451 0.862  5.87e- 8
##  7 1       PRDM2         0.894 0.614 0.911  4.40e- 3
##  8 1       VPS13D        0.776 0.38  0.627  9.83e- 3
##  9 1       AGO3          0.341 0.315 0.714  8.09e-11
## 10 1       PROSER1       0.306 0.543 0.878  7.81e- 3
## 11 2       DOPEY1        5.10  0.738 0.445  5.73e-19
## 12 2       DIEXF         3.87  0.953 0.571  1.75e-56
## 13 2       IL17RA        3.61  0.884 0.5    1.41e-31
## 14 2       MCF2L         3.52  0.692 0.253  4.84e-33
## 15 2       KIAA1462      3.45  0.942 0.47   1.60e-56
# make sure the chosen genes are in scale.data (after JoinLayers), then heatmap
merged <- ScaleData(merged, features = unique(top$gene), verbose = FALSE)

DoHeatmap(merged, features = unique(top$gene)) +
  scale_fill_gradientn(colours = c("#2166AC", "white", "#B2182B")) +
  theme(axis.text.y = element_text(size = 8))

## Marcadores sobre el UMAP integrado

FeaturePlot(merged, features = c("UBR5", "NOTCH2", "ANKH", "AGO3"), reduction = "umap.integrated",
            order = TRUE)

Notas

  • Con pocos cientos de células estos resultados son ilustrativos, no inferenciales
## R version 4.6.0 (2026-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=German_Germany.utf8  LC_CTYPE=German_Germany.utf8   
## [3] LC_MONETARY=German_Germany.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=German_Germany.utf8    
## 
## time zone: America/Mexico_City
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] future_1.70.0      harmony_2.0.4      Rcpp_1.1.1-1.1     pheatmap_1.0.13   
##  [5] patchwork_1.3.2    ggplot2_4.0.3      dplyr_1.2.1        Matrix_1.7-5      
##  [9] Seurat_5.5.0       SeuratObject_5.4.0 sp_2.2-1          
## 
## loaded via a namespace (and not attached):
##   [1] deldir_2.0-4           pbapply_1.7-4          gridExtra_2.3         
##   [4] rlang_1.2.0            magrittr_2.0.5         RcppAnnoy_0.0.23      
##   [7] otel_0.2.0             spatstat.geom_3.7-3    matrixStats_1.5.0     
##  [10] ggridges_0.5.7         compiler_4.6.0         png_0.1-9             
##  [13] vctrs_0.7.3            reshape2_1.4.5         stringr_1.6.0         
##  [16] pkgconfig_2.0.3        fastmap_1.2.0          labeling_0.4.3        
##  [19] utf8_1.2.6             promises_1.5.0         rmarkdown_2.31        
##  [22] purrr_1.2.2            xfun_0.57              cachem_1.1.0          
##  [25] jsonlite_2.0.0         goftest_1.2-3          later_1.4.8           
##  [28] spatstat.utils_3.2-2   irlba_2.3.7            parallel_4.6.0        
##  [31] cluster_2.1.8.2        R6_2.6.1               ica_1.0-3             
##  [34] spatstat.data_3.1-9    bslib_0.10.0           stringi_1.8.7         
##  [37] RColorBrewer_1.1-3     limma_3.67.3           reticulate_1.46.0     
##  [40] spatstat.univar_3.1-7  parallelly_1.47.0      lmtest_0.9-40         
##  [43] jquerylib_0.1.4        scattermore_1.2        knitr_1.51            
##  [46] tensor_1.5.1           future.apply_1.20.2    zoo_1.8-15            
##  [49] sctransform_0.4.3      httpuv_1.6.17          splines_4.6.0         
##  [52] igraph_2.3.1           tidyselect_1.2.1       abind_1.4-8           
##  [55] rstudioapi_0.18.0      dichromat_2.0-0.1      yaml_2.3.12           
##  [58] spatstat.random_3.4-5  codetools_0.2-20       miniUI_0.1.2          
##  [61] spatstat.explore_3.8-0 listenv_0.10.1         lattice_0.22-9        
##  [64] tibble_3.3.1           plyr_1.8.9             withr_3.0.2           
##  [67] shiny_1.13.0           S7_0.2.2               ROCR_1.0-12           
##  [70] evaluate_1.0.5         Rtsne_0.17             fastDummies_1.7.6     
##  [73] survival_3.8-6         polyclip_1.10-7        fitdistrplus_1.2-6    
##  [76] pillar_1.11.1          KernSmooth_2.23-26     plotly_4.12.0         
##  [79] generics_0.1.4         RcppHNSW_0.6.0         scales_1.4.0          
##  [82] globals_0.19.1         xtable_1.8-8           RhpcBLASctl_0.23-42   
##  [85] glue_1.8.1             lazyeval_0.2.3         tools_4.6.0           
##  [88] data.table_1.18.2.1    RSpectra_0.16-2        RANN_2.6.2            
##  [91] dotCall64_1.2          cowplot_1.2.0          grid_4.6.0            
##  [94] tidyr_1.3.2            nlme_3.1-169           presto_1.0.0          
##  [97] cli_3.6.6              spatstat.sparse_3.1-0  spam_2.11-3           
## [100] viridisLite_0.4.3      uwot_0.2.4             gtable_0.3.6          
## [103] sass_0.4.10            digest_0.6.39          progressr_0.19.0      
## [106] ggrepel_0.9.8          htmlwidgets_1.6.4      farver_2.1.2          
## [109] htmltools_0.5.9        lifecycle_1.0.5        httr_1.4.8            
## [112] statmod_1.5.1          mime_0.13              MASS_7.3-65