Install GAMclust package:
devtools::install_github("alserglab/GAMclust")
library(GAMclust)
library(gatom)
library(mwcsr)
library(fgsea)
library(data.table)
library(Seurat)
library(futile.logger)
set.seed(42)
First, please load and initialize all objects required for GAM-clustering analysis:
(1.1.) load KEGG metabolic
network network.kegg.rds
and its metabolites annotation met.kegg.db.rds
# KEGG network:
network <- readRDS(url("http://artyomovlab.wustl.edu/publications/supp_materials/GATOM/network.kegg.rds"))
metabolites.annotation <- readRDS(url("http://artyomovlab.wustl.edu/publications/supp_materials/GATOM/met.kegg.db.rds"))
(1.2.) or load combined metabolic network network.combined.rds,
its metabolites annotation met.combined.db.rds
and species-specific list of genes that either come from proteome or are
not linked to a specific enzyme gene2reaction.combined.mmu.eg.tsv
for mouse and gene2reaction.combined.hsa.eg.tsv
for human data;
# combined network (KEGG+Rhea+transport reactions):
network <- readRDS(url("http://artyomovlab.wustl.edu/publications/supp_materials/GATOM/network.combined.rds"))
metabolites.annotation <- readRDS(url("http://artyomovlab.wustl.edu/publications/supp_materials/GATOM/met.combined.db.rds"))
gene2reaction.extra <- data.table::fread("http://artyomovlab.wustl.edu/publications/supp_materials/GATOM/gene2reaction.combined.hsa.eg.tsv", colClasses="character")
org.Hs.eg.gatom.anno
for human data or org.Mm.eg.gatom.anno
for mouse data;network.annotation <- readRDS(url("http://artyomovlab.wustl.edu/publications/supp_materials/GATOM/org.Hs.eg.gatom.anno.rds"))
met.to.filter <- data.table::fread(system.file("mets2mask.lst", package="GAMclust"))$ID
(4.1.) we recommend to use here either heuristic relax-and-cut solver
rnc_solver from mwcsr
package,
solver <- mwcsr::rnc_solver()
(4.2.) either proprietary CPLEX solver (free for academy);
cplex.dir <- "/opt/ibm/ILOG/CPLEX_Studio1271"
solver <- mwcsr::virgo_solver(cplex_dir = cplex.dir)
work.dir <- "results"
dir.create(work.dir, showWarnings = F, recursive = T)
stats.dir <- file.path(work.dir, "stats")
dir.create(stats.dir, showWarnings = F, recursive = T)
setup_logger <- function(log.file.path, logger.name = "stats.logger") {
file.appender <- appender.file(log.file.path)
console.appender <- appender.console()
combined.appender <- function(line) {
file.appender(line)
console.appender(line)
}
flog.appender(combined.appender, name = logger.name)
flog.threshold(TRACE, name = logger.name)
}
log.file <- file.path(stats.dir, "log.txt")
setup_logger(log.file.path = log.file, logger.name = "stats.logger")
GAMclust works with bulk, single cell and spatial RNA-seq data.
This vignette shows how to process spatial RNA-seq data.
Let’s load the data and take 10,000 genes for the GAM-clustering analysis.
seurat_object <- readRDS(url("http://artyomovlab.wustl.edu/publications/supp_materials/GAMclust/275_T_seurat.rds"))
seurat_object <- Seurat::SCTransform(seurat_object,
assay = "Spatial",
variable.features.n = 10000,
verbose = FALSE)
E <- as.matrix(Seurat::GetAssayData(object = seurat_object,
assay = "SCT",
slot = "scale.data"))
nrow(E) # ! make sure this value is in range from 6,000 to 12,000
# [1] 10000
E[1:3, 1:3]
# AAACAAGTATCTCCCA-1 AAACACCAATAACTGC-1 AAACAGAGCGACTCCT-1
# NOC2L 1.249130 0.8296517 3.7173470
# ISG15 2.642288 -0.7594430 0.5659099
# AGRN -0.620876 -0.7318498 0.5300554
Genes in your dataset may be named as Symbol, Entrez, Ensembl or
RefSeq IDs. One of these names should be specified as value of
gene.id.type parameter in prepareData().
If you analyse singe cell or spatial RNA-seq data, please set
use.PCA=TRUE in prepareData().
E.prep <- prepareData(E = E,
gene.id.type = "Symbol",
use.PCA = TRUE,
use.PCA.n = 50,
network.annotation = network.annotation)
E.prep[1:3, 1:3]
# PC1 PC2 PC3
# 4504 6.326338 2.513886 0.5230565
# 22865 -1.635009 -2.005070 0.9369077
# 100129792 -1.437753 -1.059253 -1.0455837
The prepareNetwork() function defines the structure of
the final metabolic modules.
network.prep <- prepareNetwork(E = E.prep,
network = network,
topology = "metabolites",
met.to.filter = met.to.filter,
network.annotation = network.annotation,
gene2reaction.extra = gene2reaction.extra) # for combined network
# INFO [2026-01-24 15:14:06] Global metabolite network contains 5746 edges.
# INFO [2026-01-24 15:14:06] Largest connected component of this global network contains 1276 nodes and 4403 edges.
The preClustering() function defines initial patterns
using k-medoids clustering on gene expression matrix. It is strongly
recommended to do initial clustering with no less than 32 clusters
(initial.number.of.clusters = 32).
You can visualize the initial heatmap as shown below.
cur.centers <- preClustering(E.prep = E.prep,
network.prep = network.prep,
initial.number.of.clusters = 32,
network.annotation = network.annotation)
# INFO [2026-01-24 15:14:06] 1002 metabolic genes from the analysed dataset mapped to this component.
cur.centers[1:3, 1:3]
# PC1 PC2 PC3
# 1 0.08031134 -0.3373824 2.2391351
# 2 3.50078547 -0.7895749 3.3735482
# 3 -2.35233343 -1.4231047 -0.8383281
pheatmap::pheatmap(
GAMclust:::normalize.rows(cur.centers),
cluster_rows=F, cluster_cols=F,
show_rownames=F, show_colnames=T)
Now you have everything prepared for the GAM-clustering analysis.
Initial patterns will be now refined in an iterative process. The
output of gamClustering() function presents a set of
specific subnetworks (also called metabolic modules) that reflect
metabolic variability within a given transcriptional dataset.
Note, that it may take a long time to derive metabolic modules by
gamClustering() function (tens of minutes).
There is a set of parameters which determine the size and number of your final modules. We recommend you to start with the default settings, however you can adjust them based on your own preferences:
If you consider final modules to bee too small or too big and it
complicates interpretation for you, you can either increase or reduce by
10 units the max.module.size parameter.
If among final modules you consider presence of any modules with
too similar patterns, you can reduce by 0.1 units the
cor.threshold parameter.
If among final modules you consider presence of any uninformative
modules, you can reduce by 10 times the p.adj.val.threshold
parameter.
results <- gamClustering(E.prep = E.prep,
network.prep = network.prep,
cur.centers = cur.centers,
start.base = 0.5,
base.dec = 0.05,
max.module.size = 50,
cor.threshold = 0.5,
p.adj.val.threshold = 1e-5,
batch.solver = seq_batch_solver(solver),
work.dir = work.dir,
show.intermediate.clustering = FALSE,
verbose = FALSE,
collect.stats = TRUE)
# INFO [2026-01-24 15:14:07] GAM-CLUSTERING starts here.
# INFO [2026-01-24 15:14:07] [*] Iteration 1
# INFO [2026-01-24 15:15:19] [*] Iteration 2
# INFO [2026-01-24 15:16:34] >> max cor exceeded 0.5: 0.76
# INFO [2026-01-24 15:16:34] [*] Iteration 3
# INFO [2026-01-24 15:17:45] [*] Iteration 4
# INFO [2026-01-24 15:18:55] >> max cor exceeded 0.5: 0.73
# INFO [2026-01-24 15:18:55] [*] Iteration 5
# INFO [2026-01-24 15:20:00] [*] Iteration 6
# INFO [2026-01-24 15:21:05] >> max cor exceeded 0.5: 0.7
# INFO [2026-01-24 15:21:05] [*] Iteration 7
# INFO [2026-01-24 15:22:11] [*] Iteration 8
# INFO [2026-01-24 15:23:17] >> max cor exceeded 0.5: 0.66
# INFO [2026-01-24 15:23:17] [*] Iteration 9
# INFO [2026-01-24 15:24:23] [*] Iteration 10
# INFO [2026-01-24 15:25:28] >> max cor exceeded 0.5: 0.65
# INFO [2026-01-24 15:25:28] [*] Iteration 11
# INFO [2026-01-24 15:26:30] >> max cor exceeded 0.5: 0.64
# INFO [2026-01-24 15:26:30] [*] Iteration 12
# INFO [2026-01-24 15:27:28] [*] Iteration 13
# INFO [2026-01-24 15:28:28] >> max cor exceeded 0.5: 0.6
# INFO [2026-01-24 15:28:28] [*] Iteration 14
# INFO [2026-01-24 15:29:26] [*] Iteration 15
# INFO [2026-01-24 15:30:26] >> max cor exceeded 0.5: 0.59
# INFO [2026-01-24 15:30:26] [*] Iteration 16
# INFO [2026-01-24 15:31:22] [*] Iteration 17
# INFO [2026-01-24 15:32:18] >> max cor exceeded 0.5: 0.57
# INFO [2026-01-24 15:32:18] [*] Iteration 18
# INFO [2026-01-24 15:33:11] [*] Iteration 19
# INFO [2026-01-24 15:34:05] >> max cor exceeded 0.5: 0.54
# INFO [2026-01-24 15:34:05] [*] Iteration 20
# INFO [2026-01-24 15:34:56] [*] Iteration 21
# INFO [2026-01-24 15:35:48] >> max cor exceeded 0.5: 0.54
# INFO [2026-01-24 15:35:48] [*] Iteration 22
# INFO [2026-01-24 15:36:36] [*] Iteration 23
# INFO [2026-01-24 15:37:25] >> max cor exceeded 0.5: 0.53
# INFO [2026-01-24 15:37:25] [*] Iteration 24
# INFO [2026-01-24 15:38:12] [*] Iteration 25
# INFO [2026-01-24 15:38:58] >> max cor exceeded 0.5: 0.51
# INFO [2026-01-24 15:38:58] [*] Iteration 26
# INFO [2026-01-24 15:39:42] [*] Iteration 27
# INFO [2026-01-24 15:40:26] >> max cor exceeded 0.5: 0.5
# INFO [2026-01-24 15:40:26] [*] Iteration 28
# INFO [2026-01-24 15:41:06] [*] Iteration 29
# INFO [2026-01-24 15:41:47] >> max cor exceeded 0.5: 0.5
# INFO [2026-01-24 15:41:47] [*] Iteration 30
# INFO [2026-01-24 15:42:28] [*] Iteration 31
# INFO [2026-01-24 15:43:13] [*] Iteration 32
# INFO [2026-01-24 15:43:50] [*] Iteration 33
# INFO [2026-01-24 15:44:28] [*] Iteration 34
# INFO [2026-01-24 15:45:06] [*] Iteration 35
# INFO [2026-01-24 15:45:40] [*] Iteration 36
# INFO [2026-01-24 15:46:16] [*] Iteration 37
# INFO [2026-01-24 15:46:49] [*] Iteration 38
# INFO [2026-01-24 15:47:22] [*] Iteration 39
# INFO [2026-01-24 15:47:52] [*] Iteration 40
# INFO [2026-01-24 15:48:23] [*] Iteration 41
# INFO [2026-01-24 15:48:50] [*] Iteration 42
# INFO [2026-01-24 15:49:18] [*] Iteration 43
# INFO [2026-01-24 15:49:44] [*] Iteration 44
# INFO [2026-01-24 15:50:07] [*] Iteration 45
# INFO [2026-01-24 15:50:31] [*] Iteration 46
# INFO [2026-01-24 15:50:51] [*] Iteration 47
# INFO [2026-01-24 15:51:12] [*] Iteration 48
# INFO [2026-01-24 15:51:30] [*] Iteration 49
# INFO [2026-01-24 15:51:50] [*] Iteration 50
# INFO [2026-01-24 15:52:05] [*] Iteration 51
# INFO [2026-01-24 15:52:23] [*] Iteration 52
# INFO [2026-01-24 15:52:37] [*] Iteration 53
# INFO [2026-01-24 15:52:53] [*] Iteration 54
# INFO [2026-01-24 15:53:05] [*] Iteration 55
# INFO [2026-01-24 15:53:22] GAM-CLUSTERING ends here.
Each metabolic module is a connected piece of metabolic network whose genes expression is correlated across all dataset.
The following functions will help you to visualize and explore the obtained results.
getGraphs(modules = results$modules,
network.annotation = network.annotation,
metabolites.annotation = metabolites.annotation,
seed.for.layout = 42,
work.dir = work.dir)
# Graphs for module 1 are built
# Graphs for module 2 are built
# Graphs for module 3 are built
# Graphs for module 4 are built
Example of the graph of the fourth module:
The table contains gene list. Each gene has two descriptive values: i) gene’s correlation value with the modules pattern and ii) gene’s score. High score means that this gene’s expression is similar to the module’s pattern and not similar to other modules’ patterns.
m.gene.list <- getGeneTables(modules = results$modules,
nets = results$nets,
patterns = results$patterns.pos,
gene.exprs = E.prep,
network.annotation = network.annotation,
work.dir = work.dir)
# Gene tables for module 1 are produced
# Gene tables for module 2 are produced
# Gene tables for module 3 are produced
# Gene tables for module 4 are produced
Example of the gene table of the fourth module:
head(GAMclust:::read.tsv(file.path(work.dir, "m.4.genes.tsv"))) |>
kableExtra::kable() |>
kableExtra::kable_styling()
| symbol | Entrez | score | cor |
|---|---|---|---|
| SLC2A3 | 6515 | 3.313484 | 0.9539268 |
| SLC2A1 | 6513 | 2.705281 | 0.9306558 |
| EGLN3 | 112399 | 2.619330 | 0.9266640 |
| PGK1 | 5230 | 2.445532 | 0.9200964 |
| GLUL | 2752 | 2.456103 | 0.9196962 |
| GAPDH | 2597 | 2.481569 | 0.9178584 |
for(i in 1:length(m.gene.list)){
print(fgsea::plotCoregulationProfileSpatial(m.gene.list[[i]],
seurat_object,
title = paste0("module ", i)))
}
Seurat::DefaultAssay(seurat_object) <- "SCT"
i <- 4
Seurat::SpatialFeaturePlot(seurat_object,
features = m.gene.list[[i]],
pt.size.factor = 2, stroke = 0.01, alpha = 0.7,
ncol = 8)