report_file <- "report.parquet" # DIA-NN output (parquet)
annotation_file <- "joined_annotation.txt" # built from exported column namesLimpa Analysis all years
Overview
This notebook performs a protein-level differential expression analysis using limpa (DPC + limma workflow) and produces QC plots, volcano plots, and a publication-quality
Vignette = https://bioconductor.org/packages/release/bioc/vignettes/limpa/inst/doc/limpa.html
ComplexHeatmap
Code assembled from Dr Blythe Durbin Johnson example here
https://ucdavis-bioinformatics-training.github.io/limma-proteomics-analysis-August-2025/
Install & Load Packages
If packages are missing, install them. You can set install_pkgs <- FALSE to skip installation.
Paths & Working Directory
Set your working directory to where your DIA-NN parquet report lives.
Read DIA-NN (Peptide-level) and Filter
We read the peptide-level table and apply standard filters. These steps precede protein-level summarization.
report_file <- "report.parquet" # DIA-NN output (parquet)
annotation_file <- "joined_annotation.txt" # built from exported column names1) Read DIA-NN Parquet with FDR filters (as per limpa vignette)
dat <- readDIANN(
report_file,
format = "parquet",
q.columns = c("Q.Value","Lib.Q.Value","Lib.PG.Q.Value"),
q.cutoffs = 0.01
)Length of q-value columns does not match with length of q-value cutoffs. Using q.cutoffs[1] for all columns.
(Optional) filter helpers suggested by the vignette to filter the peptides identified
dat <- filterNonProteotypicPeptides(dat)
dat <- filterCompoundProteins(dat)
dat <- filterSingletonPeptides(dat, min.n.peptides = 2) # optionalExport column names to help build the annotation file (optional)
write.csv2(colnames(dat$E), "colnames.txt")Data-Driven Protein Clustering (DPC) and Protein Quantification.
dpcfit <- dpc(dat)
plotDPC(dpcfit)Protein-level summarized expression (log2-scale). This can take a while!
y.protein <- dpcQuant(dat, "Protein.Group", dpc = dpcfit)Estimating hyperparameters ...
Quantifying proteins ...
Proteins: 1000 Peptides: 21495
Proteins: 2000 Peptides: 43154
Proteins: 3000 Peptides: 62166
Proteins: 4000 Peptides: 79617
Proteins: 5000 Peptides: 99034
Proteins: 5999 Peptides: 118465
Export Protein log2 expression data from y.protein fromthe dpcQuant function
# 1. Extract the Expression Matrix (Log2 abundance)
# This is the data used for the heatmap and PCA/MDS plots
quant_data <- as.data.frame(y.protein$E)
# 2. (Optional) Add Protein Metadata
# If your object has gene names attached, merge them in.
# y.protein$genes typically holds the ID mappings (Protein.Group, Gene Name, etc.)
if (!is.null(y.protein$genes)) {
quant_data <- cbind(y.protein$genes, quant_data)
} else {
# If no genes slot, just make the Protein ID the first column
quant_data <- tibble::rownames_to_column(quant_data, "Protein.Group")
}
# 3. Save to a TSV or CSV file
readr::write_tsv(quant_data, "limpa_protein_quantification_matrix.tsv")Sample Annotation (Targets), Design Matrix
Ensure the sample-level metadata aligns exactly with the columns of y.protein$E
QC: MDS Plots
Two MDS plots colored by group and by year these should be plots that incorporate the standard error information from limpa (labels hidden for clarity).
# By sample prep group
Group <- factor(targets$group)
Group.color <- Group
levels(Group.color) <- c("blue","red","green")
plotMDSUsingSEs(y.protein, pch = 16, col = as.character(Group.color),
main = "MDS by Sample Prep Group", cex = 1, gene.selection = "common")# By year
Group <- factor(targets$year)
Group.color <- Group
levels(Group.color) <- c("blue","red","green")
plotMDSUsingSEs(y.protein, pch = 16, col = as.character(Group.color),
main = "MDS by Year", cex = 1, gene.selection = "common")# By Student
# Automatically assign a unique color to each student
Group <- factor(targets$student)
Group.color <- rainbow(length(levels(Group)))[Group]
plotMDSUsingSEs(y.protein,
pch = 16,
col = Group.color,
main = "MDS by Student",
cex = 1,
gene.selection = "common")
legend("topright", legend = levels(Group), col = rainbow(length(levels(Group))), pch = 16, cex = 0.8)Contrasts & Differential Expression
Define contrasts relative to the reference (beads), then run limma with eBayes. This also saves the de output as tables to look at later
cont <- makeContrasts(
urea_vs_beads = groupurea, # compares urea to beads (reference)
Strap_vs_beads = groupS.trap, # compares S-trap to beads (reference)
Strap_vs_urea = groupS.trap - groupurea, # compares S-trap to urea
levels = design
)Renaming (Intercept) to Intercept
fit <- dpcDE(y.protein, design, plot = TRUE)Coefficients not estimable: studentXA
Warning: Partial NA coefficients for 5999 probe(s)
Coefficients not estimable: studentXA
Coefficients not estimable: studentXA
Warning: Partial NA coefficients for 5999 probe(s)
Coefficients not estimable: studentXA
fit <- contrasts.fit(fit, cont)
fit <- eBayes(fit)
# Save complete tables
readr::write_tsv(as.data.frame(topTable(fit, coef = "urea_vs_beads", number = Inf)),
"limpa.DE_urea_vs_beads.tsv")
readr::write_tsv(as.data.frame(topTable(fit, coef = "Strap_vs_beads", number = Inf)),
"limpa.DE_Strap_vs_beads.tsv")
readr::write_tsv(as.data.frame(topTable(fit, coef = "Strap_vs_urea", number = Inf)),
"limpa.DE_Strap_vs_urea.tsv")Funsctions to make all volcano Plots and complex Heatmaps. This was done by chatGPT
# Use CONSTANT names for global defaults to avoid lazy-eval recursion
LFC_THR_DEFAULT <- 0.58
FDR_THR_DEFAULT <- 0.05
LABEL_TOP_N_DEFAULT <- 15
HEATMAP_TOP_N_DEFAULT <- 20
.make_labels <- function(df) {
if ("Genes" %in% names(df)) {
ifelse(is.na(df$Genes) | df$Genes == "", df$Protein.Group, df$Genes)
} else df$Protein.Group
}
.classify_de <- function(df, lfc_thr, fdr_thr) {
df$status <- "Not DE"
df$status[df$adj.P.Val < fdr_thr & df$logFC >= lfc_thr] <- "Up"
df$status[df$adj.P.Val < fdr_thr & df$logFC <= -lfc_thr] <- "Down"
df$status <- factor(df$status, levels = c("Down","Not DE","Up"))
df
}
.group_colors <- function(groups) {
base <- c(
beads = "#1f77b4", urea = "#d62728",
STrap = "#2ca02c", "S.trap" = "#2ca02c", Strap = "#2ca02c"
)
u <- unique(groups)
extra <- setdiff(u, names(base))
if (length(extra) > 0) c(base, setNames(rainbow(length(extra)), extra)) else base
}
.zscore_rows <- function(mat) t(apply(mat, 1, function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)))
# ---- Core function (defaults now point to CONSTANTS) ----
run_comparison <- function(fit, coef_name, y.protein, targets,
lfc_thr = LFC_THR_DEFAULT,
fdr_thr = FDR_THR_DEFAULT,
label_top_n = LABEL_TOP_N_DEFAULT,
heatmap_top_n = HEATMAP_TOP_N_DEFAULT,
title_prefix = NULL) {
de <- as.data.frame(topTable(fit, coef = coef_name, number = Inf))
de$Protein.Group <- rownames(de)
de <- .classify_de(de, lfc_thr, fdr_thr)
n_up <- sum(de$status == "Up")
n_down <- sum(de$status == "Down")
cat(sprintf("[%s] DE (FDR < %.2f & |log2FC| >= %.2f): %d (Up: %d, Down: %d)\n",
coef_name, fdr_thr, lfc_thr, n_up + n_down, n_up, n_down))
de$label_pref <- .make_labels(de)
lab_df <- subset(de, status != "Not DE")
lab_df <- lab_df[order(lab_df$P.Value), ][seq_len(min(label_top_n, nrow(lab_df))), ]
volcano_title <- if (is.null(title_prefix)) paste0(coef_name, ": Volcano") else paste0(title_prefix, ": Volcano")
p_volcano <- ggplot(de, aes(x = logFC, y = -log10(P.Value), color = status)) +
geom_point(alpha = 0.8) +
scale_color_manual(values = c("Down" = "royalblue3", "Not DE" = "grey70", "Up" = "firebrick3")) +
geom_vline(xintercept = c(-lfc_thr, lfc_thr), linetype = "dashed") +
geom_hline(yintercept = -log10(fdr_thr), linetype = "dashed") +
geom_text_repel(data = lab_df, aes(label = label_pref), max.overlaps = Inf,
box.padding = 0.4, size = 3) +
theme_bw() +
labs(
title = volcano_title,
subtitle = sprintf("Criteria: FDR < %.2f and |log2FC| ≥ %.2f", fdr_thr, lfc_thr),
x = "log2 Fold Change",
y = "-log10(P-value)",
color = NULL
)
print(p_volcano)
de_set <- subset(de, status != "Not DE")
sel_ids <- if (nrow(de_set) >= 1) head(de_set[order(de_set$P.Value), "Protein.Group"], heatmap_top_n)
else head(de$Protein.Group, heatmap_top_n)
plotmat <- y.protein$E[sel_ids, , drop = FALSE]
plotmat_z <- .zscore_rows(plotmat)
grp <- as.character(targets$group)
yr <- factor(targets$year)
col_map <- .group_colors(grp)
col_ha <- HeatmapAnnotation(
group = grp,
year = yr,
col = list(
group = col_map,
year = setNames(rainbow(length(levels(yr))), levels(yr))
),
annotation_name_side = "left"
)
gene_map <- if ("Genes" %in% names(de)) setNames(de$Genes, de$Protein.Group) else NULL
if (!is.null(gene_map)) {
row_labels <- gene_map[sel_ids]
row_labels[is.na(row_labels) | row_labels == ""] <- sel_ids[is.na(row_labels) | row_labels == ""]
} else row_labels <- sel_ids
ht_title <- if (is.null(title_prefix)) paste0("Top ", min(heatmap_top_n, nrow(plotmat_z)), " DE (", coef_name, ")")
else paste0("Top ", min(heatmap_top_n, nrow(plotmat_z)), " DE (", title_prefix, ")")
print(
Heatmap(
plotmat_z,
name = "z-score",
top_annotation = col_ha,
show_column_names = FALSE,
row_labels = row_labels,
cluster_rows = TRUE,
cluster_columns = TRUE,
column_title = ht_title,
heatmap_legend_param = list(title = "z")
)
)
invisible(list(table = de, ids = sel_ids))
}Now we actually make them
# Urea vs Beads
res_ub <- run_comparison(
fit, "urea_vs_beads", y.protein, targets,
title_prefix = "Urea vs Beads"
)[urea_vs_beads] DE (FDR < 0.05 & |log2FC| >= 0.58): 1808 (Up: 1605, Down: 203)
# S-trap vs Beads
res_sb <- run_comparison(
fit, "Strap_vs_beads", y.protein, targets,
title_prefix = "S-trap vs Beads"
)[Strap_vs_beads] DE (FDR < 0.05 & |log2FC| >= 0.58): 1306 (Up: 907, Down: 399)
# S-trap vs Urea
res_su <- run_comparison(
fit, "Strap_vs_urea", y.protein, targets,
title_prefix = "S-trap vs Urea"
)[Strap_vs_urea] DE (FDR < 0.05 & |log2FC| >= 0.58): 1969 (Up: 619, Down: 1350)
Now lets see what variables affected the results the most using Partition Variance.
Hopefully its the Sample prep (group) and not the Year or Student
# Transpose expression matrix: samples as rows
expr_t <- t(y.protein$E)
# Make sure factors are properly encoded
targets$group <- factor(targets$group)
targets$student <- factor(targets$student)
targets$year <- factor(targets$year)
# Partition variance across 3 factors
var_part <- varpart(expr_t,
~ group,
~ student,
~ year,
data = targets)Warning: collinearity detected in cbind(X2,X3): mm = 100, m = 99
Warning: collinearity detected in cbind(X1,X2,X3): mm = 102, m = 101
# Visualize
plot(var_part,
bg = c("red", "blue", "green"),
Xnames = c("Group", "Student", "Year"),
id.size = 0.8,
cex = 1.1,
main = "Variance Partitioning: Group, Student, Year")Warning in mtext(...): "id.size" is not a graphical parameter
I think this is good, although I’m not sure if this is a high residual
Calculate %CV over year and condition
## -----------------------------------------------------------------------------
## Precision Analysis: %CV by Method and Year (External Metadata Fix)
## -----------------------------------------------------------------------------
library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
# 1. Prepare Data
# Check if y.protein exists
if (!exists("y.protein")) stop("Object 'y.protein' not found.")
# Extract Log2 Intensity Matrix and convert to Linear
log_mat <- y.protein$E
lin_mat <- 2^log_mat
# --- THE FIX: Load Annotation Externally ---
# We do not rely on y.protein$targets. We load the file directly.
# Make sure "joined_annotation.txt" is in your working directory.
annot_file <- "joined_annotation.txt"
if (file.exists(annot_file)) {
annot <- read_tsv(annot_file, show_col_types = FALSE)
} else {
stop("Could not find 'joined_annotation.txt'. Please ensure the file is in your folder.")
}
# Clean up annotation formats
annot$sample.name <- trimws(as.character(annot$sample.name))
annot$group <- tolower(trimws(as.character(annot$group)))
annot$year <- as.factor(trimws(as.character(annot$year)))
# 2. Match Data Columns to Annotation Rows
# This ensures we only analyze samples that exist in both the matrix and the file
# and that they are in the exact same order.
common_samples <- intersect(colnames(lin_mat), annot$sample.name)
if (length(common_samples) == 0) {
stop("No matching sample names found between y.protein and joined_annotation.txt. Check naming conventions.")
}
# Subset and Reorder Matrix to match common samples
lin_mat_subset <- lin_mat[, common_samples]
# Subset and Reorder Annotation to match common samples
annot_subset <- annot %>%
filter(sample.name %in% common_samples) %>%
arrange(match(sample.name, colnames(lin_mat_subset)))
# Double check alignment
if (!all(annot_subset$sample.name == colnames(lin_mat_subset))) {
stop("Critical Error: Sample alignment failed. Please check your data.")
}
# 3. Calculate CV per Group per Year
cv_results <- list()
combinations <- annot_subset %>% dplyr::distinct(group, year)
for (i in 1:nrow(combinations)) {
g <- combinations$group[i]
y <- combinations$year[i]
# Identify sample INDICES for this specific combo
# We use the subsetted annotation which is now perfectly aligned with lin_mat_subset
idx <- which(annot_subset$group == g & annot_subset$year == y)
if (length(idx) > 1) {
sub_dat <- lin_mat_subset[, idx, drop = FALSE]
# Calculate Stats
means <- rowMeans(sub_dat, na.rm = TRUE)
sds <- apply(sub_dat, 1, sd, na.rm = TRUE)
# Calculate %CV
cvs <- (sds / means) * 100
label <- paste(g, y, sep = "_")
cv_results[[label]] <- data.frame(
Protein = names(cvs),
CV = cvs,
Method = g,
Year = y,
stringsAsFactors = FALSE
)
}
}
# Combine results
plot_df <- dplyr::bind_rows(cv_results)
# Clean up Method names
plot_df$Method <- factor(plot_df$Method,
levels = c("s-trap", "urea", "beads"),
labels = c("S-Trap", "Urea", "Beads"))
# Filter outlier CVs
viz_df <- plot_df %>% filter(CV < 150)
# 4. Generate the Violin Plot
p_cv_year <- ggplot(viz_df, aes(x = Year, y = CV, fill = Method)) +
geom_violin(scale = "width", position = position_dodge(width = 0.8), trim = TRUE, alpha = 0.8) +
# --- CORRECTED BOXPLOT SECTION ---
geom_boxplot(
aes(group = interaction(Year, Method)), # Force ggplot to calculate 3 separate boxplots
width = 0.1,
position = position_dodge(width = 0.8),
outlier.shape = NA,
fill = "white",
alpha = 0.5
) +
# ---------------------------------
geom_hline(yintercept = 20, linetype = "dashed", color = "red", linewidth = 0.8) +
annotate("text", x = 0.6, y = 22, label = "Ideal (<20%)", color = "red", fontface = "bold", hjust = 0) +
scale_fill_manual(values = c("S-Trap" = "lightgreen", "Urea" = "salmon", "Beads" = "skyblue")) +
labs(
title = "Precision Analysis: %CV Distribution by Year and Method",
subtitle = "Comparing consistency across three student cohorts (2023-2025)",
y = "Coefficient of Variation (%CV)",
x = "Academic Year"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
legend.position = "top"
)
print(p_cv_year)# 5. Summary Table
summary_table <- plot_df %>%
group_by(Year, Method) %>%
summarise(
Median_CV = median(CV, na.rm = TRUE),
Pass_Rate_20 = sum(CV < 20, na.rm = TRUE) / n() * 100
) %>%
arrange(Year, Median_CV)`summarise()` has grouped output by 'Year'. You can override using the
`.groups` argument.
print(summary_table)# A tibble: 9 × 4
# Groups: Year [3]
Year Method Median_CV Pass_Rate_20
<fct> <fct> <dbl> <dbl>
1 2023 S-Trap 20.6 47.7
2 2023 Urea 37.4 9.27
3 2023 Beads 47.5 5.63
4 2024 Urea 23.0 41.2
5 2024 S-Trap 25.9 14.9
6 2024 Beads 69.7 9.05
7 2025 S-Trap 25.5 27.5
8 2025 Beads 35.4 11.6
9 2025 Urea 41.3 17.0
Let’s do some Gene Ontology, Pathway and enrichment analysis
library(clusterProfiler)Warning: package 'clusterProfiler' was built under R version 4.5.2
clusterProfiler v4.18.4 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
Please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
enrichment tool for interpreting omics data. The Innovation. 2021,
2(3):100141
Attaching package: 'clusterProfiler'
The following object is masked from 'package:stats':
filter
library(org.Mm.eg.db) # <--- This is for MouseLoading required package: AnnotationDbi
Warning: package 'AnnotationDbi' was built under R version 4.5.2
Loading required package: stats4
Loading required package: BiocGenerics
Warning: package 'BiocGenerics' was built under R version 4.5.2
Loading required package: generics
Attaching package: 'generics'
The following object is masked from 'package:dplyr':
explain
The following objects are masked from 'package:base':
as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
setequal, union
Attaching package: 'BiocGenerics'
The following object is masked from 'package:arrow':
type
The following object is masked from 'package:dplyr':
combine
The following object is masked from 'package:limma':
plotMA
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
unsplit, which.max, which.min
Loading required package: Biobase
Warning: package 'Biobase' was built under R version 4.5.2
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: IRanges
Warning: package 'IRanges' was built under R version 4.5.2
Loading required package: S4Vectors
Warning: package 'S4Vectors' was built under R version 4.5.2
Attaching package: 'S4Vectors'
The following object is masked from 'package:clusterProfiler':
rename
The following object is masked from 'package:tidyr':
expand
The following objects are masked from 'package:dplyr':
first, rename
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
expand.grid, I, unname
Attaching package: 'IRanges'
The following object is masked from 'package:clusterProfiler':
slice
The following objects are masked from 'package:dplyr':
collapse, desc, slice
The following object is masked from 'package:grDevices':
windows
Attaching package: 'AnnotationDbi'
The following object is masked from 'package:clusterProfiler':
select
The following object is masked from 'package:dplyr':
select
library(enrichplot)Warning: package 'enrichplot' was built under R version 4.5.2
enrichplot v1.30.4 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
Please cite:
Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin
Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring
epigenomic datasets by ChIPseeker. Current Protocols. 2022, 2(10): e585
library(ggplot2)
run_mouse_enrichment <- function(de_table, comparison_name) {
# 1. Clean IDs: Map Uniprot (Protein.Group) to Entrez ID using Mouse DB
# We use BITR (Biological Id TRanslator)
ids <- bitr(de_table$Protein.Group,
fromType = "UNIPROT",
toType = c("ENTREZID", "SYMBOL"),
OrgDb = org.Mm.eg.db) # Mouse DB
# Merge IDs back with your stats (logFC, P.Value)
merged <- merge(de_table, ids, by.x = "Protein.Group", by.y = "UNIPROT")
# 2. Extract gene lists
# Significant Up-regulated (using the thresholds you set in limpa)
gene_up <- merged$ENTREZID[merged$status == "Up"]
# Significant Down-regulated
gene_down <- merged$ENTREZID[merged$status == "Down"]
# 3. Run GO Enrichment (Biological Process) for UP-regulated
# We check if there are enough genes to run it
if(length(gene_up) > 5) {
cat(paste0("\nRunning GO Enrichment for UP-regulated in: ", comparison_name, "\n"))
go_up <- enrichGO(gene = gene_up,
OrgDb = org.Mm.eg.db,
ont = "BP", # Biological Process
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE) # Maps IDs back to Gene Symbols
# 4. Plots
if (!is.null(go_up) && nrow(go_up) > 0) {
print(dotplot(go_up, showCategory=15, title=paste(comparison_name, "- Up Regulated Pathways")))
# Cnetplot (Network view) - connects genes to terms
# We need a named vector of fold changes for the coloring
fold_changes <- merged$logFC
names(fold_changes) <- merged$SYMBOL
print(cnetplot(go_up, categorySize="pvalue", foldChange=fold_changes, circular = FALSE))
} else {
cat("No significant GO terms found for Up-regulated genes.\n")
}
}
}
# --- Run the function on your specific contrasts ---
# 1. Urea vs Beads
run_mouse_enrichment(res_ub$table, "Urea vs Beads")'select()' returned 1:many mapping between keys and columns
Warning in bitr(de_table$Protein.Group, fromType = "UNIPROT", toType =
c("ENTREZID", : 2.75% of input gene IDs are fail to map...
Running GO Enrichment for UP-regulated in: Urea vs Beads
Warning: ggrepel: 56 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
# 2. S-trap vs Beads
run_mouse_enrichment(res_sb$table, "S-trap vs Beads")'select()' returned 1:many mapping between keys and columns
Warning in bitr(de_table$Protein.Group, fromType = "UNIPROT", toType =
c("ENTREZID", : 2.75% of input gene IDs are fail to map...
Running GO Enrichment for UP-regulated in: S-trap vs Beads
Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
# 3. S-trap vs Urea
run_mouse_enrichment(res_su$table, "S-trap vs Urea")'select()' returned 1:many mapping between keys and columns
Warning in bitr(de_table$Protein.Group, fromType = "UNIPROT", toType =
c("ENTREZID", : 2.75% of input gene IDs are fail to map...
Running GO Enrichment for UP-regulated in: S-trap vs Urea
Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
From Gemini AI v3 for this plot
This is a Gene-Concept Network plot (cnetplot). It is one of the most useful plots for interpreting omics data because it connects your biological “themes” (the big beige nodes) directly to the specific proteins driving them (the small blue nodes).
Here is how to read it, layer by layer:
1. The Big Beige Nodes (The “Concepts”)
These represent the GO Terms (Pathways) that were found to be statistically significant.
Labels: You can see terms like “phospholipid metabolic process” (top) and “organic anion transport” (bottom).
Size: The size of the beige circle represents the number of proteins from your dataset that fall into that pathway.
Takeaway: “Phospholipid metabolism” and “Transport” are the two dominant biological themes in this specific comparison.
2. The Small Blue Nodes (The Proteins)
These are the actual individual proteins identified in your mass spec experiment.
Labels: These are gene symbols (e.g.,
Slc16a1,Pik3ca).Slc...family: You see a massive cluster ofSlcgenes at the bottom. These are Solute Carriers. They are transmembrane proteins responsible for transporting things in/out of cells.Pik3.../Agpat...: These are enzymes involved in lipid signaling and synthesis.
Color (The Fold Change): The legend on the right shows a blue scale for “fold change” ranging from ~1.0 to 2.0.
Since all values are positive, these proteins are all UP-REGULATED.
Darker blue dots (like
Slc16a1) increased in abundance more than lighter blue dots.
3. The Connections (The Lines)
The grey lines show membership. If a blue gene node is connected to a beige pathway node, it means that gene is part of that pathway.
Visual Grouping: The plot automatically clusters related pathways together.
Top Cluster: Everything related to making or modifying lipids/fats.
Bottom Cluster: Everything related to membrane transport.
4. The “Biological Story” (The Interpretation)
Combining what we know about your experiment (S-trap/Urea vs. Beads) with this plot, the interpretation is likely technical:
“The S-trap (or Urea) method is significantly better at extracting Membrane Proteins than the Beads.”
Evidence: Look at the bottom cluster. It is full of
Slcproteins.Slcproteins are integral membrane proteins. They are notoriously hydrophobic and difficult to extract.Conclusion: If this is the “S-trap vs Beads” comparison, this plot proves that the S-trap’s use of SDS (a strong detergent) successfully solubilized these difficult membrane transporters, whereas the Beads (which often use weaker buffers) failed to capture them as effectively.
In summary: You aren’t just seeing “biology” here; you are seeing the efficiency of your lysis buffer. The extraction method you used for this group successfully ripped open the membranes and released the transporters and lipid-associated proteins.
Gene Set Enrichment Analysis (GSEA)
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(dplyr)
# Updated Function to run GSEA (KEGG) with robustness fixes
run_mouse_gsea <- function(de_table, comparison_name) {
message(paste0("Preparing GSEA for: ", comparison_name))
# 1. Clean and Map IDs
ids <- bitr(de_table$Protein.Group,
fromType = "UNIPROT",
toType = "ENTREZID",
OrgDb = org.Mm.eg.db)
merged <- merge(de_table, ids, by.x = "Protein.Group", by.y = "UNIPROT")
# 2. Handle Duplicates & Ties
merged_unique <- merged %>%
arrange(desc(abs(logFC))) %>%
distinct(ENTREZID, .keep_all = TRUE)
# Extract numeric vector
gene_list <- merged_unique$logFC
names(gene_list) <- merged_unique$ENTREZID
# FIX TIES: Add a tiny random jitter (1e-10) to break ties
# This prevents the "ties in preranked stats" warning
set.seed(123)
gene_list <- gene_list + runif(length(gene_list), -1e-10, 1e-10)
# Sort descending (Critical for GSEA)
gene_list <- sort(gene_list, decreasing = TRUE)
# 3. Run gseKEGG
# eps=0 fixes the "P-values likely overestimated" warning
gsea_res <- tryCatch({
gseKEGG(geneList = gene_list,
organism = 'mmu',
pvalueCutoff = 0.05,
verbose = FALSE,
eps = 0, # Better precision for low p-values
seed = TRUE) # reproducible results
}, error = function(e) {
message(paste("GSEA failed for", comparison_name, ":", e$message))
return(NULL)
})
# 4. Visualization
if (!is.null(gsea_res) && nrow(gsea_res) > 0) {
# A. Dotplot
print(dotplot(gsea_res, showCategory=10, split=".sign") +
facet_grid(.~.sign) +
ggtitle(paste(comparison_name, "- GSEA Overview")))
# B. GSEA Plot for the #1 pathway
top_pathway_name <- gsea_res$Description[1]
print(gseaplot2(gsea_res, geneSetID = 1, title = paste(comparison_name, "-", top_pathway_name)))
# C. Ridgeplot (Only if package exists)
if (requireNamespace("ggridges", quietly = TRUE)) {
print(ridgeplot(gsea_res) + labs(x = "log2 Fold Change") + ggtitle(paste(comparison_name, "- Ridgeplot")))
} else {
message("Skipping ridgeplot: 'ggridges' package not installed.")
}
return(gsea_res)
} else {
message(paste("No significant GSEA pathways found for", comparison_name))
return(NULL)
}
}Run the Analysis
# 1. Urea vs Beads GSEA
gsea_ub <- run_mouse_gsea(res_ub$table, "Urea vs Beads")Preparing GSEA for: Urea vs Beads
'select()' returned 1:many mapping between keys and columns
Warning in bitr(de_table$Protein.Group, fromType = "UNIPROT", toType =
"ENTREZID", : 2.75% of input gene IDs are fail to map...
Reading KEGG annotation online: "https://rest.kegg.jp/link/mmu/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/mmu"...
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : There were 4 pathways for which P-values were not calculated
properly due to unbalanced (positive and negative) gene-level statistic values.
For such pathways pval, padj, NES, log2err are set to NA. You can try to
increase the value of the argument nPermSimple (for example set it nPermSimple
= 10000)
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
Picking joint bandwidth of 0.148
# 2. S-trap vs Beads GSEA
gsea_sb <- run_mouse_gsea(res_sb$table, "S-trap vs Beads")Preparing GSEA for: S-trap vs Beads
'select()' returned 1:many mapping between keys and columns
Warning in bitr(de_table$Protein.Group, fromType = "UNIPROT", toType =
"ENTREZID", : 2.75% of input gene IDs are fail to map...
Picking joint bandwidth of 0.139
# 3. S-trap vs Urea GSEA
gsea_su <- run_mouse_gsea(res_su$table, "S-trap vs Urea")Preparing GSEA for: S-trap vs Urea
'select()' returned 1:many mapping between keys and columns
Warning in bitr(de_table$Protein.Group, fromType = "UNIPROT", toType =
"ENTREZID", : 2.75% of input gene IDs are fail to map...
Picking joint bandwidth of 0.16
Save Design and Contrasts (Provenance)
readr::write_tsv(as.data.frame(design) |> tibble::rownames_to_column("Sample"),
"design_matrix.tsv")
readr::write_tsv(as.data.frame(cont) |> tibble::rownames_to_column("Contrast"),
"contrast_matrix.tsv")Session Info
sessionInfo()R version 4.5.1 (2025-06-13 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26200)
Matrix products: default
LAPACK version 3.12.1
locale:
[1] LC_COLLATE=English_United States.utf8
[2] LC_CTYPE=English_United States.utf8
[3] LC_MONETARY=English_United States.utf8
[4] LC_NUMERIC=C
[5] LC_TIME=English_United States.utf8
time zone: America/Los_Angeles
tzcode source: internal
attached base packages:
[1] stats4 grid stats graphics grDevices datasets utils
[8] methods base
other attached packages:
[1] enrichplot_1.30.4 org.Mm.eg.db_3.22.0 AnnotationDbi_1.72.0
[4] IRanges_2.44.0 S4Vectors_0.48.0 Biobase_2.70.0
[7] BiocGenerics_0.56.0 generics_0.1.4 clusterProfiler_4.18.4
[10] tidyr_1.3.2 vegan_2.7-2 permute_0.9-8
[13] arrow_22.0.0.1 circlize_0.4.17 ComplexHeatmap_2.26.0
[16] ggrepel_0.9.6 ggplot2_4.0.1 dplyr_1.1.4
[19] tibble_3.3.0 readr_2.1.6 limpa_1.2.3
[22] limma_3.66.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 jsonlite_2.0.0 shape_1.4.6.1
[4] tidydr_0.0.6 magrittr_2.0.4 ggtangle_0.0.9
[7] farver_2.1.2 rmarkdown_2.30 GlobalOptions_0.1.3
[10] fs_1.6.6 vctrs_0.6.5 memoise_2.0.1
[13] ggtree_4.0.3 htmltools_0.5.9 gridGraphics_0.5-1
[16] htmlwidgets_1.6.4 plyr_1.8.9 cachem_1.1.0
[19] igraph_2.2.1 lifecycle_1.0.4 iterators_1.0.14
[22] pkgconfig_2.0.3 gson_0.1.0 Matrix_1.7-4
[25] R6_2.6.1 fastmap_1.2.0 clue_0.3-66
[28] digest_0.6.39 aplot_0.2.9 colorspace_2.1-2
[31] ggnewscale_0.5.2 patchwork_1.3.2 RSQLite_2.4.5
[34] labeling_0.4.3 polyclip_1.10-7 httr_1.4.7
[37] mgcv_1.9-4 compiler_4.5.1 bit64_4.6.0-1
[40] fontquiver_0.2.1 withr_3.0.2 doParallel_1.0.17
[43] S7_0.2.1 BiocParallel_1.44.0 DBI_1.2.3
[46] ggforce_0.5.0 R.utils_2.13.0 MASS_7.3-65
[49] rappdirs_0.3.3 rjson_0.2.23 tools_4.5.1
[52] scatterpie_0.2.6 ape_5.8-1 R.oo_1.27.1
[55] glue_1.8.0 nlme_3.1-168 GOSemSim_2.36.0
[58] cluster_2.1.8.1 reshape2_1.4.5 snow_0.4-4
[61] fgsea_1.36.0 gtable_0.3.6 tzdb_0.5.0
[64] R.methodsS3_1.8.2 data.table_1.18.0 hms_1.1.4
[67] utf8_1.2.6 XVector_0.50.0 foreach_1.5.2
[70] pillar_1.11.1 stringr_1.6.0 yulab.utils_0.2.3
[73] vroom_1.6.7 splines_4.5.1 tweenr_2.0.3
[76] treeio_1.34.0 lattice_0.22-7 renv_1.1.5
[79] bit_4.6.0 tidyselect_1.2.1 fontLiberation_0.1.0
[82] GO.db_3.22.0 Biostrings_2.78.0 knitr_1.51
[85] fontBitstreamVera_0.1.1 Seqinfo_1.0.0 xfun_0.55
[88] statmod_1.5.1 matrixStats_1.5.0 stringi_1.8.7
[91] lazyeval_0.2.2 ggfun_0.2.0 yaml_2.3.12
[94] evaluate_1.0.5 codetools_0.2-20 gdtools_0.4.4
[97] qvalue_2.42.0 BiocManager_1.30.27 ggplotify_0.1.3
[100] cli_3.6.5 systemfonts_1.3.1 Rcpp_1.1.0
[103] png_0.1-8 parallel_4.5.1 assertthat_0.2.1
[106] blob_1.2.4 DOSE_4.4.0 tidytree_0.4.6
[109] ggiraph_0.9.2 ggridges_0.5.7 scales_1.4.0
[112] purrr_1.2.0 crayon_1.5.3 GetoptLong_1.1.0
[115] rlang_1.1.6 cowplot_1.2.0 fastmatch_1.1-6
[118] KEGGREST_1.50.0