| Field | Detail |
|---|---|
| MetaboLights ID | MTBLS4461 |
| Technique | Direct-infusion MS (DI-MS), alternating polarity |
| Instrument | Thermo Exactive Orbitrap |
| Matrix | Human serum |
| Cohort | Pakistani control subjects from a case-control MI study |
| Biological factors | Gender, Age, Diabetes status |
Samples were infused directly into the mass spectrometer without prior chromatographic separation, enabling high-throughput profiling across a large epidemiological cohort. The absence of chromatography increases ion suppression, making normalisation a critical step in the analytical pipeline.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
cran_pkgs <- c(
"tidyverse", # dplyr, ggplot2, stringr, readr, tidyr
"janitor", # clean_names() — makes column names R-safe
"pheatmap", # clustered heatmaps
"ggrepel", # non-overlapping text labels on volcano plots
"patchwork", # stitch ggplot panels together
"RColorBrewer", # colour palettes
"knitr", # kable() for formatted tables
"kableExtra" # enhanced kable styling
)
bioc_pkgs <- c(
"limma", # eBayes differential abundance — the industry standard
"impute" # KNN imputation for missing values
)
install.packages(setdiff(cran_pkgs, rownames(installed.packages())),
quiet = TRUE)
BiocManager::install(setdiff(bioc_pkgs, rownames(installed.packages())),
ask = FALSE, quiet = TRUE)
suppressPackageStartupMessages(
lapply(c(cran_pkgs, bioc_pkgs), library, character.only = TRUE)
)#> [[1]]
#> [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
#> [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
#> [13] "grDevices" "utils" "datasets" "methods" "base"
#>
#> [[2]]
#> [1] "janitor" "lubridate" "forcats" "stringr" "dplyr" "purrr"
#> [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
#> [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
#>
#> [[3]]
#> [1] "pheatmap" "janitor" "lubridate" "forcats" "stringr" "dplyr"
#> [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
#> [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
#> [19] "base"
#>
#> [[4]]
#> [1] "ggrepel" "pheatmap" "janitor" "lubridate" "forcats" "stringr"
#> [7] "dplyr" "purrr" "readr" "tidyr" "tibble" "ggplot2"
#> [13] "tidyverse" "stats" "graphics" "grDevices" "utils" "datasets"
#> [19] "methods" "base"
#>
#> [[5]]
#> [1] "patchwork" "ggrepel" "pheatmap" "janitor" "lubridate" "forcats"
#> [7] "stringr" "dplyr" "purrr" "readr" "tidyr" "tibble"
#> [13] "ggplot2" "tidyverse" "stats" "graphics" "grDevices" "utils"
#> [19] "datasets" "methods" "base"
#>
#> [[6]]
#> [1] "RColorBrewer" "patchwork" "ggrepel" "pheatmap" "janitor"
#> [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
#> [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
#> [16] "stats" "graphics" "grDevices" "utils" "datasets"
#> [21] "methods" "base"
#>
#> [[7]]
#> [1] "knitr" "RColorBrewer" "patchwork" "ggrepel" "pheatmap"
#> [6] "janitor" "lubridate" "forcats" "stringr" "dplyr"
#> [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
#> [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
#> [21] "datasets" "methods" "base"
#>
#> [[8]]
#> [1] "kableExtra" "knitr" "RColorBrewer" "patchwork" "ggrepel"
#> [6] "pheatmap" "janitor" "lubridate" "forcats" "stringr"
#> [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
#> [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
#> [21] "utils" "datasets" "methods" "base"
#>
#> [[9]]
#> [1] "limma" "kableExtra" "knitr" "RColorBrewer" "patchwork"
#> [6] "ggrepel" "pheatmap" "janitor" "lubridate" "forcats"
#> [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
#> [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
#> [21] "grDevices" "utils" "datasets" "methods" "base"
#>
#> [[10]]
#> [1] "impute" "limma" "kableExtra" "knitr" "RColorBrewer"
#> [6] "patchwork" "ggrepel" "pheatmap" "janitor" "lubridate"
#> [11] "forcats" "stringr" "dplyr" "purrr" "readr"
#> [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
#> [21] "graphics" "grDevices" "utils" "datasets" "methods"
#> [26] "base"
Files are retrieved from the MetaboLights REST API. An FTP mirror is
available at
ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS4461/.
dir.create("data/MTBLS4461", recursive = TRUE, showWarnings = FALSE)
download_if_missing <- function(filename, dest_dir = "data/MTBLS4461") {
dest <- file.path(dest_dir, filename)
if (!file.exists(dest)) {
url <- paste0(
"https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS4461/download?file=",
URLencode(filename, reserved = TRUE)
)
message("Downloading: ", filename)
download.file(url, destfile = dest, mode = "wb", quiet = TRUE)
}
invisible(dest)
}
download_if_missing("s_MTBLS4461.txt")
download_if_missing("a_MTBLS4461_DI-MS_alternating__metabolite_profiling.txt")
download_if_missing("m_MTBLS4461_DI-MS_alternating__metabolite_profiling_v2_maf.tsv")
download_if_missing("m_MTBLS4461_DI-MS_positive__metabolite_profiling_v2_maf.tsv_SPLIT")
message("All files ready.")The s_ file is in ISA-Tab format. Ontology term columns
are dropped; three biological factors are retained: gender, age, and
diabetes status.
meta_raw <- read_tsv(
"data/MTBLS4461/s_MTBLS4461.txt",
show_col_types = FALSE
)
metadata <- meta_raw |>
select(
sample_name = `Sample Name`,
source_name = `Source Name`,
gender = `Factor Value[Gender]`,
age = `Factor Value[Age]`,
diabetes = `Factor Value[Diabetes]`
) |>
mutate(
age = as.numeric(age),
gender = factor(gender),
diabetes = na_if(diabetes, "") |> factor()
)
cat("Total samples:", nrow(metadata), "\n")#> Total samples: 5681
# Gender distribution
table(metadata$gender, useNA = "ifany") |>
as.data.frame() |>
setNames(c("Gender", "Count")) |>
kable(caption = "Gender distribution") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Gender | Count |
|---|---|
| Female | 1195 |
| Male | 4465 |
| NA | 21 |
# Diabetes distribution
table(metadata$diabetes, useNA = "ifany") |>
as.data.frame() |>
setNames(c("Diabetes", "Count")) |>
kable(caption = "Diabetes status") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Diabetes | Count |
|---|---|
| No | 2604 |
| Yes | 1609 |
| NA | 1468 |
# Age summary
summary(metadata$age) |>
t() |>
kable(caption = "Age distribution (years)",
digits = 1) |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | NA’s |
|---|---|---|---|---|---|---|
| 27 | 50 | 54 | 54.2 | 60 | 87 | 21 |
head(metadata, 8) |>
kable(caption = "First 8 rows of cleaned metadata") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)| sample_name | source_name | gender | age | diabetes |
|---|---|---|---|---|
| X1352 | Individual_1 | Male | 50 | No |
| LY3258 | Individual_2 | Male | 51 | NA |
| RC1732 | Individual_3 | Male | 50 | No |
| M580 | Individual_4 | Male | 45 | NA |
| FC1104 | Individual_5 | Male | 60 | No |
| B289 | Individual_6 | Male | 59 | NA |
| X1360 | Individual_7 | Female | 60 | No |
| X1641 | Individual_8 | Male | 70 | No |
The MAF (MetaboLights Annotation Format) file contains one row per lipid feature. The first 21 columns are annotation fields; sample abundance columns follow. Three spurious unnamed columns are removed on import.
read_maf <- function(path) {
maf <- read_tsv(path, show_col_types = FALSE)
# Drop unnamed junk columns (Unnamed: 21/22/23)
maf <- maf |> select(-starts_with("Unnamed"))
# Known annotation columns — metabolite_identification is intentionally
# excluded here; it is handled separately as the row identifier to avoid
# it appearing twice in select() calls.
annot_cols <- c(
"database_identifier", "chemical_formula", "smiles", "inchi",
"mass_to_charge", "fragmentation", "modifications", "charge",
"retention_time", "taxid", "species", "database", "database_version",
"reliability", "uri", "search_engine", "search_engine_score",
"smallmolecule_abundance_sub", "smallmolecule_abundance_stdev_sub",
"smallmolecule_abundance_std_error_sub"
)
# Which of those are actually present in this file
present_annot <- intersect(annot_cols, names(maf))
# Sample columns = everything that is not an annotation or the ID column
non_sample <- c("metabolite_identification", present_annot)
sample_cols <- setdiff(names(maf), non_sample)
# ── Build unique row IDs ────────────────────────────────────────────────
# Duplicates occur when the same lipid name appears at multiple m/z values
# (different adducts e.g. [M+H]+, [M+Na]+) or across both polarities.
# column_to_rownames() hard-errors on any duplicate, so we disambiguate
# by appending the rounded m/z; numeric suffix is the last-resort fallback.
n_dupes <- sum(duplicated(maf$metabolite_identification, incomparables = NA))
if (n_dupes > 0) {
message(n_dupes, " duplicate metabolite_identification entries detected — ",
"appending m/z to create unique row IDs.")
mz_vals <- round(as.numeric(maf$mass_to_charge), 4)
is_dup <- duplicated(maf$metabolite_identification) |
duplicated(maf$metabolite_identification, fromLast = TRUE)
row_ids <- ifelse(is_dup,
paste0(maf$metabolite_identification, " [", mz_vals, "]"),
maf$metabolite_identification)
# Numeric suffix if name+m/z is still not unique
if (anyDuplicated(row_ids)) {
row_ids <- ave(row_ids, row_ids, FUN = function(x) {
if (length(x) > 1) paste0(x, "_", seq_along(x)) else x
})
}
} else {
row_ids <- maf$metabolite_identification
}
# ── Assemble outputs ────────────────────────────────────────────────────
annot_out <- maf |>
select(all_of(present_annot)) |>
mutate(metabolite_identification = row_ids, .before = 1)
abund_out <- maf |>
select(all_of(sample_cols)) |>
mutate(metabolite_identification = row_ids) |>
column_to_rownames("metabolite_identification") |>
mutate(across(everything(), as.numeric)) |>
as.matrix()
list(annotation = annot_out, abundance = abund_out)
}
maf_alt <- read_maf(
"data/MTBLS4461/m_MTBLS4461_DI-MS_alternating__metabolite_profiling_v2_maf.tsv"
)
annot_df <- maf_alt$annotation
abundance_mat <- maf_alt$abundance
cat("Lipid features:", nrow(abundance_mat), "\n")#> Lipid features: 497
#> Samples in MAF: 5662
# Match samples between MAF and metadata
shared_samples <- intersect(colnames(abundance_mat), metadata$sample_name)
cat("Samples matched between MAF and metadata:", length(shared_samples), "\n")#> Samples matched between MAF and metadata: 5659
if (length(shared_samples) == 0) {
stop(paste(
"No matching sample IDs found between MAF and metadata.",
"Check for whitespace or formatting differences with trimws()."
))
}
# Subset to shared samples
abundance_mat <- abundance_mat[, shared_samples]
metadata <- filter(metadata, sample_name %in% shared_samples) |>
arrange(match(sample_name, shared_samples))# Preview the annotation columns
annot_df |>
select(metabolite_identification, mass_to_charge, database_identifier) |>
head(10) |>
kable(caption = "First 10 lipid features (annotation preview)") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)| metabolite_identification | mass_to_charge | database_identifier |
|---|---|---|
| cholesterol | 369.3514 | CHEBI:16113 |
| NA [637.3359] | 637.3359 | NA |
| lysophosphatidylcholine 15:0 | 482.3247 | CHEBI:72736 |
| lysophosphatidylethanolamine 18:0 | 482.3247 | CHEBI:64576 |
| lysophosphatidylcholine 16:1 | 494.3245 | CHEBI:64560 |
| lysophosphatidylethanolamine 19:1 | 494.3245 | CHEBI:138537 |
| lysophosphatidylcholine 16:0 | 496.3404 | CHEBI:64563 |
| lysophosphatidylethanolamine 19:0 | 496.3404 | CHEBI:138533 |
| lysophosphatidylcholine 17:0 | 510.3560 | CHEBI:72737 |
| lysophosphatidylethanolamine 20:0 | 510.3560 | CHEBI:132556 |
Missing values arise when a lipid signal falls below the instrument detection threshold. The distributions across features and samples are assessed before filtering.
miss_feat <- rowMeans(is.na(abundance_mat)) * 100
miss_sample <- colMeans(is.na(abundance_mat)) * 100
p_miss_feat <- ggplot(data.frame(pct = miss_feat), aes(x = pct)) +
geom_histogram(bins = 40, fill = "#2C7BB6", colour = "white") +
labs(title = "Missing values per lipid feature",
x = "% Missing", y = "Feature count") +
theme_bw(base_size = 12)
p_miss_samp <- ggplot(
data.frame(sample = names(miss_sample), pct = miss_sample),
aes(x = reorder(sample, pct), y = pct)
) +
geom_bar(stat = "identity", fill = "#D7191C") +
labs(title = "Missing values per sample",
x = "Sample (sorted by % missing)", y = "% Missing") +
theme_bw(base_size = 12) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p_miss_feat + p_miss_sampFeatures missing in more than 30% of samples are removed.
abundance_filt <- abundance_mat[miss_feat < 30, ]
cat(
"Features before filter:", nrow(abundance_mat), "\n",
"Features after filter: ", nrow(abundance_filt), "\n",
"Features removed: ", nrow(abundance_mat) - nrow(abundance_filt), "\n"
)#> Features before filter: 497
#> Features after filter: 491
#> Features removed: 6
bad_samples <- names(miss_sample[miss_sample > 50])
if (length(bad_samples) > 0) {
cat("⚠️ Samples with >50% missing — consider exclusion:\n")
cat(paste(bad_samples, collapse = ", "), "\n")
} else {
cat("✅ No samples exceed 50% missing threshold.\n")
}#> ⚠️ Samples with >50% missing — consider exclusion:
#> E171, FC1010, FC1074, H1632, H1858, H954, H965, LY1083, LY2283, LY2587, LY2632, MY1199, RC1414, RC1428, RC1824, T1029, T235, T246, X1082, X1343, X1549, X1941, B213, E1680, E564, FC1032, FC1200, H1353, H1539, H1828, LY1109, LY1982, LY2330, LY2804, LY2974, LY3060, LY3203, M211, M426, MY1139, MY1569, MY1966, P280, RC1248, RC1715, RC1719, RC2064, RC2090, RC2095, RC54, X1224, X1240, X1314, X1483, FC1021, MY1632, RC176, FC1223, MY1234, MY1024, RC2088, X1121, B334, E1318, H869, LY1765, LY2169, LY3214, MY1089, RC1772, RC219, RC80, X1510, X1567, X1600, E1384, FC1286, E269, MY1381, B274, J31, MY1147, X1522, AY1052, X1079, X1971, E1568, FC1070, H764, LY3037, LY3100, MY1626, P596, RC1528, RC1563, X1588, X1665, E133, H1411, X1284, X1579, LY3098, LY3130, RC1814, H1214, LY1855, LY2516, LY3174, X1137, LY3199, RC1411, AY1036, B303, FC1260, H1408, H1658, H1775, LY1101, LY1200, LY2185, LY2534, LY3058, LY3187, LY3213, M26, MY1498, P601, RC1561, T206, X1666, E1053, MY1506, P118, E517, FC1239, LY2980, RC90, E714, FC1008, LY1341, LY3069, M95, MY1039, MY1146, X1045, X1969, X1978, FC1025, LY2784, LY2836, RC1442, RC1710, T413, X1146, X1795, H242, E198, B139, H1464, LY3226, RC1278, X1646, RC1808, T438, X1509, B357, X1363, RC1226, X1571, P608, P430, H1706, LY2249, X1548, DY1112, E1646, B127, LY2711, H1600, B113, LY2698, H1382, RC83, B205, LY1188, LY2710, T422, E27, LY1239, T147, T780, E29, E969, FC1164, J5, T408, LY1456, E1122, E53, P633, T1063, T647, H362, E1642, H1240, H1377, LY2879
Total ion signal (TIS) variation across samples reflects global intensity differences that normalisation must account for.
tis_df <- data.frame(
sample = colnames(abundance_filt),
tis = colSums(abundance_filt, na.rm = TRUE)
) |>
left_join(select(metadata, sample_name, gender, diabetes),
by = c("sample" = "sample_name"))
ggplot(tis_df, aes(x = reorder(sample, tis), y = tis / 1e6, fill = gender)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Set1", na.value = "grey60") +
labs(
title = "Total ion signal per sample (raw, pre-normalisation)",
x = "Sample (sorted by TIS)",
y = "Sum intensity (×10⁶)",
fill = "Gender"
) +
theme_bw(base_size = 12) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())Probabilistic Quotient Normalisation (PQN) is applied. A reference spectrum is constructed from the per-feature median across all samples; each sample’s normalisation factor is then the median of its quotients against that reference. This approach is robust when only a subset of lipids differ between samples, which TIC normalisation does not account for (Dieterle et al., 2006).
pqn_normalise <- function(mat) {
ref <- apply(mat, 1, median, na.rm = TRUE)
quotients <- mat / ref
nf <- apply(quotients, 2, median, na.rm = TRUE)
cat("PQN normalisation factors:\n")
cat(" Mean:", round(mean(nf), 4),
"| SD:", round(sd(nf), 4),
"| Range:", round(range(nf), 4), "\n")
sweep(mat, 2, nf, "/")
}
abundance_pqn <- pqn_normalise(abundance_filt)#> PQN normalisation factors:
#> Mean: NA | SD: NA | Range: NA NA
Log₂ transformation stabilises variance across the intensity range and symmetrises the distribution prior to linear modelling.
Remaining missing values are filled by KNN imputation
(k = 5), which borrows from the five most similar features.
A secondary 50% missingness filter is applied immediately before
imputation: PQN can introduce new NAs where a feature’s reference median
is zero, and subsetting to shared samples can shift per-feature
missingness proportions beyond the threshold applied in section 3.
# ── Final missingness filter ─────────────────────────────────────────────
miss_feat_pre <- rowMeans(is.na(abundance_log))
miss_samp_pre <- colMeans(is.na(abundance_log))
n_feat_drop <- sum(miss_feat_pre >= 0.5)
n_samp_drop <- sum(miss_samp_pre >= 0.5)
if (n_feat_drop > 0)
message(n_feat_drop, " features with >= 50% missing removed before imputation.")
if (n_samp_drop > 0)
message(n_samp_drop, " samples with >= 50% missing removed before imputation.")
abundance_log_clean <- abundance_log[miss_feat_pre < 0.5, miss_samp_pre < 0.5]
cat("Matrix entering imputation:",
nrow(abundance_log_clean), "features x",
ncol(abundance_log_clean), "samples
")#> Matrix entering imputation: 487 features x 5418 samples
# ── KNN imputation ────────────────────────────────────────────────────────
if (requireNamespace("impute", quietly = TRUE)) {
library(impute)
mat <- impute.knn(abundance_log_clean, k = 5)$data
cat("KNN imputation complete (k = 5).
")
} else {
mat <- abundance_log_clean
for (i in seq_len(nrow(mat))) {
row_med <- median(mat[i, ], na.rm = TRUE)
mat[i, is.na(mat[i, ])] <- row_med
}
cat("Median imputation used (install Bioc 'impute' for KNN).
")
}#> KNN imputation complete (k = 5).
stopifnot("NAs remain after imputation" = sum(is.na(mat)) == 0)
cat("Final matrix:", nrow(mat), "features x", ncol(mat), "samples
")#> Final matrix: 487 features x 5418 samples
# Realign metadata to the (possibly reduced) sample set after filters
metadata <- filter(metadata, sample_name %in% colnames(mat)) |>
arrange(match(sample_name, colnames(mat)))This MAF uses full English lipid names rather than LIPID MAPS
abbreviations. Names are parsed and mapped to standard short codes.
Total carbon count and degree of unsaturation are extracted from the
chain notation (e.g. 34:1 → 34 carbons, 1 double bond).
# The MAF file uses full English names (e.g. "phosphatidylcholine 34:1")
# rather than LIPID MAPS abbreviations. We map the first word(s) of each
# name to a standard short code. The lookup covers every class seen in
# this dataset's table(lipid_class) output.
class_lookup <- c(
"phosphatidylcholine" = "PC",
"lysophosphatidylcholine" = "LPC",
"phosphatidylethanolamine" = "PE",
"lysophosphatidylethanolamine" = "LPE",
"phosphatidylserine" = "PS",
"phosphatidylinositol" = "PI",
"phosphatidylinositole" = "PI", # alternate spelling in MAF
"phosphatidylglycerol" = "PG",
"phosphatidylglicerate" = "PG", # typo variant in MAF
"phosphatidic" = "PA", # "phosphatidic acid"
"Phosphatidic" = "PA",
"phosphatidynserine" = "PS", # typo variant
"sphingomyelin" = "SM",
"triacylglycerol" = "TG",
"diacylglycerol" = "DG",
"cholesteryl" = "CE",
"cholesterol" = "Chol",
"Ceramide" = "Cer",
"CEoxid" = "oxCE",
"CE" = "CE",
"Cer" = "Cer",
"DG" = "DG",
"PG" = "PG",
"fatty" = "FA", # "fatty acid"
"docosapentaenoate" = "FA",
"docosatetraenoate" = "FA",
"octadecadienoate" = "FA",
"octadecatrienoate" = "FA",
"octadecenoate" = "FA",
"hexadecenoate" = "FA",
"icosatetraenoate" = "FA",
"tetracosenoate" = "FA",
"tricosanoate" = "FA",
"Tritridecanoin" = "TG",
"all" = "other",
"NA" = NA_character_
)
lipid_annot <- annot_df |>
filter(metabolite_identification %in% rownames(mat)) |>
transmute(
lipid = metabolite_identification,
# Extract the first word to use as the lookup key
first_word = str_extract(lipid, "^[^\\s\\[]+"),
lipid_class = class_lookup[first_word],
# Fatty acid chain info: total carbons and double bonds
chain_info = str_extract(lipid, "\\d+:\\d+"),
carbons = as.integer(str_extract(chain_info, "^\\d+")),
double_bonds = as.integer(str_extract(chain_info, "(?<=:)\\d+"))
) |>
select(-first_word)
# Summary
class_counts <- count(lipid_annot, lipid_class, sort = TRUE)
cat("Lipid classes after standardisation:\n")#> Lipid classes after standardisation:
#> # A tibble: 15 × 2
#> lipid_class n
#> <chr> <int>
#> 1 <NA> 128
#> 2 PC 89
#> 3 TG 53
#> 4 SM 51
#> 5 PS 31
#> 6 PE 29
#> 7 PI 21
#> 8 FA 19
#> 9 DG 18
#> 10 PA 15
#> 11 PG 11
#> 12 CE 9
#> 13 LPC 8
#> 14 LPE 4
#> 15 Chol 1
ggplot(class_counts,
aes(x = reorder(lipid_class, n), y = n, fill = lipid_class)) +
geom_col() +
coord_flip() +
labs(title = "Lipid species detected per class",
x = NULL, y = "Number of features") +
theme_bw(base_size = 12) +
theme(legend.position = "none")class_counts |>
mutate(pct = round(n / sum(n) * 100, 1)) |>
kable(col.names = c("Lipid class", "Features", "% of total"),
caption = "Detected lipid classes") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Lipid class | Features | % of total |
|---|---|---|
| NA | 128 | 26.3 |
| PC | 89 | 18.3 |
| TG | 53 | 10.9 |
| SM | 51 | 10.5 |
| PS | 31 | 6.4 |
| PE | 29 | 6.0 |
| PI | 21 | 4.3 |
| FA | 19 | 3.9 |
| DG | 18 | 3.7 |
| PA | 15 | 3.1 |
| PG | 11 | 2.3 |
| CE | 9 | 1.8 |
| LPC | 8 | 1.6 |
| LPE | 4 | 0.8 |
| Chol | 1 | 0.2 |
class_abundance <- as.data.frame(mat) |>
rownames_to_column("lipid") |>
left_join(select(lipid_annot, lipid, lipid_class), by = "lipid") |>
group_by(lipid_class) |>
summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)),
.groups = "drop") |>
mutate(mean_across_samples = rowMeans(across(where(is.numeric)))) |>
arrange(desc(mean_across_samples))
ggplot(class_abundance,
aes(x = reorder(lipid_class, mean_across_samples),
y = mean_across_samples, fill = lipid_class)) +
geom_col() +
coord_flip() +
labs(title = "Mean summed log₂ abundance by lipid class",
x = NULL, y = "Mean log₂ (sum across samples)") +
theme_bw(base_size = 12) +
theme(legend.position = "none")PCA, correlation structure, and feature variance are examined across all samples prior to differential testing.
pca_res <- prcomp(t(mat), center = TRUE, scale. = TRUE)
var_exp <- round(summary(pca_res)$importance[2, 1:10] * 100, 1)
pca_df <- as.data.frame(pca_res$x[, 1:5]) |>
rownames_to_column("sample_name") |>
left_join(metadata, by = "sample_name")ggplot(pca_df, aes(x = PC1, y = PC2, colour = diabetes, shape = gender)) +
geom_point(size = 3, alpha = 0.85) +
scale_colour_brewer(palette = "Set1", na.value = "grey60") +
labs(
title = "PCA — log₂ PQN-normalised lipidome",
x = paste0("PC1 (", var_exp[1], "%)"),
y = paste0("PC2 (", var_exp[2], "%)"),
colour = "Diabetes",
shape = "Gender"
) +
theme_bw(base_size = 12)ggplot(pca_df, aes(x = PC1, y = PC2, colour = age)) +
geom_point(size = 3, alpha = 0.85) +
scale_colour_viridis_c(option = "plasma") +
labs(
title = "PCA — coloured by age",
x = paste0("PC1 (", var_exp[1], "%)"),
y = paste0("PC2 (", var_exp[2], "%)"),
colour = "Age (yr)"
) +
theme_bw(base_size = 12)data.frame(PC = paste0("PC", 1:10), var = var_exp) |>
ggplot(aes(x = factor(PC, levels = paste0("PC", 1:10)), y = var)) +
geom_col(fill = "#4DAC26") +
geom_line(aes(group = 1)) +
geom_point(size = 2) +
labs(title = "Scree plot — variance explained per PC",
x = NULL, y = "% Variance explained") +
theme_bw(base_size = 12)Samples clustering by biological group rather than technical variation confirm adequate normalisation. Low-correlation outliers indicate candidates for further QC review.
cor_mat <- cor(mat, use = "pairwise.complete.obs")
col_annot <- metadata |>
select(sample_name, diabetes, gender) |>
column_to_rownames("sample_name") |>
mutate(across(everything(), as.character))
pheatmap(
cor_mat,
color = colorRampPalette(c("#4575B4", "white", "#D73027"))(100),
clustering_method = "ward.D2",
annotation_col = col_annot,
show_rownames = FALSE,
show_colnames = FALSE,
main = "Sample–sample Pearson correlation (PQN log₂)"
)top100_idx <- order(apply(mat, 1, var, na.rm = TRUE), decreasing = TRUE)[1:100]
pheatmap(
mat[top100_idx, ],
scale = "row",
color = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100),
clustering_method = "ward.D2",
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
annotation_col = col_annot,
show_rownames = FALSE,
show_colnames = FALSE,
main = "Top 100 most variable lipid species (row Z-score)"
)Differential abundance is tested using limma’s empirical Bayes framework (Ritchie et al., 2015). Variance estimates are moderated across all features, which improves power at moderate sample sizes. Multiple testing is controlled by Benjamini-Hochberg FDR correction.
These samples are the control arm of an MI case-control study. Diabetes status reflects self-reported comorbidity within the control cohort.
meta_da <- filter(metadata, !is.na(diabetes))
mat_da <- mat[, meta_da$sample_name]
group <- factor(meta_da$diabetes)
design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)
fit <- lmFit(mat_da, design)
fit2 <- contrasts.fit(
fit,
makeContrasts(DiabetesYesVsNo = Yes - No, levels = design)
) |> eBayes()
da_results <- topTable(fit2, coef = "DiabetesYesVsNo",
number = Inf, adjust.method = "BH") |>
rownames_to_column("lipid") |>
left_join(lipid_annot, by = "lipid") |>
arrange(adj.P.Val) |>
mutate(
sig = adj.P.Val < 0.05 & abs(logFC) > 0.5,
direction = case_when(
sig & logFC > 0.5 ~ "Up in Diabetes",
sig & logFC < -0.5 ~ "Down in Diabetes",
TRUE ~ "NS"
)
)
cat("Significant lipids (FDR < 5%, |logFC| > 0.5):",
sum(da_results$sig, na.rm = TRUE), "\n")#> Significant lipids (FDR < 5%, |logFC| > 0.5): 0
da_results |>
filter(sig) |>
select(lipid, lipid_class, logFC, AveExpr, t, adj.P.Val) |>
head(20) |>
mutate(across(where(is.numeric), \(x) round(x, 3))) |>
kable(caption = "Top 20 diabetes-associated lipids (FDR < 5%)") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)| lipid | lipid_class | logFC | AveExpr | t | adj.P.Val |
|---|---|---|---|---|---|
| NA | NA | NA | NA | NA | NA |
| :—– | :———– | —–: | ——-: | –: | ———: |
ggplot(da_results, aes(x = logFC, y = -log10(P.Value), colour = direction)) +
geom_point(alpha = 0.6, size = 1.8) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", colour = "grey50") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", colour = "grey50") +
geom_text_repel(
data = filter(da_results, sig),
aes(label = lipid), size = 2.5, max.overlaps = 20
) +
scale_colour_manual(values = c(
"Up in Diabetes" = "#D73027",
"Down in Diabetes" = "#4575B4",
"NS" = "grey70"
)) +
labs(
title = "Volcano plot: Diabetes Yes vs No (limma eBayes)",
x = "log₂ Fold Change (Yes − No)",
y = "−log₁₀(p-value)",
colour = NULL
) +
theme_bw(base_size = 12)meta_gd <- filter(metadata, !is.na(gender))
mat_gd <- mat[, meta_gd$sample_name]
grp_gd <- factor(meta_gd$gender)
des_gd <- model.matrix(~ 0 + grp_gd)
colnames(des_gd) <- levels(grp_gd)
fit_gd <- lmFit(mat_gd, des_gd) |>
contrasts.fit(makeContrasts(MaleVsFemale = Male - Female, levels = des_gd)) |>
eBayes()
gender_results <- topTable(fit_gd, coef = "MaleVsFemale",
number = Inf, adjust.method = "BH") |>
rownames_to_column("lipid") |>
left_join(lipid_annot, by = "lipid") |>
arrange(adj.P.Val) |>
mutate(
sig = adj.P.Val < 0.05 & abs(logFC) > 0.5,
direction = case_when(
sig & logFC > 0.5 ~ "Up in Male",
sig & logFC < -0.5 ~ "Up in Female",
TRUE ~ "NS"
)
)
cat("Significant lipids (FDR < 5%, |logFC| > 0.5):",
sum(gender_results$sig, na.rm = TRUE), "\n")#> Significant lipids (FDR < 5%, |logFC| > 0.5): 0
ggplot(gender_results, aes(x = logFC, y = -log10(P.Value), colour = direction)) +
geom_point(alpha = 0.6, size = 1.8) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", colour = "grey50") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", colour = "grey50") +
geom_text_repel(data = filter(gender_results, sig),
aes(label = lipid), size = 2.5, max.overlaps = 20) +
scale_colour_manual(values = c("Up in Male" = "#4575B4",
"Up in Female" = "#D73027",
"NS" = "grey70")) +
labs(title = "Volcano plot: Male vs Female (limma eBayes)",
x = "log₂ Fold Change (Male − Female)",
y = "−log₁₀(p-value)", colour = NULL) +
theme_bw(base_size = 12)# Use the top classes actually present in this dataset
top_classes <- class_counts |>
filter(!is.na(lipid_class), n >= 5) |>
slice_max(n, n = 6) |>
pull(lipid_class)
lipid_annot |>
filter(!is.na(carbons), lipid_class %in% top_classes) |>
ggplot(aes(x = factor(carbons), fill = lipid_class)) +
geom_bar() +
facet_wrap(~ lipid_class, scales = "free_y", ncol = 3) +
labs(title = "Total fatty acid carbon chain length per class",
x = "Total carbons", y = "# species") +
theme_bw(base_size = 11) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))Double bond count reflects the degree of polyunsaturation. PUFA composition influences membrane fluidity and is linked to eicosanoid-mediated inflammatory signalling.
lipid_annot |>
filter(!is.na(double_bonds), lipid_class %in% top_classes) |>
ggplot(aes(x = factor(double_bonds), fill = lipid_class)) +
geom_bar() +
facet_wrap(~ lipid_class, scales = "free_y", ncol = 3) +
labs(title = "Degree of unsaturation (double bonds) per class",
x = "Double bonds", y = "# species") +
theme_bw(base_size = 11) +
theme(legend.position = "none")class_counts |>
mutate(pct = n / sum(n) * 100) |>
ggplot(aes(x = reorder(lipid_class, pct), y = pct, fill = lipid_class)) +
geom_col() +
coord_flip() +
labs(title = "Lipid class composition (% of detected features)",
x = NULL, y = "% of features") +
theme_bw(base_size = 12) +
theme(legend.position = "none")top_da <- filter(da_results, sig) |>
slice_min(adj.P.Val, n = 20) |>
pull(lipid)
if (length(top_da) > 0) {
as.data.frame(mat[top_da, , drop = FALSE]) |>
rownames_to_column("lipid") |>
pivot_longer(-lipid, names_to = "sample_name", values_to = "log2_abund") |>
left_join(select(metadata, sample_name, diabetes), by = "sample_name") |>
filter(!is.na(diabetes)) |>
ggplot(aes(x = diabetes, y = log2_abund, fill = diabetes)) +
geom_boxplot(outlier.size = 0.8, alpha = 0.85) +
facet_wrap(~ lipid, scales = "free_y", ncol = 5) +
scale_fill_brewer(palette = "Set1") +
labs(title = "Top diabetes-associated lipids (FDR-ranked)",
x = "Diabetes status", y = "log₂ abundance",
fill = "Diabetes") +
theme_bw(base_size = 10) +
theme(legend.position = "bottom")
} else {
cat("No significant diabetes-associated lipids at the chosen thresholds.\n")
}#> No significant diabetes-associated lipids at the chosen thresholds.
dir.create("results", showWarnings = FALSE)
write_csv(as.data.frame(mat) |> rownames_to_column("lipid"),
"results/abundance_pqn_log2_knn_imputed.csv")
write_csv(lipid_annot,
"results/lipid_annotation_parsed.csv")
write_csv(da_results,
"results/DA_diabetes_YesVsNo.csv")
write_csv(gender_results,
"results/DA_gender_MaleVsFemale.csv")
write_csv(metadata,
"results/metadata_clean.csv")
cat("Results written to /results/\n")#> Results written to /results/
#> R version 4.5.3 (2026-03-11)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Sequoia 15.7.5
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> time zone: Europe/Stockholm
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] impute_1.84.0 limma_3.66.0 kableExtra_1.4.0 knitr_1.51
#> [5] RColorBrewer_1.1-3 patchwork_1.3.2 ggrepel_0.9.8 pheatmap_1.0.13
#> [9] janitor_2.2.1 lubridate_1.9.5 forcats_1.0.1 stringr_1.6.0
#> [13] dplyr_1.2.1 purrr_1.2.2 readr_2.2.0 tidyr_1.3.2
#> [17] tibble_3.3.1 ggplot2_4.0.3 tidyverse_2.0.0
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.6 xfun_0.57 bslib_0.10.0
#> [4] tzdb_0.5.0 vctrs_0.7.3 tools_4.5.3
#> [7] generics_0.1.4 parallel_4.5.3 pkgconfig_2.0.3
#> [10] S7_0.2.2 lifecycle_1.0.5 compiler_4.5.3
#> [13] farver_2.1.2 textshaping_1.0.5 statmod_1.5.1
#> [16] snakecase_0.11.1 htmltools_0.5.9 sass_0.4.10
#> [19] yaml_2.3.12 pillar_1.11.1 crayon_1.5.3
#> [22] jquerylib_0.1.4 cachem_1.1.0 tidyselect_1.2.1
#> [25] digest_0.6.39 stringi_1.8.7 labeling_0.4.3
#> [28] fastmap_1.2.0 grid_4.5.3 cli_3.6.6
#> [31] magrittr_2.0.5 dichromat_2.0-0.1 utf8_1.2.6
#> [34] withr_3.0.2 scales_1.4.0 bit64_4.8.0
#> [37] timechange_0.4.0 rmarkdown_2.31 bit_4.6.0
#> [40] otel_0.2.0 hms_1.1.4 evaluate_1.0.5
#> [43] viridisLite_0.4.3 rlang_1.2.0 Rcpp_1.1.1-1.1
#> [46] glue_1.8.1 BiocManager_1.30.27 xml2_1.5.2
#> [49] svglite_2.2.2 rstudioapi_0.18.0 vroom_1.7.1
#> [52] jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.2
| Citation | |
|---|---|
| Dataset | Castro C, Harshfield EL, Butterworth AS, Wood AM, Koulman A, Griffin JL. (2024). A lipidomic dataset for epidemiological studies of acute myocardial infarction. Data in Brief 57:110925 |
| Study | Harshfield EL, Koulman A, Ziemek D, Marney L, Fauman EB, Paul DS, et al. (2019). An unbiased lipid phenotyping approach to study the genetic determinants of lipids and their association with coronary heart disease risk factors. J. Proteome Res. 18(6):2397–2410 |
| limma | Ritchie et al. (2015) Nucleic Acids Res. doi:10.1093/nar/gkv007 |
| limma eBayes | Smyth (2004) Stat. Appl. Genet. Mol. Biol. |
| impute | Troyanskaya et al. (2001) Bioinformatics 17:520 |
| PQN | Dieterle et al. (2006) Anal. Chem. 78:4281 |
| LIPID MAPS shorthand | Liebisch et al. (2020) J. Lipid Res. 61:1650 |
| tidyverse | Wickham et al. (2019) JOSS doi:10.21105/joss.01686 |
| pheatmap | Kolde R (2019) CRAN |