Study Overview

Field Detail
MetaboLights ID MTBLS4461
Technique Direct-infusion MS (DI-MS), alternating polarity
Instrument Thermo Exactive Orbitrap
Matrix Human serum
Cohort Pakistani control subjects from a case-control MI study
Biological factors Gender, Age, Diabetes status

Samples were infused directly into the mass spectrometer without prior chromatographic separation, enabling high-throughput profiling across a large epidemiological cohort. The absence of chromatography increases ion suppression, making normalisation a critical step in the analytical pipeline.


0. Packages

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

cran_pkgs <- c(
  "tidyverse",    # dplyr, ggplot2, stringr, readr, tidyr
  "janitor",      # clean_names() — makes column names R-safe
  "pheatmap",     # clustered heatmaps
  "ggrepel",      # non-overlapping text labels on volcano plots
  "patchwork",    # stitch ggplot panels together
  "RColorBrewer", # colour palettes
  "knitr",        # kable() for formatted tables
  "kableExtra"    # enhanced kable styling
)

bioc_pkgs <- c(
  "limma",   # eBayes differential abundance — the industry standard
  "impute"   # KNN imputation for missing values
)

install.packages(setdiff(cran_pkgs, rownames(installed.packages())),
                 quiet = TRUE)
BiocManager::install(setdiff(bioc_pkgs, rownames(installed.packages())),
                     ask = FALSE, quiet = TRUE)

suppressPackageStartupMessages(
  lapply(c(cran_pkgs, bioc_pkgs), library, character.only = TRUE)
)
#> [[1]]
#>  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
#>  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
#> [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
#> 
#> [[2]]
#>  [1] "janitor"   "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
#>  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
#> [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
#> 
#> [[3]]
#>  [1] "pheatmap"  "janitor"   "lubridate" "forcats"   "stringr"   "dplyr"    
#>  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
#> [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
#> [19] "base"     
#> 
#> [[4]]
#>  [1] "ggrepel"   "pheatmap"  "janitor"   "lubridate" "forcats"   "stringr"  
#>  [7] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
#> [13] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
#> [19] "methods"   "base"     
#> 
#> [[5]]
#>  [1] "patchwork" "ggrepel"   "pheatmap"  "janitor"   "lubridate" "forcats"  
#>  [7] "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"   
#> [13] "ggplot2"   "tidyverse" "stats"     "graphics"  "grDevices" "utils"    
#> [19] "datasets"  "methods"   "base"     
#> 
#> [[6]]
#>  [1] "RColorBrewer" "patchwork"    "ggrepel"      "pheatmap"     "janitor"     
#>  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
#> [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
#> [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
#> [21] "methods"      "base"        
#> 
#> [[7]]
#>  [1] "knitr"        "RColorBrewer" "patchwork"    "ggrepel"      "pheatmap"    
#>  [6] "janitor"      "lubridate"    "forcats"      "stringr"      "dplyr"       
#> [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
#> [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
#> [21] "datasets"     "methods"      "base"        
#> 
#> [[8]]
#>  [1] "kableExtra"   "knitr"        "RColorBrewer" "patchwork"    "ggrepel"     
#>  [6] "pheatmap"     "janitor"      "lubridate"    "forcats"      "stringr"     
#> [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
#> [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
#> [21] "utils"        "datasets"     "methods"      "base"        
#> 
#> [[9]]
#>  [1] "limma"        "kableExtra"   "knitr"        "RColorBrewer" "patchwork"   
#>  [6] "ggrepel"      "pheatmap"     "janitor"      "lubridate"    "forcats"     
#> [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
#> [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
#> [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
#> 
#> [[10]]
#>  [1] "impute"       "limma"        "kableExtra"   "knitr"        "RColorBrewer"
#>  [6] "patchwork"    "ggrepel"      "pheatmap"     "janitor"      "lubridate"   
#> [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
#> [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
#> [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
#> [26] "base"

1. Data Acquisition

Files are retrieved from the MetaboLights REST API. An FTP mirror is available at ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS4461/.

dir.create("data/MTBLS4461", recursive = TRUE, showWarnings = FALSE)

download_if_missing <- function(filename, dest_dir = "data/MTBLS4461") {
  dest <- file.path(dest_dir, filename)
  if (!file.exists(dest)) {
    url <- paste0(
      "https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS4461/download?file=",
      URLencode(filename, reserved = TRUE)
    )
    message("Downloading: ", filename)
    download.file(url, destfile = dest, mode = "wb", quiet = TRUE)
  }
  invisible(dest)
}

download_if_missing("s_MTBLS4461.txt")
download_if_missing("a_MTBLS4461_DI-MS_alternating__metabolite_profiling.txt")
download_if_missing("m_MTBLS4461_DI-MS_alternating__metabolite_profiling_v2_maf.tsv")
download_if_missing("m_MTBLS4461_DI-MS_positive__metabolite_profiling_v2_maf.tsv_SPLIT")

message("All files ready.")

2. Data Import & Inspection

2a. Sample Metadata

The s_ file is in ISA-Tab format. Ontology term columns are dropped; three biological factors are retained: gender, age, and diabetes status.

meta_raw <- read_tsv(
  "data/MTBLS4461/s_MTBLS4461.txt",
  show_col_types = FALSE
)

metadata <- meta_raw |>
  select(
    sample_name = `Sample Name`,
    source_name = `Source Name`,
    gender      = `Factor Value[Gender]`,
    age         = `Factor Value[Age]`,
    diabetes    = `Factor Value[Diabetes]`
  ) |>
  mutate(
    age      = as.numeric(age),
    gender   = factor(gender),
    diabetes = na_if(diabetes, "") |> factor()
  )

cat("Total samples:", nrow(metadata), "\n")
#> Total samples: 5681
# Gender distribution
table(metadata$gender, useNA = "ifany") |>
  as.data.frame() |>
  setNames(c("Gender", "Count")) |>
  kable(caption = "Gender distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Gender distribution
Gender Count
Female 1195
Male 4465
NA 21
# Diabetes distribution
table(metadata$diabetes, useNA = "ifany") |>
  as.data.frame() |>
  setNames(c("Diabetes", "Count")) |>
  kable(caption = "Diabetes status") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Diabetes status
Diabetes Count
No 2604
Yes 1609
NA 1468
# Age summary
summary(metadata$age) |>
  t() |>
  kable(caption = "Age distribution (years)",
        digits = 1) |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Age distribution (years)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA’s
27 50 54 54.2 60 87 21
head(metadata, 8) |>
  kable(caption = "First 8 rows of cleaned metadata") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
First 8 rows of cleaned metadata
sample_name source_name gender age diabetes
X1352 Individual_1 Male 50 No
LY3258 Individual_2 Male 51 NA
RC1732 Individual_3 Male 50 No
M580 Individual_4 Male 45 NA
FC1104 Individual_5 Male 60 No
B289 Individual_6 Male 59 NA
X1360 Individual_7 Female 60 No
X1641 Individual_8 Male 70 No

2b. MAF Data Matrix

The MAF (MetaboLights Annotation Format) file contains one row per lipid feature. The first 21 columns are annotation fields; sample abundance columns follow. Three spurious unnamed columns are removed on import.

read_maf <- function(path) {
  maf <- read_tsv(path, show_col_types = FALSE)

  # Drop unnamed junk columns (Unnamed: 21/22/23)
  maf <- maf |> select(-starts_with("Unnamed"))

  # Known annotation columns — metabolite_identification is intentionally
  # excluded here; it is handled separately as the row identifier to avoid
  # it appearing twice in select() calls.
  annot_cols <- c(
    "database_identifier", "chemical_formula", "smiles", "inchi",
    "mass_to_charge", "fragmentation", "modifications", "charge",
    "retention_time", "taxid", "species", "database", "database_version",
    "reliability", "uri", "search_engine", "search_engine_score",
    "smallmolecule_abundance_sub", "smallmolecule_abundance_stdev_sub",
    "smallmolecule_abundance_std_error_sub"
  )

  # Which of those are actually present in this file
  present_annot <- intersect(annot_cols, names(maf))

  # Sample columns = everything that is not an annotation or the ID column
  non_sample  <- c("metabolite_identification", present_annot)
  sample_cols <- setdiff(names(maf), non_sample)

  # ── Build unique row IDs ────────────────────────────────────────────────
  # Duplicates occur when the same lipid name appears at multiple m/z values
  # (different adducts e.g. [M+H]+, [M+Na]+) or across both polarities.
  # column_to_rownames() hard-errors on any duplicate, so we disambiguate
  # by appending the rounded m/z; numeric suffix is the last-resort fallback.
  n_dupes <- sum(duplicated(maf$metabolite_identification, incomparables = NA))

  if (n_dupes > 0) {
    message(n_dupes, " duplicate metabolite_identification entries detected — ",
            "appending m/z to create unique row IDs.")

    mz_vals <- round(as.numeric(maf$mass_to_charge), 4)
    is_dup  <- duplicated(maf$metabolite_identification) |
               duplicated(maf$metabolite_identification, fromLast = TRUE)
    row_ids <- ifelse(is_dup,
                      paste0(maf$metabolite_identification, " [", mz_vals, "]"),
                      maf$metabolite_identification)

    # Numeric suffix if name+m/z is still not unique
    if (anyDuplicated(row_ids)) {
      row_ids <- ave(row_ids, row_ids, FUN = function(x) {
        if (length(x) > 1) paste0(x, "_", seq_along(x)) else x
      })
    }
  } else {
    row_ids <- maf$metabolite_identification
  }

  # ── Assemble outputs ────────────────────────────────────────────────────
  annot_out <- maf |>
    select(all_of(present_annot)) |>
    mutate(metabolite_identification = row_ids, .before = 1)

  abund_out <- maf |>
    select(all_of(sample_cols)) |>
    mutate(metabolite_identification = row_ids) |>
    column_to_rownames("metabolite_identification") |>
    mutate(across(everything(), as.numeric)) |>
    as.matrix()

  list(annotation = annot_out, abundance = abund_out)
}

maf_alt <- read_maf(
  "data/MTBLS4461/m_MTBLS4461_DI-MS_alternating__metabolite_profiling_v2_maf.tsv"
)

annot_df      <- maf_alt$annotation
abundance_mat <- maf_alt$abundance

cat("Lipid features:", nrow(abundance_mat), "\n")
#> Lipid features: 497
cat("Samples in MAF:", ncol(abundance_mat), "\n")
#> Samples in MAF: 5662
# Match samples between MAF and metadata
shared_samples <- intersect(colnames(abundance_mat), metadata$sample_name)
cat("Samples matched between MAF and metadata:", length(shared_samples), "\n")
#> Samples matched between MAF and metadata: 5659
if (length(shared_samples) == 0) {
  stop(paste(
    "No matching sample IDs found between MAF and metadata.",
    "Check for whitespace or formatting differences with trimws()."
  ))
}

# Subset to shared samples
abundance_mat <- abundance_mat[, shared_samples]
metadata      <- filter(metadata, sample_name %in% shared_samples) |>
  arrange(match(sample_name, shared_samples))
# Preview the annotation columns
annot_df |>
  select(metabolite_identification, mass_to_charge, database_identifier) |>
  head(10) |>
  kable(caption = "First 10 lipid features (annotation preview)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
First 10 lipid features (annotation preview)
metabolite_identification mass_to_charge database_identifier
cholesterol 369.3514 CHEBI:16113
NA [637.3359] 637.3359 NA
lysophosphatidylcholine 15:0 482.3247 CHEBI:72736
lysophosphatidylethanolamine 18:0 482.3247 CHEBI:64576
lysophosphatidylcholine 16:1 494.3245 CHEBI:64560
lysophosphatidylethanolamine 19:1 494.3245 CHEBI:138537
lysophosphatidylcholine 16:0 496.3404 CHEBI:64563
lysophosphatidylethanolamine 19:0 496.3404 CHEBI:138533
lysophosphatidylcholine 17:0 510.3560 CHEBI:72737
lysophosphatidylethanolamine 20:0 510.3560 CHEBI:132556

3. Quality Control

3a. Missing Values

Missing values arise when a lipid signal falls below the instrument detection threshold. The distributions across features and samples are assessed before filtering.

miss_feat   <- rowMeans(is.na(abundance_mat)) * 100
miss_sample <- colMeans(is.na(abundance_mat)) * 100

p_miss_feat <- ggplot(data.frame(pct = miss_feat), aes(x = pct)) +
  geom_histogram(bins = 40, fill = "#2C7BB6", colour = "white") +
  labs(title = "Missing values per lipid feature",
       x = "% Missing", y = "Feature count") +
  theme_bw(base_size = 12)

p_miss_samp <- ggplot(
  data.frame(sample = names(miss_sample), pct = miss_sample),
  aes(x = reorder(sample, pct), y = pct)
) +
  geom_bar(stat = "identity", fill = "#D7191C") +
  labs(title = "Missing values per sample",
       x = "Sample (sorted by % missing)", y = "% Missing") +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

p_miss_feat + p_miss_samp

3b. Feature Filtering

Features missing in more than 30% of samples are removed.

abundance_filt <- abundance_mat[miss_feat < 30, ]

cat(
  "Features before filter:", nrow(abundance_mat), "\n",
  "Features after filter: ", nrow(abundance_filt), "\n",
  "Features removed:      ", nrow(abundance_mat) - nrow(abundance_filt), "\n"
)
#> Features before filter: 497 
#>  Features after filter:  491 
#>  Features removed:       6

3c. Poor-Quality Sample Flagging

bad_samples <- names(miss_sample[miss_sample > 50])

if (length(bad_samples) > 0) {
  cat("⚠️  Samples with >50% missing — consider exclusion:\n")
  cat(paste(bad_samples, collapse = ", "), "\n")
} else {
  cat("✅ No samples exceed 50% missing threshold.\n")
}
#> ⚠️  Samples with >50% missing — consider exclusion:
#> E171, FC1010, FC1074, H1632, H1858, H954, H965, LY1083, LY2283, LY2587, LY2632, MY1199, RC1414, RC1428, RC1824, T1029, T235, T246, X1082, X1343, X1549, X1941, B213, E1680, E564, FC1032, FC1200, H1353, H1539, H1828, LY1109, LY1982, LY2330, LY2804, LY2974, LY3060, LY3203, M211, M426, MY1139, MY1569, MY1966, P280, RC1248, RC1715, RC1719, RC2064, RC2090, RC2095, RC54, X1224, X1240, X1314, X1483, FC1021, MY1632, RC176, FC1223, MY1234, MY1024, RC2088, X1121, B334, E1318, H869, LY1765, LY2169, LY3214, MY1089, RC1772, RC219, RC80, X1510, X1567, X1600, E1384, FC1286, E269, MY1381, B274, J31, MY1147, X1522, AY1052, X1079, X1971, E1568, FC1070, H764, LY3037, LY3100, MY1626, P596, RC1528, RC1563, X1588, X1665, E133, H1411, X1284, X1579, LY3098, LY3130, RC1814, H1214, LY1855, LY2516, LY3174, X1137, LY3199, RC1411, AY1036, B303, FC1260, H1408, H1658, H1775, LY1101, LY1200, LY2185, LY2534, LY3058, LY3187, LY3213, M26, MY1498, P601, RC1561, T206, X1666, E1053, MY1506, P118, E517, FC1239, LY2980, RC90, E714, FC1008, LY1341, LY3069, M95, MY1039, MY1146, X1045, X1969, X1978, FC1025, LY2784, LY2836, RC1442, RC1710, T413, X1146, X1795, H242, E198, B139, H1464, LY3226, RC1278, X1646, RC1808, T438, X1509, B357, X1363, RC1226, X1571, P608, P430, H1706, LY2249, X1548, DY1112, E1646, B127, LY2711, H1600, B113, LY2698, H1382, RC83, B205, LY1188, LY2710, T422, E27, LY1239, T147, T780, E29, E969, FC1164, J5, T408, LY1456, E1122, E53, P633, T1063, T647, H362, E1642, H1240, H1377, LY2879

3d. Total Ion Signal (Pre-normalisation)

Total ion signal (TIS) variation across samples reflects global intensity differences that normalisation must account for.

tis_df <- data.frame(
  sample = colnames(abundance_filt),
  tis    = colSums(abundance_filt, na.rm = TRUE)
) |>
  left_join(select(metadata, sample_name, gender, diabetes),
            by = c("sample" = "sample_name"))

ggplot(tis_df, aes(x = reorder(sample, tis), y = tis / 1e6, fill = gender)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set1", na.value = "grey60") +
  labs(
    title = "Total ion signal per sample (raw, pre-normalisation)",
    x     = "Sample (sorted by TIS)",
    y     = "Sum intensity (×10⁶)",
    fill  = "Gender"
  ) +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())


4. Normalisation

PQN

Probabilistic Quotient Normalisation (PQN) is applied. A reference spectrum is constructed from the per-feature median across all samples; each sample’s normalisation factor is then the median of its quotients against that reference. This approach is robust when only a subset of lipids differ between samples, which TIC normalisation does not account for (Dieterle et al., 2006).

pqn_normalise <- function(mat) {
  ref       <- apply(mat, 1, median, na.rm = TRUE)
  quotients <- mat / ref
  nf        <- apply(quotients, 2, median, na.rm = TRUE)
  cat("PQN normalisation factors:\n")
  cat("  Mean:", round(mean(nf), 4),
      "| SD:", round(sd(nf), 4),
      "| Range:", round(range(nf), 4), "\n")
  sweep(mat, 2, nf, "/")
}

abundance_pqn <- pqn_normalise(abundance_filt)
#> PQN normalisation factors:
#>   Mean: NA | SD: NA | Range: NA NA

Log₂ Transform

Log₂ transformation stabilises variance across the intensity range and symmetrises the distribution prior to linear modelling.

abundance_log <- log2(abundance_pqn + 1)

Pre-imputation Filter & KNN Imputation

Remaining missing values are filled by KNN imputation (k = 5), which borrows from the five most similar features. A secondary 50% missingness filter is applied immediately before imputation: PQN can introduce new NAs where a feature’s reference median is zero, and subsetting to shared samples can shift per-feature missingness proportions beyond the threshold applied in section 3.

# ── Final missingness filter ─────────────────────────────────────────────
miss_feat_pre <- rowMeans(is.na(abundance_log))
miss_samp_pre <- colMeans(is.na(abundance_log))

n_feat_drop <- sum(miss_feat_pre >= 0.5)
n_samp_drop <- sum(miss_samp_pre >= 0.5)
if (n_feat_drop > 0)
  message(n_feat_drop, " features with >= 50% missing removed before imputation.")
if (n_samp_drop > 0)
  message(n_samp_drop, " samples with >= 50% missing removed before imputation.")

abundance_log_clean <- abundance_log[miss_feat_pre < 0.5, miss_samp_pre < 0.5]
cat("Matrix entering imputation:",
    nrow(abundance_log_clean), "features x",
    ncol(abundance_log_clean), "samples
")
#> Matrix entering imputation: 487 features x 5418 samples
# ── KNN imputation ────────────────────────────────────────────────────────
if (requireNamespace("impute", quietly = TRUE)) {
  library(impute)
  mat <- impute.knn(abundance_log_clean, k = 5)$data
  cat("KNN imputation complete (k = 5).
")
} else {
  mat <- abundance_log_clean
  for (i in seq_len(nrow(mat))) {
    row_med <- median(mat[i, ], na.rm = TRUE)
    mat[i, is.na(mat[i, ])] <- row_med
  }
  cat("Median imputation used (install Bioc 'impute' for KNN).
")
}
#> KNN imputation complete (k = 5).
stopifnot("NAs remain after imputation" = sum(is.na(mat)) == 0)
cat("Final matrix:", nrow(mat), "features x", ncol(mat), "samples
")
#> Final matrix: 487 features x 5418 samples
# Realign metadata to the (possibly reduced) sample set after filters
metadata <- filter(metadata, sample_name %in% colnames(mat)) |>
  arrange(match(sample_name, colnames(mat)))

5. Lipid Class Summarisation

This MAF uses full English lipid names rather than LIPID MAPS abbreviations. Names are parsed and mapped to standard short codes. Total carbon count and degree of unsaturation are extracted from the chain notation (e.g. 34:1 → 34 carbons, 1 double bond).

# The MAF file uses full English names (e.g. "phosphatidylcholine 34:1")
# rather than LIPID MAPS abbreviations. We map the first word(s) of each
# name to a standard short code. The lookup covers every class seen in
# this dataset's table(lipid_class) output.
class_lookup <- c(
  "phosphatidylcholine"          = "PC",
  "lysophosphatidylcholine"      = "LPC",
  "phosphatidylethanolamine"     = "PE",
  "lysophosphatidylethanolamine" = "LPE",
  "phosphatidylserine"           = "PS",
  "phosphatidylinositol"         = "PI",
  "phosphatidylinositole"        = "PI",      # alternate spelling in MAF
  "phosphatidylglycerol"         = "PG",
  "phosphatidylglicerate"        = "PG",      # typo variant in MAF
  "phosphatidic"                 = "PA",      # "phosphatidic acid"
  "Phosphatidic"                 = "PA",
  "phosphatidynserine"           = "PS",      # typo variant
  "sphingomyelin"                = "SM",
  "triacylglycerol"              = "TG",
  "diacylglycerol"               = "DG",
  "cholesteryl"                  = "CE",
  "cholesterol"                  = "Chol",
  "Ceramide"                     = "Cer",
  "CEoxid"                       = "oxCE",
  "CE"                           = "CE",
  "Cer"                          = "Cer",
  "DG"                           = "DG",
  "PG"                           = "PG",
  "fatty"                        = "FA",      # "fatty acid"
  "docosapentaenoate"            = "FA",
  "docosatetraenoate"            = "FA",
  "octadecadienoate"             = "FA",
  "octadecatrienoate"            = "FA",
  "octadecenoate"                = "FA",
  "hexadecenoate"                = "FA",
  "icosatetraenoate"             = "FA",
  "tetracosenoate"               = "FA",
  "tricosanoate"                 = "FA",
  "Tritridecanoin"               = "TG",
  "all"                          = "other",
  "NA"                           = NA_character_
)

lipid_annot <- annot_df |>
  filter(metabolite_identification %in% rownames(mat)) |>
  transmute(
    lipid        = metabolite_identification,
    # Extract the first word to use as the lookup key
    first_word   = str_extract(lipid, "^[^\\s\\[]+"),
    lipid_class  = class_lookup[first_word],
    # Fatty acid chain info: total carbons and double bonds
    chain_info   = str_extract(lipid, "\\d+:\\d+"),
    carbons      = as.integer(str_extract(chain_info, "^\\d+")),
    double_bonds = as.integer(str_extract(chain_info, "(?<=:)\\d+"))
  ) |>
  select(-first_word)

# Summary
class_counts <- count(lipid_annot, lipid_class, sort = TRUE)
cat("Lipid classes after standardisation:\n")
#> Lipid classes after standardisation:
print(class_counts, n = Inf)
#> # A tibble: 15 × 2
#>    lipid_class     n
#>    <chr>       <int>
#>  1 <NA>          128
#>  2 PC             89
#>  3 TG             53
#>  4 SM             51
#>  5 PS             31
#>  6 PE             29
#>  7 PI             21
#>  8 FA             19
#>  9 DG             18
#> 10 PA             15
#> 11 PG             11
#> 12 CE              9
#> 13 LPC             8
#> 14 LPE             4
#> 15 Chol            1

Feature count per class

ggplot(class_counts,
       aes(x = reorder(lipid_class, n), y = n, fill = lipid_class)) +
  geom_col() +
  coord_flip() +
  labs(title = "Lipid species detected per class",
       x = NULL, y = "Number of features") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")

class_counts |>
  mutate(pct = round(n / sum(n) * 100, 1)) |>
  kable(col.names = c("Lipid class", "Features", "% of total"),
        caption = "Detected lipid classes") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Detected lipid classes
Lipid class Features % of total
NA 128 26.3
PC 89 18.3
TG 53 10.9
SM 51 10.5
PS 31 6.4
PE 29 6.0
PI 21 4.3
FA 19 3.9
DG 18 3.7
PA 15 3.1
PG 11 2.3
CE 9 1.8
LPC 8 1.6
LPE 4 0.8
Chol 1 0.2

Mean abundance per class

class_abundance <- as.data.frame(mat) |>
  rownames_to_column("lipid") |>
  left_join(select(lipid_annot, lipid, lipid_class), by = "lipid") |>
  group_by(lipid_class) |>
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)),
            .groups = "drop") |>
  mutate(mean_across_samples = rowMeans(across(where(is.numeric)))) |>
  arrange(desc(mean_across_samples))

ggplot(class_abundance,
       aes(x = reorder(lipid_class, mean_across_samples),
           y = mean_across_samples, fill = lipid_class)) +
  geom_col() +
  coord_flip() +
  labs(title = "Mean summed log₂ abundance by lipid class",
       x = NULL, y = "Mean log₂ (sum across samples)") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")


6. Exploratory Analysis

PCA, correlation structure, and feature variance are examined across all samples prior to differential testing.

pca_res <- prcomp(t(mat), center = TRUE, scale. = TRUE)
var_exp <- round(summary(pca_res)$importance[2, 1:10] * 100, 1)

pca_df <- as.data.frame(pca_res$x[, 1:5]) |>
  rownames_to_column("sample_name") |>
  left_join(metadata, by = "sample_name")

6a. PCA — Diabetes & Gender

ggplot(pca_df, aes(x = PC1, y = PC2, colour = diabetes, shape = gender)) +
  geom_point(size = 3, alpha = 0.85) +
  scale_colour_brewer(palette = "Set1", na.value = "grey60") +
  labs(
    title  = "PCA — log₂ PQN-normalised lipidome",
    x      = paste0("PC1 (", var_exp[1], "%)"),
    y      = paste0("PC2 (", var_exp[2], "%)"),
    colour = "Diabetes",
    shape  = "Gender"
  ) +
  theme_bw(base_size = 12)

ggplot(pca_df, aes(x = PC1, y = PC2, colour = age)) +
  geom_point(size = 3, alpha = 0.85) +
  scale_colour_viridis_c(option = "plasma") +
  labs(
    title  = "PCA — coloured by age",
    x      = paste0("PC1 (", var_exp[1], "%)"),
    y      = paste0("PC2 (", var_exp[2], "%)"),
    colour = "Age (yr)"
  ) +
  theme_bw(base_size = 12)

6b. Scree Plot

data.frame(PC = paste0("PC", 1:10), var = var_exp) |>
  ggplot(aes(x = factor(PC, levels = paste0("PC", 1:10)), y = var)) +
  geom_col(fill = "#4DAC26") +
  geom_line(aes(group = 1)) +
  geom_point(size = 2) +
  labs(title = "Scree plot — variance explained per PC",
       x = NULL, y = "% Variance explained") +
  theme_bw(base_size = 12)

6c. Sample–Sample Correlation Heatmap

Samples clustering by biological group rather than technical variation confirm adequate normalisation. Low-correlation outliers indicate candidates for further QC review.

cor_mat <- cor(mat, use = "pairwise.complete.obs")

col_annot <- metadata |>
  select(sample_name, diabetes, gender) |>
  column_to_rownames("sample_name") |>
  mutate(across(everything(), as.character))

pheatmap(
  cor_mat,
  color             = colorRampPalette(c("#4575B4", "white", "#D73027"))(100),
  clustering_method = "ward.D2",
  annotation_col    = col_annot,
  show_rownames     = FALSE,
  show_colnames     = FALSE,
  main              = "Sample–sample Pearson correlation (PQN log₂)"
)

6d. Feature Heatmap — Top 100 Most Variable Lipids

top100_idx <- order(apply(mat, 1, var, na.rm = TRUE), decreasing = TRUE)[1:100]

pheatmap(
  mat[top100_idx, ],
  scale             = "row",
  color             = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100),
  clustering_method = "ward.D2",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  annotation_col    = col_annot,
  show_rownames     = FALSE,
  show_colnames     = FALSE,
  main              = "Top 100 most variable lipid species (row Z-score)"
)


7. Differential Abundance

Differential abundance is tested using limma’s empirical Bayes framework (Ritchie et al., 2015). Variance estimates are moderated across all features, which improves power at moderate sample sizes. Multiple testing is controlled by Benjamini-Hochberg FDR correction.

These samples are the control arm of an MI case-control study. Diabetes status reflects self-reported comorbidity within the control cohort.

7a. Diabetes: Yes vs No

meta_da <- filter(metadata, !is.na(diabetes))
mat_da  <- mat[, meta_da$sample_name]

group  <- factor(meta_da$diabetes)
design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)

fit  <- lmFit(mat_da, design)
fit2 <- contrasts.fit(
  fit,
  makeContrasts(DiabetesYesVsNo = Yes - No, levels = design)
) |> eBayes()

da_results <- topTable(fit2, coef = "DiabetesYesVsNo",
                       number = Inf, adjust.method = "BH") |>
  rownames_to_column("lipid") |>
  left_join(lipid_annot, by = "lipid") |>
  arrange(adj.P.Val) |>
  mutate(
    sig       = adj.P.Val < 0.05 & abs(logFC) > 0.5,
    direction = case_when(
      sig & logFC >  0.5 ~ "Up in Diabetes",
      sig & logFC < -0.5 ~ "Down in Diabetes",
      TRUE               ~ "NS"
    )
  )

cat("Significant lipids (FDR < 5%, |logFC| > 0.5):",
    sum(da_results$sig, na.rm = TRUE), "\n")
#> Significant lipids (FDR < 5%, |logFC| > 0.5): 0
da_results |>
  filter(sig) |>
  select(lipid, lipid_class, logFC, AveExpr, t, adj.P.Val) |>
  head(20) |>
  mutate(across(where(is.numeric), \(x) round(x, 3))) |>
  kable(caption = "Top 20 diabetes-associated lipids (FDR < 5%)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
Top 20 diabetes-associated lipids (FDR < 5%)
lipid lipid_class logFC AveExpr t adj.P.Val
NA NA NA NA NA NA
:—– :———– —–: ——-: –: ———:
ggplot(da_results, aes(x = logFC, y = -log10(P.Value), colour = direction)) +
  geom_point(alpha = 0.6, size = 1.8) +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", colour = "grey50") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", colour = "grey50") +
  geom_text_repel(
    data = filter(da_results, sig),
    aes(label = lipid), size = 2.5, max.overlaps = 20
  ) +
  scale_colour_manual(values = c(
    "Up in Diabetes"   = "#D73027",
    "Down in Diabetes" = "#4575B4",
    "NS"               = "grey70"
  )) +
  labs(
    title  = "Volcano plot: Diabetes Yes vs No (limma eBayes)",
    x      = "log₂ Fold Change (Yes − No)",
    y      = "−log₁₀(p-value)",
    colour = NULL
  ) +
  theme_bw(base_size = 12)

7b. Gender: Male vs Female

meta_gd <- filter(metadata, !is.na(gender))
mat_gd  <- mat[, meta_gd$sample_name]
grp_gd  <- factor(meta_gd$gender)
des_gd  <- model.matrix(~ 0 + grp_gd)
colnames(des_gd) <- levels(grp_gd)

fit_gd <- lmFit(mat_gd, des_gd) |>
  contrasts.fit(makeContrasts(MaleVsFemale = Male - Female, levels = des_gd)) |>
  eBayes()

gender_results <- topTable(fit_gd, coef = "MaleVsFemale",
                           number = Inf, adjust.method = "BH") |>
  rownames_to_column("lipid") |>
  left_join(lipid_annot, by = "lipid") |>
  arrange(adj.P.Val) |>
  mutate(
    sig       = adj.P.Val < 0.05 & abs(logFC) > 0.5,
    direction = case_when(
      sig & logFC >  0.5 ~ "Up in Male",
      sig & logFC < -0.5 ~ "Up in Female",
      TRUE               ~ "NS"
    )
  )

cat("Significant lipids (FDR < 5%, |logFC| > 0.5):",
    sum(gender_results$sig, na.rm = TRUE), "\n")
#> Significant lipids (FDR < 5%, |logFC| > 0.5): 0
ggplot(gender_results, aes(x = logFC, y = -log10(P.Value), colour = direction)) +
  geom_point(alpha = 0.6, size = 1.8) +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", colour = "grey50") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", colour = "grey50") +
  geom_text_repel(data = filter(gender_results, sig),
                  aes(label = lipid), size = 2.5, max.overlaps = 20) +
  scale_colour_manual(values = c("Up in Male"   = "#4575B4",
                                 "Up in Female" = "#D73027",
                                 "NS"           = "grey70")) +
  labs(title = "Volcano plot: Male vs Female (limma eBayes)",
       x = "log₂ Fold Change (Male − Female)",
       y = "−log₁₀(p-value)", colour = NULL) +
  theme_bw(base_size = 12)


8. Lipid Species Visualisation

8a. Carbon Chain Length Distribution

# Use the top classes actually present in this dataset
top_classes <- class_counts |>
  filter(!is.na(lipid_class), n >= 5) |>
  slice_max(n, n = 6) |>
  pull(lipid_class)

lipid_annot |>
  filter(!is.na(carbons), lipid_class %in% top_classes) |>
  ggplot(aes(x = factor(carbons), fill = lipid_class)) +
  geom_bar() +
  facet_wrap(~ lipid_class, scales = "free_y", ncol = 3) +
  labs(title = "Total fatty acid carbon chain length per class",
       x = "Total carbons", y = "# species") +
  theme_bw(base_size = 11) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

8b. Degree of Unsaturation

Double bond count reflects the degree of polyunsaturation. PUFA composition influences membrane fluidity and is linked to eicosanoid-mediated inflammatory signalling.

lipid_annot |>
  filter(!is.na(double_bonds), lipid_class %in% top_classes) |>
  ggplot(aes(x = factor(double_bonds), fill = lipid_class)) +
  geom_bar() +
  facet_wrap(~ lipid_class, scales = "free_y", ncol = 3) +
  labs(title = "Degree of unsaturation (double bonds) per class",
       x = "Double bonds", y = "# species") +
  theme_bw(base_size = 11) +
  theme(legend.position = "none")

8c. Lipid Class Composition

class_counts |>
  mutate(pct = n / sum(n) * 100) |>
  ggplot(aes(x = reorder(lipid_class, pct), y = pct, fill = lipid_class)) +
  geom_col() +
  coord_flip() +
  labs(title = "Lipid class composition (% of detected features)",
       x = NULL, y = "% of features") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")

8d. Top Diabetes-Associated Lipids — Boxplots

top_da <- filter(da_results, sig) |>
  slice_min(adj.P.Val, n = 20) |>
  pull(lipid)

if (length(top_da) > 0) {
  as.data.frame(mat[top_da, , drop = FALSE]) |>
    rownames_to_column("lipid") |>
    pivot_longer(-lipid, names_to = "sample_name", values_to = "log2_abund") |>
    left_join(select(metadata, sample_name, diabetes), by = "sample_name") |>
    filter(!is.na(diabetes)) |>
    ggplot(aes(x = diabetes, y = log2_abund, fill = diabetes)) +
    geom_boxplot(outlier.size = 0.8, alpha = 0.85) +
    facet_wrap(~ lipid, scales = "free_y", ncol = 5) +
    scale_fill_brewer(palette = "Set1") +
    labs(title = "Top diabetes-associated lipids (FDR-ranked)",
         x = "Diabetes status", y = "log₂ abundance",
         fill = "Diabetes") +
    theme_bw(base_size = 10) +
    theme(legend.position = "bottom")
} else {
  cat("No significant diabetes-associated lipids at the chosen thresholds.\n")
}
#> No significant diabetes-associated lipids at the chosen thresholds.

9. Export Results

dir.create("results", showWarnings = FALSE)

write_csv(as.data.frame(mat) |> rownames_to_column("lipid"),
          "results/abundance_pqn_log2_knn_imputed.csv")

write_csv(lipid_annot,
          "results/lipid_annotation_parsed.csv")

write_csv(da_results,
          "results/DA_diabetes_YesVsNo.csv")

write_csv(gender_results,
          "results/DA_gender_MaleVsFemale.csv")

write_csv(metadata,
          "results/metadata_clean.csv")

cat("Results written to /results/\n")
#> Results written to /results/

Session Info

sessionInfo()
#> R version 4.5.3 (2026-03-11)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Sequoia 15.7.5
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> time zone: Europe/Stockholm
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] impute_1.84.0      limma_3.66.0       kableExtra_1.4.0   knitr_1.51        
#>  [5] RColorBrewer_1.1-3 patchwork_1.3.2    ggrepel_0.9.8      pheatmap_1.0.13   
#>  [9] janitor_2.2.1      lubridate_1.9.5    forcats_1.0.1      stringr_1.6.0     
#> [13] dplyr_1.2.1        purrr_1.2.2        readr_2.2.0        tidyr_1.3.2       
#> [17] tibble_3.3.1       ggplot2_4.0.3      tidyverse_2.0.0   
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6        xfun_0.57           bslib_0.10.0       
#>  [4] tzdb_0.5.0          vctrs_0.7.3         tools_4.5.3        
#>  [7] generics_0.1.4      parallel_4.5.3      pkgconfig_2.0.3    
#> [10] S7_0.2.2            lifecycle_1.0.5     compiler_4.5.3     
#> [13] farver_2.1.2        textshaping_1.0.5   statmod_1.5.1      
#> [16] snakecase_0.11.1    htmltools_0.5.9     sass_0.4.10        
#> [19] yaml_2.3.12         pillar_1.11.1       crayon_1.5.3       
#> [22] jquerylib_0.1.4     cachem_1.1.0        tidyselect_1.2.1   
#> [25] digest_0.6.39       stringi_1.8.7       labeling_0.4.3     
#> [28] fastmap_1.2.0       grid_4.5.3          cli_3.6.6          
#> [31] magrittr_2.0.5      dichromat_2.0-0.1   utf8_1.2.6         
#> [34] withr_3.0.2         scales_1.4.0        bit64_4.8.0        
#> [37] timechange_0.4.0    rmarkdown_2.31      bit_4.6.0          
#> [40] otel_0.2.0          hms_1.1.4           evaluate_1.0.5     
#> [43] viridisLite_0.4.3   rlang_1.2.0         Rcpp_1.1.1-1.1     
#> [46] glue_1.8.1          BiocManager_1.30.27 xml2_1.5.2         
#> [49] svglite_2.2.2       rstudioapi_0.18.0   vroom_1.7.1        
#> [52] jsonlite_2.0.0      R6_2.6.1            systemfonts_1.3.2

References

Citation
Dataset Castro C, Harshfield EL, Butterworth AS, Wood AM, Koulman A, Griffin JL. (2024). A lipidomic dataset for epidemiological studies of acute myocardial infarction. Data in Brief 57:110925
Study Harshfield EL, Koulman A, Ziemek D, Marney L, Fauman EB, Paul DS, et al. (2019). An unbiased lipid phenotyping approach to study the genetic determinants of lipids and their association with coronary heart disease risk factors. J. Proteome Res. 18(6):2397–2410
limma Ritchie et al. (2015) Nucleic Acids Res. doi:10.1093/nar/gkv007
limma eBayes Smyth (2004) Stat. Appl. Genet. Mol. Biol.
impute Troyanskaya et al. (2001) Bioinformatics 17:520
PQN Dieterle et al. (2006) Anal. Chem. 78:4281
LIPID MAPS shorthand Liebisch et al. (2020) J. Lipid Res. 61:1650
tidyverse Wickham et al. (2019) JOSS doi:10.21105/joss.01686
pheatmap Kolde R (2019) CRAN