Lipidomics Workflow

Study Overview

Field	Detail
MetaboLights ID	MTBLS4461
Technique	Direct-infusion MS (DI-MS), alternating polarity
Instrument	Thermo Exactive Orbitrap
Matrix	Human serum
Cohort	Pakistani control subjects from a case-control MI study
Biological factors	Gender, Age, Diabetes status

Samples were infused directly into the mass spectrometer without prior chromatographic separation, enabling high-throughput profiling across a large epidemiological cohort. The absence of chromatography increases ion suppression, making normalisation a critical step in the analytical pipeline.

0. Packages

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

cran_pkgs <- c(
  "tidyverse",    # dplyr, ggplot2, stringr, readr, tidyr
  "janitor",      # clean_names() — makes column names R-safe
  "pheatmap",     # clustered heatmaps
  "ggrepel",      # non-overlapping text labels on volcano plots
  "patchwork",    # stitch ggplot panels together
  "RColorBrewer", # colour palettes
  "knitr",        # kable() for formatted tables
  "kableExtra"    # enhanced kable styling
)

bioc_pkgs <- c(
  "limma",   # eBayes differential abundance — the industry standard
  "impute"   # KNN imputation for missing values
)

install.packages(setdiff(cran_pkgs, rownames(installed.packages())),
                 quiet = TRUE)
BiocManager::install(setdiff(bioc_pkgs, rownames(installed.packages())),
                     ask = FALSE, quiet = TRUE)

suppressPackageStartupMessages(
  lapply(c(cran_pkgs, bioc_pkgs), library, character.only = TRUE)
)

#> [[1]]
#>  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
#>  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
#> [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
#> 
#> [[2]]
#>  [1] "janitor"   "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
#>  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
#> [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
#> 
#> [[3]]
#>  [1] "pheatmap"  "janitor"   "lubridate" "forcats"   "stringr"   "dplyr"    
#>  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
#> [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
#> [19] "base"     
#> 
#> [[4]]
#>  [1] "ggrepel"   "pheatmap"  "janitor"   "lubridate" "forcats"   "stringr"  
#>  [7] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
#> [13] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
#> [19] "methods"   "base"     
#> 
#> [[5]]
#>  [1] "patchwork" "ggrepel"   "pheatmap"  "janitor"   "lubridate" "forcats"  
#>  [7] "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"   
#> [13] "ggplot2"   "tidyverse" "stats"     "graphics"  "grDevices" "utils"    
#> [19] "datasets"  "methods"   "base"     
#> 
#> [[6]]
#>  [1] "RColorBrewer" "patchwork"    "ggrepel"      "pheatmap"     "janitor"     
#>  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
#> [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
#> [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
#> [21] "methods"      "base"        
#> 
#> [[7]]
#>  [1] "knitr"        "RColorBrewer" "patchwork"    "ggrepel"      "pheatmap"    
#>  [6] "janitor"      "lubridate"    "forcats"      "stringr"      "dplyr"       
#> [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
#> [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
#> [21] "datasets"     "methods"      "base"        
#> 
#> [[8]]
#>  [1] "kableExtra"   "knitr"        "RColorBrewer" "patchwork"    "ggrepel"     
#>  [6] "pheatmap"     "janitor"      "lubridate"    "forcats"      "stringr"     
#> [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
#> [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
#> [21] "utils"        "datasets"     "methods"      "base"        
#> 
#> [[9]]
#>  [1] "limma"        "kableExtra"   "knitr"        "RColorBrewer" "patchwork"   
#>  [6] "ggrepel"      "pheatmap"     "janitor"      "lubridate"    "forcats"     
#> [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
#> [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
#> [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
#> 
#> [[10]]
#>  [1] "impute"       "limma"        "kableExtra"   "knitr"        "RColorBrewer"
#>  [6] "patchwork"    "ggrepel"      "pheatmap"     "janitor"      "lubridate"   
#> [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
#> [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
#> [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
#> [26] "base"

1. Data Acquisition

Files are retrieved from the MetaboLights REST API. An FTP mirror is available at ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS4461/.

dir.create("data/MTBLS4461", recursive = TRUE, showWarnings = FALSE)

download_if_missing <- function(filename, dest_dir = "data/MTBLS4461") {
  dest <- file.path(dest_dir, filename)
  if (!file.exists(dest)) {
    url <- paste0(
      "https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS4461/download?file=",
      URLencode(filename, reserved = TRUE)
    )
    message("Downloading: ", filename)
    download.file(url, destfile = dest, mode = "wb", quiet = TRUE)
  }
  invisible(dest)
}

download_if_missing("s_MTBLS4461.txt")
download_if_missing("a_MTBLS4461_DI-MS_alternating__metabolite_profiling.txt")
download_if_missing("m_MTBLS4461_DI-MS_alternating__metabolite_profiling_v2_maf.tsv")
download_if_missing("m_MTBLS4461_DI-MS_positive__metabolite_profiling_v2_maf.tsv_SPLIT")

message("All files ready.")

2. Data Import & Inspection

2a. Sample Metadata

The s_ file is in ISA-Tab format. Ontology term columns are dropped; three biological factors are retained: gender, age, and diabetes status.

meta_raw <- read_tsv(
  "data/MTBLS4461/s_MTBLS4461.txt",
  show_col_types = FALSE
)

metadata <- meta_raw |>
  select(
    sample_name = `Sample Name`,
    source_name = `Source Name`,
    gender      = `Factor Value[Gender]`,
    age         = `Factor Value[Age]`,
    diabetes    = `Factor Value[Diabetes]`
  ) |>
  mutate(
    age      = as.numeric(age),
    gender   = factor(gender),
    diabetes = na_if(diabetes, "") |> factor()
  )

cat("Total samples:", nrow(metadata), "\n")

#> Total samples: 5681

# Gender distribution
table(metadata$gender, useNA = "ifany") |>
  as.data.frame() |>
  setNames(c("Gender", "Count")) |>
  kable(caption = "Gender distribution") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Gender distribution
Gender	Count
Female	1195
Male	4465
NA	21

# Diabetes distribution
table(metadata$diabetes, useNA = "ifany") |>
  as.data.frame() |>
  setNames(c("Diabetes", "Count")) |>
  kable(caption = "Diabetes status") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Diabetes status
Diabetes	Count
No	2604
Yes	1609
NA	1468

# Age summary
summary(metadata$age) |>
  t() |>
  kable(caption = "Age distribution (years)",
        digits = 1) |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Age distribution (years)
Min.	1st Qu.	Median	Mean	3rd Qu.	Max.	NA’s
27	50	54	54.2	60	87	21

head(metadata, 8) |>
  kable(caption = "First 8 rows of cleaned metadata") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

First 8 rows of cleaned metadata
sample_name	source_name	gender	age	diabetes
X1352	Individual_1	Male	50	No
LY3258	Individual_2	Male	51	NA
RC1732	Individual_3	Male	50	No
M580	Individual_4	Male	45	NA
FC1104	Individual_5	Male	60	No
B289	Individual_6	Male	59	NA
X1360	Individual_7	Female	60	No
X1641	Individual_8	Male	70	No

2b. MAF Data Matrix

The MAF (MetaboLights Annotation Format) file contains one row per lipid feature. The first 21 columns are annotation fields; sample abundance columns follow. Three spurious unnamed columns are removed on import.

read_maf <- function(path) {
  maf <- read_tsv(path, show_col_types = FALSE)

  # Drop unnamed junk columns (Unnamed: 21/22/23)
  maf <- maf |> select(-starts_with("Unnamed"))

  # Known annotation columns — metabolite_identification is intentionally
  # excluded here; it is handled separately as the row identifier to avoid
  # it appearing twice in select() calls.
  annot_cols <- c(
    "database_identifier", "chemical_formula", "smiles", "inchi",
    "mass_to_charge", "fragmentation", "modifications", "charge",
    "retention_time", "taxid", "species", "database", "database_version",
    "reliability", "uri", "search_engine", "search_engine_score",
    "smallmolecule_abundance_sub", "smallmolecule_abundance_stdev_sub",
    "smallmolecule_abundance_std_error_sub"
  )

  # Which of those are actually present in this file
  present_annot <- intersect(annot_cols, names(maf))

  # Sample columns = everything that is not an annotation or the ID column
  non_sample  <- c("metabolite_identification", present_annot)
  sample_cols <- setdiff(names(maf), non_sample)

  # ── Build unique row IDs ────────────────────────────────────────────────
  # Duplicates occur when the same lipid name appears at multiple m/z values
  # (different adducts e.g. [M+H]+, [M+Na]+) or across both polarities.
  # column_to_rownames() hard-errors on any duplicate, so we disambiguate
  # by appending the rounded m/z; numeric suffix is the last-resort fallback.
  n_dupes <- sum(duplicated(maf$metabolite_identification, incomparables = NA))

  if (n_dupes > 0) {
    message(n_dupes, " duplicate metabolite_identification entries detected — ",
            "appending m/z to create unique row IDs.")

    mz_vals <- round(as.numeric(maf$mass_to_charge), 4)
    is_dup  <- duplicated(maf$metabolite_identification) |
               duplicated(maf$metabolite_identification, fromLast = TRUE)
    row_ids <- ifelse(is_dup,
                      paste0(maf$metabolite_identification, " [", mz_vals, "]"),
                      maf$metabolite_identification)

    # Numeric suffix if name+m/z is still not unique
    if (anyDuplicated(row_ids)) {
      row_ids <- ave(row_ids, row_ids, FUN = function(x) {
        if (length(x) > 1) paste0(x, "_", seq_along(x)) else x
      })
    }
  } else {
    row_ids <- maf$metabolite_identification
  }

  # ── Assemble outputs ────────────────────────────────────────────────────
  annot_out <- maf |>
    select(all_of(present_annot)) |>
    mutate(metabolite_identification = row_ids, .before = 1)

  abund_out <- maf |>
    select(all_of(sample_cols)) |>
    mutate(metabolite_identification = row_ids) |>
    column_to_rownames("metabolite_identification") |>
    mutate(across(everything(), as.numeric)) |>
    as.matrix()

  list(annotation = annot_out, abundance = abund_out)
}

maf_alt <- read_maf(
  "data/MTBLS4461/m_MTBLS4461_DI-MS_alternating__metabolite_profiling_v2_maf.tsv"
)

annot_df      <- maf_alt$annotation
abundance_mat <- maf_alt$abundance

cat("Lipid features:", nrow(abundance_mat), "\n")

#> Lipid features: 497

cat("Samples in MAF:", ncol(abundance_mat), "\n")

#> Samples in MAF: 5662

# Match samples between MAF and metadata
shared_samples <- intersect(colnames(abundance_mat), metadata$sample_name)
cat("Samples matched between MAF and metadata:", length(shared_samples), "\n")

#> Samples matched between MAF and metadata: 5659

if (length(shared_samples) == 0) {
  stop(paste(
    "No matching sample IDs found between MAF and metadata.",
    "Check for whitespace or formatting differences with trimws()."
  ))
}

# Subset to shared samples
abundance_mat <- abundance_mat[, shared_samples]
metadata      <- filter(metadata, sample_name %in% shared_samples) |>
  arrange(match(sample_name, shared_samples))

# Preview the annotation columns
annot_df |>
  select(metabolite_identification, mass_to_charge, database_identifier) |>
  head(10) |>
  kable(caption = "First 10 lipid features (annotation preview)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

First 10 lipid features (annotation preview)
metabolite_identification	mass_to_charge	database_identifier
cholesterol	369.3514	CHEBI:16113
NA [637.3359]	637.3359	NA
lysophosphatidylcholine 15:0	482.3247	CHEBI:72736
lysophosphatidylethanolamine 18:0	482.3247	CHEBI:64576
lysophosphatidylcholine 16:1	494.3245	CHEBI:64560
lysophosphatidylethanolamine 19:1	494.3245	CHEBI:138537
lysophosphatidylcholine 16:0	496.3404	CHEBI:64563
lysophosphatidylethanolamine 19:0	496.3404	CHEBI:138533
lysophosphatidylcholine 17:0	510.3560	CHEBI:72737
lysophosphatidylethanolamine 20:0	510.3560	CHEBI:132556

3. Quality Control

3a. Missing Values

Missing values arise when a lipid signal falls below the instrument detection threshold. The distributions across features and samples are assessed before filtering.

miss_feat   <- rowMeans(is.na(abundance_mat)) * 100
miss_sample <- colMeans(is.na(abundance_mat)) * 100

p_miss_feat <- ggplot(data.frame(pct = miss_feat), aes(x = pct)) +
  geom_histogram(bins = 40, fill = "#2C7BB6", colour = "white") +
  labs(title = "Missing values per lipid feature",
       x = "% Missing", y = "Feature count") +
  theme_bw(base_size = 12)

p_miss_samp <- ggplot(
  data.frame(sample = names(miss_sample), pct = miss_sample),
  aes(x = reorder(sample, pct), y = pct)
) +
  geom_bar(stat = "identity", fill = "#D7191C") +
  labs(title = "Missing values per sample",
       x = "Sample (sorted by % missing)", y = "% Missing") +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

p_miss_feat + p_miss_samp

3b. Feature Filtering

Features missing in more than 30% of samples are removed.

abundance_filt <- abundance_mat[miss_feat < 30, ]

cat(
  "Features before filter:", nrow(abundance_mat), "\n",
  "Features after filter: ", nrow(abundance_filt), "\n",
  "Features removed:      ", nrow(abundance_mat) - nrow(abundance_filt), "\n"
)

#> Features before filter: 497 
#>  Features after filter:  491 
#>  Features removed:       6

3c. Poor-Quality Sample Flagging

bad_samples <- names(miss_sample[miss_sample > 50])

if (length(bad_samples) > 0) {
  cat("⚠️  Samples with >50% missing — consider exclusion:\n")
  cat(paste(bad_samples, collapse = ", "), "\n")
} else {
  cat("✅ No samples exceed 50% missing threshold.\n")
}

#> ⚠️  Samples with >50% missing — consider exclusion:
#> E171, FC1010, FC1074, H1632, H1858, H954, H965, LY1083, LY2283, LY2587, LY2632, MY1199, RC1414, RC1428, RC1824, T1029, T235, T246, X1082, X1343, X1549, X1941, B213, E1680, E564, FC1032, FC1200, H1353, H1539, H1828, LY1109, LY1982, LY2330, LY2804, LY2974, LY3060, LY3203, M211, M426, MY1139, MY1569, MY1966, P280, RC1248, RC1715, RC1719, RC2064, RC2090, RC2095, RC54, X1224, X1240, X1314, X1483, FC1021, MY1632, RC176, FC1223, MY1234, MY1024, RC2088, X1121, B334, E1318, H869, LY1765, LY2169, LY3214, MY1089, RC1772, RC219, RC80, X1510, X1567, X1600, E1384, FC1286, E269, MY1381, B274, J31, MY1147, X1522, AY1052, X1079, X1971, E1568, FC1070, H764, LY3037, LY3100, MY1626, P596, RC1528, RC1563, X1588, X1665, E133, H1411, X1284, X1579, LY3098, LY3130, RC1814, H1214, LY1855, LY2516, LY3174, X1137, LY3199, RC1411, AY1036, B303, FC1260, H1408, H1658, H1775, LY1101, LY1200, LY2185, LY2534, LY3058, LY3187, LY3213, M26, MY1498, P601, RC1561, T206, X1666, E1053, MY1506, P118, E517, FC1239, LY2980, RC90, E714, FC1008, LY1341, LY3069, M95, MY1039, MY1146, X1045, X1969, X1978, FC1025, LY2784, LY2836, RC1442, RC1710, T413, X1146, X1795, H242, E198, B139, H1464, LY3226, RC1278, X1646, RC1808, T438, X1509, B357, X1363, RC1226, X1571, P608, P430, H1706, LY2249, X1548, DY1112, E1646, B127, LY2711, H1600, B113, LY2698, H1382, RC83, B205, LY1188, LY2710, T422, E27, LY1239, T147, T780, E29, E969, FC1164, J5, T408, LY1456, E1122, E53, P633, T1063, T647, H362, E1642, H1240, H1377, LY2879

3d. Total Ion Signal (Pre-normalisation)

Total ion signal (TIS) variation across samples reflects global intensity differences that normalisation must account for.

tis_df <- data.frame(
  sample = colnames(abundance_filt),
  tis    = colSums(abundance_filt, na.rm = TRUE)
) |>
  left_join(select(metadata, sample_name, gender, diabetes),
            by = c("sample" = "sample_name"))

ggplot(tis_df, aes(x = reorder(sample, tis), y = tis / 1e6, fill = gender)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set1", na.value = "grey60") +
  labs(
    title = "Total ion signal per sample (raw, pre-normalisation)",
    x     = "Sample (sorted by TIS)",
    y     = "Sum intensity (×10⁶)",
    fill  = "Gender"
  ) +
  theme_bw(base_size = 12) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

4. Normalisation

PQN

Probabilistic Quotient Normalisation (PQN) is applied. A reference spectrum is constructed from the per-feature median across all samples; each sample’s normalisation factor is then the median of its quotients against that reference. This approach is robust when only a subset of lipids differ between samples, which TIC normalisation does not account for (Dieterle et al., 2006).

pqn_normalise <- function(mat) {
  ref       <- apply(mat, 1, median, na.rm = TRUE)
  quotients <- mat / ref
  nf        <- apply(quotients, 2, median, na.rm = TRUE)
  cat("PQN normalisation factors:\n")
  cat("  Mean:", round(mean(nf), 4),
      "| SD:", round(sd(nf), 4),
      "| Range:", round(range(nf), 4), "\n")
  sweep(mat, 2, nf, "/")
}

abundance_pqn <- pqn_normalise(abundance_filt)

#> PQN normalisation factors:
#>   Mean: NA | SD: NA | Range: NA NA

Log₂ Transform

Log₂ transformation stabilises variance across the intensity range and symmetrises the distribution prior to linear modelling.

abundance_log <- log2(abundance_pqn + 1)

Pre-imputation Filter & KNN Imputation

Remaining missing values are filled by KNN imputation (k = 5), which borrows from the five most similar features. A secondary 50% missingness filter is applied immediately before imputation: PQN can introduce new NAs where a feature’s reference median is zero, and subsetting to shared samples can shift per-feature missingness proportions beyond the threshold applied in section 3.

# ── Final missingness filter ─────────────────────────────────────────────
miss_feat_pre <- rowMeans(is.na(abundance_log))
miss_samp_pre <- colMeans(is.na(abundance_log))

n_feat_drop <- sum(miss_feat_pre >= 0.5)
n_samp_drop <- sum(miss_samp_pre >= 0.5)
if (n_feat_drop > 0)
  message(n_feat_drop, " features with >= 50% missing removed before imputation.")
if (n_samp_drop > 0)
  message(n_samp_drop, " samples with >= 50% missing removed before imputation.")

abundance_log_clean <- abundance_log[miss_feat_pre < 0.5, miss_samp_pre < 0.5]
cat("Matrix entering imputation:",
    nrow(abundance_log_clean), "features x",
    ncol(abundance_log_clean), "samples
")

#> Matrix entering imputation: 487 features x 5418 samples

# ── KNN imputation ────────────────────────────────────────────────────────
if (requireNamespace("impute", quietly = TRUE)) {
  library(impute)
  mat <- impute.knn(abundance_log_clean, k = 5)$data
  cat("KNN imputation complete (k = 5).
")
} else {
  mat <- abundance_log_clean
  for (i in seq_len(nrow(mat))) {
    row_med <- median(mat[i, ], na.rm = TRUE)
    mat[i, is.na(mat[i, ])] <- row_med
  }
  cat("Median imputation used (install Bioc 'impute' for KNN).
")
}

#> KNN imputation complete (k = 5).

stopifnot("NAs remain after imputation" = sum(is.na(mat)) == 0)
cat("Final matrix:", nrow(mat), "features x", ncol(mat), "samples
")

#> Final matrix: 487 features x 5418 samples

# Realign metadata to the (possibly reduced) sample set after filters
metadata <- filter(metadata, sample_name %in% colnames(mat)) |>
  arrange(match(sample_name, colnames(mat)))

5. Lipid Class Summarisation

This MAF uses full English lipid names rather than LIPID MAPS abbreviations. Names are parsed and mapped to standard short codes. Total carbon count and degree of unsaturation are extracted from the chain notation (e.g. 34:1 → 34 carbons, 1 double bond).

# The MAF file uses full English names (e.g. "phosphatidylcholine 34:1")
# rather than LIPID MAPS abbreviations. We map the first word(s) of each
# name to a standard short code. The lookup covers every class seen in
# this dataset's table(lipid_class) output.
class_lookup <- c(
  "phosphatidylcholine"          = "PC",
  "lysophosphatidylcholine"      = "LPC",
  "phosphatidylethanolamine"     = "PE",
  "lysophosphatidylethanolamine" = "LPE",
  "phosphatidylserine"           = "PS",
  "phosphatidylinositol"         = "PI",
  "phosphatidylinositole"        = "PI",      # alternate spelling in MAF
  "phosphatidylglycerol"         = "PG",
  "phosphatidylglicerate"        = "PG",      # typo variant in MAF
  "phosphatidic"                 = "PA",      # "phosphatidic acid"
  "Phosphatidic"                 = "PA",
  "phosphatidynserine"           = "PS",      # typo variant
  "sphingomyelin"                = "SM",
  "triacylglycerol"              = "TG",
  "diacylglycerol"               = "DG",
  "cholesteryl"                  = "CE",
  "cholesterol"                  = "Chol",
  "Ceramide"                     = "Cer",
  "CEoxid"                       = "oxCE",
  "CE"                           = "CE",
  "Cer"                          = "Cer",
  "DG"                           = "DG",
  "PG"                           = "PG",
  "fatty"                        = "FA",      # "fatty acid"
  "docosapentaenoate"            = "FA",
  "docosatetraenoate"            = "FA",
  "octadecadienoate"             = "FA",
  "octadecatrienoate"            = "FA",
  "octadecenoate"                = "FA",
  "hexadecenoate"                = "FA",
  "icosatetraenoate"             = "FA",
  "tetracosenoate"               = "FA",
  "tricosanoate"                 = "FA",
  "Tritridecanoin"               = "TG",
  "all"                          = "other",
  "NA"                           = NA_character_
)

lipid_annot <- annot_df |>
  filter(metabolite_identification %in% rownames(mat)) |>
  transmute(
    lipid        = metabolite_identification,
    # Extract the first word to use as the lookup key
    first_word   = str_extract(lipid, "^[^\\s\\[]+"),
    lipid_class  = class_lookup[first_word],
    # Fatty acid chain info: total carbons and double bonds
    chain_info   = str_extract(lipid, "\\d+:\\d+"),
    carbons      = as.integer(str_extract(chain_info, "^\\d+")),
    double_bonds = as.integer(str_extract(chain_info, "(?<=:)\\d+"))
  ) |>
  select(-first_word)

# Summary
class_counts <- count(lipid_annot, lipid_class, sort = TRUE)
cat("Lipid classes after standardisation:\n")

#> Lipid classes after standardisation:

print(class_counts, n = Inf)

#> # A tibble: 15 × 2
#>    lipid_class     n
#>    <chr>       <int>
#>  1 <NA>          128
#>  2 PC             89
#>  3 TG             53
#>  4 SM             51
#>  5 PS             31
#>  6 PE             29
#>  7 PI             21
#>  8 FA             19
#>  9 DG             18
#> 10 PA             15
#> 11 PG             11
#> 12 CE              9
#> 13 LPC             8
#> 14 LPE             4
#> 15 Chol            1

Feature count per class

ggplot(class_counts,
       aes(x = reorder(lipid_class, n), y = n, fill = lipid_class)) +
  geom_col() +
  coord_flip() +
  labs(title = "Lipid species detected per class",
       x = NULL, y = "Number of features") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")

class_counts |>
  mutate(pct = round(n / sum(n) * 100, 1)) |>
  kable(col.names = c("Lipid class", "Features", "% of total"),
        caption = "Detected lipid classes") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Detected lipid classes
Lipid class	Features	% of total
NA	128	26.3
PC	89	18.3
TG	53	10.9
SM	51	10.5
PS	31	6.4
PE	29	6.0
PI	21	4.3
FA	19	3.9
DG	18	3.7
PA	15	3.1
PG	11	2.3
CE	9	1.8
LPC	8	1.6
LPE	4	0.8
Chol	1	0.2

Mean abundance per class

class_abundance <- as.data.frame(mat) |>
  rownames_to_column("lipid") |>
  left_join(select(lipid_annot, lipid, lipid_class), by = "lipid") |>
  group_by(lipid_class) |>
  summarise(across(where(is.numeric), \(x) sum(x, na.rm = TRUE)),
            .groups = "drop") |>
  mutate(mean_across_samples = rowMeans(across(where(is.numeric)))) |>
  arrange(desc(mean_across_samples))

ggplot(class_abundance,
       aes(x = reorder(lipid_class, mean_across_samples),
           y = mean_across_samples, fill = lipid_class)) +
  geom_col() +
  coord_flip() +
  labs(title = "Mean summed log₂ abundance by lipid class",
       x = NULL, y = "Mean log₂ (sum across samples)") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")

6. Exploratory Analysis

PCA, correlation structure, and feature variance are examined across all samples prior to differential testing.

pca_res <- prcomp(t(mat), center = TRUE, scale. = TRUE)
var_exp <- round(summary(pca_res)$importance[2, 1:10] * 100, 1)

pca_df <- as.data.frame(pca_res$x[, 1:5]) |>
  rownames_to_column("sample_name") |>
  left_join(metadata, by = "sample_name")

6a. PCA — Diabetes & Gender

ggplot(pca_df, aes(x = PC1, y = PC2, colour = diabetes, shape = gender)) +
  geom_point(size = 3, alpha = 0.85) +
  scale_colour_brewer(palette = "Set1", na.value = "grey60") +
  labs(
    title  = "PCA — log₂ PQN-normalised lipidome",
    x      = paste0("PC1 (", var_exp[1], "%)"),
    y      = paste0("PC2 (", var_exp[2], "%)"),
    colour = "Diabetes",
    shape  = "Gender"
  ) +
  theme_bw(base_size = 12)

ggplot(pca_df, aes(x = PC1, y = PC2, colour = age)) +
  geom_point(size = 3, alpha = 0.85) +
  scale_colour_viridis_c(option = "plasma") +
  labs(
    title  = "PCA — coloured by age",
    x      = paste0("PC1 (", var_exp[1], "%)"),
    y      = paste0("PC2 (", var_exp[2], "%)"),
    colour = "Age (yr)"
  ) +
  theme_bw(base_size = 12)

6b. Scree Plot

data.frame(PC = paste0("PC", 1:10), var = var_exp) |>
  ggplot(aes(x = factor(PC, levels = paste0("PC", 1:10)), y = var)) +
  geom_col(fill = "#4DAC26") +
  geom_line(aes(group = 1)) +
  geom_point(size = 2) +
  labs(title = "Scree plot — variance explained per PC",
       x = NULL, y = "% Variance explained") +
  theme_bw(base_size = 12)

6c. Sample–Sample Correlation Heatmap

Samples clustering by biological group rather than technical variation confirm adequate normalisation. Low-correlation outliers indicate candidates for further QC review.

cor_mat <- cor(mat, use = "pairwise.complete.obs")

col_annot <- metadata |>
  select(sample_name, diabetes, gender) |>
  column_to_rownames("sample_name") |>
  mutate(across(everything(), as.character))

pheatmap(
  cor_mat,
  color             = colorRampPalette(c("#4575B4", "white", "#D73027"))(100),
  clustering_method = "ward.D2",
  annotation_col    = col_annot,
  show_rownames     = FALSE,
  show_colnames     = FALSE,
  main              = "Sample–sample Pearson correlation (PQN log₂)"
)

6d. Feature Heatmap — Top 100 Most Variable Lipids

top100_idx <- order(apply(mat, 1, var, na.rm = TRUE), decreasing = TRUE)[1:100]

pheatmap(
  mat[top100_idx, ],
  scale             = "row",
  color             = colorRampPalette(rev(brewer.pal(11, "RdYlBu")))(100),
  clustering_method = "ward.D2",
  clustering_distance_rows = "correlation",
  clustering_distance_cols = "correlation",
  annotation_col    = col_annot,
  show_rownames     = FALSE,
  show_colnames     = FALSE,
  main              = "Top 100 most variable lipid species (row Z-score)"
)

7. Differential Abundance

Differential abundance is tested using limma’s empirical Bayes framework (Ritchie et al., 2015). Variance estimates are moderated across all features, which improves power at moderate sample sizes. Multiple testing is controlled by Benjamini-Hochberg FDR correction.

These samples are the control arm of an MI case-control study. Diabetes status reflects self-reported comorbidity within the control cohort.

7a. Diabetes: Yes vs No

meta_da <- filter(metadata, !is.na(diabetes))
mat_da  <- mat[, meta_da$sample_name]

group  <- factor(meta_da$diabetes)
design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)

fit  <- lmFit(mat_da, design)
fit2 <- contrasts.fit(
  fit,
  makeContrasts(DiabetesYesVsNo = Yes - No, levels = design)
) |> eBayes()

da_results <- topTable(fit2, coef = "DiabetesYesVsNo",
                       number = Inf, adjust.method = "BH") |>
  rownames_to_column("lipid") |>
  left_join(lipid_annot, by = "lipid") |>
  arrange(adj.P.Val) |>
  mutate(
    sig       = adj.P.Val < 0.05 & abs(logFC) > 0.5,
    direction = case_when(
      sig & logFC >  0.5 ~ "Up in Diabetes",
      sig & logFC < -0.5 ~ "Down in Diabetes",
      TRUE               ~ "NS"
    )
  )

cat("Significant lipids (FDR < 5%, |logFC| > 0.5):",
    sum(da_results$sig, na.rm = TRUE), "\n")

#> Significant lipids (FDR < 5%, |logFC| > 0.5): 0

da_results |>
  filter(sig) |>
  select(lipid, lipid_class, logFC, AveExpr, t, adj.P.Val) |>
  head(20) |>
  mutate(across(where(is.numeric), \(x) round(x, 3))) |>
  kable(caption = "Top 20 diabetes-associated lipids (FDR < 5%)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

Top 20 diabetes-associated lipids (FDR < 5%)
lipid	lipid_class	logFC	AveExpr	t	adj.P.Val
NA	NA	NA	NA	NA	NA
:—–	:———–	—–:	——-:	–:	———:

ggplot(da_results, aes(x = logFC, y = -log10(P.Value), colour = direction)) +
  geom_point(alpha = 0.6, size = 1.8) +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", colour = "grey50") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", colour = "grey50") +
  geom_text_repel(
    data = filter(da_results, sig),
    aes(label = lipid), size = 2.5, max.overlaps = 20
  ) +
  scale_colour_manual(values = c(
    "Up in Diabetes"   = "#D73027",
    "Down in Diabetes" = "#4575B4",
    "NS"               = "grey70"
  )) +
  labs(
    title  = "Volcano plot: Diabetes Yes vs No (limma eBayes)",
    x      = "log₂ Fold Change (Yes − No)",
    y      = "−log₁₀(p-value)",
    colour = NULL
  ) +
  theme_bw(base_size = 12)

7b. Gender: Male vs Female

meta_gd <- filter(metadata, !is.na(gender))
mat_gd  <- mat[, meta_gd$sample_name]
grp_gd  <- factor(meta_gd$gender)
des_gd  <- model.matrix(~ 0 + grp_gd)
colnames(des_gd) <- levels(grp_gd)

fit_gd <- lmFit(mat_gd, des_gd) |>
  contrasts.fit(makeContrasts(MaleVsFemale = Male - Female, levels = des_gd)) |>
  eBayes()

gender_results <- topTable(fit_gd, coef = "MaleVsFemale",
                           number = Inf, adjust.method = "BH") |>
  rownames_to_column("lipid") |>
  left_join(lipid_annot, by = "lipid") |>
  arrange(adj.P.Val) |>
  mutate(
    sig       = adj.P.Val < 0.05 & abs(logFC) > 0.5,
    direction = case_when(
      sig & logFC >  0.5 ~ "Up in Male",
      sig & logFC < -0.5 ~ "Up in Female",
      TRUE               ~ "NS"
    )
  )

cat("Significant lipids (FDR < 5%, |logFC| > 0.5):",
    sum(gender_results$sig, na.rm = TRUE), "\n")

#> Significant lipids (FDR < 5%, |logFC| > 0.5): 0

ggplot(gender_results, aes(x = logFC, y = -log10(P.Value), colour = direction)) +
  geom_point(alpha = 0.6, size = 1.8) +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", colour = "grey50") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", colour = "grey50") +
  geom_text_repel(data = filter(gender_results, sig),
                  aes(label = lipid), size = 2.5, max.overlaps = 20) +
  scale_colour_manual(values = c("Up in Male"   = "#4575B4",
                                 "Up in Female" = "#D73027",
                                 "NS"           = "grey70")) +
  labs(title = "Volcano plot: Male vs Female (limma eBayes)",
       x = "log₂ Fold Change (Male − Female)",
       y = "−log₁₀(p-value)", colour = NULL) +
  theme_bw(base_size = 12)

8. Lipid Species Visualisation

8a. Carbon Chain Length Distribution

# Use the top classes actually present in this dataset
top_classes <- class_counts |>
  filter(!is.na(lipid_class), n >= 5) |>
  slice_max(n, n = 6) |>
  pull(lipid_class)

lipid_annot |>
  filter(!is.na(carbons), lipid_class %in% top_classes) |>
  ggplot(aes(x = factor(carbons), fill = lipid_class)) +
  geom_bar() +
  facet_wrap(~ lipid_class, scales = "free_y", ncol = 3) +
  labs(title = "Total fatty acid carbon chain length per class",
       x = "Total carbons", y = "# species") +
  theme_bw(base_size = 11) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

8b. Degree of Unsaturation

Double bond count reflects the degree of polyunsaturation. PUFA composition influences membrane fluidity and is linked to eicosanoid-mediated inflammatory signalling.

lipid_annot |>
  filter(!is.na(double_bonds), lipid_class %in% top_classes) |>
  ggplot(aes(x = factor(double_bonds), fill = lipid_class)) +
  geom_bar() +
  facet_wrap(~ lipid_class, scales = "free_y", ncol = 3) +
  labs(title = "Degree of unsaturation (double bonds) per class",
       x = "Double bonds", y = "# species") +
  theme_bw(base_size = 11) +
  theme(legend.position = "none")

8c. Lipid Class Composition

class_counts |>
  mutate(pct = n / sum(n) * 100) |>
  ggplot(aes(x = reorder(lipid_class, pct), y = pct, fill = lipid_class)) +
  geom_col() +
  coord_flip() +
  labs(title = "Lipid class composition (% of detected features)",
       x = NULL, y = "% of features") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")

8d. Top Diabetes-Associated Lipids — Boxplots

top_da <- filter(da_results, sig) |>
  slice_min(adj.P.Val, n = 20) |>
  pull(lipid)

if (length(top_da) > 0) {
  as.data.frame(mat[top_da, , drop = FALSE]) |>
    rownames_to_column("lipid") |>
    pivot_longer(-lipid, names_to = "sample_name", values_to = "log2_abund") |>
    left_join(select(metadata, sample_name, diabetes), by = "sample_name") |>
    filter(!is.na(diabetes)) |>
    ggplot(aes(x = diabetes, y = log2_abund, fill = diabetes)) +
    geom_boxplot(outlier.size = 0.8, alpha = 0.85) +
    facet_wrap(~ lipid, scales = "free_y", ncol = 5) +
    scale_fill_brewer(palette = "Set1") +
    labs(title = "Top diabetes-associated lipids (FDR-ranked)",
         x = "Diabetes status", y = "log₂ abundance",
         fill = "Diabetes") +
    theme_bw(base_size = 10) +
    theme(legend.position = "bottom")
} else {
  cat("No significant diabetes-associated lipids at the chosen thresholds.\n")
}

#> No significant diabetes-associated lipids at the chosen thresholds.

9. Export Results

dir.create("results", showWarnings = FALSE)

write_csv(as.data.frame(mat) |> rownames_to_column("lipid"),
          "results/abundance_pqn_log2_knn_imputed.csv")

write_csv(lipid_annot,
          "results/lipid_annotation_parsed.csv")

write_csv(da_results,
          "results/DA_diabetes_YesVsNo.csv")

write_csv(gender_results,
          "results/DA_gender_MaleVsFemale.csv")

write_csv(metadata,
          "results/metadata_clean.csv")

cat("Results written to /results/\n")

#> Results written to /results/

Session Info

sessionInfo()

#> R version 4.5.3 (2026-03-11)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Sequoia 15.7.5
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> time zone: Europe/Stockholm
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] impute_1.84.0      limma_3.66.0       kableExtra_1.4.0   knitr_1.51        
#>  [5] RColorBrewer_1.1-3 patchwork_1.3.2    ggrepel_0.9.8      pheatmap_1.0.13   
#>  [9] janitor_2.2.1      lubridate_1.9.5    forcats_1.0.1      stringr_1.6.0     
#> [13] dplyr_1.2.1        purrr_1.2.2        readr_2.2.0        tidyr_1.3.2       
#> [17] tibble_3.3.1       ggplot2_4.0.3      tidyverse_2.0.0   
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6        xfun_0.57           bslib_0.10.0       
#>  [4] tzdb_0.5.0          vctrs_0.7.3         tools_4.5.3        
#>  [7] generics_0.1.4      parallel_4.5.3      pkgconfig_2.0.3    
#> [10] S7_0.2.2            lifecycle_1.0.5     compiler_4.5.3     
#> [13] farver_2.1.2        textshaping_1.0.5   statmod_1.5.1      
#> [16] snakecase_0.11.1    htmltools_0.5.9     sass_0.4.10        
#> [19] yaml_2.3.12         pillar_1.11.1       crayon_1.5.3       
#> [22] jquerylib_0.1.4     cachem_1.1.0        tidyselect_1.2.1   
#> [25] digest_0.6.39       stringi_1.8.7       labeling_0.4.3     
#> [28] fastmap_1.2.0       grid_4.5.3          cli_3.6.6          
#> [31] magrittr_2.0.5      dichromat_2.0-0.1   utf8_1.2.6         
#> [34] withr_3.0.2         scales_1.4.0        bit64_4.8.0        
#> [37] timechange_0.4.0    rmarkdown_2.31      bit_4.6.0          
#> [40] otel_0.2.0          hms_1.1.4           evaluate_1.0.5     
#> [43] viridisLite_0.4.3   rlang_1.2.0         Rcpp_1.1.1-1.1     
#> [46] glue_1.8.1          BiocManager_1.30.27 xml2_1.5.2         
#> [49] svglite_2.2.2       rstudioapi_0.18.0   vroom_1.7.1        
#> [52] jsonlite_2.0.0      R6_2.6.1            systemfonts_1.3.2

References

	Citation
Dataset	Castro C, Harshfield EL, Butterworth AS, Wood AM, Koulman A, Griffin JL. (2024). A lipidomic dataset for epidemiological studies of acute myocardial infarction. Data in Brief 57:110925
Study	Harshfield EL, Koulman A, Ziemek D, Marney L, Fauman EB, Paul DS, et al. (2019). An unbiased lipid phenotyping approach to study the genetic determinants of lipids and their association with coronary heart disease risk factors. J. Proteome Res. 18(6):2397–2410
limma	Ritchie et al. (2015) Nucleic Acids Res. doi:10.1093/nar/gkv007
limma eBayes	Smyth (2004) Stat. Appl. Genet. Mol. Biol.
impute	Troyanskaya et al. (2001) Bioinformatics 17:520
PQN	Dieterle et al. (2006) Anal. Chem. 78:4281
LIPID MAPS shorthand	Liebisch et al. (2020) J. Lipid Res. 61:1650
tidyverse	Wickham et al. (2019) JOSS doi:10.21105/joss.01686
pheatmap	Kolde R (2019) CRAN

Lipidomics Workflow — MTBLS4461

An Unbiased Lipid Phenotyping Approach To Study the Genetic Determinants of Lipids and Their Association with Coronary Heart Disease Risk Factors

Jaysmita Chaliha

2026-05-09