#Metabolomic Coverage in Positive Ionization
# Define packages to load
pkg_utils <- c("tidyverse", "scales", "plotly", "heatmaply", "ggpubr", "htmlwidgets")
pkg_R4Mass <- c("RforMassSpectrometry", "SummarizedExperiment", "QFeatures", "MsCoreUtils")
pkg_stats <- c("vsn", "fpc")
# Load packages
lapply(c(pkg_utils, pkg_R4Mass, pkg_stats),
library, character.only = T)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
##
##
##
## Attaching package: 'plotly'
##
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
##
## The following object is masked from 'package:stats':
##
## filter
##
##
## The following object is masked from 'package:graphics':
##
## layout
##
##
## Loading required package: viridis
##
## Loading required package: viridisLite
##
##
## Attaching package: 'viridis'
##
##
## The following object is masked from 'package:scales':
##
## viridis_pal
##
##
##
## ======================
## Welcome to heatmaply version 1.5.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
## Warning: replacing previous import 'MsCoreUtils::smooth' by 'Spectra::smooth'
## when loading 'RforMassSpectrometry'
## Warning: replacing previous import 'MsCoreUtils::bin' by 'Spectra::bin' when
## loading 'RforMassSpectrometry'
##
## Visit https://RforMassSpectrometry.org for detail about the project.
##
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
##
## The following object is masked from 'package:dplyr':
##
## count
##
##
## Attaching package: 'MatrixGenerics'
##
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
##
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
##
## The following object is masked from 'package:heatmaply':
##
## normalize
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
##
## The following object is masked from 'package:plotly':
##
## rename
##
## The following objects are masked from 'package:lubridate':
##
## second, second<-
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following object is masked from 'package:utils':
##
## findMatches
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:plotly':
##
## slice
##
## The following object is masked from 'package:lubridate':
##
## %within%
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:purrr':
##
## reduce
##
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'Biobase'
##
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
##
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Loading required package: MultiAssayExperiment
##
## Attaching package: 'QFeatures'
##
## The following object is masked from 'package:MultiAssayExperiment':
##
## longFormat
##
## The following object is masked from 'package:base':
##
## sweep
##
##
## Attaching package: 'MsCoreUtils'
##
## The following object is masked from 'package:MatrixGenerics':
##
## colCounts
##
## The following object is masked from 'package:matrixStats':
##
## colCounts
##
## The following object is masked from 'package:dplyr':
##
## between
##
## The following object is masked from 'package:stats':
##
## smooth
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "scales" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "plotly" "scales" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "heatmaply" "viridis" "viridisLite" "plotly" "scales"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[5]]
## [1] "ggpubr" "heatmaply" "viridis" "viridisLite" "plotly"
## [6] "scales" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[6]]
## [1] "htmlwidgets" "ggpubr" "heatmaply" "viridis" "viridisLite"
## [6] "plotly" "scales" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "RforMassSpectrometry" "htmlwidgets" "ggpubr"
## [4] "heatmaply" "viridis" "viridisLite"
## [7] "plotly" "scales" "lubridate"
## [10] "forcats" "stringr" "dplyr"
## [13] "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse"
## [19] "stats" "graphics" "grDevices"
## [22] "utils" "datasets" "methods"
## [25] "base"
##
## [[8]]
## [1] "SummarizedExperiment" "Biobase" "GenomicRanges"
## [4] "GenomeInfoDb" "IRanges" "S4Vectors"
## [7] "BiocGenerics" "stats4" "MatrixGenerics"
## [10] "matrixStats" "RforMassSpectrometry" "htmlwidgets"
## [13] "ggpubr" "heatmaply" "viridis"
## [16] "viridisLite" "plotly" "scales"
## [19] "lubridate" "forcats" "stringr"
## [22] "dplyr" "purrr" "readr"
## [25] "tidyr" "tibble" "ggplot2"
## [28] "tidyverse" "stats" "graphics"
## [31] "grDevices" "utils" "datasets"
## [34] "methods" "base"
##
## [[9]]
## [1] "QFeatures" "MultiAssayExperiment" "SummarizedExperiment"
## [4] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [7] "IRanges" "S4Vectors" "BiocGenerics"
## [10] "stats4" "MatrixGenerics" "matrixStats"
## [13] "RforMassSpectrometry" "htmlwidgets" "ggpubr"
## [16] "heatmaply" "viridis" "viridisLite"
## [19] "plotly" "scales" "lubridate"
## [22] "forcats" "stringr" "dplyr"
## [25] "purrr" "readr" "tidyr"
## [28] "tibble" "ggplot2" "tidyverse"
## [31] "stats" "graphics" "grDevices"
## [34] "utils" "datasets" "methods"
## [37] "base"
##
## [[10]]
## [1] "MsCoreUtils" "QFeatures" "MultiAssayExperiment"
## [4] "SummarizedExperiment" "Biobase" "GenomicRanges"
## [7] "GenomeInfoDb" "IRanges" "S4Vectors"
## [10] "BiocGenerics" "stats4" "MatrixGenerics"
## [13] "matrixStats" "RforMassSpectrometry" "htmlwidgets"
## [16] "ggpubr" "heatmaply" "viridis"
## [19] "viridisLite" "plotly" "scales"
## [22] "lubridate" "forcats" "stringr"
## [25] "dplyr" "purrr" "readr"
## [28] "tidyr" "tibble" "ggplot2"
## [31] "tidyverse" "stats" "graphics"
## [34] "grDevices" "utils" "datasets"
## [37] "methods" "base"
##
## [[11]]
## [1] "vsn" "MsCoreUtils" "QFeatures"
## [4] "MultiAssayExperiment" "SummarizedExperiment" "Biobase"
## [7] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [10] "S4Vectors" "BiocGenerics" "stats4"
## [13] "MatrixGenerics" "matrixStats" "RforMassSpectrometry"
## [16] "htmlwidgets" "ggpubr" "heatmaply"
## [19] "viridis" "viridisLite" "plotly"
## [22] "scales" "lubridate" "forcats"
## [25] "stringr" "dplyr" "purrr"
## [28] "readr" "tidyr" "tibble"
## [31] "ggplot2" "tidyverse" "stats"
## [34] "graphics" "grDevices" "utils"
## [37] "datasets" "methods" "base"
##
## [[12]]
## [1] "fpc" "vsn" "MsCoreUtils"
## [4] "QFeatures" "MultiAssayExperiment" "SummarizedExperiment"
## [7] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [10] "IRanges" "S4Vectors" "BiocGenerics"
## [13] "stats4" "MatrixGenerics" "matrixStats"
## [16] "RforMassSpectrometry" "htmlwidgets" "ggpubr"
## [19] "heatmaply" "viridis" "viridisLite"
## [22] "plotly" "scales" "lubridate"
## [25] "forcats" "stringr" "dplyr"
## [28] "purrr" "readr" "tidyr"
## [31] "tibble" "ggplot2" "tidyverse"
## [34] "stats" "graphics" "grDevices"
## [37] "utils" "datasets" "methods"
## [40] "base"
# Set ggplot default theme
theme_set(theme_bw() +
theme(axis.text = element_text(color = "black"),
strip.background = element_blank(),
strip.text.x = element_text(face = "bold")))
# Read assay data
setwd(dir = "/Users/lorandacalderonzamora/Downloads/")
knitr::opts_knit$set(root.dir = "/Users/lorandacalderonzamora/Downloads/")
sample_data <- read_csv("/Users/lorandacalderonzamora/Downloads/sample_data .csv") # read sample metadata, i.e. groups
## Rows: 160 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Sample, Group
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
peak_data <- read_csv("/Users/lorandacalderonzamora/Downloads/peak_data.csv") %>%
dplyr::filter(!is.na(`m/z`)) %>% # Usar dplyr::filter explÃcitamente
mutate(Formula = gsub('(\\d+)', '<sub>\\1</sub>', Formula, perl = TRUE)) %>%
column_to_rownames(var = "m/z")
## Rows: 2068 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Bucket label, Name, Formula
## dbl (1): m/z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
matrix <- read_csv("/Users/lorandacalderonzamora/Downloads/matrix.csv") %>%
as.matrix()
## Rows: 2067 Columns: 160
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (160): DM01-POS, DM02-POS, CT01-POS, ND01-POS, DM03-POS, DM04-POS, DM05-...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Create the SummarizedExperiment object
raw_mz <- SummarizedExperiment(
assays = SimpleList(matrix = matrix),
rowData = peak_data,
colData = sample_data
)
# Replace zeros with NA
NA_mz <- zeroIsNA(raw_mz)
# Normalize using the vsn method with QFeatures
nm_mz <- QFeatures::normalize(NA_mz, method = "vsn")
# Check the model with a Mean-SD plot
mean_SD_plot <- meanSdPlot(assay(nm_mz))

# Replace NA back to zeros
nm_zero <- nm_mz
assay(nm_zero) <- assay(nm_mz) %>%
replace(is.na(.), 0)
# Get and plot NA proportion
NA_data <- nNA(NA_mz)$nNAcols %>%
data.frame() %>%
base::cbind(Group = colData(NA_mz)$Group)
NA_plot <- plot_ly(data = NA_data, x = ~ name, y = ~pNA, color = ~Group,
legendgroup = "A") %>%
add_trace(type = "bar", colors = viridis(n = 3),
hovertemplate = paste("Sample: %{x} <br>",
"pNA: %{y:.2f}%<br>")) %>%
plotly::layout(xaxis = list(title = "Sample"))
# Get and plot intensity summary
sum_data <- colSums(assay(raw_mz)) %>%
data.frame() %>%
dplyr::rename("intensity" = '.') %>%
rownames_to_column(var = 'Sample') %>%
base::cbind(Group = colData(raw_mz)$Group)
sum_plot <- plot_ly(data = sum_data, x = ~ Sample, y = ~intensity, color = ~Group,
legendgroup = "A", showlegend = F) %>%
add_trace(type = "bar", colors = viridis(n = 3),
hovertemplate = paste("Sample: %{x} <br>",
"Intensity: %{y}<br>")
)
# Combine the two plots into a subplot
summary_supplot <- plotly::subplot(NA_plot, sum_plot)
summary_supplot
htmlwidgets::saveWidget(summary_supplot, "summary_supplot_temp.html")
# Get features with > 0.4 NA proportion (0.6 prevalence)
top_met <- NA_mz %>%
filterNA(pNA = 0.4) %>%
assay() %>%
rownames() %>%
as.vector()
# Impute NAs to minimum value for visual purposes only
impute_nm_mz <- QFeatures::impute(nm_mz, method = "min")
# Create matrix for heatmap
mat_tm <- assay(impute_nm_mz) %>%
as.data.frame() %>%
dplyr::filter(rownames(.) %in% top_met) %>%
as.matrix()
# Define Group (ensure it exists in colData)
Group <- colData(nm_mz)$Group
# Define met_names
met_names <- rowData(impute_nm_mz) %>%
as.data.frame() %>%
dplyr::filter(rownames(.) %in% top_met)
# Ensure unique names
met_names <- met_names %>%
mutate(Name = make.unique(Name))
# Create heatmap with unique row labels
sample_heatmap <- heatmaply(
mat_tm,
scale_fill_gradient_fun = scale_fill_viridis_c(option = "inferno"),
col_side_colors = data.frame("Group" = Group),
col_side_palette = colorRampPalette(viridis(n = 3)),
labRow = met_names$Name
)
# Correlation matrix and heatmap
corr_mets <- assay(mat_tm) %>%
as.data.frame() %>%
t() %>%
cor(method = "spearman")
corr_heatmap <- heatmaply_cor(corr_mets, labRow = met_names$Name)
# Visualize heatmaps
sample_heatmap
corr_heatmap
htmlwidgets::saveWidget(sample_heatmap, "sample_heatmap.html", selfcontained = TRUE)
# Save the correlation heatmap
htmlwidgets::saveWidget(corr_heatmap, "corr_heatmap.html", selfcontained = TRUE)
# PCA was applied with vsn-normalizated data, because better groups were detected this way
# Cluster analysis could support this idea
pca_nm <- prcomp(assay(nm_zero), center = T, scale = T)
# Using pamk to calculate optimal number of clusters and plot it
pamk.best <- pamk(assay(nm_zero))
pamk.best.crit <- tibble("nclust" = 1:length(pamk.best$crit), crit = pamk.best$crit)
pamk.best.plot <- ggplot(pamk.best.crit, aes(x = nclust, y = crit )) +
geom_line() +
geom_point() +
geom_vline(xintercept = pamk.best$nc, linetype = "dashed", color = "blue") +
labs(x = "Number of clusters k", y = "Average shillouette width") +
scale_x_continuous(breaks = pamk.best.crit$nclust)
# Get PCA scores and loadings for plot
pca_scores_nm <- data.frame(pca_nm$x) %>%
base::cbind(peak_data, cluster = as.factor(pamk.best$pamobject$clustering))
pca_loadings_nm <- data.frame(pca_nm$rotation) %>%
base::cbind(sample_data)
# Plot PCA loadings
p_loadings <- ggplot(pca_loadings_nm, aes(
x = PC1, y = PC2, color = Group, shape = Group,
text = paste0('Sample: ', rownames(pca_loadings_nm), '<br>',
'Group: ', Group)
)) +
geom_point() +
scale_color_viridis_d()
# Plot PCA scores
p_scores <- ggplot(pca_scores_nm, aes(
x = PC1, y = PC2,
color = cluster, group = cluster, fill = cluster,
text = paste0('Name: ', Name, '<br>',
'Formula: ', Formula, '<br>',
'm/z: ', rownames(pca_scores_nm))
)) +
geom_point() +
stat_ellipse(alpha = 0.4, geom = "polygon") +
stat_mean(size = 6, color = "#444444", alpha = 0.6, stroke = 0.75) +
xlab(paste0("PCA1 (", 100 * round(summary(pca_nm)$importance[2, 1], 4), "%)")) +
ylab(paste0("PCA2 (", 100 * round(summary(pca_nm)$importance[2, 2], 4), "%)")) +
scale_color_viridis_d(option = "plasma") +
scale_fill_viridis_d(option = "plasma")
# Convert to interactive plots
ip_scores <- ggplotly(p_scores, dynamicTicks = TRUE, tooltip = "text") %>%
style(hoverinfo = "skip", traces = -1:-pamk.best$nc)
## Warning: The following aesthetics were dropped during statistical transformation: text.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: text.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ip_loadings <- ggplotly(p_loadings, dynamicTicks = TRUE, tooltip = "text")
# Combine interactive plots
pca_iplot <- plotly::subplot(ip_loadings, ip_scores)
pca_iplot
# 3D PCA plot
(pca_scores_3d <- plot_ly(pca_scores_nm, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cluster) %>%
add_markers(size = 20, colors = viridis(n=3, option = "plasma")))
(pca_loadings_3d <- plot_ly(pca_loadings_nm, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Group) %>%
add_markers(size = 20, colors = viridis(n=3, option = "plasma")))
saveWidget(pca_iplot,
file = "figures/exploratory/PCA.html",
selfcontained = T)
saveWidget(pca_scores_3d,
file = "figures/exploratory/PCA.3D.metabo.html",
selfcontained = T)
saveWidget(pca_loadings_3d,
file = "figures/exploratory/PCA.3D.samples.html",
selfcontained = T)
ggsave("figures/exploratory/cluster number.jpg", pamk.best.plot,
dpi = 600, width = 8, height = 5)