#Metabolomic Coverage in Positive Ionization
# Define packages to load
pkg_utils <- c("tidyverse", "scales", "plotly", "heatmaply", "ggpubr", "htmlwidgets")
pkg_R4Mass <- c("RforMassSpectrometry", "SummarizedExperiment", "QFeatures", "MsCoreUtils")
pkg_stats <- c("vsn", "fpc")
# Load packages

lapply(c(pkg_utils, pkg_R4Mass, pkg_stats),
       library, character.only = T)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## 
## 
## 
## Attaching package: 'plotly'
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## 
## The following object is masked from 'package:graphics':
## 
##     layout
## 
## 
## Loading required package: viridis
## 
## Loading required package: viridisLite
## 
## 
## Attaching package: 'viridis'
## 
## 
## The following object is masked from 'package:scales':
## 
##     viridis_pal
## 
## 
## 
## ======================
## Welcome to heatmaply version 1.5.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================
## Warning: replacing previous import 'MsCoreUtils::smooth' by 'Spectra::smooth'
## when loading 'RforMassSpectrometry'
## Warning: replacing previous import 'MsCoreUtils::bin' by 'Spectra::bin' when
## loading 'RforMassSpectrometry'
## 
## Visit https://RforMassSpectrometry.org for detail about the project.
## 
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following object is masked from 'package:heatmaply':
## 
##     normalize
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## 
## The following object is masked from 'package:plotly':
## 
##     rename
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following object is masked from 'package:utils':
## 
##     findMatches
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:plotly':
## 
##     slice
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Loading required package: MultiAssayExperiment
## 
## Attaching package: 'QFeatures'
## 
## The following object is masked from 'package:MultiAssayExperiment':
## 
##     longFormat
## 
## The following object is masked from 'package:base':
## 
##     sweep
## 
## 
## Attaching package: 'MsCoreUtils'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     colCounts
## 
## The following object is masked from 'package:matrixStats':
## 
##     colCounts
## 
## The following object is masked from 'package:dplyr':
## 
##     between
## 
## The following object is masked from 'package:stats':
## 
##     smooth
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "scales"    "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "plotly"    "scales"    "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "heatmaply"   "viridis"     "viridisLite" "plotly"      "scales"     
##  [6] "lubridate"   "forcats"     "stringr"     "dplyr"       "purrr"      
## [11] "readr"       "tidyr"       "tibble"      "ggplot2"     "tidyverse"  
## [16] "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
## [21] "methods"     "base"       
## 
## [[5]]
##  [1] "ggpubr"      "heatmaply"   "viridis"     "viridisLite" "plotly"     
##  [6] "scales"      "lubridate"   "forcats"     "stringr"     "dplyr"      
## [11] "purrr"       "readr"       "tidyr"       "tibble"      "ggplot2"    
## [16] "tidyverse"   "stats"       "graphics"    "grDevices"   "utils"      
## [21] "datasets"    "methods"     "base"       
## 
## [[6]]
##  [1] "htmlwidgets" "ggpubr"      "heatmaply"   "viridis"     "viridisLite"
##  [6] "plotly"      "scales"      "lubridate"   "forcats"     "stringr"    
## [11] "dplyr"       "purrr"       "readr"       "tidyr"       "tibble"     
## [16] "ggplot2"     "tidyverse"   "stats"       "graphics"    "grDevices"  
## [21] "utils"       "datasets"    "methods"     "base"       
## 
## [[7]]
##  [1] "RforMassSpectrometry" "htmlwidgets"          "ggpubr"              
##  [4] "heatmaply"            "viridis"              "viridisLite"         
##  [7] "plotly"               "scales"               "lubridate"           
## [10] "forcats"              "stringr"              "dplyr"               
## [13] "purrr"                "readr"                "tidyr"               
## [16] "tibble"               "ggplot2"              "tidyverse"           
## [19] "stats"                "graphics"             "grDevices"           
## [22] "utils"                "datasets"             "methods"             
## [25] "base"                
## 
## [[8]]
##  [1] "SummarizedExperiment" "Biobase"              "GenomicRanges"       
##  [4] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
##  [7] "BiocGenerics"         "stats4"               "MatrixGenerics"      
## [10] "matrixStats"          "RforMassSpectrometry" "htmlwidgets"         
## [13] "ggpubr"               "heatmaply"            "viridis"             
## [16] "viridisLite"          "plotly"               "scales"              
## [19] "lubridate"            "forcats"              "stringr"             
## [22] "dplyr"                "purrr"                "readr"               
## [25] "tidyr"                "tibble"               "ggplot2"             
## [28] "tidyverse"            "stats"                "graphics"            
## [31] "grDevices"            "utils"                "datasets"            
## [34] "methods"              "base"                
## 
## [[9]]
##  [1] "QFeatures"            "MultiAssayExperiment" "SummarizedExperiment"
##  [4] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
##  [7] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [10] "stats4"               "MatrixGenerics"       "matrixStats"         
## [13] "RforMassSpectrometry" "htmlwidgets"          "ggpubr"              
## [16] "heatmaply"            "viridis"              "viridisLite"         
## [19] "plotly"               "scales"               "lubridate"           
## [22] "forcats"              "stringr"              "dplyr"               
## [25] "purrr"                "readr"                "tidyr"               
## [28] "tibble"               "ggplot2"              "tidyverse"           
## [31] "stats"                "graphics"             "grDevices"           
## [34] "utils"                "datasets"             "methods"             
## [37] "base"                
## 
## [[10]]
##  [1] "MsCoreUtils"          "QFeatures"            "MultiAssayExperiment"
##  [4] "SummarizedExperiment" "Biobase"              "GenomicRanges"       
##  [7] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [10] "BiocGenerics"         "stats4"               "MatrixGenerics"      
## [13] "matrixStats"          "RforMassSpectrometry" "htmlwidgets"         
## [16] "ggpubr"               "heatmaply"            "viridis"             
## [19] "viridisLite"          "plotly"               "scales"              
## [22] "lubridate"            "forcats"              "stringr"             
## [25] "dplyr"                "purrr"                "readr"               
## [28] "tidyr"                "tibble"               "ggplot2"             
## [31] "tidyverse"            "stats"                "graphics"            
## [34] "grDevices"            "utils"                "datasets"            
## [37] "methods"              "base"                
## 
## [[11]]
##  [1] "vsn"                  "MsCoreUtils"          "QFeatures"           
##  [4] "MultiAssayExperiment" "SummarizedExperiment" "Biobase"             
##  [7] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [10] "S4Vectors"            "BiocGenerics"         "stats4"              
## [13] "MatrixGenerics"       "matrixStats"          "RforMassSpectrometry"
## [16] "htmlwidgets"          "ggpubr"               "heatmaply"           
## [19] "viridis"              "viridisLite"          "plotly"              
## [22] "scales"               "lubridate"            "forcats"             
## [25] "stringr"              "dplyr"                "purrr"               
## [28] "readr"                "tidyr"                "tibble"              
## [31] "ggplot2"              "tidyverse"            "stats"               
## [34] "graphics"             "grDevices"            "utils"               
## [37] "datasets"             "methods"              "base"                
## 
## [[12]]
##  [1] "fpc"                  "vsn"                  "MsCoreUtils"         
##  [4] "QFeatures"            "MultiAssayExperiment" "SummarizedExperiment"
##  [7] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [10] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [13] "stats4"               "MatrixGenerics"       "matrixStats"         
## [16] "RforMassSpectrometry" "htmlwidgets"          "ggpubr"              
## [19] "heatmaply"            "viridis"              "viridisLite"         
## [22] "plotly"               "scales"               "lubridate"           
## [25] "forcats"              "stringr"              "dplyr"               
## [28] "purrr"                "readr"                "tidyr"               
## [31] "tibble"               "ggplot2"              "tidyverse"           
## [34] "stats"                "graphics"             "grDevices"           
## [37] "utils"                "datasets"             "methods"             
## [40] "base"
# Set ggplot default theme

theme_set(theme_bw() +
            theme(axis.text = element_text(color = "black"),
                  strip.background = element_blank(),
                  strip.text.x = element_text(face = "bold")))
# Read assay data
setwd(dir = "/Users/lorandacalderonzamora/Downloads/")
knitr::opts_knit$set(root.dir = "/Users/lorandacalderonzamora/Downloads/")

sample_data <- read_csv("/Users/lorandacalderonzamora/Downloads/sample_data .csv") # read sample metadata, i.e. groups
## Rows: 160 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Sample, Group
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
peak_data <- read_csv("/Users/lorandacalderonzamora/Downloads/peak_data.csv") %>%
  dplyr::filter(!is.na(`m/z`)) %>% # Usar dplyr::filter explícitamente
  mutate(Formula = gsub('(\\d+)', '<sub>\\1</sub>', Formula, perl = TRUE)) %>%
  column_to_rownames(var = "m/z")
## Rows: 2068 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Bucket label, Name, Formula
## dbl (1): m/z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
matrix <- read_csv("/Users/lorandacalderonzamora/Downloads/matrix.csv") %>%
  as.matrix()
## Rows: 2067 Columns: 160
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (160): DM01-POS, DM02-POS, CT01-POS, ND01-POS, DM03-POS, DM04-POS, DM05-...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Create the SummarizedExperiment object
raw_mz <- SummarizedExperiment(
  assays = SimpleList(matrix = matrix),
  rowData = peak_data,
  colData = sample_data
)

# Replace zeros with NA
NA_mz <- zeroIsNA(raw_mz)

# Normalize using the vsn method with QFeatures
nm_mz <- QFeatures::normalize(NA_mz, method = "vsn")

# Check the model with a Mean-SD plot
mean_SD_plot <- meanSdPlot(assay(nm_mz))

# Replace NA back to zeros
nm_zero <- nm_mz
assay(nm_zero) <- assay(nm_mz) %>%
  replace(is.na(.), 0)
# Get and plot NA proportion
NA_data <- nNA(NA_mz)$nNAcols %>%
  data.frame() %>%
  base::cbind(Group = colData(NA_mz)$Group)

NA_plot <- plot_ly(data = NA_data, x = ~ name, y = ~pNA, color = ~Group,
                   legendgroup = "A") %>%
  add_trace(type = "bar", colors = viridis(n = 3),
            hovertemplate = paste("Sample: %{x} <br>",
                                  "pNA: %{y:.2f}%<br>")) %>%
  plotly::layout(xaxis = list(title = "Sample"))

# Get and plot intensity summary
sum_data <- colSums(assay(raw_mz)) %>%
  data.frame() %>%
  dplyr::rename("intensity" = '.') %>%
  rownames_to_column(var = 'Sample') %>%
  base::cbind(Group = colData(raw_mz)$Group)

sum_plot <- plot_ly(data = sum_data, x = ~ Sample, y = ~intensity, color = ~Group,
                    legendgroup = "A", showlegend = F) %>%
  add_trace(type = "bar", colors = viridis(n = 3),
            hovertemplate = paste("Sample: %{x} <br>",
                                  "Intensity: %{y}<br>")
            )

# Combine the two plots into a subplot
summary_supplot <- plotly::subplot(NA_plot, sum_plot)
summary_supplot
htmlwidgets::saveWidget(summary_supplot, "summary_supplot_temp.html")
# Get features with > 0.4 NA proportion (0.6 prevalence)
top_met <- NA_mz %>%
  filterNA(pNA = 0.4) %>%
  assay() %>%
  rownames() %>%
  as.vector()

# Impute NAs to minimum value for visual purposes only
impute_nm_mz <- QFeatures::impute(nm_mz, method = "min")

# Create matrix for heatmap
mat_tm <- assay(impute_nm_mz) %>%
  as.data.frame() %>%
  dplyr::filter(rownames(.) %in% top_met) %>%
  as.matrix()

# Define Group (ensure it exists in colData)
Group <- colData(nm_mz)$Group

# Define met_names
met_names <- rowData(impute_nm_mz) %>%
  as.data.frame() %>%
  dplyr::filter(rownames(.) %in% top_met)

# Ensure unique names
met_names <- met_names %>%
  mutate(Name = make.unique(Name))

# Create heatmap with unique row labels
sample_heatmap <- heatmaply(
  mat_tm,
  scale_fill_gradient_fun = scale_fill_viridis_c(option = "inferno"),
  col_side_colors = data.frame("Group" = Group),
  col_side_palette = colorRampPalette(viridis(n = 3)),
  labRow = met_names$Name
)

# Correlation matrix and heatmap
corr_mets <- assay(mat_tm) %>%
  as.data.frame() %>%
  t() %>%
  cor(method = "spearman")

corr_heatmap <- heatmaply_cor(corr_mets, labRow = met_names$Name)

# Visualize heatmaps
sample_heatmap
corr_heatmap
htmlwidgets::saveWidget(sample_heatmap, "sample_heatmap.html", selfcontained = TRUE)

# Save the correlation heatmap
htmlwidgets::saveWidget(corr_heatmap, "corr_heatmap.html", selfcontained = TRUE)
# PCA was applied with vsn-normalizated data, because better groups were detected this way
# Cluster analysis could support this idea

pca_nm <- prcomp(assay(nm_zero), center = T, scale = T)


# Using pamk to calculate optimal number of clusters and plot it

pamk.best <- pamk(assay(nm_zero))

pamk.best.crit <- tibble("nclust" = 1:length(pamk.best$crit), crit = pamk.best$crit)

pamk.best.plot <- ggplot(pamk.best.crit, aes(x = nclust, y = crit )) +
  geom_line() +
  geom_point() +
  geom_vline(xintercept = pamk.best$nc, linetype = "dashed", color = "blue") +
  labs(x = "Number of clusters k", y = "Average shillouette width") +
  scale_x_continuous(breaks = pamk.best.crit$nclust)
# Get PCA scores and loadings for plot
pca_scores_nm <- data.frame(pca_nm$x) %>%
  base::cbind(peak_data, cluster = as.factor(pamk.best$pamobject$clustering))

pca_loadings_nm <- data.frame(pca_nm$rotation) %>%
  base::cbind(sample_data)

# Plot PCA loadings
p_loadings <- ggplot(pca_loadings_nm, aes(
  x = PC1, y = PC2, color = Group, shape = Group,
  text = paste0('Sample: ', rownames(pca_loadings_nm), '<br>',
                'Group: ', Group)
)) +
  geom_point() +
  scale_color_viridis_d()

# Plot PCA scores
p_scores <- ggplot(pca_scores_nm, aes(
  x = PC1, y = PC2,
  color = cluster, group = cluster, fill = cluster,
  text = paste0('Name: ', Name, '<br>',
                'Formula: ', Formula, '<br>',
                'm/z: ', rownames(pca_scores_nm))
)) +
  geom_point() +
  stat_ellipse(alpha = 0.4, geom = "polygon") +
  stat_mean(size = 6, color = "#444444", alpha = 0.6, stroke = 0.75) +
  xlab(paste0("PCA1 (", 100 * round(summary(pca_nm)$importance[2, 1], 4), "%)")) +
  ylab(paste0("PCA2 (", 100 * round(summary(pca_nm)$importance[2, 2], 4), "%)")) +
  scale_color_viridis_d(option = "plasma") +
  scale_fill_viridis_d(option = "plasma")

# Convert to interactive plots
ip_scores <- ggplotly(p_scores, dynamicTicks = TRUE, tooltip = "text") %>%
  style(hoverinfo = "skip", traces = -1:-pamk.best$nc)
## Warning: The following aesthetics were dropped during statistical transformation: text.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: text.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
ip_loadings <- ggplotly(p_loadings, dynamicTicks = TRUE, tooltip = "text")

# Combine interactive plots
pca_iplot <- plotly::subplot(ip_loadings, ip_scores)
pca_iplot
# 3D PCA plot

(pca_scores_3d <- plot_ly(pca_scores_nm, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cluster) %>%
  add_markers(size = 20, colors = viridis(n=3, option = "plasma")))
(pca_loadings_3d <- plot_ly(pca_loadings_nm, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Group) %>%
    add_markers(size = 20, colors = viridis(n=3, option = "plasma")))
saveWidget(pca_iplot,
           file = "figures/exploratory/PCA.html",
           selfcontained = T)

saveWidget(pca_scores_3d,
           file = "figures/exploratory/PCA.3D.metabo.html",
           selfcontained = T)

saveWidget(pca_loadings_3d,
           file = "figures/exploratory/PCA.3D.samples.html",
           selfcontained = T)

ggsave("figures/exploratory/cluster number.jpg", pamk.best.plot,
       dpi = 600, width = 8, height = 5)