Title: Metagenomic diversity analysis of river viromes

Description: This script performs taxonomic annotation, filtering, and alpha/beta diversity analysis of metagenomic sequencing data obtained from water column and sediment samples collected along a pollution gradient in the Virilla River, Costa Rica. Sampling was performed during the dry and rainy seasons.

Main Steps: 1. Installation and loading of required R packages. 2. Taxonomic assignment using local SQLite database and taxonomizr. 3. Construction of phyloseq objects from OTU table, taxonomic assignments, and metadata. 4. Rarefaction of samples to normalize sequencing depth. 5. Prevalence- and abundance-based filtering of taxa and samples. 6. Subsetting of samples by habitat (water vs. sediment) and season (dry vs. rainy). 7. Alpha diversity analysis using Observed richness, Shannon, and Inverse Simpson indices. 8. Visualization of alpha diversity across sites and sample types. 9. Statistical testing using Kruskal-Wallis and Dunn’s post-hoc tests. 10.Ordination analysis on CLR-transformed microbial community data to visualize patterns in sample composition using Principal Component Analysis (PCA).

BiocManager::install("phyloseq",force = TRUE)
install.packages("remotes", repos = "https://cloud.r-project.org")
remotes::install_github("vmikk/metagMisc", force = TRUE)
install.packages(
  "microViz",
  repos = c(davidbarnett = "https://david-barnett.r-universe.dev", getOption("repos"))
)
BiocManager::install("metagenomeSeq")
install.packages("devtools", repos = "https://cloud.r-project.org")
devtools::install_github("braddmg/b2p/package")
#Load required libraries
library("phyloseq")
library("phyloseq")
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 4.4.3
library("vegan")
## Cargando paquete requerido: permute
## Cargando paquete requerido: lattice
library("remotes")
## Warning: package 'remotes' was built under R version 4.4.3
library("metagMisc")
## 
## Adjuntando el paquete: 'metagMisc'
## The following object is masked from 'package:remotes':
## 
##     add_metadata
library("microbiome")
## 
## microbiome R package (microbiome.github.com)
##     
## 
## 
##  Copyright (C) 2011-2022 Leo Lahti, 
##     Sudarshan Shetty et al. <microbiome.github.io>
## 
## Adjuntando el paquete: 'microbiome'
## The following object is masked from 'package:metagMisc':
## 
##     prevalence
## The following object is masked from 'package:vegan':
## 
##     diversity
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## The following object is masked from 'package:base':
## 
##     transform
library("DESeq2")
## Cargando paquete requerido: S4Vectors
## Cargando paquete requerido: stats4
## Cargando paquete requerido: BiocGenerics
## 
## Adjuntando el paquete: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Adjuntando el paquete: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Cargando paquete requerido: IRanges
## 
## Adjuntando el paquete: 'IRanges'
## The following object is masked from 'package:microbiome':
## 
##     coverage
## The following object is masked from 'package:phyloseq':
## 
##     distance
## The following object is masked from 'package:grDevices':
## 
##     windows
## Cargando paquete requerido: GenomicRanges
## Cargando paquete requerido: GenomeInfoDb
## Cargando paquete requerido: SummarizedExperiment
## Cargando paquete requerido: MatrixGenerics
## Cargando paquete requerido: matrixStats
## 
## Adjuntando el paquete: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Cargando paquete requerido: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Adjuntando el paquete: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:phyloseq':
## 
##     sampleNames
library("taxonomizr")
## Warning: package 'taxonomizr' was built under R version 4.4.3
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%()    masks IRanges::%within%()
## ✖ microbiome::alpha()      masks ggplot2::alpha()
## ✖ dplyr::collapse()        masks IRanges::collapse()
## ✖ dplyr::combine()         masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()           masks matrixStats::count()
## ✖ dplyr::desc()            masks IRanges::desc()
## ✖ tidyr::expand()          masks S4Vectors::expand()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ dplyr::first()           masks S4Vectors::first()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
## ✖ purrr::reduce()          masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()          masks S4Vectors::rename()
## ✖ lubridate::second()      masks S4Vectors::second()
## ✖ lubridate::second<-()    masks S4Vectors::second<-()
## ✖ dplyr::slice()           masks IRanges::slice()
## ✖ purrr::some()            masks metagMisc::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vegan)
library(picante)
## Cargando paquete requerido: ape
## 
## Adjuntando el paquete: 'ape'
## 
## The following object is masked from 'package:dplyr':
## 
##     where
## 
## Cargando paquete requerido: nlme
## 
## Adjuntando el paquete: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## The following object is masked from 'package:IRanges':
## 
##     collapse
library(ALDEx2)
## Cargando paquete requerido: zCompositions
## Cargando paquete requerido: MASS
## 
## Adjuntando el paquete: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Cargando paquete requerido: NADA
## Cargando paquete requerido: survival
## 
## Adjuntando el paquete: 'NADA'
## 
## The following object is masked from 'package:IRanges':
## 
##     cor
## 
## The following object is masked from 'package:S4Vectors':
## 
##     cor
## 
## The following object is masked from 'package:stats':
## 
##     cor
## 
## Cargando paquete requerido: truncnorm
## Cargando paquete requerido: latticeExtra
## 
## Adjuntando el paquete: 'latticeExtra'
## 
## The following object is masked from 'package:ggplot2':
## 
##     layer
library(metagenomeSeq)
## Cargando paquete requerido: limma
## 
## Adjuntando el paquete: 'limma'
## 
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
## 
## Cargando paquete requerido: glmnet
## Warning: package 'glmnet' was built under R version 4.4.3
## Cargando paquete requerido: Matrix
## 
## Adjuntando el paquete: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## 
## Loaded glmnet 4.1-8
## Cargando paquete requerido: RColorBrewer
library(HMP)
## Cargando paquete requerido: dirmult
## 
## Adjuntando el paquete: 'HMP'
## 
## The following object is masked from 'package:dirmult':
## 
##     weirMoM
library(dendextend)
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## 
## Adjuntando el paquete: 'dendextend'
## 
## The following objects are masked from 'package:ape':
## 
##     ladderize, rotate
## 
## The following object is masked from 'package:permute':
## 
##     shuffle
## 
## The following object is masked from 'package:stats':
## 
##     cutree
library(rms)
## Warning: package 'rms' was built under R version 4.4.3
## Cargando paquete requerido: Hmisc
## Warning: package 'Hmisc' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'Hmisc'
## 
## The following object is masked from 'package:ape':
## 
##     zoom
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following object is masked from 'package:Biobase':
## 
##     contents
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Adjuntando el paquete: 'rms'
## 
## The following object is masked from 'package:vegan':
## 
##     calibrate
library(breakaway)
library(gridExtra)
## 
## Adjuntando el paquete: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:Biobase':
## 
##     combine
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
library(ggplot2)
library(microViz)
## Warning: package 'microViz' was built under R version 4.4.3
## microViz version 0.12.7 - Copyright (C) 2021-2025 David Barnett
## ! Website: https://david-barnett.github.io/microViz
## ✔ Useful?  For citation details, run: `citation("microViz")`
## ✖ Silence? `suppressPackageStartupMessages(library(microViz))`
# Load the 'b2p' package, which provides tools for taxonomic assignment
library(b2p)
# Set up the local SQL taxonomic database by specifying its file path
setupdb(path = "C:/Users/DELL/Desktop/ManuscritoViroma")  # Replace with your actual path to 'accessionTaxa.sql'
## SQLite database accessionTaxa.sql already exists. Delete to regenerate
## Database setup completed in: C:/Users/DELL/Desktop/ManuscritoViroma
#Download SQLite database for taxonomy annotation
#prepareDatabase('accessionTaxa.sql')

# Load taxonomic IDs from CSV
taxid <- read.csv("taxaid.csv")           
taxaid <- list(taxid$taxid)               # Extract 'taxid' column and convert to list
taxaid <- as.numeric(unlist(taxaid))      # Unlist and convert to numeric vector

# Get taxonomy information for each taxid
taxonomy <- getTaxonomy(
  taxaid,
  'accessionTaxa.sql',                    # SQLite database file for taxonomy lookup
  desiredTaxa = c("superkingdom", "phylum", "class", 
                  "order", "family", "genus", "species")  # Specify taxonomic levels
)
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792040f63c98', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file7920798622ac', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792079176b2e', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792034173e42', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792069bf5f41', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792054fd7670', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792039ce297e', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file79205a42ec', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792078bd49cf', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792079385e8e', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file792018c5680', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file7920390c5b9d', motivo
## 'Permission denied'
## Warning in file.remove(tmp): no fue posible abrir el archivo
## 'C:\Users\DELL\AppData\Local\Temp\RtmpUvfLTD\file79209fb1ea1', motivo
## 'Permission denied'
# Clean up row names (remove any whitespace)
row.names(taxonomy) <- gsub("[[:space:]]", "", row.names(taxonomy))

#Import OTU table and metadata
Data=read.table("viromes.csv", sep=",", row.names = 1, header=TRUE)
Data[is.na(Data)] <- 0

# Create OTU and taxonomy objects
OTU <- otu_table(Data, taxa_are_rows = TRUE)    
TAX <- tax_table(taxonomy)                    

# Create initial phyloseq object with OTU and taxonomy data
physeq <- phyloseq(OTU, TAX)                    
sample_names(physeq)                            
##  [1] "A035" "A036" "A037" "A038" "A039" "A040" "A041" "A042" "A043" "A194"
## [11] "A195" "A196" "A197" "A198" "A199" "A200" "A201" "A202" "S035" "S036"
## [21] "S037" "S038" "S039" "S040" "S041" "S042" "S043" "S194" "S195" "S196"
## [31] "S197" "S198" "S199" "S200" "S201" "S202"
#taxa_names(physeq)                              

# Load sample metadata and format it to match the sample names in physeq
meta_data <- read.csv("mdata2.csv", sep = ",")  
sampledata <- sample_data(data.frame(
  meta_data,
  row.names = sample_names(physeq),
  stringsAsFactors = FALSE                      
))
#sampledata                                       

# Merge OTU, taxonomy, and metadata into a unified phyloseq object
physeq1 <- merge_phyloseq(physeq, sampledata)  
#sample_data(physeq1)                             

# Perform rarefaction to normalize sequencing depth across samples
p.rar2 <- rarefy_even_depth(
  physeq1,
  rngseed = 1,                                   # Set seed for reproducibility
  sample.size = min(sample_sums(physeq1)),       # Use minimum sequencing depth
  replace = FALSE                                 # Sample without replacement
)
## `set.seed(1)` was used to initialize repeatable random subsampling.
## Please record this for your records so others can reproduce.
## Try `set.seed(1); .Random.seed` for the full vector
## ...
## 193OTUs were removed because they are no longer 
## present in any sample after random subsampling
## 
## ...
# View metadata column names and rarefied phyloseq object summary
colnames(sample_data(p.rar2))  # Check available metadata variables
##  [1] "Codigo"        "Tipo"          "Punto"         "Muestreo"     
##  [5] "sul1"          "tetA"          "CTX_M"         "qnrS"         
##  [9] "ermB"          "dfrA12"        "blaTEM"        "sul2"         
## [13] "tetQ"          "intI_1"        "Cfecales"      "Ecoli"        
## [17] "Efaecalis"     "CF_EF"         "Porcentaje_OD" "mg_L_OD"      
## [21] "X.S_cm"        "X"             "X.1"           "X.2"          
## [25] "X.3"           "X.4"           "X.5"           "X.6"          
## [29] "X.7"           "X.8"           "X.9"           "X.10"         
## [33] "X.11"
#p.rar2                         # View summary of the rarefied phyloseq object

# Filter taxa by prevalence across samples
# - prev.trh = 0.05: keep taxa present in at least 5% of the samples
# - threshold_condition = "OR": keeps taxa that meet *either* prevalence or abundance thresholds
# - abund.trh = NULL: skip abundance threshold for this step
# - abund.type = "total": considers total abundance per taxon
p.rar4 <- phyloseq_filter_prevalence(
  p.rar2,
  prev.trh = 0.05,
  abund.trh = NULL,
  threshold_condition = "OR",
  abund.type = "total"
)
#p.rar4  

# Further filter samples to retain only those with minimum total abundance of 10 reads
p.rar4.2 <- phyloseq_filter_sample_wise_abund_trim(
  p.rar4,
  minabund = 10
)
METADATA

-Sampling corresponds to the time of the year: 2-Dry season, 3-Rainy season. -Type corresponds to the two types of samples: Water (free-living particles in the water column) and Sediment (particles settled to the bottom of the river) -Site corresponds to three pollution gradients: Site 1=low contamination, Site 2=medium contamination, Site 3=high contamination.

# Subset samples from the filtered phyloseq object (p.rar4.2) 

# Subset for the dry season samples (Muestreo == 2)
Tseca2 = subset_samples(p.rar4.2, Muestreo == 2)

# Subset for the rainy season samples (Muestreo == 3)
Tlluviosa2 = subset_samples(p.rar4.2, Muestreo == 3)

# Subset all water colum samples, then divide them by season
ColAgua2=subset_samples(p.rar4.2, Tipo == "water")
ColaAgua_Seca2 = subset_samples(ColAgua2, Muestreo == 2)        # Water samples from dry season
ColaAgua_Lluvioso2 = subset_samples(ColAgua2, Muestreo == 3)    # Water samples from rainy season

# Subset all sediment samples, then divide them by season
Sediment2=subset_samples(p.rar4.2, Tipo == "sediment")
Sediment2_Seca2 = subset_samples(Sediment2, Muestreo == 2)      # Sediment samples from dry season
Sediment2_Lluvioso2 = subset_samples(Sediment2, Muestreo == 3)  # Sediment samples from rainy season

Alfa diversity analysis

# Create a richness plot (Observed, Shannon, InvSimpson) for dry season samples
DrySeason <- plot_richness(Tseca2, x = "Punto", measures = c("Observed","Shannon", "InvSimpson")) +
  geom_boxplot(aes(fill = as.factor(Tipo)), outlier.shape = NA, position = position_dodge(width = 0.8)) +  
  scale_fill_manual(values = c("#f8c471", "#7fb3d5"), 
                    name = "Tipo") + 
  facet_wrap(~variable, nrow = 1, scales = "free_y") + 
  theme_bw(base_size = 14) +  
  theme(
    panel.grid = element_line(), 
    axis.line = element_line(color = "black"), 
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(size = 14, angle = 90, hjust = 1, face = "bold", color = "black"),
    axis.text.y = element_text(size = 14, color = "black"),
    legend.position = "none",
    legend.title = element_blank(),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
  ) +
  ggtitle("Dry season") +
  labs(y = "Alfa diversity measure") 
## Warning in estimate_richness(physeq, split = TRUE, measures = measures): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
## 
## We recommended that you find the un-trimmed data and retry.
# Display the plot
DrySeason

# Create an alpha diversity plot (Observed, Shannon, InvSimpson) for rainy season samples
RainySeason <- plot_richness(Tlluviosa2, x = "Punto", measures = c("Observed","Shannon", "InvSimpson")) +
  geom_boxplot(aes(fill = as.factor(Tipo)), outlier.shape = NA, position = position_dodge(width = 0.8)) +  
  scale_fill_manual(values = c("#f8c471", "#7fb3d5"), 
                    name = "Tipo") + 
  facet_wrap(~variable, nrow = 1, scales = "free_y") + 
  theme_bw(base_size = 14) +  
  theme(
    panel.grid = element_line(), 
    axis.line = element_line(color = "black"), 
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(size = 14, angle = 90, hjust = 1, face = "bold", color = "black"),
    axis.text.y = element_text(size = 14, color = "black"),
    legend.position = "bottom",
    legend.title = element_blank(),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
  ) +
  ggtitle("Rainy season") +
  labs(y = "Alfa diversity measure") 
## Warning in estimate_richness(physeq, split = TRUE, measures = measures): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
## 
## We recommended that you find the un-trimmed data and retry.
# Display the plot
RainySeason

# Arrange DrySeason and RainySeason plots vertically in one figure
plotTipoMuestreoPunto = grid.arrange(DrySeason, RainySeason, ncol = 1)

# Save the combined figure as a high-resolution JPEG file
ggsave("plot1.jpg", plot = plotTipoMuestreoPunto, width = 15, height = 12, dpi = 450)

Statistical tests

library(tidyverse)
library(tibble)
library(FSA)

# --- SEDIMENT, DRY SEASON ---

# Estimate alpha diversity metrics for sediment samples from the dry season
alphadiv <- estimate_richness(Sediment2_Seca2, measures = c("Observed", "Shannon", "InvSimpson"))
alphadiv2 <- rownames_to_column(alphadiv, var = "Codigo") 
meta_data2 <- rownames_to_column(meta_data, var = "Codigo") 
alphadiv3 <- left_join(alphadiv2, meta_data2, by = "Codigo")

# Perform Kruskal-Wallis test and Dunn post-hoc test for each diversity metric by sampling site (Punto)
kruskal.test(Observed ~ Punto, data = alphadiv3) # p-value = 0.3012
dunnTest(Observed ~ Punto, data = alphadiv3, method = "bonferroni") #no significant

kruskal.test(Shannon ~ Punto, data = alphadiv3) #p-value = 0.02732
dunnTest(Shannon ~ Punto, data = alphadiv3, method = "bonferroni")
#1 Site  1 - Site  2 -1.341641 0.179712495 0.53913748
#2 Site  1 - Site  3 -2.683282 0.007290358 0.02187107 significant
#3 Site  2 - Site  3 -1.341641 0.179712495 0.53913748

kruskal.test(InvSimpson ~ Punto, data = alphadiv3)#p-value = 0.03899
dunnTest(InvSimpson ~ Punto, data = alphadiv3, method = "bonferroni")
#1 Site  1 - Site  2 -1.490712 0.1360371 0.40811138
#2 Site  1 - Site  3 -2.534210 0.0112701 0.03381031 significant
#3 Site  2 - Site  3 -1.043498 0.2967175 0.89015258


# --- WATER COLUMN, DRY SEASON ---

# Estimate alpha diversity metrics for water samples from the dry season
alphadiv5 <- estimate_richness(ColaAgua_Seca2, measures = c("Observed", "Shannon", "InvSimpson"))
alphadiv6 <- rownames_to_column(alphadiv5, var = "Codigo") 
alphadiv7 <- left_join(alphadiv6, meta_data, by = "Codigo")

# Run Kruskal-Wallis and Dunn's test by site (Punto)
kruskal.test(Observed ~ Punto, data = alphadiv7) #p-value = 0.02732
dunnTest(Observed ~ Punto, data = alphadiv7, method = "bonferroni")
#1 Site  1 - Site  2 -1.341641 0.179712495 0.53913748
#2 Site  1 - Site  3 -2.683282 0.007290358 0.02187107 significant
#3 Site  2 - Site  3 -1.341641 0.179712495 0.53913748

kruskal.test(Shannon ~ Punto, data = alphadiv7) #p-value = 0.02732
dunnTest(Shannon ~ Punto, data = alphadiv7, method = "bonferroni")
#1 Site  1 - Site  2 -1.341641 0.179712495 0.53913748
#2 Site  1 - Site  3 -2.683282 0.007290358 0.02187107 significant
#3 Site  2 - Site  3 -1.341641 0.179712495 0.53913748

kruskal.test(InvSimpson ~ Punto, data = alphadiv7)#p-value = 0.06081
dunnTest(InvSimpson ~ Punto, data = alphadiv7, method = "bonferroni")#non significant


# --- WATER COLUMN, RAINY SEASON ---

# Estimate diversity metrics for water samples from the rainy season
alphadiv8 <- estimate_richness(ColaAgua_Lluvioso2, measures = c("Observed", "Shannon", "InvSimpson"))
alphadiv9 <- rownames_to_column(alphadiv8, var = "Codigo") 
alphadiv10 <- left_join(alphadiv9, meta_data, by = "Codigo")

# Run statistical tests
kruskal.test(Observed ~ Punto, data = alphadiv10) #p-value = 0.06646
dunnTest(Observed ~ Punto, data = alphadiv10, method = "bonferroni")#non significant

kruskal.test(Shannon ~ Punto, data = alphadiv10) #p-value = 0.02732
dunnTest(Shannon ~ Punto, data = alphadiv10, method = "bonferroni")
#1 Site  1 - Site  2 1.341641 0.179712495 0.53913748
#2 Site  1 - Site  3 2.683282 0.007290358 0.02187107 significant
#3 Site  2 - Site  3 1.341641 0.179712495 0.53913748

kruskal.test(InvSimpson ~ Punto, data = alphadiv10) #p-value = 0.06646
dunnTest(InvSimpson ~ Punto, data = alphadiv10, method = "bonferroni") #non significant


# --- SEDIMENT, RAINY SEASON ---

# Estimate diversity metrics for sediment samples from the rainy season
alphadiv11 <- estimate_richness(Sediment2_Lluvioso2, measures = c("Observed", "Shannon", "InvSimpson"))
alphadiv12 <- rownames_to_column(alphadiv11, var = "Codigo") 
alphadiv13 <- left_join(alphadiv12, meta_data, by = "Codigo")

# Run Kruskal-Wallis and post-hoc Dunn tests for diversity metrics
kruskal.test(Observed ~ Punto, data = alphadiv13)#p-value = 0.8712
dunnTest(Observed ~ Punto, data = alphadiv13, method = "bonferroni")#non significant

kruskal.test(Shannon ~ Punto, data = alphadiv13)#p-value = 0.3932
dunnTest(Shannon ~ Punto, data = alphadiv13, method = "bonferroni")#non significant

kruskal.test(InvSimpson ~ Punto, data = alphadiv13)#p-value = 0.2019
dunnTest(InvSimpson ~ Punto, data = alphadiv13, method = "bonferroni")#non significant

Beta Diversity analyses

# --- AGGREGATE AND FILTER TAXA BY CLASS LEVEL ---

# Group (agglomerate) taxa at the "class" level from the rarefied phyloseq object (p.rar4.2)
physeqClass = tax_glom(p.rar4.2, "class")
# Transform counts to relative abundance (i.e., each sample's counts are divided by the sample's total count)
physeqClassRA3 = transform_sample_counts(physeqClass, function(x) x / sum(x))
# Filter out classes with a mean relative abundance ≤ 1% across all samples
# Only keep classes with mean abundance > 0.01 (i.e., >1%)
physeqClassRAF3 = filter_taxa(physeqClassRA3, function(x) mean(x) > 0.01, TRUE)
# Get the names of the classes that passed the filter (still present in the filtered dataset)
keepClass = get_taxa_unique(physeqClassRAF3, "class")
physeqF3 = subset_taxa(p.rar4.2, class %in% keepClass)
# Regroup (agglomerate) the filtered dataset again at the class level
physeqF3 = tax_glom(physeqF3, "class")
# Transform to relative abundances again, this time with the final filtered and grouped dataset
ps_rel_abundclass = phyloseq::transform_sample_counts(physeqF3, function(x) { x / sum(x) })


# Define your color palette
library(RColorBrewer)
pastel1_colors <- brewer.pal(8, "Set2")
set3_colors <- brewer.pal(8, "Dark2")
combined_colors <- c(pastel1_colors, set3_colors)

# Extract the unique classes from your phyloseq object
class_names <- sort(unique(as.character(tax_table(ps_rel_abundclass)[, "class"])))
class_names
## [1] "Caudoviricetes" "Faserviricetes" "Herviviricetes" "Leviviricetes" 
## [5] "Megaviricetes"  "Naldaviricetes"
ps_rel_abundclass@tax_table[, "class"] <- as.character(ps_rel_abundclass@tax_table[, "class"])

# Create a bar plot of relative abundance at the class level, faceted by season (Muestreo)
class_Estacion <- phyloseq::plot_bar(ps_rel_abundclass, fill = "class") +
 geom_bar(aes(color = as.factor(class), fill = as.factor(class)), stat = "identity", position = "stack") +
  labs(
       x = "Samples",  
       y = "Relative abundance",  
       fill = "Class",
       color = "Class" ) +
  facet_wrap(~ Muestreo, scales = "free", 
             labeller = labeller(Muestreo = c(
               "2" = "Dry season",
               "3" = "Raining season"
             ))) + 
    scale_fill_manual(values = setNames(combined_colors[1:length(class_names)], class_names)) +
    scale_color_manual(values = setNames(combined_colors[1:length(class_names)], class_names)) +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, color = "black", face = "bold"), 
    axis.text.y = element_text(color = "black"), 
    axis.ticks.x = element_line(), 
    axis.title.x = element_blank(),
    legend.title = element_text(face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold")
  )+
  guides(
    fill = guide_legend(title = "Class"),
    color = guide_legend(title = "Class")
  )

# Visualize the bar plot of relative abundance by class and season
class_Estacion

# Save the plot as a high-resolution JPEG image
ggsave("ClaseEstacion4.jpg", plot = class_Estacion, width = 10, height = 7, dpi = 300)


# --- AGGREGATE AND FILTER TAXA BY ORDER LEVEL ---

# Group taxa at the "order" level (aggregates ASVs/OTUs into taxonomic orders)
physeqOrder = tax_glom(p.rar4.2, taxrank = "order")
# Transform count data into relative abundances per sample
physeqOrderRA = transform_sample_counts(physeqOrder, function(x) x / sum(x))
# Filter out orders with mean relative abundance ≤ 1%
physeqOrderRAF = filter_taxa(physeqOrderRA, function(x) mean(x) > 0.01, TRUE)
# Get the names of taxonomic orders that passed the abundance filter
keepOrders = get_taxa_unique(physeqOrderRAF, "order")
# Subset the original dataset to retain only taxa belonging to the filtered orders
physeqF = subset_taxa(p.rar4.2, order %in% keepOrders)
physeqF = tax_glom(physeqF, taxrank = "order")
# Convert counts in the filtered dataset to relative abundances
ps_rel_abund = transform_sample_counts(physeqF, function(x) x / sum(x))

# Create a color palette (choose one you like)
library(RColorBrewer)
custom_colors <- c(
  brewer.pal(8, "Set2"),
  brewer.pal(8, "Dark2"),
  "#999999", "#E0E0E0"  # Add more if needed
)

# Get unique order names (sorted for consistency)
order_names <- sort(unique(as.character(tax_table(ps_rel_abund)[, "order"])))


ps_rel_abund@tax_table[, "order"] <- as.character(ps_rel_abund@tax_table[, "order"])

# Create a stacked bar plot showing the relative abundance of taxonomic orders per sample
orden_Estacion <- plot_bar(ps_rel_abund, fill = "order") +
  geom_bar(aes(color = as.factor(order), fill = as.factor(order)), stat = "identity", position = "stack") +
  labs(
    x = "Samples",
    y = "Relative abundance ",
    fill = "Order",
    color = "Order"
  ) +
  facet_wrap(~ Muestreo, scales = "free",
             labeller = labeller(Muestreo = c(
               "2" = "Dry season",
               "3" = "Raining season"
             ))) +
  scale_fill_manual(values = setNames(custom_colors[1:length(order_names)], order_names)) +
  scale_color_manual(values = setNames(custom_colors[1:length(order_names)], order_names)) +
  theme(
    panel.background = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, face = "bold", colour = "black"),
    axis.text.y = element_text(color = "black"), 
    axis.ticks.x = element_line(),
    plot.title = element_text(hjust = 0.5),
    legend.title = element_text(face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.title.x = element_text(size = 14, face = "bold")
  ) +
  guides(
    fill = guide_legend(title = "Order"),
    color = guide_legend(title = "Order")
  )

orden_Estacion

# Save the Plot
ggsave("OrdenEstacion.jpg", plot = orden_Estacion, width = 10, height = 7, dpi = 300)

# Arrange the class-level and order-level plots in one figure (stacked vertically)
plotrelativeabundance = grid.arrange(class_Estacion, orden_Estacion, ncol = 1)

# Save the combined plot as a jpg file
ggsave("plotrelativeabundance.jpg", plot = plotrelativeabundance, width = 15, height = 12, dpi = 450)

ORDINATION PLOT

library(ggrepel) 
library(microbiome)
library("ggforce")
## Warning: package 'ggforce' was built under R version 4.4.3
#CLR transform
(p_clr <- microbiome::transform(p.rar4.2, "clr"))
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1043 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 33 sample variables ]
## tax_table()   Taxonomy Table:    [ 1043 taxa by 7 taxonomic ranks ]
#PCA via phyloseq
ord_clr <- phyloseq::ordinate(p_clr, "RDA")

# Calculate the proportion of variance explained by the first axis (clr1)
clr1 <- ord_clr$CA$eig[1] / sum(ord_clr$CA$eig)
# Calculate the proportion of variance explained by the second axis (clr2)
clr2 <- ord_clr$CA$eig[2] / sum(ord_clr$CA$eig)
clr1
##       PC1 
## 0.1786078
clr2
##      PC2 
## 0.138177
sample_data(p.rar4.2)$Muestreo <- as.factor(sample_data(p.rar4.2)$Muestreo)
# Create an ordination plot of clr-transformed data
ordination_plotp <- phyloseq::plot_ordination(p.rar4.2, ord_clr, type = "samples", color = "Muestreo", shape = "Tipo") +
  geom_point(size = 3, alpha = 0.7)+
  coord_fixed(clr2 / clr1) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",  
    legend.title = element_blank(),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    panel.border = element_blank(),  
    axis.line = element_blank(),     
    panel.background = element_blank(),  
    plot.background = element_blank(),
    panel.grid = element_blank(),        
    axis.ticks = element_blank()  
  ) +
  theme(
    legend.position = c(0.95, 0.05), # Bottom-right corner
    legend.justification = c(1, 0), # Anchor legend to bottom-right
    legend.background = element_rect(fill = "white", color = "black"), # Optional background box
    legend.text = element_text(size = 10), 
    legend.title = element_text(size = 9, face = "bold"), 
    legend.key.size = unit(0.4, "cm") 
  )+
  # ADD ggforce's ellipses
  ggforce::geom_mark_ellipse(aes(
    group = Muestreo, color = Muestreo ), alpha = 0.4, size = 1,linetype = "dashed") +        #scale_color_manual(values = c("#eb984e", "#5499c7","black","black")) + # Set custom border colors
  coord_cartesian(xlim = c(-7, 7), ylim = c(-7, 7)) +
  labs(
    x = "PC1 (17.9%)",
    y = "PC2 (13.2%)") +
  # Adding grid lines to form a quadrilateral
  geom_hline(yintercept = c(-7, 0, 7), color = "grey", linetype = "dashed") +  # Horizontal lines
  geom_vline(xintercept = c(-7, 0, 7), color = "grey", linetype = "dashed") +   # Vertical lines
   # Custom color legend (Muestreo)
  scale_color_manual(
     values = c("2" = "#eb984e", "3" = "#5499c7"),
     labels = c("2" = "2-Dry", "3" = "3-Raining"),
     name = "Sampling"
  ) +
  # Custom shape legend (Tipo)
  scale_shape_manual(
    values = c("sediment" = 16, "water" = 17),
    labels = c("sediment" = "Sediment", "water" = "Water"),
    name = "Type"
  )
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
ordination_plotp

# Save the ordination plot as a jpg file
ggsave("ordinationplot.jpg", plot = ordination_plotp, width = 15, height = 7, dpi = 300)

# Save the ordination plot as a pdf file with wider width
ggsave("ordination_plot.pdf", plot = ordination_plotp, width = 16, height = 7, dpi = 300)