#InputPhyloseq Object

# Clear environment and load phyloseq object
rm(list = ls())
pseq <- readRDS("../input_files/final_Object_4_analysis.rds")
pseq
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1471 taxa and 12 samples ]
## sample_data() Sample Data:       [ 12 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 1471 taxa by 6 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1471 tips and 1470 internal nodes ]

##Making Stack Plots at Family level

# Agglomerate taxa at the family level, transform to percentage abundance, and modify low abundance taxa
y1 <- tax_glom(pseq, taxrank = 'Family') # Agglomerate taxa
y3 <- transform_sample_counts(y1, function(x) x / sum(x)) # Get abundance in percentage
y4 <- psmelt(y3) 
y4$Family <- as.character(y4$Family) # Convert Family to character type
y4$Family[y4$Abundance < 0.02] <- "Families < 2% abund." # Rename families with <2% abundance
# Define custom color palette and create a stacked bar plot
my_colors <- c("#556b2f", "#781156", "#A51876", "#000000", "#F098D3", "#114578", 
               "#185EA5", "#6CABEA", "#6CEAEA", "#98C4F0", "#008080", "#A0A0A0", 
               "#D2D21E", "#F7F7C5", "#784511", "#D2781E", "#781122", "darkgoldenrod1", 
               "#EA6C81", "darkblue", "#000000", "brown1", "#800000", "darkolivegreen1", 
               "#fb9a99", "#FF00FF", "dodgerblue3", "#808000", "#D21E2C", "darkseagreen", 
               "#ffff00", "#c51b7d", "#5F7FC7", "steelblue", "#117878", "#5F7FC7")

p <- ggplot(y4, aes(x = Replicates, y = Abundance, fill = fct_reorder(Family, Abundance))) +
    geom_bar(stat = "identity", position = "fill") +  # Normalize bars to sum to 1
    theme_classic() +
    scale_fill_manual(values = my_colors) +
    theme(text = element_text(size = 16), axis.text.y = element_text(hjust = 1)) +
    scale_x_discrete(drop = TRUE) +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    guides(fill = guide_legend(keywidth = 1, keyheight = 1)) +
    ylab("Distribution of ASVs at family level") +
    theme(axis.text = element_text(face = "bold"))

# Add additional theme elements and facet grid
p + theme(panel.grid.major = element_blank()) + 
    theme(strip.text.x = element_text(size = 12)) +
    facet_grid(cols = vars(Category2), scales = "free", space = "free") +
    guides(fill = guide_legend(ncol = 2))

# Save the plot
ggsave("20Xfamily_level_Compo_in_Dauers.tiff", height = 6, width = 10)

#Figure 2A. Sequencing-based analysis supports the lack of a gut microbiome in dauer larvae and demonstrates sensitivity of low biomass samples to contamination from reagents. (A) Stack plots showing proportion of amplicon sequence variants (ASVs), grouped and colored at bacterial family level, in dauers developing in three independent worm populations (D1-3, N=700-966 worms/population) raised on a compost-derived microbial community (CME4, see Methods). E, environmental microbiomes, M9 buffer control, EXT, DNA extraction buffer control, EL, elution buffer control (used in DNA extraction) and water. 

##Beta Diversity or PCoA Plot

pseq.rel.2 <- transform(pseq, "relative.abundance")
ord_mds_bray2 <- ordinate(pseq.rel.2, "PCoA", "bray", weighted=TRUE)
beta.ps2 <- plot_ordination(pseq.rel.2, 
                            ord_mds_bray2, 
                            color="Category") 
p9 <- beta.ps2 + ggtitle("20X_W-bray_curtis_Dauers and controls") + theme_bw(base_size = 12) + theme(plot.title = element_text(size =12, face= "bold")) + scale_color_manual(values = c("#33ff33",  "#00cccc","#006666","#ff66ff", "darkblue","#800000", "#4B0082","#fb9a99","dodgerblue3", "darkgoldenrod1") )  + geom_point( size=8)   + geom_point( size=6)  
p9

ggsave("20XBRAY_curtis_weighted.tiff", height = 5, width = 7)
#Figure2B.Principal Coordinate Analysis based on Bray-Curtis distances of the communities shown in A 

##SourceTracker Analysis

# Clear the workspace and load necessary libraries
rm(list = ls())
library(phyloseq)
library(ggplot2)
library(reshape2)
library(ggforce)

# Load external script for SourceTracker
source('SourceTracker.r')

# Load the Phyloseq object
pseq <- readRDS("../input_files/final_Object_4_analysis.rds")
taxa_names(pseq) <- paste0("SV", seq(ntaxa(pseq)))

# Extract sample metadata and OTU table from Phyloseq object
metadata <- sample_data(pseq)
otus <- as(otu_table(pseq), "matrix")

# Find common sample IDs between metadata and OTU table
common.sample.ids <- intersect(rownames(metadata), rownames(otus))
otus <- otus[common.sample.ids, ]
metadata <- metadata[common.sample.ids, ]

# Check if there are overlapping samples
if (length(common.sample.ids) <= 1) {
    stop(sprintf('Error: there are %d sample ids in common between the metadata file and OTU table', length(common.sample.ids)))
}

# Extract source and sink indices, as well as environmental metadata
train.ix <- which(metadata$SourceSink == 'source')
test.ix <- which(metadata$SourceSink == 'sink')
envs <- metadata$Env

# Optional: Extract description if it exists
if (is.element('Description', colnames(metadata))) {
    desc <- metadata$Description
}

# Set seed for reproducibility and load SourceTracker
set.seed(100)
source('SourceTracker.r')

# Tune SourceTracker parameters (rarefying training data at 1000)
tune.results <- tune.st(otus[train.ix, ], envs[train.ix])
## Loop 1 of 4, alpha1=0.001000, alpha2=0.001000 Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Loop 2 of 4, alpha1=0.001000, alpha2=0.010000 Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Loop 3 of 4, alpha1=0.001000, alpha2=0.100000 Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Loop 4 of 4, alpha1=0.001000, alpha2=1.000000 Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
## Rarefying training data at 1000
alpha1 <- tune.results$best.alpha1
alpha2 <- tune.results$best.alpha2

# Train SourceTracker object on the training data
st <- sourcetracker(otus[train.ix, ], envs[train.ix])
## Rarefying training data at 1000
# Estimate source proportions in test data
set.seed(100)
results <- predict(st, otus[test.ix, ], alpha1 = alpha1, alpha2 = alpha2)
##                                                EC     Elution Bu          HC        Lawn      MilliQ     Unknown
## ..........   1 of    3, depth= 1000:  0.23 (0.01)    0.33 (0.02) 0.05 (0.01) 0.01 (0.00) 0.01 (0.00) 0.37 (0.01)
## ..........   2 of    3, depth= 1000:  0.23 (0.01)    0.32 (0.01) 0.11 (0.01) 0.00 (0.00) 0.01 (0.00) 0.33 (0.01)
## ..........   3 of    3, depth= 1000:  0.21 (0.01)    0.27 (0.01) 0.09 (0.01) 0.00 (0.00) 0.01 (0.00) 0.43 (0.01)
# Estimate leave-one-out source proportions in training data
results.train <- predict(st, alpha1 = alpha1, alpha2 = alpha2)
## ndraws=10, V=6, T=1471, N=9
## Rarefying training data at 1000
## ..........   1 of    9, depth= 1000:  0.29 (0.02)    0.46 (0.02) 0.00 (0.00) 0.01 (0.00) 0.24 (0.01)
## Rarefying training data at 1000
## ..........   2 of    9, depth= 1000:  0.43 (0.02)    0.13 (0.01) 0.01 (0.00) 0.01 (0.00) 0.42 (0.01)
## Rarefying training data at 1000
## ..........   3 of    9, depth= 1000:  0.00 (0.00)    0.00 (0.00) 0.00 (0.00) 0.38 (0.02) 0.00 (0.00) 0.61 (0.01)
## Rarefying training data at 1000
## ..........   4 of    9, depth= 1000:  0.00 (0.00)    0.00 (0.00) 0.00 (0.00) 0.75 (0.01) 0.00 (0.00) 0.25 (0.01)
## Rarefying training data at 1000
## ..........   5 of    9, depth= 1000:  0.00 (0.00)    0.00 (0.00) 0.00 (0.00) 0.61 (0.01) 0.00 (0.00) 0.39 (0.01)
## Rarefying training data at 1000
## ..........   6 of    9, depth=   45:  0.09 (0.04)    0.72 (0.09) 0.12 (0.07) 0.00 (0.00) 0.00 (0.00) 0.07 (0.00)
## Rarefying training data at 1000
## ..........   7 of    9, depth=   62:  0.18 (0.07)    0.31 (0.06) 0.05 (0.02) 0.03 (0.01) 0.00 (0.01) 0.43 (0.02)
## Rarefying training data at 1000
## ..........   8 of    9, depth= 1000:  0.13 (0.02)    0.52 (0.02) 0.07 (0.01) 0.00 (0.00) 0.02 (0.00) 0.26 (0.01)
## Rarefying training data at 1000
## ..........   9 of    9, depth= 1000:  0.03 (0.00)    0.66 (0.01) 0.04 (0.01) 0.01 (0.00) 0.02 (0.00) 0.24 (0.01)
# Prepare labels for plotting
labels <- sprintf('%s %s', envs, desc)

# Plot results using different plot types
plot(results, labels[test.ix], type = 'pie')
plot(results.train, labels[train.ix], type = 'pie')

# Plot with custom colors and legend
env.colors <- c("#33ff33", "#00cccc", "#006666", "#ff66ff", "darkblue", "grey71", "#4B0082", "#fb9a99", "#FF00FF", "dodgerblue3", "darkgoldenrod1")
plot(results, labels[test.ix], type = 'pie', include.legend = TRUE, env.colors = env.colors)

# Create a melted data frame for downstream results and save as CSV
downstream <- data.frame(results$proportions)
downstream$id <- row.names(downstream)
meltdown <- melt(downstream, id.vars = "id")
write.csv(meltdown, "tuned_alpha_new_file_final.csv")

# Create a bar plot for the results
env.colors <- c("#006666", "#00cccc", "darkblue", "#ff66ff", "#800000", "grey71", "#fb9a99", "#FF00FF", "dodgerblue3", "darkgoldenrod1")
p <- ggplot(meltdown, aes(x = id, y = value)) +
  geom_bar(aes(fill = variable), stat = "identity") +
  scale_fill_manual(values = env.colors) +
  theme(
    panel.grid.major = element_blank(),
    strip.text.x = element_text(size = 12),
    text = element_text(size = 16),
    axis.text.y = element_text(hjust = 1),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    axis.text = element_text(face = "bold")
  ) +
  guides(fill = guide_legend(keywidth = 1, keyheight = 1))

# Display and save the plot
p + theme(panel.grid.major = element_blank(), strip.text.x = element_text(size = 12))

ggsave("tuned_alpha_final_new_file_tunest.png", height = 6, width = 6)
#Figure 2C. SourceTracker analysis assessing the contribution of environmental microbiomes and reagents to ASVs sequenced from dauers.This analysis indicated that only 0.78% of the ASVs associated with dauers could be traced back to the environment in which it grew, in contrast to 61.4% of the ASVs, which originated in reagents, mostly attributed to buffers of the DNA extraction kit. The source of some of the ASV remained unknown, but their likelihood to represent bacteria from dauers gut is slim, since only 0.38% of them was identified also in environmental samples, the source of the worm gut microbiome. 

##Supplementary Figure 1

rm(list=ls())
data <- read.csv("Table2figure.csv")
data <- data[order(data$Bacteria), ]
p4<- ggplot(data, aes(x = Bacteria, y = No_of_CFUs)) +
  geom_bar(stat = "identity", fill = "#696969") +
  facet_wrap(~ CME, scales = "free_x") +
  scale_y_continuous(limits = c(0, 11), breaks = seq(0, 11, by = 2)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) 
p4

ggsave("CFU.tiff", p4, width = 8, height =6, dpi = 300)
#Full length 16S sequencing of colonies obtained from dauers, identified in one of the two experiments dominance for species of the genus Microbacterium, with a few additional colonies of Brucella, Bacillus and Paenibacillus species; the other showed dominance for Bacillus and Paenibacillus species