knitr::opts_chunk$set(warning = FALSE, message = FALSE) 
sequencing data (CD Genomics, initial + TopOff, sequences summed) from May 2024 eDNA decay mesocosm

set up

install packages

library(knitr)
#install.packages("PKI")
#install.packages("rsconnect", type = "source")
library(rsconnect)
library(reshape2)
library(ggplot2)
library(vegan)
library(plyr)
library(dplyr)
library(palettetown)
library(heatmap3)
library(ggbeeswarm)
library(randomForest)
library(readxl)
library(segmented)
set.seed(100)

files

upload files

path<-"/Users/kylielanglois/SCCWRP/Oceankind - eDNA Work/eDNA Decay Studies/eDNA Degradation Microcosms/final data 041125"
path.local<-"/Users/kylielanglois/OneDrive - SCCWRP/eDNA decay"

dat<-read.csv(
  file.path(
    path, 
    "16S OTU Table/otu.intial.plus.topoff.final.answer.csv")) 
#".relabd" for relative abundance table

tax.col<-c("Kingdom", "Phylum", "Class", "Order", "Family", 
              "Genus", "Species", "Consensus.Lineage")
tax<-dat[ , match(c("OTUID", tax.col), colnames(dat))]

dat1<-dat[ , -match(tax.col, colnames(dat))]
dat1<-dat1[, -1] #ger rid of "rownames"

met<-read.csv("/Users/kylielanglois/OneDrive - SCCWRP/eDNA decay/eDNA decay May 2024 metadata.csv")
met1<-met[!grepl("TO", met$Sequencing.Name), ]

cop<-read_excel(
  file.path(
    path, 
    "ddPCR/eDNA decay may 2024 combo_dilutiondecision_FINAL.xlsx"), sheet = 7)

div.decay<-read.csv(
  file.path(path.local, "BIO_eDNA_decay_16S_Init_TopOff_final_shan-low.csv"))

remove samples

remove<-c("X57", "X83", "X243")
remove.g<-paste0(remove, collapse = "|")

dat<-dat[, !grepl(remove.g, colnames(dat))]
dat1<-dat1[, !grepl(remove.g, colnames(dat1))]
met<-met[!grepl(remove.g, met$Sequencing.Name), ]
met1<-met1[!grepl(remove.g, met1$Sequencing.Name), ]

gene copy number, decay rate

code from Nicholas Lombardo () broken-stick model to explain gene copies per 1ml over time, each treatment

dna_clean <- cop1 |>
  mutate(
    #make hours into character
    hours = Hours,
    #get copies per 1ml from data
    DNA_copies = `Copies per 1 mL`,
    #make treatment into factor
    Treatment.char = factor(toupper(Treatment.char), levels = c("3. FISH", "2. CELL", "1. DNA"))
  ) |>
  #put together
  arrange(Treatment.char, Temperature.char, Hours, DNA_copies) |>
  dplyr::select(Treatment.char, Temperature.char, Hours, DNA_copies) |>
  #mean copies per 1ml per temp+treatment+time
  group_by(Treatment.char, Temperature.char, Hours) |>
  summarize(
    DNA_copies = mean(DNA_copies) 
  )

intercepts<- 
  dna_clean |>
  group_by(Treatment.char, Temperature.char) |>
  summarize(
    #linear model of log copies per 1ml by hour, grouped by treatment+temp
    model = list(lm(log(DNA_copies) ~ Hours)),
    #get y intercept
    intercept = purrr::map_dbl(model, function(x) exp(coef(x)[1])),
    #get p value
    intercept_p_val = purrr::map_dbl(model, function(x) coef(summary(x))[1, 4])
  ) |>
  ungroup() |>
  dplyr::select(-model) 

### broken stick ####
# fixed breakpoint ####
model_data_for_plot1 <- 
  dna_clean |>
  group_by(Treatment.char, Temperature.char) |>
  #group by treatment+temp
  summarize(
    .groups = "keep",
    model = list(
      #linear model of average copies per 1ml by Hours
      segmented(
        lm(log(DNA_copies) ~ Hours, 
           data = pick(DNA_copies, Hours)),
        seg.Z = ~Hours,
        psi = 24,
        control = seg.control(it.max = 0)
      )
    ),
    #get model summary
    summary = purrr::map(model, function(x) summary(x)),
    adj_r_squared = purrr::map_dbl(summary, function(x) x$adj.r.squared),
    aic = purrr::map_dbl(model, function(x) AIC(x))
  ) |>
  mutate(
    #get breakpoint (24 Hours) to split linear model into 2
    preds = purrr::map(model, function(mod) {
      predict(mod,
        interval = "confidence",
        newdata = data.frame(Hours = seq(min(mod$model$Hours), 
                                         max(mod$model$Hours), by = 1)) |>
          mutate(U1.Hours = (Hours > 24) * (Hours-24))
      ) |>
        as.data.frame() |>
        bind_cols(Hours = seq(min(mod$model$Hours), max(mod$model$Hours), by = 1))
    }),
    slopes = purrr::map(model, function(mod) {
      #browser()
      mod$nameUV <- list(U = "U1.Hours", Z = "Hours")
      as.data.frame(slope(mod)$Hours) |>
        tibble::rownames_to_column(var = "slope")
    })
  ) |>
  tidyr::unnest(cols = preds) |>
  tidyr::unnest(cols = slopes) |>
  #put together
  dplyr::select(Treatment.char, Temperature.char, 
                adj_r_squared, 
                fit, 
                lwr, upr, 
                Hours,
    slope, est = `Est.`, 
    t = `t value`
  ) |>
  tidyr::pivot_wider(names_from = slope,
                     values_from = c("est", "t"))

#get annotations from model to put on plots
annotation_data <- 
  model_data_for_plot1 |>
  distinct(Treatment.char, Temperature.char, 
           k1 = est_slope1, 
           k2 = est_slope2, 
           adj_r_squared) |>
  mutate(
    metrics = paste0(
      "k: ", format(round(-k1, 3), nsmall = 3), ", ", format(round(-k2, 3), nsmall = 3), "\n",
      "break point: 24\n",
      "r.sq: ", format(round(adj_r_squared, 2), nsmall = 2)), 
    metrics2 = paste0(
      "k: ", format(round(-k1, 3), nsmall = 3), ", ", format(round(-k2, 3), nsmall = 3), "\n",
      "r.sq: ", format(round(adj_r_squared, 2), nsmall = 2)))
cop1$copies.1ml<-cop1$`Copies per 1 mL`

hg <- ggplot()+
  geom_ribbon(data=model_data_for_plot1, 
              aes(x = Hours, y = exp(fit), ymin = exp(lwr), ymax = exp(upr)), 
              alpha = 0.15) +
  geom_line(data=model_data_for_plot1, 
            aes(x = Hours, y = exp(fit), color = Temperature.char), 
            alpha=0.5) +
  geom_point(data=cop1,
           aes(x = Hours, y = copies.1ml, 
               color=Temperature.char, shape=Treatment.char), 
           size=3, alpha=0.5) +
  geom_text(data = annotation_data,
            aes(x = 0, y = 10, label = metrics2), 
            color="black", size = 8, size.unit = "pt", 
            hjust = 0, vjust = 1) +
  coord_cartesian(ylim = c(0.01, 1e8), 
                  xlim = c(-5, 175), expand = FALSE) +
  scale_y_log10(labels = scales::label_log(base = 10)) +
  labs(x="Time (hr)", y="Gene copy per 1mL")+
  facet_grid(rows = vars(Temperature.char), 
             cols = vars(Treatment.char)) +
  scale_color_manual(values = c("#3B9AB2", "#0b8e37" , "#EBCC2A", "#e67300","#F21A00")) +
  scale_shape_manual(values = c(16, 17, 18)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        legend.position = "none")
hg

#ggsave(plot = hg,
#       path="/Users/kylielanglois/OneDrive - SCCWRP/eDNA decay/figs", 
#       filename = "eDNA decay may 2024 combo total log10 decay 081125.png", 
#       width = 7, height = 7)

taxonomy set-up

consolidate samples at genus level

dat.g<-ddply(dat, "Genus", numcolwise(sum))

#make proportional from ALL ASV 
colnames(dat.g)[1]<-"Genus"

df.temp<-dat.g
df.new<-data.frame(Genus=df.temp$Genus)

for (i in 3:ncol(df.temp)) {
a<-as.data.frame(df.temp[, c(1, i)])
a[, 2]<-as.numeric(as.character(a[, 2]))
a<-as.matrix(a[order(-a[2]), ])
a<-as.data.frame(a[1:20, ])
a[, 2]<-as.numeric(a[, 2])
df.new<-merge(df.new, a, by="Genus", all=T)
}

#remove genera with ONLY na
df.new$sum<-rowSums(!is.na(df.new[2:ncol(df.new)])) #count samples with values
#only in >10% samples
df.new<-subset(df.new, df.new$sum>46) 
df.new<-df.new[, !grepl("sum", colnames(df.new))]
df.new$Genus<-sub("^$", "Unassigned", df.new$Genus)

dat1.g.count<-df.new

dat1.g<-cbind(dat1.g.count$Genus, 
                  sweep(dat1.g.count[ , 2:ncol(dat1.g.count)], 2,
                        colSums(dat.g[,3:ncol(dat.g)]), `/`)*100) 
colnames(dat1.g)[1]<-"Genus"

#genera with relative abundance >10% -- from SMT
dat1.g.alt<-cbind(dat.g$Genus, 
                  sweep(dat.g[,3:ncol(dat.g)], 2,
                        colSums(dat.g[,3:ncol(dat.g)]), `/`)*100) 
dat1.g.alt<-melt(dat1.g.alt)
#dat1.g.alt$Genus2<-ifelse(dat1.g.alt$value > 0, dat1.g.alt$`dat.g$Genus`,"Unassigned")
dat1.g.alt<-dat1.g.alt[!grepl("Unass|uncul|Chlor", dat1.g.alt$`dat.g$Genus`), ]

consolidate blanks at genus level

samp.bl<-met$Sequencing.Name[met$Time.point==-1]
samp.bl<-paste0(samp.bl, "$")
samp.bl<-paste0(samp.bl, collapse = "|")
df2.temp<-dat[ , grepl(samp.bl, colnames(dat))]
df2.temp<-sweep(df2.temp, 2,colSums(df2.temp), `/`)*100
df2.temp$Genus<-dat$Genus
df2.temp<-ddply(df2.temp, "Genus", numcolwise(sum))

df2.new<-data.frame(Genus=df2.temp$Genus)

for (i in 2:ncol(df2.temp)) {
a<-as.data.frame(df2.temp[, c(1, i)])
a[, 2]<-as.numeric(as.character(a[, 2]))
a<-as.data.frame(a[order(-a[2]), ])
a<-a[1:20, ]
df2.new<-merge(df2.new, a, by="Genus", all=T)
}

#remove genera with ONLY na
df2.new$sum<-rowSums(!is.na(select(df2.new, -Genus))) #count samples with values
df2.new<-subset(df2.new, df2.new$sum>1) #only in >1 samples
df2.new<-df2.new[, !grepl("sum", colnames(df2.new))]
df2.new$Genus<-sub("^$", "Unassigned", df2.new$Genus)

dat3.m<-melt(df2.new)
dat3.m.met<-merge(dat3.m, met1, 
                  by.x = "variable", by.y="Sequencing.Name", 
                  all.x=T, sort=F) #merge with metadata

diversity

taxa = columns, samples = rows

alpha diversity

#dat.g = all genera, no relative abundance
a.dat.choose.div<-dat.g
rownames(a.dat.choose.div)<-a.dat.choose.div$Genus
a.dat.choose.div<-a.dat.choose.div[, -1]

d.sp<-data.frame(sample=colnames(a.dat.choose.div), #samples
                 gen.num=colSums(a.dat.choose.div != 0)) #number of genera
t<-as.data.frame(t(a.dat.choose.div))
d.sp$gen.even<-(diversity(t, "shannon")/log(specnumber(t))) #Pielous evenness
d.sp$gen.shan<-diversity(t, "shannon")
d.sp.met<-merge(d.sp, met1, by.x="sample", by.y="Sequencing.Name") 
#combine with metadata

#get timepoint of lowest shannon diversity for each
d.sp.met.s<-d.sp.met %>%
  group_by(Treatment, Temperature.char) %>%
  summarise(shan.min=min(gen.shan))
d.sp.met.s<-merge(d.sp.met.s, 
                  d.sp.met[, grepl("gen.shan|Time.point", colnames(d.sp.met))], 
                  by.x="shan.min", by.y="gen.shan", all.x = T)
#write.csv(d.sp.met.s, row.names = F, 
#          file.path(path.local, "BIO_eDNA_decay_16S_Init_TopOff_final_shan-low.csv"))

#plot
d.sp.m<-melt(d.sp)
d.sp.m.met<-merge(d.sp.m, met1, by.x="sample", by.y="Sequencing.Name")
d.sp.m.met<-d.sp.m.met[!is.na(d.sp.m.met$Temperature..C.), ]

eg<-ggplot(d.sp.m.met)+
  geom_point(aes(x=Time.point, y=value, 
                 color=Temperature.char, shape=variable), 
             alpha=0.75, size=3)+
  facet_grid(Temperature..C.*variable~Treatment.char, 
             scales = "free", space="free_x")+
  scale_color_manual(
     values=c("#7b3294","#2c7bb6","#008837", "#fdae61" ,"#d7191c"))+
  scale_shape_manual(values=c(3, 4, 16), 
                     label=c("evenness", "richness", "shannon"), 
                     name="diversity metrics")+
  theme(legend.position = "bottom", 
        strip.text.y = element_blank())+
  ggtitle("eDNA decay May 2024 16S, ALL")
eg

#ggsave(plot=eg, 
#        path=path.local,
#        filename="eDNA_decay_InitTopOff_16S_div_ALL.png",
#        height=14, width=10)

peak decay vs lowest diversity

beta diversity

dat.choose<-dat1.g
#choose which table for plotting
#dat1 = ASV, no averaging
#dat1.g = top genera, relative abundance
#dat1.g.count = top genera, no relative abundance
#dat.g = all genera, no relative abundance

dat1.m<-melt(dat.choose)
dat1.m.met<-merge(dat1.m, met1, by.x = "variable", by.y="Sequencing.Name", 
                 all.x=T, sort=F) #merge with metadata

distance matrix

v<-as.data.frame(as.matrix(vegdist(dat1.t, upper = T)))
v$variable2<-rownames(v)
v.m<-melt(v, id.vars = "variable2")
v.m<-merge(v.m, met1, by.x="variable", by.y="Sequencing.Name", all.x = T)
v.m<-merge(v.m, met1, by.x="variable2", by.y="Sequencing.Name", all.x = T)
v.m$value<-ifelse(v.m$variable==v.m$variable2, NA, v.m$value)

v.m1<-v.m[!grepl("DI", v.m$variable), ]
v.m1<-v.m1[!grepl("DI", v.m1$variable2), ]
vg<-ggplot(data=v.m1[!grepl("-1", v.m1$Time.point.x), ], 
               aes(x=Time.point.x, y=value, 
                   col=Time.point.y))+
  geom_boxplot()+
  labs(x="Timepoint", y="Bray-Curtis distance", color="Timepoint", 
       title = "eDNA decay May 2024 Init+TopOff 16S\nBLANKS")
vg
#ggsave(plot=vg, 
#        path=path.local,
#        filename="eDNA_decay_InitTopOff_16S_boxplot_BLANK.png",
#        height=6, width=8)

NMDS prep

#col.choose<-met1$Sequencing.Name[grepl("BLANK|WATER", met1$Treatment)]
#dat.choose.1<-dat1[ , match(c("OTUID", col.choose), colnames(dat1))]
#DO ABOVE JUST FOR BLANKS

#choose which table for plotting
#dat1 = ASV, no averaging
#dat1.g = top genera, relative abundance
dat.choose.1<-dat1[, !grepl("Zymo", colnames(dat1), ignore.case = T)]

dat1.t<-dat.choose.1 #transpose non-relative abundance table
rownames(dat1.t)<-dat1.t$OTUID
dat1.t<-dat1.t[, -1]
dat1.t<-as.data.frame(t(dat1.t))
n<-metaMDS(dat1.t, trymax = 100)
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1524879 
## Run 1 stress 0.1535641 
## Run 2 stress 0.1691887 
## Run 3 stress 0.1492164 
## ... New best solution
## ... Procrustes: rmse 0.0261411  max resid 0.3308381 
## Run 4 stress 0.1712126 
## Run 5 stress 0.1799393 
## Run 6 stress 0.1760205 
## Run 7 stress 0.1688489 
## Run 8 stress 0.1750719 
## Run 9 stress 0.1757639 
## Run 10 stress 0.1584052 
## Run 11 stress 0.1758472 
## Run 12 stress 0.1655226 
## Run 13 stress 0.1784691 
## Run 14 stress 0.1644642 
## Run 15 stress 0.1753433 
## Run 16 stress 0.1664675 
## Run 17 stress 0.1471559 
## ... New best solution
## ... Procrustes: rmse 0.008082622  max resid 0.1448986 
## Run 18 stress 0.1483367 
## Run 19 stress 0.172563 
## Run 20 stress 0.1498567 
## Run 21 stress 0.1470948 
## ... New best solution
## ... Procrustes: rmse 0.009253803  max resid 0.12929 
## Run 22 stress 0.1546845 
## Run 23 stress 0.175951 
## Run 24 stress 0.1679807 
## Run 25 stress 0.1901554 
## Run 26 stress 0.1498077 
## Run 27 stress 0.1559601 
## Run 28 stress 0.1674632 
## Run 29 stress 0.1466769 
## ... New best solution
## ... Procrustes: rmse 0.003173137  max resid 0.06403831 
## Run 30 stress 0.1723356 
## Run 31 stress 0.1869873 
## Run 32 stress 0.1778012 
## Run 33 stress 0.1680976 
## Run 34 stress 0.1545373 
## Run 35 stress 0.1759392 
## Run 36 stress 0.1696864 
## Run 37 stress 0.1753737 
## Run 38 stress 0.1775694 
## Run 39 stress 0.1817036 
## Run 40 stress 0.1658267 
## Run 41 stress 0.1797226 
## Run 42 stress 0.1678203 
## Run 43 stress 0.1559332 
## Run 44 stress 0.1689354 
## Run 45 stress 0.1489876 
## Run 46 stress 0.1670674 
## Run 47 stress 0.1760844 
## Run 48 stress 0.1676759 
## Run 49 stress 0.1700887 
## Run 50 stress 0.1611745 
## Run 51 stress 0.1798052 
## Run 52 stress 0.1752023 
## Run 53 stress 0.1533122 
## Run 54 stress 0.1700666 
## Run 55 stress 0.1530144 
## Run 56 stress 0.1742682 
## Run 57 stress 0.1681619 
## Run 58 stress 0.1640562 
## Run 59 stress 0.1577849 
## Run 60 stress 0.1714015 
## Run 61 stress 0.1707437 
## Run 62 stress 0.1630471 
## Run 63 stress 0.1526446 
## Run 64 stress 0.1480504 
## Run 65 stress 0.147498 
## Run 66 stress 0.1717487 
## Run 67 stress 0.1698951 
## Run 68 stress 0.169873 
## Run 69 stress 0.1883345 
## Run 70 stress 0.1802805 
## Run 71 stress 0.1741195 
## Run 72 stress 0.1789103 
## Run 73 stress 0.1691117 
## Run 74 stress 0.1725853 
## Run 75 stress 0.1483774 
## Run 76 stress 0.172312 
## Run 77 stress 0.1525037 
## Run 78 stress 0.1905071 
## Run 79 stress 0.1648252 
## Run 80 stress 0.1483615 
## Run 81 stress 0.1710886 
## Run 82 stress 0.1673001 
## Run 83 stress 0.1756854 
## Run 84 stress 0.1724558 
## Run 85 stress 0.1794095 
## Run 86 stress 0.1607067 
## Run 87 stress 0.1612289 
## Run 88 stress 0.1790848 
## Run 89 stress 0.1711578 
## Run 90 stress 0.1685965 
## Run 91 stress 0.1718294 
## Run 92 stress 0.1675364 
## Run 93 stress 0.17214 
## Run 94 stress 0.1588018 
## Run 95 stress 0.1686777 
## Run 96 stress 0.1676676 
## Run 97 stress 0.145299 
## ... New best solution
## ... Procrustes: rmse 0.006945496  max resid 0.1305491 
## Run 98 stress 0.1697389 
## Run 99 stress 0.16983 
## Run 100 stress 0.1599419 
## *** No convergence -- monoMDS stopping criteria:
##     69: no. of iterations >= maxit
##     31: stress ratio > sratmax

NMDS

taxonomy plotting

combine replicates (average)

## SKIP FOR BLANKS (no reps) ##
toptax<-dat1.g$Genus
#genus = columns, samples = rows; NOT merged with metadata; NOT relative
#consolidate triplicates with tax

dat.s<-dat.g[, -2]
#dat.g = all genera, no relative abundance

rownames(dat.s)<-dat.s[, 1]     #make first column into row names
dat.s<-dat.s[,-1]               #remove first column
dat.s<-as.data.frame(t(dat.s))  #samples must be rows!!!!
dat.s$sample<-rownames(dat.s) #get sample names
dat.s<-merge(met1[, 
               match(c("Sequencing.Name", "combo.name", "Replicate"), 
                     colnames(met1))],
          dat.s, 
          by.y = "sample", by.x = "Sequencing.Name", all.y = T)

#get average of triplicates
dat.s<-ddply(dat.s, "combo.name", numcolwise(mean), na.rm=T) 
#make proportional for each station
dat.s.p<-cbind(dat.s$combo.name,                   #by combo name
          sweep(dat.s[,4:ncol(dat.s)], 1,          
                rowSums(dat.s[4:ncol(dat.s)]), `/`)*100) 
dat.s.p<-dat.s.p[!is.na(dat.s.p$`dat.s$combo.name`), ]
#make proportional from ALL ASV -- SKIP IF ALREADY RELATIVE ABUNDANCE
#                               -- SKIP IF AVERAGE SAMPLES
av.sums.2<-rowSums(dat.s.p[2:ncol(dat.s.p)])      #double check all 100s
                                            #or near 100 if top tax
rownames(dat.s.p)<-dat.s.p[, 1]    #put back into samples = cols, tax = rows
dat.s.p<-dat.s.p[, -1]
dat.s.p<-as.data.frame(t(dat.s.p))
dat.s.p$Genus<-rownames(dat.s.p)    #make tax column
dat.s.p<-dat.s.p[, c(ncol(dat.s.p), 1:(ncol(dat.s.p)-1))]

dat.s.p.top<-dat.s.p[match(toptax, dat.s.p$Genus), ] #match to top tax
dat.s.p.top<-dat.s.p.top[!is.na(dat.s.p.top$Genus), ]

dat.s.p.top.m<-melt(dat.s.p.top)
dat.s.p.top.m$num<-1:nrow(dat.s.p.top.m)
dat.s.p.top.met<-merge(dat.s.p.top.m, met1, by.x = "variable", by.y="combo.name", 
                 all.x=T, sort=F) #merge with metadata
dat.s.p.top.met<-dat.s.p.top.met[!duplicated(dat.s.p.top.met$num), ]

plot top taxonomy, averaged samples

plot top taxonomy, blanks

compare taxonomies

top ASV come from spike-in material, “mother” water

## [1] "% of unique spike-in ASV in top ASV: 0"
## [1] "% of unique Catalina ASV in top ASV: 44.26"
## [1] "% ASV unique Catalina in spike-in: 10.38"

plot mock community

mock.asv<-dat1[, grepl("OTU|ymo", colnames(dat1))]
mock.asv$seqcount<-rowSums(mock.asv[2:ncol(mock.asv)]>0) 
#count samples with values
mock.asv<-subset(mock.asv, mock.asv$seqcount > 0) #unique to spike in

mock.gen<-merge(mock.asv, dat[!grepl("X", colnames(dat))], 
                by="OTUID", all.x = T)
mock.gen<-ddply(mock.gen, "Genus", numcolwise(sum))
mock.gen[ , 2:5]<- (sweep(mock.gen[ , 2:5], 2,
        colSums(mock.gen[ , 2:5]), `/`))*100
mock.gen$THEOR.COMP<-c(6,0,12,6,12,12,12,12,12,0)
mock.gen.m<-melt(mock.gen[,!grepl("seqcount", colnames(mock.gen))])

mg<-ggplot(mock.gen.m)+
  geom_bar(aes(x=variable, y=value, fill=Genus), 
           stat = "identity", position = position_stack())+
  scale_fill_manual(values=c("#58BBCC", "#BCBD45", "#D57DBE", "#84584E", 
                             "#8D69B8", "#C53932", "#519E3E", "#EF8636", 
                             "#7F7F7F", "white"), name="")+
  scale_x_discrete(labels=c("R1", "R2", "R3", "theoretical\ncomposition"))+
  labs(x="", y="Relative abundance (%)", 
       title="ZymoBIOMICS Microbial Community DNA Standard (D6305)")
mg

#ggsave(plot = mg, 
#       path = path.local, 
#       filename = "zymo_mock_16S.png", 
#       height = 4, width = 8)

random forest

from Susie Theroux code “1_prep.R”and “RF.most.abd.genera.R” accessed 8/4/25 use genus consolidated (anything <10% rel. ab. = “unassigned”), all reps

random forest prep

## [1] 401

random forest

colnames(met1.rf)<-gsub(" ", "\\.", colnames(met1.rf))

# predict ddPCR with just treatment, condition, timepoint --
#pred.env<-met1.rf[ , match(c("Copies.per.250.mL", "Treatment", 
#                             "Temperature..C.", "Time.point"), 
#                           colnames(met1.rf))]
#pred.env<-pred.env[!is.na(pred.env$Copies.per.250.mL), ]

#rf1<-randomForest(data = pred.env, 
#                  Copies.per.250.mL~., ntree=999)
#rf1 # 73.71% of variance explained

# predict ddPCR with treatment, condition, timepoint and genera --
pred.genera<-gen.rf
pred.genera$Sequencing.Name<-row.names(pred.genera)
pred.genera<-merge(pred.genera, 
                   met1.rf[ , match(c("Sequencing.Name", "Copies.per.250.mL", 
                                      "Treatment", "Temperature..C.",
                                      "Time.point"),colnames(met1.rf))], 
                   by="Sequencing.Name", sort=F, all.x=T)
pred.genera<-pred.genera[,-1]
pred.genera<-pred.genera[!is.na(pred.genera$Copies.per.250.mL), ]
colnames(pred.genera)<-gsub("-", "_", colnames(pred.genera))
colnames(pred.genera)<-gsub("\\(", "_", colnames(pred.genera))
colnames(pred.genera)<-gsub("\\)", "_", colnames(pred.genera))
colnames(pred.genera)<-gsub("\\[", "_", colnames(pred.genera))
colnames(pred.genera)<-gsub("\\]", "_", colnames(pred.genera))

#get "winner" for each treatment
TREAT<-c("FISH", "CELL", "DNA")
TEMP<-unique(pred.genera$Temperature..C.)
rf.df <- data.frame(treatment="", 
                    temp="", 
                    perc.var.expl="",
                    top.5.imp="")

for (i in 1:length(TREAT)) {
  a <- TREAT[i]
  
  for (x in 1:length(TEMP)) {
    pred.genera.2<-pred.genera[pred.genera$Treatment==a, ]
    #only specific treatment
    gen.sum<-colSums(pred.genera.2[1:(ncol(pred.genera.2)-4)] != 0) 
    #number of samples that each genera is present
    gen.sum<-gen.sum[gen.sum>(nrow(pred.genera.2)*0.1)]
    #must be in 10% of samples
    gen.sum.col<-which(colnames(pred.genera.2) %in% names(gen.sum))
    col.num<-(ncol(pred.genera.2))-4
    #subset genera
    pred.genera.3<-cbind(pred.genera.2[, gen.sum.col], 
                         pred.genera.2[, col.num:ncol(pred.genera.2)])
    #make sure environmental aspects are added 
    
    a2 <- TEMP[x]
    #every temperature
    pred.genera.2<-pred.genera.2[pred.genera.2$Temperature..C.==a2, ]
    rf2<-randomForest(data = pred.genera.2, 
                      Copies.per.250.mL ~ . , ntree=100)
    b <- round((tail(rf2$rsq, 1)), 4)*100 # % variance explained
    rf2.gen<-as.data.frame(rf2$importance)
    rf2.gen$Genus<-row.names(rf2.gen)
    rf2.gen<-rf2.gen[order(rf2.gen$IncNodePurity), ]
    c <- paste0(tail(rf2.gen$Genus, 5), collapse=", ")
    temp.df <- data.frame(treatment=a, 
                          temp=a2,
                          perc.var.expl=b,
                          top.5.imp=c)
    rf.df <- rbind(rf.df, temp.df)
  }
  rf.df <- rbind(rf.df, temp.df)
  }

rf.df<-rf.df[-1, ]
rf.df$combo.treat<-paste(rf.df$treatment, rf.df$temp)
rf.df<-rf.df[!duplicated(rf.df$combo.treat), ]
rf.df$timepoint<-ifelse(grepl("Time.point", rf.df$top.5.imp), "YES", "no")

#write.csv(rf.df, row.names = F, 
#          file.path(path.local, 
#                    "BIO_eDNA_decay_16S_Init_TopOff_final_RF_10percsample.csv"))