knitr::opts_chunk$set(warning = FALSE, message = FALSE)
install packages
library(knitr)
#install.packages("PKI")
#install.packages("rsconnect", type = "source")
library(rsconnect)
library(reshape2)
library(ggplot2)
library(vegan)
library(plyr)
library(dplyr)
library(palettetown)
library(heatmap3)
library(ggbeeswarm)
library(randomForest)
library(readxl)
library(segmented)
set.seed(100)
upload files
path<-"/Users/kylielanglois/SCCWRP/Oceankind - eDNA Work/eDNA Decay Studies/eDNA Degradation Microcosms/final data 041125"
path.local<-"/Users/kylielanglois/OneDrive - SCCWRP/eDNA decay"
dat<-read.csv(
file.path(
path,
"16S OTU Table/otu.intial.plus.topoff.final.answer.csv"))
#".relabd" for relative abundance table
tax.col<-c("Kingdom", "Phylum", "Class", "Order", "Family",
"Genus", "Species", "Consensus.Lineage")
tax<-dat[ , match(c("OTUID", tax.col), colnames(dat))]
dat1<-dat[ , -match(tax.col, colnames(dat))]
dat1<-dat1[, -1] #ger rid of "rownames"
met<-read.csv("/Users/kylielanglois/OneDrive - SCCWRP/eDNA decay/eDNA decay May 2024 metadata.csv")
met1<-met[!grepl("TO", met$Sequencing.Name), ]
cop<-read_excel(
file.path(
path,
"ddPCR/eDNA decay may 2024 combo_dilutiondecision_FINAL.xlsx"), sheet = 7)
div.decay<-read.csv(
file.path(path.local, "BIO_eDNA_decay_16S_Init_TopOff_final_shan-low.csv"))
remove samples
remove<-c("X57", "X83", "X243")
remove.g<-paste0(remove, collapse = "|")
dat<-dat[, !grepl(remove.g, colnames(dat))]
dat1<-dat1[, !grepl(remove.g, colnames(dat1))]
met<-met[!grepl(remove.g, met$Sequencing.Name), ]
met1<-met1[!grepl(remove.g, met1$Sequencing.Name), ]
code from Nicholas Lombardo (nicholasl@sccwrp.org) broken-stick model to explain gene copies per 1ml over time, each treatment
dna_clean <- cop1 |>
mutate(
#make hours into character
hours = Hours,
#get copies per 1ml from data
DNA_copies = `Copies per 1 mL`,
#make treatment into factor
Treatment.char = factor(toupper(Treatment.char), levels = c("3. FISH", "2. CELL", "1. DNA"))
) |>
#put together
arrange(Treatment.char, Temperature.char, Hours, DNA_copies) |>
dplyr::select(Treatment.char, Temperature.char, Hours, DNA_copies) |>
#mean copies per 1ml per temp+treatment+time
group_by(Treatment.char, Temperature.char, Hours) |>
summarize(
DNA_copies = mean(DNA_copies)
)
intercepts<-
dna_clean |>
group_by(Treatment.char, Temperature.char) |>
summarize(
#linear model of log copies per 1ml by hour, grouped by treatment+temp
model = list(lm(log(DNA_copies) ~ Hours)),
#get y intercept
intercept = purrr::map_dbl(model, function(x) exp(coef(x)[1])),
#get p value
intercept_p_val = purrr::map_dbl(model, function(x) coef(summary(x))[1, 4])
) |>
ungroup() |>
dplyr::select(-model)
### broken stick ####
# fixed breakpoint ####
model_data_for_plot1 <-
dna_clean |>
group_by(Treatment.char, Temperature.char) |>
#group by treatment+temp
summarize(
.groups = "keep",
model = list(
#linear model of average copies per 1ml by Hours
segmented(
lm(log(DNA_copies) ~ Hours,
data = pick(DNA_copies, Hours)),
seg.Z = ~Hours,
psi = 24,
control = seg.control(it.max = 0)
)
),
#get model summary
summary = purrr::map(model, function(x) summary(x)),
adj_r_squared = purrr::map_dbl(summary, function(x) x$adj.r.squared),
aic = purrr::map_dbl(model, function(x) AIC(x))
) |>
mutate(
#get breakpoint (24 Hours) to split linear model into 2
preds = purrr::map(model, function(mod) {
predict(mod,
interval = "confidence",
newdata = data.frame(Hours = seq(min(mod$model$Hours),
max(mod$model$Hours), by = 1)) |>
mutate(U1.Hours = (Hours > 24) * (Hours-24))
) |>
as.data.frame() |>
bind_cols(Hours = seq(min(mod$model$Hours), max(mod$model$Hours), by = 1))
}),
slopes = purrr::map(model, function(mod) {
#browser()
mod$nameUV <- list(U = "U1.Hours", Z = "Hours")
as.data.frame(slope(mod)$Hours) |>
tibble::rownames_to_column(var = "slope")
})
) |>
tidyr::unnest(cols = preds) |>
tidyr::unnest(cols = slopes) |>
#put together
dplyr::select(Treatment.char, Temperature.char,
adj_r_squared,
fit,
lwr, upr,
Hours,
slope, est = `Est.`,
t = `t value`
) |>
tidyr::pivot_wider(names_from = slope,
values_from = c("est", "t"))
#get annotations from model to put on plots
annotation_data <-
model_data_for_plot1 |>
distinct(Treatment.char, Temperature.char,
k1 = est_slope1,
k2 = est_slope2,
adj_r_squared) |>
mutate(
metrics = paste0(
"k: ", format(round(-k1, 3), nsmall = 3), ", ", format(round(-k2, 3), nsmall = 3), "\n",
"break point: 24\n",
"r.sq: ", format(round(adj_r_squared, 2), nsmall = 2)),
metrics2 = paste0(
"k: ", format(round(-k1, 3), nsmall = 3), ", ", format(round(-k2, 3), nsmall = 3), "\n",
"r.sq: ", format(round(adj_r_squared, 2), nsmall = 2)))
cop1$copies.1ml<-cop1$`Copies per 1 mL`
hg <- ggplot()+
geom_ribbon(data=model_data_for_plot1,
aes(x = Hours, y = exp(fit), ymin = exp(lwr), ymax = exp(upr)),
alpha = 0.15) +
geom_line(data=model_data_for_plot1,
aes(x = Hours, y = exp(fit), color = Temperature.char),
alpha=0.5) +
geom_point(data=cop1,
aes(x = Hours, y = copies.1ml,
color=Temperature.char, shape=Treatment.char),
size=3, alpha=0.5) +
geom_text(data = annotation_data,
aes(x = 0, y = 10, label = metrics2),
color="black", size = 8, size.unit = "pt",
hjust = 0, vjust = 1) +
coord_cartesian(ylim = c(0.01, 1e8),
xlim = c(-5, 175), expand = FALSE) +
scale_y_log10(labels = scales::label_log(base = 10)) +
labs(x="Time (hr)", y="Gene copy per 1mL")+
facet_grid(rows = vars(Temperature.char),
cols = vars(Treatment.char)) +
scale_color_manual(values = c("#3B9AB2", "#0b8e37" , "#EBCC2A", "#e67300","#F21A00")) +
scale_shape_manual(values = c(16, 17, 18)) +
theme_bw() +
theme(panel.grid = element_blank(),
legend.position = "none")
hg
#ggsave(plot = hg,
# path="/Users/kylielanglois/OneDrive - SCCWRP/eDNA decay/figs",
# filename = "eDNA decay may 2024 combo total log10 decay 081125.png",
# width = 7, height = 7)
consolidate samples at genus level
dat.g<-ddply(dat, "Genus", numcolwise(sum))
#make proportional from ALL ASV
colnames(dat.g)[1]<-"Genus"
df.temp<-dat.g
df.new<-data.frame(Genus=df.temp$Genus)
for (i in 3:ncol(df.temp)) {
a<-as.data.frame(df.temp[, c(1, i)])
a[, 2]<-as.numeric(as.character(a[, 2]))
a<-as.matrix(a[order(-a[2]), ])
a<-as.data.frame(a[1:20, ])
a[, 2]<-as.numeric(a[, 2])
df.new<-merge(df.new, a, by="Genus", all=T)
}
#remove genera with ONLY na
df.new$sum<-rowSums(!is.na(df.new[2:ncol(df.new)])) #count samples with values
#only in >10% samples
df.new<-subset(df.new, df.new$sum>46)
df.new<-df.new[, !grepl("sum", colnames(df.new))]
df.new$Genus<-sub("^$", "Unassigned", df.new$Genus)
dat1.g.count<-df.new
dat1.g<-cbind(dat1.g.count$Genus,
sweep(dat1.g.count[ , 2:ncol(dat1.g.count)], 2,
colSums(dat.g[,3:ncol(dat.g)]), `/`)*100)
colnames(dat1.g)[1]<-"Genus"
#genera with relative abundance >10% -- from SMT
dat1.g.alt<-cbind(dat.g$Genus,
sweep(dat.g[,3:ncol(dat.g)], 2,
colSums(dat.g[,3:ncol(dat.g)]), `/`)*100)
dat1.g.alt<-melt(dat1.g.alt)
#dat1.g.alt$Genus2<-ifelse(dat1.g.alt$value > 0, dat1.g.alt$`dat.g$Genus`,"Unassigned")
dat1.g.alt<-dat1.g.alt[!grepl("Unass|uncul|Chlor", dat1.g.alt$`dat.g$Genus`), ]
consolidate blanks at genus level
samp.bl<-met$Sequencing.Name[met$Time.point==-1]
samp.bl<-paste0(samp.bl, "$")
samp.bl<-paste0(samp.bl, collapse = "|")
df2.temp<-dat[ , grepl(samp.bl, colnames(dat))]
df2.temp<-sweep(df2.temp, 2,colSums(df2.temp), `/`)*100
df2.temp$Genus<-dat$Genus
df2.temp<-ddply(df2.temp, "Genus", numcolwise(sum))
df2.new<-data.frame(Genus=df2.temp$Genus)
for (i in 2:ncol(df2.temp)) {
a<-as.data.frame(df2.temp[, c(1, i)])
a[, 2]<-as.numeric(as.character(a[, 2]))
a<-as.data.frame(a[order(-a[2]), ])
a<-a[1:20, ]
df2.new<-merge(df2.new, a, by="Genus", all=T)
}
#remove genera with ONLY na
df2.new$sum<-rowSums(!is.na(select(df2.new, -Genus))) #count samples with values
df2.new<-subset(df2.new, df2.new$sum>1) #only in >1 samples
df2.new<-df2.new[, !grepl("sum", colnames(df2.new))]
df2.new$Genus<-sub("^$", "Unassigned", df2.new$Genus)
dat3.m<-melt(df2.new)
dat3.m.met<-merge(dat3.m, met1,
by.x = "variable", by.y="Sequencing.Name",
all.x=T, sort=F) #merge with metadata
taxa = columns, samples = rows
alpha diversity
#dat.g = all genera, no relative abundance
a.dat.choose.div<-dat.g
rownames(a.dat.choose.div)<-a.dat.choose.div$Genus
a.dat.choose.div<-a.dat.choose.div[, -1]
d.sp<-data.frame(sample=colnames(a.dat.choose.div), #samples
gen.num=colSums(a.dat.choose.div != 0)) #number of genera
t<-as.data.frame(t(a.dat.choose.div))
d.sp$gen.even<-(diversity(t, "shannon")/log(specnumber(t))) #Pielous evenness
d.sp$gen.shan<-diversity(t, "shannon")
d.sp.met<-merge(d.sp, met1, by.x="sample", by.y="Sequencing.Name")
#combine with metadata
#get timepoint of lowest shannon diversity for each
d.sp.met.s<-d.sp.met %>%
group_by(Treatment, Temperature.char) %>%
summarise(shan.min=min(gen.shan))
d.sp.met.s<-merge(d.sp.met.s,
d.sp.met[, grepl("gen.shan|Time.point", colnames(d.sp.met))],
by.x="shan.min", by.y="gen.shan", all.x = T)
#write.csv(d.sp.met.s, row.names = F,
# file.path(path.local, "BIO_eDNA_decay_16S_Init_TopOff_final_shan-low.csv"))
#plot
d.sp.m<-melt(d.sp)
d.sp.m.met<-merge(d.sp.m, met1, by.x="sample", by.y="Sequencing.Name")
d.sp.m.met<-d.sp.m.met[!is.na(d.sp.m.met$Temperature..C.), ]
eg<-ggplot(d.sp.m.met)+
geom_point(aes(x=Time.point, y=value,
color=Temperature.char, shape=variable),
alpha=0.75, size=3)+
facet_grid(Temperature..C.*variable~Treatment.char,
scales = "free", space="free_x")+
scale_color_manual(
values=c("#7b3294","#2c7bb6","#008837", "#fdae61" ,"#d7191c"))+
scale_shape_manual(values=c(3, 4, 16),
label=c("evenness", "richness", "shannon"),
name="diversity metrics")+
theme(legend.position = "bottom",
strip.text.y = element_blank())+
ggtitle("eDNA decay May 2024 16S, ALL")
eg
#ggsave(plot=eg,
# path=path.local,
# filename="eDNA_decay_InitTopOff_16S_div_ALL.png",
# height=14, width=10)
peak decay vs lowest diversity
beta diversity
dat.choose<-dat1.g
#choose which table for plotting
#dat1 = ASV, no averaging
#dat1.g = top genera, relative abundance
#dat1.g.count = top genera, no relative abundance
#dat.g = all genera, no relative abundance
dat1.m<-melt(dat.choose)
dat1.m.met<-merge(dat1.m, met1, by.x = "variable", by.y="Sequencing.Name",
all.x=T, sort=F) #merge with metadata
distance matrix
v<-as.data.frame(as.matrix(vegdist(dat1.t, upper = T)))
v$variable2<-rownames(v)
v.m<-melt(v, id.vars = "variable2")
v.m<-merge(v.m, met1, by.x="variable", by.y="Sequencing.Name", all.x = T)
v.m<-merge(v.m, met1, by.x="variable2", by.y="Sequencing.Name", all.x = T)
v.m$value<-ifelse(v.m$variable==v.m$variable2, NA, v.m$value)
v.m1<-v.m[!grepl("DI", v.m$variable), ]
v.m1<-v.m1[!grepl("DI", v.m1$variable2), ]
vg<-ggplot(data=v.m1[!grepl("-1", v.m1$Time.point.x), ],
aes(x=Time.point.x, y=value,
col=Time.point.y))+
geom_boxplot()+
labs(x="Timepoint", y="Bray-Curtis distance", color="Timepoint",
title = "eDNA decay May 2024 Init+TopOff 16S\nBLANKS")
vg
#ggsave(plot=vg,
# path=path.local,
# filename="eDNA_decay_InitTopOff_16S_boxplot_BLANK.png",
# height=6, width=8)
NMDS prep
#col.choose<-met1$Sequencing.Name[grepl("BLANK|WATER", met1$Treatment)]
#dat.choose.1<-dat1[ , match(c("OTUID", col.choose), colnames(dat1))]
#DO ABOVE JUST FOR BLANKS
#choose which table for plotting
#dat1 = ASV, no averaging
#dat1.g = top genera, relative abundance
dat.choose.1<-dat1[, !grepl("Zymo", colnames(dat1), ignore.case = T)]
dat1.t<-dat.choose.1 #transpose non-relative abundance table
rownames(dat1.t)<-dat1.t$OTUID
dat1.t<-dat1.t[, -1]
dat1.t<-as.data.frame(t(dat1.t))
n<-metaMDS(dat1.t, trymax = 100)
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.1524879
## Run 1 stress 0.1535641
## Run 2 stress 0.1691887
## Run 3 stress 0.1492164
## ... New best solution
## ... Procrustes: rmse 0.0261411 max resid 0.3308381
## Run 4 stress 0.1712126
## Run 5 stress 0.1799393
## Run 6 stress 0.1760205
## Run 7 stress 0.1688489
## Run 8 stress 0.1750719
## Run 9 stress 0.1757639
## Run 10 stress 0.1584052
## Run 11 stress 0.1758472
## Run 12 stress 0.1655226
## Run 13 stress 0.1784691
## Run 14 stress 0.1644642
## Run 15 stress 0.1753433
## Run 16 stress 0.1664675
## Run 17 stress 0.1471559
## ... New best solution
## ... Procrustes: rmse 0.008082622 max resid 0.1448986
## Run 18 stress 0.1483367
## Run 19 stress 0.172563
## Run 20 stress 0.1498567
## Run 21 stress 0.1470948
## ... New best solution
## ... Procrustes: rmse 0.009253803 max resid 0.12929
## Run 22 stress 0.1546845
## Run 23 stress 0.175951
## Run 24 stress 0.1679807
## Run 25 stress 0.1901554
## Run 26 stress 0.1498077
## Run 27 stress 0.1559601
## Run 28 stress 0.1674632
## Run 29 stress 0.1466769
## ... New best solution
## ... Procrustes: rmse 0.003173137 max resid 0.06403831
## Run 30 stress 0.1723356
## Run 31 stress 0.1869873
## Run 32 stress 0.1778012
## Run 33 stress 0.1680976
## Run 34 stress 0.1545373
## Run 35 stress 0.1759392
## Run 36 stress 0.1696864
## Run 37 stress 0.1753737
## Run 38 stress 0.1775694
## Run 39 stress 0.1817036
## Run 40 stress 0.1658267
## Run 41 stress 0.1797226
## Run 42 stress 0.1678203
## Run 43 stress 0.1559332
## Run 44 stress 0.1689354
## Run 45 stress 0.1489876
## Run 46 stress 0.1670674
## Run 47 stress 0.1760844
## Run 48 stress 0.1676759
## Run 49 stress 0.1700887
## Run 50 stress 0.1611745
## Run 51 stress 0.1798052
## Run 52 stress 0.1752023
## Run 53 stress 0.1533122
## Run 54 stress 0.1700666
## Run 55 stress 0.1530144
## Run 56 stress 0.1742682
## Run 57 stress 0.1681619
## Run 58 stress 0.1640562
## Run 59 stress 0.1577849
## Run 60 stress 0.1714015
## Run 61 stress 0.1707437
## Run 62 stress 0.1630471
## Run 63 stress 0.1526446
## Run 64 stress 0.1480504
## Run 65 stress 0.147498
## Run 66 stress 0.1717487
## Run 67 stress 0.1698951
## Run 68 stress 0.169873
## Run 69 stress 0.1883345
## Run 70 stress 0.1802805
## Run 71 stress 0.1741195
## Run 72 stress 0.1789103
## Run 73 stress 0.1691117
## Run 74 stress 0.1725853
## Run 75 stress 0.1483774
## Run 76 stress 0.172312
## Run 77 stress 0.1525037
## Run 78 stress 0.1905071
## Run 79 stress 0.1648252
## Run 80 stress 0.1483615
## Run 81 stress 0.1710886
## Run 82 stress 0.1673001
## Run 83 stress 0.1756854
## Run 84 stress 0.1724558
## Run 85 stress 0.1794095
## Run 86 stress 0.1607067
## Run 87 stress 0.1612289
## Run 88 stress 0.1790848
## Run 89 stress 0.1711578
## Run 90 stress 0.1685965
## Run 91 stress 0.1718294
## Run 92 stress 0.1675364
## Run 93 stress 0.17214
## Run 94 stress 0.1588018
## Run 95 stress 0.1686777
## Run 96 stress 0.1676676
## Run 97 stress 0.145299
## ... New best solution
## ... Procrustes: rmse 0.006945496 max resid 0.1305491
## Run 98 stress 0.1697389
## Run 99 stress 0.16983
## Run 100 stress 0.1599419
## *** No convergence -- monoMDS stopping criteria:
## 69: no. of iterations >= maxit
## 31: stress ratio > sratmax
NMDS
combine replicates (average)
## SKIP FOR BLANKS (no reps) ##
toptax<-dat1.g$Genus
#genus = columns, samples = rows; NOT merged with metadata; NOT relative
#consolidate triplicates with tax
dat.s<-dat.g[, -2]
#dat.g = all genera, no relative abundance
rownames(dat.s)<-dat.s[, 1] #make first column into row names
dat.s<-dat.s[,-1] #remove first column
dat.s<-as.data.frame(t(dat.s)) #samples must be rows!!!!
dat.s$sample<-rownames(dat.s) #get sample names
dat.s<-merge(met1[,
match(c("Sequencing.Name", "combo.name", "Replicate"),
colnames(met1))],
dat.s,
by.y = "sample", by.x = "Sequencing.Name", all.y = T)
#get average of triplicates
dat.s<-ddply(dat.s, "combo.name", numcolwise(mean), na.rm=T)
#make proportional for each station
dat.s.p<-cbind(dat.s$combo.name, #by combo name
sweep(dat.s[,4:ncol(dat.s)], 1,
rowSums(dat.s[4:ncol(dat.s)]), `/`)*100)
dat.s.p<-dat.s.p[!is.na(dat.s.p$`dat.s$combo.name`), ]
#make proportional from ALL ASV -- SKIP IF ALREADY RELATIVE ABUNDANCE
# -- SKIP IF AVERAGE SAMPLES
av.sums.2<-rowSums(dat.s.p[2:ncol(dat.s.p)]) #double check all 100s
#or near 100 if top tax
rownames(dat.s.p)<-dat.s.p[, 1] #put back into samples = cols, tax = rows
dat.s.p<-dat.s.p[, -1]
dat.s.p<-as.data.frame(t(dat.s.p))
dat.s.p$Genus<-rownames(dat.s.p) #make tax column
dat.s.p<-dat.s.p[, c(ncol(dat.s.p), 1:(ncol(dat.s.p)-1))]
dat.s.p.top<-dat.s.p[match(toptax, dat.s.p$Genus), ] #match to top tax
dat.s.p.top<-dat.s.p.top[!is.na(dat.s.p.top$Genus), ]
dat.s.p.top.m<-melt(dat.s.p.top)
dat.s.p.top.m$num<-1:nrow(dat.s.p.top.m)
dat.s.p.top.met<-merge(dat.s.p.top.m, met1, by.x = "variable", by.y="combo.name",
all.x=T, sort=F) #merge with metadata
dat.s.p.top.met<-dat.s.p.top.met[!duplicated(dat.s.p.top.met$num), ]
plot top taxonomy, averaged samples
plot top taxonomy, blanks
top ASV come from spike-in material, “mother” water
## [1] "% of unique spike-in ASV in top ASV: 0"
## [1] "% of unique Catalina ASV in top ASV: 44.26"
## [1] "% ASV unique Catalina in spike-in: 10.38"
plot mock community
mock.asv<-dat1[, grepl("OTU|ymo", colnames(dat1))]
mock.asv$seqcount<-rowSums(mock.asv[2:ncol(mock.asv)]>0)
#count samples with values
mock.asv<-subset(mock.asv, mock.asv$seqcount > 0) #unique to spike in
mock.gen<-merge(mock.asv, dat[!grepl("X", colnames(dat))],
by="OTUID", all.x = T)
mock.gen<-ddply(mock.gen, "Genus", numcolwise(sum))
mock.gen[ , 2:5]<- (sweep(mock.gen[ , 2:5], 2,
colSums(mock.gen[ , 2:5]), `/`))*100
mock.gen$THEOR.COMP<-c(6,0,12,6,12,12,12,12,12,0)
mock.gen.m<-melt(mock.gen[,!grepl("seqcount", colnames(mock.gen))])
mg<-ggplot(mock.gen.m)+
geom_bar(aes(x=variable, y=value, fill=Genus),
stat = "identity", position = position_stack())+
scale_fill_manual(values=c("#58BBCC", "#BCBD45", "#D57DBE", "#84584E",
"#8D69B8", "#C53932", "#519E3E", "#EF8636",
"#7F7F7F", "white"), name="")+
scale_x_discrete(labels=c("R1", "R2", "R3", "theoretical\ncomposition"))+
labs(x="", y="Relative abundance (%)",
title="ZymoBIOMICS Microbial Community DNA Standard (D6305)")
mg
#ggsave(plot = mg,
# path = path.local,
# filename = "zymo_mock_16S.png",
# height = 4, width = 8)
from Susie Theroux code “1_prep.R”and “RF.most.abd.genera.R” accessed 8/4/25 use genus consolidated (anything <10% rel. ab. = “unassigned”), all reps
random forest prep
## [1] 401
random forest
colnames(met1.rf)<-gsub(" ", "\\.", colnames(met1.rf))
# predict ddPCR with just treatment, condition, timepoint --
#pred.env<-met1.rf[ , match(c("Copies.per.250.mL", "Treatment",
# "Temperature..C.", "Time.point"),
# colnames(met1.rf))]
#pred.env<-pred.env[!is.na(pred.env$Copies.per.250.mL), ]
#rf1<-randomForest(data = pred.env,
# Copies.per.250.mL~., ntree=999)
#rf1 # 73.71% of variance explained
# predict ddPCR with treatment, condition, timepoint and genera --
pred.genera<-gen.rf
pred.genera$Sequencing.Name<-row.names(pred.genera)
pred.genera<-merge(pred.genera,
met1.rf[ , match(c("Sequencing.Name", "Copies.per.250.mL",
"Treatment", "Temperature..C.",
"Time.point"),colnames(met1.rf))],
by="Sequencing.Name", sort=F, all.x=T)
pred.genera<-pred.genera[,-1]
pred.genera<-pred.genera[!is.na(pred.genera$Copies.per.250.mL), ]
colnames(pred.genera)<-gsub("-", "_", colnames(pred.genera))
colnames(pred.genera)<-gsub("\\(", "_", colnames(pred.genera))
colnames(pred.genera)<-gsub("\\)", "_", colnames(pred.genera))
colnames(pred.genera)<-gsub("\\[", "_", colnames(pred.genera))
colnames(pred.genera)<-gsub("\\]", "_", colnames(pred.genera))
#get "winner" for each treatment
TREAT<-c("FISH", "CELL", "DNA")
TEMP<-unique(pred.genera$Temperature..C.)
rf.df <- data.frame(treatment="",
temp="",
perc.var.expl="",
top.5.imp="")
for (i in 1:length(TREAT)) {
a <- TREAT[i]
for (x in 1:length(TEMP)) {
pred.genera.2<-pred.genera[pred.genera$Treatment==a, ]
#only specific treatment
gen.sum<-colSums(pred.genera.2[1:(ncol(pred.genera.2)-4)] != 0)
#number of samples that each genera is present
gen.sum<-gen.sum[gen.sum>(nrow(pred.genera.2)*0.1)]
#must be in 10% of samples
gen.sum.col<-which(colnames(pred.genera.2) %in% names(gen.sum))
col.num<-(ncol(pred.genera.2))-4
#subset genera
pred.genera.3<-cbind(pred.genera.2[, gen.sum.col],
pred.genera.2[, col.num:ncol(pred.genera.2)])
#make sure environmental aspects are added
a2 <- TEMP[x]
#every temperature
pred.genera.2<-pred.genera.2[pred.genera.2$Temperature..C.==a2, ]
rf2<-randomForest(data = pred.genera.2,
Copies.per.250.mL ~ . , ntree=100)
b <- round((tail(rf2$rsq, 1)), 4)*100 # % variance explained
rf2.gen<-as.data.frame(rf2$importance)
rf2.gen$Genus<-row.names(rf2.gen)
rf2.gen<-rf2.gen[order(rf2.gen$IncNodePurity), ]
c <- paste0(tail(rf2.gen$Genus, 5), collapse=", ")
temp.df <- data.frame(treatment=a,
temp=a2,
perc.var.expl=b,
top.5.imp=c)
rf.df <- rbind(rf.df, temp.df)
}
rf.df <- rbind(rf.df, temp.df)
}
rf.df<-rf.df[-1, ]
rf.df$combo.treat<-paste(rf.df$treatment, rf.df$temp)
rf.df<-rf.df[!duplicated(rf.df$combo.treat), ]
rf.df$timepoint<-ifelse(grepl("Time.point", rf.df$top.5.imp), "YES", "no")
#write.csv(rf.df, row.names = F,
# file.path(path.local,
# "BIO_eDNA_decay_16S_Init_TopOff_final_RF_10percsample.csv"))