library required
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.1
library(vegan)
## Warning: package 'vegan' was built under R version 4.3.1
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.3.1
## Loading required package: lattice
## This is vegan 2.6-4
library(ggrepel)# editing text for ggplot
## Warning: package 'ggrepel' was built under R version 4.3.1
loading bacterial genus abundance data
genus_abun <- read.table("C:/Users/liyun/Desktop/Shotgun_Canola/reads_based/genus_abundance_merged.txt", header = T, sep="\t",row.names = 1)
#filter rare genus, prevalence=0.5
min_prevalence <- 0.5
genus_prevalence <- rowMeans(genus_abun > 0)
filtered_genus_abun <- genus_abun[genus_prevalence >= min_prevalence, ]
genus_abun[1:5,1:10]
## D2000120 D2000121 D2000122 D2000123
## Halothece 0 0 0 0
## Allocoleopsis 0 0 0 0
## Aquiluna 13 11 0 0
## Candidatus Hamiltonella 0 0 0 0
## Chlorobium 0 0 0 0
## D2000124 D2000125 D2000126 D2000127
## Halothece 0 0 0 20
## Allocoleopsis 0 0 0 29
## Aquiluna 11 21 0 34
## Candidatus Hamiltonella 0 0 0 25
## Chlorobium 0 0 0 33
## D2000128 D2000129
## Halothece 0 0
## Allocoleopsis 0 23
## Aquiluna 11 15
## Candidatus Hamiltonella 0 0
## Chlorobium 0 0
filtered_genus_abun[1:5,1:10]
## D2000120 D2000121 D2000122 D2000123
## Aquiluna 13 11 0 0
## Chryseobacterium 16606 29902 29864 25413
## Epilithonimonas 0 27 105 57
## Kaistella 10 13 31 44
## Abyssalbus 24 0 0 0
## D2000124 D2000125 D2000126 D2000127
## Aquiluna 11 21 0 34
## Chryseobacterium 27706 16056 2587 25375
## Epilithonimonas 315 55 41 425
## Kaistella 51 138 0 659
## Abyssalbus 11 17 0 142
## D2000128 D2000129
## Aquiluna 11 15
## Chryseobacterium 20664 7224
## Epilithonimonas 15 121
## Kaistella 0 185
## Abyssalbus 0 41
RDA
genus.hell <- decostand (t(filtered_genus_abun), 'hell') # hellinger transform community data, and the sample ID is in row
genus.rda <- rda(genus.hell ~ site + year + line + week, data = meta_data_sort) # rda with your interested environmental variables
fwd.sel <- ordiR2step(rda(genus.hell ~ 1, data = meta_data_sort),
scope = formula(genus.rda), direction = "forward", R2scope = TRUE,
pstep = 1000, trace = FALSE) # This step iteratively adds or removes predictors from a multivariate model, evaluates the model's performance at each step, and selects the best set of predictors.
fwd.sel$call # showing which variables should be kept
## rda(formula = genus.hell ~ year + week + site, data = meta_data_sort)
tbRDA<-rda(genus.hell~year + week +site,data=meta_data_sort) #run the model with screened variables
tbRDA.anova = anova.cca(tbRDA,by="axis",parallel=getOption("mc.cores"))
vif.cca(tbRDA) #VIF is a measure used to assess multicollinearity among the environmental variables in a multivariate analysis. If value > 5, this variable should be removed from model
## year2017 weekW6 weekW9 siteM siteS
## 1.500000 1.333333 1.333333 1.500000 1.500000
factor = meta_data_sort[,c("site",'year','week')]
result=envfit(tbRDA,factor) #The purpose of envfit() is to assess the relationship between environmental variables and species composition as explained by the ordination axes. It quantifies how well each environmental variable fits the ordination space and provides statistical significance tests for these fits.
#If you want to show the effect of the categorical variable in the plot with arrows, you have to transform these variables in binary format. factor_1.csv is the transformed files. 0 or 1 is based on whether this factor is linked to the sample
factor1=read.csv("C:/Users/liyun/Desktop/Shotgun_Canola/factor_1.csv",header = T)
factor1
## X Scott Melfort Saskatoon Year_2016 Year_2017 NAM_23 NAM_30 Week3
## 1 D2000120 1 0 0 0 1 1 0 0
## 2 D2000121 0 0 1 0 1 1 0 1
## 3 D2000122 0 1 0 0 1 1 0 0
## 4 D2000123 0 0 1 0 1 0 1 0
## 5 D2000124 0 1 0 0 1 1 0 1
## 6 D2000125 0 1 0 0 1 0 1 0
## 7 D2000126 0 1 0 0 1 0 1 1
## 8 D2000127 0 0 1 1 0 1 0 0
## 9 D2000128 1 0 0 0 1 0 1 1
## 10 D2000129 0 0 1 1 0 0 1 0
## 11 D2000130 0 0 1 1 0 1 0 1
## 12 D2000131 1 0 0 0 1 0 1 0
## 13 D2000132 0 0 1 0 1 0 1 1
## 14 D2000134 0 0 1 0 1 1 0 0
## 15 D2000135 0 0 1 0 1 0 1 0
## 16 D2000137 0 1 0 0 1 0 1 0
## 17 D2000138 0 0 1 0 1 1 0 0
## 18 D2000139 0 0 1 1 0 0 1 0
## 19 D2000140 0 0 1 1 0 1 0 0
## 20 D2000141 0 1 0 0 1 1 0 0
## 21 D2000142 1 0 0 0 1 0 1 0
## 22 D2000143 0 0 1 1 0 0 1 1
## 23 D2000144 1 0 0 0 1 1 0 0
## 24 D2000145 1 0 0 0 1 1 0 1
## Week6 Week9
## 1 1 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 0 0
## 6 0 1
## 7 0 0
## 8 0 1
## 9 0 0
## 10 0 1
## 11 0 0
## 12 0 1
## 13 0 0
## 14 1 0
## 15 0 1
## 16 1 0
## 17 0 1
## 18 1 0
## 19 1 0
## 20 0 1
## 21 1 0
## 22 0 0
## 23 0 1
## 24 0 0
rda_model <- rda(genus.hell~Scott+Melfort+Saskatoon+Year_2016+Year_2017+Week3+Week6+Week9,data=factor1)
# To better edit the RDA plot, we need to extract information from rda model
#extract RDA1 RDA2 and their contribution
aa=summary(rda_model)[[9]] # extract RDA and contribution info (list() format)
RDA_Contri=data.frame(aa$importance) #transform list() to data frame
contr=RDA_Contri[2,] # Extract contribution
contr_percent=paste0(round(contr*100, 1), "%") #transform contribution to percentage
# Environmental factors' RDA coordinate
SUM=summary(rda_model)
factor=data.frame(SUM$biplot)
# Samples' RDA coordinate
meta=data.frame(SUM$sites)
meta_com=cbind(meta,meta_data_sort$site,meta_data_sort$year,meta_data_sort$week) #add environmental factors
colnames(meta_com)[7:9]=c('Site','Year','Group') #Rename column names
meta_com$Site=gsub('L','Saskatoon',meta_com$Site) # using full name of experiment site
meta_com$Site=gsub('^S$','Scott',meta_com$Site) # using full name of experiment site
meta_com$Site=gsub('^M$','Melfort',meta_com$Site) # using full name of experiment site
meta_com$Group=gsub('W','week',meta_com$Group) # change W to week
ggplot()+
geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +
xlab(paste('RDA1 (',contr_percent[1],')',sep='')) +
ylab(paste('RDA2 (',contr_percent[2],')',sep='')) +
scale_x_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
geom_point(data=meta_com,
aes(x=RDA1, y=RDA2, colour=Group, shape=Site),
size=3) +
geom_segment(data=factor,
aes(x=0, y=0, xend=RDA1, yend=RDA2),
colour="black", size=0.2, arrow=arrow(),lwd=0.5)+
geom_text_repel(data=factor,
aes(x=RDA1-0.15, y=RDA2, label=rownames(factor)),
colour="black")+
stat_ellipse(data=meta_com, aes(x=RDA1,y=RDA2,color=Year))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#geom_vline() and geom_hline(): Add vertical and horizontal dashed gray lines at the x = 0 and y = 0 positions, respectively.
#xlab() and ylab(): Sets the x-axis and y-axis labels, with the labels indicating the proportion of contribution from contr_percent.
#scale_x_continuous() and scale_y_continuous(): Sets the secondary x and y axes with no labels or axis titles (sec.axis = dup_axis(labels = NULL, name = NULL)).
#geom_point(): Adds points to the scatterplot. These points are colored and shaped based on the "Group" and "Site" variables from the meta_com dataset.
#geom_segment(): Adds line segments that start at the origin (0, 0) and extend to the coordinates defined by "RDA1" and "RDA2" from the factor dataset. The lines are black with a size of 0.2 and have arrows at the end of each segment.
#geom_text_repel(): Adds text labels to the scatterplot. The labels come from the row names of the factor dataset and are positioned slightly to the left of the corresponding points defined by "RDA1" and "RDA2."
#stat_ellipse(): Adds ellipses to the plot. These ellipses represent the distribution of data points based on the "Year" variable from the meta_com dataset。
ggsave("C:/Users/liyun/Desktop/Shotgun_Canola/genus_RDA_2.png",width = 5,height = 4) #save data