Niche space of microbial consortia within the Northern Pitcher Plant

1. Load data and construct network

Description

The network is constructed with OTUs present in at least 6 samples having a total abundance of at least 1%. The spatial analysis (distance decay and NRI) is conducted with all OTUs.

Set up phyloseq

tree_file = '~/Dropbox/RoL/pitcher-16s/qiime2-stuff/rooted-tree/tree.nwk'
df = '~/Dropbox/RoL/pitcher-16s/site-data-bioclim-pca.txt'

ps = merge_phyloseq(import_biom('~/Dropbox/RoL/pitcher-16s/table-with-taxonomy-converted.biom', parseFunction = parse_taxonomy_greengenes), 
                    sample_data(read.csv(df, sep = '\t', row.names = 1)), 
                    read_tree(tree_file))
tax_table(ps) = tax_table(ps)[,1:7]
colnames(tax_table(ps)) = c('Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species')

# filter out taxa unassigned at Kingdom level
ps = ps %>% subset_taxa( Kingdom != "Unassigned")

# drop florida because it's a different subspecies
ps = ps %>% subset_samples(site_id != 'FLK')

# drop sample with ph = 1
# ps = ps %>% subset_samples(ph>1)

# agglomerate into OTUs
# ps_OTU = ps %>% tip_glom(h = 0.05, hcfun=hclust)
# saveRDS(ps_OTU, "OTUs_hclust_0.05")
ps_OTU = readRDS("OTUs_hclust_0.05")

# add OTUID column
#tax_table(ps_OTU) = cbind(tax_table(ps_OTU), OTUID=paste0("OTU_",1:length(taxa_names(ps_OTU))) )

ps6 = filter_taxa(ps_OTU, function(x) sum(x >= 1) >= 6, TRUE) %>% rarefy_even_depth(rngseed = 777)
flist = ps6 %>% transform_sample_counts(function(x) x / sum(x)) %>% filter_taxa(function(x) sum(x>= 0.01) > 0)
ps6Top = prune_taxa(flist, ps6) 

# transform counts
ps6Top.hel = ps6Top
otu_table(ps6Top.hel) = otu_table(decostand(t(otu_table(ps6Top)), method="hellinger"), taxa_are_rows = F)

sdata = data.frame(sample_data(ps_OTU)) %>% 
  mutate(mean_temp = bio01)

Map

library(ggmap)

## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.

## Please cite ggmap if you use it! See citation("ggmap") for details.

library(ggrepel)
library(RColorBrewer)

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))

register_google(key='AIzaSyAhlRC9I2MxpYhTzoY4o8ufO2ex7YQzIPg')

#namap = ggmap::get_map(c(lon = -80, lat = 41), zoom = 4, maptype = 'terrain-background', source = 'stamen')
namap = ggmap::get_map(c(left=-90, right=-65, bottom=27, top=55), maptype = 'terrain-background', color = "bw", source = 'stamen')

## Source : http://tile.stamen.com/terrain/5/8/10.png

## Source : http://tile.stamen.com/terrain/5/9/10.png

## Source : http://tile.stamen.com/terrain/5/10/10.png

## Source : http://tile.stamen.com/terrain/5/8/11.png

## Source : http://tile.stamen.com/terrain/5/9/11.png

## Source : http://tile.stamen.com/terrain/5/10/11.png

## Source : http://tile.stamen.com/terrain/5/8/12.png

## Source : http://tile.stamen.com/terrain/5/9/12.png

## Source : http://tile.stamen.com/terrain/5/10/12.png

## Source : http://tile.stamen.com/terrain/5/8/13.png

## Source : http://tile.stamen.com/terrain/5/9/13.png

## Source : http://tile.stamen.com/terrain/5/10/13.png

m1 = ggmap(namap)

# points
m1 + geom_point(data = sdata %>% select(longitude, latitude, mean_temp) %>% unique(), 
             aes(longitude, latitude), size=4) +
  theme(axis.title = element_blank()) +
  labs(color="Mean annual\ntemp (°C)")

ggsave('~/Dropbox/RoL/pitcher-16s/map.png', width = 10)

## Saving 10 x 5 in image

Environmental variables

#sdata %>% filter(!complete.cases(.))

plantdat = sdata %>% select(rosette_diameter_1,rosette_diameter_2,pitcher_length,pitcher_width,keel_width,mouth_diameter,lip_thickness) %>% mutate(mouth_diameter=replace_na(mouth_diameter, mean(mouth_diameter, na.rm=T)))

plantpca = princomp(plantdat, cor = T)
plantpca1 = plantpca$scores[,1]

Make networks

# se.ps6Top <- spiec.easi(ps6Top, method='mb', lambda.min.ratio=1e-1,
#                            nlambda=30, pulsar.params=list(rep.num=99, ncores=4))
# saveRDS(se.ps6Top, "speiec-easi-OTU-network.rds")
se.ps6Top = readRDS("speiec-easi-OTU-network.rds")

Generate bootstrapped distribution - takes a long time to run!!

# sparcc.matrix.boot <- sparccboot(t(otu_table(ps6Top)),R = 1000, ncpus = 6)
# sparcc.pvals = pval.sparccboot(sparcc.matrix.boot)
# saveRDS(sparcc.matrix.boot, "sparcc.matrix.boot.1000.RDS")
# saveRDS(sparcc.pvals, "sparcc.pvals.1000.RDS")

# sparcc algorithm does CLR normalization of counts!!!

# load bootstrapped matrix and pvals
sparcc.matrix.boot = readRDS("sparcc.matrix.boot.100.RDS")
sparcc.pvals = readRDS("sparcc.pvals.100.RDS")

# get mean and SD correlations from normalized data
sparcc.matrix = sparcc(t(otu_table(ps6Top)), th=0.1)
# define cutoff as mean + 1 SD
sparcc.cutoff = mean(abs(sparcc.matrix$Cor)) + sd(abs(sparcc.matrix$Cor))

# get matrix of bootstrapped pvals
dim = ntaxa(ps6Top)
mat.p = matrix(rep(1,dim*dim), dim, dim)
upperTriangle(mat.p) <- p.adjust(sparcc.pvals$pvals, "fdr")
lowerTriangle(mat.p) <- upperTriangle(mat.p, byrow=T)

# sparcc.adj <- replace_na( ifelse(abs(sparcc.matrix$Cor) > sparcc.cutoff &  mat.p <= 0.01, 1, 0), replace = 0 )

# Positive correlations only!
sparcc.adj = replace_na( ifelse(sparcc.matrix$Cor > sparcc.cutoff &  mat.p <= 0.01, 1, 0), replace = 0 )
rownames(sparcc.adj) <- as.vector(tax_table(ps6Top)[,"OTUID"])
colnames(sparcc.adj) <- as.vector(tax_table(ps6Top)[,"OTUID"])
sc.ps6Net = graph.adjacency(sparcc.adj,mode = "undirected",diag = FALSE)

# plot the two networks
se.ps6Net <- adj2igraph(getRefit(se.ps6Top), vertex.attr=list(name=as.vector(tax_table(ps6Top)[,"OTUID"])))

plot(se.ps6Net, vertex.label=NA, vertex.size=4)
plot(sc.ps6Net, vertex.label=NA, vertex.size=4)

Intersection of networks

sc.se = igraph::intersection(sc.ps6Net, se.ps6Net)
# taxa=ps6Top %>% subset_taxa(taxa_names(ps6Top) %in% names(V(sc.se))) %>% tax_table(.) 
# vertex_attr(sc.se, "name") <- as.vector(taxa[,'Genus'])

# remove isolated verticies
verticesToRemove <- V(sc.se)[igraph::degree(sc.se) ==0]
sc.se <- delete.vertices(sc.se, verticesToRemove)

#vertex_attr(sc.se_plot, "name") <- as.vector(tax_table(ps_OTU)[,'OTUID'])

pdf("~/Dropbox/RoL/pitcher-16s/Manuscript/full-network.pdf")
plot(cluster_leading_eigen(sc.se, weights=NA, options = list(maxiter=10000)), sc.se, vertex.label=NA, vertex.size=4)
dev.off()

## quartz_off_screen 
##                 2

Network parameters

network_results = function(net) {
  E(net)$weight = NA
  #  modularity
  fc = cluster_leading_eigen(net,weights =NULL)
  modularity = modularity(net,membership(fc))
  # graph properties
  results = data.frame(clustering.coefficient=transitivity(net), # clustering.coefficient
                modularity=modularity,# modularity
                mean.degree= mean(igraph::degree(net)),# mean degree
                size=length(E(net)),# size - number of edges
                order=length(V(net)),# order - number of vertices
                mean.distance=mean_distance(net),#average path length
                norm.degree=mean(igraph::degree(net)/length(V(net))),#normalized degree
                betweenness.centrality=centralization.betweenness(net)$centralization #betweenness centrality
                )
  return(results)
}
network_results(sc.se)

Is the network scale-free? Yes. A significant p-value indicates that the network does NOT follow a power law.

fpl = fit_power_law(igraph::degree_distribution(sc.se))
fpl$KS.p

## [1] 0.5178808

Compare random

compare_random <- function(net, reps) {
  E(net)$weight = NA
  g2 <- replicate(reps, erdos.renyi.game(length(V(net)),length(E(net)), type=c("gnm")), simplify = FALSE)
  fc.random = sapply(g2,cluster_leading_eigen, weights=NULL, options = list(maxiter=10000))
  df = data.frame(random.cc=sapply(g2,transitivity),
                  random.path=sapply(g2,average.path.length),
                  random.modularity=sapply(fc.random, modularity) )
  return(df)
}

ERrandoms = compare_random(sc.se, 999)

t.test(ERrandoms$random.modularity, mu = network_results(sc.se)$modularity)

## 
##  One Sample t-test
## 
## data:  ERrandoms$random.modularity
## t = -210.78, df = 998, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.7011091
## 95 percent confidence interval:
##  0.5230160 0.5263014
## sample estimates:
## mean of x 
## 0.5246587

t.test(ERrandoms$random.cc, mu = network_results(sc.se)$clustering.coefficient)

## 
##  One Sample t-test
## 
## data:  ERrandoms$random.cc
## t = -476.73, df = 998, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.1912752
## 95 percent confidence interval:
##  0.02264228 0.02402486
## sample estimates:
##  mean of x 
## 0.02333357

2. Spatial analysis of assemblage diversity

Description

The spatial structure of the community was tested with a distance decay analysis and by looking at the net relatedness index (NRI) across latitude

Distance decay

Spatial structure in the microbial assemblage data was tested with a Mantel correlogram as described in Legendre and Legendre 2012 based on Sokal (1986) and Oden and Sokal (1986). This test allows testing response data modeled by a non-Euclidean distance function (e.g., Jaccard and UniFrac here; Borcard and Legendre 2012). A positive Mantel statistic indicates positive spatial correlation. The distances were divided into 14 classes following Sturge’s rule and the test was applied with the vegan function ‘mantel.correlog’ in R. A holm correction is applied for multiple testing. The values in the second half of the distance classes are generally not interpreted because they are based on a smaller number of pairs and therefore have lower power.

metadata = data.frame(sample_data(ps_OTU))

gendis = as.matrix(UniFrac(ps_OTU))
geodis = geosphere::distm(cbind(metadata$longitude, metadata$latitude), fun = geosphere::distHaversine )
otudis = vegdist(t(otu_table(ps_OTU)), method = "jaccard", binary = T)

genvec = as.vector(gendis[upper.tri(gendis, diag = F)])
geovec = as.vector(geodis[upper.tri(geodis, diag = F)])
otuvec = as.vector(otudis)

#breakpoints = seq(from=min(geovec[geovec>0]), to=max(geovec), length.out=15)
mgen = mantel(gendis, geodis, na.rm = T)
motu = mantel(otudis, geodis)

# ddgen = mantel.correlog(gendis, geodis/1000, cutoff = F, nperm = 999, r.type = "spearman")
#saveRDS(ddgen, "distance-decay-UniFrac.RDS")
ddgen = readRDS("distance-decay-UniFrac.RDS")

#ddotu = mantel.correlog(otudis, geodis/1000, cutoff = F, nperm = 999)
#saveRDS(ddotu, "distance-decay-Jaccard.RDS")
ddotu = readRDS("distance-decay-Jaccard.RDS")
  
ddgen.df = data.frame(ddgen[1]$mantel.res) %>% mutate(signif = ifelse(Pr.corrected. <= 0.05, T,F))
ddotu.df = data.frame(ddotu[1]$mantel.res) %>% mutate(signif = ifelse(Pr.corrected. <= 0.05, T,F))

ggplot(ddotu.df, aes(class.index, Mantel.cor)) + 
  geom_point(aes(shape=signif), size=5) + 
  geom_line() +
  geom_hline(yintercept = 0) +
  annotate("rect", xmin=1270,xmax=Inf,ymin=-Inf,ymax=Inf, alpha=0.3) +
  scale_shape_manual(values=c(1,16),guide="none") +
  xlab("Distance classes (km)") +
  ylab(expression(paste("Mantel's ", italic("r")))) +
  ggtitle("Mantel correlogram based on the binary Jaccard index")  +
  theme_bw()

ggsave("distance-decay-Jaccard.png")

## Saving 7 x 5 in image

ggplot(ddgen.df, aes(class.index, Mantel.cor)) + 
  geom_point(aes(shape=signif), size=5) + 
  geom_line() +
  geom_hline(yintercept = 0) +
  annotate("rect", xmin=1270,xmax=Inf,ymin=-Inf,ymax=Inf, alpha=0.3) +
  scale_shape_manual(values=c(1,16),guide="none") +
  xlab("Distance classes (km)") +
  ylab(expression(paste("Mantel's ", italic("r")))) +
  ggtitle("Mantel correlogram based on the UniFrac index")+
  theme_bw()

ggsave("distance-decay-UniFrac.png")

## Saving 7 x 5 in image

NRI

rooted = phy_tree(ps_OTU)
otu = data.frame(otu_table(ps_OTU))
match <- match.phylo.comm(phy = rooted, comm = t(otu))

# ses.mpd.taxlabels = ses.mpd(match$comm, cophenetic(match$phy), null.model="taxa.labels", abundance.weighted = TRUE)
# saveRDS(ses.mpd.taxlabels, "ses.mpd.taxlabels.RDS")
ses.mpd.taxlabels = readRDS("ses.mpd.taxlabels.RDS")

#ses.mntd.taxlabels = ses.mntd(match$comm, cophenetic(match$phy), null.model="taxa.labels", abundance.weighted = TRUE)   

# add to data frame
# df.taxlabels = data.frame(sample_data(ps)) %>%
#   mutate(mpd.obs.z = ses.mpd.taxlabels$mpd.obs.z,
#          mpd.obs.p = ses.mpd.taxlabels$mpd.obs.p,
#          mntd.obs.z = ses.mntd.taxlabels$mntd.obs.z,
#          mntd.obs.p = ses.mntd.taxlabels$mntd.obs.p,
#          NRI = -1*mpd.obs.z,
#          NTI = -1*mntd.obs.z)
df.taxlabels = data.frame(sample_data(ps_OTU)) %>%
  mutate(mpd.obs.z = ses.mpd.taxlabels$mpd.obs.z,
         mpd.obs.p = ses.mpd.taxlabels$mpd.obs.p,
         NRI = -1*mpd.obs.z)

df.taxlabels = df.taxlabels %>% mutate(signif_NRI = ifelse(mpd.obs.p <= 0.05 | mpd.obs.p >= 0.95, T, F))
#df.taxlabels = df.taxlabels %>% mutate(signif_NTI = ifelse(mntd.obs.p <= 0.05 | mntd.obs.p >= 0.95, T, F))

# quadratic model
df.taxlabels$latitude2 = df.taxlabels$latitude^2
mod1 = lm(NRI~latitude + latitude2, df.taxlabels)
mod2 = lm(NRI~latitude, df.taxlabels)

# anova tests if more complex model is significantly better fit to data
anova(mod2, mod1)

fstat <- summary(mod1)$fstatistic
r2 = round(summary(mod1)$adj.r.squared,2)
pval = round(pf(fstat[1], fstat[2], fstat[3], lower.tail=FALSE),4)

p1 = ggplot(df.taxlabels, aes(latitude, NRI)) + 
  geom_point(aes(shape=signif_NRI)) + 
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1) +
  annotate("text", x=52,y=-0.25, label = paste0("italic(R)^2", " == ", r2 ), parse = TRUE)+
  annotate("text", x=52,y=-0.5, label = paste0("italic(P) == ", pval), parse = TRUE) +
  scale_shape_manual(values=c(1,16),guide="none")

# print the results
p1
# save the results
ggsave('NRI-Latitute-taxa.labels-OTU.png', p1)

## Saving 7 x 5 in image

# test for spatial correlation of NRI values to check assumptions of regression
# e.g., are the residuals 
# identical points are not allowed in neighbor map so I need to take the mean of the residuals per site

# make neighbor list for moran's test
library(spdep)
COORDS = metadata %>% select(longitude, latitude) %>% unique()
IDs <- unique(metadata$site_id)
# knn method
nbList1 = knn2nb(knearneigh(COORDS, longlat = T, k = 7), row.names=IDs)
plot(nbList1, COORDS)
# convert nb to listw
knnListw = nb2listw(nbList1)

# take average of NRI residuals per site
site_NRI_resids = df.taxlabels %>% 
  mutate(NRI_resids = mod1$residuals) %>%
  group_by(site_id) %>%
  summarize(site_NRI_resids = mean(NRI_resids))

moran.test(site_NRI_resids$site_NRI_resids, knnListw)
# http://www.geo.hunter.cuny.edu/~ssun/R-Spatial/spregression.html

3. Associate modules with variables

Description

Modules were detected within the co-occurrence network and variables were associated with the modules.

Community detection

sc.se = intersection(sc.ps6Net, se.ps6Net)
verticesToRemove <- V(sc.se)[igraph::degree(sc.se) ==0]
sc.se <- delete.vertices(sc.se, verticesToRemove)
#fc = cluster_leading_eigen(sc.se, options = list(maxiter=10000))
fc = cluster_leading_eigen(sc.se)
table(fc$membership)

## 
##  1  2  3  4  5  6  7  8  9 10 
## 17  2 26 19 18  9  8 16  4  7

N.Mods = length(table(fc$membership))
# change color hex code to name so I can see what module is identified
my_colors = RColorBrewer::brewer.pal(N.Mods,"Paired")
color_names = data.frame(color.name = sapply(my_colors, color.id))$color.name

comps = membership(fc)

# https://stackoverflow.com/questions/51271730/how-to-remove-small-communities-using-igraph-in-r
## Which nodes should be kept?
SizeToKeep = 10
Small = which(table(fc$membership) < SizeToKeep)
Keep = V(sc.se)[!(fc$membership %in% Small)]
sc.se_large = induced.subgraph(sc.se, Keep)

sc.se_large$community = fc$membership[!(fc$membership %in% Small)]

# module names / colors
comps = comps[!(fc$membership %in% Small)]
V(sc.se_large)$color = color_names[comps]

Module calculations

# prep data
datOTUhel = otu_table(ps6Top.hel %>% subset_taxa(OTUID %in% names(V(sc.se_large))) ) # taxa are cols
datExpr = data.frame(datOTUhel)

# Define numbers of genes and samples
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

# Get Module Eigengenes per cluster
MEs <- moduleEigengenes(datExpr, colors = V(sc.se_large)$color)$eigengenes
# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs)
module_order = names(MEs0) %>% gsub("ME","", .)

datTraits = data.frame(sample_data(ps6Top)) %>%
  select(-c(subspec,site_id,Description,site_id.1, bio01,bio12)) %>%
  select(-c(rosette_diameter_1,rosette_diameter_2,pitcher_length,pitcher_width,keel_width,mouth_diameter,lip_thickness)) %>%
  select(-c(total_mosquito, total_midge, total_prey, ants) ) %>% # keeping inverts
  mutate(plant_pca1 = plantpca1) %>%
  mutate(climate_pca1 = bioclim_pca1) %>% 
  select(-bioclim_pca1) %>%
  mutate(inverts = ifelse(inverts>20,NA,inverts))

moduleTraitCor = cor(MEs0, datTraits, use = "p") 
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
moduleColors = V(sc.se_large)$color
moduleLabels = factor(moduleColors, levels = module_order, labels = paste0("M",c(1:ncol(MEs0))))

dat2 = cbind(datTraits, MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0)))))

Module-environment relationships plot

# labeledHeatmap(moduleTraitCor, xLabels = names(datTraits), yLabels = names(MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0))))))

extraRowLabels = names(MEs0 %>% rename_with(~ paste0("M",c(1:ncol(MEs0)))))

pdf("~/Dropbox/RoL/pitcher-16s/corr-plot.pdf", width = 10)
labeledHeatmap(moduleTraitCor, 
               xLabels = names(datTraits), 
               yLabels = names(MEs0),
               colors = blueWhiteRed(50),
               textMatrix = signif(ifelse(moduleTraitPvalue <= 0.05, moduleTraitPvalue, NA), digits = 1),
               ySymbols = extraRowLabels,
               cex.text = .75)
dev.off()

## quartz_off_screen 
##                 2

Relatedness within modules (SES MPD)

# add a "module" column to the tax table
ps.network = ps6Top %>% subset_taxa(OTUID %in% names(V(sc.se_large)))
tax_table(ps.network) = cbind(tax_table(ps.network), Module=V(sc.se_large)$color)

# subset the tax table to the Module variable
mods = data.frame(tax_table(ps.network)[,"Module"])
# manually add module ID to taxa
mods1 = mods %>% mutate(M1 = ifelse(Module==module_order[1],1,0),
                       M2 = ifelse(Module==module_order[2],1,0),
                       M3 = ifelse(Module==module_order[3],1,0),
                       M4 = ifelse(Module==module_order[4],1,0),
                       M5 = ifelse(Module==module_order[5],1,0))

# mods1 = mods %>% mutate(M1 = ifelse(Module==module_order[1],1,0),
#                        M2 = ifelse(Module==module_order[2],1,0),
#                        M3 = ifelse(Module==module_order[3],1,0),
#                        M4 = ifelse(Module==module_order[4],1,0),
#                        M5 = ifelse(Module==module_order[5],1,0),
#                        M6 = ifelse(Module==module_order[6],1,0))

mods = mods1 %>% select(-Module)

ifelse(ncol(mods) == ncol(MEs0), yes = "MODULE NUMBER CORRECT", no = "MODULE NUMBER IS INCORRECT!")

## [1] "MODULE NUMBER CORRECT"

# set up data 
rooted = phy_tree(ps.network)
match <- match.phylo.comm(phy = rooted, comm = t(mods))

# SES MPD of modules
mod_SESMPD = ses.mpd(match$comm, cophenetic(match$phy), null.model = "taxa.labels", abundance.weighted = T, runs = 9999)
# saveRDS(mod_SESMPD, "mod_SESMPD-OTU.RDS")
# mod_SESMPD = readRDS("mod_SESMPD-OTU.RDS")
# write.table(mod_SESMPD %>% mutate(module = rownames(.)), "SESMPD_table.csv", sep = ",", row.names = F)
kable(mod_SESMPD %>% select(c(ntaxa,mpd.obs.z,mpd.obs.p)), digits = 4)

	ntaxa	mpd.obs.z	mpd.obs.p
M1	19	-2.5528	0.0046
M2	18	-0.6494	0.2594
M3	17	1.2681	0.8977
M4	16	-0.0442	0.4894
M5	26	-0.6196	0.2695

Diversity within modules

sespd = ses.pd(match$comm, match$phy)
write.table(sespd %>% mutate(module = rownames(.)), "SESPD_table.csv", sep = ",", row.names = F)
sespd

Regression of modules with environmental data

Latitude

ggplot(dat2, aes(latitude, M1)) +geom_point()

mod1 = lm(M1~latitude, data=dat2)
anova(mod1)

summary(mod1)

## 
## Call:
## lm(formula = M1 ~ latitude, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12114 -0.06155 -0.01461  0.05604  0.24152 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.427741   0.059158   7.230 8.75e-11 ***
## latitude    -0.009672   0.001326  -7.294 6.40e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08001 on 103 degrees of freedom
## Multiple R-squared:  0.3406, Adjusted R-squared:  0.3342 
## F-statistic: 53.21 on 1 and 103 DF,  p-value: 6.402e-11

Volume

ggplot(dat2, aes(scale(volume), scale(M3))) +geom_point()

## Warning: Removed 3 rows containing missing values (geom_point).

mod2 = lm(scale(M3)~scale(volume), data=dat2)
summary(mod2)

## 
## Call:
## lm(formula = scale(M3) ~ scale(volume), data = dat2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3457 -0.5771 -0.3493  0.4507  3.0648 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.008762   0.090160  -0.097    0.923    
## scale(volume)  0.374263   0.090605   4.131 7.52e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9106 on 100 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.1458, Adjusted R-squared:  0.1372 
## F-statistic: 17.06 on 1 and 100 DF,  p-value: 7.515e-05

Associate OTUs with modules

Identify OTUs that have a high significance for variables as well as high module membership in interesting modules.

Module membership is defined as the correlation between an OTU’s relative abundance in a sample and the module eigengene for that sample

To find intramodular hub genes, one can use the module membership measure K, Equation 6. Figure 4E shows a scatterplot between the body weight based gene significance measure GS i ,

Latitude

# Define variable latitude containing the latitude column of datTrait
latitude = as.data.frame(datTraits$latitude)
names(latitude) = "latitude"
# names (colors) of the modules
# modNames = substring(names(MEs0), 3)
modNames = names(MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0)))))

# module membership is the correlation between OTU table and module eigengenes
# meaning the module is more or less influenced by the abundance of a taxa
geneModuleMembership = as.data.frame(cor(datExpr, MEs0, use = "p"))
rownames(geneModuleMembership) = as.vector(tax_table(ps.network)[,"OTUID"])
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))

names(geneModuleMembership) = paste("MM_", modNames, sep="")
names(MMPvalue) = paste("p.MM_", modNames, sep="")

geneTraitSignificance = as.data.frame(cor(datExpr, latitude, use = "p")); # correlate relative abundance with latitude
rownames(geneTraitSignificance) = as.vector(tax_table(ps.network)[,"OTUID"])

GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))

names(geneTraitSignificance) = paste("GS.", names(latitude), sep="")
names(GSPvalue) = paste("p.GS.", names(latitude), sep="")

module = "M1"
column = match(module, modNames)
# moduleGenes = moduleColors==module
moduleGenes = moduleLabels==module

verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Taxa significance for latitude",
                   main = paste("Module membership vs. OTU significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, abline=T)

plot_lat_mod = data.frame(MM=abs(geneModuleMembership[moduleGenes, "MM_M1"]),
                     GS=abs(geneTraitSignificance[moduleGenes,"GS.latitude"]))

lat.cor = cor.test(plot_lat_mod$MM, plot_lat_mod$GS)

ggplot(plot_lat_mod, (aes(MM,GS))) +
  geom_point(size=3) +
  geom_smooth(method = "lm", formula = y ~ x + I(x), size = 1, se=F, color="black") +
  xlab("Module membership") +
  ylab("Correlation with latitude") +
  theme_minimal() +
  annotate("text", x=0.7, y=0.1, label=paste0("italic(r)", "==", round(lat.cor$estimate, 2)), parse=T) + 
  annotate("text", x=0.7, y=0.05, label=paste0("italic(p)", "==", round(lat.cor$p.value, 3)), parse=T)

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

ggsave("latitude-significance-plot.png")

## Saving 7 x 5 in image

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Volume

# Define variable volume containing the volume column of datTrait
volume = as.data.frame(datTraits$volume)
names(volume) = "volume"

geneTraitSignificance = as.data.frame(cor(datExpr, volume, use = "p")); # correlate abundance with volume
rownames(geneTraitSignificance) = as.vector(tax_table(ps.network)[,"OTUID"])

GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))

names(geneTraitSignificance) = paste("GS.", names(volume), sep="")
names(GSPvalue) = paste("p.GS.", names(volume), sep="")

module = "M3"
column = match(module, modNames)
moduleGenes = moduleLabels==module

verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "OTU correlation with volume",
                   main = paste("Module membership vs. OTU correlation\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, abline = T)

plot_vol_mod = data.frame(MM=abs(geneModuleMembership[moduleGenes, "MM_M3"]),
                     GS=abs(geneTraitSignificance[moduleGenes,"GS.volume"]))

vol.cor = cor.test(plot_vol_mod$MM, plot_vol_mod$GS)

ggplot(plot_vol_mod, (aes(MM,GS))) +
  geom_point(size=3) +
  geom_smooth(method = "lm", formula = y ~ x + I(x), size = 1, se=F, color="black") +
  xlab("Module membership") +
  ylab("Correlation with volume") +
  theme_minimal() +
  annotate("text", x=0.7, y=0.1, label=paste0("italic(r)", "==", round(vol.cor$estimate, 2)), parse=T) + 
  annotate("text", x=0.7, y=0.05, label=paste0("italic(p)", "==", round(vol.cor$p.value, 3)), parse=T)

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

ggsave("volume-significance-plot.png")

## Saving 7 x 5 in image

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

OTUs in the modules

Latitude

latitudeOTUs = ps.network %>%
  transform_sample_counts(function(x) x / sum(x)) %>%
  subset_taxa(Module==module_order[1])

# the WGCNA way: pull OTUs with high module membership in modules related to traits

latitudeOTUsTable = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M1", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[1])
write.table(latitudeOTUsTable, "latitudeOTUs.csv", sep = ',', row.names = F)
kable(latitudeOTUsTable, caption = "Latitude module taxa by module membership", row.names = F)

Latitude module taxa by module membership
Kingdom	Phylum	Class	Order	Family	Genus	Species	OTUID	Module	Module.Membership
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Comamonadaceae	NA	NA	OTU_1354	forestgreen	0.7466487
Bacteria	Proteobacteria	Alphaproteobacteria	Caulobacterales	Caulobacteraceae	NA	NA	OTU_841	forestgreen	0.6457378
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Oxalobacteraceae	NA	NA	OTU_1480	forestgreen	0.6376950
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Comamonadaceae	NA	NA	OTU_1360	forestgreen	0.6263301
Bacteria	Proteobacteria	Alphaproteobacteria	Sphingomonadales	Sphingomonadaceae	Novosphingobium	NA	OTU_1001	forestgreen	0.6244884
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Oxalobacteraceae	NA	NA	OTU_1497	forestgreen	0.5778293
Bacteria	Proteobacteria	Alphaproteobacteria	Sphingomonadales	Sphingomonadaceae	NA	NA	OTU_989	forestgreen	0.5561212
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Oxalobacteraceae	NA	NA	OTU_1481	forestgreen	0.5235389
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Chromobacteriaceae	Aquitalea	NA	OTU_1284	forestgreen	0.4864466
Bacteria	Bacteroidota	Bacteroidia	Sphingobacteriales	Sphingobacteriaceae	Mucilaginibacter	NA	OTU_64	forestgreen	0.4654385
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Burkholderiaceae	Burkholderia-Caballeronia-Paraburkholderia	NA	OTU_1422	forestgreen	0.3955791
Bacteria	Proteobacteria	Alphaproteobacteria	Sphingomonadales	Sphingomonadaceae	Sphingomonas	NA	OTU_965	forestgreen	0.3713802
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Comamonadaceae	NA	NA	OTU_1398	forestgreen	0.3589641
Bacteria	Bacteroidota	Bacteroidia	Sphingobacteriales	Sphingobacteriaceae	Mucilaginibacter	NA	OTU_62	forestgreen	0.3272241
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Chromobacteriaceae	Chromobacterium	NA	OTU_1293	forestgreen	0.3217597
Bacteria	Proteobacteria	Gammaproteobacteria	Xanthomonadales	Rhodanobacteraceae	NA	NA	OTU_1132	forestgreen	0.2164589
Bacteria	Acidobacteriota	Acidobacteriae	Acidobacteriales	Acidobacteriaceae_(Subgroup_1)	Granulicella	NA	OTU_650	forestgreen	0.2122788
Bacteria	Proteobacteria	Alphaproteobacteria	Acetobacterales	Acetobacteraceae	Acidisoma	NA	OTU_609	forestgreen	0.2020236
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Oxalobacteraceae	NA	NA	OTU_1434	forestgreen	0.1840743

plot_tree(latitudeOTUs, label.tips = "Genus", ladderize="left", size = "abundance", color = "Family",
          nodelabf = nodeplotblank)

ggsave("latitude_tree.png", width = 12, height =10)

Volume

volumeOTUs = ps.network %>%
  transform_sample_counts(function(x) x / sum(x)) %>%
  subset_taxa(Module==module_order[3])

# the WGCNA way: pull OTUs with high module membership in modules related to traits
volumeOTUsTable = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M3", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[3])
write.table(volumeOTUsTable, "volumeOTUs.csv", sep = ',', row.names = F)
kable(volumeOTUsTable, caption = "Volume module taxa by module membership", row.names = F)

Volume module taxa by module membership
Kingdom	Phylum	Class	Order	Family	Genus	Species	OTUID	Module	Module.Membership
Bacteria	Proteobacteria	Alphaproteobacteria	Rhizobiales	Kaistiaceae	Kaistia	NA	OTU_858	lightskyblue2	0.8188643
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	NA	NA	NA	OTU_1402	lightskyblue2	0.7819199
Bacteria	Proteobacteria	Alphaproteobacteria	Rhizobiales	Rhizobiaceae	NA	NA	OTU_809	lightskyblue2	0.7742836
Bacteria	Actinobacteriota	Actinobacteria	Micrococcales	Microbacteriaceae	NA	NA	OTU_430	lightskyblue2	0.7266081
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Comamonadaceae	NA	NA	OTU_1351	lightskyblue2	0.7241397
Bacteria	Proteobacteria	Alphaproteobacteria	Rhizobiales	Xanthobacteraceae	NA	NA	OTU_1052	lightskyblue2	0.7220112
Bacteria	Proteobacteria	Alphaproteobacteria	Rhizobiales	Xanthobacteraceae	NA	NA	OTU_1054	lightskyblue2	0.6015189
Bacteria	Actinobacteriota	Actinobacteria	Corynebacteriales	Nocardiaceae	Rhodococcus	NA	OTU_475	lightskyblue2	0.5462574
Bacteria	Proteobacteria	Alphaproteobacteria	Acetobacterales	Acetobacteraceae	Roseococcus	NA	OTU_581	lightskyblue2	0.4742614
Bacteria	Actinobacteriota	Actinobacteria	Micrococcales	Microbacteriaceae	NA	NA	OTU_1384	lightskyblue2	0.4564328
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Chitinibacteraceae	Silvimonas	NA	OTU_1331	lightskyblue2	0.3307856
Bacteria	Bacteroidota	Bacteroidia	Bacteroidales	Dysgonomonadaceae	Dysgonomonas	NA	OTU_80	lightskyblue2	0.2876761
Bacteria	Bacteroidota	Bacteroidia	Sphingobacteriales	Sphingobacteriaceae	Mucilaginibacter	NA	OTU_66	lightskyblue2	0.2575401
Bacteria	Proteobacteria	Gammaproteobacteria	Burkholderiales	Comamonadaceae	NA	NA	OTU_1355	lightskyblue2	0.2494578
Bacteria	Bacteroidota	Bacteroidia	Sphingobacteriales	Sphingobacteriaceae	Pedobacter	NA	OTU_20	lightskyblue2	0.2429168
Bacteria	Bacteroidota	Bacteroidia	Chitinophagales	Chitinophagaceae	Taibaiella	NA	OTU_103	lightskyblue2	0.2069060
Bacteria	Proteobacteria	Alphaproteobacteria	Rhizobiales	Rhizobiaceae	Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium	NA	OTU_792	lightskyblue2	0.1116146

plot_tree(volumeOTUs, label.tips = "Genus", ladderize="left", size = "abundance", color = "Family",
          nodelabf = nodeplotblank)

ggsave("volume_tree.png", width = 12, height =10)

M2, M4, and M5

M2 = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M2", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[2])

M4 = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M4", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[4])

M5 = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M5", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[5])

Network module plot

# label hubs for each module with OTU ID
key = c(volumeOTUsTable[c(1:3),8], 
        latitudeOTUsTable[(1:3),8],
        M2[(1:3),8],
        M4[(1:3),8],
        M5[(1:3),8])

pdf("~/Dropbox/RoL/pitcher-16s/Manuscript/module-plot-with-labels-test.pdf", width = 10, height = 10)
plot(sc.se_large,
     vertex.label=ifelse(names(V(sc.se_large)) %in% key, names(V(sc.se_large)), NA), 
     vertex.size = igraph::degree(sc.se_large)*1.5, 
     vertex.color=moduleColors,
     vertex.label.family="Helvetica",
     vertex.label.font=2,
     vertex.label.cex = 0.75,
     vertex.label.color = "black")
dev.off()

## quartz_off_screen 
##                 2

# table to go with plot
network_table = rbind(latitudeOTUsTable[c(1:3),c(1:6,8)],
                      M2[c(1:3),c(1:6,8)],
                      volumeOTUsTable[c(1:3),c(1:6,8)],
                      M4[c(1:3),c(1:6,8)],
                      M5[c(1:3),c(1:6,8)]) %>%
  mutate(Module = c(rep("M1", 3), rep("M2",3), rep("M3",3), rep("M4", 3), rep("M5", 3)))
write.table(network_table, "network-table.csv", sep = ',', row.names = F)

For each community, find node with higest betweenness

# https://towardsdatascience.com/community-detection-in-r-using-communities-of-friends-characters-2161e845c198

communities <- data.frame() 

for (i in unique(V(sc.se_large)$color)) { 
# create subgraphs for each community 
  subgraph <- induced_subgraph(sc.se_large, v = V(sc.se_large)$name[V(sc.se_large)$color == i]) 
# get size of each subgraph 
  size <- igraph::gorder(subgraph) 
# get betweenness centrality 
  btwn <- igraph::betweenness(subgraph) 
  communities <- communities %>% 
    dplyr::bind_rows(data.frame(module = i, 
                                n_OTUs = size,
                                most_important = names(which(btwn == max(btwn))) 
                                ) 
    ) 
  } 

knitr::kable(communities %>% dplyr::select(module, n_OTUs, most_important))

Small = which(table(fc$membership) < SizeToKeep)
Keep = V(sc.se)[!(fc$membership %in% Small)]
sc.se_large = induced.subgraph(sc.se, Keep)

Heatmap of module membership

library(gplots)
poy <- colorRampPalette(c("purple", "orange", "yellow")) 

#https://stackoverflow.com/questions/9777411/recenter-heatmap-2-in-r
mm = geneModuleMembership
#mm = mm[rowSums(abs(mm)) > 0.4,]
mm = mm[rowSums(mm) > 0.4,]
  
names(mm) = gsub("MM", "", names(mm))
taxanames = ps.network %>% subset_taxa(OTUID %in% rownames(mm)) %>% tax_table() %>% data.frame() %>%
  mutate(OTUID = gsub("OTU_", "", OTUID)) %>%
  mutate(across(Order, ~coalesce(.,Class))) %>% 
  mutate(across(Family, ~coalesce(.,Order))) %>%
  mutate(across(Genus, ~na_if(., "uncultured"))) %>%
  mutate(across(Genus, ~coalesce(.,Family))) %>%
  mutate(Genus = paste(Genus, OTUID))

rownames(mm) = taxanames$Genus

#tiff(filename = "/Users/zlm3critter/Dropbox/RoL/pitcher-16s/re-analysis/Heatmap.tiff")

heatmap.2(as.matrix(t(mm)),
          dendrogram = "none",
          trace = "none",
          key = F,
          lhei = c(0.2,23), # row height
          lwid = c(0.1,23),
          margins = c(20,10),
          adjCol = c(0.95,0.5))

dev.off()

## null device 
##           1

4.Identify top taxa in modules with BLASTn

Description

Use the refseq database to blast the top 5 taxa (based on module membership) in each module. Return a table with the ID that can be looked up on NCBI nucleotide. This avoids having to blast the sequences individually and keeps everything organized.

Code

fasta = readDNAStringSet("~/Dropbox/RoL/pitcher-16s/qiime2-stuff/dada2-denoise-cutadapt-primer/representative_sequences.fasta/dna-sequences.fasta")

# # Only need to do this once
# download.file("https://ftp.ncbi.nlm.nih.gov/blast/db/16S_ribosomal_RNA.tar.gz", "16S_ribosomal_RNA.tar.gz", mode='wb')
# untar("16S_ribosomal_RNA.tar.gz", exdir="16SMicrobialDB")

bl <- blast(db="~/Dropbox/RoL/pitcher-16s/re-analysis/16SMicrobialDB/16S_ribosomal_RNA")

# make a table of the top 3-5 taxa in the interesting modules
seq_latitude = fasta[names(fasta) %in% rownames(latitudeOTUsTable)[1:5]]
seq_volume = fasta[names(fasta) %in% rownames(volumeOTUsTable)[1:5]]

bl_latitude = predict(bl,seq_latitude,BLAST_args = "-perc_identity 99")
bl_volume = predict(bl,seq_volume,BLAST_args = "-perc_identity 99")

bl_latitude_Table = full_join(latitudeOTUsTable[1:5,] %>% mutate(QueryID=rownames(latitudeOTUsTable[1:5,])), bl_latitude)
write.table(bl_latitude_Table, "blast_table_latitude.csv", sep=",", row.names = F)

bl_volume_Table = full_join(volumeOTUsTable[1:5,] %>% mutate(QueryID=rownames(volumeOTUsTable[1:5,])), bl_volume)
write.table(bl_volume_Table, "blast_table_volume.csv", sep=",", row.names = F)

Niche space of microbial consortia within the Northern Pitcher Plant

2023-04-13

1. Load data and construct network

Description

Set up phyloseq

Map

Environmental variables

Make networks

Intersection of networks

Network parameters

2. Spatial analysis of assemblage diversity

Description

Distance decay

NRI

3. Associate modules with variables

Description

Community detection

Module calculations

Module-environment relationships plot

Relatedness within modules (SES MPD)

Diversity within modules

Regression of modules with environmental data

Latitude

Volume

Associate OTUs with modules

Latitude

Volume

OTUs in the modules

Latitude

Volume

M2, M4, and M5

Network module plot

Heatmap of module membership

4.Identify top taxa in modules with BLASTn

Description

Code