1. Load data and construct network

Description

The network is constructed with OTUs present in at least 6 samples having a total abundance of at least 1%. The spatial analysis (distance decay and NRI) is conducted with all OTUs.

Set up phyloseq

tree_file = '~/Dropbox/RoL/pitcher-16s/qiime2-stuff/rooted-tree/tree.nwk'
df = '~/Dropbox/RoL/pitcher-16s/site-data-bioclim-pca.txt'

ps = merge_phyloseq(import_biom('~/Dropbox/RoL/pitcher-16s/table-with-taxonomy-converted.biom', parseFunction = parse_taxonomy_greengenes), 
                    sample_data(read.csv(df, sep = '\t', row.names = 1)), 
                    read_tree(tree_file))
tax_table(ps) = tax_table(ps)[,1:7]
colnames(tax_table(ps)) = c('Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species')

# filter out taxa unassigned at Kingdom level
ps = ps %>% subset_taxa( Kingdom != "Unassigned")

# drop florida because it's a different subspecies
ps = ps %>% subset_samples(site_id != 'FLK')

# drop sample with ph = 1
# ps = ps %>% subset_samples(ph>1)

# agglomerate into OTUs
# ps_OTU = ps %>% tip_glom(h = 0.05, hcfun=hclust)
# saveRDS(ps_OTU, "OTUs_hclust_0.05")
ps_OTU = readRDS("OTUs_hclust_0.05")

# add OTUID column
#tax_table(ps_OTU) = cbind(tax_table(ps_OTU), OTUID=paste0("OTU_",1:length(taxa_names(ps_OTU))) )

ps6 = filter_taxa(ps_OTU, function(x) sum(x >= 1) >= 6, TRUE) %>% rarefy_even_depth(rngseed = 777)
flist = ps6 %>% transform_sample_counts(function(x) x / sum(x)) %>% filter_taxa(function(x) sum(x>= 0.01) > 0)
ps6Top = prune_taxa(flist, ps6) 

# transform counts
ps6Top.hel = ps6Top
otu_table(ps6Top.hel) = otu_table(decostand(t(otu_table(ps6Top)), method="hellinger"), taxa_are_rows = F)

sdata = data.frame(sample_data(ps_OTU)) %>% 
  mutate(mean_temp = bio01)

Map

library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(ggrepel)
library(RColorBrewer)

myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))

register_google(key='AIzaSyAhlRC9I2MxpYhTzoY4o8ufO2ex7YQzIPg')

#namap = ggmap::get_map(c(lon = -80, lat = 41), zoom = 4, maptype = 'terrain-background', source = 'stamen')
namap = ggmap::get_map(c(left=-90, right=-65, bottom=27, top=55), maptype = 'terrain-background', color = "bw", source = 'stamen')
## Source : http://tile.stamen.com/terrain/5/8/10.png
## Source : http://tile.stamen.com/terrain/5/9/10.png
## Source : http://tile.stamen.com/terrain/5/10/10.png
## Source : http://tile.stamen.com/terrain/5/8/11.png
## Source : http://tile.stamen.com/terrain/5/9/11.png
## Source : http://tile.stamen.com/terrain/5/10/11.png
## Source : http://tile.stamen.com/terrain/5/8/12.png
## Source : http://tile.stamen.com/terrain/5/9/12.png
## Source : http://tile.stamen.com/terrain/5/10/12.png
## Source : http://tile.stamen.com/terrain/5/8/13.png
## Source : http://tile.stamen.com/terrain/5/9/13.png
## Source : http://tile.stamen.com/terrain/5/10/13.png
m1 = ggmap(namap)

# points
m1 + geom_point(data = sdata %>% select(longitude, latitude, mean_temp) %>% unique(), 
             aes(longitude, latitude), size=4) +
  theme(axis.title = element_blank()) +
  labs(color="Mean annual\ntemp (°C)")

ggsave('~/Dropbox/RoL/pitcher-16s/map.png', width = 10)
## Saving 10 x 5 in image

Environmental variables

#sdata %>% filter(!complete.cases(.))

plantdat = sdata %>% select(rosette_diameter_1,rosette_diameter_2,pitcher_length,pitcher_width,keel_width,mouth_diameter,lip_thickness) %>% mutate(mouth_diameter=replace_na(mouth_diameter, mean(mouth_diameter, na.rm=T)))

plantpca = princomp(plantdat, cor = T)
plantpca1 = plantpca$scores[,1]

Make networks

# se.ps6Top <- spiec.easi(ps6Top, method='mb', lambda.min.ratio=1e-1,
#                            nlambda=30, pulsar.params=list(rep.num=99, ncores=4))
# saveRDS(se.ps6Top, "speiec-easi-OTU-network.rds")
se.ps6Top = readRDS("speiec-easi-OTU-network.rds")

Generate bootstrapped distribution - takes a long time to run!!

# sparcc.matrix.boot <- sparccboot(t(otu_table(ps6Top)),R = 1000, ncpus = 6)
# sparcc.pvals = pval.sparccboot(sparcc.matrix.boot)
# saveRDS(sparcc.matrix.boot, "sparcc.matrix.boot.1000.RDS")
# saveRDS(sparcc.pvals, "sparcc.pvals.1000.RDS")
# sparcc algorithm does CLR normalization of counts!!!

# load bootstrapped matrix and pvals
sparcc.matrix.boot = readRDS("sparcc.matrix.boot.100.RDS")
sparcc.pvals = readRDS("sparcc.pvals.100.RDS")

# get mean and SD correlations from normalized data
sparcc.matrix = sparcc(t(otu_table(ps6Top)), th=0.1)
# define cutoff as mean + 1 SD
sparcc.cutoff = mean(abs(sparcc.matrix$Cor)) + sd(abs(sparcc.matrix$Cor))

# get matrix of bootstrapped pvals
dim = ntaxa(ps6Top)
mat.p = matrix(rep(1,dim*dim), dim, dim)
upperTriangle(mat.p) <- p.adjust(sparcc.pvals$pvals, "fdr")
lowerTriangle(mat.p) <- upperTriangle(mat.p, byrow=T)

# sparcc.adj <- replace_na( ifelse(abs(sparcc.matrix$Cor) > sparcc.cutoff &  mat.p <= 0.01, 1, 0), replace = 0 )

# Positive correlations only!
sparcc.adj = replace_na( ifelse(sparcc.matrix$Cor > sparcc.cutoff &  mat.p <= 0.01, 1, 0), replace = 0 )
rownames(sparcc.adj) <- as.vector(tax_table(ps6Top)[,"OTUID"])
colnames(sparcc.adj) <- as.vector(tax_table(ps6Top)[,"OTUID"])
sc.ps6Net = graph.adjacency(sparcc.adj,mode = "undirected",diag = FALSE)
# plot the two networks
se.ps6Net <- adj2igraph(getRefit(se.ps6Top), vertex.attr=list(name=as.vector(tax_table(ps6Top)[,"OTUID"])))

plot(se.ps6Net, vertex.label=NA, vertex.size=4)
plot(sc.ps6Net, vertex.label=NA, vertex.size=4)

Intersection of networks

sc.se = igraph::intersection(sc.ps6Net, se.ps6Net)
# taxa=ps6Top %>% subset_taxa(taxa_names(ps6Top) %in% names(V(sc.se))) %>% tax_table(.) 
# vertex_attr(sc.se, "name") <- as.vector(taxa[,'Genus'])

# remove isolated verticies
verticesToRemove <- V(sc.se)[igraph::degree(sc.se) ==0]
sc.se <- delete.vertices(sc.se, verticesToRemove)

#vertex_attr(sc.se_plot, "name") <- as.vector(tax_table(ps_OTU)[,'OTUID'])

pdf("~/Dropbox/RoL/pitcher-16s/Manuscript/full-network.pdf")
plot(cluster_leading_eigen(sc.se, weights=NA, options = list(maxiter=10000)), sc.se, vertex.label=NA, vertex.size=4)
dev.off()
## quartz_off_screen 
##                 2

Network parameters

network_results = function(net) {
  E(net)$weight = NA
  #  modularity
  fc = cluster_leading_eigen(net,weights =NULL)
  modularity = modularity(net,membership(fc))
  # graph properties
  results = data.frame(clustering.coefficient=transitivity(net), # clustering.coefficient
                modularity=modularity,# modularity
                mean.degree= mean(igraph::degree(net)),# mean degree
                size=length(E(net)),# size - number of edges
                order=length(V(net)),# order - number of vertices
                mean.distance=mean_distance(net),#average path length
                norm.degree=mean(igraph::degree(net)/length(V(net))),#normalized degree
                betweenness.centrality=centralization.betweenness(net)$centralization #betweenness centrality
                )
  return(results)
}
network_results(sc.se)

Is the network scale-free? Yes. A significant p-value indicates that the network does NOT follow a power law.

fpl = fit_power_law(igraph::degree_distribution(sc.se))
fpl$KS.p
## [1] 0.5178808

Compare random

compare_random <- function(net, reps) {
  E(net)$weight = NA
  g2 <- replicate(reps, erdos.renyi.game(length(V(net)),length(E(net)), type=c("gnm")), simplify = FALSE)
  fc.random = sapply(g2,cluster_leading_eigen, weights=NULL, options = list(maxiter=10000))
  df = data.frame(random.cc=sapply(g2,transitivity),
                  random.path=sapply(g2,average.path.length),
                  random.modularity=sapply(fc.random, modularity) )
  return(df)
}

ERrandoms = compare_random(sc.se, 999)

t.test(ERrandoms$random.modularity, mu = network_results(sc.se)$modularity)
## 
##  One Sample t-test
## 
## data:  ERrandoms$random.modularity
## t = -210.78, df = 998, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.7011091
## 95 percent confidence interval:
##  0.5230160 0.5263014
## sample estimates:
## mean of x 
## 0.5246587
t.test(ERrandoms$random.cc, mu = network_results(sc.se)$clustering.coefficient)
## 
##  One Sample t-test
## 
## data:  ERrandoms$random.cc
## t = -476.73, df = 998, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.1912752
## 95 percent confidence interval:
##  0.02264228 0.02402486
## sample estimates:
##  mean of x 
## 0.02333357

2. Spatial analysis of assemblage diversity

Description

The spatial structure of the community was tested with a distance decay analysis and by looking at the net relatedness index (NRI) across latitude

Distance decay

Spatial structure in the microbial assemblage data was tested with a Mantel correlogram as described in Legendre and Legendre 2012 based on Sokal (1986) and Oden and Sokal (1986). This test allows testing response data modeled by a non-Euclidean distance function (e.g., Jaccard and UniFrac here; Borcard and Legendre 2012). A positive Mantel statistic indicates positive spatial correlation. The distances were divided into 14 classes following Sturge’s rule and the test was applied with the vegan function ‘mantel.correlog’ in R. A holm correction is applied for multiple testing. The values in the second half of the distance classes are generally not interpreted because they are based on a smaller number of pairs and therefore have lower power.

metadata = data.frame(sample_data(ps_OTU))

gendis = as.matrix(UniFrac(ps_OTU))
geodis = geosphere::distm(cbind(metadata$longitude, metadata$latitude), fun = geosphere::distHaversine )
otudis = vegdist(t(otu_table(ps_OTU)), method = "jaccard", binary = T)

genvec = as.vector(gendis[upper.tri(gendis, diag = F)])
geovec = as.vector(geodis[upper.tri(geodis, diag = F)])
otuvec = as.vector(otudis)

#breakpoints = seq(from=min(geovec[geovec>0]), to=max(geovec), length.out=15)
mgen = mantel(gendis, geodis, na.rm = T)
motu = mantel(otudis, geodis)

# ddgen = mantel.correlog(gendis, geodis/1000, cutoff = F, nperm = 999, r.type = "spearman")
#saveRDS(ddgen, "distance-decay-UniFrac.RDS")
ddgen = readRDS("distance-decay-UniFrac.RDS")

#ddotu = mantel.correlog(otudis, geodis/1000, cutoff = F, nperm = 999)
#saveRDS(ddotu, "distance-decay-Jaccard.RDS")
ddotu = readRDS("distance-decay-Jaccard.RDS")
  
ddgen.df = data.frame(ddgen[1]$mantel.res) %>% mutate(signif = ifelse(Pr.corrected. <= 0.05, T,F))
ddotu.df = data.frame(ddotu[1]$mantel.res) %>% mutate(signif = ifelse(Pr.corrected. <= 0.05, T,F))
ggplot(ddotu.df, aes(class.index, Mantel.cor)) + 
  geom_point(aes(shape=signif), size=5) + 
  geom_line() +
  geom_hline(yintercept = 0) +
  annotate("rect", xmin=1270,xmax=Inf,ymin=-Inf,ymax=Inf, alpha=0.3) +
  scale_shape_manual(values=c(1,16),guide="none") +
  xlab("Distance classes (km)") +
  ylab(expression(paste("Mantel's ", italic("r")))) +
  ggtitle("Mantel correlogram based on the binary Jaccard index")  +
  theme_bw()

ggsave("distance-decay-Jaccard.png")
## Saving 7 x 5 in image
ggplot(ddgen.df, aes(class.index, Mantel.cor)) + 
  geom_point(aes(shape=signif), size=5) + 
  geom_line() +
  geom_hline(yintercept = 0) +
  annotate("rect", xmin=1270,xmax=Inf,ymin=-Inf,ymax=Inf, alpha=0.3) +
  scale_shape_manual(values=c(1,16),guide="none") +
  xlab("Distance classes (km)") +
  ylab(expression(paste("Mantel's ", italic("r")))) +
  ggtitle("Mantel correlogram based on the UniFrac index")+
  theme_bw()

ggsave("distance-decay-UniFrac.png")
## Saving 7 x 5 in image

NRI

rooted = phy_tree(ps_OTU)
otu = data.frame(otu_table(ps_OTU))
match <- match.phylo.comm(phy = rooted, comm = t(otu))

# ses.mpd.taxlabels = ses.mpd(match$comm, cophenetic(match$phy), null.model="taxa.labels", abundance.weighted = TRUE)
# saveRDS(ses.mpd.taxlabels, "ses.mpd.taxlabels.RDS")
ses.mpd.taxlabels = readRDS("ses.mpd.taxlabels.RDS")

#ses.mntd.taxlabels = ses.mntd(match$comm, cophenetic(match$phy), null.model="taxa.labels", abundance.weighted = TRUE)   

# add to data frame
# df.taxlabels = data.frame(sample_data(ps)) %>%
#   mutate(mpd.obs.z = ses.mpd.taxlabels$mpd.obs.z,
#          mpd.obs.p = ses.mpd.taxlabels$mpd.obs.p,
#          mntd.obs.z = ses.mntd.taxlabels$mntd.obs.z,
#          mntd.obs.p = ses.mntd.taxlabels$mntd.obs.p,
#          NRI = -1*mpd.obs.z,
#          NTI = -1*mntd.obs.z)
df.taxlabels = data.frame(sample_data(ps_OTU)) %>%
  mutate(mpd.obs.z = ses.mpd.taxlabels$mpd.obs.z,
         mpd.obs.p = ses.mpd.taxlabels$mpd.obs.p,
         NRI = -1*mpd.obs.z)

df.taxlabels = df.taxlabels %>% mutate(signif_NRI = ifelse(mpd.obs.p <= 0.05 | mpd.obs.p >= 0.95, T, F))
#df.taxlabels = df.taxlabels %>% mutate(signif_NTI = ifelse(mntd.obs.p <= 0.05 | mntd.obs.p >= 0.95, T, F))
# quadratic model
df.taxlabels$latitude2 = df.taxlabels$latitude^2
mod1 = lm(NRI~latitude + latitude2, df.taxlabels)
mod2 = lm(NRI~latitude, df.taxlabels)

# anova tests if more complex model is significantly better fit to data
anova(mod2, mod1)
fstat <- summary(mod1)$fstatistic
r2 = round(summary(mod1)$adj.r.squared,2)
pval = round(pf(fstat[1], fstat[2], fstat[3], lower.tail=FALSE),4)

p1 = ggplot(df.taxlabels, aes(latitude, NRI)) + 
  geom_point(aes(shape=signif_NRI)) + 
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1) +
  annotate("text", x=52,y=-0.25, label = paste0("italic(R)^2", " == ", r2 ), parse = TRUE)+
  annotate("text", x=52,y=-0.5, label = paste0("italic(P) == ", pval), parse = TRUE) +
  scale_shape_manual(values=c(1,16),guide="none")

# print the results
p1
# save the results
ggsave('NRI-Latitute-taxa.labels-OTU.png', p1)
## Saving 7 x 5 in image

# test for spatial correlation of NRI values to check assumptions of regression
# e.g., are the residuals 
# identical points are not allowed in neighbor map so I need to take the mean of the residuals per site

# make neighbor list for moran's test
library(spdep)
COORDS = metadata %>% select(longitude, latitude) %>% unique()
IDs <- unique(metadata$site_id)
# knn method
nbList1 = knn2nb(knearneigh(COORDS, longlat = T, k = 7), row.names=IDs)
plot(nbList1, COORDS)
# convert nb to listw
knnListw = nb2listw(nbList1)

# take average of NRI residuals per site
site_NRI_resids = df.taxlabels %>% 
  mutate(NRI_resids = mod1$residuals) %>%
  group_by(site_id) %>%
  summarize(site_NRI_resids = mean(NRI_resids))

moran.test(site_NRI_resids$site_NRI_resids, knnListw)
# http://www.geo.hunter.cuny.edu/~ssun/R-Spatial/spregression.html

3. Associate modules with variables

Description

Modules were detected within the co-occurrence network and variables were associated with the modules.

Community detection

sc.se = intersection(sc.ps6Net, se.ps6Net)
verticesToRemove <- V(sc.se)[igraph::degree(sc.se) ==0]
sc.se <- delete.vertices(sc.se, verticesToRemove)
#fc = cluster_leading_eigen(sc.se, options = list(maxiter=10000))
fc = cluster_leading_eigen(sc.se)
table(fc$membership)
## 
##  1  2  3  4  5  6  7  8  9 10 
## 17  2 26 19 18  9  8 16  4  7
N.Mods = length(table(fc$membership))
# change color hex code to name so I can see what module is identified
my_colors = RColorBrewer::brewer.pal(N.Mods,"Paired")
color_names = data.frame(color.name = sapply(my_colors, color.id))$color.name

comps = membership(fc)

# https://stackoverflow.com/questions/51271730/how-to-remove-small-communities-using-igraph-in-r
## Which nodes should be kept?
SizeToKeep = 10
Small = which(table(fc$membership) < SizeToKeep)
Keep = V(sc.se)[!(fc$membership %in% Small)]
sc.se_large = induced.subgraph(sc.se, Keep)

sc.se_large$community = fc$membership[!(fc$membership %in% Small)]

# module names / colors
comps = comps[!(fc$membership %in% Small)]
V(sc.se_large)$color = color_names[comps]

Module calculations

# prep data
datOTUhel = otu_table(ps6Top.hel %>% subset_taxa(OTUID %in% names(V(sc.se_large))) ) # taxa are cols
datExpr = data.frame(datOTUhel)

# Define numbers of genes and samples
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

# Get Module Eigengenes per cluster
MEs <- moduleEigengenes(datExpr, colors = V(sc.se_large)$color)$eigengenes
# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs)
module_order = names(MEs0) %>% gsub("ME","", .)

datTraits = data.frame(sample_data(ps6Top)) %>%
  select(-c(subspec,site_id,Description,site_id.1, bio01,bio12)) %>%
  select(-c(rosette_diameter_1,rosette_diameter_2,pitcher_length,pitcher_width,keel_width,mouth_diameter,lip_thickness)) %>%
  select(-c(total_mosquito, total_midge, total_prey, ants) ) %>% # keeping inverts
  mutate(plant_pca1 = plantpca1) %>%
  mutate(climate_pca1 = bioclim_pca1) %>% 
  select(-bioclim_pca1) %>%
  mutate(inverts = ifelse(inverts>20,NA,inverts))

moduleTraitCor = cor(MEs0, datTraits, use = "p") 
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
moduleColors = V(sc.se_large)$color
moduleLabels = factor(moduleColors, levels = module_order, labels = paste0("M",c(1:ncol(MEs0))))

dat2 = cbind(datTraits, MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0)))))

Module-environment relationships plot

# labeledHeatmap(moduleTraitCor, xLabels = names(datTraits), yLabels = names(MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0))))))

extraRowLabels = names(MEs0 %>% rename_with(~ paste0("M",c(1:ncol(MEs0)))))

pdf("~/Dropbox/RoL/pitcher-16s/corr-plot.pdf", width = 10)
labeledHeatmap(moduleTraitCor, 
               xLabels = names(datTraits), 
               yLabels = names(MEs0),
               colors = blueWhiteRed(50),
               textMatrix = signif(ifelse(moduleTraitPvalue <= 0.05, moduleTraitPvalue, NA), digits = 1),
               ySymbols = extraRowLabels,
               cex.text = .75)
dev.off()
## quartz_off_screen 
##                 2

Relatedness within modules (SES MPD)

# add a "module" column to the tax table
ps.network = ps6Top %>% subset_taxa(OTUID %in% names(V(sc.se_large)))
tax_table(ps.network) = cbind(tax_table(ps.network), Module=V(sc.se_large)$color)

# subset the tax table to the Module variable
mods = data.frame(tax_table(ps.network)[,"Module"])
# manually add module ID to taxa
mods1 = mods %>% mutate(M1 = ifelse(Module==module_order[1],1,0),
                       M2 = ifelse(Module==module_order[2],1,0),
                       M3 = ifelse(Module==module_order[3],1,0),
                       M4 = ifelse(Module==module_order[4],1,0),
                       M5 = ifelse(Module==module_order[5],1,0))

# mods1 = mods %>% mutate(M1 = ifelse(Module==module_order[1],1,0),
#                        M2 = ifelse(Module==module_order[2],1,0),
#                        M3 = ifelse(Module==module_order[3],1,0),
#                        M4 = ifelse(Module==module_order[4],1,0),
#                        M5 = ifelse(Module==module_order[5],1,0),
#                        M6 = ifelse(Module==module_order[6],1,0))

mods = mods1 %>% select(-Module)

ifelse(ncol(mods) == ncol(MEs0), yes = "MODULE NUMBER CORRECT", no = "MODULE NUMBER IS INCORRECT!")
## [1] "MODULE NUMBER CORRECT"
# set up data 
rooted = phy_tree(ps.network)
match <- match.phylo.comm(phy = rooted, comm = t(mods))

# SES MPD of modules
mod_SESMPD = ses.mpd(match$comm, cophenetic(match$phy), null.model = "taxa.labels", abundance.weighted = T, runs = 9999)
# saveRDS(mod_SESMPD, "mod_SESMPD-OTU.RDS")
# mod_SESMPD = readRDS("mod_SESMPD-OTU.RDS")
# write.table(mod_SESMPD %>% mutate(module = rownames(.)), "SESMPD_table.csv", sep = ",", row.names = F)
kable(mod_SESMPD %>% select(c(ntaxa,mpd.obs.z,mpd.obs.p)), digits = 4) 
ntaxa mpd.obs.z mpd.obs.p
M1 19 -2.5528 0.0046
M2 18 -0.6494 0.2594
M3 17 1.2681 0.8977
M4 16 -0.0442 0.4894
M5 26 -0.6196 0.2695

Diversity within modules

sespd = ses.pd(match$comm, match$phy)
write.table(sespd %>% mutate(module = rownames(.)), "SESPD_table.csv", sep = ",", row.names = F)
sespd

Regression of modules with environmental data

Latitude

ggplot(dat2, aes(latitude, M1)) +geom_point()

mod1 = lm(M1~latitude, data=dat2)
anova(mod1)
summary(mod1)
## 
## Call:
## lm(formula = M1 ~ latitude, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12114 -0.06155 -0.01461  0.05604  0.24152 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.427741   0.059158   7.230 8.75e-11 ***
## latitude    -0.009672   0.001326  -7.294 6.40e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08001 on 103 degrees of freedom
## Multiple R-squared:  0.3406, Adjusted R-squared:  0.3342 
## F-statistic: 53.21 on 1 and 103 DF,  p-value: 6.402e-11

Volume

ggplot(dat2, aes(scale(volume), scale(M3))) +geom_point()
## Warning: Removed 3 rows containing missing values (geom_point).

mod2 = lm(scale(M3)~scale(volume), data=dat2)
summary(mod2)
## 
## Call:
## lm(formula = scale(M3) ~ scale(volume), data = dat2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3457 -0.5771 -0.3493  0.4507  3.0648 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.008762   0.090160  -0.097    0.923    
## scale(volume)  0.374263   0.090605   4.131 7.52e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9106 on 100 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.1458, Adjusted R-squared:  0.1372 
## F-statistic: 17.06 on 1 and 100 DF,  p-value: 7.515e-05

Associate OTUs with modules

Identify OTUs that have a high significance for variables as well as high module membership in interesting modules.

Module membership is defined as the correlation between an OTU’s relative abundance in a sample and the module eigengene for that sample

To find intramodular hub genes, one can use the module membership measure K, Equation 6. Figure 4E shows a scatterplot between the body weight based gene significance measure GS i ,

Latitude

# Define variable latitude containing the latitude column of datTrait
latitude = as.data.frame(datTraits$latitude)
names(latitude) = "latitude"
# names (colors) of the modules
# modNames = substring(names(MEs0), 3)
modNames = names(MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0)))))

# module membership is the correlation between OTU table and module eigengenes
# meaning the module is more or less influenced by the abundance of a taxa
geneModuleMembership = as.data.frame(cor(datExpr, MEs0, use = "p"))
rownames(geneModuleMembership) = as.vector(tax_table(ps.network)[,"OTUID"])
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))

names(geneModuleMembership) = paste("MM_", modNames, sep="")
names(MMPvalue) = paste("p.MM_", modNames, sep="")

geneTraitSignificance = as.data.frame(cor(datExpr, latitude, use = "p")); # correlate relative abundance with latitude
rownames(geneTraitSignificance) = as.vector(tax_table(ps.network)[,"OTUID"])

GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))

names(geneTraitSignificance) = paste("GS.", names(latitude), sep="")
names(GSPvalue) = paste("p.GS.", names(latitude), sep="")

module = "M1"
column = match(module, modNames)
# moduleGenes = moduleColors==module
moduleGenes = moduleLabels==module

verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Taxa significance for latitude",
                   main = paste("Module membership vs. OTU significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, abline=T)

plot_lat_mod = data.frame(MM=abs(geneModuleMembership[moduleGenes, "MM_M1"]),
                     GS=abs(geneTraitSignificance[moduleGenes,"GS.latitude"]))

lat.cor = cor.test(plot_lat_mod$MM, plot_lat_mod$GS)

ggplot(plot_lat_mod, (aes(MM,GS))) +
  geom_point(size=3) +
  geom_smooth(method = "lm", formula = y ~ x + I(x), size = 1, se=F, color="black") +
  xlab("Module membership") +
  ylab("Correlation with latitude") +
  theme_minimal() +
  annotate("text", x=0.7, y=0.1, label=paste0("italic(r)", "==", round(lat.cor$estimate, 2)), parse=T) + 
  annotate("text", x=0.7, y=0.05, label=paste0("italic(p)", "==", round(lat.cor$p.value, 3)), parse=T)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

ggsave("latitude-significance-plot.png")
## Saving 7 x 5 in image
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Volume

# Define variable volume containing the volume column of datTrait
volume = as.data.frame(datTraits$volume)
names(volume) = "volume"

geneTraitSignificance = as.data.frame(cor(datExpr, volume, use = "p")); # correlate abundance with volume
rownames(geneTraitSignificance) = as.vector(tax_table(ps.network)[,"OTUID"])

GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))

names(geneTraitSignificance) = paste("GS.", names(volume), sep="")
names(GSPvalue) = paste("p.GS.", names(volume), sep="")

module = "M3"
column = match(module, modNames)
moduleGenes = moduleLabels==module

verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "OTU correlation with volume",
                   main = paste("Module membership vs. OTU correlation\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, abline = T)

plot_vol_mod = data.frame(MM=abs(geneModuleMembership[moduleGenes, "MM_M3"]),
                     GS=abs(geneTraitSignificance[moduleGenes,"GS.volume"]))

vol.cor = cor.test(plot_vol_mod$MM, plot_vol_mod$GS)

ggplot(plot_vol_mod, (aes(MM,GS))) +
  geom_point(size=3) +
  geom_smooth(method = "lm", formula = y ~ x + I(x), size = 1, se=F, color="black") +
  xlab("Module membership") +
  ylab("Correlation with volume") +
  theme_minimal() +
  annotate("text", x=0.7, y=0.1, label=paste0("italic(r)", "==", round(vol.cor$estimate, 2)), parse=T) + 
  annotate("text", x=0.7, y=0.05, label=paste0("italic(p)", "==", round(vol.cor$p.value, 3)), parse=T)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

ggsave("volume-significance-plot.png")
## Saving 7 x 5 in image
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

OTUs in the modules

Latitude

latitudeOTUs = ps.network %>%
  transform_sample_counts(function(x) x / sum(x)) %>%
  subset_taxa(Module==module_order[1])

# the WGCNA way: pull OTUs with high module membership in modules related to traits

latitudeOTUsTable = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M1", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[1])
write.table(latitudeOTUsTable, "latitudeOTUs.csv", sep = ',', row.names = F)
kable(latitudeOTUsTable, caption = "Latitude module taxa by module membership", row.names = F)
Latitude module taxa by module membership
Kingdom Phylum Class Order Family Genus Species OTUID Module Module.Membership
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Comamonadaceae NA NA OTU_1354 forestgreen 0.7466487
Bacteria Proteobacteria Alphaproteobacteria Caulobacterales Caulobacteraceae NA NA OTU_841 forestgreen 0.6457378
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Oxalobacteraceae NA NA OTU_1480 forestgreen 0.6376950
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Comamonadaceae NA NA OTU_1360 forestgreen 0.6263301
Bacteria Proteobacteria Alphaproteobacteria Sphingomonadales Sphingomonadaceae Novosphingobium NA OTU_1001 forestgreen 0.6244884
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Oxalobacteraceae NA NA OTU_1497 forestgreen 0.5778293
Bacteria Proteobacteria Alphaproteobacteria Sphingomonadales Sphingomonadaceae NA NA OTU_989 forestgreen 0.5561212
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Oxalobacteraceae NA NA OTU_1481 forestgreen 0.5235389
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Chromobacteriaceae Aquitalea NA OTU_1284 forestgreen 0.4864466
Bacteria Bacteroidota Bacteroidia Sphingobacteriales Sphingobacteriaceae Mucilaginibacter NA OTU_64 forestgreen 0.4654385
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Burkholderiaceae Burkholderia-Caballeronia-Paraburkholderia NA OTU_1422 forestgreen 0.3955791
Bacteria Proteobacteria Alphaproteobacteria Sphingomonadales Sphingomonadaceae Sphingomonas NA OTU_965 forestgreen 0.3713802
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Comamonadaceae NA NA OTU_1398 forestgreen 0.3589641
Bacteria Bacteroidota Bacteroidia Sphingobacteriales Sphingobacteriaceae Mucilaginibacter NA OTU_62 forestgreen 0.3272241
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Chromobacteriaceae Chromobacterium NA OTU_1293 forestgreen 0.3217597
Bacteria Proteobacteria Gammaproteobacteria Xanthomonadales Rhodanobacteraceae NA NA OTU_1132 forestgreen 0.2164589
Bacteria Acidobacteriota Acidobacteriae Acidobacteriales Acidobacteriaceae_(Subgroup_1) Granulicella NA OTU_650 forestgreen 0.2122788
Bacteria Proteobacteria Alphaproteobacteria Acetobacterales Acetobacteraceae Acidisoma NA OTU_609 forestgreen 0.2020236
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Oxalobacteraceae NA NA OTU_1434 forestgreen 0.1840743
plot_tree(latitudeOTUs, label.tips = "Genus", ladderize="left", size = "abundance", color = "Family",
          nodelabf = nodeplotblank)

ggsave("latitude_tree.png", width = 12, height =10)

Volume

volumeOTUs = ps.network %>%
  transform_sample_counts(function(x) x / sum(x)) %>%
  subset_taxa(Module==module_order[3])

# the WGCNA way: pull OTUs with high module membership in modules related to traits
volumeOTUsTable = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M3", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[3])
write.table(volumeOTUsTable, "volumeOTUs.csv", sep = ',', row.names = F)
kable(volumeOTUsTable, caption = "Volume module taxa by module membership", row.names = F)
Volume module taxa by module membership
Kingdom Phylum Class Order Family Genus Species OTUID Module Module.Membership
Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Kaistiaceae Kaistia NA OTU_858 lightskyblue2 0.8188643
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales NA NA NA OTU_1402 lightskyblue2 0.7819199
Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Rhizobiaceae NA NA OTU_809 lightskyblue2 0.7742836
Bacteria Actinobacteriota Actinobacteria Micrococcales Microbacteriaceae NA NA OTU_430 lightskyblue2 0.7266081
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Comamonadaceae NA NA OTU_1351 lightskyblue2 0.7241397
Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Xanthobacteraceae NA NA OTU_1052 lightskyblue2 0.7220112
Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Xanthobacteraceae NA NA OTU_1054 lightskyblue2 0.6015189
Bacteria Actinobacteriota Actinobacteria Corynebacteriales Nocardiaceae Rhodococcus NA OTU_475 lightskyblue2 0.5462574
Bacteria Proteobacteria Alphaproteobacteria Acetobacterales Acetobacteraceae Roseococcus NA OTU_581 lightskyblue2 0.4742614
Bacteria Actinobacteriota Actinobacteria Micrococcales Microbacteriaceae NA NA OTU_1384 lightskyblue2 0.4564328
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Chitinibacteraceae Silvimonas NA OTU_1331 lightskyblue2 0.3307856
Bacteria Bacteroidota Bacteroidia Bacteroidales Dysgonomonadaceae Dysgonomonas NA OTU_80 lightskyblue2 0.2876761
Bacteria Bacteroidota Bacteroidia Sphingobacteriales Sphingobacteriaceae Mucilaginibacter NA OTU_66 lightskyblue2 0.2575401
Bacteria Proteobacteria Gammaproteobacteria Burkholderiales Comamonadaceae NA NA OTU_1355 lightskyblue2 0.2494578
Bacteria Bacteroidota Bacteroidia Sphingobacteriales Sphingobacteriaceae Pedobacter NA OTU_20 lightskyblue2 0.2429168
Bacteria Bacteroidota Bacteroidia Chitinophagales Chitinophagaceae Taibaiella NA OTU_103 lightskyblue2 0.2069060
Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Rhizobiaceae Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium NA OTU_792 lightskyblue2 0.1116146
plot_tree(volumeOTUs, label.tips = "Genus", ladderize="left", size = "abundance", color = "Family",
          nodelabf = nodeplotblank)

ggsave("volume_tree.png", width = 12, height =10)

M2, M4, and M5

M2 = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M2", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[2])

M4 = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M4", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[4])

M5 = tax_table(ps.network) %>% data.frame() %>%
  mutate(Module.Membership = geneModuleMembership[,match("M5", modNames)]) %>% 
  arrange(-abs(Module.Membership)) %>%
  filter(Module==module_order[5])

Network module plot

# label hubs for each module with OTU ID
key = c(volumeOTUsTable[c(1:3),8], 
        latitudeOTUsTable[(1:3),8],
        M2[(1:3),8],
        M4[(1:3),8],
        M5[(1:3),8])

pdf("~/Dropbox/RoL/pitcher-16s/Manuscript/module-plot-with-labels-test.pdf", width = 10, height = 10)
plot(sc.se_large,
     vertex.label=ifelse(names(V(sc.se_large)) %in% key, names(V(sc.se_large)), NA), 
     vertex.size = igraph::degree(sc.se_large)*1.5, 
     vertex.color=moduleColors,
     vertex.label.family="Helvetica",
     vertex.label.font=2,
     vertex.label.cex = 0.75,
     vertex.label.color = "black")
dev.off()
## quartz_off_screen 
##                 2
# table to go with plot
network_table = rbind(latitudeOTUsTable[c(1:3),c(1:6,8)],
                      M2[c(1:3),c(1:6,8)],
                      volumeOTUsTable[c(1:3),c(1:6,8)],
                      M4[c(1:3),c(1:6,8)],
                      M5[c(1:3),c(1:6,8)]) %>%
  mutate(Module = c(rep("M1", 3), rep("M2",3), rep("M3",3), rep("M4", 3), rep("M5", 3)))
write.table(network_table, "network-table.csv", sep = ',', row.names = F)

For each community, find node with higest betweenness

# https://towardsdatascience.com/community-detection-in-r-using-communities-of-friends-characters-2161e845c198

communities <- data.frame() 

for (i in unique(V(sc.se_large)$color)) { 
# create subgraphs for each community 
  subgraph <- induced_subgraph(sc.se_large, v = V(sc.se_large)$name[V(sc.se_large)$color == i]) 
# get size of each subgraph 
  size <- igraph::gorder(subgraph) 
# get betweenness centrality 
  btwn <- igraph::betweenness(subgraph) 
  communities <- communities %>% 
    dplyr::bind_rows(data.frame(module = i, 
                                n_OTUs = size,
                                most_important = names(which(btwn == max(btwn))) 
                                ) 
    ) 
  } 

knitr::kable(communities %>% dplyr::select(module, n_OTUs, most_important))

Small = which(table(fc$membership) < SizeToKeep)
Keep = V(sc.se)[!(fc$membership %in% Small)]
sc.se_large = induced.subgraph(sc.se, Keep)

Heatmap of module membership

library(gplots)
poy <- colorRampPalette(c("purple", "orange", "yellow")) 

#https://stackoverflow.com/questions/9777411/recenter-heatmap-2-in-r
mm = geneModuleMembership
#mm = mm[rowSums(abs(mm)) > 0.4,]
mm = mm[rowSums(mm) > 0.4,]
  
names(mm) = gsub("MM", "", names(mm))
taxanames = ps.network %>% subset_taxa(OTUID %in% rownames(mm)) %>% tax_table() %>% data.frame() %>%
  mutate(OTUID = gsub("OTU_", "", OTUID)) %>%
  mutate(across(Order, ~coalesce(.,Class))) %>% 
  mutate(across(Family, ~coalesce(.,Order))) %>%
  mutate(across(Genus, ~na_if(., "uncultured"))) %>%
  mutate(across(Genus, ~coalesce(.,Family))) %>%
  mutate(Genus = paste(Genus, OTUID))

rownames(mm) = taxanames$Genus

#tiff(filename = "/Users/zlm3critter/Dropbox/RoL/pitcher-16s/re-analysis/Heatmap.tiff")

heatmap.2(as.matrix(t(mm)),
          dendrogram = "none",
          trace = "none",
          key = F,
          lhei = c(0.2,23), # row height
          lwid = c(0.1,23),
          margins = c(20,10),
          adjCol = c(0.95,0.5))

dev.off()
## null device 
##           1

4.Identify top taxa in modules with BLASTn

Description

Use the refseq database to blast the top 5 taxa (based on module membership) in each module. Return a table with the ID that can be looked up on NCBI nucleotide. This avoids having to blast the sequences individually and keeps everything organized.

Code

fasta = readDNAStringSet("~/Dropbox/RoL/pitcher-16s/qiime2-stuff/dada2-denoise-cutadapt-primer/representative_sequences.fasta/dna-sequences.fasta")

# # Only need to do this once
# download.file("https://ftp.ncbi.nlm.nih.gov/blast/db/16S_ribosomal_RNA.tar.gz", "16S_ribosomal_RNA.tar.gz", mode='wb')
# untar("16S_ribosomal_RNA.tar.gz", exdir="16SMicrobialDB")

bl <- blast(db="~/Dropbox/RoL/pitcher-16s/re-analysis/16SMicrobialDB/16S_ribosomal_RNA")

# make a table of the top 3-5 taxa in the interesting modules
seq_latitude = fasta[names(fasta) %in% rownames(latitudeOTUsTable)[1:5]]
seq_volume = fasta[names(fasta) %in% rownames(volumeOTUsTable)[1:5]]

bl_latitude = predict(bl,seq_latitude,BLAST_args = "-perc_identity 99")
bl_volume = predict(bl,seq_volume,BLAST_args = "-perc_identity 99")

bl_latitude_Table = full_join(latitudeOTUsTable[1:5,] %>% mutate(QueryID=rownames(latitudeOTUsTable[1:5,])), bl_latitude)
write.table(bl_latitude_Table, "blast_table_latitude.csv", sep=",", row.names = F)

bl_volume_Table = full_join(volumeOTUsTable[1:5,] %>% mutate(QueryID=rownames(volumeOTUsTable[1:5,])), bl_volume)
write.table(bl_volume_Table, "blast_table_volume.csv", sep=",", row.names = F)