The network is constructed with OTUs present in at least 6 samples having a total abundance of at least 1%. The spatial analysis (distance decay and NRI) is conducted with all OTUs.
tree_file = '~/Dropbox/RoL/pitcher-16s/qiime2-stuff/rooted-tree/tree.nwk'
df = '~/Dropbox/RoL/pitcher-16s/site-data-bioclim-pca.txt'
ps = merge_phyloseq(import_biom('~/Dropbox/RoL/pitcher-16s/table-with-taxonomy-converted.biom', parseFunction = parse_taxonomy_greengenes),
sample_data(read.csv(df, sep = '\t', row.names = 1)),
read_tree(tree_file))
tax_table(ps) = tax_table(ps)[,1:7]
colnames(tax_table(ps)) = c('Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species')
# filter out taxa unassigned at Kingdom level
ps = ps %>% subset_taxa( Kingdom != "Unassigned")
# drop florida because it's a different subspecies
ps = ps %>% subset_samples(site_id != 'FLK')
# drop sample with ph = 1
# ps = ps %>% subset_samples(ph>1)
# agglomerate into OTUs
# ps_OTU = ps %>% tip_glom(h = 0.05, hcfun=hclust)
# saveRDS(ps_OTU, "OTUs_hclust_0.05")
ps_OTU = readRDS("OTUs_hclust_0.05")
# add OTUID column
#tax_table(ps_OTU) = cbind(tax_table(ps_OTU), OTUID=paste0("OTU_",1:length(taxa_names(ps_OTU))) )
ps6 = filter_taxa(ps_OTU, function(x) sum(x >= 1) >= 6, TRUE) %>% rarefy_even_depth(rngseed = 777)
flist = ps6 %>% transform_sample_counts(function(x) x / sum(x)) %>% filter_taxa(function(x) sum(x>= 0.01) > 0)
ps6Top = prune_taxa(flist, ps6)
# transform counts
ps6Top.hel = ps6Top
otu_table(ps6Top.hel) = otu_table(decostand(t(otu_table(ps6Top)), method="hellinger"), taxa_are_rows = F)
sdata = data.frame(sample_data(ps_OTU)) %>%
mutate(mean_temp = bio01)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(ggrepel)
library(RColorBrewer)
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
register_google(key='AIzaSyAhlRC9I2MxpYhTzoY4o8ufO2ex7YQzIPg')
#namap = ggmap::get_map(c(lon = -80, lat = 41), zoom = 4, maptype = 'terrain-background', source = 'stamen')
namap = ggmap::get_map(c(left=-90, right=-65, bottom=27, top=55), maptype = 'terrain-background', color = "bw", source = 'stamen')
## Source : http://tile.stamen.com/terrain/5/8/10.png
## Source : http://tile.stamen.com/terrain/5/9/10.png
## Source : http://tile.stamen.com/terrain/5/10/10.png
## Source : http://tile.stamen.com/terrain/5/8/11.png
## Source : http://tile.stamen.com/terrain/5/9/11.png
## Source : http://tile.stamen.com/terrain/5/10/11.png
## Source : http://tile.stamen.com/terrain/5/8/12.png
## Source : http://tile.stamen.com/terrain/5/9/12.png
## Source : http://tile.stamen.com/terrain/5/10/12.png
## Source : http://tile.stamen.com/terrain/5/8/13.png
## Source : http://tile.stamen.com/terrain/5/9/13.png
## Source : http://tile.stamen.com/terrain/5/10/13.png
m1 = ggmap(namap)
# points
m1 + geom_point(data = sdata %>% select(longitude, latitude, mean_temp) %>% unique(),
aes(longitude, latitude), size=4) +
theme(axis.title = element_blank()) +
labs(color="Mean annual\ntemp (°C)")
ggsave('~/Dropbox/RoL/pitcher-16s/map.png', width = 10)
## Saving 10 x 5 in image
#sdata %>% filter(!complete.cases(.))
plantdat = sdata %>% select(rosette_diameter_1,rosette_diameter_2,pitcher_length,pitcher_width,keel_width,mouth_diameter,lip_thickness) %>% mutate(mouth_diameter=replace_na(mouth_diameter, mean(mouth_diameter, na.rm=T)))
plantpca = princomp(plantdat, cor = T)
plantpca1 = plantpca$scores[,1]
# se.ps6Top <- spiec.easi(ps6Top, method='mb', lambda.min.ratio=1e-1,
# nlambda=30, pulsar.params=list(rep.num=99, ncores=4))
# saveRDS(se.ps6Top, "speiec-easi-OTU-network.rds")
se.ps6Top = readRDS("speiec-easi-OTU-network.rds")
Generate bootstrapped distribution - takes a long time to run!!
# sparcc.matrix.boot <- sparccboot(t(otu_table(ps6Top)),R = 1000, ncpus = 6)
# sparcc.pvals = pval.sparccboot(sparcc.matrix.boot)
# saveRDS(sparcc.matrix.boot, "sparcc.matrix.boot.1000.RDS")
# saveRDS(sparcc.pvals, "sparcc.pvals.1000.RDS")
# sparcc algorithm does CLR normalization of counts!!!
# load bootstrapped matrix and pvals
sparcc.matrix.boot = readRDS("sparcc.matrix.boot.100.RDS")
sparcc.pvals = readRDS("sparcc.pvals.100.RDS")
# get mean and SD correlations from normalized data
sparcc.matrix = sparcc(t(otu_table(ps6Top)), th=0.1)
# define cutoff as mean + 1 SD
sparcc.cutoff = mean(abs(sparcc.matrix$Cor)) + sd(abs(sparcc.matrix$Cor))
# get matrix of bootstrapped pvals
dim = ntaxa(ps6Top)
mat.p = matrix(rep(1,dim*dim), dim, dim)
upperTriangle(mat.p) <- p.adjust(sparcc.pvals$pvals, "fdr")
lowerTriangle(mat.p) <- upperTriangle(mat.p, byrow=T)
# sparcc.adj <- replace_na( ifelse(abs(sparcc.matrix$Cor) > sparcc.cutoff & mat.p <= 0.01, 1, 0), replace = 0 )
# Positive correlations only!
sparcc.adj = replace_na( ifelse(sparcc.matrix$Cor > sparcc.cutoff & mat.p <= 0.01, 1, 0), replace = 0 )
rownames(sparcc.adj) <- as.vector(tax_table(ps6Top)[,"OTUID"])
colnames(sparcc.adj) <- as.vector(tax_table(ps6Top)[,"OTUID"])
sc.ps6Net = graph.adjacency(sparcc.adj,mode = "undirected",diag = FALSE)
# plot the two networks
se.ps6Net <- adj2igraph(getRefit(se.ps6Top), vertex.attr=list(name=as.vector(tax_table(ps6Top)[,"OTUID"])))
plot(se.ps6Net, vertex.label=NA, vertex.size=4)
plot(sc.ps6Net, vertex.label=NA, vertex.size=4)
sc.se = igraph::intersection(sc.ps6Net, se.ps6Net)
# taxa=ps6Top %>% subset_taxa(taxa_names(ps6Top) %in% names(V(sc.se))) %>% tax_table(.)
# vertex_attr(sc.se, "name") <- as.vector(taxa[,'Genus'])
# remove isolated verticies
verticesToRemove <- V(sc.se)[igraph::degree(sc.se) ==0]
sc.se <- delete.vertices(sc.se, verticesToRemove)
#vertex_attr(sc.se_plot, "name") <- as.vector(tax_table(ps_OTU)[,'OTUID'])
pdf("~/Dropbox/RoL/pitcher-16s/Manuscript/full-network.pdf")
plot(cluster_leading_eigen(sc.se, weights=NA, options = list(maxiter=10000)), sc.se, vertex.label=NA, vertex.size=4)
dev.off()
## quartz_off_screen
## 2
network_results = function(net) {
E(net)$weight = NA
# modularity
fc = cluster_leading_eigen(net,weights =NULL)
modularity = modularity(net,membership(fc))
# graph properties
results = data.frame(clustering.coefficient=transitivity(net), # clustering.coefficient
modularity=modularity,# modularity
mean.degree= mean(igraph::degree(net)),# mean degree
size=length(E(net)),# size - number of edges
order=length(V(net)),# order - number of vertices
mean.distance=mean_distance(net),#average path length
norm.degree=mean(igraph::degree(net)/length(V(net))),#normalized degree
betweenness.centrality=centralization.betweenness(net)$centralization #betweenness centrality
)
return(results)
}
network_results(sc.se)
Is the network scale-free? Yes. A significant p-value indicates that the network does NOT follow a power law.
fpl = fit_power_law(igraph::degree_distribution(sc.se))
fpl$KS.p
## [1] 0.5178808
Compare random
compare_random <- function(net, reps) {
E(net)$weight = NA
g2 <- replicate(reps, erdos.renyi.game(length(V(net)),length(E(net)), type=c("gnm")), simplify = FALSE)
fc.random = sapply(g2,cluster_leading_eigen, weights=NULL, options = list(maxiter=10000))
df = data.frame(random.cc=sapply(g2,transitivity),
random.path=sapply(g2,average.path.length),
random.modularity=sapply(fc.random, modularity) )
return(df)
}
ERrandoms = compare_random(sc.se, 999)
t.test(ERrandoms$random.modularity, mu = network_results(sc.se)$modularity)
##
## One Sample t-test
##
## data: ERrandoms$random.modularity
## t = -210.78, df = 998, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.7011091
## 95 percent confidence interval:
## 0.5230160 0.5263014
## sample estimates:
## mean of x
## 0.5246587
t.test(ERrandoms$random.cc, mu = network_results(sc.se)$clustering.coefficient)
##
## One Sample t-test
##
## data: ERrandoms$random.cc
## t = -476.73, df = 998, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.1912752
## 95 percent confidence interval:
## 0.02264228 0.02402486
## sample estimates:
## mean of x
## 0.02333357
The spatial structure of the community was tested with a distance decay analysis and by looking at the net relatedness index (NRI) across latitude
Spatial structure in the microbial assemblage data was tested with a Mantel correlogram as described in Legendre and Legendre 2012 based on Sokal (1986) and Oden and Sokal (1986). This test allows testing response data modeled by a non-Euclidean distance function (e.g., Jaccard and UniFrac here; Borcard and Legendre 2012). A positive Mantel statistic indicates positive spatial correlation. The distances were divided into 14 classes following Sturge’s rule and the test was applied with the vegan function ‘mantel.correlog’ in R. A holm correction is applied for multiple testing. The values in the second half of the distance classes are generally not interpreted because they are based on a smaller number of pairs and therefore have lower power.
metadata = data.frame(sample_data(ps_OTU))
gendis = as.matrix(UniFrac(ps_OTU))
geodis = geosphere::distm(cbind(metadata$longitude, metadata$latitude), fun = geosphere::distHaversine )
otudis = vegdist(t(otu_table(ps_OTU)), method = "jaccard", binary = T)
genvec = as.vector(gendis[upper.tri(gendis, diag = F)])
geovec = as.vector(geodis[upper.tri(geodis, diag = F)])
otuvec = as.vector(otudis)
#breakpoints = seq(from=min(geovec[geovec>0]), to=max(geovec), length.out=15)
mgen = mantel(gendis, geodis, na.rm = T)
motu = mantel(otudis, geodis)
# ddgen = mantel.correlog(gendis, geodis/1000, cutoff = F, nperm = 999, r.type = "spearman")
#saveRDS(ddgen, "distance-decay-UniFrac.RDS")
ddgen = readRDS("distance-decay-UniFrac.RDS")
#ddotu = mantel.correlog(otudis, geodis/1000, cutoff = F, nperm = 999)
#saveRDS(ddotu, "distance-decay-Jaccard.RDS")
ddotu = readRDS("distance-decay-Jaccard.RDS")
ddgen.df = data.frame(ddgen[1]$mantel.res) %>% mutate(signif = ifelse(Pr.corrected. <= 0.05, T,F))
ddotu.df = data.frame(ddotu[1]$mantel.res) %>% mutate(signif = ifelse(Pr.corrected. <= 0.05, T,F))
ggplot(ddotu.df, aes(class.index, Mantel.cor)) +
geom_point(aes(shape=signif), size=5) +
geom_line() +
geom_hline(yintercept = 0) +
annotate("rect", xmin=1270,xmax=Inf,ymin=-Inf,ymax=Inf, alpha=0.3) +
scale_shape_manual(values=c(1,16),guide="none") +
xlab("Distance classes (km)") +
ylab(expression(paste("Mantel's ", italic("r")))) +
ggtitle("Mantel correlogram based on the binary Jaccard index") +
theme_bw()
ggsave("distance-decay-Jaccard.png")
## Saving 7 x 5 in image
ggplot(ddgen.df, aes(class.index, Mantel.cor)) +
geom_point(aes(shape=signif), size=5) +
geom_line() +
geom_hline(yintercept = 0) +
annotate("rect", xmin=1270,xmax=Inf,ymin=-Inf,ymax=Inf, alpha=0.3) +
scale_shape_manual(values=c(1,16),guide="none") +
xlab("Distance classes (km)") +
ylab(expression(paste("Mantel's ", italic("r")))) +
ggtitle("Mantel correlogram based on the UniFrac index")+
theme_bw()
ggsave("distance-decay-UniFrac.png")
## Saving 7 x 5 in image
rooted = phy_tree(ps_OTU)
otu = data.frame(otu_table(ps_OTU))
match <- match.phylo.comm(phy = rooted, comm = t(otu))
# ses.mpd.taxlabels = ses.mpd(match$comm, cophenetic(match$phy), null.model="taxa.labels", abundance.weighted = TRUE)
# saveRDS(ses.mpd.taxlabels, "ses.mpd.taxlabels.RDS")
ses.mpd.taxlabels = readRDS("ses.mpd.taxlabels.RDS")
#ses.mntd.taxlabels = ses.mntd(match$comm, cophenetic(match$phy), null.model="taxa.labels", abundance.weighted = TRUE)
# add to data frame
# df.taxlabels = data.frame(sample_data(ps)) %>%
# mutate(mpd.obs.z = ses.mpd.taxlabels$mpd.obs.z,
# mpd.obs.p = ses.mpd.taxlabels$mpd.obs.p,
# mntd.obs.z = ses.mntd.taxlabels$mntd.obs.z,
# mntd.obs.p = ses.mntd.taxlabels$mntd.obs.p,
# NRI = -1*mpd.obs.z,
# NTI = -1*mntd.obs.z)
df.taxlabels = data.frame(sample_data(ps_OTU)) %>%
mutate(mpd.obs.z = ses.mpd.taxlabels$mpd.obs.z,
mpd.obs.p = ses.mpd.taxlabels$mpd.obs.p,
NRI = -1*mpd.obs.z)
df.taxlabels = df.taxlabels %>% mutate(signif_NRI = ifelse(mpd.obs.p <= 0.05 | mpd.obs.p >= 0.95, T, F))
#df.taxlabels = df.taxlabels %>% mutate(signif_NTI = ifelse(mntd.obs.p <= 0.05 | mntd.obs.p >= 0.95, T, F))
# quadratic model
df.taxlabels$latitude2 = df.taxlabels$latitude^2
mod1 = lm(NRI~latitude + latitude2, df.taxlabels)
mod2 = lm(NRI~latitude, df.taxlabels)
# anova tests if more complex model is significantly better fit to data
anova(mod2, mod1)
fstat <- summary(mod1)$fstatistic
r2 = round(summary(mod1)$adj.r.squared,2)
pval = round(pf(fstat[1], fstat[2], fstat[3], lower.tail=FALSE),4)
p1 = ggplot(df.taxlabels, aes(latitude, NRI)) +
geom_point(aes(shape=signif_NRI)) +
geom_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1) +
annotate("text", x=52,y=-0.25, label = paste0("italic(R)^2", " == ", r2 ), parse = TRUE)+
annotate("text", x=52,y=-0.5, label = paste0("italic(P) == ", pval), parse = TRUE) +
scale_shape_manual(values=c(1,16),guide="none")
# print the results
p1
# save the results
ggsave('NRI-Latitute-taxa.labels-OTU.png', p1)
## Saving 7 x 5 in image
# test for spatial correlation of NRI values to check assumptions of regression
# e.g., are the residuals
# identical points are not allowed in neighbor map so I need to take the mean of the residuals per site
# make neighbor list for moran's test
library(spdep)
COORDS = metadata %>% select(longitude, latitude) %>% unique()
IDs <- unique(metadata$site_id)
# knn method
nbList1 = knn2nb(knearneigh(COORDS, longlat = T, k = 7), row.names=IDs)
plot(nbList1, COORDS)
# convert nb to listw
knnListw = nb2listw(nbList1)
# take average of NRI residuals per site
site_NRI_resids = df.taxlabels %>%
mutate(NRI_resids = mod1$residuals) %>%
group_by(site_id) %>%
summarize(site_NRI_resids = mean(NRI_resids))
moran.test(site_NRI_resids$site_NRI_resids, knnListw)
# http://www.geo.hunter.cuny.edu/~ssun/R-Spatial/spregression.html
Modules were detected within the co-occurrence network and variables were associated with the modules.
sc.se = intersection(sc.ps6Net, se.ps6Net)
verticesToRemove <- V(sc.se)[igraph::degree(sc.se) ==0]
sc.se <- delete.vertices(sc.se, verticesToRemove)
#fc = cluster_leading_eigen(sc.se, options = list(maxiter=10000))
fc = cluster_leading_eigen(sc.se)
table(fc$membership)
##
## 1 2 3 4 5 6 7 8 9 10
## 17 2 26 19 18 9 8 16 4 7
N.Mods = length(table(fc$membership))
# change color hex code to name so I can see what module is identified
my_colors = RColorBrewer::brewer.pal(N.Mods,"Paired")
color_names = data.frame(color.name = sapply(my_colors, color.id))$color.name
comps = membership(fc)
# https://stackoverflow.com/questions/51271730/how-to-remove-small-communities-using-igraph-in-r
## Which nodes should be kept?
SizeToKeep = 10
Small = which(table(fc$membership) < SizeToKeep)
Keep = V(sc.se)[!(fc$membership %in% Small)]
sc.se_large = induced.subgraph(sc.se, Keep)
sc.se_large$community = fc$membership[!(fc$membership %in% Small)]
# module names / colors
comps = comps[!(fc$membership %in% Small)]
V(sc.se_large)$color = color_names[comps]
# prep data
datOTUhel = otu_table(ps6Top.hel %>% subset_taxa(OTUID %in% names(V(sc.se_large))) ) # taxa are cols
datExpr = data.frame(datOTUhel)
# Define numbers of genes and samples
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# Get Module Eigengenes per cluster
MEs <- moduleEigengenes(datExpr, colors = V(sc.se_large)$color)$eigengenes
# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs)
module_order = names(MEs0) %>% gsub("ME","", .)
datTraits = data.frame(sample_data(ps6Top)) %>%
select(-c(subspec,site_id,Description,site_id.1, bio01,bio12)) %>%
select(-c(rosette_diameter_1,rosette_diameter_2,pitcher_length,pitcher_width,keel_width,mouth_diameter,lip_thickness)) %>%
select(-c(total_mosquito, total_midge, total_prey, ants) ) %>% # keeping inverts
mutate(plant_pca1 = plantpca1) %>%
mutate(climate_pca1 = bioclim_pca1) %>%
select(-bioclim_pca1) %>%
mutate(inverts = ifelse(inverts>20,NA,inverts))
moduleTraitCor = cor(MEs0, datTraits, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
moduleColors = V(sc.se_large)$color
moduleLabels = factor(moduleColors, levels = module_order, labels = paste0("M",c(1:ncol(MEs0))))
dat2 = cbind(datTraits, MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0)))))
# labeledHeatmap(moduleTraitCor, xLabels = names(datTraits), yLabels = names(MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0))))))
extraRowLabels = names(MEs0 %>% rename_with(~ paste0("M",c(1:ncol(MEs0)))))
pdf("~/Dropbox/RoL/pitcher-16s/corr-plot.pdf", width = 10)
labeledHeatmap(moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs0),
colors = blueWhiteRed(50),
textMatrix = signif(ifelse(moduleTraitPvalue <= 0.05, moduleTraitPvalue, NA), digits = 1),
ySymbols = extraRowLabels,
cex.text = .75)
dev.off()
## quartz_off_screen
## 2
sespd = ses.pd(match$comm, match$phy)
write.table(sespd %>% mutate(module = rownames(.)), "SESPD_table.csv", sep = ",", row.names = F)
sespd
ggplot(dat2, aes(latitude, M1)) +geom_point()
mod1 = lm(M1~latitude, data=dat2)
anova(mod1)
summary(mod1)
##
## Call:
## lm(formula = M1 ~ latitude, data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12114 -0.06155 -0.01461 0.05604 0.24152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.427741 0.059158 7.230 8.75e-11 ***
## latitude -0.009672 0.001326 -7.294 6.40e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08001 on 103 degrees of freedom
## Multiple R-squared: 0.3406, Adjusted R-squared: 0.3342
## F-statistic: 53.21 on 1 and 103 DF, p-value: 6.402e-11
ggplot(dat2, aes(scale(volume), scale(M3))) +geom_point()
## Warning: Removed 3 rows containing missing values (geom_point).
mod2 = lm(scale(M3)~scale(volume), data=dat2)
summary(mod2)
##
## Call:
## lm(formula = scale(M3) ~ scale(volume), data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3457 -0.5771 -0.3493 0.4507 3.0648
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.008762 0.090160 -0.097 0.923
## scale(volume) 0.374263 0.090605 4.131 7.52e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9106 on 100 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.1458, Adjusted R-squared: 0.1372
## F-statistic: 17.06 on 1 and 100 DF, p-value: 7.515e-05
Identify OTUs that have a high significance for variables as well as high module membership in interesting modules.
Module membership is defined as the correlation between an OTU’s relative abundance in a sample and the module eigengene for that sample
To find intramodular hub genes, one can use the module membership measure K, Equation 6. Figure 4E shows a scatterplot between the body weight based gene significance measure GS i ,
# Define variable latitude containing the latitude column of datTrait
latitude = as.data.frame(datTraits$latitude)
names(latitude) = "latitude"
# names (colors) of the modules
# modNames = substring(names(MEs0), 3)
modNames = names(MEs0 %>% rename_with(~paste0("M",c(1:ncol(MEs0)))))
# module membership is the correlation between OTU table and module eigengenes
# meaning the module is more or less influenced by the abundance of a taxa
geneModuleMembership = as.data.frame(cor(datExpr, MEs0, use = "p"))
rownames(geneModuleMembership) = as.vector(tax_table(ps.network)[,"OTUID"])
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM_", modNames, sep="")
names(MMPvalue) = paste("p.MM_", modNames, sep="")
geneTraitSignificance = as.data.frame(cor(datExpr, latitude, use = "p")); # correlate relative abundance with latitude
rownames(geneTraitSignificance) = as.vector(tax_table(ps.network)[,"OTUID"])
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", names(latitude), sep="")
names(GSPvalue) = paste("p.GS.", names(latitude), sep="")
module = "M1"
column = match(module, modNames)
# moduleGenes = moduleColors==module
moduleGenes = moduleLabels==module
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Taxa significance for latitude",
main = paste("Module membership vs. OTU significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, abline=T)
plot_lat_mod = data.frame(MM=abs(geneModuleMembership[moduleGenes, "MM_M1"]),
GS=abs(geneTraitSignificance[moduleGenes,"GS.latitude"]))
lat.cor = cor.test(plot_lat_mod$MM, plot_lat_mod$GS)
ggplot(plot_lat_mod, (aes(MM,GS))) +
geom_point(size=3) +
geom_smooth(method = "lm", formula = y ~ x + I(x), size = 1, se=F, color="black") +
xlab("Module membership") +
ylab("Correlation with latitude") +
theme_minimal() +
annotate("text", x=0.7, y=0.1, label=paste0("italic(r)", "==", round(lat.cor$estimate, 2)), parse=T) +
annotate("text", x=0.7, y=0.05, label=paste0("italic(p)", "==", round(lat.cor$p.value, 3)), parse=T)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
ggsave("latitude-significance-plot.png")
## Saving 7 x 5 in image
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
# Define variable volume containing the volume column of datTrait
volume = as.data.frame(datTraits$volume)
names(volume) = "volume"
geneTraitSignificance = as.data.frame(cor(datExpr, volume, use = "p")); # correlate abundance with volume
rownames(geneTraitSignificance) = as.vector(tax_table(ps.network)[,"OTUID"])
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", names(volume), sep="")
names(GSPvalue) = paste("p.GS.", names(volume), sep="")
module = "M3"
column = match(module, modNames)
moduleGenes = moduleLabels==module
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "OTU correlation with volume",
main = paste("Module membership vs. OTU correlation\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, abline = T)
plot_vol_mod = data.frame(MM=abs(geneModuleMembership[moduleGenes, "MM_M3"]),
GS=abs(geneTraitSignificance[moduleGenes,"GS.volume"]))
vol.cor = cor.test(plot_vol_mod$MM, plot_vol_mod$GS)
ggplot(plot_vol_mod, (aes(MM,GS))) +
geom_point(size=3) +
geom_smooth(method = "lm", formula = y ~ x + I(x), size = 1, se=F, color="black") +
xlab("Module membership") +
ylab("Correlation with volume") +
theme_minimal() +
annotate("text", x=0.7, y=0.1, label=paste0("italic(r)", "==", round(vol.cor$estimate, 2)), parse=T) +
annotate("text", x=0.7, y=0.05, label=paste0("italic(p)", "==", round(vol.cor$p.value, 3)), parse=T)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
ggsave("volume-significance-plot.png")
## Saving 7 x 5 in image
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
latitudeOTUs = ps.network %>%
transform_sample_counts(function(x) x / sum(x)) %>%
subset_taxa(Module==module_order[1])
# the WGCNA way: pull OTUs with high module membership in modules related to traits
latitudeOTUsTable = tax_table(ps.network) %>% data.frame() %>%
mutate(Module.Membership = geneModuleMembership[,match("M1", modNames)]) %>%
arrange(-abs(Module.Membership)) %>%
filter(Module==module_order[1])
write.table(latitudeOTUsTable, "latitudeOTUs.csv", sep = ',', row.names = F)
kable(latitudeOTUsTable, caption = "Latitude module taxa by module membership", row.names = F)
| Kingdom | Phylum | Class | Order | Family | Genus | Species | OTUID | Module | Module.Membership |
|---|---|---|---|---|---|---|---|---|---|
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Comamonadaceae | NA | NA | OTU_1354 | forestgreen | 0.7466487 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Caulobacterales | Caulobacteraceae | NA | NA | OTU_841 | forestgreen | 0.6457378 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Oxalobacteraceae | NA | NA | OTU_1480 | forestgreen | 0.6376950 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Comamonadaceae | NA | NA | OTU_1360 | forestgreen | 0.6263301 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Sphingomonadales | Sphingomonadaceae | Novosphingobium | NA | OTU_1001 | forestgreen | 0.6244884 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Oxalobacteraceae | NA | NA | OTU_1497 | forestgreen | 0.5778293 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Sphingomonadales | Sphingomonadaceae | NA | NA | OTU_989 | forestgreen | 0.5561212 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Oxalobacteraceae | NA | NA | OTU_1481 | forestgreen | 0.5235389 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Chromobacteriaceae | Aquitalea | NA | OTU_1284 | forestgreen | 0.4864466 |
| Bacteria | Bacteroidota | Bacteroidia | Sphingobacteriales | Sphingobacteriaceae | Mucilaginibacter | NA | OTU_64 | forestgreen | 0.4654385 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Burkholderiaceae | Burkholderia-Caballeronia-Paraburkholderia | NA | OTU_1422 | forestgreen | 0.3955791 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Sphingomonadales | Sphingomonadaceae | Sphingomonas | NA | OTU_965 | forestgreen | 0.3713802 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Comamonadaceae | NA | NA | OTU_1398 | forestgreen | 0.3589641 |
| Bacteria | Bacteroidota | Bacteroidia | Sphingobacteriales | Sphingobacteriaceae | Mucilaginibacter | NA | OTU_62 | forestgreen | 0.3272241 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Chromobacteriaceae | Chromobacterium | NA | OTU_1293 | forestgreen | 0.3217597 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Xanthomonadales | Rhodanobacteraceae | NA | NA | OTU_1132 | forestgreen | 0.2164589 |
| Bacteria | Acidobacteriota | Acidobacteriae | Acidobacteriales | Acidobacteriaceae_(Subgroup_1) | Granulicella | NA | OTU_650 | forestgreen | 0.2122788 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Acetobacterales | Acetobacteraceae | Acidisoma | NA | OTU_609 | forestgreen | 0.2020236 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Oxalobacteraceae | NA | NA | OTU_1434 | forestgreen | 0.1840743 |
plot_tree(latitudeOTUs, label.tips = "Genus", ladderize="left", size = "abundance", color = "Family",
nodelabf = nodeplotblank)
ggsave("latitude_tree.png", width = 12, height =10)
volumeOTUs = ps.network %>%
transform_sample_counts(function(x) x / sum(x)) %>%
subset_taxa(Module==module_order[3])
# the WGCNA way: pull OTUs with high module membership in modules related to traits
volumeOTUsTable = tax_table(ps.network) %>% data.frame() %>%
mutate(Module.Membership = geneModuleMembership[,match("M3", modNames)]) %>%
arrange(-abs(Module.Membership)) %>%
filter(Module==module_order[3])
write.table(volumeOTUsTable, "volumeOTUs.csv", sep = ',', row.names = F)
kable(volumeOTUsTable, caption = "Volume module taxa by module membership", row.names = F)
| Kingdom | Phylum | Class | Order | Family | Genus | Species | OTUID | Module | Module.Membership |
|---|---|---|---|---|---|---|---|---|---|
| Bacteria | Proteobacteria | Alphaproteobacteria | Rhizobiales | Kaistiaceae | Kaistia | NA | OTU_858 | lightskyblue2 | 0.8188643 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | NA | NA | NA | OTU_1402 | lightskyblue2 | 0.7819199 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Rhizobiales | Rhizobiaceae | NA | NA | OTU_809 | lightskyblue2 | 0.7742836 |
| Bacteria | Actinobacteriota | Actinobacteria | Micrococcales | Microbacteriaceae | NA | NA | OTU_430 | lightskyblue2 | 0.7266081 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Comamonadaceae | NA | NA | OTU_1351 | lightskyblue2 | 0.7241397 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Rhizobiales | Xanthobacteraceae | NA | NA | OTU_1052 | lightskyblue2 | 0.7220112 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Rhizobiales | Xanthobacteraceae | NA | NA | OTU_1054 | lightskyblue2 | 0.6015189 |
| Bacteria | Actinobacteriota | Actinobacteria | Corynebacteriales | Nocardiaceae | Rhodococcus | NA | OTU_475 | lightskyblue2 | 0.5462574 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Acetobacterales | Acetobacteraceae | Roseococcus | NA | OTU_581 | lightskyblue2 | 0.4742614 |
| Bacteria | Actinobacteriota | Actinobacteria | Micrococcales | Microbacteriaceae | NA | NA | OTU_1384 | lightskyblue2 | 0.4564328 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Chitinibacteraceae | Silvimonas | NA | OTU_1331 | lightskyblue2 | 0.3307856 |
| Bacteria | Bacteroidota | Bacteroidia | Bacteroidales | Dysgonomonadaceae | Dysgonomonas | NA | OTU_80 | lightskyblue2 | 0.2876761 |
| Bacteria | Bacteroidota | Bacteroidia | Sphingobacteriales | Sphingobacteriaceae | Mucilaginibacter | NA | OTU_66 | lightskyblue2 | 0.2575401 |
| Bacteria | Proteobacteria | Gammaproteobacteria | Burkholderiales | Comamonadaceae | NA | NA | OTU_1355 | lightskyblue2 | 0.2494578 |
| Bacteria | Bacteroidota | Bacteroidia | Sphingobacteriales | Sphingobacteriaceae | Pedobacter | NA | OTU_20 | lightskyblue2 | 0.2429168 |
| Bacteria | Bacteroidota | Bacteroidia | Chitinophagales | Chitinophagaceae | Taibaiella | NA | OTU_103 | lightskyblue2 | 0.2069060 |
| Bacteria | Proteobacteria | Alphaproteobacteria | Rhizobiales | Rhizobiaceae | Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium | NA | OTU_792 | lightskyblue2 | 0.1116146 |
plot_tree(volumeOTUs, label.tips = "Genus", ladderize="left", size = "abundance", color = "Family",
nodelabf = nodeplotblank)
ggsave("volume_tree.png", width = 12, height =10)
M2 = tax_table(ps.network) %>% data.frame() %>%
mutate(Module.Membership = geneModuleMembership[,match("M2", modNames)]) %>%
arrange(-abs(Module.Membership)) %>%
filter(Module==module_order[2])
M4 = tax_table(ps.network) %>% data.frame() %>%
mutate(Module.Membership = geneModuleMembership[,match("M4", modNames)]) %>%
arrange(-abs(Module.Membership)) %>%
filter(Module==module_order[4])
M5 = tax_table(ps.network) %>% data.frame() %>%
mutate(Module.Membership = geneModuleMembership[,match("M5", modNames)]) %>%
arrange(-abs(Module.Membership)) %>%
filter(Module==module_order[5])
# label hubs for each module with OTU ID
key = c(volumeOTUsTable[c(1:3),8],
latitudeOTUsTable[(1:3),8],
M2[(1:3),8],
M4[(1:3),8],
M5[(1:3),8])
pdf("~/Dropbox/RoL/pitcher-16s/Manuscript/module-plot-with-labels-test.pdf", width = 10, height = 10)
plot(sc.se_large,
vertex.label=ifelse(names(V(sc.se_large)) %in% key, names(V(sc.se_large)), NA),
vertex.size = igraph::degree(sc.se_large)*1.5,
vertex.color=moduleColors,
vertex.label.family="Helvetica",
vertex.label.font=2,
vertex.label.cex = 0.75,
vertex.label.color = "black")
dev.off()
## quartz_off_screen
## 2
# table to go with plot
network_table = rbind(latitudeOTUsTable[c(1:3),c(1:6,8)],
M2[c(1:3),c(1:6,8)],
volumeOTUsTable[c(1:3),c(1:6,8)],
M4[c(1:3),c(1:6,8)],
M5[c(1:3),c(1:6,8)]) %>%
mutate(Module = c(rep("M1", 3), rep("M2",3), rep("M3",3), rep("M4", 3), rep("M5", 3)))
write.table(network_table, "network-table.csv", sep = ',', row.names = F)
For each community, find node with higest betweenness
# https://towardsdatascience.com/community-detection-in-r-using-communities-of-friends-characters-2161e845c198
communities <- data.frame()
for (i in unique(V(sc.se_large)$color)) {
# create subgraphs for each community
subgraph <- induced_subgraph(sc.se_large, v = V(sc.se_large)$name[V(sc.se_large)$color == i])
# get size of each subgraph
size <- igraph::gorder(subgraph)
# get betweenness centrality
btwn <- igraph::betweenness(subgraph)
communities <- communities %>%
dplyr::bind_rows(data.frame(module = i,
n_OTUs = size,
most_important = names(which(btwn == max(btwn)))
)
)
}
knitr::kable(communities %>% dplyr::select(module, n_OTUs, most_important))
Small = which(table(fc$membership) < SizeToKeep)
Keep = V(sc.se)[!(fc$membership %in% Small)]
sc.se_large = induced.subgraph(sc.se, Keep)
library(gplots)
poy <- colorRampPalette(c("purple", "orange", "yellow"))
#https://stackoverflow.com/questions/9777411/recenter-heatmap-2-in-r
mm = geneModuleMembership
#mm = mm[rowSums(abs(mm)) > 0.4,]
mm = mm[rowSums(mm) > 0.4,]
names(mm) = gsub("MM", "", names(mm))
taxanames = ps.network %>% subset_taxa(OTUID %in% rownames(mm)) %>% tax_table() %>% data.frame() %>%
mutate(OTUID = gsub("OTU_", "", OTUID)) %>%
mutate(across(Order, ~coalesce(.,Class))) %>%
mutate(across(Family, ~coalesce(.,Order))) %>%
mutate(across(Genus, ~na_if(., "uncultured"))) %>%
mutate(across(Genus, ~coalesce(.,Family))) %>%
mutate(Genus = paste(Genus, OTUID))
rownames(mm) = taxanames$Genus
#tiff(filename = "/Users/zlm3critter/Dropbox/RoL/pitcher-16s/re-analysis/Heatmap.tiff")
heatmap.2(as.matrix(t(mm)),
dendrogram = "none",
trace = "none",
key = F,
lhei = c(0.2,23), # row height
lwid = c(0.1,23),
margins = c(20,10),
adjCol = c(0.95,0.5))
dev.off()
## null device
## 1
Use the refseq database to blast the top 5 taxa (based on module membership) in each module. Return a table with the ID that can be looked up on NCBI nucleotide. This avoids having to blast the sequences individually and keeps everything organized.
fasta = readDNAStringSet("~/Dropbox/RoL/pitcher-16s/qiime2-stuff/dada2-denoise-cutadapt-primer/representative_sequences.fasta/dna-sequences.fasta")
# # Only need to do this once
# download.file("https://ftp.ncbi.nlm.nih.gov/blast/db/16S_ribosomal_RNA.tar.gz", "16S_ribosomal_RNA.tar.gz", mode='wb')
# untar("16S_ribosomal_RNA.tar.gz", exdir="16SMicrobialDB")
bl <- blast(db="~/Dropbox/RoL/pitcher-16s/re-analysis/16SMicrobialDB/16S_ribosomal_RNA")
# make a table of the top 3-5 taxa in the interesting modules
seq_latitude = fasta[names(fasta) %in% rownames(latitudeOTUsTable)[1:5]]
seq_volume = fasta[names(fasta) %in% rownames(volumeOTUsTable)[1:5]]
bl_latitude = predict(bl,seq_latitude,BLAST_args = "-perc_identity 99")
bl_volume = predict(bl,seq_volume,BLAST_args = "-perc_identity 99")
bl_latitude_Table = full_join(latitudeOTUsTable[1:5,] %>% mutate(QueryID=rownames(latitudeOTUsTable[1:5,])), bl_latitude)
write.table(bl_latitude_Table, "blast_table_latitude.csv", sep=",", row.names = F)
bl_volume_Table = full_join(volumeOTUsTable[1:5,] %>% mutate(QueryID=rownames(volumeOTUsTable[1:5,])), bl_volume)
write.table(bl_volume_Table, "blast_table_volume.csv", sep=",", row.names = F)