Selbal is an R package for selection of balances in microbiome compositional data. As described in Rivera-Pinto et al. 2018 Balances: a new perspective for microbiome analysis https://msystems.asm.org/content/3/4/e00053-18, selbal implements a forward-selection method for the identification of two groups of taxa whose relative abundance, or balance, is associated with the response variable of interest. See https://github.com/UVic-omics/selbal.
# Load "devtools" library (install it if needed)
# library(devtools)
# Install "selbal""
# install_github(repo = "UVic-omics/selbal")
# Import metadata
metadata <-read.csv("~/Documents/gbru_fy18_rice_methane/march2020/metadata_biomassCH4_noRIL9.csv", header=TRUE)
# Import OTU counts
otudata<-read.table("~/Documents/gbru_fy18_rice_methane/march2020/otu_table.tsv", header=TRUE)
# Move feature ID to rownames
row.names(otudata)<-otudata$feature.id
otudata<-otudata[,-1]
# Sort columns by name
otudata<- otudata[, order(names(otudata))]
# load Taxonomy annotation
taxa <- read.table("~/Documents/gbru_fy18_rice_methane/march2020/observation_meta_data.tsv", sep="\t", header=TRUE, quote = "")
#Filter data to specific taxonomic levels
cdata <-cbind(taxa,otudata)
library(dplyr)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyr)
fd_long<- gather(cdata, sample, count, colnames(cdata)[10:105], factor_key=TRUE)
#group by family
fd_fam_tally <- fd_long[,c(7,10,11)] %>% group_by(taxonomy_5, sample) %>% summarise(n = sum(count))
fd_fam_wide <- fd_fam_tally %>% pivot_wider(names_from= sample, values_from=n)
df_fam_wide<- as.data.frame(fd_fam_wide)
rownames(df_fam_wide)<- df_fam_wide[,1]
df_fam_wide<- df_fam_wide[,-1]
#Define the geometric mean for all values greater than 0
geomean <- function(x) { exp(mean(log(x[x>0]))) }
# Create a function to filter rows by the proportion of nonzero samples and the geometric mean of the nonzero samples
prefilter <- function(x, zcutoff, mcutoff){
nzvect <- apply(x, 1, function(y){
((sum(y>0)/length(y))> zcutoff) & (geomean(y)> mcutoff)
})
lowcounts <- t(as.data.frame(apply(x[!nzvect,],2,sum)))
rownames(lowcounts)<-c("lowconts")
df1 <-rbind(lowcounts, x[nzvect,])
return(df1)
}
#filter taxa
otutaxa100<-prefilter(df_fam_wide, 0.5, 100)
# Construct covariates matrix
covariates1 <- data.frame(genotype=metadata$genotype, sumch4=log10(metadata$sum_ch4), dev=metadata$dev)
# Process
#select D3 developmental stage
d3cov <- covariates1[covariates1$dev=="D3",]
d3cov <- d3cov[1:30,]
d3rows <- as.numeric(rownames(d3cov))
d3otus_filtered <- otutaxa100[,d3rows]
# create model matrix for log10 of total methane
mm1 <- model.matrix(~ sumch4, d3cov)
library(selbal)
sbout<- selbal.cv(t(d3otus_filtered), d3cov$sumch4, n.iter=10, seed=234753)
###############################################################
STARTING selbal.cv FUNCTION
###############################################################
#-------------------------------------------------------------#
# ZERO REPLACEMENT . . .
Loading required package: MASS
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Loading required package: NADA
Loading required package: survival
Attaching package: ‘NADA’
The following object is masked from ‘package:stats’:
cor
Loading required package: truncnorm
, . . . FINISHED.
#-------------------------------------------------------------#
#-------------------------------------------------------------#
# Starting the cross - validation procedure . . .
sbout
$accuracy.nvar
$var.barplot
$global.plot
TableGrob (2 x 1) "arrange": 2 grobs
$global.plot2
TableGrob (2 x 1) "arrange": 2 grobs
$ROC.plot
NULL
$cv.tab
$cv.accuracy
[1] 0.002196164 0.001584954 0.008468705 0.002888459 0.002589784 0.001143780 0.005728671 0.002428404 0.003209565 0.004745270 0.005433683 0.002540281 0.002808864 0.001804024 0.004018338
[16] 0.003645877 0.002620363 0.007321406 0.002308431 0.005934665 0.005683666 0.003990323 0.007266434 0.003994663 0.002468332 0.004055390 0.002135893 0.001932227 0.004274039 0.005223320
[31] 0.004103786 0.004483961 0.004330137 0.002490260 0.003030292 0.001593479 0.003938556 0.002550531 0.004408869 0.002772143 0.004321207 0.002297730 0.001778298 0.004970326 0.004858929
[46] 0.004765624 0.002910011 0.003723047 0.003273437 0.003432713
$global.balance
$glm
Call: glm(formula = numy ~ ., family = f.class, data = U)
Coefficients:
(Intercept) V1
5.9954 0.3728
Degrees of Freedom: 29 Total (i.e. Null); 28 Residual
Null Deviance: 0.07472
Residual Deviance: 0.03441 AIC: -112
$opt.nvar
[1] 2
#Results The results of SelBal analysi show that hte best correlated balance is the ratio of Moraxellaceae to Planctomycetaceae