Author: Diego Veliz-Otani

Date created: April 13, 2021

Date Updated: April 20, 2021

1. Prepare session

2. Load the data and create new operationalizations for the microbiome data

The raw microbiome sequence data is counts data, i.e. they are the number of sequencing reads that, after quality control, aligned to each Faecalibacterium prausnitzii Organizational Taxonomic Unit (OTU). This will be referred to as the “raw data” from now on, and is identified by the object name “otus_raw”.
We created a new variable, named otus_root by calculating the square root of otus_raw.
We also created the variable otus_binary which contains indicator variables for the presence of each OTU, thus it only has values of 0 and 1.
Finally, we created a new variable that equals the sum of the 6 OTUs in the otus_raw dataset. We also created an square-root and an indicator variable for this variable. This 3 variables are stored in the table otus_total.

dat <- read.table(dat_file, stringsAsFactors = F, header = T, sep = "\t")
rownames(dat) <- as.character(dat$Sample_Id)
model_data <- dat %>% select(Sample_Id, delta_laz_median, sex, age_days.x, time_span, breastfed, country)
model_data$delta_laz_median <- as.numeric(model_data$delta_laz_median)
model_data$age_days.x <- as.numeric(model_data$age_days.x)
model_data$time_span <- as.numeric(model_data$time_span)

model_data <- model_data %>% dplyr::rename(linear_growth=delta_laz_median)
model_data <- model_data %>% dplyr::rename(age=age_days.x)
model_data <- model_data %>% dplyr::rename(time=time_span)

otus_raw <- dat %>% select(Sample_Id, starts_with("OTU")[1:6])
otus_raw$OTU_total <- rowSums(otus_raw[,-1])
for(i in 2:ncol(otus_raw)){
  otus_raw[,i] <- as.integer(otus_raw[,i])
}

otus_root <- otus_raw
otus_root[,2:8] <- sqrt(otus_raw[,2:8])

otus_binary  <- otus_raw
for(i in 2:ncol(otus_binary)){
  otus_binary[,i] <- as.integer(otus_binary[,i]>=1)
}

otus_total         <- data.frame(Sample_Id=otus_raw$Sample_Id,
                                 stringsAsFactors = F)
otus_total$raw    <- otus_raw$OTU_total
otus_total$root    <- otus_root$OTU_total
otus_total$binary  <- otus_binary$OTU_total
rownames(otus_total) <- rownames(otus_raw)

otu_objects <- c("otus_raw", "otus_binary", "otus_root")
parameterizations <- c("Raw", "Binary", "Square Root")

2. Univariate analysis of metadata

2.1. Demographic and clinical variables

tmp <- model_data %>% select(!Sample_Id)
metadata_summary <- autosummary_list(tmp)
if(!is.na(metadata_summary$quant)){
  kbl(metadata_summary$quant, caption="Summary of the quantitative demographic and clinical variables") %>% 
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}

Summary of the quantitative demographic and clinical variables
Variable	Mean	Std. Dev.	Min	Q1	Q2	Q3	Max
linear_growth	-0.2	0.6	-4	-0.5	-0.2	0.1	6.7
age	376.7	172	43	235	356	515	728
time	62.1	11.7	48	52	59	70	102

vars <- unique(metadata_summary$qual$Variable)
tmp <-  metadata_summary$qual %>% filter(Variable==vars[1]) %>% select(!Variable)
kbl(tmp, caption=vars[1]) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

sex
Group	Frequency (%)
Female	540(44)
Male	697(56)
NA	0(0)

tmp <-  metadata_summary$qual %>% filter(Variable==vars[2]) %>% select(!Variable)
kbl(tmp, caption=vars[2]) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

breastfed
Group	Frequency (%)
exclusively	69(6)
no	160(13)
partially	1008(81)
NA	0(0)

tmp <-  metadata_summary$qual %>% filter(Variable==vars[3]) %>% select(!Variable)
kbl(tmp, caption=vars[3]) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

country
Group	Frequency (%)
Bangladesh	312(25)
Kenya	520(42)
Mali	85(7)
The Gambia	320(26)
NA	0(0)

cols <- c("dodgerblue4", "gray50")
cols <- adjustcolor(cols, alpha.f = 0.5)

hist(model_data$linear_growth, breaks = 30, main = "Linear growth", 
     xlab = expr(paste(Delta, " height Z-score")), cex.lab = 1.2)

qqnorm(model_data$linear_growth, main = "Linear growth", cex.lab = 1.2,
       col=cols[2], pch = 19)

extreme_values <- sort(model_data$linear_growth)
extreme_values <- extreme_values[c(1:2, (nrow(model_data)-3):nrow(model_data))]
idx_outliers <- model_data$linear_growth %in% extreme_values
outliers <- model_data[idx_outliers,]

par(xpd=TRUE)

qqnorm(model_data$linear_growth, main = "Linear growth", cex.lab = 1.2,
       col=cols[abs(idx_outliers-2)], pch = 19)
grid()
legend("topleft", legend= c("Non-outliers", "Potential outliers"),
       pch =19, bty="L", box.col=cols[2], border=cols[2],
       col = cols[2:1])

par(xpd=FALSE)
kbl(outliers, caption="List of potential outliers whose removal will be assessed") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

List of potential outliers whose removal will be assessed
	Sample_Id	linear_growth	sex	age	time	breastfed	country
102474	102474	-3.96	Female	728	74	partially	The Gambia
102333	102333	4.21	Male	512	67	no	The Gambia
100432	100432	6.71	Male	715	77	no	The Gambia
200253	200253	4.28	Male	471	59	partially	Mali
400993	400993	3.91	Male	291	90	partially	Kenya
402065	402065	-3.69	Male	554	50	no	Kenya

2.2. Microbiota abundance data

overview1 <- data.frame()
for(j in c(1,3)){
  obj <- get(otu_objects[j]) %>% select(!Sample_Id)
  coln <- colnames(obj)
  for(i in (1:ncol(obj))){
    tmp <- round(summary(obj[,i]),2)
    std <- round(sd(obj[,i]),2)
    tmp <- c(parameterizations[j], coln[i], tmp, std)
    overview1 <- rbind(overview1, tmp)
  }
}
colnames(overview1) <- c("Parameterization", "OTU", "Min", "Q1", "Q2", "Mean", "Q3", "Max", "SD")
overview1 <- overview1  %>%  select(Parameterization, OTU, Mean, SD, Q1, Q2, Q3, Min, Max)

nbinary <- unname(colSums(otus_binary[,-1]))
nbinary <- paste0( nbinary, "(", round(nbinary/nrow(otus_binary)*100), ")")

overview2 <- data.frame(Parameterization="Binary", 
                        OTU=colnames(otus_binary[,-1]),
                        N=nbinary,
                        n=colSums(otus_binary[,-1]))

rownames(overview2) <- NULL
# otus_raw    <- otus_raw %>% select(!OTU_total)
# otus_binary <- otus_binary %>% select(!OTU_total)
# otus_root   <- otus_root %>% select(!OTU_total)
kbl(overview1, caption="Summary of the quantitative microbiota abundance variables used as predictors") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Summary of the quantitative microbiota abundance variables used as predictors
Parameterization	OTU	Mean	SD	Q3	Max
Raw	OTU_39042	1.68	10.9	0	190
Raw	OTU_39040	1.12	6.56	0	136
Raw	OTU_67958	0.94	6.52	0	156
Raw	OTU_54565	0.72	2.88	0	46
Raw	OTU_87766	0.38	1.43	0	18
Raw	OTU_65293	0.31	1.83	0	35
Raw	OTU_total	5.15	17.48	3	207
Square Root	OTU_39042	0.34	1.25	0	13.78
Square Root	OTU_39040	0.32	1.01	0	11.66
Square Root	OTU_67958	0.28	0.93	0	12.49
Square Root	OTU_54565	0.32	0.79	0	6.78
Square Root	OTU_87766	0.21	0.58	0	4.24
Square Root	OTU_65293	0.14	0.54	0	5.92
Square Root	OTU_total	1.13	1.97	1.73	14.39

kbl(overview2, caption="Summary of the indicator variables for presence of F. prausnitzii") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Summary of the indicator variables for presence of F. prausnitzii
Parameterization	OTU	N	n
Binary	OTU_39042	153(12)	153
Binary	OTU_39040	183(15)	183
Binary	OTU_67958	170(14)	170
Binary	OTU_54565	232(19)	232
Binary	OTU_87766	178(14)	178
Binary	OTU_65293	103(8)	103
Binary	OTU_total	505(41)	505

3. Bivariate analyses (before removing outliers)

model_data$breastfed <- factor(model_data$breastfed) %>% relevel(ref="no")
model_data$country <- factor(model_data$country) %>% relevel(ref="Bangladesh")
model_data$sex <- factor(model_data$sex) %>% relevel(ref="Female")

bivariate_metadata <- data.frame()
for(i in 3:ncol(model_data)){
  mod_tmp <- lm(model_data$linear_growth ~ model_data[,i]) %>% confint2(digits = 4)  
  bivariate_metadata <- rbind(bivariate_metadata, mod_tmp)
}

bivariate_metadata$variable <- rownames(bivariate_metadata)
rownames(bivariate_metadata) <- NULL
bivariate_metadata <- bivariate_metadata[,c(4,1:3)]
bivariate_metadata$variable <- c("intercept_sex", "Male", "intercept_age", "age",
                                 "intercept_timespan", "timespan", "intercept_breastfed",
                                 "Exclusively", "Partially", "intercept_country",
                                 "Kenya", "Mali", "The_Gambia")
bivariate_metadata <- bivariate_metadata[-grep("intercept_", bivariate_metadata$variable),]

otus_names <- colnames(otus_raw[,-1])
bivariate_obj <- c("bivariate_raw", "bivariate_root", "bivariate_binary")
otus_obj <- c("otus_raw", "otus_root", "otus_binary")
options(par(mfrow=c(1,2)))
for(j in 1:3){
  otus_tmp <- get(otus_obj[j])
  bivariate_tmp <- data.frame()
  for(i in 2:ncol(otus_tmp)){
    mod_tmp <- lm(model_data$linear_growth ~ otus_tmp[,i])   
    # cat(paste0("_**", parameterizations[j], ", ", colnames(otus_tmp)[i], ":**_  \n  "))
    cat("  \n")
    cat(paste0("### ", parameterizations[j], ", ", colnames(otus_tmp)[i], "  \n"))
    cat("  \n")
    # cat(paste0("### ", parameterizations[j], ", ", colnames(otus_tmp)[i], "  \n"))
    par(mfrow=c(3,2))
    plot(mod_tmp, which = 1:6, pch = 19, main = parameterizations[j],
         col=cols[(idx_outliers-1)^2+1], lwd = 3, cex.lab = 2, cex = 1.2, cex.id = 1.2)
    cat("  \n  \n  ")
    mod_tmp <- mod_tmp  %>% confint2(digits = 4)  
    bivariate_tmp <- rbind(bivariate_tmp, mod_tmp)
  }
  bivariate_tmp <- bivariate_tmp[-c(1,3,5,7,9,11,13),]
  bivariate_tmp$variable<- otus_names
  bivariate_tmp <- bivariate_tmp[,c(4,1:3)]
  rownames(bivariate_tmp) <- NULL
  assign(bivariate_obj[j], bivariate_tmp)
}

Raw, OTU_39042

Raw, OTU_39040

Raw, OTU_67958

Raw, OTU_54565

Raw, OTU_87766

Raw, OTU_65293

Raw, OTU_total

Binary, OTU_39042

Binary, OTU_39040

Binary, OTU_67958

Binary, OTU_54565

Binary, OTU_87766

Binary, OTU_65293

Binary, OTU_total

Square Root, OTU_39042

Square Root, OTU_39040

Square Root, OTU_67958

Square Root, OTU_54565

Square Root, OTU_87766

Square Root, OTU_65293

Square Root, OTU_total

par(mfrow=c(1,1))

kbl(bivariate_metadata, caption="Crude associations of linear growth with demographic and clinical variables") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Crude associations of linear growth with demographic and clinical variables
	variable	Estimate	P-value	95%CI
2	Male	0.0175	0.6371	(-0.0552, 0.0901)
4	age	4e-04	1e-04	(2e-04, 6e-04)
6	timespan	-0.001	0.5061	(-0.0041, 0.002)
8	Exclusively	-0.2301	0.0133	(-0.4122, -0.048)
9	Partially	-0.1088	0.0474	(-0.2164, -0.0013)
11	Kenya	0.0276	0.5509	(-0.0631, 0.1183)
12	Mali	-0.0457	0.5628	(-0.2007, 0.1093)
13	The_Gambia	-0.0408	0.4278	(-0.1415, 0.06)

kbl(bivariate_raw, caption="Crude associations of linear growth with RAW F. prausnitzii counts") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Crude associations of linear growth with RAW F. prausnitzii counts
variable	Estimate	P-value	95%CI
OTU_39042	0.0013	0.4256	(-0.002, 0.0046)
OTU_39040	0.0012	0.6664	(-0.0043, 0.0067)
OTU_67958	0.0144	0	(0.0089, 0.0199)
OTU_54565	0.0148	0.0204	(0.0023, 0.0272)
OTU_87766	-0.0077	0.5486	(-0.0328, 0.0174)
OTU_65293	-0.0021	0.8334	(-0.0218, 0.0176)
OTU_total	0.003	0.004	(0.001, 0.0051)

kbl(bivariate_root, caption="Crude associations of linear growth with square root(F. prausnitzii counts)") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Crude associations of linear growth with square root(F. prausnitzii counts)
variable	Estimate	P-value	95%CI
OTU_39042	0.0202	0.1685	(-0.0086, 0.049)
OTU_39040	0.0267	0.1433	(-0.0091, 0.0625)
OTU_67958	0.0735	2e-04	(0.0349, 0.1121)
OTU_54565	0.0365	0.1182	(-0.0093, 0.0824)
OTU_87766	-0.0165	0.6011	(-0.0784, 0.0454)
OTU_65293	-0.0014	0.9666	(-0.0684, 0.0655)
OTU_total	0.0265	0.0045	(0.0082, 0.0447)

kbl(bivariate_binary, caption="Crude associations of linear growth with presence of F. prausnitzii") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Crude associations of linear growth with presence of F. prausnitzii
variable	Estimate	P-value	95%CI
OTU_39042	0.1284	0.0213	(0.0192, 0.2375)
OTU_39040	0.0734	0.1556	(-0.028, 0.1748)
OTU_67958	0.1041	0.0507	(-3e-04, 0.2086)
OTU_54565	0.0166	0.7237	(-0.0757, 0.1089)
OTU_87766	-0.0056	0.9148	(-0.1082, 0.097)
OTU_65293	0.0014	0.9831	(-0.129, 0.1318)
OTU_total	0.0863	0.0208	(0.0131, 0.1594)

varnames <- c("Sex", "Country", "Breastfed")
col_pos <- c(3, 6, 7)

  boxplot(model_data$linear_growth~model_data$sex,
          main = varnames[i], xlab = "", ylab = "Linear growth",
          col=cols[2], pch = 19)

plotdata <- model_data  
plotdata$breastfed <- str_to_sentence(plotdata$breastfed)
plotdata$sex <- str_to_sentence(plotdata$sex)
plotdata$country <- str_to_sentence(plotdata$country)



ggplot(data = plotdata, aes(x = sex, y = linear_growth)) +
  geom_boxplot(
     position = position_dodge(width=0.5),
      aes(fill = sex),
     outlier.colour = cols[2],
     outlier.size = 0.8) +
    scale_fill_manual(values = cols) +

    facet_wrap(~ breastfed + country, ncol = 4, nrow=3) +
    #Set the size of the plotting window
    theme_bw(base_size=12) +

      #Modify various aspects of the plot text and legend
      theme(
        legend.position = 'none',
        legend.background = element_rect(),
        axis.text.x = element_text(angle = 45, size = 12, hjust = 1.10),
        axis.text.y = element_text(angle = 0, size = 12, vjust = 0.5),
        axis.title = element_text(size = 12),
        axis.title.x = element_text(size = 14, face="bold"),
        axis.title.y = element_text(size = 14,  face="bold"),

        #facet wrap
        strip.text.x = element_text(size = 12),
        strip.text.y = element_text(size = 12, angle=0),
        strip.background = element_rect(fill = 'white'), 
        strip.text = element_text(size = 12, colour = 'black'))+

    #Change the size of the icons/symbols in the legend
    guides(colour = guide_legend(override.aes = list(size = 2.5))) +

    #Set x- and y-axes labels
    ylab('Linear growth') +
  xlab("Sex") +
  labs(title = 'Linear growth stratified by sex, country and history of being breastfed')

### Second plot:

plot2 <- ggplot(data = plotdata, aes(x = sex, y = linear_growth)) +
  geom_boxplot(
     position = position_dodge(width=0.5),
      aes(fill = sex),
     outlier.colour = cols[2],
     outlier.size = 1) +
    scale_fill_manual(values = c(cols[2], cols[2])) +

    facet_wrap(~ breastfed, ncol=3) +
    #Set the size of the plotting window
    theme_bw(base_size=12) +

      #Modify various aspects of the plot text and legend
      theme(
        legend.position = 'none',
        legend.background = element_rect(),
        axis.text.x = element_text(angle = 45, size = 12, hjust = 1.10),
        axis.text.y = element_text(angle = 0, size = 12, vjust = 0.5),
        axis.title = element_text(size = 12),
        axis.title.x = element_text(size = 14, face = "bold"),
        axis.title.y = element_text(size = 14, face = "bold"),

        #facet wrap
        strip.text.x = element_text(size = 12),
        strip.text.y = element_text(size = 12, angle=0),
        strip.background = element_rect(fill = 'white'), 
        strip.text = element_text(size = 12, colour = 'black'))+

    #Change the size of the icons/symbols in the legend
    guides(colour = guide_legend(override.aes = list(size = 2.5))) +

    #Set x- and y-axes labels
    ylab('Linear growth') +
    xlab('Sex')  +
   labs(title = 'Linear growth stratified by sex and history of being breastfed')


### Third plot:

plot3 <- ggplot(data = plotdata, aes(x = sex, y = linear_growth)) +
  geom_boxplot(
     position = position_dodge(width=0.5),
      aes(fill = sex),
     outlier.colour = cols[2],
     outlier.size = 1) +
    scale_fill_manual(values = c(cols[2], cols[2])) +

    facet_wrap(~ country, ncol=4) +
    #Set the size of the plotting window
    theme_bw(base_size=12) +

      #Modify various aspects of the plot text and legend
      theme(
        legend.position = 'none',
        legend.background = element_rect(),
        axis.text.x = element_text(angle = 45, size = 12, hjust = 1.10),
        axis.text.y = element_text(angle = 0, size = 12, vjust = 0.5),
        axis.title = element_text(size = 12),
        axis.title.x = element_text(size = 14, face = "bold"),
        axis.title.y = element_text(size = 14, face = "bold"),

        #facet wrap
        strip.text.x = element_text(size = 12),
        strip.text.y = element_text(size = 12, angle=0),
        strip.background = element_rect(fill = 'white'), 
        strip.text = element_text(size = 12, colour = 'black'))+

    #Change the size of the icons/symbols in the legend
    guides(colour = guide_legend(override.aes = list(size = 2.5))) +

    #Set x- and y-axes labels
    ylab('Linear growth') +
    xlab('Sex')  +
   labs(title = 'Linear growth stratified by sex and country')

grid.arrange(plot2, plot3, nrow=2)

options(par(mfrow=c(3,1)))
cat("  \n")

cat("**Bivariate association: Linear growth vs OTU_total**  \n  ")

## **Bivariate association: Linear growth vs OTU_total**  
##

plot(model_data$linear_growth ~ otus_raw$OTU_total, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth vs Raw OTU_total", xlab = "Raw OTU_total", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()
plot(model_data$linear_growth ~ otus_root$OTU_total, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth Square root(OTU_total)", xlab = "Square root(OTU_total)", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()
plot(model_data$linear_growth ~ otus_binary$OTU_total, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth vs Binary OTU_total", xlab = "Binary OTU_total", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()

cat("  \n")

cat("**Bivariate association: Linear growth vs OTU_67958**  \n  ")

## **Bivariate association: Linear growth vs OTU_67958**  
##

plot(model_data$linear_growth ~ otus_raw$OTU_67958, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth vs Raw OTU_67958", xlab = "Raw OTU_67958", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()
plot(model_data$linear_growth ~ otus_root$OTU_67958, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth Square root(OTU_67958)", xlab = "Square root(OTU_67958)", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()
plot(model_data$linear_growth ~ otus_binary$OTU_67958, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth vs Binary OTU_67958", xlab = "Binary OTU_67958", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()

cat("  \n")

options(par(mfrow=c(1,1)))

4. Multivariable analyses before removing outliers

otus_root3 <- otus_3root <- otus_raw^(1/3)
otus_root4 <- otus_4root <- otus_raw^(1/4)

multimod0 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_binary$OTU_total)

multimod1 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_total + otus_binary$OTU_total)

multimod2 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_total + otus_binary$OTU_total +
                  otus_raw$OTU_total)

multimod3 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_67958 + otus_binary$OTU_67958)

multimod4 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_67958 + otus_binary$OTU_67958 + 
                  otus_raw$OTU_67958)

multimod5 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_67958 + otus_binary$OTU_67958 + 
                  otus_raw$OTU_67958 + otus_root3$OTU_67958)

multimod6 <- lm(model_data$linear_growth ~  model_data$age  + otus_root$OTU_67958 +
                  otus_binary$OTU_67958 + 
                  otus_raw$OTU_67958 + otus_root3$OTU_67958 ) ## Best model

for(i in 0:6){
  tmpmod <- get(paste0("multimod", i))
  tmp_output <- format_model(tmpmod)
  tmpcaption <- paste0("Multivariable model ", i, ". R-squared: ", round(summary(tmpmod)$r.squared, 3))
  cat("  \n")
  print(kbl(format_model(tmpmod), caption=tmpcaption) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")))
  cat("  \n")
}

Multivariable model 0. R-squared: 0.017
Estimate	Std. Error	t-statistic	P-value	variable
-0.3224074	0.1756001	-1.8360323	0.0665947	Intercept
0.0179152	0.0370226	0.4838985	0.6285443	sexMale
0.0003944	0.0001339	2.9447571	0.0032930	age
0.0004505	0.0021974	0.2050025	0.8376042	time
-0.0693451	0.1033678	-0.6708575	0.5024376	breastfedexclusively
-0.0388964	0.0605777	-0.6420918	0.5209335	breastfedpartially
0.0357739	0.0611468	0.5850498	0.5586218	countryKenya
-0.0291160	0.0821072	-0.3546094	0.7229431	countryMali
-0.0722800	0.0531170	-1.3607713	0.1738359	countryThe Gambia
0.0153269	0.0434695	0.3525896	0.7244567	otus_binary: total

Multivariable model 1. R-squared: 0.018
Estimate	Std. Error	t-statistic	P-value	variable
-0.3177217	0.1756040	-1.8093085	0.0706479	Intercept
0.0178470	0.0370147	0.4821582	0.6297796	sexMale
0.0003810	0.0001344	2.8354141	0.0046518	age
0.0004161	0.0021971	0.1893983	0.8498120	time
-0.0650103	0.1034057	-0.6286916	0.5296681	breastfedexclusively
-0.0346777	0.0606614	-0.5716597	0.5676574	breastfedpartially
0.0294528	0.0613483	0.4800912	0.6312482	countryKenya
-0.0356595	0.0822611	-0.4334919	0.6647337	countryMali
-0.0705533	0.0531242	-1.3280832	0.1843978	countryThe Gambia
0.0161363	0.0130865	1.2330523	0.2177926	otus_root: total
-0.0256736	0.0547215	-0.4691687	0.6390325	otus_binary: total

Multivariable model 2. R-squared: 0.021
Estimate	Std. Error	t-statistic	P-value	variable
-0.3230552	0.1754316	-1.8414882	0.0657917	Intercept
0.0177924	0.0369738	0.4812159	0.6304490	sexMale
0.0003914	0.0001343	2.9136296	0.0036374	age
0.0004546	0.0021948	0.2071188	0.8359515	time
-0.0690215	0.1033123	-0.6680857	0.5042047	breastfedexclusively
-0.0380190	0.0606191	-0.6271776	0.5306597	breastfedpartially
0.0354774	0.0613602	0.5781836	0.5632464	countryKenya
-0.0340753	0.0821743	-0.4146708	0.6784555	countryMali
-0.0680025	0.0530819	-1.2810852	0.2004062	countryThe Gambia
-0.0510185	0.0372090	-1.3711342	0.1705842	otus_root: total
0.0794356	0.0772072	1.0288630	0.3037472	otus_binary: total
0.0061885	0.0032104	1.9276759	0.0541264	otus_raw: total

Multivariable model 3. R-squared: 0.027
Estimate	Std. Error	t-statistic	P-value	variable
-0.3145681	0.1748041	-1.7995470	0.0721780	Intercept
0.0140232	0.0368109	0.3809518	0.7033050	sexMale
0.0003880	0.0001245	3.1158669	0.0018766	age
0.0004078	0.0021867	0.1864969	0.8520859	time
-0.0647361	0.1029237	-0.6289719	0.5294846	breastfedexclusively
-0.0385193	0.0603596	-0.6381639	0.5234862	breastfedpartially
0.0340291	0.0602397	0.5648950	0.5722485	countryKenya
-0.0587492	0.0819960	-0.7164888	0.4738260	countryMali
-0.0661855	0.0528965	-1.2512255	0.2110909	countryThe Gambia
0.1042277	0.0302550	3.4449773	0.0005904	otus_root: 67958
-0.1668640	0.0833439	-2.0021145	0.0454924	otus_binary: 67958

Multivariable model 4. R-squared: 0.042
Estimate	Std. Error	t-statistic	P-value	variable
-0.3122884	0.1735037	-1.7998949	0.0721232	Intercept
0.0177847	0.0365468	0.4866278	0.6266092	sexMale
0.0003889	0.0001236	3.1467588	0.0016907	age
0.0004563	0.0021704	0.2102508	0.8335069	time
-0.0631722	0.1021582	-0.6183767	0.5364420	breastfedexclusively
-0.0456468	0.0599321	-0.7616420	0.4464203	breastfedpartially
0.0375940	0.0597968	0.6286964	0.5296650	countryKenya
-0.0864847	0.0816282	-1.0594952	0.2895832	countryMali
-0.0724589	0.0525221	-1.3795891	0.1679650	countryThe Gambia
-0.2196245	0.0793233	-2.7687270	0.0057124	otus_root: 67958
0.2551566	0.1264786	2.0173899	0.0438726	otus_binary: 67958
0.0349795	0.0079301	4.4109942	0.0000112	otus_raw: 67958

Multivariable model 5. R-squared: 0.052
Estimate	Std. Error	t-statistic	P-value	variable
-0.3137674	0.1726827	-1.8170167	0.0694590	Intercept
0.0093235	0.0364513	0.2557801	0.7981636	sexMale
0.0003706	0.0001231	3.0101779	0.0026644	age
0.0008669	0.0021632	0.4007272	0.6886909	time
-0.0833005	0.1018315	-0.8180228	0.4135034	breastfedexclusively
-0.0600623	0.0597856	-1.0046279	0.3152746	breastfedpartially
0.0408560	0.0595207	0.6864161	0.4925807	countryKenya
-0.0882210	0.0812432	-1.0858885	0.2777421	countryMali
-0.0754095	0.0522800	-1.4424174	0.1494405	countryThe Gambia
-3.7371378	0.9908653	-3.7715902	0.0001700	otus_root: 67958
-2.4478734	0.7693765	-3.1816328	0.0015013	otus_binary: 67958
0.1323734	0.0284642	4.6505185	0.0000037	otus_raw: 67958
6.0056006	1.6863683	3.5612628	0.0003832	otus_root3: 67958

Multivariable model 6. R-squared: 0.046
Estimate	Std. Error	t-statistic	P-value	variable
-0.3213757	0.0433946	-7.405896	0.0000000	Intercept
0.0003798	0.0001082	3.510398	0.0004636	age
-3.6363710	0.9854776	-3.689958	0.0002341	otus_root: 67958
-2.3928970	0.7649901	-3.128011	0.0018012	otus_binary: 67958
0.1282339	0.0283022	4.530881	0.0000064	otus_raw: 67958
5.8602828	1.6771533	3.494184	0.0004924	otus_root3: 67958

mods1 <- c(3,4,3, 5)
mods2 <- c(4, 5, 5, 6)
for(i in 1:4){
  tmpmod1 <- get(paste0("multimod", mods1[i]))
  tmpmod2 <- get(paste0("multimod", mods2[i]))
  tmppval <- round(anova(tmpmod1, tmpmod2, test= "F")$P[2],4)
  tmppval <- ifelse(tmppval<1e-4, "p<1e-4", tmppval)
  tmpstr <- paste0("* Partial F test: Model ", mods1[i], 
                   " vs Model ", mods2[i], ": ",
                   tmppval,  "  \n  \n")
  cat(tmpstr)
}

Partial F test: Model 3 vs Model 4: p<1e-4
Partial F test: Model 4 vs Model 5: 4e-04
Partial F test: Model 3 vs Model 5: p<1e-4
Partial F test: Model 5 vs Model 6: 0.3237

5. Outlier-removal

model_data2 <- model_data[!idx_outliers,]
otus_binary_b <- otus_binary[!idx_outliers,]
otus_raw_b <- otus_raw[!idx_outliers,]
otus_root_b <- otus_root[!idx_outliers,]
otus_3root_b <- otus_3root[!idx_outliers,]
otus_4root_b <- otus_4root[!idx_outliers,]

par(mfrow=c(3,2))
plot(multimod6, which = 1:6, pch = 19, main = "Best model before removing outliers" ,
     col=cols[(idx_outliers-1)^2+1], cex = 1.5, cex.lab = 1.6, cex.main=1.4, lwd = 2,
     cex.id=1.4)

6. Best model selection after outliers removal

multimod0_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_binary_b$OTU_total)

multimod1_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_total + otus_binary_b$OTU_total)

multimod2_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_total + otus_binary_b$OTU_total + 
                    otus_raw_b$OTU_total)

multimod3_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_67958 + otus_binary_b$OTU_67958)

multimod4_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_67958 + otus_binary_b$OTU_67958 + 
                    otus_raw_b$OTU_67958)

multimod5_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_67958 + otus_binary_b$OTU_67958 + 
                    otus_raw_b$OTU_67958 + otus_3root_b$OTU_67958)

multimod6_b <- lm(model_data2$linear_growth ~  model_data2$age  + otus_root_b$OTU_67958 +
                    otus_binary_b$OTU_67958 + 
                    otus_raw_b$OTU_67958 + otus_3root_b$OTU_67958 ) ## Best model

multimod7_b <- lm(model_data2$linear_growth ~  model_data2$age  + otus_root_b$OTU_67958 +
                    otus_binary_b$OTU_67958 + otus_3root_b$OTU_67958 )


multimod8_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_binary_b$OTU_67958 + 
                    otus_raw_b$OTU_67958)
multimod9_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_binary_b$OTU_67958)
multimod10_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                     model_data2$breastfed + model_data2$country + otus_raw_b$OTU_67958)

summary(multimod7_b)

## 
## Call:
## lm(formula = model_data2$linear_growth ~ model_data2$age + otus_root_b$OTU_67958 + 
##     otus_binary_b$OTU_67958 + otus_3root_b$OTU_67958)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4480 -0.3267 -0.0020  0.3017  2.7190 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -3.191e-01  3.823e-02  -8.347  < 2e-16 ***
## model_data2$age          3.336e-04  9.544e-05   3.496  0.00049 ***
## otus_root_b$OTU_67958    2.673e-02  2.346e-01   0.114  0.90929    
## otus_binary_b$OTU_67958  1.886e-01  3.203e-01   0.589  0.55614    
## otus_3root_b$OTU_67958  -9.671e-02  5.101e-01  -0.190  0.84965    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5558 on 1226 degrees of freedom
## Multiple R-squared:  0.01716,    Adjusted R-squared:  0.01395 
## F-statistic: 5.352 on 4 and 1226 DF,  p-value: 0.0002838

summary(multimod8_b)

## 
## Call:
## lm(formula = model_data2$linear_growth ~ model_data2$sex + model_data2$age + 
##     model_data2$time + model_data2$breastfed + model_data2$country + 
##     otus_binary_b$OTU_67958 + otus_raw_b$OTU_67958)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38093 -0.33389 -0.01144  0.29533  2.79960 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -0.2140267  0.1527081  -1.402 0.161306    
## model_data2$sexMale              -0.0129703  0.0319718  -0.406 0.685048    
## model_data2$age                   0.0003988  0.0001081   3.689 0.000235 ***
## model_data2$time                 -0.0016020  0.0019128  -0.838 0.402462    
## model_data2$breastfedexclusively  0.0097166  0.0894581   0.109 0.913525    
## model_data2$breastfedpartially    0.0159552  0.0527181   0.303 0.762208    
## model_data2$countryKenya         -0.0059620  0.0525175  -0.114 0.909633    
## model_data2$countryMali          -0.1080011  0.0712835  -1.515 0.130008    
## model_data2$countryThe Gambia    -0.1008688  0.0459252  -2.196 0.028253 *  
## otus_binary_b$OTU_67958           0.1006301  0.0536711   1.875 0.061040 .  
## otus_raw_b$OTU_67958             -0.0016366  0.0036594  -0.447 0.654800    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5544 on 1220 degrees of freedom
## Multiple R-squared:  0.02675,    Adjusted R-squared:  0.01878 
## F-statistic: 3.354 on 10 and 1220 DF,  p-value: 0.000253

summary(multimod9_b)

## 
## Call:
## lm(formula = model_data2$linear_growth ~ model_data2$sex + model_data2$age + 
##     model_data2$time + model_data2$breastfed + model_data2$country + 
##     otus_binary_b$OTU_67958)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38157 -0.33237 -0.01148  0.29599  2.79913 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -0.2129967  0.1526407  -1.395 0.163145    
## model_data2$sexMale              -0.0129728  0.0319613  -0.406 0.684893    
## model_data2$age                   0.0003987  0.0001081   3.689 0.000235 ***
## model_data2$time                 -0.0016179  0.0019118  -0.846 0.397576    
## model_data2$breastfedexclusively  0.0100585  0.0894255   0.112 0.910462    
## model_data2$breastfedpartially    0.0158888  0.0527006   0.301 0.763091    
## model_data2$countryKenya         -0.0060204  0.0525002  -0.115 0.908722    
## model_data2$countryMali          -0.1102427  0.0710838  -1.551 0.121189    
## model_data2$countryThe Gambia    -0.1000480  0.0458735  -2.181 0.029377 *  
## otus_binary_b$OTU_67958           0.0908123  0.0489599   1.855 0.063862 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5542 on 1221 degrees of freedom
## Multiple R-squared:  0.02659,    Adjusted R-squared:  0.01942 
## F-statistic: 3.706 on 9 and 1221 DF,  p-value: 0.0001334

summary(multimod10_b)

## 
## Call:
## lm(formula = model_data2$linear_growth ~ model_data2$sex + model_data2$age + 
##     model_data2$time + model_data2$breastfed + model_data2$country + 
##     otus_raw_b$OTU_67958)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3971 -0.3335 -0.0062  0.2996  2.7874 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -0.2181423  0.1528495  -1.427   0.1538    
## model_data2$sexMale              -0.0138770  0.0320010  -0.434   0.6646    
## model_data2$age                   0.0004386  0.0001061   4.133 3.83e-05 ***
## model_data2$time                 -0.0016403  0.0019146  -0.857   0.3918    
## model_data2$breastfedexclusively  0.0013566  0.0894389   0.015   0.9879    
## model_data2$breastfedpartially    0.0101976  0.0526828   0.194   0.8465    
## model_data2$countryKenya          0.0074195  0.0520839   0.142   0.8867    
## model_data2$countryMali          -0.0964357  0.0710893  -1.357   0.1752    
## model_data2$countryThe Gambia    -0.0924093  0.0457501  -2.020   0.0436 *  
## otus_raw_b$OTU_67958              0.0011699  0.0033428   0.350   0.7264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.555 on 1221 degrees of freedom
## Multiple R-squared:  0.02395,    Adjusted R-squared:  0.01675 
## F-statistic: 3.329 on 9 and 1221 DF,  p-value: 0.000497

# anova(multimod3_b, multimod4_b, test= "LRT") ## Testing raw ==> SIGNIFICANT
# anova(multimod4_b, multimod5_b, test= "LRT") ## Testing root 3 ==> SIGNIFICANT
# anova(multimod3_b, multimod5_b, test= "LRT") ## Testing raw and root 3 ==> SIGNIFICANT
# anova(multimod5_b, multimod6_b, test= "LRT") ## testing root4 ==> NON SIGNIFICANT
# Anova(multimod5_b) ## Testing all terms, best OTU model, but with irrelevant demographic covariates
# anova(multimod6_b, multimod5_b, test= "LRT") ## Testing demographic covariates
# Anova(multimod6_b) ## Best model
# summary(multimod7_b)

Association between linear growth and Faecalibacterium abundance - Preliminary analisis

Author: Diego Veliz-Otani

Date created: April 13, 2021

Date Updated: April 20, 2021

1. Prepare session

2. Load the data and create new operationalizations for the microbiome data

2. Univariate analysis of metadata

2.1. Demographic and clinical variables

2.2. Microbiota abundance data

3. Bivariate analyses (before removing outliers)

Raw, OTU_39042

Raw, OTU_39040

Raw, OTU_67958

Raw, OTU_54565

Raw, OTU_87766

Raw, OTU_65293

Raw, OTU_total

Binary, OTU_39042

Binary, OTU_39040

Binary, OTU_67958

Binary, OTU_54565

Binary, OTU_87766

Binary, OTU_65293

Binary, OTU_total

Square Root, OTU_39042

Square Root, OTU_39040

Square Root, OTU_67958

Square Root, OTU_54565

Square Root, OTU_87766

Square Root, OTU_65293

Square Root, OTU_total

4. Multivariable analyses before removing outliers

5. Outlier-removal

6. Best model selection after outliers removal