Author: Diego Veliz-Otani

Date created: April 13, 2021

Date Updated: April 20, 2021

1. Prepare session

2. Load the data and create new operationalizations for the microbiome data

dat <- read.table(dat_file, stringsAsFactors = F, header = T, sep = "\t")
rownames(dat) <- as.character(dat$Sample_Id)
model_data <- dat %>% select(Sample_Id, delta_laz_median, sex, age_days.x, time_span, breastfed, country)
model_data$delta_laz_median <- as.numeric(model_data$delta_laz_median)
model_data$age_days.x <- as.numeric(model_data$age_days.x)
model_data$time_span <- as.numeric(model_data$time_span)

model_data <- model_data %>% dplyr::rename(linear_growth=delta_laz_median)
model_data <- model_data %>% dplyr::rename(age=age_days.x)
model_data <- model_data %>% dplyr::rename(time=time_span)

otus_raw <- dat %>% select(Sample_Id, starts_with("OTU")[1:6])
otus_raw$OTU_total <- rowSums(otus_raw[,-1])
for(i in 2:ncol(otus_raw)){
  otus_raw[,i] <- as.integer(otus_raw[,i])
}

otus_root <- otus_raw
otus_root[,2:8] <- sqrt(otus_raw[,2:8])

otus_binary  <- otus_raw
for(i in 2:ncol(otus_binary)){
  otus_binary[,i] <- as.integer(otus_binary[,i]>=1)
}

otus_total         <- data.frame(Sample_Id=otus_raw$Sample_Id,
                                 stringsAsFactors = F)
otus_total$raw    <- otus_raw$OTU_total
otus_total$root    <- otus_root$OTU_total
otus_total$binary  <- otus_binary$OTU_total
rownames(otus_total) <- rownames(otus_raw)

otu_objects <- c("otus_raw", "otus_binary", "otus_root")
parameterizations <- c("Raw", "Binary", "Square Root")

2. Univariate analysis of metadata

2.1. Demographic and clinical variables

tmp <- model_data %>% select(!Sample_Id)
metadata_summary <- autosummary_list(tmp)
if(!is.na(metadata_summary$quant)){
  kbl(metadata_summary$quant, caption="Summary of the quantitative demographic and clinical variables") %>% 
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
}
Summary of the quantitative demographic and clinical variables
Variable Mean Std. Dev. Min Q1 Q2 Q3 Max
linear_growth -0.2 0.6 -4 -0.5 -0.2 0.1 6.7
age 376.7 172 43 235 356 515 728
time 62.1 11.7 48 52 59 70 102
vars <- unique(metadata_summary$qual$Variable)
tmp <-  metadata_summary$qual %>% filter(Variable==vars[1]) %>% select(!Variable)
kbl(tmp, caption=vars[1]) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) 
sex
Group Frequency (%)
Female 540(44)
Male 697(56)
NA 0(0)
tmp <-  metadata_summary$qual %>% filter(Variable==vars[2]) %>% select(!Variable)
kbl(tmp, caption=vars[2]) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
breastfed
Group Frequency (%)
exclusively 69(6)
no 160(13)
partially 1008(81)
NA 0(0)
tmp <-  metadata_summary$qual %>% filter(Variable==vars[3]) %>% select(!Variable)
kbl(tmp, caption=vars[3]) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
country
Group Frequency (%)
Bangladesh 312(25)
Kenya 520(42)
Mali 85(7)
The Gambia 320(26)
NA 0(0)
cols <- c("dodgerblue4", "gray50")
cols <- adjustcolor(cols, alpha.f = 0.5)

hist(model_data$linear_growth, breaks = 30, main = "Linear growth", 
     xlab = expr(paste(Delta, " height Z-score")), cex.lab = 1.2)

qqnorm(model_data$linear_growth, main = "Linear growth", cex.lab = 1.2,
       col=cols[2], pch = 19)

extreme_values <- sort(model_data$linear_growth)
extreme_values <- extreme_values[c(1:2, (nrow(model_data)-3):nrow(model_data))]
idx_outliers <- model_data$linear_growth %in% extreme_values
outliers <- model_data[idx_outliers,]

par(xpd=TRUE)

qqnorm(model_data$linear_growth, main = "Linear growth", cex.lab = 1.2,
       col=cols[abs(idx_outliers-2)], pch = 19)
grid()
legend("topleft", legend= c("Non-outliers", "Potential outliers"),
       pch =19, bty="L", box.col=cols[2], border=cols[2],
       col = cols[2:1])

par(xpd=FALSE)
kbl(outliers, caption="List of potential outliers whose removal will be assessed") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
List of potential outliers whose removal will be assessed
Sample_Id linear_growth sex age time breastfed country
102474 102474 -3.96 Female 728 74 partially The Gambia
102333 102333 4.21 Male 512 67 no The Gambia
100432 100432 6.71 Male 715 77 no The Gambia
200253 200253 4.28 Male 471 59 partially Mali
400993 400993 3.91 Male 291 90 partially Kenya
402065 402065 -3.69 Male 554 50 no Kenya

2.2. Microbiota abundance data

overview1 <- data.frame()
for(j in c(1,3)){
  obj <- get(otu_objects[j]) %>% select(!Sample_Id)
  coln <- colnames(obj)
  for(i in (1:ncol(obj))){
    tmp <- round(summary(obj[,i]),2)
    std <- round(sd(obj[,i]),2)
    tmp <- c(parameterizations[j], coln[i], tmp, std)
    overview1 <- rbind(overview1, tmp)
  }
}
colnames(overview1) <- c("Parameterization", "OTU", "Min", "Q1", "Q2", "Mean", "Q3", "Max", "SD")
overview1 <- overview1  %>%  select(Parameterization, OTU, Mean, SD, Q1, Q2, Q3, Min, Max)

nbinary <- unname(colSums(otus_binary[,-1]))
nbinary <- paste0( nbinary, "(", round(nbinary/nrow(otus_binary)*100), ")")

overview2 <- data.frame(Parameterization="Binary", 
                        OTU=colnames(otus_binary[,-1]),
                        N=nbinary,
                        n=colSums(otus_binary[,-1]))

rownames(overview2) <- NULL
# otus_raw    <- otus_raw %>% select(!OTU_total)
# otus_binary <- otus_binary %>% select(!OTU_total)
# otus_root   <- otus_root %>% select(!OTU_total)
kbl(overview1, caption="Summary of the quantitative microbiota abundance variables used as predictors") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Summary of the quantitative microbiota abundance variables used as predictors
Parameterization OTU Mean SD Q1 Q2 Q3 Min Max
Raw OTU_39042 1.68 10.9 0 0 0 0 190
Raw OTU_39040 1.12 6.56 0 0 0 0 136
Raw OTU_67958 0.94 6.52 0 0 0 0 156
Raw OTU_54565 0.72 2.88 0 0 0 0 46
Raw OTU_87766 0.38 1.43 0 0 0 0 18
Raw OTU_65293 0.31 1.83 0 0 0 0 35
Raw OTU_total 5.15 17.48 0 0 3 0 207
Square Root OTU_39042 0.34 1.25 0 0 0 0 13.78
Square Root OTU_39040 0.32 1.01 0 0 0 0 11.66
Square Root OTU_67958 0.28 0.93 0 0 0 0 12.49
Square Root OTU_54565 0.32 0.79 0 0 0 0 6.78
Square Root OTU_87766 0.21 0.58 0 0 0 0 4.24
Square Root OTU_65293 0.14 0.54 0 0 0 0 5.92
Square Root OTU_total 1.13 1.97 0 0 1.73 0 14.39
kbl(overview2, caption="Summary of the indicator variables for presence of F. prausnitzii") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Summary of the indicator variables for presence of F. prausnitzii
Parameterization OTU N n
Binary OTU_39042 153(12) 153
Binary OTU_39040 183(15) 183
Binary OTU_67958 170(14) 170
Binary OTU_54565 232(19) 232
Binary OTU_87766 178(14) 178
Binary OTU_65293 103(8) 103
Binary OTU_total 505(41) 505

3. Bivariate analyses (before removing outliers)

model_data$breastfed <- factor(model_data$breastfed) %>% relevel(ref="no")
model_data$country <- factor(model_data$country) %>% relevel(ref="Bangladesh")
model_data$sex <- factor(model_data$sex) %>% relevel(ref="Female")

bivariate_metadata <- data.frame()
for(i in 3:ncol(model_data)){
  mod_tmp <- lm(model_data$linear_growth ~ model_data[,i]) %>% confint2(digits = 4)  
  bivariate_metadata <- rbind(bivariate_metadata, mod_tmp)
}

bivariate_metadata$variable <- rownames(bivariate_metadata)
rownames(bivariate_metadata) <- NULL
bivariate_metadata <- bivariate_metadata[,c(4,1:3)]
bivariate_metadata$variable <- c("intercept_sex", "Male", "intercept_age", "age",
                                 "intercept_timespan", "timespan", "intercept_breastfed",
                                 "Exclusively", "Partially", "intercept_country",
                                 "Kenya", "Mali", "The_Gambia")
bivariate_metadata <- bivariate_metadata[-grep("intercept_", bivariate_metadata$variable),]

otus_names <- colnames(otus_raw[,-1])
bivariate_obj <- c("bivariate_raw", "bivariate_root", "bivariate_binary")
otus_obj <- c("otus_raw", "otus_root", "otus_binary")
options(par(mfrow=c(1,2)))
for(j in 1:3){
  otus_tmp <- get(otus_obj[j])
  bivariate_tmp <- data.frame()
  for(i in 2:ncol(otus_tmp)){
    mod_tmp <- lm(model_data$linear_growth ~ otus_tmp[,i])   
    # cat(paste0("_**", parameterizations[j], ", ", colnames(otus_tmp)[i], ":**_  \n  "))
    cat("  \n")
    cat(paste0("### ", parameterizations[j], ", ", colnames(otus_tmp)[i], "  \n"))
    cat("  \n")
    # cat(paste0("### ", parameterizations[j], ", ", colnames(otus_tmp)[i], "  \n"))
    par(mfrow=c(3,2))
    plot(mod_tmp, which = 1:6, pch = 19, main = parameterizations[j],
         col=cols[(idx_outliers-1)^2+1], lwd = 3, cex.lab = 2, cex = 1.2, cex.id = 1.2)
    cat("  \n  \n  ")
    mod_tmp <- mod_tmp  %>% confint2(digits = 4)  
    bivariate_tmp <- rbind(bivariate_tmp, mod_tmp)
  }
  bivariate_tmp <- bivariate_tmp[-c(1,3,5,7,9,11,13),]
  bivariate_tmp$variable<- otus_names
  bivariate_tmp <- bivariate_tmp[,c(4,1:3)]
  rownames(bivariate_tmp) <- NULL
  assign(bivariate_obj[j], bivariate_tmp)
}

Raw, OTU_39042

Raw, OTU_39040

Raw, OTU_67958

Raw, OTU_54565

Raw, OTU_87766

Raw, OTU_65293

Raw, OTU_total

Binary, OTU_39042

Binary, OTU_39040

Binary, OTU_67958

Binary, OTU_54565

Binary, OTU_87766

Binary, OTU_65293

Binary, OTU_total

Square Root, OTU_39042

Square Root, OTU_39040

Square Root, OTU_67958

Square Root, OTU_54565

Square Root, OTU_87766

Square Root, OTU_65293

Square Root, OTU_total

par(mfrow=c(1,1))

kbl(bivariate_metadata, caption="Crude associations of linear growth with demographic and clinical variables") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Crude associations of linear growth with demographic and clinical variables
variable Estimate P-value 95%CI
2 Male 0.0175 0.6371 (-0.0552, 0.0901)
4 age 4e-04 1e-04 (2e-04, 6e-04)
6 timespan -0.001 0.5061 (-0.0041, 0.002)
8 Exclusively -0.2301 0.0133 (-0.4122, -0.048)
9 Partially -0.1088 0.0474 (-0.2164, -0.0013)
11 Kenya 0.0276 0.5509 (-0.0631, 0.1183)
12 Mali -0.0457 0.5628 (-0.2007, 0.1093)
13 The_Gambia -0.0408 0.4278 (-0.1415, 0.06)
kbl(bivariate_raw, caption="Crude associations of linear growth with RAW F. prausnitzii counts") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Crude associations of linear growth with RAW F. prausnitzii counts
variable Estimate P-value 95%CI
OTU_39042 0.0013 0.4256 (-0.002, 0.0046)
OTU_39040 0.0012 0.6664 (-0.0043, 0.0067)
OTU_67958 0.0144 0 (0.0089, 0.0199)
OTU_54565 0.0148 0.0204 (0.0023, 0.0272)
OTU_87766 -0.0077 0.5486 (-0.0328, 0.0174)
OTU_65293 -0.0021 0.8334 (-0.0218, 0.0176)
OTU_total 0.003 0.004 (0.001, 0.0051)
kbl(bivariate_root, caption="Crude associations of linear growth with square root(F. prausnitzii counts)") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Crude associations of linear growth with square root(F. prausnitzii counts)
variable Estimate P-value 95%CI
OTU_39042 0.0202 0.1685 (-0.0086, 0.049)
OTU_39040 0.0267 0.1433 (-0.0091, 0.0625)
OTU_67958 0.0735 2e-04 (0.0349, 0.1121)
OTU_54565 0.0365 0.1182 (-0.0093, 0.0824)
OTU_87766 -0.0165 0.6011 (-0.0784, 0.0454)
OTU_65293 -0.0014 0.9666 (-0.0684, 0.0655)
OTU_total 0.0265 0.0045 (0.0082, 0.0447)
kbl(bivariate_binary, caption="Crude associations of linear growth with presence of F. prausnitzii") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Crude associations of linear growth with presence of F. prausnitzii
variable Estimate P-value 95%CI
OTU_39042 0.1284 0.0213 (0.0192, 0.2375)
OTU_39040 0.0734 0.1556 (-0.028, 0.1748)
OTU_67958 0.1041 0.0507 (-3e-04, 0.2086)
OTU_54565 0.0166 0.7237 (-0.0757, 0.1089)
OTU_87766 -0.0056 0.9148 (-0.1082, 0.097)
OTU_65293 0.0014 0.9831 (-0.129, 0.1318)
OTU_total 0.0863 0.0208 (0.0131, 0.1594)
varnames <- c("Sex", "Country", "Breastfed")
col_pos <- c(3, 6, 7)

  boxplot(model_data$linear_growth~model_data$sex,
          main = varnames[i], xlab = "", ylab = "Linear growth",
          col=cols[2], pch = 19)

plotdata <- model_data  
plotdata$breastfed <- str_to_sentence(plotdata$breastfed)
plotdata$sex <- str_to_sentence(plotdata$sex)
plotdata$country <- str_to_sentence(plotdata$country)



ggplot(data = plotdata, aes(x = sex, y = linear_growth)) +
  geom_boxplot(
     position = position_dodge(width=0.5),
      aes(fill = sex),
     outlier.colour = cols[2],
     outlier.size = 0.8) +
    scale_fill_manual(values = cols) +

    facet_wrap(~ breastfed + country, ncol = 4, nrow=3) +
    #Set the size of the plotting window
    theme_bw(base_size=12) +

      #Modify various aspects of the plot text and legend
      theme(
        legend.position = 'none',
        legend.background = element_rect(),
        axis.text.x = element_text(angle = 45, size = 12, hjust = 1.10),
        axis.text.y = element_text(angle = 0, size = 12, vjust = 0.5),
        axis.title = element_text(size = 12),
        axis.title.x = element_text(size = 14, face="bold"),
        axis.title.y = element_text(size = 14,  face="bold"),

        #facet wrap
        strip.text.x = element_text(size = 12),
        strip.text.y = element_text(size = 12, angle=0),
        strip.background = element_rect(fill = 'white'), 
        strip.text = element_text(size = 12, colour = 'black'))+

    #Change the size of the icons/symbols in the legend
    guides(colour = guide_legend(override.aes = list(size = 2.5))) +

    #Set x- and y-axes labels
    ylab('Linear growth') +
  xlab("Sex") +
  labs(title = 'Linear growth stratified by sex, country and history of being breastfed')

### Second plot:

plot2 <- ggplot(data = plotdata, aes(x = sex, y = linear_growth)) +
  geom_boxplot(
     position = position_dodge(width=0.5),
      aes(fill = sex),
     outlier.colour = cols[2],
     outlier.size = 1) +
    scale_fill_manual(values = c(cols[2], cols[2])) +

    facet_wrap(~ breastfed, ncol=3) +
    #Set the size of the plotting window
    theme_bw(base_size=12) +

      #Modify various aspects of the plot text and legend
      theme(
        legend.position = 'none',
        legend.background = element_rect(),
        axis.text.x = element_text(angle = 45, size = 12, hjust = 1.10),
        axis.text.y = element_text(angle = 0, size = 12, vjust = 0.5),
        axis.title = element_text(size = 12),
        axis.title.x = element_text(size = 14, face = "bold"),
        axis.title.y = element_text(size = 14, face = "bold"),

        #facet wrap
        strip.text.x = element_text(size = 12),
        strip.text.y = element_text(size = 12, angle=0),
        strip.background = element_rect(fill = 'white'), 
        strip.text = element_text(size = 12, colour = 'black'))+

    #Change the size of the icons/symbols in the legend
    guides(colour = guide_legend(override.aes = list(size = 2.5))) +

    #Set x- and y-axes labels
    ylab('Linear growth') +
    xlab('Sex')  +
   labs(title = 'Linear growth stratified by sex and history of being breastfed')


### Third plot:

plot3 <- ggplot(data = plotdata, aes(x = sex, y = linear_growth)) +
  geom_boxplot(
     position = position_dodge(width=0.5),
      aes(fill = sex),
     outlier.colour = cols[2],
     outlier.size = 1) +
    scale_fill_manual(values = c(cols[2], cols[2])) +

    facet_wrap(~ country, ncol=4) +
    #Set the size of the plotting window
    theme_bw(base_size=12) +

      #Modify various aspects of the plot text and legend
      theme(
        legend.position = 'none',
        legend.background = element_rect(),
        axis.text.x = element_text(angle = 45, size = 12, hjust = 1.10),
        axis.text.y = element_text(angle = 0, size = 12, vjust = 0.5),
        axis.title = element_text(size = 12),
        axis.title.x = element_text(size = 14, face = "bold"),
        axis.title.y = element_text(size = 14, face = "bold"),

        #facet wrap
        strip.text.x = element_text(size = 12),
        strip.text.y = element_text(size = 12, angle=0),
        strip.background = element_rect(fill = 'white'), 
        strip.text = element_text(size = 12, colour = 'black'))+

    #Change the size of the icons/symbols in the legend
    guides(colour = guide_legend(override.aes = list(size = 2.5))) +

    #Set x- and y-axes labels
    ylab('Linear growth') +
    xlab('Sex')  +
   labs(title = 'Linear growth stratified by sex and country')

grid.arrange(plot2, plot3, nrow=2)

options(par(mfrow=c(3,1)))
cat("  \n")
cat("**Bivariate association: Linear growth vs OTU_total**  \n  ")
## **Bivariate association: Linear growth vs OTU_total**  
## 
plot(model_data$linear_growth ~ otus_raw$OTU_total, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth vs Raw OTU_total", xlab = "Raw OTU_total", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()
plot(model_data$linear_growth ~ otus_root$OTU_total, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth Square root(OTU_total)", xlab = "Square root(OTU_total)", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()
plot(model_data$linear_growth ~ otus_binary$OTU_total, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth vs Binary OTU_total", xlab = "Binary OTU_total", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()

cat("  \n")
cat("**Bivariate association: Linear growth vs OTU_67958**  \n  ")
## **Bivariate association: Linear growth vs OTU_67958**  
## 
plot(model_data$linear_growth ~ otus_raw$OTU_67958, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth vs Raw OTU_67958", xlab = "Raw OTU_67958", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()
plot(model_data$linear_growth ~ otus_root$OTU_67958, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth Square root(OTU_67958)", xlab = "Square root(OTU_67958)", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()
plot(model_data$linear_growth ~ otus_binary$OTU_67958, pch=19, col = cols[(idx_outliers-1)^2+1],
     main = "Linear growth vs Binary OTU_67958", xlab = "Binary OTU_67958", ylab= "Linear growth",
     cex.lab=1.6, cex=1.6, cex.main = 1.6); grid()

cat("  \n")
options(par(mfrow=c(1,1)))

4. Multivariable analyses before removing outliers

otus_root3 <- otus_3root <- otus_raw^(1/3)
otus_root4 <- otus_4root <- otus_raw^(1/4)

multimod0 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_binary$OTU_total)

multimod1 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_total + otus_binary$OTU_total)

multimod2 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_total + otus_binary$OTU_total +
                  otus_raw$OTU_total)

multimod3 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_67958 + otus_binary$OTU_67958)

multimod4 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_67958 + otus_binary$OTU_67958 + 
                  otus_raw$OTU_67958)

multimod5 <- lm(model_data$linear_growth ~ model_data$sex + model_data$age  + model_data$time +
                  model_data$breastfed + model_data$country + otus_root$OTU_67958 + otus_binary$OTU_67958 + 
                  otus_raw$OTU_67958 + otus_root3$OTU_67958)

multimod6 <- lm(model_data$linear_growth ~  model_data$age  + otus_root$OTU_67958 +
                  otus_binary$OTU_67958 + 
                  otus_raw$OTU_67958 + otus_root3$OTU_67958 ) ## Best model

for(i in 0:6){
  tmpmod <- get(paste0("multimod", i))
  tmp_output <- format_model(tmpmod)
  tmpcaption <- paste0("Multivariable model ", i, ". R-squared: ", round(summary(tmpmod)$r.squared, 3))
  cat("  \n")
  print(kbl(format_model(tmpmod), caption=tmpcaption) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")))
  cat("  \n")
}
Multivariable model 0. R-squared: 0.017
Estimate Std. Error t-statistic P-value variable
-0.3224074 0.1756001 -1.8360323 0.0665947 Intercept
0.0179152 0.0370226 0.4838985 0.6285443 sexMale
0.0003944 0.0001339 2.9447571 0.0032930 age
0.0004505 0.0021974 0.2050025 0.8376042 time
-0.0693451 0.1033678 -0.6708575 0.5024376 breastfedexclusively
-0.0388964 0.0605777 -0.6420918 0.5209335 breastfedpartially
0.0357739 0.0611468 0.5850498 0.5586218 countryKenya
-0.0291160 0.0821072 -0.3546094 0.7229431 countryMali
-0.0722800 0.0531170 -1.3607713 0.1738359 countryThe Gambia
0.0153269 0.0434695 0.3525896 0.7244567 otus_binary: total
Multivariable model 1. R-squared: 0.018
Estimate Std. Error t-statistic P-value variable
-0.3177217 0.1756040 -1.8093085 0.0706479 Intercept
0.0178470 0.0370147 0.4821582 0.6297796 sexMale
0.0003810 0.0001344 2.8354141 0.0046518 age
0.0004161 0.0021971 0.1893983 0.8498120 time
-0.0650103 0.1034057 -0.6286916 0.5296681 breastfedexclusively
-0.0346777 0.0606614 -0.5716597 0.5676574 breastfedpartially
0.0294528 0.0613483 0.4800912 0.6312482 countryKenya
-0.0356595 0.0822611 -0.4334919 0.6647337 countryMali
-0.0705533 0.0531242 -1.3280832 0.1843978 countryThe Gambia
0.0161363 0.0130865 1.2330523 0.2177926 otus_root: total
-0.0256736 0.0547215 -0.4691687 0.6390325 otus_binary: total
Multivariable model 2. R-squared: 0.021
Estimate Std. Error t-statistic P-value variable
-0.3230552 0.1754316 -1.8414882 0.0657917 Intercept
0.0177924 0.0369738 0.4812159 0.6304490 sexMale
0.0003914 0.0001343 2.9136296 0.0036374 age
0.0004546 0.0021948 0.2071188 0.8359515 time
-0.0690215 0.1033123 -0.6680857 0.5042047 breastfedexclusively
-0.0380190 0.0606191 -0.6271776 0.5306597 breastfedpartially
0.0354774 0.0613602 0.5781836 0.5632464 countryKenya
-0.0340753 0.0821743 -0.4146708 0.6784555 countryMali
-0.0680025 0.0530819 -1.2810852 0.2004062 countryThe Gambia
-0.0510185 0.0372090 -1.3711342 0.1705842 otus_root: total
0.0794356 0.0772072 1.0288630 0.3037472 otus_binary: total
0.0061885 0.0032104 1.9276759 0.0541264 otus_raw: total
Multivariable model 3. R-squared: 0.027
Estimate Std. Error t-statistic P-value variable
-0.3145681 0.1748041 -1.7995470 0.0721780 Intercept
0.0140232 0.0368109 0.3809518 0.7033050 sexMale
0.0003880 0.0001245 3.1158669 0.0018766 age
0.0004078 0.0021867 0.1864969 0.8520859 time
-0.0647361 0.1029237 -0.6289719 0.5294846 breastfedexclusively
-0.0385193 0.0603596 -0.6381639 0.5234862 breastfedpartially
0.0340291 0.0602397 0.5648950 0.5722485 countryKenya
-0.0587492 0.0819960 -0.7164888 0.4738260 countryMali
-0.0661855 0.0528965 -1.2512255 0.2110909 countryThe Gambia
0.1042277 0.0302550 3.4449773 0.0005904 otus_root: 67958
-0.1668640 0.0833439 -2.0021145 0.0454924 otus_binary: 67958
Multivariable model 4. R-squared: 0.042
Estimate Std. Error t-statistic P-value variable
-0.3122884 0.1735037 -1.7998949 0.0721232 Intercept
0.0177847 0.0365468 0.4866278 0.6266092 sexMale
0.0003889 0.0001236 3.1467588 0.0016907 age
0.0004563 0.0021704 0.2102508 0.8335069 time
-0.0631722 0.1021582 -0.6183767 0.5364420 breastfedexclusively
-0.0456468 0.0599321 -0.7616420 0.4464203 breastfedpartially
0.0375940 0.0597968 0.6286964 0.5296650 countryKenya
-0.0864847 0.0816282 -1.0594952 0.2895832 countryMali
-0.0724589 0.0525221 -1.3795891 0.1679650 countryThe Gambia
-0.2196245 0.0793233 -2.7687270 0.0057124 otus_root: 67958
0.2551566 0.1264786 2.0173899 0.0438726 otus_binary: 67958
0.0349795 0.0079301 4.4109942 0.0000112 otus_raw: 67958
Multivariable model 5. R-squared: 0.052
Estimate Std. Error t-statistic P-value variable
-0.3137674 0.1726827 -1.8170167 0.0694590 Intercept
0.0093235 0.0364513 0.2557801 0.7981636 sexMale
0.0003706 0.0001231 3.0101779 0.0026644 age
0.0008669 0.0021632 0.4007272 0.6886909 time
-0.0833005 0.1018315 -0.8180228 0.4135034 breastfedexclusively
-0.0600623 0.0597856 -1.0046279 0.3152746 breastfedpartially
0.0408560 0.0595207 0.6864161 0.4925807 countryKenya
-0.0882210 0.0812432 -1.0858885 0.2777421 countryMali
-0.0754095 0.0522800 -1.4424174 0.1494405 countryThe Gambia
-3.7371378 0.9908653 -3.7715902 0.0001700 otus_root: 67958
-2.4478734 0.7693765 -3.1816328 0.0015013 otus_binary: 67958
0.1323734 0.0284642 4.6505185 0.0000037 otus_raw: 67958
6.0056006 1.6863683 3.5612628 0.0003832 otus_root3: 67958
Multivariable model 6. R-squared: 0.046
Estimate Std. Error t-statistic P-value variable
-0.3213757 0.0433946 -7.405896 0.0000000 Intercept
0.0003798 0.0001082 3.510398 0.0004636 age
-3.6363710 0.9854776 -3.689958 0.0002341 otus_root: 67958
-2.3928970 0.7649901 -3.128011 0.0018012 otus_binary: 67958
0.1282339 0.0283022 4.530881 0.0000064 otus_raw: 67958
5.8602828 1.6771533 3.494184 0.0004924 otus_root3: 67958
mods1 <- c(3,4,3, 5)
mods2 <- c(4, 5, 5, 6)
for(i in 1:4){
  tmpmod1 <- get(paste0("multimod", mods1[i]))
  tmpmod2 <- get(paste0("multimod", mods2[i]))
  tmppval <- round(anova(tmpmod1, tmpmod2, test= "F")$P[2],4)
  tmppval <- ifelse(tmppval<1e-4, "p<1e-4", tmppval)
  tmpstr <- paste0("* Partial F test: Model ", mods1[i], 
                   " vs Model ", mods2[i], ": ",
                   tmppval,  "  \n  \n")
  cat(tmpstr)
}

5. Outlier-removal

model_data2 <- model_data[!idx_outliers,]
otus_binary_b <- otus_binary[!idx_outliers,]
otus_raw_b <- otus_raw[!idx_outliers,]
otus_root_b <- otus_root[!idx_outliers,]
otus_3root_b <- otus_3root[!idx_outliers,]
otus_4root_b <- otus_4root[!idx_outliers,]

par(mfrow=c(3,2))
plot(multimod6, which = 1:6, pch = 19, main = "Best model before removing outliers" ,
     col=cols[(idx_outliers-1)^2+1], cex = 1.5, cex.lab = 1.6, cex.main=1.4, lwd = 2,
     cex.id=1.4)

6. Best model selection after outliers removal

multimod0_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_binary_b$OTU_total)

multimod1_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_total + otus_binary_b$OTU_total)

multimod2_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_total + otus_binary_b$OTU_total + 
                    otus_raw_b$OTU_total)

multimod3_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_67958 + otus_binary_b$OTU_67958)

multimod4_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_67958 + otus_binary_b$OTU_67958 + 
                    otus_raw_b$OTU_67958)

multimod5_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_root_b$OTU_67958 + otus_binary_b$OTU_67958 + 
                    otus_raw_b$OTU_67958 + otus_3root_b$OTU_67958)

multimod6_b <- lm(model_data2$linear_growth ~  model_data2$age  + otus_root_b$OTU_67958 +
                    otus_binary_b$OTU_67958 + 
                    otus_raw_b$OTU_67958 + otus_3root_b$OTU_67958 ) ## Best model

multimod7_b <- lm(model_data2$linear_growth ~  model_data2$age  + otus_root_b$OTU_67958 +
                    otus_binary_b$OTU_67958 + otus_3root_b$OTU_67958 )


multimod8_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_binary_b$OTU_67958 + 
                    otus_raw_b$OTU_67958)
multimod9_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                    model_data2$breastfed + model_data2$country + otus_binary_b$OTU_67958)
multimod10_b <- lm(model_data2$linear_growth ~ model_data2$sex + model_data2$age  + model_data2$time +
                     model_data2$breastfed + model_data2$country + otus_raw_b$OTU_67958)

summary(multimod7_b)
## 
## Call:
## lm(formula = model_data2$linear_growth ~ model_data2$age + otus_root_b$OTU_67958 + 
##     otus_binary_b$OTU_67958 + otus_3root_b$OTU_67958)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4480 -0.3267 -0.0020  0.3017  2.7190 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -3.191e-01  3.823e-02  -8.347  < 2e-16 ***
## model_data2$age          3.336e-04  9.544e-05   3.496  0.00049 ***
## otus_root_b$OTU_67958    2.673e-02  2.346e-01   0.114  0.90929    
## otus_binary_b$OTU_67958  1.886e-01  3.203e-01   0.589  0.55614    
## otus_3root_b$OTU_67958  -9.671e-02  5.101e-01  -0.190  0.84965    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5558 on 1226 degrees of freedom
## Multiple R-squared:  0.01716,    Adjusted R-squared:  0.01395 
## F-statistic: 5.352 on 4 and 1226 DF,  p-value: 0.0002838
summary(multimod8_b)
## 
## Call:
## lm(formula = model_data2$linear_growth ~ model_data2$sex + model_data2$age + 
##     model_data2$time + model_data2$breastfed + model_data2$country + 
##     otus_binary_b$OTU_67958 + otus_raw_b$OTU_67958)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38093 -0.33389 -0.01144  0.29533  2.79960 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -0.2140267  0.1527081  -1.402 0.161306    
## model_data2$sexMale              -0.0129703  0.0319718  -0.406 0.685048    
## model_data2$age                   0.0003988  0.0001081   3.689 0.000235 ***
## model_data2$time                 -0.0016020  0.0019128  -0.838 0.402462    
## model_data2$breastfedexclusively  0.0097166  0.0894581   0.109 0.913525    
## model_data2$breastfedpartially    0.0159552  0.0527181   0.303 0.762208    
## model_data2$countryKenya         -0.0059620  0.0525175  -0.114 0.909633    
## model_data2$countryMali          -0.1080011  0.0712835  -1.515 0.130008    
## model_data2$countryThe Gambia    -0.1008688  0.0459252  -2.196 0.028253 *  
## otus_binary_b$OTU_67958           0.1006301  0.0536711   1.875 0.061040 .  
## otus_raw_b$OTU_67958             -0.0016366  0.0036594  -0.447 0.654800    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5544 on 1220 degrees of freedom
## Multiple R-squared:  0.02675,    Adjusted R-squared:  0.01878 
## F-statistic: 3.354 on 10 and 1220 DF,  p-value: 0.000253
summary(multimod9_b)
## 
## Call:
## lm(formula = model_data2$linear_growth ~ model_data2$sex + model_data2$age + 
##     model_data2$time + model_data2$breastfed + model_data2$country + 
##     otus_binary_b$OTU_67958)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38157 -0.33237 -0.01148  0.29599  2.79913 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -0.2129967  0.1526407  -1.395 0.163145    
## model_data2$sexMale              -0.0129728  0.0319613  -0.406 0.684893    
## model_data2$age                   0.0003987  0.0001081   3.689 0.000235 ***
## model_data2$time                 -0.0016179  0.0019118  -0.846 0.397576    
## model_data2$breastfedexclusively  0.0100585  0.0894255   0.112 0.910462    
## model_data2$breastfedpartially    0.0158888  0.0527006   0.301 0.763091    
## model_data2$countryKenya         -0.0060204  0.0525002  -0.115 0.908722    
## model_data2$countryMali          -0.1102427  0.0710838  -1.551 0.121189    
## model_data2$countryThe Gambia    -0.1000480  0.0458735  -2.181 0.029377 *  
## otus_binary_b$OTU_67958           0.0908123  0.0489599   1.855 0.063862 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5542 on 1221 degrees of freedom
## Multiple R-squared:  0.02659,    Adjusted R-squared:  0.01942 
## F-statistic: 3.706 on 9 and 1221 DF,  p-value: 0.0001334
summary(multimod10_b)
## 
## Call:
## lm(formula = model_data2$linear_growth ~ model_data2$sex + model_data2$age + 
##     model_data2$time + model_data2$breastfed + model_data2$country + 
##     otus_raw_b$OTU_67958)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3971 -0.3335 -0.0062  0.2996  2.7874 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -0.2181423  0.1528495  -1.427   0.1538    
## model_data2$sexMale              -0.0138770  0.0320010  -0.434   0.6646    
## model_data2$age                   0.0004386  0.0001061   4.133 3.83e-05 ***
## model_data2$time                 -0.0016403  0.0019146  -0.857   0.3918    
## model_data2$breastfedexclusively  0.0013566  0.0894389   0.015   0.9879    
## model_data2$breastfedpartially    0.0101976  0.0526828   0.194   0.8465    
## model_data2$countryKenya          0.0074195  0.0520839   0.142   0.8867    
## model_data2$countryMali          -0.0964357  0.0710893  -1.357   0.1752    
## model_data2$countryThe Gambia    -0.0924093  0.0457501  -2.020   0.0436 *  
## otus_raw_b$OTU_67958              0.0011699  0.0033428   0.350   0.7264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.555 on 1221 degrees of freedom
## Multiple R-squared:  0.02395,    Adjusted R-squared:  0.01675 
## F-statistic: 3.329 on 9 and 1221 DF,  p-value: 0.000497
# anova(multimod3_b, multimod4_b, test= "LRT") ## Testing raw ==> SIGNIFICANT
# anova(multimod4_b, multimod5_b, test= "LRT") ## Testing root 3 ==> SIGNIFICANT
# anova(multimod3_b, multimod5_b, test= "LRT") ## Testing raw and root 3 ==> SIGNIFICANT
# anova(multimod5_b, multimod6_b, test= "LRT") ## testing root4 ==> NON SIGNIFICANT
# Anova(multimod5_b) ## Testing all terms, best OTU model, but with irrelevant demographic covariates
# anova(multimod6_b, multimod5_b, test= "LRT") ## Testing demographic covariates
# Anova(multimod6_b) ## Best model
# summary(multimod7_b)