COD_20251201_IBS7048_Week14_ML

Minsik Kim

2025.12.01

Loading packages

Loading data

my_phyloseq <- readRDS("Inha/5_Lectures/2_IBS7048_Metagenomics/2025F/scripts/IBS7048_Advanced_metagenomics/phyloseq_ex_host_dep_deidentified.rds")

DA analysis results

# Before running DA analysis, you need to double check the distribution of your data

set.seed(seed)


# I did CLR transform with `microbiome` package ---------------------------------

my_phyloseq_da <- my_phyloseq %>% # set your file here
        subset_taxa(., Domain %in% "k__Bacteria") %>%
        prune_taxa(taxa_sums(.) > 0, .) %>%
        microbiome::transform(., 'clr') 

sample_data(my_phyloseq_da) <- sample_data(my_phyloseq) %>% # set your file here
        data.frame() %>%
        mutate(treatment = case_when(
                control == 1 ~ "0",
                depletion_method1 == 1 ~ "1",
                depletion_method2 == 1 ~ "2",
                depletion_method3 == 1 ~ "3",
                depletion_method4 == 1 ~ "4",
                depletion_method5 == 1 ~ "5",
                .default = NA_character_)) %>%
        sample_data

#dummy <- compositions::clr(otu_table(phyloseq_metaphlan$phy_microbe_rel)) %>% data.frame()
#dummy2 <- otu_table(physeq_mic_clr_school) %>% data.frame()

# LME4 for CLR normalized data --------------------------------------------

lmer_function <- function(phyloseq_lmer) {
        
        lmer_taxa_rel_clr <- data.frame()
        
        all <- phyloseq_lmer %>% taxa_sums() %>% length()
        
        for(i in 1:all) {
                #Creating a data frame that includes CLR transformed data of i-th bug.
                #making differnt otu tables for each sample type
                otu_table <- phyloseq_lmer %>% otu_table %>% t
                
                #tax table for different sample type
                taxa_data <- otu_table[,i] %>% data.frame()
                
                #Making a merged dataframe having sample data and CLR transformed output
                lme_data <- merge(taxa_data, sample_data(phyloseq_lmer), by = 0) %>% column_to_rownames("Row.names")
                
                #generating a character of formula.
                #Here, as the taxa are already CLR transformed, I did not make model at log-scale.
                lme4_formula <- paste(names(taxa_data), "~ treatment + (1|subject_id)") # <-- modify your model here
                
                
                class_result <- tryCatch({
                        model <- lmerTest::lmer(formula = lme4_formula, data = lme_data)
                        summary(model) %>%
                                .$coefficients %>%
                                data.frame(check.names = FALSE) %>%
                                mutate(feature = colnames(otu_table)[i]) %>%
                                rownames_to_column("value") %>%
                                subset(value %in% c("treatment1", "treatment2","treatment3","treatment4", "treatment5"))  # <-- modify based on your model here
                } %>% suppressMessages(), 
                error = function(e) {
                        message(paste("Error in model for taxon", i, ":", e$message))
                        return(NULL)
                }) 
                
                #row binding all the associations of i-th taxa to one data frame
                lmer_taxa_rel_clr <- rbind(lmer_taxa_rel_clr,
                                                 class_result) %>%
                        remove_rownames()
                
                
        } 
        lmer_taxa_rel_clr
}


da_result <- lmer_function(my_phyloseq_da)# set your file here
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00278885 (tol = 0.002, component 1)
da_result <- da_result %>%
        mutate(qval = qvalue::qvalue(`Pr(>|t|)`, lambda = 0)$qvalue,
               p_bh = p.adjust(p = `Pr(>|t|)`, method = "BH")) %>%
        suppressMessages()

Table visualization

da_result %>% reactable::reactable(searchable = T, sortable = T, filterable = T)
species_italic <- function(data) { # richtext formatting with Phage
  clean <- gsub("_A|_B", "", data)
  clean <- gsub("_", " ", clean)
  clean <- gsub("[.]", " ", clean)
  clean <- gsub("\\[|\\]", "", clean)
  clean <- trimws(gsub("\\s+", " ", clean))

  # 1) Case of phage 
  out <- ifelse(
    grepl("(?i)host\\s*unknown", clean),              # Host unknown --> regular font
    paste0("'", clean, "'"),
    NA_character_
  )

  is_phage_code <- grepl("(?i)^vsgb\\d+", clean)      # find vSGBxxxxx 

  out[is.na(out) & is_phage_code] <- vapply(clean[is.na(out) & is_phage_code], function(x) {
    toks  <- strsplit(x, "\\s+")[[1]]
    gidxs <- which(grepl("^[A-Z][a-z]+$", toks))      #  Genus
    if (length(gidxs)) {
      genus  <- toks[tail(gidxs, 1)]
      prefix <- sub(paste0("\\s*", genus, "$"), "", x)
      paste0("'", prefix, "'~italic('", genus, "')")
    } else {
      paste0("'", x, "'")                             # no Genus: regular font
    }
  }, character(1))

  remain <- is.na(out)
  out[remain] <- ifelse(
    grepl(" sp", clean[remain]), {
      parts  <- strsplit(clean[remain], " sp", fixed = TRUE)
      genus  <- vapply(parts, `[`, character(1), 1)
      strain <- vapply(parts, function(p) ifelse(length(p) > 1, trimws(p[2]), ""), character(1))
      paste0("italic('", genus, "')~'sp.'~'", strain, "'")
    },
    paste0("italic('", clean[remain], "')")
  )

  return(out)
}

  
  
da_result <- bind_rows(da_result, .id = "Taxa") %>%
  #        column_to_rownames(feature)
        left_join(., 
                  my_phyloseq_da %>%
                          tax_table() %>%
                          as("matrix") %>%
                          as.data.frame(stringsAsFactors = FALSE) %>%
                          rownames_to_column("feature")) %>%
        mutate(alpha  = if_else(qval <= 0.1, 0.9, 0.05),
                ## lables interaction only
                feature = species_italic(feature),
                label_candidate = case_when(
                         qval <= 0.1 ~ feature,
                        TRUE ~ NA_character_),
                ## label separation (left & right)
                label_left  = if_else(Estimate < 0, label_candidate, NA_character_),
                label_right = if_else(Estimate > 0, label_candidate, NA_character_)) 
## Joining with `by = join_by(feature)`
da_result %>%
        ggplot(. ,
               aes(x = Estimate, y = -log10(qval), color = value)) +
        geom_hline(yintercept = 1, color = "gray70") +   # q=0.1
        geom_vline(xintercept = 0, color = "gray70") +
        geom_point(, size = 2, stroke = 0)  

da_result %>%
        ggplot(. ,
               aes(x = Estimate, y = -log10(qval), color = value)) +
        geom_hline(yintercept = 1, color = "gray70") +   # q=0.1
        geom_vline(xintercept = 0, color = "gray70") +
        geom_point(, size = 2, stroke = 0) +
        facet_wrap(~ value, ncol = 3) 

da_result %>%
        ggplot(. ,
               aes(x = Estimate, y = -log10(qval), color = value)) +
        geom_hline(yintercept = 1, color = "gray70") +   # q=0.1
        geom_vline(xintercept = 0, color = "gray70") +
        geom_point(, size = 2, stroke = 0) +
        facet_wrap(~ value, ncol = 3) +
        labs(x = "Change estimate of CLR-transformed relative abundance",
             y = "-log~10~ *q*-value") +
        theme_classic(base_family = "sans") +
        theme(legend.position = "top",
                axis.title.y = ggtext::element_markdown(),
                axis.title.x = ggtext::element_markdown(),
                strip.text = element_text(face = "bold")) + 
        ggrepel::geom_text_repel(
                aes(label = label_right),
                #color = MetBrewer::met.brewer("Austria"),
                data = subset(da_result, !is.na(label_right)),
                box.padding = unit(0.35, "lines"),
                grob = "richtext",
                na.rm = TRUE, show.legend = FALSE, max.overlaps = 20,
                size = 3, segment.size = 0.25,
                nudge_x = 5, nudge_y = 2, direction = "y",
                parse = T) +
        ggrepel::geom_text_repel(
                aes(label = label_left),
                #color = MetBrewer::met.brewer("Austria", 2)[[1]],
                data = subset(da_result, !is.na(label_left)),
                box.padding = unit(0.35, "lines"),
                na.rm = TRUE, show.legend = FALSE, max.overlaps = 20,
                size = 3, segment.size = 0.25,
                nudge_x = -5, nudge_y = 2, direction = "y",
                parse = T)
## Warning in ggrepel::geom_text_repel(aes(label = label_right), data =
## subset(da_result, : Ignoring unknown parameters: `grob`
## Warning: ggrepel: 47 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 46 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 62 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 88 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 51 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 63 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

mixomics & caret

#creating data 

df_ml <- my_phyloseq_da %>% 
        # transform_sample_counts(., function(x){x/sum(x)}) %>% # microbiome packages normalizes otu_table and then transfrom with CLR 
        {
                data <- .
                otu_table <- otu_table(data) %>% 
                        t() %>%
                        data.frame()
                sample_data <- sample_data(data) %>% 
                        data.frame()
                otu_table %>%
                        mutate(`treatment5` = as.factor(sample_data$depletion_method5))
                }

#Splitting data - train vs test 
train_index <- createDataPartition(df_ml$treatment5, p = 0.8, list = FALSE)
df_ml_train <- df_ml[train_index, ]


#Setting parameters for cross-validation
ctrl_n5 <- caret::trainControl(
    method = "repeatedcv",
    number = 5, # High dimensional data. 1/5 of data will be used for the test set, therefore 80% is train set and 20% is test set.
        # Earlier analysis showed the pattern between number = 5 & number = 10 were similar
    repeats = 10,
    returnResamp = "all",
    savePredictions = "all",
    classProbs = TRUE,            # Enable class probabilities
    allowParallel = TRUE,
    verboseIter = FALSE            # Show progress
)


spls_model_clr_treatment5 <- caret::train(as.numeric(`treatment5`) ~ ., 
                                   data=df_ml_train,
                                   method=mixOmicsCaret::get_mixOmics_spls(),
                                   preProc=c("center", "scale"),
                                   #metric="ROC",
                                   tuneGrid=expand.grid(ncomp=3,# When there are multiple dimensinos that can elucidate the change of data, multiple number of ncompis suggested. Here the data could have been confounded by seasonal effect, ncomp = 2 was ammended.
                                                        keepX=seq(10, df_ml_train %>% length %>% (\(x) floor(x / 10) * 10)(), 10),
                                                        keepY=1),
                                   trControl=ctrl_n5)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in train.default(x, y, weights = w, ...): cannnot compute class
## probabilities for regression

Model evaluation

spls_evaluation <- function(model_list) {
        outcome <- list()
        pred <- model_list
        bestRep <- model_list
        savedPreds <- model_list
        cvauc.train <- model_list
        for(i in 1:length(model_list)){
                pred[[i]] <- model_list[[i]] %>% 
                        mixOmicsCaret::get_best_predictions() %>% 
                        group_by(Rep) %>% 
                        summarize(auc=as.numeric(pROC::auc(obs, pred, direction="<"))) %>% 
                        arrange(desc(auc))
                bestRep[[i]] <- model_list[[i]] %>% 
                        mixOmicsCaret::get_best_predictions() %>%
                        group_by(Rep) %>% 
                        summarize(auc=as.numeric(suppressMessages(pROC::auc(obs, pred, direction="<")))) %>% 
                        arrange(desc(auc)) %>% 
                        (\(x) x[1, "Rep"])() %>% 
                        as.character()
                # extract predictions for calculating CV AUC from best repetition
                 savedPreds[[i]] <- model_list[[i]] %>% 
                         mixOmicsCaret::get_best_predictions() %>% 
                         filter(Rep==bestRep[[i]])
                 cvauc.train[[i]] <- pROC::auc(obs ~ pred, direction="<", data=savedPreds[[i]])
        }
                outcome$pred <- pred
                outcome$bestRep <- bestRep
                outcome$savedPreds <- savedPreds
                outcome$cvauc.train <- cvauc.train
                outcome
}

eval_list_spls_t1 <- spls_evaluation(list(spls_model_clr_treatment5)) %>% suppressMessages()
## Warning: `filter_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `filter()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the mixOmicsCaret package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
list(spls_model_clr_treatment5)[[1]]$pred %>%
  separate(Resample, c("Fold", "Rep")) %>%
  group_by(ncomp, keepX, Rep) %>%
  summarize(auc = pROC::auc(predictor = pred, obs, direction = "<") %>% suppressMessages() %>% as.numeric, .groups = "drop") %>%
  ggplot(aes(x = factor(keepX), y = auc)) +
        geom_boxplot(outlier.shape = NA) +
        geom_jitter(aes(color = factor(ncomp)), alpha = 0.2, show.legend = FALSE) +
        geom_hline(yintercept = 0.5) +
        #ylim(c(0.6,0.9)) +
        facet_grid(. ~ ncomp) +
        scale_color_brewer(palette = "Set1", name = NULL) +
        theme_bw() + theme(strip.background = element_blank()) +
        xlab("Number of variables") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        ggtitle("Predictino result of treatment 5, facetted by ncomp")

mixomics & caret (sample type)

#creating data 

df_ml_st3 <- my_phyloseq_da %>% 
        # transform_sample_counts(., function(x){x/sum(x)}) %>% # microbiome packages normalizes otu_table and then transfrom with CLR 
        {
                data <- my_phyloseq_da
                otu_table <- otu_table(data) %>% 
                        t() %>%
                        data.frame()
                sample_data <- sample_data(data) %>% 
                        data.frame()
                otu_table %>%
                        mutate(`sample_type_3` = as.factor(sample_data$sample_type == "sample_type_3"))
                }

#Splitting data - train vs test 
set.seed(seed)
train_index_st3 <- createDataPartition(df_ml_st3$sample_type_3, p = 0.8, list = FALSE)
df_ml_train_st3 <- df_ml_st3[train_index_st3, ]
df_ml_test_st3 <- df_ml_st3[-train_index_st3, ]


#Setting parameters for cross-validation
ctrl_n5 <- caret::trainControl(
    method = "repeatedcv",
    number = 5, # High dimensional data. 1/5 of data will be used for the test set, therefore 80% is train set and 20% is test set.
        # Earlier analysis showed the pattern between number = 5 & number = 10 were similar
    repeats = 10,
    returnResamp = "all",
    savePredictions = "all",
    classProbs = TRUE,            # Enable class probabilities
    allowParallel = TRUE,
    verboseIter = FALSE            # Show progress
)


spls_model_clr_st3_t1 <- caret::train(as.numeric(`sample_type_3`) ~ ., 
                                   data=df_ml_train_st3,
                                   method=mixOmicsCaret::get_mixOmics_spls(),
                                   preProc=c("center", "scale"),
                                   #metric="ROC",
                                   tuneGrid=expand.grid(ncomp=3,# When there are multiple dimensinos that can elucidate the change of data, multiple number of ncompis suggested. Here the data could have been confounded by seasonal effect, ncomp = 2 was ammended.
                                                        keepX=seq(10, df_ml_train_st3 %>% length %>% (\(x) floor(x / 10) * 10)(), 10),
                                                        keepY=1),
                                   trControl=ctrl_n5)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in train.default(x, y, weights = w, ...): cannnot compute class
## probabilities for regression

Model evaluation

eval_list_spls_st3_t1 <- spls_evaluation(list(spls_model_clr_st3_t1)) %>% suppressMessages()

list(spls_model_clr_st3_t1)[[1]]$pred %>%
  separate(Resample, c("Fold", "Rep")) %>%
  group_by(ncomp, keepX, Rep) %>%
  summarize(auc = pROC::auc(predictor = pred, obs, direction = "<") %>% suppressMessages() %>% as.numeric, .groups = "drop") %>%
  ggplot(aes(x = factor(keepX), y = auc)) +
        geom_boxplot(outlier.shape = NA) +
        geom_jitter(aes(color = factor(ncomp)), alpha = 0.2, show.legend = FALSE) +
        geom_hline(yintercept = 0.5) +
        #ylim(c(0.6,0.9)) +
        facet_grid(. ~ ncomp) +
        scale_color_brewer(palette = "Set1", name = NULL) +
        theme_bw() + theme(strip.background = element_blank()) +
        xlab("Number of variables") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        ggtitle("Prediction result of sample type 3, facetted by ncomp")

Parameter tuning

spls_model_clr_st3_t2 <- caret::train(as.numeric(`sample_type_3`) ~ ., 
                                   data=df_ml_train_st3,
                                   method=mixOmicsCaret::get_mixOmics_spls(),
                                   preProc=c("center", "scale"),
                                   #metric="ROC",
                                   tuneGrid=expand.grid(ncomp=2,# When there are multiple dimensinos that can elucidate the change of data, multiple number of ncompis suggested. Here the data could have been confounded by seasonal effect, ncomp = 2 was ammended.
                                                        keepX=1:30,
                                                        keepY=1),
                                   trControl=ctrl_n5)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in train.default(x, y, weights = w, ...): cannnot compute class
## probabilities for regression
list(spls_model_clr_st3_t2)[[1]]$pred %>%
  separate(Resample, c("Fold", "Rep")) %>%
  group_by(ncomp, keepX, Rep) %>%
  summarize(auc = pROC::auc(predictor = pred, obs, direction = "<") %>% suppressMessages() %>% as.numeric, .groups = "drop") %>%
  ggplot(aes(x = factor(keepX), y = auc)) +
        geom_boxplot(outlier.shape = NA) +
        geom_jitter(aes(color = factor(ncomp)), alpha = 0.2, show.legend = FALSE) +
        geom_hline(yintercept = 0.5) +
        #ylim(c(0.6,0.9)) +
        facet_grid(. ~ ncomp) +
        scale_color_brewer(palette = "Set1", name = NULL) +
        theme_bw() + theme(strip.background = element_blank()) +
        xlab("Number of variables") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        ggtitle("Prediction result of sample type 3, after parameter tuning")

# Final tuning

spls_model_clr_st3_t3 <- caret::train(as.numeric(`sample_type_3`) ~ ., 
                                   data=df_ml_train_st3,
                                   method=mixOmicsCaret::get_mixOmics_spls(),
                                   preProc=c("center", "scale"),
                                   #metric="ROC",
                                   tuneGrid=expand.grid(ncomp=2,# When there are multiple dimensinos that can elucidate the change of data, multiple number of ncompis suggested. Here the data could have been confounded by seasonal effect, ncomp = 2 was ammended.
                                                        keepX=12,
                                                        keepY=1),
                                   trControl=ctrl_n5)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in train.default(x, y, weights = w, ...): cannnot compute class
## probabilities for regression
tibble::tibble(
  model_name = c("T1", "T1", "T2", "T2", "T3", "T3"),
  dataset = c("Train", "Test", "Train", "Test", "Train", "Test"),
  auc = c(
    pROC::roc(df_ml_train_st3$sample_type_3, #actual value
        predict(spls_model_clr_st3_t1, newdata = df_ml_train_st3), #prediction
        quiet = TRUE)$auc, #metric
    pROC::roc(df_ml_test_st3$sample_type_3,
        predict(spls_model_clr_st3_t1, newdata = df_ml_test_st3),
        quiet = TRUE)$auc,

    pROC::roc(df_ml_train_st3$sample_type_3, #actual value
        predict(spls_model_clr_st3_t2, newdata = df_ml_train_st3),
        quiet = TRUE)$auc,
    pROC::roc(df_ml_test_st3$sample_type_3,
        predict(spls_model_clr_st3_t2, newdata = df_ml_test_st3),
        quiet = TRUE)$auc,
    
    pROC::roc(df_ml_train_st3$sample_type_3, #actual value
        predict(spls_model_clr_st3_t3, newdata = df_ml_train_st3),
        quiet = TRUE)$auc,
    pROC::roc(df_ml_test_st3$sample_type_3,
        predict(spls_model_clr_st3_t3, newdata = df_ml_test_st3),
        quiet = TRUE)$auc
  ))
## # A tibble: 6 × 3
##   model_name dataset   auc
##   <chr>      <chr>   <dbl>
## 1 T1         Train   0.999
## 2 T1         Test    0.967
## 3 T2         Train   0.996
## 4 T2         Test    0.94 
## 5 T3         Train   0.995
## 6 T3         Test    0.94

sPLS loadings

dat_train <- df_ml_train_st3 
y <- as.numeric(dat_train$sample_type_3 == T)

# Data preparation
fm <- spls_model_clr_st3_t2$finalModel
scores <- fm$variates$X              # (sample × comp)
loadX  <- fm$loadings$X              # (feature × comp)


# 1) Component sign alignment
sign_flip <- rep(1, ncol(scores))

for (c in seq_len(ncol(scores))) {
  if (cor(scores[, c], y, use = "complete.obs") < 0) {
    sign_flip[c] <- -1
  }
}
scores <- sweep(scores, 2, sign_flip, `*`)
loadX  <- sweep(loadX,  2, sign_flip, `*`)

# 2) Loading tables
load_tab <- map_dfr(seq_len(ncol(loadX)), function(c){
  tibble(
    var     = rownames(loadX),
    loading = as.numeric(loadX[, c]),
    comp    = c
  )
})


load_tab <- load_tab #%>% mutate(Species = gsub("^X[.]", "", var))

# 3) delata mean calculaiton
feat <- unique(load_tab$var)
X <- dat_train[, feat, drop = FALSE]
ok <- complete.cases(y, X)
delta <- colMeans(X[ok & y==1, , drop=FALSE]) - colMeans(X[ok & y==0, , drop=FALSE])
annot <- tibble(var = names(delta), delta = as.numeric(delta),
                `Sample type 3` = factor(ifelse(delta>0, "+ Sample type 3","- Sample type 3"),
                               levels=c("+ Sample type 3","- Sample type 3")))

plot_df <- load_tab %>%
  left_join(annot, by="var")

# 4) annotation and plotting
topK <- spls_model_clr_st3_t2$bestTune$keepX
plot_df_top <- plot_df %>%
  group_by(comp) %>%
  slice_max(order_by = abs(loading), n = topK, with_ties = FALSE) %>%
  ungroup()


spls_laoding_data <- rbind(
        left_join(
                plot_df_top %>%
                        mutate(tax=gsub("^X[.]", "", var), Species=paste0("s__", var)),
                my_phyloseq_da %>% 
                        tax_table() %>% data.frame, by="Species") %>%  
                arrange(desc(abs(loading))) %>% 
                #head(list_spls_final$spls_model_clr_ipm_nc2_kx32$bestTune$keepX*2) %>% 
                dplyr::select(-var) %>%
                mutate(data  = "Sample type 3"))

spls_laoding_data$Phylum <- gsub("_", "-", spls_laoding_data$Phylum)

col_list <- spls_laoding_data %>% 
        .$`Sample type 3` %>% 
        unique %>%
        .[order(.)]

col_list <- col_list %>%{
        data <- .
        colors_class <- MetBrewer::met.brewer("Austria", n =length(data)) %>% c()
        #colors_class <- c("#a6cee3", "#1f78b4" , "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c")
        names(colors_class) <- data
        colors_class
}


spls_laoding_data %>%
        head(spls_model_clr_st3_t3$bestTune$keepX * 2) %>% 
        mutate(data  = "Sample type 3",
               comp = dplyr::case_when(
                       comp == "1" ~ "Component 1",
                       comp == "2" ~ "Component 2",
                       TRUE ~ NA_character_
                       )) 
## # A tibble: 24 × 13
##    loading comp    delta `Sample type 3` tax   Species Domain Phylum Class Order
##      <dbl> <chr>   <dbl> <fct>           <chr> <chr>   <chr>  <chr>  <chr> <chr>
##  1   0.554 Compo…  2.63  + Sample type 3 Stre… s__Str… k__Ba… p--Fi… c__B… o__L…
##  2  -0.519 Compo… -6.35  - Sample type 3 Cuti… s__Cut… k__Ba… p--Ac… c__A… o__P…
##  3   0.380 Compo…  2.93  + Sample type 3 Allo… s__All… k__Ba… p--Ba… c__B… o__B…
##  4   0.349 Compo…  3.96  + Sample type 3 Stap… s__Sta… k__Ba… p--Fi… c__B… o__B…
##  5   0.344 Compo…  2.10  + Sample type 3 Camp… s__Cam… k__Ba… p--Pr… c__E… o__C…
##  6   0.326 Compo…  3.96  + Sample type 3 Stap… s__Sta… k__Ba… p--Fi… c__B… o__B…
##  7   0.289 Compo…  0.866 + Sample type 3 Stre… s__Str… k__Ba… p--Fi… c__B… o__L…
##  8   0.275 Compo…  2.04  + Sample type 3 Serr… s__Ser… k__Ba… p--Pr… c__G… o__E…
##  9   0.274 Compo…  2.09  + Sample type 3 Allo… s__All… k__Ba… p--Ba… c__B… o__B…
## 10   0.274 Compo…  1.71  + Sample type 3 Nega… s__Neg… k__Ba… p--Fi… c__N… o__V…
## # ℹ 14 more rows
## # ℹ 3 more variables: Family <chr>, Genus <chr>, data <chr>
sPLS_loadings_plot <- spls_laoding_data %>%
        head(spls_model_clr_st3_t3$bestTune$keepX * 2) %>% 
        mutate(data  = "Sample type 3",
               comp = dplyr::case_when(
                       comp == "1" ~ "Component 1",
                       comp == "2" ~ "Component 2",
                       TRUE ~ NA_character_
                       )) %>%
        ggplot(aes(fct_reorder(tax, loading),
                   loading, fill=`Sample type 3`)) +
        geom_bar(stat="identity", color="black", lwd=.2) + 
        coord_flip() + 
        ylab("sPLS loadings") +
        facet_wrap(~comp, ncol = 1, scale = "free_y") +
        xlab("Sample type 3 signature at species level") +
        scale_fill_manual(values = col_list)  +
        scale_x_discrete(labels = function(x) as.expression(parse(text = species_italic(x)))) +  
        theme_classic() +
        theme(legend.position = "top",
              text=element_text(size=10, color = "black"),
              axis.text.y=element_text(color = "black"),
              axis.text.x=element_text(color = "black"))


sPLS_loadings_plot

Random forest & caret

# Random forest & caret ----------------------------------------------------

df_rf <- df_ml_st3
df_rf$sample_type_3 <- factor(df_rf$sample_type_3,
                              levels = c(FALSE, TRUE),
                              labels = c("Other", "Type3"))

set.seed(seed)
train_index_rf <- createDataPartition(df_rf$sample_type_3, p = 0.8, list = FALSE)
df_rf_train <- df_rf[train_index_rf, ]
df_rf_test  <- df_rf[-train_index_rf, ]

# caret's trainControl settings (ROC)
ctrl_rf <- caret::trainControl(
        method = "repeatedcv",
        number = 5,
        repeats = 5,
        returnResamp = "all",
        savePredictions = "final",
        classProbs = TRUE,
        summaryFunction = twoClassSummary,
        allowParallel = TRUE)

# Random forest 
rf_model_st3 <- caret::train(sample_type_3 ~ .,
                             data = df_rf_train,
                             method = "rf",
                             metric = "ROC",
                             trControl = ctrl_rf,
                             tuneLength = 5,
                             importance = TRUE
)

rf_model_st3
## Random Forest 
## 
## 126 samples
## 335 predictors
##   2 classes: 'Other', 'Type3' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 100, 102, 101, 101, 100, 101, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec 
##     2   0.8743429  0.9115238  0.512
##    85   0.9885429  0.9860952  0.718
##   168   0.9956762  0.9960000  0.812
##   251   0.9971762  0.9960000  0.812
##   335   0.9962643  0.9980000  0.828
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 251.

RF evaluation

# Test set prediction
rf_prob_test <- predict(rf_model_st3, newdata = df_rf_test, type = "prob")
rf_pred_test <- predict(rf_model_st3, newdata = df_rf_test, type = "raw")

# Confusion matrix
caret::confusionMatrix(rf_pred_test, df_rf_test$sample_type_3)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Other Type3
##      Other    25     0
##      Type3     0     6
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8878, 1)
##     No Information Rate : 0.8065     
##     P-Value [Acc > NIR] : 0.00127    
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8065     
##          Detection Rate : 0.8065     
##    Detection Prevalence : 0.8065     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Other      
## 
# ROC & AUC
rf_roc_test <- pROC::roc(
  response  = df_rf_test$sample_type_3,
  predictor = rf_prob_test[,"Type3"],
  levels    = c("Other", "Type3"),
  direction = "<"
)

rf_roc_test$auc
## Area under the curve: 1
# ROC curve
rf_roc_df <- tibble::tibble(
  fpr = 1 - rf_roc_test$specificities,
  tpr = rf_roc_test$sensitivities
)

rf_roc_df %>%
        ggplot(aes(x = fpr, y = tpr)) +
        geom_line() +
        geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey70") +
        coord_equal() +
        theme_classic() +
        labs(x = "1 - Specificity",
             y = "Sensitivity",
             title = paste0("Random forest ROC (Test AUC = ", round(as.numeric(rf_roc_test$auc), 3), ")"))

RF variable importance

# Extraction of variable importance 
rf_varimp_raw <- caret::varImp(rf_model_st3, scale = TRUE)$importance %>%
  tibble::rownames_to_column("feature") %>%
  arrange(desc(Type3))

# Top 30
rf_varimp_top <- rf_varimp_raw %>%
  head(30) %>%
  dplyr::left_join(
    my_phyloseq_da %>%
      tax_table() %>%
      as("matrix") %>%
      data.frame(stringsAsFactors = FALSE) %>%
      tibble::rownames_to_column("feature"),
    by = "feature"
  ) %>%
  mutate(
    label = species_italic(feature) 
  )

rf_varimp_top %>%
  ggplot(aes(x = reorder(feature, Type3), y = Type3)) +
  geom_col(color = "black") +
  coord_flip() +
  scale_x_discrete(labels = function(x) {
    
    sapply(x, function(xx) parse(text = species_italic(xx)))
  }) +
  theme_classic(base_family = "sans") +
  labs(
    x = "Top species (random forest importance)",
    y = "Importance (scaled)",
    title = "Top 30 features contributing to RF model (sample_type_3)"
  )

lasso using SIAMCAT

# SIAMCAT obejct preparation
meta_siam <- sample_data(my_phyloseq) %>%
  data.frame() %>%
  mutate(
    sample_type_3 = factor(sample_type == "sample_type_3",
                           levels = c(FALSE, TRUE),
                           labels = c("Other", "Type3"))
  )

# SIAMCAT table creation (taxa and sample)
feat_siam <- my_phyloseq %>%
  otu_table() %>%
  as("matrix")  # taxa and sample 

# Labeling
label_siam <- SIAMCAT::create.label(
  meta    = meta_siam,
  label   = "sample_type_3",
  case    = "Type3",
  control = "Other"
)
## Label used as case:
##    Type3
## Label used as control:
##    Other
## + finished create.label.from.metadata in    0 s
# SIAMCAT object 
sc.obj <- SIAMCAT::siamcat(
  feat  = feat_siam,
  meta  = meta_siam,
  label = label_siam
)
## + starting validate.data
## Warning in validate.data(sc, verbose = verbose): The data do not seem to
## consist of relative abundances! (values ranging between 0 and 1)
## +++ checking overlap between labels and features
## + Keeping labels of 157 sample(s).
## +++ checking sample number per class
## +++ checking overlap between samples and metadata
## + finished validate.data in 0.264 s
# feature filtering & normalization
# abundance filtering

sc.obj <- SIAMCAT::filter.features(
  sc.obj,
  filter.method = "abundance",
  cutoff        = 0.001,
  rm.unmapped   = TRUE
)
## Features successfully filtered
##### log transform

sc.obj <- SIAMCAT::normalize.features(
  sc.obj,
  norm.method = "log.std"
)
## Features normalized successfully.
# Splitting data
set.seed(seed)
sc.obj <- SIAMCAT::create.data.split(
  sc.obj,
  num.folds   = 5,
  num.resample = 5,
  stratify    = TRUE
)
## Features splitted for cross-validation successfully.
# Training with LASSO 
sc.obj <- SIAMCAT::train.model(
  sc.obj,
  method = "lasso"
)
## Trained lasso models successfully.
# Prediction
sc.obj <- SIAMCAT::make.predictions(sc.obj)
## Made predictions successfully.
sc.obj <- SIAMCAT::evaluate.predictions(sc.obj)
## Evaluated predictions successfully.
# Plot everyting (ROC, PR, etc.)

model.evaluation.plot(sc.obj)[[1]]

## NULL
model.interpretation.plot(sc.obj, fn.plot = 'Inha/5_Lectures/2_IBS7048_Metagenomics/2025F/scripts/IBS7048_Advanced_metagenomics/interpretation.pdf',
    consens.thres = 0.5, limits = c(-3, 3), heatmap.type = 'zscore')
## Warning in arrows(y0 = c(seq_along(med)) - 0.5, x0 = -upp.qt, x1 = -low.qt, :
## zero-length arrow is of indeterminate angle and so skipped
## Successfully plotted model interpretation plot to: Inha/5_Lectures/2_IBS7048_Metagenomics/2025F/scripts/IBS7048_Advanced_metagenomics/interpretation.pdf

Bibliography

## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## version 1.4.5, <https://CRAN.R-project.org/package=readxl>.
## graphics of microbiome census data. Paul J. McMurdie and Susan Holmes (2013) PLoS ONE 8(4):e61217.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## Package: Tests in Linear Mixed Effects Models." Journal of Statistical Software, *82*(13), 1-26. doi:10.18637/jss.v082.i13 <https://doi.org/10.18637/jss.v082.i13>.
## Package. Journal of Statistical Software, 28(5), 1–26. https://doi.org/10.18637/jss.v028.i05
## cross-disease comparison enabled by the SIAMCAT machine learning toolbox. Genome Biol 22, 93 (2021). https://doi.org/10.1186/s13059-021-02306-1
## package for 'omics feature selection and multiple data integration. PLoS computational biology 13(11):e1005752
## 0.1.1, commit b9ef1d3683fb14c78c78f63a7658cb04b1169e87, <https://github.com/jonathanth/mixOmicsCaret>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.