suppressPackageStartupMessages({
library(curatedMetagenomicData)
library(dplyr)
library(caret)
library(pROC)
})Healthy vs. Cancer: Random Forest classification model
Abstract
Reproduce Hannigan et.al. (mBio, 2018) paper
Setup
Load packages
Custom functions
#' Function to preprocess data
#'
#' @param data A matrix with taxa (row) and samples (column), stored in the
#' assay slot of the TreeSummarizedExperiment object.
#' @param min Integer(1). The minimum number of samples detecting the feature.
#' Default is 30.
#'
preprocess_data <- function(data, min = 30) {
keep_features <- rowSums(data > 0) >= min
data <- data[keep_features, ]
return(data)
}
#' Evaluate Random Forest classification model using ROC curve
#'
#' @import caret
#'
#' @param data A matrix with taxa (row) and samples (column), stored in the
#' assay slot of the TreeSummarizedExperiment object.
#' @param labels A vector of outcomes. The number of samples and the length of
#' labels should be same.
#' @param p The percentage of data that goes to training. Between 0 and 1. Default is 0.8.
#' @param aucOnly If `TRUE`, this function returns only the AUC.
#'
#' @examples
#' se <- curated_cmd |>
#' filter(study_name == "HanniganGD_2017") |>
#' filter(disease %in% c("Healthy", "Colorectal Carcinoma")) |>
#' select(where(~ !all(is.na(.x)))) |>
#' returnSamples("relative_abundance", rownames = "short")
#'
#' data <- assay(se)
#' labels <- colData(se)$disease %>% factor
#' evaluateRFmodel(data, labels)
#'
evaluateRFmodel <- function(data, labels, p = 0.8, aucOnly = FALSE) {
## Preprocess data
data <- preprocess_data(data)
## Split data into training and test sets
train_index <- createDataPartition(labels, p = p, list = FALSE)
train_data <- data[,train_index] |> t()
train_labels <- labels[train_index]
test_data <- data[,-train_index] |> t()
test_labels <- labels[-train_index]
## Train binary random forest model with nested cross-validation
rf_model <- train(
train_data, train_labels,
method = "rf", # random forest
metric = "Accuracy",
trControl = trainControl(
method = "repeatedcv", # repeating cross-validation
number = 5, # number of re-sampling iterations
repeats = 5,
search = "random"),
tuneLength = 20
)
## Evaluate model performance on test set
predictions <- predict(rf_model, newdata = test_data, type = "prob")
## Generate ROC curve
lvs <- levels(train_labels)
roc_obj <- roc(response = factor(test_labels, levels = lvs),
predictor = predictions[, 2],
levels = rev(lvs))
if (isTRUE(aucOnly)) {
return(roc_obj$auc)
} else {
plot(roc_obj, print.auc = TRUE, main = "ROC Curve") # plot ROC curve
}
}HanniganGD_2017 only
Load data
Hannigan_se <- sampleMetadata |>
filter(study_name == "HanniganGD_2017") |>
filter(disease %in% c("healthy", "CRC")) |>
select(where(~ !all(is.na(.x)))) |>
returnSamples("relative_abundance", rownames = "short")
data <- assay(Hannigan_se) # 289 taxa x 55 samples
labels <- colData(Hannigan_se)$disease %>% factorROC curve
set.seed(1234)
evaluateRFmodel(data, labels)
From harmonized metadata
Load data
curated_cmd <- read.csv("~/OmicsMLRepo/OmicsMLRepoData/inst/extdata/cMD_curated_metadata_release.csv")sub <- curated_cmd |>
filter(disease %in% c("Healthy", "Colorectal Carcinoma")) |>
filter(body_site == "feces") |>
filter(country == "United States") |>
filter(age_group %in% c("Adult", "Elderly")) |>
filter(antibiotics_current_use == "no") |>
filter(westernized == "Yes") |>
select(where(~ !all(is.na(.x)))) # 421 samplesROC with the same number of samples
set.seed(1234)
random_ind <- sample(nrow(sub),
55, # the same number of samples like HanniganGD_2018
replace = FALSE)
se_sub <- sub[random_ind,] |>
returnSamples("relative_abundance", rownames = "short")
data <- assay(se_sub)
labels <- colData(se_sub)$disease %>% factorset.seed(1234)
evaluateRFmodel(data, labels)
ROC with more samples
set.seed(1234)
random_ind2 <- sample(nrow(sub), 100, replace = FALSE)
se_sub2 <- sub[random_ind2,] |>
returnSamples("relative_abundance", rownames = "short")
data <- assay(se_sub2)
labels <- colData(se_sub2)$disease %>% factorset.seed(1234)
evaluateRFmodel(data, labels)