Loading packages
Loading data
my_phyloseq <- readRDS("Inha/5_Lectures/2_IBS7048_Metagenomics/2025F/scripts/IBS7048_Advanced_metagenomics/phyloseq_ex_host_dep_deidentified.rds")
DA analysis results
# Before running DA analysis, you need to double check the distribution of your data
set.seed(seed)
# I did CLR transform with `microbiome` package ---------------------------------
my_phyloseq_da <- my_phyloseq %>% # set your file here
subset_taxa(., Domain %in% "k__Bacteria") %>%
prune_taxa(taxa_sums(.) > 0, .) %>%
microbiome::transform(., 'clr')
sample_data(my_phyloseq_da) <- sample_data(my_phyloseq) %>% # set your file here
data.frame() %>%
mutate(treatment = case_when(
control == 1 ~ "0",
depletion_method1 == 1 ~ "1",
depletion_method2 == 1 ~ "2",
depletion_method3 == 1 ~ "3",
depletion_method4 == 1 ~ "4",
depletion_method5 == 1 ~ "5",
.default = NA_character_)) %>%
sample_data
#dummy <- compositions::clr(otu_table(phyloseq_metaphlan$phy_microbe_rel)) %>% data.frame()
#dummy2 <- otu_table(physeq_mic_clr_school) %>% data.frame()
# LME4 for CLR normalized data --------------------------------------------
lmer_function <- function(phyloseq_lmer) {
lmer_taxa_rel_clr <- data.frame()
all <- phyloseq_lmer %>% taxa_sums() %>% length()
for(i in 1:all) {
#Creating a data frame that includes CLR transformed data of i-th bug.
#making differnt otu tables for each sample type
otu_table <- phyloseq_lmer %>% otu_table %>% t
#tax table for different sample type
taxa_data <- otu_table[,i] %>% data.frame()
#Making a merged dataframe having sample data and CLR transformed output
lme_data <- merge(taxa_data, sample_data(phyloseq_lmer), by = 0) %>% column_to_rownames("Row.names")
#generating a character of formula.
#Here, as the taxa are already CLR transformed, I did not make model at log-scale.
lme4_formula <- paste(names(taxa_data), "~ treatment + (1|subject_id)") # <-- modify your model here
class_result <- tryCatch({
model <- lmerTest::lmer(formula = lme4_formula, data = lme_data)
summary(model) %>%
.$coefficients %>%
data.frame(check.names = FALSE) %>%
mutate(feature = colnames(otu_table)[i]) %>%
rownames_to_column("value") %>%
subset(value %in% c("treatment1", "treatment2","treatment3","treatment4", "treatment5")) # <-- modify based on your model here
} %>% suppressMessages(),
error = function(e) {
message(paste("Error in model for taxon", i, ":", e$message))
return(NULL)
})
#row binding all the associations of i-th taxa to one data frame
lmer_taxa_rel_clr <- rbind(lmer_taxa_rel_clr,
class_result) %>%
remove_rownames()
}
lmer_taxa_rel_clr
}
da_result <- lmer_function(my_phyloseq_da)# set your file here
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00278885 (tol = 0.002, component 1)
da_result <- da_result %>%
mutate(qval = qvalue::qvalue(`Pr(>|t|)`, lambda = 0)$qvalue,
p_bh = p.adjust(p = `Pr(>|t|)`, method = "BH")) %>%
suppressMessages()
Table visualization
da_result %>% reactable::reactable(searchable = T, sortable = T, filterable = T)
species_italic <- function(data) { # richtext formatting with Phage
clean <- gsub("_A|_B", "", data)
clean <- gsub("_", " ", clean)
clean <- gsub("[.]", " ", clean)
clean <- gsub("\\[|\\]", "", clean)
clean <- trimws(gsub("\\s+", " ", clean))
# 1) Case of phage
out <- ifelse(
grepl("(?i)host\\s*unknown", clean), # Host unknown --> regular font
paste0("'", clean, "'"),
NA_character_
)
is_phage_code <- grepl("(?i)^vsgb\\d+", clean) # find vSGBxxxxx
out[is.na(out) & is_phage_code] <- vapply(clean[is.na(out) & is_phage_code], function(x) {
toks <- strsplit(x, "\\s+")[[1]]
gidxs <- which(grepl("^[A-Z][a-z]+$", toks)) # Genus
if (length(gidxs)) {
genus <- toks[tail(gidxs, 1)]
prefix <- sub(paste0("\\s*", genus, "$"), "", x)
paste0("'", prefix, "'~italic('", genus, "')")
} else {
paste0("'", x, "'") # no Genus: regular font
}
}, character(1))
remain <- is.na(out)
out[remain] <- ifelse(
grepl(" sp", clean[remain]), {
parts <- strsplit(clean[remain], " sp", fixed = TRUE)
genus <- vapply(parts, `[`, character(1), 1)
strain <- vapply(parts, function(p) ifelse(length(p) > 1, trimws(p[2]), ""), character(1))
paste0("italic('", genus, "')~'sp.'~'", strain, "'")
},
paste0("italic('", clean[remain], "')")
)
return(out)
}
da_result <- bind_rows(da_result, .id = "Taxa") %>%
# column_to_rownames(feature)
left_join(.,
my_phyloseq_da %>%
tax_table() %>%
as("matrix") %>%
as.data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column("feature")) %>%
mutate(alpha = if_else(qval <= 0.1, 0.9, 0.05),
## lables interaction only
feature = species_italic(feature),
label_candidate = case_when(
qval <= 0.1 ~ feature,
TRUE ~ NA_character_),
## label separation (left & right)
label_left = if_else(Estimate < 0, label_candidate, NA_character_),
label_right = if_else(Estimate > 0, label_candidate, NA_character_))
## Joining with `by = join_by(feature)`
da_result %>%
ggplot(. ,
aes(x = Estimate, y = -log10(qval), color = value)) +
geom_hline(yintercept = 1, color = "gray70") + # q=0.1
geom_vline(xintercept = 0, color = "gray70") +
geom_point(, size = 2, stroke = 0)
da_result %>%
ggplot(. ,
aes(x = Estimate, y = -log10(qval), color = value)) +
geom_hline(yintercept = 1, color = "gray70") + # q=0.1
geom_vline(xintercept = 0, color = "gray70") +
geom_point(, size = 2, stroke = 0) +
facet_wrap(~ value, ncol = 3)
da_result %>%
ggplot(. ,
aes(x = Estimate, y = -log10(qval), color = value)) +
geom_hline(yintercept = 1, color = "gray70") + # q=0.1
geom_vline(xintercept = 0, color = "gray70") +
geom_point(, size = 2, stroke = 0) +
facet_wrap(~ value, ncol = 3) +
labs(x = "Change estimate of CLR-transformed relative abundance",
y = "-log~10~ *q*-value") +
theme_classic(base_family = "sans") +
theme(legend.position = "top",
axis.title.y = ggtext::element_markdown(),
axis.title.x = ggtext::element_markdown(),
strip.text = element_text(face = "bold")) +
ggrepel::geom_text_repel(
aes(label = label_right),
#color = MetBrewer::met.brewer("Austria"),
data = subset(da_result, !is.na(label_right)),
box.padding = unit(0.35, "lines"),
grob = "richtext",
na.rm = TRUE, show.legend = FALSE, max.overlaps = 20,
size = 3, segment.size = 0.25,
nudge_x = 5, nudge_y = 2, direction = "y",
parse = T) +
ggrepel::geom_text_repel(
aes(label = label_left),
#color = MetBrewer::met.brewer("Austria", 2)[[1]],
data = subset(da_result, !is.na(label_left)),
box.padding = unit(0.35, "lines"),
na.rm = TRUE, show.legend = FALSE, max.overlaps = 20,
size = 3, segment.size = 0.25,
nudge_x = -5, nudge_y = 2, direction = "y",
parse = T)
## Warning in ggrepel::geom_text_repel(aes(label = label_right), data =
## subset(da_result, : Ignoring unknown parameters: `grob`
## Warning: ggrepel: 47 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 46 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 62 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 88 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 51 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 63 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
mixomics & caret
#creating data
df_ml <- my_phyloseq_da %>%
# transform_sample_counts(., function(x){x/sum(x)}) %>% # microbiome packages normalizes otu_table and then transfrom with CLR
{
data <- .
otu_table <- otu_table(data) %>%
t() %>%
data.frame()
sample_data <- sample_data(data) %>%
data.frame()
otu_table %>%
mutate(`treatment5` = as.factor(sample_data$depletion_method5))
}
#Splitting data - train vs test
train_index <- createDataPartition(df_ml$treatment5, p = 0.8, list = FALSE)
df_ml_train <- df_ml[train_index, ]
#Setting parameters for cross-validation
ctrl_n5 <- caret::trainControl(
method = "repeatedcv",
number = 5, # High dimensional data. 1/5 of data will be used for the test set, therefore 80% is train set and 20% is test set.
# Earlier analysis showed the pattern between number = 5 & number = 10 were similar
repeats = 10,
returnResamp = "all",
savePredictions = "all",
classProbs = TRUE, # Enable class probabilities
allowParallel = TRUE,
verboseIter = FALSE # Show progress
)
spls_model_clr_treatment5 <- caret::train(as.numeric(`treatment5`) ~ .,
data=df_ml_train,
method=mixOmicsCaret::get_mixOmics_spls(),
preProc=c("center", "scale"),
#metric="ROC",
tuneGrid=expand.grid(ncomp=3,# When there are multiple dimensinos that can elucidate the change of data, multiple number of ncompis suggested. Here the data could have been confounded by seasonal effect, ncomp = 2 was ammended.
keepX=seq(10, df_ml_train %>% length %>% (\(x) floor(x / 10) * 10)(), 10),
keepY=1),
trControl=ctrl_n5)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in train.default(x, y, weights = w, ...): cannnot compute class
## probabilities for regression
Model evaluation
spls_evaluation <- function(model_list) {
outcome <- list()
pred <- model_list
bestRep <- model_list
savedPreds <- model_list
cvauc.train <- model_list
for(i in 1:length(model_list)){
pred[[i]] <- model_list[[i]] %>%
mixOmicsCaret::get_best_predictions() %>%
group_by(Rep) %>%
summarize(auc=as.numeric(pROC::auc(obs, pred, direction="<"))) %>%
arrange(desc(auc))
bestRep[[i]] <- model_list[[i]] %>%
mixOmicsCaret::get_best_predictions() %>%
group_by(Rep) %>%
summarize(auc=as.numeric(suppressMessages(pROC::auc(obs, pred, direction="<")))) %>%
arrange(desc(auc)) %>%
(\(x) x[1, "Rep"])() %>%
as.character()
# extract predictions for calculating CV AUC from best repetition
savedPreds[[i]] <- model_list[[i]] %>%
mixOmicsCaret::get_best_predictions() %>%
filter(Rep==bestRep[[i]])
cvauc.train[[i]] <- pROC::auc(obs ~ pred, direction="<", data=savedPreds[[i]])
}
outcome$pred <- pred
outcome$bestRep <- bestRep
outcome$savedPreds <- savedPreds
outcome$cvauc.train <- cvauc.train
outcome
}
eval_list_spls_t1 <- spls_evaluation(list(spls_model_clr_treatment5)) %>% suppressMessages()
## Warning: `filter_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `filter()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the mixOmicsCaret package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
list(spls_model_clr_treatment5)[[1]]$pred %>%
separate(Resample, c("Fold", "Rep")) %>%
group_by(ncomp, keepX, Rep) %>%
summarize(auc = pROC::auc(predictor = pred, obs, direction = "<") %>% suppressMessages() %>% as.numeric, .groups = "drop") %>%
ggplot(aes(x = factor(keepX), y = auc)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(aes(color = factor(ncomp)), alpha = 0.2, show.legend = FALSE) +
geom_hline(yintercept = 0.5) +
#ylim(c(0.6,0.9)) +
facet_grid(. ~ ncomp) +
scale_color_brewer(palette = "Set1", name = NULL) +
theme_bw() + theme(strip.background = element_blank()) +
xlab("Number of variables") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Predictino result of treatment 5, facetted by ncomp")
mixomics & caret (sample type)
#creating data
df_ml_st3 <- my_phyloseq_da %>%
# transform_sample_counts(., function(x){x/sum(x)}) %>% # microbiome packages normalizes otu_table and then transfrom with CLR
{
data <- my_phyloseq_da
otu_table <- otu_table(data) %>%
t() %>%
data.frame()
sample_data <- sample_data(data) %>%
data.frame()
otu_table %>%
mutate(`sample_type_3` = as.factor(sample_data$sample_type == "sample_type_3"))
}
#Splitting data - train vs test
set.seed(seed)
train_index_st3 <- createDataPartition(df_ml_st3$sample_type_3, p = 0.8, list = FALSE)
df_ml_train_st3 <- df_ml_st3[train_index_st3, ]
df_ml_test_st3 <- df_ml_st3[-train_index_st3, ]
#Setting parameters for cross-validation
ctrl_n5 <- caret::trainControl(
method = "repeatedcv",
number = 5, # High dimensional data. 1/5 of data will be used for the test set, therefore 80% is train set and 20% is test set.
# Earlier analysis showed the pattern between number = 5 & number = 10 were similar
repeats = 10,
returnResamp = "all",
savePredictions = "all",
classProbs = TRUE, # Enable class probabilities
allowParallel = TRUE,
verboseIter = FALSE # Show progress
)
spls_model_clr_st3_t1 <- caret::train(as.numeric(`sample_type_3`) ~ .,
data=df_ml_train_st3,
method=mixOmicsCaret::get_mixOmics_spls(),
preProc=c("center", "scale"),
#metric="ROC",
tuneGrid=expand.grid(ncomp=3,# When there are multiple dimensinos that can elucidate the change of data, multiple number of ncompis suggested. Here the data could have been confounded by seasonal effect, ncomp = 2 was ammended.
keepX=seq(10, df_ml_train_st3 %>% length %>% (\(x) floor(x / 10) * 10)(), 10),
keepY=1),
trControl=ctrl_n5)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in train.default(x, y, weights = w, ...): cannnot compute class
## probabilities for regression
Model evaluation
eval_list_spls_st3_t1 <- spls_evaluation(list(spls_model_clr_st3_t1)) %>% suppressMessages()
list(spls_model_clr_st3_t1)[[1]]$pred %>%
separate(Resample, c("Fold", "Rep")) %>%
group_by(ncomp, keepX, Rep) %>%
summarize(auc = pROC::auc(predictor = pred, obs, direction = "<") %>% suppressMessages() %>% as.numeric, .groups = "drop") %>%
ggplot(aes(x = factor(keepX), y = auc)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(aes(color = factor(ncomp)), alpha = 0.2, show.legend = FALSE) +
geom_hline(yintercept = 0.5) +
#ylim(c(0.6,0.9)) +
facet_grid(. ~ ncomp) +
scale_color_brewer(palette = "Set1", name = NULL) +
theme_bw() + theme(strip.background = element_blank()) +
xlab("Number of variables") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Prediction result of sample type 3, facetted by ncomp")
Parameter tuning
spls_model_clr_st3_t2 <- caret::train(as.numeric(`sample_type_3`) ~ .,
data=df_ml_train_st3,
method=mixOmicsCaret::get_mixOmics_spls(),
preProc=c("center", "scale"),
#metric="ROC",
tuneGrid=expand.grid(ncomp=2,# When there are multiple dimensinos that can elucidate the change of data, multiple number of ncompis suggested. Here the data could have been confounded by seasonal effect, ncomp = 2 was ammended.
keepX=1:30,
keepY=1),
trControl=ctrl_n5)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in train.default(x, y, weights = w, ...): cannnot compute class
## probabilities for regression
list(spls_model_clr_st3_t2)[[1]]$pred %>%
separate(Resample, c("Fold", "Rep")) %>%
group_by(ncomp, keepX, Rep) %>%
summarize(auc = pROC::auc(predictor = pred, obs, direction = "<") %>% suppressMessages() %>% as.numeric, .groups = "drop") %>%
ggplot(aes(x = factor(keepX), y = auc)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(aes(color = factor(ncomp)), alpha = 0.2, show.legend = FALSE) +
geom_hline(yintercept = 0.5) +
#ylim(c(0.6,0.9)) +
facet_grid(. ~ ncomp) +
scale_color_brewer(palette = "Set1", name = NULL) +
theme_bw() + theme(strip.background = element_blank()) +
xlab("Number of variables") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Prediction result of sample type 3, after parameter tuning")
# Final tuning
spls_model_clr_st3_t3 <- caret::train(as.numeric(`sample_type_3`) ~ .,
data=df_ml_train_st3,
method=mixOmicsCaret::get_mixOmics_spls(),
preProc=c("center", "scale"),
#metric="ROC",
tuneGrid=expand.grid(ncomp=2,# When there are multiple dimensinos that can elucidate the change of data, multiple number of ncompis suggested. Here the data could have been confounded by seasonal effect, ncomp = 2 was ammended.
keepX=12,
keepY=1),
trControl=ctrl_n5)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in train.default(x, y, weights = w, ...): cannnot compute class
## probabilities for regression
tibble::tibble(
model_name = c("T1", "T1", "T2", "T2", "T3", "T3"),
dataset = c("Train", "Test", "Train", "Test", "Train", "Test"),
auc = c(
pROC::roc(df_ml_train_st3$sample_type_3, #actual value
predict(spls_model_clr_st3_t1, newdata = df_ml_train_st3), #prediction
quiet = TRUE)$auc, #metric
pROC::roc(df_ml_test_st3$sample_type_3,
predict(spls_model_clr_st3_t1, newdata = df_ml_test_st3),
quiet = TRUE)$auc,
pROC::roc(df_ml_train_st3$sample_type_3, #actual value
predict(spls_model_clr_st3_t2, newdata = df_ml_train_st3),
quiet = TRUE)$auc,
pROC::roc(df_ml_test_st3$sample_type_3,
predict(spls_model_clr_st3_t2, newdata = df_ml_test_st3),
quiet = TRUE)$auc,
pROC::roc(df_ml_train_st3$sample_type_3, #actual value
predict(spls_model_clr_st3_t3, newdata = df_ml_train_st3),
quiet = TRUE)$auc,
pROC::roc(df_ml_test_st3$sample_type_3,
predict(spls_model_clr_st3_t3, newdata = df_ml_test_st3),
quiet = TRUE)$auc
))
## # A tibble: 6 × 3
## model_name dataset auc
## <chr> <chr> <dbl>
## 1 T1 Train 0.999
## 2 T1 Test 0.967
## 3 T2 Train 0.996
## 4 T2 Test 0.94
## 5 T3 Train 0.995
## 6 T3 Test 0.94
sPLS loadings
dat_train <- df_ml_train_st3
y <- as.numeric(dat_train$sample_type_3 == T)
# Data preparation
fm <- spls_model_clr_st3_t2$finalModel
scores <- fm$variates$X # (sample × comp)
loadX <- fm$loadings$X # (feature × comp)
# 1) Component sign alignment
sign_flip <- rep(1, ncol(scores))
for (c in seq_len(ncol(scores))) {
if (cor(scores[, c], y, use = "complete.obs") < 0) {
sign_flip[c] <- -1
}
}
scores <- sweep(scores, 2, sign_flip, `*`)
loadX <- sweep(loadX, 2, sign_flip, `*`)
# 2) Loading tables
load_tab <- map_dfr(seq_len(ncol(loadX)), function(c){
tibble(
var = rownames(loadX),
loading = as.numeric(loadX[, c]),
comp = c
)
})
load_tab <- load_tab #%>% mutate(Species = gsub("^X[.]", "", var))
# 3) delata mean calculaiton
feat <- unique(load_tab$var)
X <- dat_train[, feat, drop = FALSE]
ok <- complete.cases(y, X)
delta <- colMeans(X[ok & y==1, , drop=FALSE]) - colMeans(X[ok & y==0, , drop=FALSE])
annot <- tibble(var = names(delta), delta = as.numeric(delta),
`Sample type 3` = factor(ifelse(delta>0, "+ Sample type 3","- Sample type 3"),
levels=c("+ Sample type 3","- Sample type 3")))
plot_df <- load_tab %>%
left_join(annot, by="var")
# 4) annotation and plotting
topK <- spls_model_clr_st3_t2$bestTune$keepX
plot_df_top <- plot_df %>%
group_by(comp) %>%
slice_max(order_by = abs(loading), n = topK, with_ties = FALSE) %>%
ungroup()
spls_laoding_data <- rbind(
left_join(
plot_df_top %>%
mutate(tax=gsub("^X[.]", "", var), Species=paste0("s__", var)),
my_phyloseq_da %>%
tax_table() %>% data.frame, by="Species") %>%
arrange(desc(abs(loading))) %>%
#head(list_spls_final$spls_model_clr_ipm_nc2_kx32$bestTune$keepX*2) %>%
dplyr::select(-var) %>%
mutate(data = "Sample type 3"))
spls_laoding_data$Phylum <- gsub("_", "-", spls_laoding_data$Phylum)
col_list <- spls_laoding_data %>%
.$`Sample type 3` %>%
unique %>%
.[order(.)]
col_list <- col_list %>%{
data <- .
colors_class <- MetBrewer::met.brewer("Austria", n =length(data)) %>% c()
#colors_class <- c("#a6cee3", "#1f78b4" , "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c")
names(colors_class) <- data
colors_class
}
spls_laoding_data %>%
head(spls_model_clr_st3_t3$bestTune$keepX * 2) %>%
mutate(data = "Sample type 3",
comp = dplyr::case_when(
comp == "1" ~ "Component 1",
comp == "2" ~ "Component 2",
TRUE ~ NA_character_
))
## # A tibble: 24 × 13
## loading comp delta `Sample type 3` tax Species Domain Phylum Class Order
## <dbl> <chr> <dbl> <fct> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 0.554 Compo… 2.63 + Sample type 3 Stre… s__Str… k__Ba… p--Fi… c__B… o__L…
## 2 -0.519 Compo… -6.35 - Sample type 3 Cuti… s__Cut… k__Ba… p--Ac… c__A… o__P…
## 3 0.380 Compo… 2.93 + Sample type 3 Allo… s__All… k__Ba… p--Ba… c__B… o__B…
## 4 0.349 Compo… 3.96 + Sample type 3 Stap… s__Sta… k__Ba… p--Fi… c__B… o__B…
## 5 0.344 Compo… 2.10 + Sample type 3 Camp… s__Cam… k__Ba… p--Pr… c__E… o__C…
## 6 0.326 Compo… 3.96 + Sample type 3 Stap… s__Sta… k__Ba… p--Fi… c__B… o__B…
## 7 0.289 Compo… 0.866 + Sample type 3 Stre… s__Str… k__Ba… p--Fi… c__B… o__L…
## 8 0.275 Compo… 2.04 + Sample type 3 Serr… s__Ser… k__Ba… p--Pr… c__G… o__E…
## 9 0.274 Compo… 2.09 + Sample type 3 Allo… s__All… k__Ba… p--Ba… c__B… o__B…
## 10 0.274 Compo… 1.71 + Sample type 3 Nega… s__Neg… k__Ba… p--Fi… c__N… o__V…
## # ℹ 14 more rows
## # ℹ 3 more variables: Family <chr>, Genus <chr>, data <chr>
sPLS_loadings_plot <- spls_laoding_data %>%
head(spls_model_clr_st3_t3$bestTune$keepX * 2) %>%
mutate(data = "Sample type 3",
comp = dplyr::case_when(
comp == "1" ~ "Component 1",
comp == "2" ~ "Component 2",
TRUE ~ NA_character_
)) %>%
ggplot(aes(fct_reorder(tax, loading),
loading, fill=`Sample type 3`)) +
geom_bar(stat="identity", color="black", lwd=.2) +
coord_flip() +
ylab("sPLS loadings") +
facet_wrap(~comp, ncol = 1, scale = "free_y") +
xlab("Sample type 3 signature at species level") +
scale_fill_manual(values = col_list) +
scale_x_discrete(labels = function(x) as.expression(parse(text = species_italic(x)))) +
theme_classic() +
theme(legend.position = "top",
text=element_text(size=10, color = "black"),
axis.text.y=element_text(color = "black"),
axis.text.x=element_text(color = "black"))
sPLS_loadings_plot
Random forest & caret
# Random forest & caret ----------------------------------------------------
df_rf <- df_ml_st3
df_rf$sample_type_3 <- factor(df_rf$sample_type_3,
levels = c(FALSE, TRUE),
labels = c("Other", "Type3"))
set.seed(seed)
train_index_rf <- createDataPartition(df_rf$sample_type_3, p = 0.8, list = FALSE)
df_rf_train <- df_rf[train_index_rf, ]
df_rf_test <- df_rf[-train_index_rf, ]
# caret's trainControl settings (ROC)
ctrl_rf <- caret::trainControl(
method = "repeatedcv",
number = 5,
repeats = 5,
returnResamp = "all",
savePredictions = "final",
classProbs = TRUE,
summaryFunction = twoClassSummary,
allowParallel = TRUE)
# Random forest
rf_model_st3 <- caret::train(sample_type_3 ~ .,
data = df_rf_train,
method = "rf",
metric = "ROC",
trControl = ctrl_rf,
tuneLength = 5,
importance = TRUE
)
rf_model_st3
## Random Forest
##
## 126 samples
## 335 predictors
## 2 classes: 'Other', 'Type3'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 100, 102, 101, 101, 100, 101, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8743429 0.9115238 0.512
## 85 0.9885429 0.9860952 0.718
## 168 0.9956762 0.9960000 0.812
## 251 0.9971762 0.9960000 0.812
## 335 0.9962643 0.9980000 0.828
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 251.
RF evaluation
# Test set prediction
rf_prob_test <- predict(rf_model_st3, newdata = df_rf_test, type = "prob")
rf_pred_test <- predict(rf_model_st3, newdata = df_rf_test, type = "raw")
# Confusion matrix
caret::confusionMatrix(rf_pred_test, df_rf_test$sample_type_3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Other Type3
## Other 25 0
## Type3 0 6
##
## Accuracy : 1
## 95% CI : (0.8878, 1)
## No Information Rate : 0.8065
## P-Value [Acc > NIR] : 0.00127
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8065
## Detection Rate : 0.8065
## Detection Prevalence : 0.8065
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Other
##
# ROC & AUC
rf_roc_test <- pROC::roc(
response = df_rf_test$sample_type_3,
predictor = rf_prob_test[,"Type3"],
levels = c("Other", "Type3"),
direction = "<"
)
rf_roc_test$auc
## Area under the curve: 1
# ROC curve
rf_roc_df <- tibble::tibble(
fpr = 1 - rf_roc_test$specificities,
tpr = rf_roc_test$sensitivities
)
rf_roc_df %>%
ggplot(aes(x = fpr, y = tpr)) +
geom_line() +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey70") +
coord_equal() +
theme_classic() +
labs(x = "1 - Specificity",
y = "Sensitivity",
title = paste0("Random forest ROC (Test AUC = ", round(as.numeric(rf_roc_test$auc), 3), ")"))
RF variable importance
# Extraction of variable importance
rf_varimp_raw <- caret::varImp(rf_model_st3, scale = TRUE)$importance %>%
tibble::rownames_to_column("feature") %>%
arrange(desc(Type3))
# Top 30
rf_varimp_top <- rf_varimp_raw %>%
head(30) %>%
dplyr::left_join(
my_phyloseq_da %>%
tax_table() %>%
as("matrix") %>%
data.frame(stringsAsFactors = FALSE) %>%
tibble::rownames_to_column("feature"),
by = "feature"
) %>%
mutate(
label = species_italic(feature)
)
rf_varimp_top %>%
ggplot(aes(x = reorder(feature, Type3), y = Type3)) +
geom_col(color = "black") +
coord_flip() +
scale_x_discrete(labels = function(x) {
sapply(x, function(xx) parse(text = species_italic(xx)))
}) +
theme_classic(base_family = "sans") +
labs(
x = "Top species (random forest importance)",
y = "Importance (scaled)",
title = "Top 30 features contributing to RF model (sample_type_3)"
)
lasso using SIAMCAT
# SIAMCAT obejct preparation
meta_siam <- sample_data(my_phyloseq) %>%
data.frame() %>%
mutate(
sample_type_3 = factor(sample_type == "sample_type_3",
levels = c(FALSE, TRUE),
labels = c("Other", "Type3"))
)
# SIAMCAT table creation (taxa and sample)
feat_siam <- my_phyloseq %>%
otu_table() %>%
as("matrix") # taxa and sample
# Labeling
label_siam <- SIAMCAT::create.label(
meta = meta_siam,
label = "sample_type_3",
case = "Type3",
control = "Other"
)
## Label used as case:
## Type3
## Label used as control:
## Other
## + finished create.label.from.metadata in 0 s
# SIAMCAT object
sc.obj <- SIAMCAT::siamcat(
feat = feat_siam,
meta = meta_siam,
label = label_siam
)
## + starting validate.data
## Warning in validate.data(sc, verbose = verbose): The data do not seem to
## consist of relative abundances! (values ranging between 0 and 1)
## +++ checking overlap between labels and features
## + Keeping labels of 157 sample(s).
## +++ checking sample number per class
## +++ checking overlap between samples and metadata
## + finished validate.data in 0.264 s
# feature filtering & normalization
# abundance filtering
sc.obj <- SIAMCAT::filter.features(
sc.obj,
filter.method = "abundance",
cutoff = 0.001,
rm.unmapped = TRUE
)
## Features successfully filtered
##### log transform
sc.obj <- SIAMCAT::normalize.features(
sc.obj,
norm.method = "log.std"
)
## Features normalized successfully.
# Splitting data
set.seed(seed)
sc.obj <- SIAMCAT::create.data.split(
sc.obj,
num.folds = 5,
num.resample = 5,
stratify = TRUE
)
## Features splitted for cross-validation successfully.
# Training with LASSO
sc.obj <- SIAMCAT::train.model(
sc.obj,
method = "lasso"
)
## Trained lasso models successfully.
# Prediction
sc.obj <- SIAMCAT::make.predictions(sc.obj)
## Made predictions successfully.
sc.obj <- SIAMCAT::evaluate.predictions(sc.obj)
## Evaluated predictions successfully.
# Plot everyting (ROC, PR, etc.)
model.evaluation.plot(sc.obj)[[1]]
## NULL
model.interpretation.plot(sc.obj, fn.plot = 'Inha/5_Lectures/2_IBS7048_Metagenomics/2025F/scripts/IBS7048_Advanced_metagenomics/interpretation.pdf',
consens.thres = 0.5, limits = c(-3, 3), heatmap.type = 'zscore')
## Warning in arrows(y0 = c(seq_along(med)) - 0.5, x0 = -upp.qt, x1 = -low.qt, :
## zero-length arrow is of indeterminate angle and so skipped
## Successfully plotted model interpretation plot to: Inha/5_Lectures/2_IBS7048_Metagenomics/2025F/scripts/IBS7048_Advanced_metagenomics/interpretation.pdf
Bibliography
## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## version 1.4.5, <https://CRAN.R-project.org/package=readxl>.
## graphics of microbiome census data. Paul J. McMurdie and Susan Holmes (2013) PLoS ONE 8(4):e61217.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## Package: Tests in Linear Mixed Effects Models." Journal of Statistical Software, *82*(13), 1-26. doi:10.18637/jss.v082.i13 <https://doi.org/10.18637/jss.v082.i13>.
## Package. Journal of Statistical Software, 28(5), 1–26. https://doi.org/10.18637/jss.v028.i05
## cross-disease comparison enabled by the SIAMCAT machine learning toolbox. Genome Biol 22, 93 (2021). https://doi.org/10.1186/s13059-021-02306-1
## package for 'omics feature selection and multiple data integration. PLoS computational biology 13(11):e1005752
## 0.1.1, commit b9ef1d3683fb14c78c78f63a7658cb04b1169e87, <https://github.com/jonathanth/mixOmicsCaret>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.