#Making Phyloseq Object from Metaphlan Outoput
rm (list=ls())
seqdir <- "../00_InputFiles_AnalysisFromWynton/02_MetaPhlAnOutput/" #FCohort-1er containing individual MetaPhlAn output files
files <- list.files(
seqdir,
pattern = "_profiled\\.txt$|_profile\\.txt$",
full.names = FALSE
)
files_full <- file.path(seqdir, files)
#extract sample names (remove suffix)
Sample <- files %>%
gsub("_pure_reads_profiled\\.txt$", "", .) %>%
gsub("_profile\\.txt$", "", .)
taxa_df <- NULL
for (k in seq_along(files_full)) {
file <- files_full[k]
#read table, skip header lines (starting with #), allow ragged rows
samp <- read.table(file,
sep = "\t",
header = TRUE,
comment.char = "#",
quote = "",
stringsAsFactors = FALSE,
fill = TRUE)
#only keep taxa and relative abundance columns
if (ncol(samp) >= 3) {
samp <- samp[, c(1, 3)]
colnames(samp) <- c("Taxa", Sample[k])
if (is.null(taxa_df)) {
taxa_df <- samp
} else {
taxa_df <- full_join(taxa_df, samp, by = "Taxa")
}
}
}
taxa_df$Level <- case_when(
grepl("t__", taxa_df$Taxa) ~ "Strain",
grepl("s__", taxa_df$Taxa) ~ "Species",
grepl("g__", taxa_df$Taxa) ~ "Genus",
grepl("f__", taxa_df$Taxa) ~ "Family",
grepl("o__", taxa_df$Taxa) ~ "Order",
grepl("c__", taxa_df$Taxa) ~ "Class",
grepl("p__", taxa_df$Taxa) ~ "Phylum",
grepl("k__", taxa_df$Taxa) ~ "Kingdom",
TRUE ~ "Unclassified"
)
#reorder columns
taxa_df <- taxa_df %>% select(Taxa, Level, everything())
write.table(taxa_df, "merged_taxa_table.tsv", sep = "\t",
row.names = FALSE, quote = FALSE)
##Merging data
rm(list = ls())
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE) #This is the curated metadata, see below
#set the row names using the 'KT_sampleName_withOutS' column
rownames(Metadata) <- Metadata$KT_sampleName_withS
head(Metadata)
## fileName PatientID age
## SRR13436328 NYU_RA_5B_S152_L002_R1_001.fastq.gz 5 28
## SRR13436327 37_S2.R1.fastq.gz* 37 30
## SRR13436277 NYU_RA_72B_S156_L002_R1_001.fastq.gz 72 37
## SRR13436266 NYU_RA_82B_S161_L002_R1_001.fastq.gz 82 31
## SRR13436255 NYU_RA_88B_S71_R1_001.fastq.gz 88 44
## SRR13436244 NYU_RA_90B_S75_R1_001.fastq.gz 90 71
## KT_sampleName_withOutS KT_sampleName_withS patientID MTXRandMTXNR
## SRR13436328 SRR13436328 SRR13436328 5 MTXNR
## SRR13436327 SRR13436327 SRR13436327 37 MTXNR
## SRR13436277 SRR13436277 SRR13436277 72 MTXR
## SRR13436266 SRR13436266 SRR13436266 82 MTXNR
## SRR13436255 SRR13436255 SRR13436255 88 MTXNR
## SRR13436244 SRR13436244 SRR13436244 90 MTXNR
## gender Cohort_Original Cohort Country DAS28
## SRR13436328 Female Old Cohort-1 USA 6.45
## SRR13436327 Male Old Cohort-1 USA 6.04
## SRR13436277 Female Old Cohort-1 USA 6.31
## SRR13436266 Female Old Cohort-1 USA 4.62
## SRR13436255 Female Old Cohort-1 USA 6.63
## SRR13436244 Female Old Cohort-1 USA 4.92
seqtab <- read.csv("merged_taxa_table.tsv",
row.names = 1, header = TRUE, sep = "\t")
seqtab[is.na(seqtab)] <- 0
colnames(seqtab) <- gsub("_L002", "", colnames(seqtab))
seqtab_samples <- colnames(seqtab)
#check samples are now without "_L002"
print(seqtab_samples[grep("NYU_RA_", seqtab_samples)])
## [1] "NYU_RA_107B_S165" "NYU_RA_126B_S168" "NYU_RA_140B_S80" "NYU_RA_211B_S85"
## [5] "NYU_RA_215B_S173" "NYU_RA_222B_S89" "NYU_RA_263B_S177" "NYU_RA_283B_S91"
## [9] "NYU_RA_317B_S93" "NYU_RA_334B_S95" "NYU_RA_347B_S180" "NYU_RA_356B_S183"
## [13] "NYU_RA_360B_S186" "NYU_RA_371B_S189" "NYU_RA_5B_S152" "NYU_RA_72B_S156"
## [17] "NYU_RA_82B_S161" "NYU_RA_88B_S71" "NYU_RA_90B_S75"
#Select Strains that will select all the taxonomic levels
seqtab_strain <- seqtab[seqtab$Level == "Strain", ]
seqtab_strain$Level <- NULL
colnames(seqtab_strain) <- colnames(seqtab_strain) %>%
gsub("^X", "", .) %>% #remove leading X
gsub("\\.", "-", .) #replace . w/ -
#subset ASV table to only metadata samples
otu_filtered <- seqtab_strain[, colnames(seqtab_strain) %in% rownames(Metadata)]
#create phyloseq components
otu_table_ps <- otu_table(as.matrix(otu_filtered), taxa_are_rows = TRUE)
#sample data
sample_data_ps <- sample_data(Metadata)
#taxonomy table
taxonomy_split <- strsplit(rownames(otu_filtered), "\\|")
rank_names <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain")
tax_matrix <- do.call(rbind, lapply(taxonomy_split, function(x) {
length(x) <- length(rank_names) #pad with NAs if needed
return(x)
}))
colnames(tax_matrix) <- rank_names
rownames(tax_matrix) <- rownames(otu_filtered)
tax_table_ps <- tax_table(as.matrix(tax_matrix))
sdata <- sample_data(Metadata)
taxonomy_split <- strsplit(rownames(seqtab_strain), "\\|")
#Define the rank names (MetaPhlAn usually has 7 levels)
rank_names <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
#Pad or truncate to 7 levels and convert to matrix
tax_matrix <- do.call(rbind, lapply(taxonomy_split, function(x) {
length(x) <- length(rank_names) #Pad with NA if short
return(x)
}))
colnames(tax_matrix) <- rank_names
rownames(tax_matrix) <- rownames(seqtab_strain)
tax <- tax_table(as.matrix(tax_matrix))
ps <- phyloseq(otu_table_ps, sdata, tax)
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2659 taxa and 100 samples ]
## sample_data() Sample Data: [ 100 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 2659 taxa by 7 taxonomic ranks ]
sample_names(ps)[1:5]
## [1] "RA_457_SAPBO_B_S13" "RA_492_B_microra_S27" "RA_495_SANORA_B_S39"
## [4] "RA_505-1_Before_S98" "RA_513-1__Before_S100"
taxa_names(ps)[1:5]
## [1] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Segatella|s__Segatella_brasiliensis|t__SGB1636"
## [2] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Lachnospirales|f__Lachnospiraceae|g__Roseburia|s__Roseburia_sp_AM59_24XD|t__SGB4654"
## [3] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Segatella|s__Segatella_sinensis|t__SGB1644"
## [4] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__GGB1146|s__GGB1146_SGB1472|t__SGB1472"
## [5] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Segatella|s__Segatella_copri|t__SGB1626"
rank_names(ps)
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
ps <- phyloseq(otu_table_ps, sample_data_ps, tax_table_ps)
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2659 taxa and 100 samples ]
## sample_data() Sample Data: [ 100 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 2659 taxa by 8 taxonomic ranks ]
rank_names(ps)
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
## [8] "Strain"
ps <- prune_taxa(taxa_sums(ps) > 0, ps)
saveRDS(ps, "PhyloseqObject.rds")
sample_data1 <- as_tibble(sample_data(ps))
md<- as_tibble(sample_data(ps))
ptnumber<- md%>%
group_by(MTXRandMTXNR, Cohort)%>%
summarize(pcount = n_distinct(PatientID))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by MTXRandMTXNR and Cohort.
## ℹ Output is grouped by MTXRandMTXNR.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(MTXRandMTXNR, Cohort))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
p<- ggplot(ptnumber, aes(x=MTXRandMTXNR, y=pcount, fill =MTXRandMTXNR))+
geom_col() +facet_wrap(~Cohort) +theme_classic2()+ geom_text(aes(label = pcount), vjust = -0.5)+ scale_fill_manual(values=c("darkorange3", "darkgreen" ))
p
ggsave( filename = "plot_pcount.pdf", plot = p, width = 8, height = 6, dpi = 300 )
rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_rel <- transform_sample_counts(pseq, function(x) x / sum(x))
pseq_filtered <- prune_taxa(taxa_sums(pseq_rel) > 0, pseq_rel)
md <- as_tibble(sample_data(pseq_filtered))
pseq_filtered
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2100 taxa and 100 samples ]
## sample_data() Sample Data: [ 100 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 2100 taxa by 8 taxonomic ranks ]
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) {
otu_mat <- t(otu_mat)
}
CRC_meta <- as(sample_data(pseq_filtered), "data.frame")
fit_adjust_batch <- adjust_batch(
feature_abd = otu_mat,
batch = "Cohort",
covariates = "MTXRandMTXNR",
data = CRC_meta,
control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)
pseq_adj <- phyloseq(otu_table_adj, tax_table(pseq_filtered), sample_data(pseq_filtered))
pseq_adj
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2100 taxa and 100 samples ]
## sample_data() Sample Data: [ 100 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 2100 taxa by 8 taxonomic ranks ]
saveRDS(pseq_adj, file = "PseqBatchCorrected.rds")
#Suppl. Figure 2AB
pcoa_before <- ordinate(pseq_filtered, method = "PCoA", distance = "bray")
eig_vals_before <- pcoa_before$values$Eigenvalues
var_explained_before <- round(100 * eig_vals_before / sum(eig_vals_before), 2)
p1 <- plot_ordination(pseq_filtered, pcoa_before, color = "Cohort") +
geom_point(size = 4) +
theme_classic() +
scale_color_manual(values = c("darkgreen", "darkorange3", "black")) +
theme(
axis.title = element_text(size = 16),
axis.text = element_text(size = 14)
) +
labs(
title = "Before Batch Correction (Bray-Curtis PCoA)",
x = paste0("PCoA1 (", var_explained_before[1], "%)"),
y = paste0("PCoA2 (", var_explained_before[2], "%)")
)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the phyloseq package.
## Please report the issue at <https://github.com/joey711/phyloseq/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p1
ggsave("SupplFigure2A_PCoA_before_batch_correct.pdf", p1, height = 4, width = 6, dpi = 300)
pcoa_after <- ordinate(pseq_adj, method = "PCoA", distance = "bray")
eig_vals_after <- pcoa_after$values$Eigenvalues
var_explained_after <- round(100 * eig_vals_after / sum(eig_vals_after), 2)
p2 <- plot_ordination(pseq_adj, pcoa_after, color = "Cohort") +
geom_point(size = 4) +
theme_classic() +
scale_color_manual(values = c("darkgreen", "darkorange3", "black")) +
theme(
axis.title = element_text(size = 16),
axis.text = element_text(size = 14)
) +
labs(
title = "After Batch Correction (Bray-Curtis PCoA)",
x = paste0("PCoA1 (", var_explained_after[1], "%)"),
y = paste0("PCoA2 (", var_explained_after[2], "%)"))
p2
ggsave("SupplFigure2B_PCoA_After_batch_correct.pdf", p2, height = 4, width = 6, dpi = 300)
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
meta_before <- as(sample_data(pseq_filtered), "data.frame")
res_perm_before <- adonis2(
bc_dist_before ~ Cohort+MTXRandMTXNR,,
by = "term",
data = meta_before,
permutations = 1000)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_before ~ Cohort + MTXRandMTXNR, data = meta_before, permutations = 1000, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## Cohort 2 3.814 0.10462 5.7082 0.000999 ***
## MTXRandMTXNR 1 0.571 0.01566 1.7094 0.014985 *
## Residual 96 32.074 0.87972
## Total 99 36.460 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_after <- as(sample_data(pseq_adj), "data.frame")
res_after <- adonis2(
bc_dist_after ~ Cohort+MTXRandMTXNR,,
by = "term",
data = meta_after,
permutations = 1000)
res_after
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_after ~ Cohort + MTXRandMTXNR, data = meta_after, permutations = 1000, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## Cohort 2 0.964 0.02808 1.4119 0.01099 *
## MTXRandMTXNR 1 0.594 0.01730 1.7402 0.01099 *
## Residual 96 32.786 0.95462
## Total 99 34.344 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Suppl figure 4
rm(list=ls())
pseq <- readRDS("PseqBatchCorrected.rds")
df <- estimate_richness(pseq, measures = "Shannon") %>% cbind(sample_data(pseq))
## Warning in estimate_richness(pseq, measures = "Shannon"): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
##
## We recommended that you find the un-trimmed data and retry.
base_plot <- ggplot(df, aes(x = MTXRandMTXNR, y = Shannon, fill = MTXRandMTXNR)) +
stat_summary(fun = mean, geom = "bar", alpha = 0.4) +
scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
theme_classic() +
theme(axis.title = element_text(size = 16), axis.text = element_text(size = 14))
#combined dara
p1 <- base_plot +
geom_point(aes(shape = Cohort), size = 2.5, stroke = 0.5, color = "black",
position = position_jitter(0.1, 0)) +
stat_compare_means(method = "wilcox.test")
ggsave("SupplFig4left_Alpha.pdf", p1, width = 5, height = 3)
#cohortwise
p2 <- base_plot +
geom_point(aes(shape = Cohort), size = 1.5, stroke = 0.3, color = "black",
position = position_jitter(0.1, 0)) +
facet_wrap(~Cohort, scales = "free_y") +
stat_compare_means(method = "wilcox.test", label = "p.signif", hide.ns = TRUE) +
stat_compare_means(method = "t.test", label = "p.format")
p1
p2
ggsave("SupplFig4right_Alpha.pdf", p2, width = 8, height = 3)
##Figure 1A
rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
ntaxa(pseq)
## [1] 2100
pseq_rel <- transform_sample_counts(pseq, function(x) x / sum(x))
summary(sample_sums(pseq_rel)) #Verify normalization
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Cohorts found:", paste(cohorts, collapse = ", "), "\n")
## Cohorts found: Cohort-2, Cohort-1, Cohort-3
#Keep species that are >1e-5 in >=10% of samples in at least one cohort
pseq_filtered <- filter_taxa(
pseq_rel,
function(x) {
all(sapply(cohorts, function(cohort) {
cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
mean(x[cohort_samples] > 5e-5) >= 0.1
}))
},
prune = TRUE
)
cat("No . of taxa after cohort-aware filtering:", ntaxa(pseq_filtered), "\n")
## No . of taxa after cohort-aware filtering: 249
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)
meta <- as(sample_data(pseq_filtered), "data.frame")
fit_adjust_batch <- adjust_batch(
feature_abd = otu_mat,
batch = "Cohort",
covariates = "MTXRandMTXNR",
data = meta,
control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)
pseq_adj <- phyloseq( otu_table_adj,
tax_table(pseq_filtered),
sample_data(pseq_filtered))
cat("strain-level phyloseq object filtered and batch-corrected.\n")
## strain-level phyloseq object filtered and batch-corrected.
set.seed(111)
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2(
bc_dist_after ~ Cohort+MTXRandMTXNR,
data = meta_adj, by ="terms",
permutations = 1000)
res_perm_after
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_after ~ Cohort + MTXRandMTXNR, data = meta_adj, permutations = 1000, by = "terms")
## Df SumOfSqs R2 F Pr(>F)
## Cohort 2 1.2369 0.03945 2.0119 0.000999 ***
## MTXRandMTXNR 1 0.6026 0.01922 1.9603 0.006993 **
## Residual 96 29.5117 0.94132
## Total 99 31.3513 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dist_matrix <- phyloseq::distance(pseq_adj, method = "bray", weighted = TRUE)
ordination <- ordinate(pseq_adj, method = "PCoA", distance = dist_matrix)
var_expl <- ordination$values$Eigenvalues / sum(ordination$values$Eigenvalues) * 100
sample_data_df <- as(sample_data(pseq_adj), "data.frame")
p <- plot_ordination(pseq_adj, ordination, axes = c(3, 5), color = "MTXRandMTXNR", shape = "Cohort") + geom_point(size = 5) +
scale_color_manual(values = c("darksalmon", "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 15, "Cohort-2" = 16, "Cohort-3" = 17)) +
labs( x = paste0("PC3 (", round(var_expl[3], 1), "%)"),
y = paste0("PC5 (", round(var_expl[5], 1), "%)")) +theme_classic() + theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 14))
p
ggsave("Figure1A_PCoA.pdf", p, height = 6, width = 8, dpi = 300)
##Figure 1B
#selecting PC3 and PC12 as they are p<0.05
plot_data_long <- data.frame(
sample_data_df,
PC3 = ordination$vectors[,3],
PC12 = ordination$vectors[,12]
) %>%
pivot_longer(
cols = c(PC3, PC12),
names_to = "PC_Axis",
values_to = "Coordinate_Value"
)
p_faceted <- ggplot(plot_data_long, aes(x = MTXRandMTXNR, y = Coordinate_Value, fill = MTXRandMTXNR)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(aes(shape = Cohort), width = 0.1, height =0, size = 2, alpha = 0.6) +
#Use facet_wrap to create separate panels for PC3 and PC5
#scales = "free_y" is crucial since PC3 and PC5 often have different ranges
facet_wrap(~PC_Axis, scales = "free_y") +
scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 15, "Cohort-2" = 16, "Cohort-3" = 17)) +
labs( y = "PCoA comp") + theme_classic()
print(p_faceted)
ggsave("Figure1B_PC_Distri3_12.pdf", p_faceted, height = 6, width = 7)
wilcox_PC3 <- wilcox.test(ordination$vectors[,3] ~ sample_data_df$MTXRandMTXNR)
wilcox_PC12 <- wilcox.test(ordination$vectors[,12] ~ sample_data_df$MTXRandMTXNR)
print(wilcox_PC12)
##
## Wilcoxon rank sum test with continuity correction
##
## data: ordination$vectors[, 12] by sample_data_df$MTXRandMTXNR
## W = 727, p-value = 0.000347
## alternative hypothesis: true location shift is not equal to 0
print(wilcox_PC3)
##
## Wilcoxon rank sum test with continuity correction
##
## data: ordination$vectors[, 3] by sample_data_df$MTXRandMTXNR
## W = 1634, p-value = 0.00737
## alternative hypothesis: true location shift is not equal to 0
##Figure 1C
rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_phylum <- tax_glom(pseq, taxrank = "Phylum", NArm = TRUE)
ntaxa(pseq)
## [1] 2100
ntaxa(pseq_phylum) #after tax_glom("Phylum")
## [1] 21
pseq_rel <- transform_sample_counts(pseq_phylum, function(x) x / sum(x))
pseq_rel
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 21 taxa and 100 samples ]
## sample_data() Sample Data: [ 100 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 21 taxa by 8 taxonomic ranks ]
summary(sample_sums(pseq_rel))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Cohorts found:", paste(cohorts, collapse = ", "), "\n")
## Cohorts found: Cohort-2, Cohort-1, Cohort-3
#Keep phyla that are >5e-5 in >=10% of samples in at least one cohort
pseq_filtered <- filter_taxa(
pseq_rel,
function(x) {
all(sapply(cohorts, function(cohort) {
cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
mean(x[cohort_samples] > 5e-5) >= 0.10
}))
},
prune = TRUE)
cat("no of phyla after filtering:", ntaxa(pseq_filtered), "\n")
## no of phyla after filtering: 7
#prepare ASV matrix & metadata
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)
meta <- as(sample_data(pseq_filtered), "data.frame")
#batch correction
fit_adjust_batch <- adjust_batch(
feature_abd = otu_mat,
batch = "Cohort",
covariates = "MTXRandMTXNR",
data = meta,
control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)
pseq_adj <- phyloseq(
otu_table_adj,
tax_table(pseq_filtered),
sample_data(pseq_filtered))
cat("phylum-level phyloseq object filtered and batch-corrected.\n")
## phylum-level phyloseq object filtered and batch-corrected.
saveRDS(pseq_adj, file = "PhylumBatchCorrected_phyloseq.rds")
#permanova
#Before batch correction
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
res_perm_before <- adonis2(
bc_dist_before ~ MTXRandMTXNR+Cohort+gender+age+DAS28,
data = meta,
by = "term",
permutations = 1000
)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_before ~ MTXRandMTXNR + Cohort + gender + age + DAS28, data = meta, permutations = 1000, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## MTXRandMTXNR 1 0.1920 0.02246 4.0426 0.031968 *
## Cohort 2 3.7592 0.43994 39.5845 0.000999 ***
## gender 1 0.1086 0.01271 2.2880 0.119880
## age 1 0.0622 0.00727 1.3090 0.254745
## DAS28 1 0.0068 0.00080 0.1441 0.851149
## Residual 93 4.4159 0.51680
## Total 99 8.5447 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#After batch correction (only MTX effect)
set.seed(111)
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2(
bc_dist_after ~ MTXRandMTXNR + Cohort+gender+age,
data = meta,
by = "term",
permutations = 1000)
res_perm_after
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR + Cohort + gender + age, data = meta, permutations = 1000, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## MTXRandMTXNR 1 0.2076 0.03746 3.7697 0.03497 *
## Cohort 2 0.0407 0.00734 0.3691 0.81319
## gender 1 0.0815 0.01472 1.4808 0.21578
## age 1 0.0352 0.00636 0.6400 0.51548
## Residual 94 5.1762 0.93413
## Total 99 5.5412 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#conver phyloobject to matrix to input into MMUPHIN
abund <- as(otu_table(pseq_adj), "matrix")
if (taxa_are_rows(pseq_adj)) abund <- t(abund)
common_samples <- intersect(rownames(abund), rownames(meta_adj))
abund <- abund[common_samples, , drop = FALSE]
meta_adj <- meta_adj[common_samples, , drop = FALSE]
abund_t <- t(abund) #taxa x samples
fit_meta <- lm_meta(
feature_abd = abund_t,
batch = "Cohort",
exposure = "MTXRandMTXNR",
covariates = c("age", "gender"),
data = meta_adj)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
## Found 3 batches
## Fitting Maaslin2 on batch Cohort-1...
## Fitting Maaslin2 on batch Cohort-2...
## Fitting Maaslin2 on batch Cohort-3...
## Fitting meta-analysis model.
meta_fits <- fit_meta$meta_fits
#View(meta_fits)
write.csv(meta_fits, "Phylum.csv")
p <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
geom_point(aes(color = ifelse(qval.fdr < 0.2,
ifelse(coef >= 0, "MTX-R", "MTX-NR"), "NS"),
alpha = ifelse(qval.fdr < 0.2, 1, 0.3)),
size = 1.5) +
geom_point(
data = subset(meta_fits, qval.fdr < 0.2),
aes(color = ifelse(coef >= 0, "MTX-R", "MTX-NR")),
shape = 21, #Essential: enables fill + border
color = "black", #The outline color
stroke = 0.5, #Controls thickness of the outline
size = 2,
alpha = 1
) +
#Label significant phyla
geom_text_repel(
data = subset(meta_fits, qval.fdr < 0.2),
aes(label = str_extract(feature, "p__[^|]+")),
size = 3,
box.padding = 0.3,
point.padding = 0.3,
segment.color = "grey30",
segment.size = 0.4,
segment.alpha = 0.8,
min.segment.length = 0,
force = 2,
max.overlaps = Inf
) +
scale_color_manual(
values = c(
"MTX-R" = "#238A8DFF",
"MTX-NR" = "darksalmon",
"NS" = "grey")) +
#alpha scale
scale_alpha_identity() +
theme_classic() +
labs( x = "Effect Size (Coefficient)",
y = expression(-log[10]("p-value")),
color = "Significance" ) +theme( axis.text = element_text(size = 10),
axis.title = element_text(size = 12), legend.position = "right")
p
ggsave("Phylum_volcano.pdf", p, height = 2, width = 3, dpi = 300, )
##Supplementary Figure 05
library(patchwork)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
pseq <- readRDS("PhylumBatchCorrected_phyloseq.rds")
pseq_phylum <- tax_glom(pseq, taxrank = "Phylum", NArm = TRUE)
ntaxa(pseq) #original OTUs
## [1] 7
ntaxa(pseq_phylum) #after tax_glom("Phylum")
## [1] 7
pseq <- transform_sample_counts(pseq_phylum, function(x) x / sum(x))
y4b <- psmelt(pseq)
y4b$Phylum <- as.character(y4b$Phylum)
phyla_to_test <- meta_fits %>%
filter(qval.fdr < 0.2) %>%
pull(feature) %>%
str_extract("p__[^|]+") %>%
unique()
output_dir <- "01_Phylum_Plots"
if (!dir.exists(output_dir)) dir.create(output_dir)
for (phy in phyla_to_test) {
plot_data <- subset(y4b, Phylum == phy)
plot_data$MTXRandMTXNR <- factor(plot_data$MTXRandMTXNR, levels = c("MTXNR", "MTXR"))
#Remove zero-abundance samples for baseline calculation and log-scale plotting- when Iam remiving this it is causing weird plot thus removing non-zero samples
nonzero_data <- plot_data %>% filter(Abundance > 0)
facet_min_per_cohort <- nonzero_data %>%
group_by(Cohort) %>%
summarise(bar_base = min(Abundance, na.rm = TRUE), .groups = "drop")
summary_facet <- nonzero_data %>%
group_by(Cohort, MTXRandMTXNR) %>%
summarise(mean_abundance = 10^mean(log10(Abundance)), .groups = "drop") %>%
left_join(facet_min_per_cohort, by = "Cohort") %>%
mutate(x = as.numeric(MTXRandMTXNR),
xmin = x - 0.3,
xmax = x + 0.3)
plot_facet <- ggplot() +
geom_rect(data = summary_facet,
aes(xmin = xmin, xmax = xmax, ymin = bar_base, ymax = mean_abundance, fill = MTXRandMTXNR),
alpha = 0.7, color = "black", linewidth = 0.3) +
geom_point(data = nonzero_data,
aes(x = MTXRandMTXNR, y = Abundance, shape = Cohort, fill = MTXRandMTXNR),
color = "black", stroke = 0.5,
position = position_jitter(width = 0.1, height = 0), size = 2) +
scale_shape_manual(values = c(22,21,24)) +
scale_fill_manual(values = c("MTXNR"="darksalmon","MTXR"="#238A8DFF")) +
scale_y_log10(expand = expansion(mult = c(0,0.1)),
labels = trans_format("log10", math_format(10^.x))) +
theme_classic() +
stat_compare_means(data = nonzero_data,
aes(x=MTXRandMTXNR, y=Abundance),
method="wilcox.test", label="p.signif", hide.ns=FALSE) +
facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
ggtitle(paste(phy, "by Cohort")) +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "none",
plot.title = element_text(size=11, face="bold.italic"))
#baseline = minimum non-zero value across all samples
combined_min <- min(nonzero_data$Abundance, na.rm = TRUE)
summary_combined <- nonzero_data %>%
group_by(MTXRandMTXNR) %>%
summarise(mean_abundance = 10^mean(log10(Abundance)), .groups = "drop") %>%
mutate(bar_base = combined_min,
x = as.numeric(MTXRandMTXNR),
xmin = x - 0.3,
xmax = x + 0.3)
plot_combined <- ggplot() +
geom_rect(data = summary_combined,
aes(xmin = xmin, xmax = xmax, ymin = bar_base, ymax = mean_abundance, fill = MTXRandMTXNR),
alpha = 0.7, color="black", linewidth=0.3) +
geom_point(data = nonzero_data,
aes(x = MTXRandMTXNR, y = Abundance, shape = Cohort, fill = MTXRandMTXNR),
color = "black", stroke=0.5,
position = position_jitter(width=0.1, height=0), size=2.5) +
scale_shape_manual(values = c(22,21,24)) +
scale_fill_manual(values = c("MTXNR"="darksalmon","MTXR"="#238A8DFF")) +
scale_y_log10(expand = expansion(mult = c(0,0.1)),
labels = trans_format("log10", math_format(10^.x))) +
theme_classic() +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "left") +
ggtitle(paste(phy, "Combined"))
#Combin
combined_output <- plot_combined + plot_facet + plot_layout(ncol=2, widths=c(2,3))
clean_phy_name <- gsub("p__","",phy)
pdf_filename <- file.path(output_dir, paste0(clean_phy_name,"_RvsNR_Abundance.pdf"))
pdf(pdf_filename, width=12, height=4)
print(combined_output)
dev.off()
message("Saved plot for ", phy)
}
## Saved plot for p__Bacteroidota
## Saved plot for p__Firmicutes
## Saved plot for p__Methanobacteriota
##Figure 1D
rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_family <- tax_glom(pseq, taxrank = "Family", NArm = TRUE)
pseq_family
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 407 taxa and 100 samples ]
## sample_data() Sample Data: [ 100 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 407 taxa by 8 taxonomic ranks ]
ntaxa(pseq)
## [1] 2100
ntaxa(pseq_family)
## [1] 407
pseq_rel <- transform_sample_counts(pseq_family, function(x) x / sum(x))
summary(sample_sums(pseq_rel))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Cohorts found:", paste(cohorts, collapse = ", "), "\n")
## Cohorts found: Cohort-2, Cohort-1, Cohort-3
#Keep phyla that are >1e-4 in >=5% of samples in at least one cohort
pseq_filtered <- filter_taxa(
pseq_rel,
function(x) {
all(sapply(cohorts, function(cohort) {
cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
mean(x[cohort_samples] > 5e-5) >= 0.1
}))
},
prune = TRUE)
cat("Number of Phyla after cohort wise filtering:", ntaxa(pseq_filtered), "\n")
## Number of Phyla after cohort wise filtering: 79
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)
meta <- as(sample_data(pseq_filtered), "data.frame")
fit_adjust_batch <- adjust_batch(
feature_abd = otu_mat,
batch = "Cohort", #Batch variable
covariates = "MTXRandMTXNR", #Preserve MTX signal
data = meta,
control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)
pseq_adj <- phyloseq(
otu_table_adj,
tax_table(pseq_filtered),
sample_data(pseq_filtered))
saveRDS(pseq_adj, file = "familyBatchCorrected_phyloseq.rds")
#PERMANOVA
#Before batch correction
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
res_perm_before <- adonis2( bc_dist_before ~ Cohort + MTXRandMTXNR,data = meta,
by = "term", permutations = 1000)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_before ~ Cohort + MTXRandMTXNR, data = meta, permutations = 1000, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## Cohort 2 4.7383 0.23238 14.8590 0.000999 ***
## MTXRandMTXNR 1 0.3455 0.01694 2.1668 0.043956 *
## Residual 96 15.3064 0.75068
## Total 99 20.3902 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#After batch correction (only MTX effect)
set.seed(111)
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2( bc_dist_after ~ MTXRandMTXNR + Cohort+gender+age, data = meta,
by = "term",permutations = 1000)
res_perm_after
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR + Cohort + gender + age, data = meta, permutations = 1000, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## MTXRandMTXNR 1 0.2150 0.01353 1.3636 0.19580
## Cohort 2 0.5130 0.03227 1.6269 0.04795 *
## gender 1 0.1424 0.00896 0.9034 0.52747
## age 1 0.2041 0.01284 1.2947 0.23377
## Residual 94 14.8215 0.93240
## Total 99 15.8961 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abund <- as(otu_table(pseq_adj), "matrix")
if (taxa_are_rows(pseq_adj)) abund <- t(abund)
common_samples <- intersect(rownames(abund), rownames(meta_adj))
abund <- abund[common_samples, , drop = FALSE]
meta_adj <- meta_adj[common_samples, , drop = FALSE]
abund_t <- t(abund) #taxa vs samples
fit_meta <- lm_meta(feature_abd = abund_t, batch = "Cohort", exposure = "MTXRandMTXNR",
covariates = c("age", "gender"),
data = meta_adj)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
## Found 3 batches
## Fitting Maaslin2 on batch Cohort-1...
## Fitting Maaslin2 on batch Cohort-2...
## Fitting Maaslin2 on batch Cohort-3...
## Fitting meta-analysis model.
meta_fits <- fit_meta$meta_fits
#View(meta_fits)
p <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
geom_point(aes(color = ifelse(qval.fdr < 0.2,
ifelse(coef >= 0, "MTX-R", "MTX-NR"), "NS"),
alpha = ifelse(qval.fdr < 0.2, 1, 0.3)),
size = 1.5) +
geom_point(
data = subset(meta_fits, qval.fdr < 0.2),
aes(fill = ifelse(coef >= 0, "MTX-R", "MTX-NR")), #Use fill for the inside
shape = 21, #Enables the border
color = "black", #The border color
stroke = 0.5, #Thickness of the black line
size = 2,
alpha = 1
) +
geom_text_repel(
data = subset(meta_fits, qval.fdr < 0.2),
aes(label = str_extract(feature, "f__[^|]+")),
size = 3,
box.padding = 0.3,
point.padding = 0.3,
segment.color = "grey30",
segment.size = 0.4,
segment.alpha = 0.8,
min.segment.length = 0,
force = 2,
max.overlaps = Inf
) +
scale_fill_manual(
values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon")
) +
scale_color_manual(
values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon", "NS" = "grey")
) +
scale_alpha_identity() +
theme_classic() +
labs(
x = "Effect Size (Coefficient)",
y = expression(-log[10]("p-value")),
color = "Significance",
fill = "Significance"
) +
theme(
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "right"
)
p
ggsave("Family_volcano.pdf", p, height = 2, width = 3, dpi = 300)
##Figure 1E
rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_genus <- tax_glom(pseq, taxrank = "Genus", NArm = TRUE)
ntaxa(pseq)
## [1] 2100
ntaxa(pseq_genus )
## [1] 1256
pseq_rel <- transform_sample_counts(pseq_genus, function(x) x / sum(x))
summary(sample_sums(pseq_rel)) #verify normalization
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Cohorts :", paste(cohorts, collapse = ", "), "\n")
## Cohorts : Cohort-2, Cohort-1, Cohort-3
#Keep genus that are >1e-5 in >=10% of samples in at least one cohort
pseq_filtered <- filter_taxa(
pseq_rel,
function(x) {
all(sapply(cohorts, function(cohort) {
cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
mean(x[cohort_samples] > 5e-5) >= 0.1
}))
},
prune = TRUE
)
cat(ntaxa(pseq_filtered), "\n")
## 159
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)
meta <- as(sample_data(pseq_filtered), "data.frame")
#batch correction
fit_adjust_batch <- adjust_batch(
feature_abd = otu_mat,
batch = "Cohort",
covariates = "MTXRandMTXNR",
data = meta,
control = list(verbose = FALSE)
)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)
pseq_adj <- phyloseq(
otu_table_adj,
tax_table(pseq_filtered),
sample_data(pseq_filtered)
)
saveRDS(pseq_adj, file = "genusBatchCorrected_phyloseq.rds")
#Before batch correction
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
res_perm_before <- adonis2(
bc_dist_before ~ Cohort + MTXRandMTXNR,
data = meta,
by = "term",
permutations = 1000)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_before ~ Cohort + MTXRandMTXNR, data = meta, permutations = 1000, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## Cohort 2 5.2850 0.18863 11.3833 0.000999 ***
## MTXRandMTXNR 1 0.4479 0.01599 1.9294 0.040959 *
## Residual 96 22.2855 0.79539
## Total 99 28.0185 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#AFTER batch correction
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2(
bc_dist_after ~ MTXRandMTXNR + Cohort,
data = meta_adj,
permutations = 1000)
res_perm_after
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR + Cohort, data = meta_adj, permutations = 1000)
## Df SumOfSqs R2 F Pr(>F)
## Model 3 1.4136 0.05767 1.9585 0.003996 **
## Residual 96 23.0961 0.94233
## Total 99 24.5096 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abund <- as(otu_table(pseq_adj), "matrix")
if (taxa_are_rows(pseq_adj)) abund <- t(abund)
common_samples <- intersect(rownames(abund), rownames(meta_adj))
abund <- abund[common_samples, , drop = FALSE]
meta_adj <- meta_adj[common_samples, , drop = FALSE]
abund_t <- t(abund) #taxa x samples
fit_meta <- lm_meta(
feature_abd = abund_t,
batch = "Cohort",
exposure = "MTXRandMTXNR",
covariates = c("age", "gender"),
data = meta_adj)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
## Found 3 batches
## Fitting Maaslin2 on batch Cohort-1...
## Fitting Maaslin2 on batch Cohort-2...
## Fitting Maaslin2 on batch Cohort-3...
## Fitting meta-analysis model.
## Warning in rma_wrapper(maaslin_fits, method = control$rma_method, output = control$output, : Fitting rma on feature k__Bacteria|p__Firmicutes|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Megasphaera|s__Megasphaera_sp_NM10|t__SGB5858;
## Fisher scoring algorithm may have gotten stuck at a local maximum.
## Setting tau^2 = 0. Check the profile likelihood plot with profile().
meta_fits <- fit_meta$meta_fits
#View(meta_fits)
write.csv(meta_fits, "Supplementary_Table_Genus.csv")
p <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
#Non-significant)
geom_point(aes(color = ifelse(qval.fdr < 0.2,
ifelse(coef >= 0, "MTX-R", "MTX-NR"), "NS"),
alpha = ifelse(qval.fdr < 0.2, 1, 0.3)),
size = 1.5) +
#2. Significant points with black outline
geom_point(
data = subset(meta_fits, qval.fdr < 0.2),
aes(fill = ifelse(coef >= 0, "MTX-R", "MTX-NR")), #Map to fill, not color
shape = 21, #Required for the outline + fill combo
color = "black", #The outline color
stroke = 0.5, #Thickness of the outline
size = 2,
alpha = 1
) +
#3. Label significant Genus (g__)
geom_text_repel(
data = subset(meta_fits, qval.fdr < 0.2),
aes(label = str_extract(feature, "g__[^|]+")),
size = 3,
box.padding = 0.3,
point.padding = 0.3,
segment.color = "grey30",
segment.size = 0.4,
segment.alpha = 0.8,
min.segment.length = 0,
force = 2,
max.overlaps = Inf
) +
#4. Manual Scales
scale_fill_manual(
values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon")
) +
scale_color_manual(
values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon", "NS" = "grey")
) +
#5. Styling and Labels
scale_alpha_identity() +
theme_classic() +
labs(
x = "Effect Size (Coefficient)",
y = expression(-log[10]("p-value")),
color = "Significance",
fill = "Significance"
) +
theme(
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "right"
)
print(p)
ggsave("Figure1E.pdf", p, height = 2, width = 3, dpi = 300)
##Figure 1F
rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_species <- tax_glom(pseq, taxrank = "Species", NArm = TRUE)
ntaxa(pseq)
## [1] 2100
ntaxa(pseq_species )
## [1] 2028
pseq_rel <- transform_sample_counts(pseq_species, function(x) x / sum(x))
summary(sample_sums(pseq_rel))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Chrts found:", paste(cohorts, collapse = ", "), "\n")
## Chrts found: Cohort-2, Cohort-1, Cohort-3
pseq_filtered <- filter_taxa(
pseq_rel,
function(x) {
all(sapply(cohorts, function(cohort) {
cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
mean(x[cohort_samples] > 5e-5) >= 0.1
}))
},
prune = TRUE
)
cat( ntaxa(pseq_filtered), "\n")
## 235
#Prepare OTU matrix & metadata
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)
meta <- as(sample_data(pseq_filtered), "data.frame")
#Batch correction
fit_adjust_batch <- adjust_batch(
feature_abd = otu_mat,
batch = "Cohort", #Batch variable
covariates = "MTXRandMTXNR", #Preserve MTX signal
data = meta,
control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)
pseq_adj <- phyloseq(
otu_table_adj,
tax_table(pseq_filtered),
sample_data(pseq_filtered))
saveRDS(pseq_adj, file = "SpeciesBatchCorrected_phyloseq.rds")
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
res_perm_before <- adonis2(
bc_dist_before ~ Cohort + MTXRandMTXNR,
data = meta,
by = "term",
permutations = 1000)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_before ~ Cohort + MTXRandMTXNR, data = meta, permutations = 1000, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## Cohort 2 4.161 0.12686 7.1190 0.000999 ***
## MTXRandMTXNR 1 0.584 0.01780 1.9983 0.009990 **
## Residual 96 28.053 0.85534
## Total 99 32.798 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#After batch correction (only MTX effect)
set.seed(111)
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2(
bc_dist_after ~ MTXRandMTXNR,
data = meta_adj,
permutations = 1000
)
res_perm_after
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR, data = meta_adj, permutations = 1000)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.5882 0.01951 1.9498 0.007992 **
## Residual 98 29.5663 0.98049
## Total 99 30.1546 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
res_perm_after2 <- adonis2(
bc_dist_after ~ MTXRandMTXNR + Cohort,
data = meta_adj,
permutations = 1000
)
res_perm_after2
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR + Cohort, data = meta_adj, permutations = 1000)
## Df SumOfSqs R2 F Pr(>F)
## Model 3 1.8146 0.06018 2.0489 0.000999 ***
## Residual 96 28.3400 0.93982
## Total 99 30.1546 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abund <- as(otu_table(pseq_adj), "matrix")
if (taxa_are_rows(pseq_adj)) abund <- t(abund)
common_samples <- intersect(rownames(abund), rownames(meta_adj))
abund <- abund[common_samples, , drop = FALSE]
meta_adj <- meta_adj[common_samples, , drop = FALSE]
abund_t <- t(abund) #taxa x samples
fit_meta <- lm_meta(
feature_abd = abund_t,
batch = "Cohort",
exposure = "MTXRandMTXNR",
covariates = c("age", "gender"),
data = meta_adj)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
## Found 3 batches
## Fitting Maaslin2 on batch Cohort-1...
## Fitting Maaslin2 on batch Cohort-2...
## Fitting Maaslin2 on batch Cohort-3...
## Fitting meta-analysis model.
meta_fits <- fit_meta$meta_fits
#View(meta_fits)
write.csv(meta_fits, "Supplementary Table Species.csv")
p_species <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
#1. Background points (Non-significant)
geom_point(aes(color = ifelse(qval.fdr < 0.2,
ifelse(coef >= 0, "MTX-R", "MTX-NR"), "NS"),
alpha = ifelse(qval.fdr < 0.2, 1, 0.3)),
size = 1.5) +
geom_point(
data = subset(meta_fits, qval.fdr < 0.2),
aes(fill = ifelse(coef >= 0, "MTX-R", "MTX-NR")), #Map to fill, not color
shape = 21, #Required for the outline + fill combo
color = "black", #The outline color
stroke = 0.5, #Thickness of the outline
size = 2,
alpha = 1
) +
#label significant sp (s__)
geom_text_repel(
data = subset(meta_fits, qval.fdr < 0.2),
aes(label = str_extract(feature, "s__[^|]+")),
size = 3,
box.padding = 0.3,
point.padding = 0.3,
segment.color = "grey30",
segment.size = 0.4,
segment.alpha = 0.8,
min.segment.length = 0,
force = 2,
max.overlaps = Inf
) +
#4. Manual Scales
scale_fill_manual(
values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon")
) +
scale_color_manual(
values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon", "NS" = "grey")
) +
#5. Styling and Labels
scale_alpha_identity() +
theme_classic() +
labs(
x = "Effect Size (Coefficient)",
y = expression(-log[10]("p-value")),
color = "Significance",
fill = "Significance"
) +
theme(
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "right"
)
print(p_species)
ggsave("Figure_1F.pdf", p_species, height = 2, width = 3, dpi = 300)
#Figure 2A-C Suppl. table diff abundant KOs
rm(list=ls())
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv",
stringsAsFactors = FALSE,
check.names = FALSE)
#assign sample names as rownames
rownames(Metadata) <- Metadata$KT_sampleName_withS
#rplace '.' with '-' in both row and colnames to match with KO data
colnames(Metadata) <- gsub("\\.", "-", colnames(Metadata))
rownames(Metadata) <- gsub("\\.", "-", rownames(Metadata))
cat("No. of samples in metadata:", nrow(Metadata), "\n")
## No. of samples in metadata: 100
d2 <- fread("../03_HumannAnalysis/00_humannOutputfromWynton/12_CombUnstratFiles/Normalized_KO_RPKG.tsv")
d2 <- as.data.frame(d2, check.names = FALSE)
rownames(d2) <- d2[,1]
d2 <- d2[,-1]
d2 <- d2[rowSums(d2) > 0, ]
#clean column names
colnames(d2) <- sub("^X", "", colnames(d2))
colnames(d2) <- sub("_L002$", "", colnames(d2))
cat("NO. of samples in abundance matrix", ncol(d2), "\n")
## NO. of samples in abundance matrix 291
cat("NO. of feature after remocing feature with sum 0", nrow(d2), "\n")
## NO. of feature after remocing feature with sum 0 6542
common_samples <- intersect(colnames(d2), rownames(Metadata))
cat("cat common samples", length(common_samples), "\n")
## cat common samples 100
Metadata_matched <- Metadata[match(common_samples, rownames(Metadata)), , drop = FALSE]
d2_matrix_matched <- d2[, common_samples, drop = FALSE]
#alignment chekk
stopifnot(all(colnames(d2_matrix_matched) == rownames(Metadata_matched)))
#Proportional normlaization
d2_matrix_prop <- sweep(d2_matrix_matched, 2, colSums(d2_matrix_matched), FUN = "/")
cohorts <- unique(Metadata_matched$Cohort)
cat("cohorts found:", paste(cohorts, collapse = ", "), "\n")
## cohorts found: Cohort-2, Cohort-1, Cohort-3
keep_features <- rep(FALSE, nrow(d2_matrix_prop))
names(keep_features) <- rownames(d2_matrix_prop)
for (cohort in cohorts) {
cohort_samples <- rownames(Metadata_matched)[Metadata_matched$Cohort == cohort]
mat_cohort <- d2_matrix_prop[, cohort_samples, drop = FALSE]
prevalence <- rowMeans(mat_cohort > 5e-5)
keep_features <- keep_features | (prevalence >= 0.1)
}
#row features satisfying this criteria and column would be Cohort as column and crossc chekc manually and match N here and that file
#subset relative abundance matrix to keep only filtered features
d2_matrix_prop_filtered <- d2_matrix_prop[keep_features, , drop = FALSE]
cat("No. of features after cohort-aware filtering:", nrow(d2_matrix_prop_filtered), "\n")
## No. of features after cohort-aware filtering: 3900
#cross checked multiple times with MMUPHin tutorial
fit_adjust_batch <- adjust_batch(
feature_abd = d2_matrix_prop_filtered,
batch = "Cohort",
covariates = "MTXRandMTXNR",
data = Metadata_matched,
control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
#save it for future analysis so don't have to redo the steps above
write.csv(CRC_abd_adj, "20251028BatchCorrectedKO.csv", row.names = TRUE)
###Suppl.Figure 6.
alpha_df <- data.frame(
Sample = colnames(CRC_abd_adj),
Richness = colSums(CRC_abd_adj > 0))
alpha_df <- alpha_df %>%
mutate(
MTXRandMTXNR = Metadata_matched$MTXRandMTXNR[match(Sample, rownames(Metadata_matched))],
Cohort = Metadata_matched$Cohort[match(Sample, rownames(Metadata_matched))])
alpha_stats <- alpha_df %>%
group_by(Cohort) %>%
summarise(
wilcox_p = wilcox.test(Richness ~ MTXRandMTXNR)$p.value,
ttest_p = t.test(Richness ~ MTXRandMTXNR)$p.value
)
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `wilcox_p = wilcox.test(Richness ~ MTXRandMTXNR)$p.value`.
## ℹ In group 1: `Cohort = "Cohort-1"`.
## Caused by warning in `wilcox.test.default()`:
## ! cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
print(alpha_stats)
## # A tibble: 3 × 3
## Cohort wilcox_p ttest_p
## <chr> <dbl> <dbl>
## 1 Cohort-1 0.126 0.188
## 2 Cohort-2 0.114 0.0880
## 3 Cohort-3 0.443 0.619
p_combined <- ggplot(alpha_df, aes(x = MTXRandMTXNR, y = Richness, fill = MTXRandMTXNR)) +
#Bar layer
stat_summary(fun = mean, geom = "bar", alpha = 0.4) +
#Point layer with outlines
geom_point(aes(fill = MTXRandMTXNR, shape = Cohort), #Use fill for the inner color
color = "black", #Set outline to black
size = 2,
alpha = 0.8,
stroke = 0.5, #Adjust outline thickness
position = position_jitter(width = 0.1, height = 0)) +
#Scales
scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
#Swapping shapes to fillable versions: 22 (Sq), 21 (Circ), 24 (Tri)
scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
theme_classic() +
labs(y = "Number of distinct KOs", fill = "Group", shape = "Cohort") +
theme(axis.title = element_text(size = 16),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)) +
stat_compare_means(aes(group = MTXRandMTXNR),
method = "t.test", label = "p.format")
p_combined
ggsave("AlphaDivSuppleFig5left.pdf", p_combined, height = 3, width = 5, dpi = 300)
p_facet <- ggplot(alpha_df, aes(x = MTXRandMTXNR, y = Richness, fill = MTXRandMTXNR)) +
stat_summary(fun = mean, geom = "bar", alpha = 0.4) +
geom_point(
aes(fill = MTXRandMTXNR, shape = Cohort),
color = "black", #outline
size = 1.2,
alpha = 0.8,
stroke = 0.4,
position = position_jitter(width = 0.15)
) +
scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
theme_classic() +
labs(y = "Number of distinct KOs", fill = "Group", shape = "Cohort") +
theme(
axis.title = element_text(size = 16),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
) +
facet_wrap(~Cohort, scales = "free_y")
p_facet
ggsave("AlphaDivSuppleFig5right.pdf", p_facet, height = 3, width = 8, dpi = 300)
###Figure 2A
dist_before <- vegdist(t(d2_matrix_prop_filtered), method = "bray")
set.seed(1234)
fit_adonis_before <- adonis2(dist_before ~ MTXRandMTXNR + Cohort + age+gender + DAS28 ,
data = Metadata_matched, by = "terms")
#PERMANOVA after batch correction
dist_after <- vegdist(t(CRC_abd_adj), method = "bray")
set.seed(1234)
fit_adonis_after <- adonis2(dist_after ~ MTXRandMTXNR + Cohort + age+gender +DAS28,
data = Metadata_matched, by = "terms")
pcoa_after <- cmdscale(dist_after, k = 6, eig = TRUE)
eig_after <- pcoa_after$eig
var_explained_after <- round(100 * eig_after / sum(eig_after), 2)
pcoa_df_after <- as.data.frame(pcoa_after$points)
colnames(pcoa_df_after) <- paste0("PC", 1:ncol(pcoa_df_after))
pcoa_df_after$Sample <- rownames(pcoa_df_after)
pcoa_df_after$Cohort <- Metadata_matched$Cohort[match(pcoa_df_after$Sample, rownames(Metadata_matched))]
pcoa_df_after$MTXRandMTXNR <- Metadata_matched$MTXRandMTXNR[match(pcoa_df_after$Sample, rownames(Metadata_matched))]
p2_vs_p4 <- ggplot(pcoa_df_after, aes(x = PC2, y = PC4,
color = MTXRandMTXNR, shape = Cohort)) + geom_point(size = 3) +
theme_classic() +
labs(x = paste0("PC1 (", var_explained_after[1], "%)"),
y = paste0("PC4 (", var_explained_after[4], "%)")) +
scale_color_manual(values = c("darksalmon", "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 15, "Cohort-2" = 16, "Cohort-3" = 17)) +
theme(legend.position = "none") #will add legend manually in A-ILLUstrator
p2_vs_p4
ggsave("PCoA_KO_After.pdf", p2_vs_p4, height = 4, width = 5, dpi = 300)
#Permanova
print(fit_adonis_before)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_before ~ MTXRandMTXNR + Cohort + age + gender + DAS28, data = Metadata_matched, by = "terms")
## Df SumOfSqs R2 F Pr(>F)
## MTXRandMTXNR 1 0.1347 0.01504 1.7597 0.081 .
## Cohort 2 1.3067 0.14597 8.5383 0.001 ***
## age 1 0.0969 0.01083 1.2665 0.241
## gender 1 0.1824 0.02037 2.3830 0.018 *
## DAS28 1 0.1147 0.01282 1.4993 0.130
## Residual 93 7.1165 0.79497
## Total 99 8.9519 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(fit_adonis_after)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_after ~ MTXRandMTXNR + Cohort + age + gender + DAS28, data = Metadata_matched, by = "terms")
## Df SumOfSqs R2 F Pr(>F)
## MTXRandMTXNR 1 0.1542 0.01993 2.0329 0.031 *
## Cohort 2 0.1587 0.02051 1.0458 0.405
## age 1 0.0956 0.01236 1.2604 0.226
## gender 1 0.1452 0.01876 1.9136 0.039 *
## DAS28 1 0.1286 0.01661 1.6946 0.090 .
## Residual 93 7.0557 0.91183
## Total 99 7.7380 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#PC1
wilcox_pc1 <- wilcox.test(pcoa_df_after$PC1 ~ pcoa_df_after$MTXRandMTXNR)
#PC4
wilcox_pc4 <- wilcox.test(pcoa_df_after$PC4 ~ pcoa_df_after$MTXRandMTXNR)
print(wilcox_pc1)
##
## Wilcoxon rank sum test with continuity correction
##
## data: pcoa_df_after$PC1 by pcoa_df_after$MTXRandMTXNR
## W = 1530, p-value = 0.04983
## alternative hypothesis: true location shift is not equal to 0
print(wilcox_pc4)
##
## Wilcoxon rank sum test with continuity correction
##
## data: pcoa_df_after$PC4 by pcoa_df_after$MTXRandMTXNR
## W = 943, p-value = 0.03701
## alternative hypothesis: true location shift is not equal to 0
###Figure 2B
pcoa_long_14 <- pcoa_df_after %>%
select(MTXRandMTXNR, Cohort, PC1, PC4) %>%
pivot_longer(
cols = c(PC1, PC4),
names_to = "PC_Axis",
values_to = "Coordinate_Value"
) %>%
#create a custom label column that includes the variance explained
mutate(Facet_Label = case_when(
PC_Axis == "PC1" ~ paste0("PC1 (", var_explained_after[1], "%)"),
PC_Axis == "PC4" ~ paste0("PC4 (", var_explained_after[4], "%)") ))
p_faceted_14 <- ggplot(pcoa_long_14, aes(x = MTXRandMTXNR, y = Coordinate_Value, fill = MTXRandMTXNR)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(aes(shape = Cohort), width = 0.2, size = 2, alpha = 0.6) +
#scales = "free_y" is crucial since PC1 and PC4 often have different ranges
facet_wrap(~PC_Axis, scales = "free_y") +
scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 15, "Cohort-2" = 16, "Cohort-3" = 17)) + labs(title = "Distribution of PCoA",
y = "PCoA scores") +theme_classic()
print(p_faceted_14)
ggsave("Figure2B_PC2_PC4_Faceted.pdf", p_faceted_14, height = 6, width = 7, dpi = 300)
###Supplementaty_Table diff abundant KOs
fit_lm_meta <- lm_meta(
feature_abd = CRC_abd_adj,
batch = "Cohort",
exposure = "MTXRandMTXNR",
covariates = c("age" , "gender"),
data = Metadata_matched,
control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
meta_fits <- fit_lm_meta$meta_fits
#View(meta_fits)
write.csv(meta_fits, "Supplementaty_Table_metafits.csv", row.names = FALSE)
###Figure 2C
meta_fits <- meta_fits %>%
mutate(sig_color = case_when(
pval < 0.05 & coef > 0 ~ "MTX-R",
pval < 0.05 & coef < 0 ~ "MTX-NR",
TRUE ~ "NS" ))
p <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
#1. All points (Background layer)
geom_point(aes(color = sig_color, alpha = sig_color), size = 1.5) +
geom_point(
data = subset(meta_fits, qval.fdr < 0.2),
aes(fill = sig_color), #Changed color to fill
shape = 21, #Shape with border and fill
color = "black", #Outline color
stroke = 0.5, #Outline thickness
size = 2,
alpha = 1
) + geom_text_repel(
data = subset(meta_fits, qval.fdr < 0.2),
aes(label = feature),
size = 2.5,
box.padding = 0.3,
point.padding = 0.3,
segment.color = "grey30",
segment.size = 0.4,
segment.alpha = 0.8,
min.segment.length = 0,
force = 2,
max.overlaps = Inf) + scale_color_manual(
values = c(
"MTX-R" = "#238A8DFF",
"MTX-NR" = "darksalmon",
"NS" = "grey") ) +
scale_fill_manual(
values = c(
"MTX-R" = "#238A8DFF",
"MTX-NR" = "darksalmon",
"NS" = "grey")) +
#control alpha
scale_alpha_manual(
values = c(
"MTX-R" = 0.4,
"MTX-NR" = 0.4,
"NS" = 0.3
),
guide = "none"
) +
theme_classic() +
labs(
x = "Effect Size (Coefficient)",
y = expression(-log[10]("p-value")),
color = "Significance",
fill = "Significance" #Keeps the legend unified
) + theme(
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "right")
print(p)
ggsave("Figure2C.pdf", plot = p, width = 5.5, height = 3.5, units = "in", )
##Figure 2D GSEA analysis
rm(list = ls())
meta_fits <- fread("Supplementaty_Table_metafits.csv")
meta_fits$rank_metric <- sign(meta_fits$coef) * -log10(meta_fits$pval)
set.seed(1234)
#Prepare ranked named vector
ranked_KOs <- meta_fits$rank_metric
names(ranked_KOs) <- meta_fits$feature
ranked_KOs <- ranked_KOs[is.finite(ranked_KOs)] #remove any NaN/Inf
ranked_KOs <- sort(ranked_KOs, decreasing = TRUE)
cat("number of KOs for GSEA:", length(ranked_KOs), "\n")
## number of KOs for GSEA: 3900
#Fetch all KO → pathway associations from KEGG using the KEGG API.
#keggLink returns a named vector where:
# -names are KOs
#-Values are KEGG pathway IDs (e.g., "path:map00010")
#This provides the raw relationships between KOs and pathways.
ko2pathway_raw <- keggLink("pathway", "ko")
#Convert the named vector into a clean data.frame for easier manipulation.
#'ko' column: removes the "ko:" prefix using sub()
#'pathway' column: removes the "path:" prefix for readability using sub()
#stringsAsFactors = FALSE ensures the columns remain character vectors, not factors
ko2pathway <- data.frame(
ko = sub("ko:", "", names(ko2pathway_raw)), #remove "ko:" prefix from KO IDs
pathway = sub("path:", "", ko2pathway_raw), #remove "path:" prefix from pathway IDs
stringsAsFactors = FALSE)
#ensures GSEA only evaluates KOs that have ranking metrics
ko2pathway <- ko2pathway %>% filter(ko %in% names(ranked_KOs))
#Remove very broad/general pathways like map01100 ("Metabolic pathways")
#Such pathways are overly generic and may dominate enrichment results, reducing specificity
ko2pathway <- ko2pathway %>% filter(!grepl("map01100", pathway))
set.seed(1234)
#Fetch all KEGG pathway names.
#Returns a named vector: names = pathway IDs (e.g., "path:map00010"), values = descriptive names
all_pathways <- keggList("pathway")
#clean the pathway IDs to remove the "path:" prefix
#setNames() creates a named vector where:
# - Names = cleaned pathway IDs (e.g., "map00010")
# - Values = descriptive names (e.g., "Glycolysis / Gluconeogenesis")
all_pathways <- setNames(all_pathways, sub("path:", "", names(all_pathways)))
#adda new column 'PathwayName' to ko2pathway to store the descriptive name for each pathway
#easier interpretation in plots and tables
ko2pathway$PathwayName <- all_pathways[ko2pathway$pathway]
pathway2ko <- ko2pathway %>%
group_by(pathway, PathwayName) %>% #Group all rows by unique pathway
summarise(
#concatenate all KOs in the group into a single string, separated by ";"
#'paste()' combines multiple character strings into one
#'unique(ko)' ensures each KO appears only once in the string
#'collapse = ";"' specifies that ";" will be inserted between each KO
KOs = paste(unique(ko), collapse = ";"),
.groups = "drop" )
#save the KO-to-pathway mapping as a CSV file for reference
fwrite(pathway2ko, "KO_to_Pathway_mapping.csv")
#clusterProfiler expects a data.frame with two columns:
#- pathway: the pathway ID
#- gene: the gene/KO ID
#Here we rename 'ko' to 'gene' and select only the necessary columns
term2gene <- ko2pathway %>% rename(gene = ko, pathway = pathway) %>% select(pathway, gene)
#Run GSEA with clusterProfiler
#Inputs:
#- ranked_KOs: a named numeric vector of KOs ranked by metric
#- TERM2GENE: the pathway definitions
#- pvalueCutoff = 1: keep all results for downstream filtering
#- verbose = FALSE: suppress console output
set.seed(1234)
gsea_res <- GSEA(
ranked_KOs,
seed = TRUE,
TERM2GENE = term2gene,
pvalueCutoff = 1,
verbose = FALSE)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.15% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
#convert the GSEA result object to a standard frame for easier manipulation
#arange by adjusted p-value (ascending) to see most significant pathways first
set.seed(1234)
gsea_df <- as.data.frame(gsea_res) %>% arrange(p.adjust)
#Add human-readable pathway names to GSEA results for easier interpretation
gsea_df$PathwayName <- all_pathways[gsea_df$ID]
fwrite(gsea_df, "GSEA_results_clusterProfiler2.csv")
#ssign direction
gsea_df$Direction <- ifelse(gsea_df$NES > 0, "MTX-R", "MTX-NR")
sig_pathways <- gsea_df %>% filter(p.adjust < 0.05)
#horizontal plot
p_horizontal <- ggplot(sig_pathways, aes(x = reorder(PathwayName, NES),
y = NES,
color = Direction,
size = -log10(p.adjust))) +
geom_point() +
scale_color_manual(values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon")) +
scale_x_discrete(position = "top") + #pathway names on top
labs(x = "KEGG Pathway",
y = "Normalized Enrichment Score (NES)",
size = "-log10(q-value)",
color = "nnriched in") +
theme_classic2() + theme(axis.text.x = element_text(angle = 45, hjust = 0))
ggsave("Figure2D.pdf", plot = p_horizontal, width = 8, height =4 , dpi = 300)
#Figure 3 ##ArgSynthesis
rm(list=ls())
SELECTko <- read_excel("../03_HumannAnalysis/05_KO_of_Interest/Figure3.xlsx", sheet = "ArgSynthesis")
selected_KOs <- SELECTko$KO # assuming the sheet has a column named 'KO'
CRC_abd_adj <- fread("../03_HumannAnalysis/02_KO/20251028BatchCorrectedKO.csv") %>%
as.data.frame(check.names = FALSE)
colnames(CRC_abd_adj)[1] <- "GeneFamily"
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv",
stringsAsFactors = FALSE, check.names = FALSE)
selected_KOs
## [1] "K00262" "K01940"
# Clean sample names
rownames(Metadata) <- Metadata$KT_sampleName_withS
colnames(Metadata) <- gsub("\\.", "-", colnames(Metadata))
rownames(Metadata) <- gsub("\\.", "-", rownames(Metadata))
Metadata$SampleID <- rownames(Metadata)
dt_long <- CRC_abd_adj %>%
pivot_longer(-GeneFamily, names_to = "Sample", values_to = "Abundance") %>%
rename(KO = GeneFamily)
dt_with_metadata <- dt_long %>%
left_join(Metadata, by = c("Sample" = "SampleID"))
dt_filtered <- dt_with_metadata %>%
filter(KO %in% selected_KOs)
dt_filtered$MTXRandMTXNR <- factor(dt_filtered$MTXRandMTXNR, levels = c("MTXNR", "MTXR"))
dt_filtered$Cohort <- factor(dt_filtered$Cohort, levels = c("Cohort-1", "Cohort-2", "Cohort-3"))
unique_KOs <- unique(dt_filtered$KO)
output_dir <- "./KO_Plots"
dir.create(output_dir, showWarnings = FALSE)
for (ko in unique_KOs) {
ko_data <- dt_filtered %>% filter(KO == ko)
if (nrow(ko_data) == 0) next
# 1. Faceted Plot
p_facet <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
geom_point(aes(shape = Cohort),
color = "black", # Border of the point
stroke = 0.5, # Thickness of the border
position = position_jitter(width = 0.1, height = 0),
size = 1.2) +
scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
scale_fill_manual(values = c("MTXR" = "#238A8DFF", "MTXNR" = "darksalmon")) +
facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
theme_classic() +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(),
legend.position = "none", strip.text = element_text(size = 10)) +
ggtitle(paste(ko, "by Cohort"))
# 2. Combined Plot
p_combined <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
geom_point(aes(shape = Cohort),
color = "black", # Border of the point
stroke = 0.5,
position = position_jitter(width = 0.1, height = 0),
size = 2) +
scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
theme_classic() +
theme(axis.title.x = element_blank(), legend.position = "right") +
ggtitle(paste(ko, "Combined Cohorts"))
# Saving...
facet_path <- file.path(output_dir, paste0(ko, "_Faceted.pdf"))
pdf(facet_path, width = 8, height = 3)
print(p_facet)
dev.off()
combined_path <- file.path(output_dir, paste0(ko, "_Combined.pdf"))
pdf(combined_path, width = 5, height = 3)
print(p_combined)
dev.off()
}
##TrpSynthesis
SELECTko <- read_excel("../03_HumannAnalysis/05_KO_of_Interest/Figure3.xlsx", sheet = "TrpSynthesis")
selected_KOs <- SELECTko$KO # assuming the sheet has a column named 'KO'
selected_KOs
## [1] "K04518"
#clean sample names
rownames(Metadata) <- Metadata$KT_sampleName_withS
colnames(Metadata) <- gsub("\\.", "-", colnames(Metadata))
rownames(Metadata) <- gsub("\\.", "-", rownames(Metadata))
Metadata$SampleID <- rownames(Metadata)
dt_long <- CRC_abd_adj %>%
pivot_longer(-GeneFamily, names_to = "Sample", values_to = "Abundance") %>%
rename(KO = GeneFamily)
dt_with_metadata <- dt_long %>%
left_join(Metadata, by = c("Sample" = "SampleID"))
dt_filtered <- dt_with_metadata %>%
filter(KO %in% selected_KOs)
dt_filtered$MTXRandMTXNR <- factor(dt_filtered$MTXRandMTXNR, levels = c("MTXNR", "MTXR"))
dt_filtered$Cohort <- factor(dt_filtered$Cohort, levels = c("Cohort-1", "Cohort-2", "Cohort-3"))
unique_KOs <- unique(dt_filtered$KO)
output_dir <- "./KO_Plots"
for (ko in unique_KOs) {
ko_data <- dt_filtered %>% filter(KO == ko)
if (nrow(ko_data) == 0) next
# 1. Faceted Plot
p_facet <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
geom_point(aes(shape = Cohort),
color = "black", # Border of the point
stroke = 0.5, # Thickness of the border
position = position_jitter(width = 0.1, height = 0),
size = 1.2) +
scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
scale_fill_manual(values = c("MTXR" = "#238A8DFF", "MTXNR" = "darksalmon")) +
facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
theme_classic() +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(),
legend.position = "none", strip.text = element_text(size = 10)) +
ggtitle(paste(ko, "by Cohort"))
# 2. Combined Plot
p_combined <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
geom_point(aes(shape = Cohort),
color = "black", # Border of the point
stroke = 0.5,
position = position_jitter(width = 0.1, height = 0),
size = 2) +
scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
theme_classic() +
theme(axis.title.x = element_blank(), legend.position = "right") +
ggtitle(paste(ko, "Combined Cohorts"))
# Saving...
facet_path <- file.path(output_dir, paste0(ko, "_Faceted.pdf"))
pdf(facet_path, width = 8, height = 3)
print(p_facet)
dev.off()
combined_path <- file.path(output_dir, paste0(ko, "_Combined.pdf"))
pdf(combined_path, width = 5, height = 3)
print(p_combined)
dev.off()
}
##Nucleotide Synthesis
SELECTko <- read_excel("../03_HumannAnalysis/05_KO_of_Interest/Figure3.xlsx", sheet = "APRT")
selected_KOs <- SELECTko$KO
selected_KOs
## [1] "K00759" "K06287"
rownames(Metadata) <- Metadata$KT_sampleName_withS
colnames(Metadata) <- gsub("\\.", "-", colnames(Metadata))
rownames(Metadata) <- gsub("\\.", "-", rownames(Metadata))
Metadata$SampleID <- rownames(Metadata)
dt_long <- CRC_abd_adj %>%
pivot_longer(-GeneFamily, names_to = "Sample", values_to = "Abundance") %>%
rename(KO = GeneFamily)
dt_with_metadata <- dt_long %>%
left_join(Metadata, by = c("Sample" = "SampleID"))
dt_filtered <- dt_with_metadata %>%
filter(KO %in% selected_KOs)
dt_filtered$MTXRandMTXNR <- factor(dt_filtered$MTXRandMTXNR, levels = c("MTXNR", "MTXR"))
dt_filtered$Cohort <- factor(dt_filtered$Cohort, levels = c("Cohort-1", "Cohort-2", "Cohort-3"))
unique_KOs <- unique(dt_filtered$KO)
output_dir <- "./KO_Plots"
dir.create(output_dir, showWarnings = FALSE)
for (ko in unique_KOs) {
ko_data <- dt_filtered %>% filter(KO == ko)
if (nrow(ko_data) == 0) next
# 1. Faceted Plot
p_facet <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
geom_point(aes(shape = Cohort),
color = "black", # Border of the point
stroke = 0.5, # Thickness of the border
position = position_jitter(width = 0.1, height = 0),
size = 1.2) +
scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
scale_fill_manual(values = c("MTXR" = "#238A8DFF", "MTXNR" = "darksalmon")) +
facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
theme_classic() +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(),
legend.position = "none", strip.text = element_text(size = 10)) +
ggtitle(paste(ko, "by Cohort"))
# 2. Combined Plot
p_combined <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
geom_point(aes(shape = Cohort),
color = "black", # Border of the point
stroke = 0.5,
position = position_jitter(width = 0.1, height = 0),
size = 2) +
scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
theme_classic() +
theme(axis.title.x = element_blank(), legend.position = "right") +
ggtitle(paste(ko, "Combined Cohorts"))
# Saving...
facet_path <- file.path(output_dir, paste0(ko, "_Faceted.pdf"))
pdf(facet_path, width = 8, height = 3)
print(p_facet)
dev.off()
combined_path <- file.path(output_dir, paste0(ko, "_Combined.pdf"))
pdf(combined_path, width = 5, height = 3)
print(p_combined)
dev.off()
}
#Figure 4A ##Figure 4A
rm(list=ls())
meta <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(meta) <- meta$KT_sampleName_withS
meta$DAS28 <- as.numeric(meta$DAS28)
meta$MTXRandMTXNR <- factor(meta$MTXRandMTXNR,
levels = c("MTXNR", "MTXR"))
wilcox_test <- wilcox.test(DAS28 ~ MTXRandMTXNR, data = meta)
wilcox_test
##
## Wilcoxon rank sum test with continuity correction
##
## data: DAS28 by MTXRandMTXNR
## W = 916, p-value = 0.02307
## alternative hypothesis: true location shift is not equal to 0
t.test(DAS28 ~ MTXRandMTXNR, data = meta)
##
## Welch Two Sample t-test
##
## data: DAS28 by MTXRandMTXNR
## t = -2.56, df = 97.241, p-value = 0.01201
## alternative hypothesis: true difference in means between group MTXNR and group MTXR is not equal to 0
## 95 percent confidence interval:
## -1.292138 -0.163587
## sample estimates:
## mean in group MTXNR mean in group MTXR
## 5.012750 5.740613
summary_df <- meta %>%
group_by(MTXRandMTXNR) %>%
summarise(
mean_DAS28 = mean(DAS28, na.rm = TRUE),
.groups = "drop")
P <- ggplot(summary_df, aes(x = MTXRandMTXNR, y = mean_DAS28, fill = MTXRandMTXNR)) +
# Bar layer
geom_col(width = 0.6, alpha = 0.7) +
# Point layer - explicitly adding 'fill' inside aes()
geom_point(data = meta,
aes(x = MTXRandMTXNR, y = DAS28, shape = Cohort, fill = MTXRandMTXNR),
color = "black",
stroke = 0.5,
position = position_jitter(width = 0.1, height = 0),
size = 2) +
# Consistent color and shape scales
scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
labs(x = "Response Status",
y = "DAS28 Score",
shape = "Cohort") +
theme_classic(base_size = 14) +
theme(legend.position = "right")
P
ggsave("Figure4A_DAS28.pdf", P, width = 5, height = 3)
##Figure 4B and Suppl.figure 9A
rm(list = ls())
pseq <- readRDS("../02_20251229_MetaphlanAnalysis/01_Diff_Level_Analysis/genusBatchCorrected_phyloseq.rds")
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(Metadata) <- Metadata$KT_sampleName_withS
Metadata$DAS28 <- as.numeric(Metadata$DAS28)
abund <- as(otu_table(pseq), "matrix")
if (taxa_are_rows(pseq)) { abund <- t(abund) }
abund <- abund + 1e-6
abund_clr <- t(apply(abund, 1, function(x) log(x / exp(mean(log(x))))))
abund_clr <- as.data.frame(abund_clr)
common_samples <- intersect(rownames(abund_clr), rownames(Metadata))
abund_clr <- abund_clr[common_samples, , drop = FALSE]
meta <- Metadata[common_samples, , drop = FALSE]
genus_names <- colnames(abund_clr)
results <- data.frame(
Genus = genus_names,
Spearman_rho = NA_real_,
p_value = NA_real_
)
for (i in seq_along(genus_names)) {
g <- genus_names[i]
x <- abund_clr[, g]
y <- meta$DAS28
if (sd(x, na.rm = TRUE) == 0 || all(is.na(y))) next
cor_test <- suppressWarnings(cor.test(x, y, method = "spearman"))
results$Spearman_rho[i] <- cor_test$estimate
results$p_value[i] <- cor_test$p.value
}
results$padj <- p.adjust(results$p_value, method = "BH")
results_filtered <- results %>%
mutate(
Genus_name = str_extract(Genus, "g__[^|]+"),
Genus_name = sub("^g__", "", Genus_name)
) %>%
filter(!is.na(Spearman_rho), padj <= 0.2, !str_starts(Genus_name, "GGB")) %>%
arrange(desc(abs(Spearman_rho)))
abund_long <- abund_clr[, results_filtered$Genus, drop = FALSE] %>%
as.data.frame() %>%
mutate(Sample = rownames(.)) %>%
pivot_longer(
cols = -Sample,
names_to = "Genus",
values_to = "Abundance"
) %>%
left_join(meta %>% mutate(Sample = rownames(meta)) %>% select(Sample, DAS28, Cohort),
by = "Sample"
) %>%
mutate(
Genus_name = str_extract(Genus, "g__[^|]+"),
Genus_name = sub("^g__", "", Genus_name)
) %>%
filter(!str_starts(Genus_name, "GGB"))
style <- theme_classic(base_size = 14) +
theme(
legend.position = "bottom",
aspect.ratio = 0.75,
panel.spacing = unit(1, "lines"),
strip.background = element_blank(),
strip.text = element_text(face = "bold")
)
my_geom_point <- geom_point(aes(color = Cohort), size = 2, alpha = 0.7)
my_geom_smooth <- geom_smooth(method = "lm", color = "black", linetype = "dashed", se = TRUE)
P_genus <- ggplot(abund_long, aes(x = Abundance, y = DAS28)) +
geom_point(aes(color = Cohort), size = 2, alpha = 0.7) +
geom_smooth(method = "lm", color = "black", linetype = "dashed", se = TRUE) +
facet_wrap(~Genus_name, scales = "free", nrow = 1) +
labs(x = "Genus CLR Abundance", y = "DAS28") +
style
ggsave("Figure4B_GeneravsDAS28Corr.pdf", plot = P_genus, width = 11, height = 4)
## `geom_smooth()` using formula = 'y ~ x'
##Figure 4C and Suppl.figure 9B
rm(list = ls())
pseq <- readRDS("../02_20251229_MetaphlanAnalysis/01_Diff_Level_Analysis/SpeciesBatchCorrected_phyloseq.rds")
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(Metadata) <- Metadata$KT_sampleName_withS
Metadata$DAS28 <- as.numeric(Metadata$DAS28)
abund <- as(otu_table(pseq), "matrix")
if (taxa_are_rows(pseq)) { abund <- t(abund) }
abund <- abund + 1e-6
abund_clr <- t(apply(abund, 1, function(x) log(x / exp(mean(log(x))))))
abund_clr <- as.data.frame(abund_clr)
common_samples <- intersect(rownames(abund_clr), rownames(Metadata))
abund_clr <- abund_clr[common_samples, , drop = FALSE]
meta <- Metadata[common_samples, , drop = FALSE]
Species_names <- colnames(abund_clr)
results <- data.frame(
Species = Species_names,
Spearman_rho = NA_real_,
p_value = NA_real_
)
for (i in seq_along(Species_names)) {
g <- Species_names[i]
x <- abund_clr[, g]
y <- meta$DAS28
if (sd(x, na.rm = TRUE) == 0 || all(is.na(y))) next
cor_test <- suppressWarnings(cor.test(x, y, method = "spearman"))
results$Spearman_rho[i] <- cor_test$estimate
results$p_value[i] <- cor_test$p.value
}
results$padj <- p.adjust(results$p_value, method = "BH")
results_filtered <- results %>%
mutate(
Species_name = str_extract(Species, "s__[^|]+"),
Species_name = sub("^s__", "", Species_name)
) %>%
filter(!is.na(Spearman_rho), padj <= 0.2,
!str_starts(Species_name, "GGB"),
!str_starts(Species_name, "SGB")) %>%
arrange(desc(abs(Spearman_rho)))
write.csv(results_filtered, "Suppl_Table_Species_DAS28_Corr.csv", row.names = FALSE)
abund_long <- abund_clr[, results_filtered$Species, drop = FALSE] %>%
as.data.frame() %>%
mutate(Sample = rownames(.)) %>%
pivot_longer(
cols = -Sample,
names_to = "Species",
values_to = "Abundance"
) %>%
left_join(
# Explicitly include Cohort here
meta %>% mutate(Sample = rownames(meta)) %>% select(Sample, DAS28, Cohort),
by = "Sample"
) %>%
mutate(
Species_name = str_extract(Species, "s__[^|]+"),
Species_name = sub("^s__", "", Species_name)
) %>%
filter(!str_starts(Species_name, "SGB"), !str_starts(Species_name, "GGB"))
style <- theme_classic(base_size = 14) +
theme(
legend.position = "bottom",
aspect.ratio = 0.75,
panel.spacing = unit(1, "lines"),
strip.background = element_blank(),
strip.text = element_text(face = "bold")
)
my_geom_point <- geom_point(aes(color = Cohort), size = 2, alpha = 0.7)
my_geom_smooth <- geom_smooth(method = "lm", color = "black", linetype = "dashed", se = TRUE)
P_species <- ggplot(abund_long, aes(x = Abundance, y = DAS28)) + my_geom_point + my_geom_smooth +
facet_wrap(~Species_name, scales = "free", ncol = 4) +
labs(
x = "Species CLR Abundance",
y = "DAS28",
title = "Significant Species Correlation") + style +
theme(strip.text = element_text(face = "italic"))
ggsave("Figure4C_Species.pdf", plot = P_species, width = 11, height = 10)
## `geom_smooth()` using formula = 'y ~ x'
##Figure 4D and Suppl.figure 9C
rm(list=ls())
d2 <- read.csv("../03_HumannAnalysis/02_KO/20251028BatchCorrectedKO.csv",
check.names = FALSE)
#Name first column as Genefamily
colnames(d2)[1] <- "Genefamily"
rownames(d2) <- d2$Genefamily
d2$Genefamily <- NULL
#Clean sample names
colnames(d2) <- sub("^X", "", colnames(d2))
colnames(d2) <- sub("_L002$", "", colnames(d2))
#Transpose: rows = samples, columns = KOs
abund <- t(d2)
abund <- as.data.frame(abund)
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv",
stringsAsFactors = FALSE)
Metadata$KT_sampleName_withS <- gsub("\\.", "-", Metadata$KT_sampleName_withS)
Metadata$DAS28 <- as.numeric(Metadata$DAS28)
rownames(Metadata) <- Metadata$KT_sampleName_withS
common_samples <- intersect(rownames(abund), rownames(Metadata))
abund <- abund[common_samples, , drop = FALSE]
meta <- Metadata[common_samples, , drop = FALSE]
#Add small pseudocount to avoid log(0)
abund_clr <- abund + 1e-6
#CLR per sample
abund_clr <- t(apply(abund_clr, 1, function(x) log(x / exp(mean(log(x))))))
abund_clr <- as.data.frame(abund_clr)
results <- data.frame(
GeneFamily = colnames(abund_clr),
Spearman_rho = NA_real_,
p_value = NA_real_
)
for (i in seq_along(colnames(abund_clr))) {
x <- abund_clr[, i]
y <- meta$DAS28
#Skip KOs with zero variance
if (sd(x, na.rm = TRUE) == 0) next
ct <- suppressWarnings(cor.test(x, y, method = "spearman"))
results$Spearman_rho[i] <- ct$estimate
results$p_value[i] <- ct$p.value
}
results$padj <- p.adjust(results$p_value, method = "BH")
results_filtered <- results %>%
filter(!is.na(Spearman_rho), padj <= 0.05) %>%
arrange(desc(abs(Spearman_rho)))
write.csv(results_filtered, "KEGG_DAS28_Correlations_CLR.csv", row.names = FALSE)
abund_long <- abund_clr[, results_filtered$GeneFamily, drop = FALSE] %>%
as.data.frame() %>%
mutate(Sample = rownames(.)) %>%
pivot_longer(
cols = -Sample,
names_to = "GeneFamily",
values_to = "Abundance"
) %>%
left_join(
#Ensure Cohort is selected here from the meta object
meta %>% mutate(Sample = rownames(meta)) %>% select(Sample, DAS28, Cohort),
by = "Sample"
)
P <- ggplot(abund_long, aes(x = Abundance, y = DAS28)) +
#Map color to Cohort inside geom_point
geom_point(aes(color = Cohort), size = 2, alpha = 0.7) +
#Standard black regression line for the overall trend
geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
facet_wrap(~ GeneFamily, scales = "free") +
labs( x = "KEGG KO CLR Abundance",
y = "DAS28",
title = "Correlation of CLR-Transformed KEGG KO Abundance with DAS28",
color = "Cohort") +
theme_classic(base_size = 14) +
theme(legend.position = "bottom")
print(P)
## `geom_smooth()` using formula = 'y ~ x'
ggsave("KEGG_Abund_vs_DAS28.pdf", plot = P, width = 10, height = 12)
## `geom_smooth()` using formula = 'y ~ x'
#Figure 5A
rm(list=ls())
set.seed(33)
d2_kegg <- fread("../03_HumannAnalysis/02_KO/20251028BatchCorrectedKO.csv") |> as.data.frame()
rownames(d2_kegg) <- d2_kegg[, 1]; d2_kegg <- d2_kegg[, -1]
colnames(d2_kegg) <- gsub("^X|_L002$|\\.", "-", colnames(d2_kegg))
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(Metadata) <- gsub("\\.", "-", Metadata$KT_sampleName_withS)
Metadata$MTXRandMTXNR <- factor(Metadata$MTXRandMTXNR, levels = c("MTXR", "MTXNR"))
common_s <- intersect(colnames(d2_kegg), rownames(Metadata))
Metadata <- Metadata[common_s, ]
kegg_df <- as.data.frame(t(d2_kegg[, common_s]))
rel_abund_df <- kegg_df / rowSums(kegg_df)
cohorts <- unique(Metadata$Cohort)
plot_results <- data.frame()
clin_features <- c("age", "gender", "DAS28")
#iitialize a list to store selected features per cohort
feature_log <- list()
#LOCO Loop
for (test_cohort in cohorts) {
cat("\n--- Testing on Cohort:", test_cohort, "---\n")
train_s <- rownames(Metadata)[Metadata$Cohort != test_cohort]
val_s <- rownames(Metadata)[Metadata$Cohort == test_cohort]
k_train_rel <- rel_abund_df[train_s, ]
k_val_rel <- rel_abund_df[val_s, ]
target_train <- Metadata[train_s, "MTXRandMTXNR"]
target_val <- Metadata[val_s, "MTXRandMTXNR"]
#Indepedent KO selection
cat(" Running Boruta on KEGG...\n")
boruta_k <- Boruta(x = k_train_rel, y = target_train, maxRuns = 200)
k_selected <- getSelectedAttributes(TentativeRoughFix(boruta_k))
#have at least 5 KOs
if (length(k_selected) < 5) {
cat(" Applying fallback to top 5 features...\n")
temp_rf <- randomForest::randomForest(x = k_train_rel, y = target_train, ntree = 500)
imp <- randomForest::importance(temp_rf)
top_kos <- row.names(imp)[order(imp[,1], decreasing = TRUE)]
k_selected <- unique(c(k_selected, setdiff(top_kos, k_selected)[1:(5 - length(k_selected))]))
}
#save the selected features for this cohort ---
feature_log[[test_cohort]] <- k_selected
cat(" Features selected for this split:", paste(k_selected, collapse=", "), "\n")
#define strategies
strategies <- list(
"KEGG Only" = k_selected,
"KEGG + DAS28" = c(k_selected, "DAS28"),
"KEGG + Clinical" = c(k_selected, "age", "gender", "DAS28"),
"Clinical Only" = c("age", "gender", "DAS28"))
for (strat_name in names(strategies)) {
feats <- strategies[[strat_name]]
k_in_strat <- intersect(feats, colnames(k_train_rel))
clin_in_strat <- intersect(feats, clin_features)
train_df <- k_train_rel[, k_in_strat, drop=FALSE]
val_df <- k_val_rel[, k_in_strat, drop=FALSE]
for(cli in clin_in_strat) {
tr_val <- Metadata[train_s, cli]; te_val <- Metadata[val_s, cli]
if(is.numeric(tr_val)) {
train_df[[cli]] <- tr_val; val_df[[cli]] <- te_val
} else {
lvl <- unique(c(tr_val, te_val))
train_df[[cli]] <- factor(tr_val, levels=lvl); val_df[[cli]] <- factor(te_val, levels=lvl)
}
}
train_df$Target <- target_train
ctrl <- trainControl(method="cv", number=5, classProbs=TRUE, summaryFunction=twoClassSummary)
rf_mod <- train(Target ~ ., data = train_df, method = "rf", ntree = 500, metric = "ROC", trControl = ctrl)
probs <- predict(rf_mod, val_df, type = "prob")[, "MTXNR"]
roc_obj <- roc(target_val, probs, quiet = TRUE)
plot_results <- rbind(plot_results, data.frame(
FPR = 1 - roc_obj$specificities, TPR = roc_obj$sensitivities,
Strategy = strat_name, TestCohort = test_cohort, AUC = as.numeric(auc(roc_obj))
))
}
}
##
## --- Testing on Cohort: Cohort-2 ---
## Running Boruta on KEGG...
## Features selected for this split: K00390, K01197, K01358, K01995, K02020, K04518
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
##
##
## --- Testing on Cohort: Cohort-1 ---
## Running Boruta on KEGG...
## Features selected for this split: K00127, K00783, K01190, K01255, K01358, K01620, K02348, K02874, K03306, K03469, K07056, K07114, K07726, K08093, K12373
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
##
##
## --- Testing on Cohort: Cohort-3 ---
## Running Boruta on KEGG...
## Features selected for this split: K00042, K00087, K00548, K01191, K01197, K01308, K01995, K02042, K02874, K02902, K02907, K03148, K04651, K04758, K07406, K13652, K15598
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
summary_auc <- plot_results %>%
group_by(Strategy) %>%
summarize(MeanAUC = round(mean(AUC), 2))
#create a text label for EACH facet that lists every cohort's specific AUC
facet_text <- plot_results %>%
group_by(Strategy, TestCohort) %>%
summarize(AUC = first(AUC)) %>% #Get the single AUC value for this cohort/strat
mutate(TextEntry = paste0(TestCohort, ": ", round(AUC, 2))) %>%
group_by(Strategy) %>%
summarize(AUC_List = paste(TextEntry, collapse = "\n"))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by Strategy and TestCohort.
## ℹ Output is grouped by Strategy.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(Strategy, TestCohort))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
#prepare the main plotting dataframe
plot_df <- left_join(plot_results, summary_auc, by = "Strategy")
plot_df <- left_join(plot_df, facet_text, by = "Strategy")
#create the Facet Labels
plot_df$StratLabel <- factor(
paste0(plot_df$Strategy, "\n(Mean AUC: ", plot_df$MeanAUC, ")"),
levels = paste0(names(strategies), "\n(Mean AUC: ",
summary_auc$MeanAUC[match(names(strategies), summary_auc$Strategy)], ")")
)
#generate the Plot
p_final_auc <- ggplot(plot_df, aes(x = FPR, y = TPR, color = TestCohort)) +
geom_path(linewidth = 1.2) +
geom_abline(linetype = "dashed", color = "grey50") +
#Add the text annotation to the bottom-right of each facet
geom_text(aes(label = AUC_List),
x = 1, y = 0, hjust = 1, vjust = -0.2,
size = 3.5, color = "black", inherit.aes = FALSE, check_overlap = TRUE,
data = plot_df %>% select(StratLabel, AUC_List) %>% unique()) +
facet_wrap(~StratLabel, ncol = 2, scales= "free") +
theme_classic(base_size = 12) +
scale_color_brewer(palette = "Set1") +
labs(
x = "1-Specificity (FPR)", y = "Sensitivity (TPR)",
color = "Validation Cohort") +
theme(legend.position = "bottom",
strip.background = element_rect(fill = "grey95"),
strip.text = element_text(face = "bold"))
print(p_final_auc)
ggsave("Figure5A.pdf", p_final_auc, height = 7, width = 7, dpi = 300)
#Figure 5B, Suppl.figure 10
rm(list=ls())
d2_kegg_raw <- fread("../03_HumannAnalysis/02_KO/20251028BatchCorrectedKO.csv") |> as.data.frame()
rownames(d2_kegg_raw) <- d2_kegg_raw[, 1]
d2_kegg_raw <- d2_kegg_raw[, -1]
colnames(d2_kegg_raw) <- gsub("^X|_L002$|\\.", "-", colnames(d2_kegg_raw))
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(Metadata) <- gsub("\\.", "-", Metadata$KT_sampleName_withS)
Metadata$MTXRandMTXNR <- factor(Metadata$MTXRandMTXNR, levels = c("MTXR", "MTXNR"))
common_s <- intersect(colnames(d2_kegg_raw), rownames(Metadata))
Metadata <- Metadata[common_s, ]
kegg_ra <- as.data.frame(t(d2_kegg_raw[, common_s]))
kegg_ra <- as.data.frame(sweep(kegg_ra, 1, rowSums(kegg_ra), "/"))
#tandardize clinical variables
Metadata$gender_num <- as.numeric(factor(Metadata$gender))
combined_df <- cbind(kegg_ra, Metadata[, c("DAS28", "age", "gender_num", "MTXRandMTXNR")])
colnames(combined_df)[colnames(combined_df) == "MTXRandMTXNR"] <- "Target"
colnames(combined_df)[which(colnames(combined_df) == "gender_num")] <- "gender"
#set seed for reproducibility
set_seed <- 18
set.seed(set_seed)
#STRATIFIED SPLIT: Partitioning data into 80% Training and 20% Validation + maintaing the same ratio of R/NR.
train_idx <- createDataPartition(combined_df$Target, p = 0.8, list = FALSE)
train_set <- combined_df[train_idx, ]
val_set <- combined_df[-train_idx, ]
#boruta Feature Selection on training cohort only
kegg_cols <- colnames(kegg_ra)
##run on only on the 80% Training Set to prevent data leakage.
#x the biological KEGG pathways (predictors)
#y is the Target (MTXR vs MTXNR status)
#maxRuns = 100 iterations are ok.
boruta_out <- Boruta(x = train_set[, kegg_cols], y = train_set$Target, doTrace = 0, maxRuns = 100)
#Boruta labels features as confirmed/Rejcted/Tent
#TentativeRoughFix performs a final statistical comparison to make keep/drop' decision
final_boruta <- TentativeRoughFix(boruta_out)
kegg_features <- getSelectedAttributes(final_boruta, withTentative = TRUE)
#check if less than 2 features selected then force select top 5 features based on median imp
if(length(kegg_features) < 2) {
imps <- attStats(final_boruta)
kegg_features <- rownames(imps[order(-imps$medianImp), ])[1:3]
}
#make grouping for main figure and supp figure as suggested by RN -Combined Strategies-single plot
feature_sets <- list(
"KEGG_only" = kegg_features,
"Clin_Param" = c("age", "gender", "DAS28"),
"KEGG_Clin_Param" = c(kegg_features, "age", "gender", "DAS28"),
#this is for suppl figure
"DAS28_only" = "DAS28",
"Age_only" = "age",
"Gender_only" = "gender",
"KEGG_DAS28" = c(kegg_features, "DAS28"),
"KEGG_Age" = c(kegg_features, "age"),
"KEGG_Gender" = c(kegg_features, "gender")
)
#loop for traing and validation
roc_results <- list()
#here am looping through each strategy (KEGG Only, Clinical, Both-see above)
#to compare their performance using the same train/test parameters.
for (strat_name in names(feature_sets)) {
#Extract the specific col names
vars <- feature_sets[[strat_name]]
#parameters:
#using 5-fold CV within the training set. what this does- splits the 80% training data into 5 'folds'—training on 4 and testing on 1—repeatedly to find the most stable model.
#then 'twoClassSummary' -we calculate AUC-ROC, Sensitivity, and Specificity.
set.seed(set_seed)
cv_ctrl <- trainControl(
method = "cv", number = 5,
classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions="final" )
#500 trees and 5 fold cross validation, method = "rf"- a random Forest
#metric = "ROC"- model to maximize the Area Under the Curve (AUC).
#drop = FALSEdata stays as a dataframe
rf_mod <- train(
x = train_set[, vars, drop = FALSE],
y = train_set$Target,
method = "rf", ntree = 500, metric = "ROC",
trControl = cv_ctrl
)
#rf_mod objects saves the learning
#We apply the trained rf_mod to the validation cohort- 20% hold-out data-see above.
#type = "prob"' returns the specific confidence level for each class. for example- Patient 501_1 has a 0.85 probability of being a MTXNR.
#Predict probabilities for AUC and Accuracy
probs <- predict(rf_mod, newdata = val_set[, vars, drop = FALSE], type = "prob")
#below we use default 50% threshold to decide the final category. that predicts the MTXR/NR and calculate the Accuracy percentage.
preds <- predict(rf_mod, newdata = val_set[, vars, drop = FALSE])
#Now compare the model's predictions against the actual 'Target' outcomes. from the validation set to calculate performance scores.
#We use the predicted probabilities for the Non-Responder class.
#The 'roc' function calculates the trade-off between Sensitivity and Specificity
#at every possible probability threshold (from 0 to 1).
roc_obj <- roc(val_set$Target, probs$MTXNR, quiet = TRUE)
#against the actual patient status. The 'overall["Accuracy"]' gives us the
#percentage of total patients who were correctly classified.
acc_val <- confusionMatrix(preds, val_set$Target)$overall["Accuracy"]
#now save - 'roc': The full curve object (needed for plotting the ROC curve later).
#auc-the single-number-AUC
#acc-the raw Accuracy score (percentage of correct calls).
roc_results[[strat_name]] <- list(
roc = roc_obj,
auc = as.numeric(auc(roc_obj)),
acc = suppressWarnings(as.numeric(acc_val))
)
}
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
#convert the roc_results into a dataframe.
#This format is required for plotting multiple ROC curves in a single ggplot.
plot_data <- bind_rows(
lapply(names(roc_results), function(x){
#Pull the Sensitivity and Specificity
#oordinates from the roc object for the current feature.
df <- data.frame(
Sensitivity = roc_results[[x]]$roc$sensitivities,
Specificity = roc_results[[x]]$roc$specificities)
#Remove duplicate points
df <- df[!duplicated(df$Specificity, fromLast = TRUE), ]
#order by Specificity so the line draws correctly from left to right.
df <- df[order(df$Specificity, decreasing = TRUE), ]
#Add the Strategy name and the final rounded AUC/Accuracy scores to each row.
df$Strategy <- x
df$AUC <- round(roc_results[[x]]$auc, 2)
df$Accuracy <- round(roc_results[[x]]$acc, 2)
return(df)
})
)
#update label to include both AUC and Accuracy
plot_data$Legend_Label <- paste0(plot_data$Strategy,
" (AUC: ", plot_data$AUC,
", Acc: ", plot_data$Accuracy, ")")
main_strategies <- c("KEGG_only", "Clin_Param", "KEGG_Clin_Param")
main_df <- plot_data %>% filter(Strategy %in% main_strategies)
#We use aes(color = Legend_Label) to show the metrics in the legend
legend_map <- unique(main_df[, c("Strategy", "Legend_Label")])
legend_labels <- setNames(legend_map$Legend_Label, legend_map$Strategy)
# 2. Plotting
p_main <- ggplot(main_df, aes(x = 1 - Specificity, y = Sensitivity, color = Strategy)) +
geom_step(direction = "vh", linewidth = 1.2) + # 'size' is deprecated, using 'linewidth'
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "darkgrey") +
# Map colors to the Strategy column (the short names)
# Use 'labels' to swap the short names for the long Legend_Labels in the visual legend
scale_color_manual(
values = c(
"KEGG_only" = "blue",
"Clin_Param" = "red",
"KEGG_Clin_Param" = "green"
),
labels = legend_labels
) +
theme_classic() +
labs(
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)",
title = "ROC Curve Comparison: Main Strategies",
color = "Model Strategy & Metrics"
) +
theme(
legend.position = c(0.65, 0.25),
legend.background = element_rect(fill = "white", color = "black")
)
print(p_main)
#individual Faceted Plot
indiv_df <- plot_data %>% filter(!(Strategy %in% main_strategies))
p_indiv <- ggplot(indiv_df, aes(x = 1 - Specificity, y = Sensitivity)) +
geom_step(color = "black", size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted") +
facet_wrap(~ Strategy) +
theme_classic() +
#Add both AUC and Accuracy text to each facet
geom_text(aes(x = 0.5, y = 0.2,
label = paste0("AUC: ", AUC, "\nAcc: ", Accuracy)),
size = 3.5, color = "black") +
labs(title = "Individual Feature Subsets Performance")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p_indiv)
ggsave("Strategy_Comparison_Main.pdf", plot = p_main, width = 8, height = 7)
ggsave("Strategy_Comparison_Individual.pdf", plot = p_indiv, width = 10, height = 8)
train_metadata_full <- Metadata[rownames(train_set), ]
val_metadata_full <- Metadata[rownames(val_set), ]
write.csv(train_metadata_full, "Train_Full_Metadata2.csv", row.names = TRUE)
write.csv(val_metadata_full, "Validation_Full_Metadata2.csv", row.names = TRUE)
#Print Boruta Features & Training Performance ---
cat("\n--- Boruta Selected KEGG Features ---\n")
##
## --- Boruta Selected KEGG Features ---
print(kegg_features)
## [1] "K00773" "K01197" "K01358" "K01876" "K01995" "K02874" "K02892" "K02907"
## [9] "K02952" "K02965" "K04758" "K08300" "K09760"
cat("\n--- RF Model Training Results (Best Tune) ---\n")
##
## --- RF Model Training Results (Best Tune) ---
#Note: rf_mod is the last model trained in my loop (KEGG_Clin_Param)
print(rf_mod$results[rf_mod$results$ROC == max(rf_mod$results$ROC), ])
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
cat("\n--- Boruta Selected KEGG Features ---\n")
##
## --- Boruta Selected KEGG Features ---
if(length(kegg_features) > 0) {
print(kegg_features)
} else {
cat("No features significantly selected; using top 5 by median importance.\n")
}
## [1] "K00773" "K01197" "K01358" "K01876" "K01995" "K02874" "K02892" "K02907"
## [9] "K02952" "K02965" "K04758" "K08300" "K09760"
cat("\n--- Training Results (Cross-Validation Metrics) ---\n")
##
## --- Training Results (Cross-Validation Metrics) ---
for (strat_name in names(feature_sets)) {
cat(paste0("\nStrategy: ", strat_name, "\n"))
#Accessing the best result from the caret object
print(rf_mod$results[rf_mod$results$ROC == max(rf_mod$results$ROC), ])
}
##
## Strategy: KEGG_only
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
##
## Strategy: Clin_Param
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
##
## Strategy: KEGG_Clin_Param
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
##
## Strategy: DAS28_only
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
##
## Strategy: Age_only
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
##
## Strategy: Gender_only
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
##
## Strategy: KEGG_DAS28
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
##
## Strategy: KEGG_Age
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
##
## Strategy: KEGG_Gender
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
cat("\n--- Validation Set Performance (Holdout) ---\n")
##
## --- Validation Set Performance (Holdout) ---
for (strat_name in names(roc_results)) {
cat(sprintf("%s - AUC: %.3f, Accuracy: %.3f\n",
strat_name,
roc_results[[strat_name]]$auc,
roc_results[[strat_name]]$acc))
}
## KEGG_only - AUC: 0.878, Accuracy: 0.789
## Clin_Param - AUC: 0.611, Accuracy: 0.579
## KEGG_Clin_Param - AUC: 0.922, Accuracy: 0.789
## DAS28_only - AUC: 0.600, Accuracy: 0.579
## Age_only - AUC: 0.672, Accuracy: 0.368
## Gender_only - AUC: 0.617, Accuracy: 0.632
## KEGG_DAS28 - AUC: 0.872, Accuracy: 0.737
## KEGG_Age - AUC: 0.867, Accuracy: 0.737
## KEGG_Gender - AUC: 0.844, Accuracy: 0.789
rf_models <- list()
roc_results <- list()
set.seed(set_seed)
cv_ctrl <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
for (strat_name in names(feature_sets)) {
vars <- feature_sets[[strat_name]]
cat("\nTraining strategy:", strat_name, "\n")
#Train Random Forest model
rf_mod <- train(
x = train_set[, vars, drop = FALSE],
y = train_set$Target,
method = "rf",
ntree = 500,
metric = "ROC",
trControl = cv_ctrl
)
#Store model
rf_models[[strat_name]] <- rf_mod
#Predict probabilities on validation set
probs <- predict(rf_mod, newdata = val_set[, vars, drop = FALSE], type = "prob")
#Predict classes
preds <- predict(rf_mod, newdata = val_set[, vars, drop = FALSE])
roc_obj <- roc(val_set$Target, probs$MTXNR, quiet = TRUE)
acc_val <- confusionMatrix(preds, val_set$Target)$overall["Accuracy"]
#Store results
roc_results[[strat_name]] <- list(
roc = roc_obj,
auc = as.numeric(auc(roc_obj)),
acc = as.numeric(acc_val)
)
}
##
## Training strategy: KEGG_only
##
## Training strategy: Clin_Param
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
##
##
## Training strategy: KEGG_Clin_Param
##
## Training strategy: DAS28_only
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
##
## Training strategy: Age_only
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
##
## Training strategy: Gender_only
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
##
## Training strategy: KEGG_DAS28
##
## Training strategy: KEGG_Age
##
## Training strategy: KEGG_Gender
cat("\n--- Training Results (Cross-Validation) ---\n")
##
## --- Training Results (Cross-Validation) ---
for (strat_name in names(rf_models)) {
cat(paste0("\nStrategy: ", strat_name, "\n"))
mod <- rf_models[[strat_name]]
best_row <- mod$results[which.max(mod$results$ROC), ]
print(best_row)
}
##
## Strategy: KEGG_only
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 3 13 0.8789435 0.7107143 0.8583333 0.07790267 0.2099259 0.0993808
##
## Strategy: Clin_Param
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 3 0.534127 0.4892857 0.6472222 0.1778114 0.2935858 0.1494589
##
## Strategy: KEGG_Clin_Param
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 1 2 0.8334077 0.7607143 0.8583333 0.09679308 0.1888134 0.0993808
##
## Strategy: DAS28_only
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 1 2 0.4890873 0.4785714 0.4 0.08547017 0.1667517 0.1493944
##
## Strategy: Age_only
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 1 2 0.6161458 0.5535714 0.5861111 0.1518314 0.1143415 0.2013649
##
## Strategy: Gender_only
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 1 2 0.5482143 0.2678571 0.8 0.06985635 0.2000478 0.1987616
##
## Strategy: KEGG_DAS28
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 2 8 0.8375496 0.7357143 0.8555556 0.0480927 0.2213854 0.1065016
##
## Strategy: KEGG_Age
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 1 2 0.8538194 0.7142857 0.9055556 0.08020728 0.09698911 0.1031899
##
## Strategy: KEGG_Gender
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 1 2 0.8508929 0.7107143 0.7916667 0.06569341 0.05113508 0.09212847
cat("\n--- Validation Set Performance (Holdout) ---\n")
##
## --- Validation Set Performance (Holdout) ---
for (strat_name in names(roc_results)) {
cat(sprintf("%s - AUC: %.3f, Accuracy: %.3f\n",
strat_name,
roc_results[[strat_name]]$auc,
roc_results[[strat_name]]$acc))
}
## KEGG_only - AUC: 0.878, Accuracy: 0.789
## Clin_Param - AUC: 0.644, Accuracy: 0.632
## KEGG_Clin_Param - AUC: 0.744, Accuracy: 0.579
## DAS28_only - AUC: 0.600, Accuracy: 0.579
## Age_only - AUC: 0.678, Accuracy: 0.368
## Gender_only - AUC: 0.617, Accuracy: 0.632
## KEGG_DAS28 - AUC: 0.889, Accuracy: 0.789
## KEGG_Age - AUC: 0.800, Accuracy: 0.684
## KEGG_Gender - AUC: 0.772, Accuracy: 0.579
#Suppl.figure 1 :Age and Sex metadata
rm(list = ls())
meta <- read.csv("../01_CuratingMetadata/metadataDAS28.csv",
stringsAsFactors = FALSE)
#Convert to factors
meta$MTXRandMTXNR <- as.factor(meta$MTXRandMTXNR)
meta$Cohort <- as.factor(meta$Cohort)
meta$gender <- as.factor(meta$gender)
meta$age <- as.numeric(meta$age)
t.test(age ~ MTXRandMTXNR, data = meta)
##
## Welch Two Sample t-test
##
## data: age by MTXRandMTXNR
## t = 0.31132, df = 94.143, p-value = 0.7562
## alternative hypothesis: true difference in means between group MTXNR and group MTXR is not equal to 0
## 95 percent confidence interval:
## -5.148716 7.063610
## sample estimates:
## mean in group MTXNR mean in group MTXR
## 49.00000 48.04255
summary_age <- meta %>%
group_by(MTXRandMTXNR) %>%
summarise(
mean_age = mean(age, na.rm = TRUE),
.groups = "drop"
)
summary_age_cohort <- meta %>%
group_by(Cohort, MTXRandMTXNR) %>%
summarise(
mean_age = mean(age, na.rm = TRUE),
.groups = "drop" )
summary_gender <- meta %>%
group_by(MTXRandMTXNR, gender) %>%
summarise(
n = n(),
.groups = "drop")
#1. Age Combined Plot
P_age <- ggplot(summary_age, aes(x = MTXRandMTXNR, y = mean_age, fill = MTXRandMTXNR)) +
geom_col(width = 0.6, alpha = 0.7) +
geom_point(data = meta,
aes(x = MTXRandMTXNR, y = age, shape = Cohort, fill = MTXRandMTXNR),
color = "black",
stroke = 0.5,
position = position_jitter(width = 0.1, height = 0),
size = 2) +
scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
labs(x = "Response Status", y = "Age (years)", shape = "Cohort") +
theme_classic(base_size = 14)
#2. Age Faceted by Cohort
P_age_cohort <- ggplot(summary_age_cohort, aes(x = MTXRandMTXNR, y = mean_age, fill = MTXRandMTXNR)) +
geom_col(width = 0.6, alpha = 0.7) +
geom_point(data = meta,
aes(x = MTXRandMTXNR, y = age, shape = Cohort, fill = MTXRandMTXNR),
color = "black",
stroke = 0.5,
position = position_jitter(width = 0.15, height = 0),
size = 2) +
scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
labs(x = "Response Status", y = "Age (years)") +
theme_classic(base_size = 14) +
theme(legend.position = "none",
axis.title.x = element_blank(),
axis.text.x = element_blank())
#3. Gender Counts Plot
#Note: No points here as it is a count summary, but keeping the colors consistent.
P_gender <- ggplot(summary_gender, aes(x = MTXRandMTXNR, y = n, fill = MTXRandMTXNR)) +
geom_col(width = 0.6, alpha = 0.8) +
facet_wrap(~ gender) +
scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
labs(x = "Response Status", y = "Number of subjects", fill = "Response") +
theme_classic(base_size = 14) +
theme(strip.text = element_text(face = "bold"))
P_age_cohort
P_gender
ggsave("Age_in_MTXR_MTXNR.pdf", P_age, width = 5, height = 3)
ggsave("Age_in_Each_Cohort.pdf", P_age_cohort, width = 8, height = 3)
ggsave("Gender_counts_MTXR_MTXNR_facet_gender.pdf", P_gender, width = 6, height = 3, useDingbats=F)