#Making Phyloseq Object from Metaphlan Outoput

rm (list=ls())
seqdir <- "../00_InputFiles_AnalysisFromWynton/02_MetaPhlAnOutput/"   #FCohort-1er containing individual MetaPhlAn output files
files <- list.files(
  seqdir,
  pattern = "_profiled\\.txt$|_profile\\.txt$",
  full.names = FALSE
)
files_full <- file.path(seqdir, files)

#extract sample names (remove suffix)
Sample <- files %>%
  gsub("_pure_reads_profiled\\.txt$", "", .) %>%
  gsub("_profile\\.txt$", "", .)
taxa_df <- NULL
for (k in seq_along(files_full)) {
  file <- files_full[k]
  #read table, skip header lines (starting with #), allow ragged rows
  samp <- read.table(file,
                     sep = "\t",
                     header = TRUE,
                     comment.char = "#",
                     quote = "",
                     stringsAsFactors = FALSE,
                     fill = TRUE)

  #only keep taxa and relative abundance columns
  if (ncol(samp) >= 3) {
    samp <- samp[, c(1, 3)]
    colnames(samp) <- c("Taxa", Sample[k])

    if (is.null(taxa_df)) {
      taxa_df <- samp
    } else {
      taxa_df <- full_join(taxa_df, samp, by = "Taxa")
    }
  }
}

taxa_df$Level <- case_when(
  grepl("t__", taxa_df$Taxa) ~ "Strain",
  grepl("s__", taxa_df$Taxa) ~ "Species",
  grepl("g__", taxa_df$Taxa) ~ "Genus",
  grepl("f__", taxa_df$Taxa) ~ "Family",
  grepl("o__", taxa_df$Taxa) ~ "Order",
  grepl("c__", taxa_df$Taxa) ~ "Class",
  grepl("p__", taxa_df$Taxa) ~ "Phylum",
  grepl("k__", taxa_df$Taxa) ~ "Kingdom",
  TRUE ~ "Unclassified"
)

#reorder columns
taxa_df <- taxa_df %>% select(Taxa, Level, everything())
write.table(taxa_df, "merged_taxa_table.tsv", sep = "\t",
            row.names = FALSE, quote = FALSE)

##Merging data

rm(list = ls())
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE) #This is the curated metadata,  see below

#set the row names using the 'KT_sampleName_withOutS' column
rownames(Metadata) <- Metadata$KT_sampleName_withS
head(Metadata)
##                                         fileName PatientID age
## SRR13436328  NYU_RA_5B_S152_L002_R1_001.fastq.gz         5  28
## SRR13436327                   37_S2.R1.fastq.gz*        37  30
## SRR13436277 NYU_RA_72B_S156_L002_R1_001.fastq.gz        72  37
## SRR13436266 NYU_RA_82B_S161_L002_R1_001.fastq.gz        82  31
## SRR13436255       NYU_RA_88B_S71_R1_001.fastq.gz        88  44
## SRR13436244       NYU_RA_90B_S75_R1_001.fastq.gz        90  71
##             KT_sampleName_withOutS KT_sampleName_withS patientID MTXRandMTXNR
## SRR13436328            SRR13436328         SRR13436328         5        MTXNR
## SRR13436327            SRR13436327         SRR13436327        37        MTXNR
## SRR13436277            SRR13436277         SRR13436277        72         MTXR
## SRR13436266            SRR13436266         SRR13436266        82        MTXNR
## SRR13436255            SRR13436255         SRR13436255        88        MTXNR
## SRR13436244            SRR13436244         SRR13436244        90        MTXNR
##             gender Cohort_Original   Cohort Country DAS28
## SRR13436328 Female             Old Cohort-1     USA  6.45
## SRR13436327   Male             Old Cohort-1     USA  6.04
## SRR13436277 Female             Old Cohort-1     USA  6.31
## SRR13436266 Female             Old Cohort-1     USA  4.62
## SRR13436255 Female             Old Cohort-1     USA  6.63
## SRR13436244 Female             Old Cohort-1     USA  4.92
seqtab <- read.csv("merged_taxa_table.tsv", 
                   row.names = 1,  header = TRUE,  sep = "\t")
seqtab[is.na(seqtab)] <- 0
colnames(seqtab) <- gsub("_L002", "", colnames(seqtab))
seqtab_samples <- colnames(seqtab)

#check samples are now without "_L002"
print(seqtab_samples[grep("NYU_RA_", seqtab_samples)])
##  [1] "NYU_RA_107B_S165" "NYU_RA_126B_S168" "NYU_RA_140B_S80"  "NYU_RA_211B_S85" 
##  [5] "NYU_RA_215B_S173" "NYU_RA_222B_S89"  "NYU_RA_263B_S177" "NYU_RA_283B_S91" 
##  [9] "NYU_RA_317B_S93"  "NYU_RA_334B_S95"  "NYU_RA_347B_S180" "NYU_RA_356B_S183"
## [13] "NYU_RA_360B_S186" "NYU_RA_371B_S189" "NYU_RA_5B_S152"   "NYU_RA_72B_S156" 
## [17] "NYU_RA_82B_S161"  "NYU_RA_88B_S71"   "NYU_RA_90B_S75"
#Select Strains that will select all the taxonomic levels
seqtab_strain <- seqtab[seqtab$Level == "Strain", ]
seqtab_strain$Level <- NULL  

colnames(seqtab_strain) <- colnames(seqtab_strain) %>%
  gsub("^X", "", .) %>%        #remove leading X
  gsub("\\.", "-", .)          #replace . w/ -

#subset ASV table to only metadata samples
otu_filtered <- seqtab_strain[, colnames(seqtab_strain) %in% rownames(Metadata)]

#create phyloseq components
otu_table_ps <- otu_table(as.matrix(otu_filtered), taxa_are_rows = TRUE)

#sample data
sample_data_ps <- sample_data(Metadata)

#taxonomy table
taxonomy_split <- strsplit(rownames(otu_filtered), "\\|")
rank_names <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain")
tax_matrix <- do.call(rbind, lapply(taxonomy_split, function(x) {
  length(x) <- length(rank_names)  #pad with NAs if needed
  return(x)
}))
colnames(tax_matrix) <- rank_names
rownames(tax_matrix) <- rownames(otu_filtered)
tax_table_ps <- tax_table(as.matrix(tax_matrix))
sdata <- sample_data(Metadata)
taxonomy_split <- strsplit(rownames(seqtab_strain), "\\|")

#Define the rank names (MetaPhlAn usually has 7 levels)
rank_names <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

#Pad or truncate to 7 levels and convert to matrix
tax_matrix <- do.call(rbind, lapply(taxonomy_split, function(x) {
  length(x) <- length(rank_names)  #Pad with NA if short
  return(x)
}))
colnames(tax_matrix) <- rank_names
rownames(tax_matrix) <- rownames(seqtab_strain)

tax <- tax_table(as.matrix(tax_matrix))
ps <- phyloseq(otu_table_ps, sdata, tax)
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2659 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 2659 taxa by 7 taxonomic ranks ]
sample_names(ps)[1:5]
## [1] "RA_457_SAPBO_B_S13"    "RA_492_B_microra_S27"  "RA_495_SANORA_B_S39"  
## [4] "RA_505-1_Before_S98"   "RA_513-1__Before_S100"
taxa_names(ps)[1:5]
## [1] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Segatella|s__Segatella_brasiliensis|t__SGB1636"
## [2] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Lachnospirales|f__Lachnospiraceae|g__Roseburia|s__Roseburia_sp_AM59_24XD|t__SGB4654" 
## [3] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Segatella|s__Segatella_sinensis|t__SGB1644"    
## [4] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__GGB1146|s__GGB1146_SGB1472|t__SGB1472"         
## [5] "k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Segatella|s__Segatella_copri|t__SGB1626"
rank_names(ps)
## [1] "Kingdom" "Phylum"  "Class"   "Order"   "Family"  "Genus"   "Species"
ps <- phyloseq(otu_table_ps, sample_data_ps, tax_table_ps)

ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2659 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 2659 taxa by 8 taxonomic ranks ]
rank_names(ps)
## [1] "Kingdom" "Phylum"  "Class"   "Order"   "Family"  "Genus"   "Species"
## [8] "Strain"
ps <- prune_taxa(taxa_sums(ps) > 0, ps)
saveRDS(ps, "PhyloseqObject.rds")
sample_data1 <- as_tibble(sample_data(ps))
md<- as_tibble(sample_data(ps))
ptnumber<- md%>%
  group_by(MTXRandMTXNR, Cohort)%>%
  summarize(pcount = n_distinct(PatientID))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by MTXRandMTXNR and Cohort.
## ℹ Output is grouped by MTXRandMTXNR.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(MTXRandMTXNR, Cohort))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.
p<- ggplot(ptnumber, aes(x=MTXRandMTXNR, y=pcount, fill =MTXRandMTXNR))+
  geom_col() +facet_wrap(~Cohort) +theme_classic2()+ geom_text(aes(label = pcount), vjust = -0.5)+ scale_fill_manual(values=c("darkorange3", "darkgreen" ))
p

ggsave( filename = "plot_pcount.pdf",  plot = p, width = 8, height = 6, dpi = 300 )
rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_rel <- transform_sample_counts(pseq, function(x) x / sum(x))
pseq_filtered <- prune_taxa(taxa_sums(pseq_rel) > 0, pseq_rel)
md <- as_tibble(sample_data(pseq_filtered))
pseq_filtered
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2100 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 2100 taxa by 8 taxonomic ranks ]
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) {
otu_mat <- t(otu_mat)
}
CRC_meta <- as(sample_data(pseq_filtered), "data.frame")


fit_adjust_batch <- adjust_batch(
feature_abd = otu_mat,
batch = "Cohort",   
covariates = "MTXRandMTXNR",    
data = CRC_meta,
control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj

otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)
pseq_adj <- phyloseq(otu_table_adj, tax_table(pseq_filtered), sample_data(pseq_filtered))
pseq_adj
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2100 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 2100 taxa by 8 taxonomic ranks ]
saveRDS(pseq_adj, file = "PseqBatchCorrected.rds")

#Suppl. Figure 2AB

pcoa_before <- ordinate(pseq_filtered, method = "PCoA", distance = "bray")

eig_vals_before <- pcoa_before$values$Eigenvalues
var_explained_before <- round(100 * eig_vals_before / sum(eig_vals_before), 2)

p1 <- plot_ordination(pseq_filtered, pcoa_before, color = "Cohort") +
geom_point(size = 4) +
theme_classic() +
scale_color_manual(values = c("darkgreen", "darkorange3", "black")) +
theme(
axis.title = element_text(size = 16),
axis.text = element_text(size = 14)
) +
labs(
title = "Before Batch Correction (Bray-Curtis PCoA)",
x = paste0("PCoA1 (", var_explained_before[1], "%)"),
y = paste0("PCoA2 (", var_explained_before[2], "%)")
)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the phyloseq package.
##   Please report the issue at <https://github.com/joey711/phyloseq/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p1

ggsave("SupplFigure2A_PCoA_before_batch_correct.pdf", p1, height = 4, width = 6, dpi = 300)
pcoa_after <- ordinate(pseq_adj, method = "PCoA", distance = "bray")

eig_vals_after <- pcoa_after$values$Eigenvalues
var_explained_after <- round(100 * eig_vals_after / sum(eig_vals_after), 2)

p2 <- plot_ordination(pseq_adj, pcoa_after, color = "Cohort") +
geom_point(size = 4) +
theme_classic() +
scale_color_manual(values = c("darkgreen", "darkorange3", "black")) +
theme(
axis.title = element_text(size = 16),
axis.text = element_text(size = 14)
) +
labs(
title = "After Batch Correction (Bray-Curtis PCoA)",
x = paste0("PCoA1 (", var_explained_after[1], "%)"),
y = paste0("PCoA2 (", var_explained_after[2], "%)"))
p2

ggsave("SupplFigure2B_PCoA_After_batch_correct.pdf", p2, height = 4, width = 6, dpi = 300)
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
meta_before <- as(sample_data(pseq_filtered), "data.frame")
res_perm_before <- adonis2(
bc_dist_before ~ Cohort+MTXRandMTXNR,,
by = "term",
data = meta_before,
permutations = 1000)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_before ~ Cohort + MTXRandMTXNR, data = meta_before, permutations = 1000, by = "term")
##              Df SumOfSqs      R2      F   Pr(>F)    
## Cohort        2    3.814 0.10462 5.7082 0.000999 ***
## MTXRandMTXNR  1    0.571 0.01566 1.7094 0.014985 *  
## Residual     96   32.074 0.87972                    
## Total        99   36.460 1.00000                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_after <- as(sample_data(pseq_adj), "data.frame")
res_after <- adonis2(
bc_dist_after ~ Cohort+MTXRandMTXNR,,
by = "term",
data = meta_after,
permutations = 1000)
res_after
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_after ~ Cohort + MTXRandMTXNR, data = meta_after, permutations = 1000, by = "term")
##              Df SumOfSqs      R2      F  Pr(>F)  
## Cohort        2    0.964 0.02808 1.4119 0.01099 *
## MTXRandMTXNR  1    0.594 0.01730 1.7402 0.01099 *
## Residual     96   32.786 0.95462                 
## Total        99   34.344 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Suppl figure 4

rm(list=ls())
pseq <- readRDS("PseqBatchCorrected.rds")
df <- estimate_richness(pseq, measures = "Shannon") %>% cbind(sample_data(pseq))
## Warning in estimate_richness(pseq, measures = "Shannon"): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
## 
## We recommended that you find the un-trimmed data and retry.
base_plot <- ggplot(df, aes(x = MTXRandMTXNR, y = Shannon, fill = MTXRandMTXNR)) +
  stat_summary(fun = mean, geom = "bar", alpha = 0.4) +
  scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
  theme_classic() +
  theme(axis.title = element_text(size = 16), axis.text = element_text(size = 14))

#combined dara
p1 <- base_plot + 
  geom_point(aes(shape = Cohort), size = 2.5, stroke = 0.5, color = "black",
             position = position_jitter(0.1, 0)) +
  stat_compare_means(method = "wilcox.test")

ggsave("SupplFig4left_Alpha.pdf", p1, width = 5, height = 3)

#cohortwise
p2 <- base_plot + 
  geom_point(aes(shape = Cohort), size = 1.5, stroke = 0.3, color = "black",
             position = position_jitter(0.1, 0)) +
  facet_wrap(~Cohort, scales = "free_y") +
  stat_compare_means(method = "wilcox.test", label = "p.signif", hide.ns = TRUE) +
  stat_compare_means(method = "t.test", label = "p.format")
p1

p2 

ggsave("SupplFig4right_Alpha.pdf", p2, width = 8, height = 3)

##Figure 1A

rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
ntaxa(pseq)
## [1] 2100
pseq_rel <- transform_sample_counts(pseq, function(x) x / sum(x))
summary(sample_sums(pseq_rel))  #Verify normalization
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Cohorts found:", paste(cohorts, collapse = ", "), "\n")
## Cohorts found: Cohort-2, Cohort-1, Cohort-3
#Keep species that are >1e-5 in >=10% of samples in at least one cohort
pseq_filtered <- filter_taxa(
  pseq_rel,
  function(x) {
    all(sapply(cohorts, function(cohort) {
      cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
      mean(x[cohort_samples] > 5e-5) >= 0.1
    }))
  },
  prune = TRUE
)

cat("No . of taxa after cohort-aware filtering:", ntaxa(pseq_filtered), "\n")
## No . of taxa after cohort-aware filtering: 249
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)

meta <- as(sample_data(pseq_filtered), "data.frame")
fit_adjust_batch <- adjust_batch(
  feature_abd = otu_mat,
  batch = "Cohort",       
  covariates = "MTXRandMTXNR",    
  data = meta,
  control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)

pseq_adj <- phyloseq( otu_table_adj,
  tax_table(pseq_filtered),
  sample_data(pseq_filtered))

cat("strain-level phyloseq object filtered and batch-corrected.\n")
## strain-level phyloseq object filtered and batch-corrected.
set.seed(111)
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2(
  bc_dist_after ~ Cohort+MTXRandMTXNR,
  data = meta_adj, by ="terms", 
  permutations = 1000)
res_perm_after
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_after ~ Cohort + MTXRandMTXNR, data = meta_adj, permutations = 1000, by = "terms")
##              Df SumOfSqs      R2      F   Pr(>F)    
## Cohort        2   1.2369 0.03945 2.0119 0.000999 ***
## MTXRandMTXNR  1   0.6026 0.01922 1.9603 0.006993 ** 
## Residual     96  29.5117 0.94132                    
## Total        99  31.3513 1.00000                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dist_matrix <- phyloseq::distance(pseq_adj, method = "bray", weighted = TRUE)
ordination <- ordinate(pseq_adj, method = "PCoA", distance = dist_matrix)
var_expl <- ordination$values$Eigenvalues / sum(ordination$values$Eigenvalues) * 100
sample_data_df <- as(sample_data(pseq_adj), "data.frame")
p <- plot_ordination(pseq_adj, ordination, axes = c(3, 5), color = "MTXRandMTXNR", shape = "Cohort") + geom_point(size = 5) +
  scale_color_manual(values = c("darksalmon", "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 15, "Cohort-2" = 16, "Cohort-3" = 17)) +
  labs( x = paste0("PC3 (", round(var_expl[3], 1), "%)"),
    y = paste0("PC5 (", round(var_expl[5], 1), "%)")) +theme_classic() + theme(axis.title = element_text(size = 16),
    axis.text = element_text(size = 14))
p

ggsave("Figure1A_PCoA.pdf", p, height = 6, width = 8, dpi = 300)

##Figure 1B

#selecting PC3 and PC12 as they are p<0.05
plot_data_long <- data.frame(
    sample_data_df, 
    PC3 = ordination$vectors[,3], 
    PC12 = ordination$vectors[,12]
  ) %>%
  pivot_longer(
    cols = c(PC3, PC12), 
    names_to = "PC_Axis", 
    values_to = "Coordinate_Value"
  )

p_faceted <- ggplot(plot_data_long, aes(x = MTXRandMTXNR, y = Coordinate_Value, fill = MTXRandMTXNR)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(aes(shape = Cohort), width = 0.1, height =0, size = 2, alpha = 0.6) +
  #Use facet_wrap to create separate panels for PC3 and PC5
  #scales = "free_y" is crucial since PC3 and PC5 often have different ranges
  facet_wrap(~PC_Axis, scales = "free_y") + 
  scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 15, "Cohort-2" = 16, "Cohort-3" = 17)) +
  labs(  y = "PCoA comp") + theme_classic() 

print(p_faceted)

ggsave("Figure1B_PC_Distri3_12.pdf", p_faceted, height = 6, width = 7)
wilcox_PC3 <- wilcox.test(ordination$vectors[,3] ~ sample_data_df$MTXRandMTXNR)
wilcox_PC12 <- wilcox.test(ordination$vectors[,12] ~ sample_data_df$MTXRandMTXNR)
print(wilcox_PC12)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ordination$vectors[, 12] by sample_data_df$MTXRandMTXNR
## W = 727, p-value = 0.000347
## alternative hypothesis: true location shift is not equal to 0
print(wilcox_PC3)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ordination$vectors[, 3] by sample_data_df$MTXRandMTXNR
## W = 1634, p-value = 0.00737
## alternative hypothesis: true location shift is not equal to 0

##Figure 1C

rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_phylum <- tax_glom(pseq, taxrank = "Phylum", NArm = TRUE)
ntaxa(pseq)   
## [1] 2100
ntaxa(pseq_phylum)   #after tax_glom("Phylum")
## [1] 21
pseq_rel <- transform_sample_counts(pseq_phylum, function(x) x / sum(x))
pseq_rel
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 21 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 21 taxa by 8 taxonomic ranks ]
summary(sample_sums(pseq_rel))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Cohorts found:", paste(cohorts, collapse = ", "), "\n")
## Cohorts found: Cohort-2, Cohort-1, Cohort-3
#Keep phyla that are >5e-5 in >=10% of samples in at least one cohort
pseq_filtered <- filter_taxa(
  pseq_rel,
  function(x) {
    all(sapply(cohorts, function(cohort) {
      cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
      mean(x[cohort_samples] > 5e-5) >= 0.10
    }))
  },
  prune = TRUE)

cat("no of phyla after filtering:", ntaxa(pseq_filtered), "\n")
## no of phyla after filtering: 7
#prepare ASV matrix & metadata

otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)

meta <- as(sample_data(pseq_filtered), "data.frame")

#batch correction

fit_adjust_batch <- adjust_batch(
  feature_abd = otu_mat,
  batch = "Cohort",                   
  covariates = "MTXRandMTXNR",
  data = meta,
  control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)

pseq_adj <- phyloseq(
  otu_table_adj,
  tax_table(pseq_filtered),
  sample_data(pseq_filtered))

cat("phylum-level phyloseq object filtered and batch-corrected.\n")
## phylum-level phyloseq object filtered and batch-corrected.
saveRDS(pseq_adj, file = "PhylumBatchCorrected_phyloseq.rds")

#permanova
#Before batch correction
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
res_perm_before <- adonis2(
  bc_dist_before ~ MTXRandMTXNR+Cohort+gender+age+DAS28,
  data = meta,
  by = "term",
  permutations = 1000
)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_before ~ MTXRandMTXNR + Cohort + gender + age + DAS28, data = meta, permutations = 1000, by = "term")
##              Df SumOfSqs      R2       F   Pr(>F)    
## MTXRandMTXNR  1   0.1920 0.02246  4.0426 0.031968 *  
## Cohort        2   3.7592 0.43994 39.5845 0.000999 ***
## gender        1   0.1086 0.01271  2.2880 0.119880    
## age           1   0.0622 0.00727  1.3090 0.254745    
## DAS28         1   0.0068 0.00080  0.1441 0.851149    
## Residual     93   4.4159 0.51680                     
## Total        99   8.5447 1.00000                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#After batch correction (only MTX effect)
set.seed(111)
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2(
  bc_dist_after ~ MTXRandMTXNR + Cohort+gender+age,
  data = meta,
  by = "term",
  permutations = 1000)
res_perm_after
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR + Cohort + gender + age, data = meta, permutations = 1000, by = "term")
##              Df SumOfSqs      R2      F  Pr(>F)  
## MTXRandMTXNR  1   0.2076 0.03746 3.7697 0.03497 *
## Cohort        2   0.0407 0.00734 0.3691 0.81319  
## gender        1   0.0815 0.01472 1.4808 0.21578  
## age           1   0.0352 0.00636 0.6400 0.51548  
## Residual     94   5.1762 0.93413                 
## Total        99   5.5412 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#conver phyloobject to matrix to input into MMUPHIN
abund <- as(otu_table(pseq_adj), "matrix")
if (taxa_are_rows(pseq_adj)) abund <- t(abund)

common_samples <- intersect(rownames(abund), rownames(meta_adj))
abund <- abund[common_samples, , drop = FALSE]
meta_adj <- meta_adj[common_samples, , drop = FALSE]

abund_t <- t(abund)  #taxa x samples

fit_meta <- lm_meta(
  feature_abd = abund_t,
  batch = "Cohort",
  exposure = "MTXRandMTXNR",
  covariates = c("age", "gender"),
  data = meta_adj)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
## Found 3 batches
## Fitting Maaslin2 on batch Cohort-1...
## Fitting Maaslin2 on batch Cohort-2...
## Fitting Maaslin2 on batch Cohort-3...
## Fitting meta-analysis model.
meta_fits <- fit_meta$meta_fits
#View(meta_fits)
write.csv(meta_fits, "Phylum.csv")
p <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
  geom_point(aes(color = ifelse(qval.fdr < 0.2, 
                               ifelse(coef >= 0, "MTX-R", "MTX-NR"), "NS"),
                 alpha = ifelse(qval.fdr < 0.2, 1, 0.3)), 
             size = 1.5) +
  geom_point(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(color = ifelse(coef >= 0, "MTX-R", "MTX-NR")),
    shape = 21,         #Essential: enables fill + border
    color = "black",    #The outline color
    stroke = 0.5,       #Controls thickness of the outline
    size = 2, 
    alpha = 1
  ) +
  
  #Label significant phyla
  geom_text_repel(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(label = str_extract(feature, "p__[^|]+")),
    size = 3,
    box.padding = 0.3,
    point.padding = 0.3,
    segment.color = "grey30",
    segment.size = 0.4,
    segment.alpha = 0.8,
    min.segment.length = 0,
    force = 2,
    max.overlaps = Inf
  ) +
  scale_color_manual(
    values = c(
      "MTX-R" = "#238A8DFF",
      "MTX-NR" = "darksalmon",
      "NS" = "grey")) +
  
  #alpha scale
  scale_alpha_identity() +
  theme_classic() +
  labs( x = "Effect Size (Coefficient)",
    y = expression(-log[10]("p-value")),
    color = "Significance" ) +theme( axis.text = element_text(size = 10),
    axis.title = element_text(size = 12), legend.position = "right")
p

ggsave("Phylum_volcano.pdf", p, height = 2, width = 3, dpi = 300, )

##Supplementary Figure 05

library(patchwork)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
pseq <- readRDS("PhylumBatchCorrected_phyloseq.rds")
pseq_phylum <- tax_glom(pseq, taxrank = "Phylum", NArm = TRUE)
ntaxa(pseq)          #original OTUs
## [1] 7
ntaxa(pseq_phylum)   #after tax_glom("Phylum")
## [1] 7
pseq <- transform_sample_counts(pseq_phylum, function(x) x / sum(x))

y4b <- psmelt(pseq)
y4b$Phylum <- as.character(y4b$Phylum)

phyla_to_test <- meta_fits %>%
  filter(qval.fdr < 0.2) %>%
  pull(feature) %>%
  str_extract("p__[^|]+") %>%
  unique()

output_dir <- "01_Phylum_Plots"
if (!dir.exists(output_dir)) dir.create(output_dir)

for (phy in phyla_to_test) {

  plot_data <- subset(y4b, Phylum == phy)
  plot_data$MTXRandMTXNR <- factor(plot_data$MTXRandMTXNR, levels = c("MTXNR", "MTXR"))
  
  #Remove zero-abundance samples for baseline calculation and log-scale plotting- when Iam remiving this it is causing weird plot thus removing non-zero samples
  nonzero_data <- plot_data %>% filter(Abundance > 0)

    facet_min_per_cohort <- nonzero_data %>%
    group_by(Cohort) %>%
    summarise(bar_base = min(Abundance, na.rm = TRUE), .groups = "drop")
  
  summary_facet <- nonzero_data %>%
    group_by(Cohort, MTXRandMTXNR) %>%
    summarise(mean_abundance = 10^mean(log10(Abundance)), .groups = "drop") %>%
    left_join(facet_min_per_cohort, by = "Cohort") %>%
    mutate(x = as.numeric(MTXRandMTXNR),
           xmin = x - 0.3,
           xmax = x + 0.3)
  
  plot_facet <- ggplot() +
    geom_rect(data = summary_facet,
              aes(xmin = xmin, xmax = xmax, ymin = bar_base, ymax = mean_abundance, fill = MTXRandMTXNR),
              alpha = 0.7, color = "black", linewidth = 0.3) +
    geom_point(data = nonzero_data,
               aes(x = MTXRandMTXNR, y = Abundance, shape = Cohort, fill = MTXRandMTXNR),
               color = "black", stroke = 0.5,
               position = position_jitter(width = 0.1, height = 0), size = 2) +
    scale_shape_manual(values = c(22,21,24)) +
    scale_fill_manual(values = c("MTXNR"="darksalmon","MTXR"="#238A8DFF")) +
    scale_y_log10(expand = expansion(mult = c(0,0.1)),
                  labels = trans_format("log10", math_format(10^.x))) +
    theme_classic() +
    stat_compare_means(data = nonzero_data,
                       aes(x=MTXRandMTXNR, y=Abundance),
                       method="wilcox.test", label="p.signif", hide.ns=FALSE) +
    facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
    ggtitle(paste(phy, "by Cohort")) +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_blank(),
          legend.position = "none",
          plot.title = element_text(size=11, face="bold.italic"))
  
  #baseline = minimum non-zero value across all samples
  combined_min <- min(nonzero_data$Abundance, na.rm = TRUE)
  
  summary_combined <- nonzero_data %>%
    group_by(MTXRandMTXNR) %>%
    summarise(mean_abundance = 10^mean(log10(Abundance)), .groups = "drop") %>%
    mutate(bar_base = combined_min,
           x = as.numeric(MTXRandMTXNR),
           xmin = x - 0.3,
           xmax = x + 0.3)
  
  plot_combined <- ggplot() +
    geom_rect(data = summary_combined,
              aes(xmin = xmin, xmax = xmax, ymin = bar_base, ymax = mean_abundance, fill = MTXRandMTXNR),
              alpha = 0.7, color="black", linewidth=0.3) +
    geom_point(data = nonzero_data,
               aes(x = MTXRandMTXNR, y = Abundance, shape = Cohort, fill = MTXRandMTXNR),
               color = "black", stroke=0.5,
               position = position_jitter(width=0.1, height=0), size=2.5) +
    scale_shape_manual(values = c(22,21,24)) +
    scale_fill_manual(values = c("MTXNR"="darksalmon","MTXR"="#238A8DFF")) +
    scale_y_log10(expand = expansion(mult = c(0,0.1)),
                  labels = trans_format("log10", math_format(10^.x))) +
    theme_classic() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_blank(),
          legend.position = "left") +
    ggtitle(paste(phy, "Combined"))
  
  #Combin
  combined_output <- plot_combined + plot_facet + plot_layout(ncol=2, widths=c(2,3))
  
  clean_phy_name <- gsub("p__","",phy)
  pdf_filename <- file.path(output_dir, paste0(clean_phy_name,"_RvsNR_Abundance.pdf"))
  
  pdf(pdf_filename, width=12, height=4)
  print(combined_output)
  dev.off()
  
  message("Saved plot for ", phy)
}
## Saved plot for p__Bacteroidota
## Saved plot for p__Firmicutes
## Saved plot for p__Methanobacteriota

##Figure 1D

rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_family <- tax_glom(pseq, taxrank = "Family", NArm = TRUE)
pseq_family 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 407 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 12 sample variables ]
## tax_table()   Taxonomy Table:    [ 407 taxa by 8 taxonomic ranks ]
ntaxa(pseq)          
## [1] 2100
ntaxa(pseq_family)   
## [1] 407
pseq_rel <- transform_sample_counts(pseq_family, function(x) x / sum(x))
summary(sample_sums(pseq_rel))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Cohorts found:", paste(cohorts, collapse = ", "), "\n")
## Cohorts found: Cohort-2, Cohort-1, Cohort-3
#Keep phyla that are >1e-4 in >=5% of samples in at least one cohort
pseq_filtered <- filter_taxa(
  pseq_rel,
  function(x) {
    all(sapply(cohorts, function(cohort) {
      cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
      mean(x[cohort_samples] > 5e-5) >= 0.1
    }))
  },
  prune = TRUE)

cat("Number of Phyla after cohort wise filtering:", ntaxa(pseq_filtered), "\n")
## Number of Phyla after cohort wise filtering: 79
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)

meta <- as(sample_data(pseq_filtered), "data.frame")
fit_adjust_batch <- adjust_batch(
  feature_abd = otu_mat,
  batch = "Cohort",                   #Batch variable
  covariates = "MTXRandMTXNR",       #Preserve MTX signal
  data = meta,
  control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)

pseq_adj <- phyloseq(
  otu_table_adj,
  tax_table(pseq_filtered),
  sample_data(pseq_filtered))

saveRDS(pseq_adj, file = "familyBatchCorrected_phyloseq.rds")


#PERMANOVA
#Before batch correction
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
res_perm_before <- adonis2( bc_dist_before ~ Cohort + MTXRandMTXNR,data = meta,
  by = "term", permutations = 1000)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_before ~ Cohort + MTXRandMTXNR, data = meta, permutations = 1000, by = "term")
##              Df SumOfSqs      R2       F   Pr(>F)    
## Cohort        2   4.7383 0.23238 14.8590 0.000999 ***
## MTXRandMTXNR  1   0.3455 0.01694  2.1668 0.043956 *  
## Residual     96  15.3064 0.75068                     
## Total        99  20.3902 1.00000                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#After batch correction (only MTX effect)
set.seed(111)
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2( bc_dist_after ~ MTXRandMTXNR + Cohort+gender+age, data = meta,
  by = "term",permutations = 1000)
res_perm_after
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR + Cohort + gender + age, data = meta, permutations = 1000, by = "term")
##              Df SumOfSqs      R2      F  Pr(>F)  
## MTXRandMTXNR  1   0.2150 0.01353 1.3636 0.19580  
## Cohort        2   0.5130 0.03227 1.6269 0.04795 *
## gender        1   0.1424 0.00896 0.9034 0.52747  
## age           1   0.2041 0.01284 1.2947 0.23377  
## Residual     94  14.8215 0.93240                 
## Total        99  15.8961 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abund <- as(otu_table(pseq_adj), "matrix")
if (taxa_are_rows(pseq_adj)) abund <- t(abund)

common_samples <- intersect(rownames(abund), rownames(meta_adj))
abund <- abund[common_samples, , drop = FALSE]
meta_adj <- meta_adj[common_samples, , drop = FALSE]

abund_t <- t(abund)  #taxa vs samples

fit_meta <- lm_meta(feature_abd = abund_t, batch = "Cohort", exposure = "MTXRandMTXNR",
  covariates = c("age", "gender"),
  data = meta_adj)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
## Found 3 batches
## Fitting Maaslin2 on batch Cohort-1...
## Fitting Maaslin2 on batch Cohort-2...
## Fitting Maaslin2 on batch Cohort-3...
## Fitting meta-analysis model.
meta_fits <- fit_meta$meta_fits
#View(meta_fits)
p <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
  geom_point(aes(color = ifelse(qval.fdr < 0.2, 
                                 ifelse(coef >= 0, "MTX-R", "MTX-NR"), "NS"),
                 alpha = ifelse(qval.fdr < 0.2, 1, 0.3)), 
             size = 1.5) +
  

  geom_point(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(fill = ifelse(coef >= 0, "MTX-R", "MTX-NR")), #Use fill for the inside
    shape = 21,          #Enables the border
    color = "black",     #The border color
    stroke = 0.5,        #Thickness of the black line
    size = 2, 
    alpha = 1
  ) +
  
  geom_text_repel(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(label = str_extract(feature, "f__[^|]+")),
    size = 3,
    box.padding = 0.3,
    point.padding = 0.3,
    segment.color = "grey30",
    segment.size = 0.4,
    segment.alpha = 0.8,
    min.segment.length = 0,
    force = 2,
    max.overlaps = Inf
  ) +

  scale_fill_manual(
    values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon")
  ) +
  scale_color_manual(
    values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon", "NS" = "grey")
  ) +
  
  scale_alpha_identity() +
  theme_classic() +
  labs(
    x = "Effect Size (Coefficient)",
    y = expression(-log[10]("p-value")),
    color = "Significance",
    fill = "Significance"
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.position = "right"
  )

p

ggsave("Family_volcano.pdf", p, height = 2, width = 3, dpi = 300)

##Figure 1E

rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_genus <- tax_glom(pseq, taxrank = "Genus", NArm = TRUE)

ntaxa(pseq) 
## [1] 2100
ntaxa(pseq_genus )
## [1] 1256
pseq_rel <- transform_sample_counts(pseq_genus, function(x) x / sum(x))
summary(sample_sums(pseq_rel))  #verify normalization
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Cohorts :", paste(cohorts, collapse = ", "), "\n")
## Cohorts : Cohort-2, Cohort-1, Cohort-3
#Keep genus that are >1e-5 in >=10% of samples in at least one cohort
pseq_filtered <- filter_taxa(
  pseq_rel,
  function(x) {
    all(sapply(cohorts, function(cohort) {
      cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
      mean(x[cohort_samples] > 5e-5) >= 0.1
    }))
  },
  prune = TRUE
)

cat(ntaxa(pseq_filtered), "\n")
## 159
otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)

meta <- as(sample_data(pseq_filtered), "data.frame")

#batch correction
fit_adjust_batch <- adjust_batch(
  feature_abd = otu_mat,
  batch = "Cohort",                 
  covariates = "MTXRandMTXNR",
  data = meta,
  control = list(verbose = FALSE)
)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)

pseq_adj <- phyloseq(
  otu_table_adj,
  tax_table(pseq_filtered),
  sample_data(pseq_filtered)
)
saveRDS(pseq_adj, file = "genusBatchCorrected_phyloseq.rds")

#Before batch correction
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
res_perm_before <- adonis2(
  bc_dist_before ~ Cohort + MTXRandMTXNR,
  data = meta,
  by = "term",
  permutations = 1000)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_before ~ Cohort + MTXRandMTXNR, data = meta, permutations = 1000, by = "term")
##              Df SumOfSqs      R2       F   Pr(>F)    
## Cohort        2   5.2850 0.18863 11.3833 0.000999 ***
## MTXRandMTXNR  1   0.4479 0.01599  1.9294 0.040959 *  
## Residual     96  22.2855 0.79539                     
## Total        99  28.0185 1.00000                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#AFTER batch correction
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2(
  bc_dist_after ~ MTXRandMTXNR + Cohort,
  data = meta_adj,
  permutations = 1000)
res_perm_after
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR + Cohort, data = meta_adj, permutations = 1000)
##          Df SumOfSqs      R2      F   Pr(>F)   
## Model     3   1.4136 0.05767 1.9585 0.003996 **
## Residual 96  23.0961 0.94233                   
## Total    99  24.5096 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abund <- as(otu_table(pseq_adj), "matrix")
if (taxa_are_rows(pseq_adj)) abund <- t(abund)

common_samples <- intersect(rownames(abund), rownames(meta_adj))
abund <- abund[common_samples, , drop = FALSE]
meta_adj <- meta_adj[common_samples, , drop = FALSE]

abund_t <- t(abund)  #taxa x samples

fit_meta <- lm_meta(
  feature_abd = abund_t,
  batch = "Cohort",
  exposure = "MTXRandMTXNR",
  covariates = c("age", "gender"),
  data = meta_adj)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
## Found 3 batches
## Fitting Maaslin2 on batch Cohort-1...
## Fitting Maaslin2 on batch Cohort-2...
## Fitting Maaslin2 on batch Cohort-3...
## Fitting meta-analysis model.
## Warning in rma_wrapper(maaslin_fits, method = control$rma_method, output = control$output, : Fitting rma on feature k__Bacteria|p__Firmicutes|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Megasphaera|s__Megasphaera_sp_NM10|t__SGB5858;
## Fisher scoring algorithm may have gotten stuck at a local maximum.
## Setting tau^2 = 0. Check the profile likelihood plot with profile().
meta_fits <- fit_meta$meta_fits
#View(meta_fits)
write.csv(meta_fits, "Supplementary_Table_Genus.csv")
p <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
  #Non-significant)
  geom_point(aes(color = ifelse(qval.fdr < 0.2, 
                                 ifelse(coef >= 0, "MTX-R", "MTX-NR"), "NS"),
                 alpha = ifelse(qval.fdr < 0.2, 1, 0.3)), 
             size = 1.5) +
  
  #2. Significant points with black outline
  geom_point(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(fill = ifelse(coef >= 0, "MTX-R", "MTX-NR")), #Map to fill, not color
    shape = 21,         #Required for the outline + fill combo
    color = "black",    #The outline color
    stroke = 0.5,       #Thickness of the outline
    size = 2, 
    alpha = 1
  ) +
  
  #3. Label significant Genus (g__)
  geom_text_repel(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(label = str_extract(feature, "g__[^|]+")),
    size = 3,
    box.padding = 0.3, 
    point.padding = 0.3,
    segment.color = "grey30",
    segment.size = 0.4,
    segment.alpha = 0.8,
    min.segment.length = 0,
    force = 2,
    max.overlaps = Inf
  ) +
  
  #4. Manual Scales
  scale_fill_manual(
    values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon")
  ) +
  scale_color_manual(
    values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon", "NS" = "grey")
  ) +
  
  #5. Styling and Labels
  scale_alpha_identity() + 
  theme_classic() + 
  labs(
    x = "Effect Size (Coefficient)",
    y = expression(-log[10]("p-value")),
    color = "Significance",
    fill = "Significance"
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.position = "right"
  )

print(p)

ggsave("Figure1E.pdf", p, height = 2, width = 3, dpi = 300)

##Figure 1F

rm(list=ls())
pseq <- readRDS("PhyloseqObject.rds")
pseq_species <- tax_glom(pseq, taxrank = "Species", NArm = TRUE)

ntaxa(pseq) 
## [1] 2100
ntaxa(pseq_species )  
## [1] 2028
pseq_rel <- transform_sample_counts(pseq_species, function(x) x / sum(x))
summary(sample_sums(pseq_rel))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
cohorts <- unique(sample_data(pseq_rel)$Cohort)
cat("Chrts found:", paste(cohorts, collapse = ", "), "\n")
## Chrts found: Cohort-2, Cohort-1, Cohort-3
pseq_filtered <- filter_taxa(
  pseq_rel,
  function(x) {
    all(sapply(cohorts, function(cohort) {
      cohort_samples <- which(sample_data(pseq_rel)$Cohort == cohort)
      mean(x[cohort_samples] > 5e-5) >= 0.1
    }))
  },
  prune = TRUE
)

cat( ntaxa(pseq_filtered), "\n")
## 235
#Prepare OTU matrix & metadata

otu_mat <- as(otu_table(pseq_filtered), "matrix")
if (!taxa_are_rows(pseq_filtered)) otu_mat <- t(otu_mat)

meta <- as(sample_data(pseq_filtered), "data.frame")


#Batch correction
fit_adjust_batch <- adjust_batch(
  feature_abd = otu_mat,
  batch = "Cohort",                   #Batch variable
  covariates = "MTXRandMTXNR",       #Preserve MTX signal
  data = meta,
  control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
otu_table_adj <- otu_table(CRC_abd_adj, taxa_are_rows = TRUE)

pseq_adj <- phyloseq(
  otu_table_adj,
  tax_table(pseq_filtered),
  sample_data(pseq_filtered))
saveRDS(pseq_adj, file = "SpeciesBatchCorrected_phyloseq.rds")
set.seed(111)
bc_dist_before <- phyloseq::distance(pseq_filtered, method = "bray")
res_perm_before <- adonis2(
  bc_dist_before ~ Cohort + MTXRandMTXNR,
  data = meta,
  by = "term",
  permutations = 1000)
res_perm_before
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_before ~ Cohort + MTXRandMTXNR, data = meta, permutations = 1000, by = "term")
##              Df SumOfSqs      R2      F   Pr(>F)    
## Cohort        2    4.161 0.12686 7.1190 0.000999 ***
## MTXRandMTXNR  1    0.584 0.01780 1.9983 0.009990 ** 
## Residual     96   28.053 0.85534                    
## Total        99   32.798 1.00000                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#After batch correction (only MTX effect)
set.seed(111)
bc_dist_after <- phyloseq::distance(pseq_adj, method = "bray")
meta_adj <- as(sample_data(pseq_adj), "data.frame")
res_perm_after <- adonis2(
  bc_dist_after ~ MTXRandMTXNR,
  data = meta_adj,
  permutations = 1000
)
res_perm_after
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR, data = meta_adj, permutations = 1000)
##          Df SumOfSqs      R2      F   Pr(>F)   
## Model     1   0.5882 0.01951 1.9498 0.007992 **
## Residual 98  29.5663 0.98049                   
## Total    99  30.1546 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
res_perm_after2 <- adonis2(
  bc_dist_after ~ MTXRandMTXNR + Cohort,
  data = meta_adj,
  permutations = 1000
)
res_perm_after2
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = bc_dist_after ~ MTXRandMTXNR + Cohort, data = meta_adj, permutations = 1000)
##          Df SumOfSqs      R2      F   Pr(>F)    
## Model     3   1.8146 0.06018 2.0489 0.000999 ***
## Residual 96  28.3400 0.93982                    
## Total    99  30.1546 1.00000                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abund <- as(otu_table(pseq_adj), "matrix")
if (taxa_are_rows(pseq_adj)) abund <- t(abund)

common_samples <- intersect(rownames(abund), rownames(meta_adj))
abund <- abund[common_samples, , drop = FALSE]
meta_adj <- meta_adj[common_samples, , drop = FALSE]

abund_t <- t(abund)  #taxa x samples
fit_meta <- lm_meta(
  feature_abd = abund_t,
  batch = "Cohort",
  exposure = "MTXRandMTXNR",
  covariates = c("age", "gender"),
  data = meta_adj)
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
## Found 3 batches
## Fitting Maaslin2 on batch Cohort-1...
## Fitting Maaslin2 on batch Cohort-2...
## Fitting Maaslin2 on batch Cohort-3...
## Fitting meta-analysis model.
meta_fits <- fit_meta$meta_fits
#View(meta_fits)
write.csv(meta_fits, "Supplementary Table Species.csv")
p_species <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
  #1. Background points (Non-significant)
  geom_point(aes(color = ifelse(qval.fdr < 0.2, 
                                 ifelse(coef >= 0, "MTX-R", "MTX-NR"), "NS"),
                 alpha = ifelse(qval.fdr < 0.2, 1, 0.3)), 
             size = 1.5) +
  geom_point(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(fill = ifelse(coef >= 0, "MTX-R", "MTX-NR")), #Map to fill, not color
    shape = 21,         #Required for the outline + fill combo
    color = "black",    #The outline color
    stroke = 0.5,       #Thickness of the outline
    size = 2, 
    alpha = 1
  ) +
  
  #label significant sp (s__)
  geom_text_repel(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(label = str_extract(feature, "s__[^|]+")),
    size = 3,
    box.padding = 0.3, 
    point.padding = 0.3,
    segment.color = "grey30",
    segment.size = 0.4,
    segment.alpha = 0.8,
    min.segment.length = 0,
    force = 2,
    max.overlaps = Inf
  ) +
  
  #4. Manual Scales
  scale_fill_manual(
    values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon")
  ) +
  scale_color_manual(
    values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon", "NS" = "grey")
  ) +
  
  #5. Styling and Labels
  scale_alpha_identity() + 
  theme_classic() + 
  labs(
    x = "Effect Size (Coefficient)",
    y = expression(-log[10]("p-value")),
    color = "Significance",
    fill = "Significance"
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.position = "right"
  )

print(p_species)

ggsave("Figure_1F.pdf", p_species, height = 2, width = 3, dpi = 300)

#Figure 2A-C Suppl. table diff abundant KOs

rm(list=ls())
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", 
                     stringsAsFactors = FALSE, 
                     check.names = FALSE)
#assign sample names as rownames
rownames(Metadata) <- Metadata$KT_sampleName_withS

#rplace '.' with '-' in both row and colnames to match with KO data
colnames(Metadata) <- gsub("\\.", "-", colnames(Metadata))
rownames(Metadata) <- gsub("\\.", "-", rownames(Metadata))

cat("No. of samples in metadata:", nrow(Metadata), "\n")
## No. of samples in metadata: 100
d2 <- fread("../03_HumannAnalysis/00_humannOutputfromWynton/12_CombUnstratFiles/Normalized_KO_RPKG.tsv")
d2 <- as.data.frame(d2, check.names = FALSE)
rownames(d2) <- d2[,1]
d2 <- d2[,-1]
d2 <- d2[rowSums(d2) > 0, ]

#clean column names
colnames(d2) <- sub("^X", "", colnames(d2))
colnames(d2) <- sub("_L002$", "", colnames(d2))

cat("NO. of samples in abundance matrix", ncol(d2), "\n")
## NO. of samples in abundance matrix 291
cat("NO. of feature after remocing feature with sum 0", nrow(d2), "\n")
## NO. of feature after remocing feature with sum 0 6542
common_samples <- intersect(colnames(d2), rownames(Metadata))
cat("cat common samples", length(common_samples), "\n")
## cat common samples 100
Metadata_matched <- Metadata[match(common_samples, rownames(Metadata)), , drop = FALSE]

d2_matrix_matched <- d2[, common_samples, drop = FALSE]

#alignment chekk
stopifnot(all(colnames(d2_matrix_matched) == rownames(Metadata_matched)))

#Proportional normlaization
d2_matrix_prop <- sweep(d2_matrix_matched, 2, colSums(d2_matrix_matched), FUN = "/")

cohorts <- unique(Metadata_matched$Cohort)
cat("cohorts found:", paste(cohorts, collapse = ", "), "\n")
## cohorts found: Cohort-2, Cohort-1, Cohort-3
keep_features <- rep(FALSE, nrow(d2_matrix_prop))
names(keep_features) <- rownames(d2_matrix_prop)

for (cohort in cohorts) {
  cohort_samples <- rownames(Metadata_matched)[Metadata_matched$Cohort == cohort]
  mat_cohort <- d2_matrix_prop[, cohort_samples, drop = FALSE]
  prevalence <- rowMeans(mat_cohort > 5e-5)
  keep_features <- keep_features | (prevalence >= 0.1)
}
#row features satisfying this criteria and column would be Cohort as column and crossc chekc manually and match N here and that file

#subset relative abundance matrix to keep only filtered features
d2_matrix_prop_filtered <- d2_matrix_prop[keep_features, , drop = FALSE]
cat("No. of features after cohort-aware filtering:", nrow(d2_matrix_prop_filtered), "\n")
## No. of features after cohort-aware filtering: 3900
#cross checked multiple times with MMUPHin tutorial
fit_adjust_batch <- adjust_batch(
  feature_abd = d2_matrix_prop_filtered,
  batch = "Cohort",
  covariates = "MTXRandMTXNR",
  data = Metadata_matched,
  control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj
#save it for future analysis so don't have to redo the steps above
write.csv(CRC_abd_adj, "20251028BatchCorrectedKO.csv", row.names = TRUE)

###Suppl.Figure 6.

alpha_df <- data.frame(
  Sample = colnames(CRC_abd_adj),
  Richness = colSums(CRC_abd_adj > 0))

alpha_df <- alpha_df %>%
  mutate(
    MTXRandMTXNR = Metadata_matched$MTXRandMTXNR[match(Sample, rownames(Metadata_matched))],
    Cohort = Metadata_matched$Cohort[match(Sample, rownames(Metadata_matched))])
alpha_stats <- alpha_df %>%
  group_by(Cohort) %>%
  summarise(
    wilcox_p = wilcox.test(Richness ~ MTXRandMTXNR)$p.value,
    ttest_p = t.test(Richness ~ MTXRandMTXNR)$p.value
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `wilcox_p = wilcox.test(Richness ~ MTXRandMTXNR)$p.value`.
## ℹ In group 1: `Cohort = "Cohort-1"`.
## Caused by warning in `wilcox.test.default()`:
## ! cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
print(alpha_stats)
## # A tibble: 3 × 3
##   Cohort   wilcox_p ttest_p
##   <chr>       <dbl>   <dbl>
## 1 Cohort-1    0.126  0.188 
## 2 Cohort-2    0.114  0.0880
## 3 Cohort-3    0.443  0.619
p_combined <- ggplot(alpha_df, aes(x = MTXRandMTXNR, y = Richness, fill = MTXRandMTXNR)) +
  #Bar layer
  stat_summary(fun = mean, geom = "bar", alpha = 0.4) +
  
  #Point layer with outlines
  geom_point(aes(fill = MTXRandMTXNR, shape = Cohort), #Use fill for the inner color
             color = "black",                         #Set outline to black
             size = 2, 
             alpha = 0.8,
             stroke = 0.5,                            #Adjust outline thickness
             position = position_jitter(width = 0.1, height = 0)) +
  
  #Scales
  scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
  #Swapping shapes to fillable versions: 22 (Sq), 21 (Circ), 24 (Tri)
  scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
  
  theme_classic() +
  labs(y = "Number of distinct KOs", fill = "Group", shape = "Cohort") +
  theme(axis.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 12)) +
  
  stat_compare_means(aes(group = MTXRandMTXNR), 
                     method = "t.test", label = "p.format")

p_combined

ggsave("AlphaDivSuppleFig5left.pdf", p_combined, height = 3, width = 5, dpi = 300)
p_facet <- ggplot(alpha_df, aes(x = MTXRandMTXNR, y = Richness, fill = MTXRandMTXNR)) +
  stat_summary(fun = mean, geom = "bar", alpha = 0.4) +

  geom_point(
    aes(fill = MTXRandMTXNR, shape = Cohort),
    color = "black",                 #outline
    size = 1.2, 
    alpha = 0.8,
    stroke = 0.4, 
    position = position_jitter(width = 0.15)
  ) +
  scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
  theme_classic() +
  labs(y = "Number of distinct KOs", fill = "Group", shape = "Cohort") + 
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  ) + 
  facet_wrap(~Cohort, scales = "free_y")
p_facet

ggsave("AlphaDivSuppleFig5right.pdf", p_facet, height = 3, width = 8, dpi = 300)

###Figure 2A

dist_before <- vegdist(t(d2_matrix_prop_filtered), method = "bray")

set.seed(1234)
fit_adonis_before <- adonis2(dist_before ~ MTXRandMTXNR + Cohort + age+gender + DAS28 ,
                             data = Metadata_matched, by = "terms")
#PERMANOVA after batch correction
dist_after <- vegdist(t(CRC_abd_adj), method = "bray") 

set.seed(1234)
fit_adonis_after <- adonis2(dist_after ~ MTXRandMTXNR + Cohort + age+gender +DAS28,
                            data = Metadata_matched, by = "terms")

pcoa_after <- cmdscale(dist_after, k = 6, eig = TRUE)
eig_after <- pcoa_after$eig
var_explained_after <- round(100 * eig_after / sum(eig_after), 2)

pcoa_df_after <- as.data.frame(pcoa_after$points)
colnames(pcoa_df_after) <- paste0("PC", 1:ncol(pcoa_df_after))
pcoa_df_after$Sample <- rownames(pcoa_df_after)
pcoa_df_after$Cohort <- Metadata_matched$Cohort[match(pcoa_df_after$Sample, rownames(Metadata_matched))]
pcoa_df_after$MTXRandMTXNR <- Metadata_matched$MTXRandMTXNR[match(pcoa_df_after$Sample, rownames(Metadata_matched))]
p2_vs_p4 <- ggplot(pcoa_df_after, aes(x = PC2, y = PC4, 
  color = MTXRandMTXNR, shape = Cohort)) + geom_point(size = 3) +
  theme_classic() +
  labs(x = paste0("PC1 (", var_explained_after[1], "%)"),
       y = paste0("PC4 (", var_explained_after[4], "%)")) +
  scale_color_manual(values = c("darksalmon", "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 15, "Cohort-2" = 16, "Cohort-3" = 17)) +
  theme(legend.position = "none") #will add legend manually in A-ILLUstrator
p2_vs_p4

ggsave("PCoA_KO_After.pdf", p2_vs_p4, height = 4, width = 5, dpi = 300)
#Permanova
print(fit_adonis_before)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_before ~ MTXRandMTXNR + Cohort + age + gender + DAS28, data = Metadata_matched, by = "terms")
##              Df SumOfSqs      R2      F Pr(>F)    
## MTXRandMTXNR  1   0.1347 0.01504 1.7597  0.081 .  
## Cohort        2   1.3067 0.14597 8.5383  0.001 ***
## age           1   0.0969 0.01083 1.2665  0.241    
## gender        1   0.1824 0.02037 2.3830  0.018 *  
## DAS28         1   0.1147 0.01282 1.4993  0.130    
## Residual     93   7.1165 0.79497                  
## Total        99   8.9519 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(fit_adonis_after)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_after ~ MTXRandMTXNR + Cohort + age + gender + DAS28, data = Metadata_matched, by = "terms")
##              Df SumOfSqs      R2      F Pr(>F)  
## MTXRandMTXNR  1   0.1542 0.01993 2.0329  0.031 *
## Cohort        2   0.1587 0.02051 1.0458  0.405  
## age           1   0.0956 0.01236 1.2604  0.226  
## gender        1   0.1452 0.01876 1.9136  0.039 *
## DAS28         1   0.1286 0.01661 1.6946  0.090 .
## Residual     93   7.0557 0.91183                
## Total        99   7.7380 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#PC1
wilcox_pc1 <- wilcox.test(pcoa_df_after$PC1 ~ pcoa_df_after$MTXRandMTXNR)
#PC4
wilcox_pc4 <- wilcox.test(pcoa_df_after$PC4 ~ pcoa_df_after$MTXRandMTXNR)
print(wilcox_pc1)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  pcoa_df_after$PC1 by pcoa_df_after$MTXRandMTXNR
## W = 1530, p-value = 0.04983
## alternative hypothesis: true location shift is not equal to 0
print(wilcox_pc4)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  pcoa_df_after$PC4 by pcoa_df_after$MTXRandMTXNR
## W = 943, p-value = 0.03701
## alternative hypothesis: true location shift is not equal to 0

###Figure 2B

pcoa_long_14 <- pcoa_df_after %>%
  select(MTXRandMTXNR, Cohort, PC1, PC4) %>%
  pivot_longer(
    cols = c(PC1, PC4), 
    names_to = "PC_Axis", 
    values_to = "Coordinate_Value"
  ) %>%
  #create a custom label column that includes the variance explained
  mutate(Facet_Label = case_when(
    PC_Axis == "PC1" ~ paste0("PC1 (", var_explained_after[1], "%)"),
    PC_Axis == "PC4" ~ paste0("PC4 (", var_explained_after[4], "%)") ))

p_faceted_14 <- ggplot(pcoa_long_14, aes(x = MTXRandMTXNR, y = Coordinate_Value, fill = MTXRandMTXNR)) +
 geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(aes(shape = Cohort), width = 0.2, size = 2, alpha = 0.6) +

  #scales = "free_y" is crucial since PC1 and PC4 often have different ranges
  facet_wrap(~PC_Axis, scales = "free_y") + 
  scale_fill_manual(values = c("darksalmon", "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 15, "Cohort-2" = 16, "Cohort-3" = 17)) + labs(title = "Distribution of PCoA",
    y = "PCoA scores") +theme_classic() 
print(p_faceted_14)

ggsave("Figure2B_PC2_PC4_Faceted.pdf", p_faceted_14, height = 6, width = 7, dpi = 300)

###Supplementaty_Table diff abundant KOs

fit_lm_meta <- lm_meta(
  feature_abd = CRC_abd_adj,
  batch = "Cohort",
  exposure = "MTXRandMTXNR",
  covariates = c("age" , "gender"),
  data = Metadata_matched,
  control = list(verbose = FALSE))
## Warning in check_batch(df_batch[[batch]], min_n_batch = 2): Batch variable is
## not a factor as provided and will be converted to one.
meta_fits <- fit_lm_meta$meta_fits
#View(meta_fits)
write.csv(meta_fits, "Supplementaty_Table_metafits.csv", row.names = FALSE)

###Figure 2C

meta_fits <- meta_fits %>%
  mutate(sig_color = case_when(
    pval < 0.05 & coef > 0 ~ "MTX-R",
    pval < 0.05 & coef < 0 ~ "MTX-NR",
    TRUE ~ "NS" ))
p <- ggplot(meta_fits, aes(x = coef, y = -log10(pval))) +
  #1. All points (Background layer)
  geom_point(aes(color = sig_color, alpha = sig_color), size = 1.5) +
  geom_point(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(fill = sig_color), #Changed color to fill
    shape = 21,            #Shape with border and fill
    color = "black",       #Outline color
    stroke = 0.5,          #Outline thickness
    size = 2, 
    alpha = 1
  ) +  geom_text_repel(
    data = subset(meta_fits, qval.fdr < 0.2),
    aes(label = feature),
    size = 2.5,
    box.padding = 0.3,
    point.padding = 0.3,
    segment.color = "grey30",
    segment.size = 0.4,
    segment.alpha = 0.8,
    min.segment.length = 0,
    force = 2,
    max.overlaps = Inf) + scale_color_manual(
    values = c(
      "MTX-R" = "#238A8DFF",
      "MTX-NR" = "darksalmon",
      "NS" = "grey") ) +
  scale_fill_manual(
    values = c(
      "MTX-R" = "#238A8DFF",
      "MTX-NR" = "darksalmon",
      "NS" = "grey")) +
  
  #control alpha
  scale_alpha_manual(
    values = c(
      "MTX-R" = 0.4,
      "MTX-NR" = 0.4,
      "NS" = 0.3
    ),
    guide = "none"
  ) +
  
  theme_classic() +
  labs(
    x = "Effect Size (Coefficient)",
    y = expression(-log[10]("p-value")),
    color = "Significance",
    fill = "Significance" #Keeps the legend unified
  ) + theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.position = "right")

print(p)

ggsave("Figure2C.pdf", plot = p, width = 5.5, height = 3.5, units = "in", )

##Figure 2D GSEA analysis

rm(list = ls())
meta_fits <- fread("Supplementaty_Table_metafits.csv")
meta_fits$rank_metric <- sign(meta_fits$coef) * -log10(meta_fits$pval)
set.seed(1234)
#Prepare ranked named vector
ranked_KOs <- meta_fits$rank_metric
names(ranked_KOs) <- meta_fits$feature
ranked_KOs <- ranked_KOs[is.finite(ranked_KOs)]  #remove any NaN/Inf
ranked_KOs <- sort(ranked_KOs, decreasing = TRUE)

cat("number of KOs for GSEA:", length(ranked_KOs), "\n")
## number of KOs for GSEA: 3900
#Fetch all KO → pathway associations from KEGG using the KEGG API.
#keggLink returns a named vector where:
# -names are KOs
#-Values are KEGG pathway IDs (e.g., "path:map00010")
#This provides the raw relationships between KOs and pathways.
ko2pathway_raw <- keggLink("pathway", "ko")  

#Convert the named vector into a clean data.frame for easier manipulation.
#'ko' column: removes the "ko:" prefix using sub()
#'pathway' column: removes the "path:" prefix for readability using sub()
#stringsAsFactors = FALSE ensures the columns remain character vectors, not factors
ko2pathway <- data.frame(
  ko      = sub("ko:", "", names(ko2pathway_raw)),       #remove "ko:" prefix from KO IDs
  pathway = sub("path:", "", ko2pathway_raw),           #remove "path:" prefix from pathway IDs
  stringsAsFactors = FALSE)

#ensures GSEA only evaluates KOs that have ranking metrics
ko2pathway <- ko2pathway %>% filter(ko %in% names(ranked_KOs))

#Remove very broad/general pathways like map01100 ("Metabolic pathways")
#Such pathways are overly generic and may dominate enrichment results, reducing specificity
ko2pathway <- ko2pathway %>% filter(!grepl("map01100", pathway))

set.seed(1234)
#Fetch all KEGG pathway names.
#Returns a named vector: names = pathway IDs (e.g., "path:map00010"), values = descriptive names
all_pathways <- keggList("pathway")

#clean the pathway IDs to remove the "path:" prefix
#setNames() creates a named vector where:
# - Names = cleaned pathway IDs (e.g., "map00010")
# - Values = descriptive names (e.g., "Glycolysis / Gluconeogenesis")
all_pathways <- setNames(all_pathways, sub("path:", "", names(all_pathways)))

#adda new column 'PathwayName' to ko2pathway to store the descriptive name for each pathway
#easier interpretation in plots and tables
ko2pathway$PathwayName <- all_pathways[ko2pathway$pathway]

pathway2ko <- ko2pathway %>%
  group_by(pathway, PathwayName) %>%          #Group all rows by unique pathway
  summarise(
    #concatenate all KOs in the group into a single string, separated by ";"
    #'paste()' combines multiple character strings into one
    #'unique(ko)' ensures each KO appears only once in the string
    #'collapse = ";"' specifies that ";" will be inserted between each KO
    KOs = paste(unique(ko), collapse = ";"),  
    .groups = "drop" )

#save the KO-to-pathway mapping as a CSV file for reference
fwrite(pathway2ko, "KO_to_Pathway_mapping.csv")

#clusterProfiler expects a data.frame with two columns:
#- pathway: the pathway ID
#- gene: the gene/KO ID
#Here we rename 'ko' to 'gene' and select only the necessary columns
term2gene <- ko2pathway %>% rename(gene = ko, pathway = pathway) %>% select(pathway, gene)


#Run GSEA with clusterProfiler
#Inputs:
#- ranked_KOs: a named numeric vector of KOs ranked by metric
#- TERM2GENE: the pathway definitions
#- pvalueCutoff = 1: keep all results for downstream filtering
#- verbose = FALSE: suppress console output
set.seed(1234)
gsea_res <- GSEA(
  ranked_KOs,
  seed = TRUE,
  TERM2GENE = term2gene,
  pvalueCutoff = 1,  
  verbose = FALSE)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.15% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
#convert the GSEA result object to a standard frame for easier manipulation
#arange by adjusted p-value (ascending) to see most significant pathways first
set.seed(1234)
gsea_df <- as.data.frame(gsea_res) %>% arrange(p.adjust)

#Add human-readable pathway names to GSEA results for easier interpretation
gsea_df$PathwayName <- all_pathways[gsea_df$ID]


fwrite(gsea_df, "GSEA_results_clusterProfiler2.csv")

#ssign direction
gsea_df$Direction <- ifelse(gsea_df$NES > 0, "MTX-R", "MTX-NR")
sig_pathways <- gsea_df %>% filter(p.adjust < 0.05)
#horizontal plot
p_horizontal <- ggplot(sig_pathways, aes(x = reorder(PathwayName, NES),
                                         y = NES,
                                         color = Direction,
                                         size = -log10(p.adjust))) +
  geom_point() +
  scale_color_manual(values = c("MTX-R" = "#238A8DFF", "MTX-NR" = "darksalmon")) +
  scale_x_discrete(position = "top") +  #pathway names on top
  labs(x = "KEGG Pathway",
       y = "Normalized Enrichment Score (NES)",
       size = "-log10(q-value)",
       color = "nnriched in") +
  theme_classic2() + theme(axis.text.x = element_text(angle = 45, hjust = 0))

ggsave("Figure2D.pdf", plot = p_horizontal, width = 8, height =4 , dpi = 300)

#Figure 3 ##ArgSynthesis

rm(list=ls())
SELECTko <- read_excel("../03_HumannAnalysis/05_KO_of_Interest/Figure3.xlsx", sheet = "ArgSynthesis")
selected_KOs <- SELECTko$KO  # assuming the sheet has a column named 'KO'
CRC_abd_adj <- fread("../03_HumannAnalysis/02_KO/20251028BatchCorrectedKO.csv") %>% 
  as.data.frame(check.names = FALSE)
colnames(CRC_abd_adj)[1] <- "GeneFamily"
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv",
                     stringsAsFactors = FALSE, check.names = FALSE)
selected_KOs 
## [1] "K00262" "K01940"
# Clean sample names
rownames(Metadata) <- Metadata$KT_sampleName_withS
colnames(Metadata) <- gsub("\\.", "-", colnames(Metadata))
rownames(Metadata) <- gsub("\\.", "-", rownames(Metadata))
Metadata$SampleID <- rownames(Metadata)

dt_long <- CRC_abd_adj %>%
  pivot_longer(-GeneFamily, names_to = "Sample", values_to = "Abundance") %>%
  rename(KO = GeneFamily)
dt_with_metadata <- dt_long %>%
  left_join(Metadata, by = c("Sample" = "SampleID"))

dt_filtered <- dt_with_metadata %>%
  filter(KO %in% selected_KOs)
dt_filtered$MTXRandMTXNR <- factor(dt_filtered$MTXRandMTXNR, levels = c("MTXNR", "MTXR"))
dt_filtered$Cohort <- factor(dt_filtered$Cohort, levels = c("Cohort-1", "Cohort-2", "Cohort-3"))

unique_KOs <- unique(dt_filtered$KO)

output_dir <- "./KO_Plots"
dir.create(output_dir, showWarnings = FALSE)

for (ko in unique_KOs) {
  ko_data <- dt_filtered %>% filter(KO == ko)
  if (nrow(ko_data) == 0) next  
  
  # 1. Faceted Plot
  p_facet <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
    geom_bar(stat = "summary", fun = "mean", alpha = 0.7)  +
    geom_point(aes(shape = Cohort), 
               color = "black",              # Border of the point
               stroke = 0.5,                 # Thickness of the border
               position = position_jitter(width = 0.1, height = 0), 
               size = 1.2) +
    scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
    scale_fill_manual(values = c("MTXR" = "#238A8DFF", "MTXNR" = "darksalmon")) +
    facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
    theme_classic() +
    theme(axis.title.x = element_blank(), axis.text.x = element_blank(),
          legend.position = "none", strip.text = element_text(size = 10)) +
    ggtitle(paste(ko, "by Cohort"))

  # 2. Combined Plot
  p_combined <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
    geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
    geom_point(aes(shape = Cohort), 
               color = "black",              # Border of the point
               stroke = 0.5, 
               position = position_jitter(width = 0.1, height = 0), 
               size = 2) +
    scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
    scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
    theme_classic() +
    theme(axis.title.x = element_blank(), legend.position = "right") + 
    ggtitle(paste(ko, "Combined Cohorts"))

  # Saving...
  facet_path <- file.path(output_dir, paste0(ko, "_Faceted.pdf"))
  pdf(facet_path, width = 8, height = 3)
  print(p_facet)
  dev.off()
  
  combined_path <- file.path(output_dir, paste0(ko, "_Combined.pdf"))
  pdf(combined_path, width = 5, height = 3)
  print(p_combined)
  dev.off()
}

##TrpSynthesis

SELECTko <- read_excel("../03_HumannAnalysis/05_KO_of_Interest/Figure3.xlsx", sheet = "TrpSynthesis")
selected_KOs <- SELECTko$KO  # assuming the sheet has a column named 'KO'
selected_KOs 
## [1] "K04518"
#clean sample names
rownames(Metadata) <- Metadata$KT_sampleName_withS
colnames(Metadata) <- gsub("\\.", "-", colnames(Metadata))
rownames(Metadata) <- gsub("\\.", "-", rownames(Metadata))
Metadata$SampleID <- rownames(Metadata)

dt_long <- CRC_abd_adj %>%
  pivot_longer(-GeneFamily, names_to = "Sample", values_to = "Abundance") %>%
  rename(KO = GeneFamily)
dt_with_metadata <- dt_long %>%
  left_join(Metadata, by = c("Sample" = "SampleID"))

dt_filtered <- dt_with_metadata %>%
  filter(KO %in% selected_KOs)
dt_filtered$MTXRandMTXNR <- factor(dt_filtered$MTXRandMTXNR, levels = c("MTXNR", "MTXR"))
dt_filtered$Cohort <- factor(dt_filtered$Cohort, levels = c("Cohort-1", "Cohort-2", "Cohort-3"))

unique_KOs <- unique(dt_filtered$KO)

output_dir <- "./KO_Plots"

for (ko in unique_KOs) {
  ko_data <- dt_filtered %>% filter(KO == ko)
  if (nrow(ko_data) == 0) next  
  
  # 1. Faceted Plot
  p_facet <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
    geom_bar(stat = "summary", fun = "mean", alpha = 0.7)  +
    geom_point(aes(shape = Cohort), 
               color = "black",              # Border of the point
               stroke = 0.5,                 # Thickness of the border
               position = position_jitter(width = 0.1, height = 0), 
               size = 1.2) +
    scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
    scale_fill_manual(values = c("MTXR" = "#238A8DFF", "MTXNR" = "darksalmon")) +
    facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
    theme_classic() +
    theme(axis.title.x = element_blank(), axis.text.x = element_blank(),
          legend.position = "none", strip.text = element_text(size = 10)) +
    ggtitle(paste(ko, "by Cohort"))

  # 2. Combined Plot
  p_combined <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
    geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
    geom_point(aes(shape = Cohort), 
               color = "black",              # Border of the point
               stroke = 0.5, 
               position = position_jitter(width = 0.1, height = 0), 
               size = 2) +
    scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
    scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
    theme_classic() +
    theme(axis.title.x = element_blank(), legend.position = "right") + 
    ggtitle(paste(ko, "Combined Cohorts"))

  # Saving...
  facet_path <- file.path(output_dir, paste0(ko, "_Faceted.pdf"))
  pdf(facet_path, width = 8, height = 3)
  print(p_facet)
  dev.off()
  
  combined_path <- file.path(output_dir, paste0(ko, "_Combined.pdf"))
  pdf(combined_path, width = 5, height = 3)
  print(p_combined)
  dev.off()
}

##Nucleotide Synthesis

SELECTko <- read_excel("../03_HumannAnalysis/05_KO_of_Interest/Figure3.xlsx", sheet = "APRT")
selected_KOs <- SELECTko$KO
selected_KOs 
## [1] "K00759" "K06287"
rownames(Metadata) <- Metadata$KT_sampleName_withS
colnames(Metadata) <- gsub("\\.", "-", colnames(Metadata))
rownames(Metadata) <- gsub("\\.", "-", rownames(Metadata))
Metadata$SampleID <- rownames(Metadata)

dt_long <- CRC_abd_adj %>%
  pivot_longer(-GeneFamily, names_to = "Sample", values_to = "Abundance") %>%
  rename(KO = GeneFamily)
dt_with_metadata <- dt_long %>%
  left_join(Metadata, by = c("Sample" = "SampleID"))

dt_filtered <- dt_with_metadata %>%
  filter(KO %in% selected_KOs)
dt_filtered$MTXRandMTXNR <- factor(dt_filtered$MTXRandMTXNR, levels = c("MTXNR", "MTXR"))
dt_filtered$Cohort <- factor(dt_filtered$Cohort, levels = c("Cohort-1", "Cohort-2", "Cohort-3"))

unique_KOs <- unique(dt_filtered$KO)

output_dir <- "./KO_Plots"
dir.create(output_dir, showWarnings = FALSE)

for (ko in unique_KOs) {
  ko_data <- dt_filtered %>% filter(KO == ko)
  if (nrow(ko_data) == 0) next  
  
  # 1. Faceted Plot
  p_facet <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
    geom_bar(stat = "summary", fun = "mean", alpha = 0.7)  +
    geom_point(aes(shape = Cohort), 
               color = "black",              # Border of the point
               stroke = 0.5,                 # Thickness of the border
               position = position_jitter(width = 0.1, height = 0), 
               size = 1.2) +
    scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
    scale_fill_manual(values = c("MTXR" = "#238A8DFF", "MTXNR" = "darksalmon")) +
    facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
    theme_classic() +
    theme(axis.title.x = element_blank(), axis.text.x = element_blank(),
          legend.position = "none", strip.text = element_text(size = 10)) +
    ggtitle(paste(ko, "by Cohort"))

  # 2. Combined Plot
  p_combined <- ggplot(ko_data, aes(x = MTXRandMTXNR, y = Abundance, fill = MTXRandMTXNR)) +
    geom_bar(stat = "summary", fun = "mean", alpha = 0.7) +
    geom_point(aes(shape = Cohort), 
               color = "black",              # Border of the point
               stroke = 0.5, 
               position = position_jitter(width = 0.1, height = 0), 
               size = 2) +
    scale_shape_manual(values = c(22, 21, 24)) + # Use fillable shapes
    scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
    theme_classic() +
    theme(axis.title.x = element_blank(), legend.position = "right") + 
    ggtitle(paste(ko, "Combined Cohorts"))

  # Saving...
  facet_path <- file.path(output_dir, paste0(ko, "_Faceted.pdf"))
  pdf(facet_path, width = 8, height = 3)
  print(p_facet)
  dev.off()
  
  combined_path <- file.path(output_dir, paste0(ko, "_Combined.pdf"))
  pdf(combined_path, width = 5, height = 3)
  print(p_combined)
  dev.off()
}

#Figure 4A ##Figure 4A

rm(list=ls())
meta <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(meta) <- meta$KT_sampleName_withS
meta$DAS28 <- as.numeric(meta$DAS28)
meta$MTXRandMTXNR <- factor(meta$MTXRandMTXNR,
                            levels = c("MTXNR", "MTXR"))
wilcox_test <- wilcox.test(DAS28 ~ MTXRandMTXNR, data = meta)
wilcox_test
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  DAS28 by MTXRandMTXNR
## W = 916, p-value = 0.02307
## alternative hypothesis: true location shift is not equal to 0
t.test(DAS28 ~ MTXRandMTXNR, data = meta)
## 
##  Welch Two Sample t-test
## 
## data:  DAS28 by MTXRandMTXNR
## t = -2.56, df = 97.241, p-value = 0.01201
## alternative hypothesis: true difference in means between group MTXNR and group MTXR is not equal to 0
## 95 percent confidence interval:
##  -1.292138 -0.163587
## sample estimates:
## mean in group MTXNR  mean in group MTXR 
##            5.012750            5.740613
summary_df <- meta %>%
  group_by(MTXRandMTXNR) %>%
  summarise(
    mean_DAS28 = mean(DAS28, na.rm = TRUE),
    .groups = "drop")

P <- ggplot(summary_df, aes(x = MTXRandMTXNR, y = mean_DAS28, fill = MTXRandMTXNR)) +
  # Bar layer
  geom_col(width = 0.6, alpha = 0.7) +
  # Point layer - explicitly adding 'fill' inside aes()
  geom_point(data = meta, 
             aes(x = MTXRandMTXNR, y = DAS28, shape = Cohort, fill = MTXRandMTXNR), 
             color = "black", 
             stroke = 0.5,
             position = position_jitter(width = 0.1, height = 0), 
             size = 2) +
  # Consistent color and shape scales
  scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
  labs(x = "Response Status",
       y = "DAS28 Score",
       shape = "Cohort") +
  theme_classic(base_size = 14) +
  theme(legend.position = "right")

P

ggsave("Figure4A_DAS28.pdf", P, width = 5, height = 3)

##Figure 4B and Suppl.figure 9A

rm(list = ls())
pseq <- readRDS("../02_20251229_MetaphlanAnalysis/01_Diff_Level_Analysis/genusBatchCorrected_phyloseq.rds")
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(Metadata) <- Metadata$KT_sampleName_withS
Metadata$DAS28 <- as.numeric(Metadata$DAS28)

abund <- as(otu_table(pseq), "matrix")
if (taxa_are_rows(pseq)) { abund <- t(abund) }
abund <- abund + 1e-6
abund_clr <- t(apply(abund, 1, function(x) log(x / exp(mean(log(x))))))
abund_clr <- as.data.frame(abund_clr)


common_samples <- intersect(rownames(abund_clr), rownames(Metadata))
abund_clr <- abund_clr[common_samples, , drop = FALSE]
meta <- Metadata[common_samples, , drop = FALSE]

genus_names <- colnames(abund_clr)
results <- data.frame(
  Genus = genus_names,
  Spearman_rho = NA_real_,
  p_value = NA_real_
)

for (i in seq_along(genus_names)) {
  g <- genus_names[i]
  x <- abund_clr[, g]
  y <- meta$DAS28
  if (sd(x, na.rm = TRUE) == 0 || all(is.na(y))) next
  cor_test <- suppressWarnings(cor.test(x, y, method = "spearman"))
  results$Spearman_rho[i] <- cor_test$estimate
  results$p_value[i] <- cor_test$p.value
}

results$padj <- p.adjust(results$p_value, method = "BH")

results_filtered <- results %>%
  mutate(
    Genus_name = str_extract(Genus, "g__[^|]+"),
    Genus_name = sub("^g__", "", Genus_name)
  ) %>%
  filter(!is.na(Spearman_rho), padj <= 0.2, !str_starts(Genus_name, "GGB")) %>%
  arrange(desc(abs(Spearman_rho)))

abund_long <- abund_clr[, results_filtered$Genus, drop = FALSE] %>%
  as.data.frame() %>%
  mutate(Sample = rownames(.)) %>%
  pivot_longer(
    cols = -Sample,
    names_to = "Genus",
    values_to = "Abundance"
  ) %>%
  left_join(meta %>% mutate(Sample = rownames(meta)) %>% select(Sample, DAS28, Cohort), 
    by = "Sample"
  ) %>%
  mutate(
    Genus_name = str_extract(Genus, "g__[^|]+"),
    Genus_name = sub("^g__", "", Genus_name)
  ) %>%
  filter(!str_starts(Genus_name, "GGB"))

style <- theme_classic(base_size = 14) +
  theme(
    legend.position = "bottom",
    aspect.ratio = 0.75, 
    panel.spacing = unit(1, "lines"),
    strip.background = element_blank(),
    strip.text = element_text(face = "bold")
  )
my_geom_point  <- geom_point(aes(color = Cohort), size = 2, alpha = 0.7)
my_geom_smooth <- geom_smooth(method = "lm", color = "black", linetype = "dashed", se = TRUE)
P_genus <- ggplot(abund_long, aes(x = Abundance, y = DAS28)) +
  geom_point(aes(color = Cohort), size = 2, alpha = 0.7) +
  geom_smooth(method = "lm", color = "black", linetype = "dashed", se = TRUE) +
  facet_wrap(~Genus_name, scales = "free", nrow = 1) +
  labs(x = "Genus CLR Abundance", y = "DAS28") +
  style

ggsave("Figure4B_GeneravsDAS28Corr.pdf", plot = P_genus, width = 11, height = 4)
## `geom_smooth()` using formula = 'y ~ x'

##Figure 4C and Suppl.figure 9B

rm(list = ls())
pseq <- readRDS("../02_20251229_MetaphlanAnalysis/01_Diff_Level_Analysis/SpeciesBatchCorrected_phyloseq.rds")
Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(Metadata) <- Metadata$KT_sampleName_withS
Metadata$DAS28 <- as.numeric(Metadata$DAS28)

abund <- as(otu_table(pseq), "matrix")
if (taxa_are_rows(pseq)) { abund <- t(abund) }
abund <- abund + 1e-6
abund_clr <- t(apply(abund, 1, function(x) log(x / exp(mean(log(x))))))
abund_clr <- as.data.frame(abund_clr)

common_samples <- intersect(rownames(abund_clr), rownames(Metadata))
abund_clr <- abund_clr[common_samples, , drop = FALSE]
meta <- Metadata[common_samples, , drop = FALSE]


Species_names <- colnames(abund_clr)
results <- data.frame(
  Species = Species_names,
  Spearman_rho = NA_real_,
  p_value = NA_real_
)

for (i in seq_along(Species_names)) {
  g <- Species_names[i]
  x <- abund_clr[, g]
  y <- meta$DAS28
  if (sd(x, na.rm = TRUE) == 0 || all(is.na(y))) next
 cor_test <- suppressWarnings(cor.test(x, y, method = "spearman"))
  results$Spearman_rho[i] <- cor_test$estimate
  results$p_value[i] <- cor_test$p.value
}

results$padj <- p.adjust(results$p_value, method = "BH")

results_filtered <- results %>%
  mutate(
    Species_name = str_extract(Species, "s__[^|]+"),
    Species_name = sub("^s__", "", Species_name)
  ) %>%
  filter(!is.na(Spearman_rho), padj <= 0.2, 
         !str_starts(Species_name, "GGB"), 
         !str_starts(Species_name, "SGB")) %>%
  arrange(desc(abs(Spearman_rho)))

write.csv(results_filtered, "Suppl_Table_Species_DAS28_Corr.csv", row.names = FALSE)

abund_long <- abund_clr[, results_filtered$Species, drop = FALSE] %>%
  as.data.frame() %>%
  mutate(Sample = rownames(.)) %>%
  pivot_longer(
    cols = -Sample,
    names_to = "Species",
    values_to = "Abundance"
  ) %>%
  left_join(
    # Explicitly include Cohort here
    meta %>% mutate(Sample = rownames(meta)) %>% select(Sample, DAS28, Cohort),
    by = "Sample"
  ) %>%
  mutate(
    Species_name = str_extract(Species, "s__[^|]+"),
    Species_name = sub("^s__", "", Species_name)
  ) %>%
  filter(!str_starts(Species_name, "SGB"), !str_starts(Species_name, "GGB"))

style <- theme_classic(base_size = 14) +
  theme(
    legend.position = "bottom",
    aspect.ratio = 0.75, 
    panel.spacing = unit(1, "lines"),
    strip.background = element_blank(),
    strip.text = element_text(face = "bold")
  )
my_geom_point  <- geom_point(aes(color = Cohort), size = 2, alpha = 0.7)
my_geom_smooth <- geom_smooth(method = "lm", color = "black", linetype = "dashed", se = TRUE)

P_species <- ggplot(abund_long, aes(x = Abundance, y = DAS28)) + my_geom_point + my_geom_smooth +
  facet_wrap(~Species_name, scales = "free", ncol = 4) + 
  labs(
    x = "Species CLR Abundance",
    y = "DAS28",
    title = "Significant Species Correlation") + style +
  theme(strip.text = element_text(face = "italic")) 
ggsave("Figure4C_Species.pdf", plot = P_species, width = 11, height = 10)
## `geom_smooth()` using formula = 'y ~ x'

##Figure 4D and Suppl.figure 9C

rm(list=ls())
d2 <- read.csv("../03_HumannAnalysis/02_KO/20251028BatchCorrectedKO.csv",
               check.names = FALSE)

#Name first column as Genefamily
colnames(d2)[1] <- "Genefamily"
rownames(d2) <- d2$Genefamily
d2$Genefamily <- NULL

#Clean sample names
colnames(d2) <- sub("^X", "", colnames(d2))
colnames(d2) <- sub("_L002$", "", colnames(d2))

#Transpose: rows = samples, columns = KOs
abund <- t(d2)
abund <- as.data.frame(abund)

Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv",
                     stringsAsFactors = FALSE)

Metadata$KT_sampleName_withS <- gsub("\\.", "-", Metadata$KT_sampleName_withS)
Metadata$DAS28 <- as.numeric(Metadata$DAS28)
rownames(Metadata) <- Metadata$KT_sampleName_withS

common_samples <- intersect(rownames(abund), rownames(Metadata))


abund <- abund[common_samples, , drop = FALSE]
meta  <- Metadata[common_samples, , drop = FALSE]

#Add small pseudocount to avoid log(0)
abund_clr <- abund + 1e-6

#CLR per sample
abund_clr <- t(apply(abund_clr, 1, function(x) log(x / exp(mean(log(x))))))
abund_clr <- as.data.frame(abund_clr)


results <- data.frame(
  GeneFamily   = colnames(abund_clr),
  Spearman_rho = NA_real_,
  p_value      = NA_real_
)

for (i in seq_along(colnames(abund_clr))) {
  x <- abund_clr[, i]
  y <- meta$DAS28
  
 #Skip KOs with zero variance
  if (sd(x, na.rm = TRUE) == 0) next
  
  ct <- suppressWarnings(cor.test(x, y, method = "spearman"))
  results$Spearman_rho[i] <- ct$estimate
  results$p_value[i]      <- ct$p.value
}
results$padj <- p.adjust(results$p_value, method = "BH")

results_filtered <- results %>%
  filter(!is.na(Spearman_rho), padj <= 0.05) %>%
  arrange(desc(abs(Spearman_rho)))

write.csv(results_filtered, "KEGG_DAS28_Correlations_CLR.csv", row.names = FALSE)

abund_long <- abund_clr[, results_filtered$GeneFamily, drop = FALSE] %>%
  as.data.frame() %>%
  mutate(Sample = rownames(.)) %>%
  pivot_longer(
    cols = -Sample,
    names_to = "GeneFamily",
    values_to = "Abundance"
  ) %>%
  left_join(
   #Ensure Cohort is selected here from the meta object
    meta %>% mutate(Sample = rownames(meta)) %>% select(Sample, DAS28, Cohort), 
    by = "Sample"
  )

P <- ggplot(abund_long, aes(x = Abundance, y = DAS28)) +
 #Map color to Cohort inside geom_point
  geom_point(aes(color = Cohort), size = 2, alpha = 0.7) +
 #Standard black regression line for the overall trend
  geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
  facet_wrap(~ GeneFamily, scales = "free") +
  labs( x = "KEGG KO CLR Abundance",
    y = "DAS28",
    title = "Correlation of CLR-Transformed KEGG KO Abundance with DAS28",
    color = "Cohort") +
  theme_classic(base_size = 14) +
  theme(legend.position = "bottom")
print(P)
## `geom_smooth()` using formula = 'y ~ x'

ggsave("KEGG_Abund_vs_DAS28.pdf", plot = P, width = 10, height = 12)
## `geom_smooth()` using formula = 'y ~ x'

#Figure 5A

rm(list=ls())
set.seed(33) 
d2_kegg <- fread("../03_HumannAnalysis/02_KO/20251028BatchCorrectedKO.csv") |> as.data.frame()
rownames(d2_kegg) <- d2_kegg[, 1]; d2_kegg <- d2_kegg[, -1]
colnames(d2_kegg) <- gsub("^X|_L002$|\\.", "-", colnames(d2_kegg))

Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(Metadata) <- gsub("\\.", "-", Metadata$KT_sampleName_withS)
Metadata$MTXRandMTXNR <- factor(Metadata$MTXRandMTXNR, levels = c("MTXR", "MTXNR"))

common_s <- intersect(colnames(d2_kegg), rownames(Metadata))
Metadata <- Metadata[common_s, ]
kegg_df  <- as.data.frame(t(d2_kegg[, common_s]))
rel_abund_df <- kegg_df / rowSums(kegg_df)

cohorts <- unique(Metadata$Cohort)
plot_results <- data.frame()
clin_features <- c("age", "gender", "DAS28")

#iitialize a list to store selected features per cohort
feature_log <- list()

#LOCO Loop
for (test_cohort in cohorts) {
  cat("\n--- Testing on Cohort:", test_cohort, "---\n")
  
  train_s <- rownames(Metadata)[Metadata$Cohort != test_cohort]
  val_s   <- rownames(Metadata)[Metadata$Cohort == test_cohort]
  
  k_train_rel <- rel_abund_df[train_s, ]
  k_val_rel   <- rel_abund_df[val_s, ]
  target_train <- Metadata[train_s, "MTXRandMTXNR"]
  target_val   <- Metadata[val_s, "MTXRandMTXNR"]

  #Indepedent KO selection
  cat("   Running Boruta on KEGG...\n")
  boruta_k <- Boruta(x = k_train_rel, y = target_train, maxRuns = 200)
  k_selected <- getSelectedAttributes(TentativeRoughFix(boruta_k))

  #have at least 5 KOs
  if (length(k_selected) < 5) {
    cat("   Applying fallback to top 5 features...\n")
    temp_rf <- randomForest::randomForest(x = k_train_rel, y = target_train, ntree = 500)
    imp <- randomForest::importance(temp_rf)
    top_kos <- row.names(imp)[order(imp[,1], decreasing = TRUE)]
    k_selected <- unique(c(k_selected, setdiff(top_kos, k_selected)[1:(5 - length(k_selected))]))
  }

  #save the selected features for this cohort ---
  feature_log[[test_cohort]] <- k_selected
  cat("   Features selected for this split:", paste(k_selected, collapse=", "), "\n")

  #define strategies
  strategies <- list(
    "KEGG Only"        = k_selected,
    "KEGG + DAS28"     = c(k_selected, "DAS28"),
    "KEGG + Clinical"  = c(k_selected, "age", "gender", "DAS28"),
    "Clinical Only"    = c("age", "gender", "DAS28"))

  for (strat_name in names(strategies)) {
    feats <- strategies[[strat_name]]
    k_in_strat <- intersect(feats, colnames(k_train_rel))
    clin_in_strat <- intersect(feats, clin_features)
    
    train_df <- k_train_rel[, k_in_strat, drop=FALSE]
    val_df   <- k_val_rel[, k_in_strat, drop=FALSE]
    
    for(cli in clin_in_strat) {
      tr_val <- Metadata[train_s, cli]; te_val <- Metadata[val_s, cli]
      if(is.numeric(tr_val)) {
        train_df[[cli]] <- tr_val; val_df[[cli]] <- te_val
      } else {
        lvl <- unique(c(tr_val, te_val))
        train_df[[cli]] <- factor(tr_val, levels=lvl); val_df[[cli]] <- factor(te_val, levels=lvl)
      }
    }
    
    train_df$Target <- target_train
    ctrl <- trainControl(method="cv", number=5, classProbs=TRUE, summaryFunction=twoClassSummary)
    rf_mod <- train(Target ~ ., data = train_df, method = "rf", ntree = 500, metric = "ROC", trControl = ctrl)
    
    probs <- predict(rf_mod, val_df, type = "prob")[, "MTXNR"]
    roc_obj <- roc(target_val, probs, quiet = TRUE)
    
    plot_results <- rbind(plot_results, data.frame(
      FPR = 1 - roc_obj$specificities, TPR = roc_obj$sensitivities, 
      Strategy = strat_name, TestCohort = test_cohort, AUC = as.numeric(auc(roc_obj))
    ))
  }
}
## 
## --- Testing on Cohort: Cohort-2 ---
##    Running Boruta on KEGG...
##    Features selected for this split: K00390, K01197, K01358, K01995, K02020, K04518 
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## 
## 
## --- Testing on Cohort: Cohort-1 ---
##    Running Boruta on KEGG...
##    Features selected for this split: K00127, K00783, K01190, K01255, K01358, K01620, K02348, K02874, K03306, K03469, K07056, K07114, K07726, K08093, K12373 
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## 
## 
## --- Testing on Cohort: Cohort-3 ---
##    Running Boruta on KEGG...
##    Features selected for this split: K00042, K00087, K00548, K01191, K01197, K01308, K01995, K02042, K02874, K02902, K02907, K03148, K04651, K04758, K07406, K13652, K15598 
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
summary_auc <- plot_results %>% 
  group_by(Strategy) %>% 
  summarize(MeanAUC = round(mean(AUC), 2))

#create a text label for EACH facet that lists every cohort's specific AUC
facet_text <- plot_results %>%
  group_by(Strategy, TestCohort) %>%
  summarize(AUC = first(AUC)) %>% #Get the single AUC value for this cohort/strat
  mutate(TextEntry = paste0(TestCohort, ": ", round(AUC, 2))) %>%
  group_by(Strategy) %>%
  summarize(AUC_List = paste(TextEntry, collapse = "\n"))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by Strategy and TestCohort.
## ℹ Output is grouped by Strategy.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(Strategy, TestCohort))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.
#prepare the main plotting dataframe
plot_df <- left_join(plot_results, summary_auc, by = "Strategy")
plot_df <- left_join(plot_df, facet_text, by = "Strategy")

#create the Facet Labels
plot_df$StratLabel <- factor(
  paste0(plot_df$Strategy, "\n(Mean AUC: ", plot_df$MeanAUC, ")"),
  levels = paste0(names(strategies), "\n(Mean AUC: ", 
                  summary_auc$MeanAUC[match(names(strategies), summary_auc$Strategy)], ")")
)

#generate the Plot
p_final_auc <- ggplot(plot_df, aes(x = FPR, y = TPR, color = TestCohort)) +
  geom_path(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "grey50") +
  #Add the text annotation to the bottom-right of each facet
  geom_text(aes(label = AUC_List), 
            x = 1, y = 0, hjust = 1, vjust = -0.2, 
            size = 3.5, color = "black", inherit.aes = FALSE, check_overlap = TRUE,
            data = plot_df %>% select(StratLabel, AUC_List) %>% unique()) +
  facet_wrap(~StratLabel, ncol = 2, scales= "free") + 
  theme_classic(base_size = 12) +
  scale_color_brewer(palette = "Set1") +
  labs(
       x = "1-Specificity (FPR)", y = "Sensitivity (TPR)",
       color = "Validation Cohort") +
  theme(legend.position = "bottom", 
        strip.background = element_rect(fill = "grey95"),
        strip.text = element_text(face = "bold"))

print(p_final_auc)

ggsave("Figure5A.pdf", p_final_auc, height = 7, width = 7, dpi = 300)

#Figure 5B, Suppl.figure 10

rm(list=ls())
d2_kegg_raw <- fread("../03_HumannAnalysis/02_KO/20251028BatchCorrectedKO.csv") |> as.data.frame()
rownames(d2_kegg_raw) <- d2_kegg_raw[, 1]
d2_kegg_raw <- d2_kegg_raw[, -1]
colnames(d2_kegg_raw) <- gsub("^X|_L002$|\\.", "-", colnames(d2_kegg_raw))

Metadata <- read.csv("../01_CuratingMetadata/metadataDAS28.csv", stringsAsFactors = FALSE)
rownames(Metadata) <- gsub("\\.", "-", Metadata$KT_sampleName_withS)
Metadata$MTXRandMTXNR <- factor(Metadata$MTXRandMTXNR, levels = c("MTXR", "MTXNR"))

common_s <- intersect(colnames(d2_kegg_raw), rownames(Metadata))
Metadata <- Metadata[common_s, ]
kegg_ra  <- as.data.frame(t(d2_kegg_raw[, common_s]))
kegg_ra  <- as.data.frame(sweep(kegg_ra, 1, rowSums(kegg_ra), "/"))

#tandardize clinical variables
Metadata$gender_num <- as.numeric(factor(Metadata$gender)) 
combined_df <- cbind(kegg_ra, Metadata[, c("DAS28", "age", "gender_num", "MTXRandMTXNR")])
colnames(combined_df)[colnames(combined_df) == "MTXRandMTXNR"] <- "Target"
colnames(combined_df)[which(colnames(combined_df) == "gender_num")] <- "gender"
#set seed for reproducibility
set_seed <- 18
set.seed(set_seed)

#STRATIFIED SPLIT: Partitioning data into 80% Training and 20% Validation + maintaing  the same ratio of R/NR.
train_idx <- createDataPartition(combined_df$Target, p = 0.8, list = FALSE)
train_set <- combined_df[train_idx, ]
val_set   <- combined_df[-train_idx, ]

#boruta Feature Selection on training cohort only
kegg_cols <- colnames(kegg_ra)

##run on only on the 80% Training Set to prevent data leakage.
#x the biological KEGG pathways (predictors)
#y is the Target (MTXR vs MTXNR status)
#maxRuns = 100 iterations are ok.

boruta_out <- Boruta(x = train_set[, kegg_cols], y = train_set$Target, doTrace = 0, maxRuns = 100)

#Boruta labels features as confirmed/Rejcted/Tent
#TentativeRoughFix performs a final statistical comparison to make keep/drop' decision 
final_boruta <- TentativeRoughFix(boruta_out)
kegg_features <- getSelectedAttributes(final_boruta, withTentative = TRUE)

#check if less than 2 features selected then force select top 5 features based on median imp
if(length(kegg_features) < 2) {
  imps <- attStats(final_boruta)
  kegg_features <- rownames(imps[order(-imps$medianImp), ])[1:3]
}

#make grouping for main figure and supp figure as suggested by RN -Combined Strategies-single plot
feature_sets <- list(

  "KEGG_only"       = kegg_features,
  "Clin_Param"      = c("age", "gender", "DAS28"),
  "KEGG_Clin_Param" = c(kegg_features, "age", "gender", "DAS28"),
  
#this is for suppl figure
  "DAS28_only"      = "DAS28",
  "Age_only"        = "age",
  "Gender_only"     = "gender",
  "KEGG_DAS28"      = c(kegg_features, "DAS28"),
  "KEGG_Age"        = c(kegg_features, "age"),
  "KEGG_Gender"     = c(kegg_features, "gender")
)

#loop for traing and validation
roc_results <- list()
#here am looping through each strategy (KEGG Only, Clinical, Both-see above)
#to compare their performance using the same train/test parameters.
for (strat_name in names(feature_sets)) {
  #Extract the specific col names
  vars <- feature_sets[[strat_name]]
  
  #parameters:
  #using 5-fold CV within the training set. what this does- splits the 80% training data into 5 'folds'—training on 4 and  testing on 1—repeatedly to find the most stable model.
  #then 'twoClassSummary' -we calculate AUC-ROC, Sensitivity, and Specificity.
  set.seed(set_seed)
  cv_ctrl <- trainControl(
    method = "cv", number = 5,
    classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions="final" )
  #500 trees and 5 fold cross validation, method = "rf"- a random Forest 
  #metric = "ROC"- model to maximize the Area Under the Curve (AUC).
  #drop = FALSEdata stays as a dataframe
  rf_mod <- train(
    x = train_set[, vars, drop = FALSE],
    y = train_set$Target,
    method = "rf", ntree = 500, metric = "ROC",
    trControl = cv_ctrl
  )
  #rf_mod objects saves the learning
  
  #We apply the trained rf_mod to the validation cohort- 20% hold-out data-see above.
#type = "prob"' returns the specific confidence level for each class. for example- Patient 501_1 has a 0.85  probability of being a MTXNR.
  #Predict probabilities for AUC and Accuracy
  probs <- predict(rf_mod, newdata = val_set[, vars, drop = FALSE], type = "prob")
  
  #below we use default 50% threshold to decide the final category. that predicts the MTXR/NR and calculate the Accuracy percentage.
  
  preds <- predict(rf_mod, newdata = val_set[, vars, drop = FALSE])
  
  #Now compare the model's predictions against the actual 'Target' outcomes. from the validation set to calculate performance scores.
  #We use the predicted probabilities for the Non-Responder class.
#The 'roc' function calculates the trade-off between Sensitivity and Specificity
#at every possible probability threshold (from 0 to 1).

  roc_obj <- roc(val_set$Target, probs$MTXNR, quiet = TRUE)
  
#against the actual patient status. The 'overall["Accuracy"]' gives us the 
#percentage of total patients who were correctly classified.
  acc_val <- confusionMatrix(preds, val_set$Target)$overall["Accuracy"]
  
  
  #now save - 'roc': The full curve object (needed for plotting the ROC curve later).
#auc-the single-number-AUC
#acc-the raw Accuracy score (percentage of correct calls).
  roc_results[[strat_name]] <- list(
    roc = roc_obj,
    auc = as.numeric(auc(roc_obj)),
    
    acc = suppressWarnings(as.numeric(acc_val)) 
)
}
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
#convert the roc_results into a dataframe.
#This format is required for plotting multiple ROC curves in a single ggplot.
plot_data <- bind_rows(
  lapply(names(roc_results), function(x){
    #Pull the Sensitivity  and Specificity 
    #oordinates from the roc object for the current feature.
    df <- data.frame(
      Sensitivity = roc_results[[x]]$roc$sensitivities,
      Specificity = roc_results[[x]]$roc$specificities)
   
     #Remove duplicate points 

    df <- df[!duplicated(df$Specificity, fromLast = TRUE), ]
    #order by Specificity so the line draws correctly from left to right.
    df <- df[order(df$Specificity, decreasing = TRUE), ]
    #Add the Strategy name and the final rounded AUC/Accuracy scores to each row.
    df$Strategy <- x
    df$AUC      <- round(roc_results[[x]]$auc, 2)
    df$Accuracy <- round(roc_results[[x]]$acc, 2)
    return(df)
  })
)

#update label to include both AUC and Accuracy
plot_data$Legend_Label <- paste0(plot_data$Strategy, 
                                 " (AUC: ", plot_data$AUC, 
                                 ", Acc: ", plot_data$Accuracy, ")")

main_strategies <- c("KEGG_only", "Clin_Param", "KEGG_Clin_Param")
main_df <- plot_data %>% filter(Strategy %in% main_strategies)

#We use aes(color = Legend_Label) to show the metrics in the legend
legend_map <- unique(main_df[, c("Strategy", "Legend_Label")])
legend_labels <- setNames(legend_map$Legend_Label, legend_map$Strategy)

# 2. Plotting
p_main <- ggplot(main_df, aes(x = 1 - Specificity, y = Sensitivity, color = Strategy)) +
  geom_step(direction = "vh", linewidth = 1.2) + # 'size' is deprecated, using 'linewidth'
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "darkgrey") +
  
  # Map colors to the Strategy column (the short names)
  # Use 'labels' to swap the short names for the long Legend_Labels in the visual legend
  scale_color_manual(
    values = c(
      "KEGG_only" = "blue",
      "Clin_Param" = "red",
      "KEGG_Clin_Param" = "green"
    ),
    labels = legend_labels
  ) + 
  
  theme_classic() +
  labs(
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    title = "ROC Curve Comparison: Main Strategies",
    color = "Model Strategy & Metrics"
  ) +
  theme(
    legend.position = c(0.65, 0.25), 
    legend.background = element_rect(fill = "white", color = "black")
  )

print(p_main)

#individual Faceted Plot
indiv_df <- plot_data %>% filter(!(Strategy %in% main_strategies))

p_indiv <- ggplot(indiv_df, aes(x = 1 - Specificity, y = Sensitivity)) +
  geom_step(color = "black", size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted") +
  facet_wrap(~ Strategy) + 
  theme_classic() +
  #Add both AUC and Accuracy text to each facet
  geom_text(aes(x = 0.5, y = 0.2, 
                label = paste0("AUC: ", AUC, "\nAcc: ", Accuracy)), 
            size = 3.5, color = "black") +
  labs(title = "Individual Feature Subsets Performance")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p_indiv)

ggsave("Strategy_Comparison_Main.pdf", plot = p_main, width = 8, height = 7)
ggsave("Strategy_Comparison_Individual.pdf", plot = p_indiv, width = 10, height = 8)
train_metadata_full <- Metadata[rownames(train_set), ]
val_metadata_full   <- Metadata[rownames(val_set), ]

write.csv(train_metadata_full, "Train_Full_Metadata2.csv", row.names = TRUE)
write.csv(val_metadata_full, "Validation_Full_Metadata2.csv", row.names = TRUE)

#Print Boruta Features & Training Performance ---
cat("\n--- Boruta Selected KEGG Features ---\n")
## 
## --- Boruta Selected KEGG Features ---
print(kegg_features)
##  [1] "K00773" "K01197" "K01358" "K01876" "K01995" "K02874" "K02892" "K02907"
##  [9] "K02952" "K02965" "K04758" "K08300" "K09760"
cat("\n--- RF Model Training Results (Best Tune) ---\n")
## 
## --- RF Model Training Results (Best Tune) ---
#Note: rf_mod is the last model trained in my loop (KEGG_Clin_Param)
print(rf_mod$results[rf_mod$results$ROC == max(rf_mod$results$ROC), ])
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
cat("\n--- Boruta Selected KEGG Features ---\n")
## 
## --- Boruta Selected KEGG Features ---
if(length(kegg_features) > 0) {
  print(kegg_features)
} else {
  cat("No features significantly selected; using top 5 by median importance.\n")
}
##  [1] "K00773" "K01197" "K01358" "K01876" "K01995" "K02874" "K02892" "K02907"
##  [9] "K02952" "K02965" "K04758" "K08300" "K09760"
cat("\n--- Training Results (Cross-Validation Metrics) ---\n")
## 
## --- Training Results (Cross-Validation Metrics) ---
for (strat_name in names(feature_sets)) {
  cat(paste0("\nStrategy: ", strat_name, "\n"))
  #Accessing the best result from the caret object
  print(rf_mod$results[rf_mod$results$ROC == max(rf_mod$results$ROC), ])
}
## 
## Strategy: KEGG_only
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
## 
## Strategy: Clin_Param
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
## 
## Strategy: KEGG_Clin_Param
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
## 
## Strategy: DAS28_only
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
## 
## Strategy: Age_only
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
## 
## Strategy: Gender_only
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
## 
## Strategy: KEGG_DAS28
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
## 
## Strategy: KEGG_Age
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
## 
## Strategy: KEGG_Gender
##   mtry       ROC      Sens      Spec      ROCSD   SensSD    SpecSD
## 2    8 0.8862103 0.7142857 0.8583333 0.08064415 0.237238 0.0993808
cat("\n--- Validation Set Performance (Holdout) ---\n")
## 
## --- Validation Set Performance (Holdout) ---
for (strat_name in names(roc_results)) {
  cat(sprintf("%s - AUC: %.3f, Accuracy: %.3f\n", 
              strat_name, 
              roc_results[[strat_name]]$auc, 
              roc_results[[strat_name]]$acc))
}
## KEGG_only - AUC: 0.878, Accuracy: 0.789
## Clin_Param - AUC: 0.611, Accuracy: 0.579
## KEGG_Clin_Param - AUC: 0.922, Accuracy: 0.789
## DAS28_only - AUC: 0.600, Accuracy: 0.579
## Age_only - AUC: 0.672, Accuracy: 0.368
## Gender_only - AUC: 0.617, Accuracy: 0.632
## KEGG_DAS28 - AUC: 0.872, Accuracy: 0.737
## KEGG_Age - AUC: 0.867, Accuracy: 0.737
## KEGG_Gender - AUC: 0.844, Accuracy: 0.789
rf_models <- list()
roc_results <- list()

set.seed(set_seed)

cv_ctrl <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

for (strat_name in names(feature_sets)) {
  
  vars <- feature_sets[[strat_name]]
  
  cat("\nTraining strategy:", strat_name, "\n")
  
  #Train Random Forest model
  rf_mod <- train(
    x = train_set[, vars, drop = FALSE],
    y = train_set$Target,
    method = "rf",
    ntree = 500,
    metric = "ROC",
    trControl = cv_ctrl
  )
  
  #Store model
  rf_models[[strat_name]] <- rf_mod
  
  #Predict probabilities on validation set
  probs <- predict(rf_mod, newdata = val_set[, vars, drop = FALSE], type = "prob")
  
  #Predict classes
  preds <- predict(rf_mod, newdata = val_set[, vars, drop = FALSE])
  
  roc_obj <- roc(val_set$Target, probs$MTXNR, quiet = TRUE)
  acc_val <- confusionMatrix(preds, val_set$Target)$overall["Accuracy"]
  
  #Store results
  roc_results[[strat_name]] <- list(
    roc = roc_obj,
    auc = as.numeric(auc(roc_obj)),
    acc = as.numeric(acc_val)
  )
}
## 
## Training strategy: KEGG_only 
## 
## Training strategy: Clin_Param 
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## 
## 
## Training strategy: KEGG_Clin_Param 
## 
## Training strategy: DAS28_only
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## 
## Training strategy: Age_only
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## 
## Training strategy: Gender_only
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## 
## Training strategy: KEGG_DAS28 
## 
## Training strategy: KEGG_Age 
## 
## Training strategy: KEGG_Gender
cat("\n--- Training Results (Cross-Validation) ---\n")
## 
## --- Training Results (Cross-Validation) ---
for (strat_name in names(rf_models)) {
  
  cat(paste0("\nStrategy: ", strat_name, "\n"))
  
  mod <- rf_models[[strat_name]]
  
  best_row <- mod$results[which.max(mod$results$ROC), ]
  
  print(best_row)
}
## 
## Strategy: KEGG_only
##   mtry       ROC      Sens      Spec      ROCSD    SensSD    SpecSD
## 3   13 0.8789435 0.7107143 0.8583333 0.07790267 0.2099259 0.0993808
## 
## Strategy: Clin_Param
##   mtry      ROC      Sens      Spec     ROCSD    SensSD    SpecSD
## 2    3 0.534127 0.4892857 0.6472222 0.1778114 0.2935858 0.1494589
## 
## Strategy: KEGG_Clin_Param
##   mtry       ROC      Sens      Spec      ROCSD    SensSD    SpecSD
## 1    2 0.8334077 0.7607143 0.8583333 0.09679308 0.1888134 0.0993808
## 
## Strategy: DAS28_only
##   mtry       ROC      Sens Spec      ROCSD    SensSD    SpecSD
## 1    2 0.4890873 0.4785714  0.4 0.08547017 0.1667517 0.1493944
## 
## Strategy: Age_only
##   mtry       ROC      Sens      Spec     ROCSD    SensSD    SpecSD
## 1    2 0.6161458 0.5535714 0.5861111 0.1518314 0.1143415 0.2013649
## 
## Strategy: Gender_only
##   mtry       ROC      Sens Spec      ROCSD    SensSD    SpecSD
## 1    2 0.5482143 0.2678571  0.8 0.06985635 0.2000478 0.1987616
## 
## Strategy: KEGG_DAS28
##   mtry       ROC      Sens      Spec     ROCSD    SensSD    SpecSD
## 2    8 0.8375496 0.7357143 0.8555556 0.0480927 0.2213854 0.1065016
## 
## Strategy: KEGG_Age
##   mtry       ROC      Sens      Spec      ROCSD     SensSD    SpecSD
## 1    2 0.8538194 0.7142857 0.9055556 0.08020728 0.09698911 0.1031899
## 
## Strategy: KEGG_Gender
##   mtry       ROC      Sens      Spec      ROCSD     SensSD     SpecSD
## 1    2 0.8508929 0.7107143 0.7916667 0.06569341 0.05113508 0.09212847
cat("\n--- Validation Set Performance (Holdout) ---\n")
## 
## --- Validation Set Performance (Holdout) ---
for (strat_name in names(roc_results)) {
  
  cat(sprintf("%s - AUC: %.3f, Accuracy: %.3f\n",
              strat_name,
              roc_results[[strat_name]]$auc,
              roc_results[[strat_name]]$acc))
}
## KEGG_only - AUC: 0.878, Accuracy: 0.789
## Clin_Param - AUC: 0.644, Accuracy: 0.632
## KEGG_Clin_Param - AUC: 0.744, Accuracy: 0.579
## DAS28_only - AUC: 0.600, Accuracy: 0.579
## Age_only - AUC: 0.678, Accuracy: 0.368
## Gender_only - AUC: 0.617, Accuracy: 0.632
## KEGG_DAS28 - AUC: 0.889, Accuracy: 0.789
## KEGG_Age - AUC: 0.800, Accuracy: 0.684
## KEGG_Gender - AUC: 0.772, Accuracy: 0.579

#Suppl.figure 1 :Age and Sex metadata

rm(list = ls())
meta <- read.csv("../01_CuratingMetadata/metadataDAS28.csv",
                 stringsAsFactors = FALSE)

#Convert to factors
meta$MTXRandMTXNR <- as.factor(meta$MTXRandMTXNR)
meta$Cohort       <- as.factor(meta$Cohort)
meta$gender       <- as.factor(meta$gender)

meta$age <- as.numeric(meta$age)

t.test(age ~ MTXRandMTXNR, data = meta)
## 
##  Welch Two Sample t-test
## 
## data:  age by MTXRandMTXNR
## t = 0.31132, df = 94.143, p-value = 0.7562
## alternative hypothesis: true difference in means between group MTXNR and group MTXR is not equal to 0
## 95 percent confidence interval:
##  -5.148716  7.063610
## sample estimates:
## mean in group MTXNR  mean in group MTXR 
##            49.00000            48.04255
summary_age <- meta %>%
  group_by(MTXRandMTXNR) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    .groups = "drop"
  )



summary_age_cohort <- meta %>%
  group_by(Cohort, MTXRandMTXNR) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    .groups = "drop" )



summary_gender <- meta %>%
  group_by(MTXRandMTXNR, gender) %>%
  summarise(
    n = n(),
    .groups = "drop")

#1. Age Combined Plot
P_age <- ggplot(summary_age, aes(x = MTXRandMTXNR, y = mean_age, fill = MTXRandMTXNR)) +
  geom_col(width = 0.6, alpha = 0.7) +
  geom_point(data = meta, 
             aes(x = MTXRandMTXNR, y = age, shape = Cohort, fill = MTXRandMTXNR), 
             color = "black", 
             stroke = 0.5,
             position = position_jitter(width = 0.1, height = 0), 
             size = 2) +
  scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
  labs(x = "Response Status", y = "Age (years)", shape = "Cohort") +
  theme_classic(base_size = 14)

#2. Age Faceted by Cohort
P_age_cohort <- ggplot(summary_age_cohort, aes(x = MTXRandMTXNR, y = mean_age, fill = MTXRandMTXNR)) +
  geom_col(width = 0.6, alpha = 0.7) + 
  geom_point(data = meta, 
             aes(x = MTXRandMTXNR, y = age, shape = Cohort, fill = MTXRandMTXNR),
             color = "black", 
             stroke = 0.5,
             position = position_jitter(width = 0.15, height = 0),
             size = 2) +
  scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
  scale_shape_manual(values = c("Cohort-1" = 22, "Cohort-2" = 21, "Cohort-3" = 24)) +
  facet_wrap(~Cohort, nrow = 1, scales = "free_y") +
  labs(x = "Response Status", y = "Age (years)") +
  theme_classic(base_size = 14) +
  theme(legend.position = "none",
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

#3. Gender Counts Plot
#Note: No points here as it is a count summary, but keeping the colors consistent.
P_gender <- ggplot(summary_gender, aes(x = MTXRandMTXNR, y = n, fill = MTXRandMTXNR)) + 
  geom_col(width = 0.6, alpha = 0.8) + 
  facet_wrap(~ gender) +
  scale_fill_manual(values = c("MTXNR" = "darksalmon", "MTXR" = "#238A8DFF")) +
  labs(x = "Response Status", y = "Number of subjects", fill = "Response") + 
  theme_classic(base_size = 14) +
  theme(strip.text = element_text(face = "bold"))
P_age_cohort

P_gender

ggsave("Age_in_MTXR_MTXNR.pdf", P_age, width = 5, height = 3)
ggsave("Age_in_Each_Cohort.pdf", P_age_cohort, width = 8, height = 3)
ggsave("Gender_counts_MTXR_MTXNR_facet_gender.pdf", P_gender, width = 6, height = 3,  useDingbats=F)