Note: Using the serum data as an example, the analysis of intestine contents, feces and liver samples only require adjustment of some parameters.

1.BA composition (BAI and pie chart)

# 1.BA composition (BAI and pie chart) ####
rm(list=ls())

####数据处理####
# 数据清洗
library(readxl)

## Warning: package 'readxl' was built under R version 4.4.3

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'tibble' was built under R version 4.4.3

## Warning: package 'tidyr' was built under R version 4.4.3

## Warning: package 'readr' was built under R version 4.4.3

## Warning: package 'purrr' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.3

## Warning: package 'stringr' was built under R version 4.4.3

## Warning: package 'forcats' was built under R version 4.4.3

## Warning: package 'lubridate' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(tidyr)
data<-read_excel("data/20250730_Zhou_BA.xlsx", sheet = 2) #不同数据更改sheet
data<-data[1:17,] #根据数据情况修改
colnames(data)

##  [1] "serum"             "Conc. (nmol/L)"    "TwMCA"            
##  [4] "TaMCA"             "TbMCA"             "GbMCA"            
##  [7] "TUDCA"             "THDCA"             "GHCA"             
## [10] "TCA"               "GUDCA"             "GHDCA"            
## [13] "wMCA"              "aMCA"              "7keto_DCA"        
## [16] "bMCA"              "GCA"               "MDCA"             
## [19] "3Keto,7a,12a(OH)2" "TCDCA"             "HCA"              
## [22] "UDCA"              "TDCA"              "HDCA"             
## [25] "CA"                "GCDCA"             "7keto_LCA"        
## [28] "isoDCA"            "12keto_LCA"        "GDCA"             
## [31] "TLCA"              "CDCA"              "GLCA"             
## [34] "DCA"               "isoLCA"            "allo_isoLCA"      
## [37] "3KetoLCA"          "LCA"               "CA-7-S"           
## [40] "CDCA-3-S"          "DCA-3-S"           "LCA-3-S"          
## [43] "UDCA-3-S"          "CA-3-S"            "NCA"              
## [46] "NDCA"

sample_ids <- data$`serum` #根据情况改
data$group <- c(rep("aged", 9), rep("young", 8)) #根据情况修改

group_info <- data$group
colnames(data)

##  [1] "serum"             "Conc. (nmol/L)"    "TwMCA"            
##  [4] "TaMCA"             "TbMCA"             "GbMCA"            
##  [7] "TUDCA"             "THDCA"             "GHCA"             
## [10] "TCA"               "GUDCA"             "GHDCA"            
## [13] "wMCA"              "aMCA"              "7keto_DCA"        
## [16] "bMCA"              "GCA"               "MDCA"             
## [19] "3Keto,7a,12a(OH)2" "TCDCA"             "HCA"              
## [22] "UDCA"              "TDCA"              "HDCA"             
## [25] "CA"                "GCDCA"             "7keto_LCA"        
## [28] "isoDCA"            "12keto_LCA"        "GDCA"             
## [31] "TLCA"              "CDCA"              "GLCA"             
## [34] "DCA"               "isoLCA"            "allo_isoLCA"      
## [37] "3KetoLCA"          "LCA"               "CA-7-S"           
## [40] "CDCA-3-S"          "DCA-3-S"           "LCA-3-S"          
## [43] "UDCA-3-S"          "CA-3-S"            "NCA"              
## [46] "NDCA"              "group"

data_mat <- data %>% dplyr::select(-group,-`Conc. (nmol/L)`, -`serum`) # 去掉非数值列

rownames(data)<-sample_ids

## Warning: Setting row names on a tibble is deprecated.

# 过滤高缺失特征（阈值可调）
filtered_data <- data_mat %>% dplyr::select(where(~ mean(!is.na(.)) > 0.5)) # 保留缺失率 <50% 的代谢物

#缺失值填补(检测下限的一半½ LOD)
imputed_data <- as.matrix(filtered_data)
imputed_data[is.na(imputed_data) | imputed_data == 0] <- 0.15
imputed_data <- as.data.frame(imputed_data)
imputed_data <- imputed_data %>%
  mutate(across(everything(), ~ as.numeric(.)))

Sumimputed_data<-rowSums(imputed_data, na.rm = TRUE)  
Sumimputed_data<-as.data.frame(Sumimputed_data)
colnames(Sumimputed_data)<-"Total Bile Acids"

# 百分比
per_BA<-sweep(imputed_data, 1, # 1表示按行操作，2表示按列操作
              Sumimputed_data$`Total Bile Acids`, #用于操作的统计值（这里是每行的总和）
              "/") #操作函数（这里是除法 /）

# 计算BAI, Bile Acid Hydrophobicity Index 
# BAI=1*LCA+0.86*DCA+0.55*CDCA+0.39*CA-0.04*UDCA+0.18*GCA+0.17*TCA+0.38*GDCA+0.29*TDCA+0.27*GCDCA+0.15*TCDCA-0.26*TUDCA
weights <- c(
  LCA   = 1,
  DCA   = 0.86,
  CDCA  = 0.55,
  CA    = 0.39,
  UDCA  = -0.04,
  GCA   = 0.18,
  TCA   = 0.17,
  GDCA  = 0.38,
  TDCA  = 0.29,
  GCDCA = 0.27,
  TCDCA = 0.15,
  TUDCA = -0.26
)

weight_tbl <- tibble(
  metabolite = names(weights),
  weight = as.numeric(weights)
)

BAI_df <- per_BA %>%
  mutate(Sample = row_number()) %>%   # 如果已有 SampleID 列，这里替换
  pivot_longer(
    cols = -Sample,
    names_to = "metabolite",
    values_to = "value"
  ) %>%
  left_join(weight_tbl, by = "metabolite") %>%
  mutate(
    contribution = ifelse(is.na(weight), 0, value * weight)
  ) %>%
  group_by(Sample) %>%
  summarise(BAI = sum(contribution, na.rm = TRUE), .groups = "drop")
BAI_df$Sample<-sample_ids
print(BAI_df)

## # A tibble: 17 × 2
##    Sample   BAI
##     <dbl> <dbl>
##  1   9027 0.223
##  2   9028 0.317
##  3   9030 0.356
##  4   9031 0.264
##  5   9032 0.376
##  6   9033 0.434
##  7   9034 0.306
##  8   9035 0.258
##  9   9036 0.227
## 10   9193 0.474
## 11   9194 0.299
## 12   9195 0.127
## 13   9196 0.172
## 14   9197 0.334
## 15   9198 0.530
## 16   9199 0.389
## 17   9200 0.249

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.4.3

library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.4.3

library(forcats)

#配色
library(RColorBrewer)
cols <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
          "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
  brewer.pal(12, "Set3"),
          brewer.pal(8 , "Set2"),
          brewer.pal(9 , "Pastel1"),
          brewer.pal(11, "Paired"),
          brewer.pal(8 , "Accent")) 
library(scales)

## Warning: package 'scales' was built under R version 4.4.3

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

show_col(cols, labels = TRUE)#查看配色

# 行为样本可运行如下代码
plot_dat <- per_BA %>%
  mutate(Sample = sample_ids) %>%  #加样本ID
  mutate(Group=group_info) %>%  #加分组
  pivot_longer(
    cols = -c(Sample, Group),
    names_to = "BA",
    values_to = "Value"
  ) %>%
  group_by(Sample) %>%
  mutate(Share = Value / sum(Value, na.rm=TRUE)) %>%
  ungroup()

# 每组有多个重复，求share的平均
plot_dat <- plot_dat %>%               # 原始长表：Sample/Group/BA/Share
  group_by(Group, BA) %>%
  summarise(avg_Share = mean(Share, na.rm = TRUE), .groups = "drop")

# 在绘图前就把BA固定为一个全局一致的因子，否则每个分组里重新排序因子水平，会导致相同的 BA 在不同组里排序不同，颜色就乱了
# （可选）如果你想让颜色跟平均丰度的全局大小排序，可以这样运行下面五行
ba_levels <- plot_dat %>%
  group_by(BA) %>%
  summarise(total_share = sum(avg_Share), .groups = "drop") %>%
  arrange(desc(total_share)) %>%
  pull(BA)

plot_dat <- plot_dat %>%
  mutate(BA = factor(BA, levels = ba_levels))

#用极坐标画饼图
pie_fun <-function(df, group_name, top_n = 5) {
  tmp <- df %>%
    filter(Group == group_name) %>% #取当前分组
    arrange(desc(avg_Share)) %>%  #计算每组平均share 再按占比降序排
    mutate(rank = row_number(desc(avg_Share)),
      label = ifelse(rank <= top_n, #前n名显示标签:“代谢物，百分比”
                     paste0(BA, ",", sprintf("%.2f%%", avg_Share * 100)), 
                     ""),
      fraction = avg_Share / sum(avg_Share), # 归一化，保证总和=1
      ymax = cumsum(fraction),
      ymin = lag(ymax, default = 0), # 每个扇形的“下边界”
      mid = (ymin + ymax) / 2, # 中心位置
      angle = 360 * mid,       # 极坐标角度
      hjust = ifelse(angle > 180, 1, 0), # 文本水平对齐
      angle_text = ifelse(angle > 180, angle - 180, angle) # 文本旋转，避免倒写
    )
  
  ggplot(tmp, aes(ymax = ymax, ymin = ymin, xmax = 1, xmin = 0, 
                  fill =BA)) +
    geom_rect(color = "white") + # 画扇形（白色边框）
    coord_polar(theta = "y") + # 把 y 轴弯成极坐标 → 变成饼图
    geom_text(aes(x = 1.1, #把标签放到半径 1.1的位置，即圆饼外侧,可以修改
          y = mid, label = label),
      angle = 0, #标签文字水平；如需随扇形旋转，可改成 angle = angle_text
      hjust = 0.5, # 0左对齐，0.5居中
      size = 3.2,
      inherit.aes = FALSE
    ) +
    scale_fill_manual(values = cols) + #配色
    labs(title = group_name, fill = NULL) +
    theme_void() + #去掉坐标轴
    theme(
      plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
      legend.position = "right",
      legend.text = element_text(size = 7) 
    )
}

# 绘图与拼图 
library(purrr)
groups <- c("aged","young") #根据自己情况修改
pie_list <-  purrr::map(  # 循环
  groups, ~ pie_fun(plot_dat, .x))
combined <- ggarrange(plotlist = pie_list,
                      ncol = 2, nrow = 1,
                      common.legend = TRUE, legend = "right")
# 添加标题
combined <- annotate_figure(
  combined,
  top = text_grob("Bile acids relative abundance (serum)", size = 16, face = "bold")
)
print(combined)

2.单变量差异分析

#常用的差异代谢产物分析方法： 单变量统计分析（t检验或单因素方差分析），联合多元统计分析（PLS-DA）,倍数变化（FC），多种方法共同筛选
#代谢组数据具有高维，高噪音，高变异的特点，一般采用多元统计分析方法，如PLS-DA或OPLS-DA
#PLS-DA偏最小二乘判别分析是一种监督模式识别的多元统计分析方法，将多维数据在压缩前先按需要寻找的差异因素分组，用于筛选组间差异代谢产物
#变量差异贡献度（VIP， varaible important in projection）是(O)PLS-DA模型的变量权重值，用于衡量代谢产物积累差异对各组样本分类判别的影响强度和解释能力
#VIP≥1是常见的差异代谢物筛选标准

#Transformation（数据转换）
log_data <- imputed_data %>%
  mutate(across(where(is.numeric), ~ log2(. + 1)))

#Scaling 标准化 Z-score
scaled_data <-as.data.frame(scale(log_data, center = TRUE, scale = TRUE))  #auto scaling (Z-score)
#Mean centering: scale(log_data, scale = FALSE)

#Normalization可选

##差异代谢物分析用log_data，不要scaling

#### 2.1正态性检验####
#n < 30，Shapiro-Wilk 检验 是最常用的正态性检验方法
log_data$group<-group_info
log_data$group <- factor(log_data$group)

metabolites <- colnames(log_data[,-ncol(log_data)]) 

normality_results <- lapply(metabolites, function(met) { #对向量 metabolites 中的每一个代谢物名字循环执行函数
  list(
    aged  = shapiro.test(log_data[[met]][log_data$group == "aged"]), #取出该代谢物的整列数据
    young = shapiro.test(log_data[[met]][log_data$group == "young"])
  )
})
names(normality_results) <- metabolites

#输出
library(purrr)
library(tidyverse)
map_dfr(names(normality_results), ~ {
  map_dfr(c("aged", "young"), function(g) {
    tibble(
      metabolite = .x,
      group      = g,
      p.value    = normality_results[[.x]][[g]]$p.value
    )
  })
})

## # A tibble: 34 × 3
##    metabolite group   p.value
##    <chr>      <chr>     <dbl>
##  1 TwMCA      aged  0.387    
##  2 TwMCA      young 0.360    
##  3 TaMCA      aged  0.126    
##  4 TaMCA      young 0.517    
##  5 TbMCA      aged  0.0856   
##  6 TbMCA      young 0.370    
##  7 GbMCA      aged  0.0000104
##  8 GbMCA      young 0.312    
##  9 TUDCA      aged  0.149    
## 10 TUDCA      young 0.0351   
## # ℹ 24 more rows

# p > 0.05，说明该组该代谢物近似正态分布
# p < 0.05，说明偏离正态，考虑非参数检验（如 Mann-Whitney U 检验）


#### 2.2 方差齐性检验 ####
library(car)

## Warning: package 'car' was built under R version 4.4.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.4.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

#Levene 检验
levene_results <- lapply(metabolites, function(met) {
  leveneTest(as.formula(paste0("`", met, "` ~ group")), data = log_data)
})
names(levene_results) <- metabolites

map_dfr(names(levene_results),
        ~ tibble(metabolite = .x,
                 Pr = levene_results[[.x]]["Pr(>F)"][[1]][1]  )) # 提取 Pr(>F)

## # A tibble: 17 × 2
##    metabolite     Pr
##    <chr>       <dbl>
##  1 TwMCA      0.276 
##  2 TaMCA      0.0943
##  3 TbMCA      0.483 
##  4 GbMCA      0.961 
##  5 TUDCA      0.200 
##  6 THDCA      0.106 
##  7 TCA        0.156 
##  8 wMCA       0.297 
##  9 bMCA       0.470 
## 10 GCA        0.314 
## 11 TCDCA      0.0843
## 12 UDCA       0.193 
## 13 TDCA       0.0905
## 14 CA         0.548 
## 15 CDCA       0.171 
## 16 DCA        0.717 
## 17 LCA        0.158

#Pr(>F) ≥ 0.05→ 不能拒绝零假设，认为各组方差齐性，可以安全使用普通 t 检验或 ANOVA
#< 0.05→ 拒绝零假设，认为各组方差不齐，考虑使用 Welch's t 检验 或 非参数检验

#正态+方差齐：独立样本 t 检验
#正态+方差不齐：Welch's t 检验 (t.test(..., var.equal = FALSE))
#非正态：Mann-Whitney U 检验

library(tidyverse)
#  整理正态性结果
norm_tbl <- normality_results %>%               # 原来的 list
  imap_dfr(~{                                 # .x = list(aged, young), .y = 代谢物名
    aged_p  <- .x$aged$p.value
    young_p <- .x$young$p.value
    tibble(metabolite = .y,
           norm_aged  = aged_p,
           norm_young = young_p)
  })

# 整理方差齐性结果
levene_tbl <- levene_results %>% 
  imap_dfr(~ tibble(metabolite = .y,
                    levene_p   = .x["Pr(>F)"][1, 1]))

# 合并
all_res <- norm_tbl %>% left_join(levene_tbl, by = "metabolite")

# 筛选正态性检验中 aged、young两组的p.value > 0.05，且方差齐性检验中 Pr(>F) > 0.05的代谢物名字向量
qualified_met <- all_res %>% 
  filter(norm_aged > 0.05,
         norm_young > 0.05,
         levene_p   > 0.05) %>% 
  pull(metabolite)

non_qualified_met <- setdiff(metabolites, qualified_met)



#### 2.3 t-test + FDR ####
# 拆分组数据
log_data1<-log_data %>% 
  dplyr::select(all_of(qualified_met), group)

met_cols <- setdiff(names(log_data1), "group")
grouped <- split(log_data1, group_info)

# t检验 map_dfr属于purrr包
ttest_results <- map_dfr(met_cols, # 遍历每一个代谢物名字
                         function(metabolite) { # 对每个代谢物执行：
                           aged_vals <- grouped$aged[[metabolite]] # aged 组的该代谢物值
                           young_vals <- grouped$young[[metabolite]] # young 组的该代谢物值
                           
                           test <- t.test(aged_vals, young_vals) # 独立样本 t 检验
                           
                           data.frame( # 返回一行结果
                             Metabolite = metabolite,
                             log2FC = mean(aged_vals) - mean(young_vals) ,#如果前面没有数据转换就取log：log2(mean(aged_vals) / mean(young_vals))
                             p_value = test$p.value
                           )
                         }) %>%
  mutate(FDR = p.adjust(p_value, method = "fdr")) #多重检验校正

# 查看前几行
print(ttest_results)

##   Metabolite      log2FC   p_value       FDR
## 1      TwMCA  0.35766163 0.6825450 0.9255022
## 2      TaMCA -0.22933908 0.8003824 0.9255022
## 3      TbMCA -0.74085599 0.4396976 0.9255022
## 4        TCA  0.16194032 0.8575443 0.9255022
## 5      TCDCA  0.07272629 0.9255022 0.9255022
## 6       CDCA  0.47112573 0.7143222 0.9255022
## 7        LCA  0.73746833 0.4174810 0.9255022

#### 2.4 Mann-Whitney U 检验 ####
log_data2<-log_data %>% 
  dplyr::select(all_of(non_qualified_met), group)
grouped <- split(log_data1, group_info)

# 提取代谢物列名
met_cols2 <- colnames(log_data2)[colnames(log_data2) != "group"]

# 批量做 Wilcoxon 检验（Mann-Whitney U）
wilcox_results <- map_dfr(met_cols2, function(met) {
  aged_vals <- log_data2[[met]][log_data2$group == "aged"]
  young_vals <- log_data2[[met]][log_data2$group == "young"]
  test <- wilcox.test(aged_vals, young_vals) #检验方法
  data.frame(
    Metabolite = met,
    median_aged = median(aged_vals, na.rm = TRUE),
    median_young = median(young_vals, na.rm = TRUE),
    p_value = test$p.value
  )
}) %>%
  mutate(FDR = p.adjust(p_value, method = "fdr"))

## Warning in wilcox.test.default(aged_vals, young_vals): cannot compute exact
## p-value with ties

#  报cannot compute exact p-value with ties，说明数据中存在相同的观测值，因此wilcox.test() 无法使用精确算法计算 p 值
# 样本量较大时（每组 n > 20–30），近似结果与精确值几乎无差别，可直接忽略警告
# 否则，可以用Monte-Carlo 模拟
wilcox_results <- map_dfr(met_cols2, function(met) {
  aged_vals  <- log_data2[[met]][log_data2$group == "aged"]
  young_vals <- log_data2[[met]][log_data2$group == "young"]
  # Monte-Carlo 模拟，1 万次
  test <- wilcox.test(aged_vals, young_vals,
                      exact = FALSE,           # 关闭精确法
                      simulate.p.value = TRUE, # 启用 Monte-Carlo
                      B = 10000)               # 模拟次数
  data.frame(
    Metabolite   = met,
    median_aged  = median(aged_vals,  na.rm = TRUE),
    median_young = median(young_vals, na.rm = TRUE),
    p_value      = test$p.value
  )
}) %>%
  mutate(FDR = p.adjust(p_value, method = "fdr"))

print(wilcox_results)

##    Metabolite median_aged median_young    p_value       FDR
## 1       GbMCA    6.486393     6.488053 0.96162657 0.9616266
## 2       TUDCA    7.254462     6.283231 0.26847209 0.3835316
## 3       THDCA    7.407183     6.125423 0.09219360 0.2247024
## 4        wMCA    8.941018     7.280636 0.07504923 0.2247024
## 5        bMCA    9.403843     8.333282 0.13583337 0.2263889
## 6         GCA    5.770036     5.492901 0.47048642 0.5227627
## 7        UDCA    6.235727     6.405360 0.31202474 0.3900309
## 8        TDCA    9.087357     7.436421 0.09219360 0.2247024
## 9          CA    9.893575     7.894435 0.07504923 0.2247024
## 10        DCA   11.644460    11.526048 0.11235120 0.2247024

3.单个代谢物的箱线图

library(ggpubr)
library(ggplot2)

## total BA barplot
log_datasum <- Sumimputed_data %>%
  mutate(across(where(is.numeric), ~ log2(. + 1)))
log_datasum$group<-group_info
log_datasum$group <- factor(log_data$group)

p2 <- ggbarplot(log_datasum,x = "group",y = "Total Bile Acids",
                fill = "group", ## 柱状图通常用 fill
                palette = c("#E64B35FF", "#4DBBD5FF"),
                alpha    = 0.3,
                add = "mean_sd") + # 可加均值误差线，或 "mean_se
  scale_y_continuous(limits = c(0, NA),      # 强制从 0 开始
                     expand = expansion(0)   # 去掉上下留白，防止原点被抬高
  ) +
  geom_jitter( aes(color = group), size  = 1.5, width = 0.2, alpha = 1) + #显示原始数据点
  stat_compare_means(method = "wilcox.test",
                     label  = "p.format",      # 只显示 p 值
                     vjust  = -1)

p3 <- p2 +
  theme_classic(base_size = 13) +
  theme(
    axis.title.y = element_text(margin = margin(r = 8)),  # 标题离轴远点
    axis.ticks.length = unit(-2, "pt")                    # 刻度朝内
  )
print(p3)

#批量画图
out_dir <- "output/boxplots_BA_serum"
dir.create(out_dir, showWarnings = FALSE)

# t检验的代谢物储存在met_cols，Wilcoxon检验的代谢物储存在met_cols2

#注意数字开头的列名在 ggpubr 的 aes 解析里会被当成“非法表达式”，需要转换
#用for循环给每个代谢物画一个分组箱线图，并用 stat_compare_means() 添加 t 检验的 p 值
for (met in met_cols) {
  ##先给数据框临时起个合法的列名
  tmp_col <- "tmp_met"               # 固定临时名
  log_data[[tmp_col]] <- log_data[[met]] #把数据框log_data里列名为met的那一列的所有值，复制到同一数据框里一个临时列tmp_col上
  
  #画图
  p <- ggboxplot(
    log_data,
    x = "group",
    y = tmp_col,
    color = "group",
    palette = c("#E64B35FF", "#4DBBD5FF"),
    add = "jitter"
  ) +
    stat_compare_means(method = "t.test") + 
    labs(y = paste0("log2 Conc. ", met, " (ng/mL)"))                    # 把 y 轴标签复原
  
  #保存
  ggsave(file.path(out_dir, paste0(met, ".pdf")),
         plot = p, width = 5, height = 4)
  
  #删除临时列，避免污染数据框
  log_data[[tmp_col]] <- NULL
}

#Wilcoxon检验的代谢物循环
for (met in met_cols2) {
  ##先给数据框临时起个合法的列名
  tmp_col <- "tmp_met"               # 固定临时名
  log_data[[tmp_col]] <- log_data[[met]] #把数据框log_data里列名为met的那一列的所有值，复制到同一数据框里一个临时列tmp_col上
  
  #画图
  p <- ggboxplot(
    log_data,
    x = "group",
    y = tmp_col,
    color = "group",
    palette = c("#E64B35FF", "#4DBBD5FF"),
    add = "jitter"
  ) +
    stat_compare_means(method = "wilcox.test") + #检验方法
    labs(y = paste0("log2 Conc. ", met, " (ng/mL)"))                    # 把 y 轴标签复原
  #保存
  ggsave(file.path(out_dir, paste0(met, ".pdf")),
         plot = p, width = 5, height = 4)
  
  #删除临时列，避免污染数据框
  log_data[[tmp_col]] <- NULL
}



#拼图
library(patchwork)

## Warning: package 'patchwork' was built under R version 4.4.3

library(ggpubr)

plots <- list()  # 用于保存所有图

# t 检验
for (met in met_cols) {
  tmp_col <- "tmp_met"
  log_data[[tmp_col]] <- log_data[[met]]
  
  p <- ggboxplot(
    log_data,
    x = "group",
    y = tmp_col,
    color = "group",
    palette = c("#E64B35FF", "#4DBBD5FF"),
    add = "jitter"
  ) +
    stat_compare_means(method = "t.test") +
    labs(y = paste0("log2 Conc. ", met, " (ng/mL)"), title = paste0(met, " (t-test)"))
  
  plots[[length(plots) + 1]] <- p  # 存入列表
  
  log_data[[tmp_col]] <- NULL
}

# Wilcoxon 检验
for (met in met_cols2) {
  tmp_col <- "tmp_met"
  log_data[[tmp_col]] <- log_data[[met]]
  
  p <- ggboxplot(
    log_data,
    x = "group",
    y = tmp_col,
    color = "group",
    palette = c("#E64B35FF", "#4DBBD5FF"),
    add = "jitter"
  ) +
    stat_compare_means(method = "wilcox.test") +
    labs(y = paste0("log2 Conc. ", met, " (ng/mL)"), title = paste0(met, " (Wilcoxon)"))
  
  plots[[length(plots) + 1]] <- p  # 存入列表
  
  log_data[[tmp_col]] <- NULL
}

# 拼图
combined_plot <- wrap_plots(plots, ncol = 6)  # 可调 ncol 或 nrow
print(combined_plot)

4.聚类分析：热图，用scaled_data

library(pheatmap)

## Warning: package 'pheatmap' was built under R version 4.4.3

scaled_data$group<-group_info
data_matrix <- as.matrix(scaled_data[, 1:17])  # 改成提取需要的数据矩阵
rownames(data_matrix) <- sample_ids

#样本注释
annotation_row <- data.frame(Group = scaled_data$group)
rownames(annotation_row) <- sample_ids

identical(rownames(data_matrix), rownames(annotation_row)) # TRUE

## [1] TRUE

# 设置颜色（可自定义）
ann_colors <- list(
  Group = c(aged = "orange", young = "steelblue")  #或aged = "#E64B35FF", young = "#4DBBD5FF"
)

pheatmap(
  data_matrix, #数值矩阵
  scale = "none",                # 数据已标准化，无需再次 scale
  annotation_row = annotation_row,
  annotation_colors = ann_colors,
  clustering_distance_rows = "euclidean",
  clustering_method = "complete",
  cluster_rows      = F,         # 不样本聚类
  cluster_cols      = T,     # 聚类代谢物列
  show_rownames = TRUE,
  show_colnames = TRUE, # 显示代谢物名
  fontsize = 10,
  main = "Heatmap of bile acids in serum"  #改标题
)

5.PLS-DA（Partial Least Squares Discriminant Analysis

library(mixOmics)

## Loading required package: MASS

## Warning: package 'MASS' was built under R version 4.4.3

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:patchwork':
## 
##     area

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.4.3

## 
## Loaded mixOmics 6.30.0
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us:  citation('mixOmics')

## 
## Attaching package: 'mixOmics'

## The following object is masked from 'package:purrr':
## 
##     map

# 准备输入数据
X <- log_data [,-ncol(log_data)]  # 数值型矩阵(如果以及是就不用删除列)
Y <- group_info   # 因变量（组别）

# 运行 PLS-DA
plsda_res <- plsda(X, Y, ncomp = 2)

# 可视化结果
plotIndiv(plsda_res,
          comp = c(1, 2),
          group = Y,
          ellipse = TRUE,
          legend = TRUE,
          title = 'PLS-DA: Aged vs Young')

# 提取 VIP 值
vip_scores <- vip(plsda_res)

# 查看每个变量的 VIP（按第一主成分排序）
vip_df <- data.frame(
  Metabolite = colnames(X),
  VIP = vip_scores[, 1]
) %>%
  arrange(desc(VIP))

head(vip_df, 10)  # 前10个重要变量

##       Metabolite       VIP
## UDCA        UDCA 1.7814826
## wMCA        wMCA 1.7098297
## TDCA        TDCA 1.6645763
## CA            CA 1.3788628
## GbMCA      GbMCA 1.1728850
## THDCA      THDCA 1.1024520
## LCA          LCA 1.0165454
## TbMCA      TbMCA 0.9358815
## bMCA        bMCA 0.8577293
## TwMCA      TwMCA 0.5098213

# 可视化 Top VIP 变量
top_vip <- vip_df %>%
  slice(1:20)  # Top 20

ggplot(top_vip, aes(x = reorder(Metabolite, VIP), y = VIP)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "VIP Metabolites (PLS-DA)",
       x = "Metabolite", y = "VIP Score") +
  theme_minimal()

BA_analysis

Chen

2025-09-07

Note: Using the serum data as an example, the analysis of intestine contents, feces and liver samples only require adjustment of some parameters.

1.BA composition (BAI and pie chart)

2.单变量差异分析

3.单个代谢物的箱线图

4.聚类分析：热图，用scaled_data

5.PLS-DA（Partial Least Squares Discriminant Analysis