# 1.BA composition (BAI and pie chart) ####
rm(list=ls())
####数据处理####
# 数据清洗
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
data<-read_excel("data/20250730_Zhou_BA.xlsx", sheet = 2) #不同数据更改sheet
data<-data[1:17,] #根据数据情况修改
colnames(data)
## [1] "serum" "Conc. (nmol/L)" "TwMCA"
## [4] "TaMCA" "TbMCA" "GbMCA"
## [7] "TUDCA" "THDCA" "GHCA"
## [10] "TCA" "GUDCA" "GHDCA"
## [13] "wMCA" "aMCA" "7keto_DCA"
## [16] "bMCA" "GCA" "MDCA"
## [19] "3Keto,7a,12a(OH)2" "TCDCA" "HCA"
## [22] "UDCA" "TDCA" "HDCA"
## [25] "CA" "GCDCA" "7keto_LCA"
## [28] "isoDCA" "12keto_LCA" "GDCA"
## [31] "TLCA" "CDCA" "GLCA"
## [34] "DCA" "isoLCA" "allo_isoLCA"
## [37] "3KetoLCA" "LCA" "CA-7-S"
## [40] "CDCA-3-S" "DCA-3-S" "LCA-3-S"
## [43] "UDCA-3-S" "CA-3-S" "NCA"
## [46] "NDCA"
sample_ids <- data$`serum` #根据情况改
data$group <- c(rep("aged", 9), rep("young", 8)) #根据情况修改
group_info <- data$group
colnames(data)
## [1] "serum" "Conc. (nmol/L)" "TwMCA"
## [4] "TaMCA" "TbMCA" "GbMCA"
## [7] "TUDCA" "THDCA" "GHCA"
## [10] "TCA" "GUDCA" "GHDCA"
## [13] "wMCA" "aMCA" "7keto_DCA"
## [16] "bMCA" "GCA" "MDCA"
## [19] "3Keto,7a,12a(OH)2" "TCDCA" "HCA"
## [22] "UDCA" "TDCA" "HDCA"
## [25] "CA" "GCDCA" "7keto_LCA"
## [28] "isoDCA" "12keto_LCA" "GDCA"
## [31] "TLCA" "CDCA" "GLCA"
## [34] "DCA" "isoLCA" "allo_isoLCA"
## [37] "3KetoLCA" "LCA" "CA-7-S"
## [40] "CDCA-3-S" "DCA-3-S" "LCA-3-S"
## [43] "UDCA-3-S" "CA-3-S" "NCA"
## [46] "NDCA" "group"
data_mat <- data %>% dplyr::select(-group,-`Conc. (nmol/L)`, -`serum`) # 去掉非数值列
rownames(data)<-sample_ids
## Warning: Setting row names on a tibble is deprecated.
# 过滤高缺失特征(阈值可调)
filtered_data <- data_mat %>% dplyr::select(where(~ mean(!is.na(.)) > 0.5)) # 保留缺失率 <50% 的代谢物
#缺失值填补(检测下限的一半½ LOD)
imputed_data <- as.matrix(filtered_data)
imputed_data[is.na(imputed_data) | imputed_data == 0] <- 0.15
imputed_data <- as.data.frame(imputed_data)
imputed_data <- imputed_data %>%
mutate(across(everything(), ~ as.numeric(.)))
Sumimputed_data<-rowSums(imputed_data, na.rm = TRUE)
Sumimputed_data<-as.data.frame(Sumimputed_data)
colnames(Sumimputed_data)<-"Total Bile Acids"
# 百分比
per_BA<-sweep(imputed_data, 1, # 1表示按行操作,2表示按列操作
Sumimputed_data$`Total Bile Acids`, #用于操作的统计值(这里是每行的总和)
"/") #操作函数(这里是除法 /)
# 计算BAI, Bile Acid Hydrophobicity Index
# BAI=1*LCA+0.86*DCA+0.55*CDCA+0.39*CA-0.04*UDCA+0.18*GCA+0.17*TCA+0.38*GDCA+0.29*TDCA+0.27*GCDCA+0.15*TCDCA-0.26*TUDCA
weights <- c(
LCA = 1,
DCA = 0.86,
CDCA = 0.55,
CA = 0.39,
UDCA = -0.04,
GCA = 0.18,
TCA = 0.17,
GDCA = 0.38,
TDCA = 0.29,
GCDCA = 0.27,
TCDCA = 0.15,
TUDCA = -0.26
)
weight_tbl <- tibble(
metabolite = names(weights),
weight = as.numeric(weights)
)
BAI_df <- per_BA %>%
mutate(Sample = row_number()) %>% # 如果已有 SampleID 列,这里替换
pivot_longer(
cols = -Sample,
names_to = "metabolite",
values_to = "value"
) %>%
left_join(weight_tbl, by = "metabolite") %>%
mutate(
contribution = ifelse(is.na(weight), 0, value * weight)
) %>%
group_by(Sample) %>%
summarise(BAI = sum(contribution, na.rm = TRUE), .groups = "drop")
BAI_df$Sample<-sample_ids
print(BAI_df)
## # A tibble: 17 × 2
## Sample BAI
## <dbl> <dbl>
## 1 9027 0.223
## 2 9028 0.317
## 3 9030 0.356
## 4 9031 0.264
## 5 9032 0.376
## 6 9033 0.434
## 7 9034 0.306
## 8 9035 0.258
## 9 9036 0.227
## 10 9193 0.474
## 11 9194 0.299
## 12 9195 0.127
## 13 9196 0.172
## 14 9197 0.334
## 15 9198 0.530
## 16 9199 0.389
## 17 9200 0.249
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.4.3
library(forcats)
#配色
library(RColorBrewer)
cols <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
"#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
brewer.pal(12, "Set3"),
brewer.pal(8 , "Set2"),
brewer.pal(9 , "Pastel1"),
brewer.pal(11, "Paired"),
brewer.pal(8 , "Accent"))
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
show_col(cols, labels = TRUE)#查看配色
# 行为样本可运行如下代码
plot_dat <- per_BA %>%
mutate(Sample = sample_ids) %>% #加样本ID
mutate(Group=group_info) %>% #加分组
pivot_longer(
cols = -c(Sample, Group),
names_to = "BA",
values_to = "Value"
) %>%
group_by(Sample) %>%
mutate(Share = Value / sum(Value, na.rm=TRUE)) %>%
ungroup()
# 每组有多个重复,求share的平均
plot_dat <- plot_dat %>% # 原始长表:Sample/Group/BA/Share
group_by(Group, BA) %>%
summarise(avg_Share = mean(Share, na.rm = TRUE), .groups = "drop")
# 在绘图前就把BA固定为一个全局一致的因子,否则每个分组里重新排序因子水平,会导致相同的 BA 在不同组里排序不同,颜色就乱了
# (可选)如果你想让颜色跟平均丰度的全局大小排序,可以这样运行下面五行
ba_levels <- plot_dat %>%
group_by(BA) %>%
summarise(total_share = sum(avg_Share), .groups = "drop") %>%
arrange(desc(total_share)) %>%
pull(BA)
plot_dat <- plot_dat %>%
mutate(BA = factor(BA, levels = ba_levels))
#用极坐标画饼图
pie_fun <-function(df, group_name, top_n = 5) {
tmp <- df %>%
filter(Group == group_name) %>% #取当前分组
arrange(desc(avg_Share)) %>% #计算每组平均share 再按占比降序排
mutate(rank = row_number(desc(avg_Share)),
label = ifelse(rank <= top_n, #前n名显示标签:“代谢物,百分比”
paste0(BA, ",", sprintf("%.2f%%", avg_Share * 100)),
""),
fraction = avg_Share / sum(avg_Share), # 归一化,保证总和=1
ymax = cumsum(fraction),
ymin = lag(ymax, default = 0), # 每个扇形的“下边界”
mid = (ymin + ymax) / 2, # 中心位置
angle = 360 * mid, # 极坐标角度
hjust = ifelse(angle > 180, 1, 0), # 文本水平对齐
angle_text = ifelse(angle > 180, angle - 180, angle) # 文本旋转,避免倒写
)
ggplot(tmp, aes(ymax = ymax, ymin = ymin, xmax = 1, xmin = 0,
fill =BA)) +
geom_rect(color = "white") + # 画扇形(白色边框)
coord_polar(theta = "y") + # 把 y 轴弯成极坐标 → 变成饼图
geom_text(aes(x = 1.1, #把标签放到半径 1.1的位置,即圆饼外侧,可以修改
y = mid, label = label),
angle = 0, #标签文字水平;如需随扇形旋转,可改成 angle = angle_text
hjust = 0.5, # 0左对齐,0.5居中
size = 3.2,
inherit.aes = FALSE
) +
scale_fill_manual(values = cols) + #配色
labs(title = group_name, fill = NULL) +
theme_void() + #去掉坐标轴
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "right",
legend.text = element_text(size = 7)
)
}
# 绘图与拼图
library(purrr)
groups <- c("aged","young") #根据自己情况修改
pie_list <- purrr::map( # 循环
groups, ~ pie_fun(plot_dat, .x))
combined <- ggarrange(plotlist = pie_list,
ncol = 2, nrow = 1,
common.legend = TRUE, legend = "right")
# 添加标题
combined <- annotate_figure(
combined,
top = text_grob("Bile acids relative abundance (serum)", size = 16, face = "bold")
)
print(combined)
rmarkdown::render(“report.Rmd”)