setwd("C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-12-practice5_Proteomic")

# 读取CSV文件
proteomics_data <- read.csv("proteomics_data.csv", row.names = 1)
sample_group <- read.csv("sample_group.csv")


# 加载必要的库
library(ggplot2)   # 绘制火山图
library(dplyr)     # 数据操作
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggrepel)   # 用于标注火山图上的点
library(tidyr)     # 数据整理

# 提取BPH和TA1组数据
bph_samples <- sample_group %>% filter(Group == "BPH") %>% pull(Sample_ID)  # BPH组样本名
ta1_samples <- sample_group %>% filter(Group == "TA1") %>% pull(Sample_ID)  # TA1组样本名

bph_data <- proteomics_data[, bph_samples]  # 提取BPH组的蛋白表达数据
ta1_data <- proteomics_data[, ta1_samples]  # 提取TA1组的蛋白表达数据

# 计算每个蛋白在BPH和TA1中的平均表达量
mean_bph <- rowMeans(bph_data)  # BPH组每个蛋白的平均表达量
mean_ta1 <- rowMeans(ta1_data)  # TA1组每个蛋白的平均表达量

# 计算TA1与BPH之间的log2倍数变化和显著性(##已标准化)
log2fc <- log2(mean_ta1 / mean_bph)  # 计算log2 Fold Change (倍数变化)

# 假设进行t检验来计算p值(示意代码)
p_values <- sapply(1:nrow(proteomics_data), function(i) {
  t.test(as.numeric(bph_data[i, ]), as.numeric(ta1_data[i, ]))$p.value
})
adj_p <- p.adjust(p_values, method = "fdr")  # 多重假设检验校正p值

# 整理数据,构建火山图数据框
volcano_data <- data.frame(
  Protein = rownames(proteomics_data),
  log2FC = log2fc,
  p_adj = adj_p
)

# 筛选TA1中表达最高和最低的10个蛋白
top10_proteins <- volcano_data %>%
  arrange(desc(log2FC)) %>%  # 按log2FC降序排列
  slice(1:10)               # 取前10个表达最高的蛋白
print(top10_proteins)#打印表达最高10个蛋白
##           Protein     log2FC        p_adj
## SEPHS1     SEPHS1 0.08900012 1.406428e-22
## MARCKSL1 MARCKSL1 0.08390037 1.723207e-12
## AGR2         AGR2 0.08383702 4.915087e-14
## POSTN       POSTN 0.06799049 4.010546e-21
## MDH2         MDH2 0.06743333 4.875506e-18
## NHP2L1     NHP2L1 0.06530599 7.298131e-13
## GPNMB       GPNMB 0.06204851 1.710586e-08
## HSPD1       HSPD1 0.06012755 1.934845e-13
## FABP5       FABP5 0.05960344 1.461345e-05
## GNPNAT1   GNPNAT1 0.05756883 2.445818e-08
bottom10_proteins <- volcano_data %>%
  arrange(log2FC) %>%       # 按log2FC升序排列
  slice(1:10)               # 取前10个表达最低的蛋白
print(bottom10_proteins)#打印表达最低10个蛋白
##           Protein      log2FC        p_adj
## RABL3       RABL3 -0.09916715 1.353746e-06
## MFAP4       MFAP4 -0.07009186 2.696531e-08
## CNTN1       CNTN1 -0.06615588 1.384060e-11
## ARMC10     ARMC10 -0.05890787 3.858525e-03
## PSME4       PSME4 -0.05797133 1.848565e-03
## LGALS3BP LGALS3BP -0.05535989 6.469932e-07
## KRT5         KRT5 -0.05186479 6.946517e-11
## TIMP1       TIMP1 -0.04987104 1.007064e-12
## ZNF30       ZNF30 -0.04891809 2.207482e-04
## EEF1E1     EEF1E1 -0.04858284 5.776258e-08
# 合并这20个蛋白作为需要标记的点
highlight_proteins <- rbind(top10_proteins, bottom10_proteins)

# 为bottom10_proteins添加一个颜色标记列
volcano_data$Color <- ifelse(volcano_data$Protein %in% bottom10_proteins$Protein, 
                             "Bottom10", 
                             ifelse(volcano_data$Protein %in% top10_proteins$Protein, 
                                    "Top10", "Normal"))

# 绘制火山图
ggplot(volcano_data, aes(x = log2FC, y = -log10(p_adj))) +
  geom_point(aes(color = Color), alpha = 0.7) +  # 根据Color列定义颜色
  scale_color_manual(values = c("Top10" = "red", "Bottom10" = "blue", "Normal" = "grey")) +
  geom_label_repel(data = highlight_proteins, 
                   aes(label = Protein), 
                   color = "black", 
                   size = 3,
                   box.padding = 0.5,
                   point.padding = 0.5) +  # 标注蛋白名
  labs(title = "Volcano Plot: TA1 vs BPH",
       x = "Log2 Fold Change (TA1 / BPH)",
       y = "-Log10 Adjusted p-value") +
  theme_minimal()