setwd("C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-12-practice5_Proteomic")
# 读取CSV文件
proteomics_data <- read.csv("proteomics_data.csv", row.names = 1)
sample_group <- read.csv("sample_group.csv")
# 加载必要的库
library(ggplot2) # 绘制火山图
library(dplyr) # 数据操作
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggrepel) # 用于标注火山图上的点
library(tidyr) # 数据整理
# 提取BPH和TA1组数据
bph_samples <- sample_group %>% filter(Group == "BPH") %>% pull(Sample_ID) # BPH组样本名
ta1_samples <- sample_group %>% filter(Group == "TA1") %>% pull(Sample_ID) # TA1组样本名
bph_data <- proteomics_data[, bph_samples] # 提取BPH组的蛋白表达数据
ta1_data <- proteomics_data[, ta1_samples] # 提取TA1组的蛋白表达数据
# 计算每个蛋白在BPH和TA1中的平均表达量
mean_bph <- rowMeans(bph_data) # BPH组每个蛋白的平均表达量
mean_ta1 <- rowMeans(ta1_data) # TA1组每个蛋白的平均表达量
# 计算TA1与BPH之间的log2倍数变化和显著性(##已标准化)
log2fc <- log2(mean_ta1 / mean_bph) # 计算log2 Fold Change (倍数变化)
# 假设进行t检验来计算p值(示意代码)
p_values <- sapply(1:nrow(proteomics_data), function(i) {
t.test(as.numeric(bph_data[i, ]), as.numeric(ta1_data[i, ]))$p.value
})
adj_p <- p.adjust(p_values, method = "fdr") # 多重假设检验校正p值
# 整理数据,构建火山图数据框
volcano_data <- data.frame(
Protein = rownames(proteomics_data),
log2FC = log2fc,
p_adj = adj_p
)
# 筛选TA1中表达最高和最低的10个蛋白
top10_proteins <- volcano_data %>%
arrange(desc(log2FC)) %>% # 按log2FC降序排列
slice(1:10) # 取前10个表达最高的蛋白
print(top10_proteins)#打印表达最高10个蛋白
## Protein log2FC p_adj
## SEPHS1 SEPHS1 0.08900012 1.406428e-22
## MARCKSL1 MARCKSL1 0.08390037 1.723207e-12
## AGR2 AGR2 0.08383702 4.915087e-14
## POSTN POSTN 0.06799049 4.010546e-21
## MDH2 MDH2 0.06743333 4.875506e-18
## NHP2L1 NHP2L1 0.06530599 7.298131e-13
## GPNMB GPNMB 0.06204851 1.710586e-08
## HSPD1 HSPD1 0.06012755 1.934845e-13
## FABP5 FABP5 0.05960344 1.461345e-05
## GNPNAT1 GNPNAT1 0.05756883 2.445818e-08
bottom10_proteins <- volcano_data %>%
arrange(log2FC) %>% # 按log2FC升序排列
slice(1:10) # 取前10个表达最低的蛋白
print(bottom10_proteins)#打印表达最低10个蛋白
## Protein log2FC p_adj
## RABL3 RABL3 -0.09916715 1.353746e-06
## MFAP4 MFAP4 -0.07009186 2.696531e-08
## CNTN1 CNTN1 -0.06615588 1.384060e-11
## ARMC10 ARMC10 -0.05890787 3.858525e-03
## PSME4 PSME4 -0.05797133 1.848565e-03
## LGALS3BP LGALS3BP -0.05535989 6.469932e-07
## KRT5 KRT5 -0.05186479 6.946517e-11
## TIMP1 TIMP1 -0.04987104 1.007064e-12
## ZNF30 ZNF30 -0.04891809 2.207482e-04
## EEF1E1 EEF1E1 -0.04858284 5.776258e-08
# 合并这20个蛋白作为需要标记的点
highlight_proteins <- rbind(top10_proteins, bottom10_proteins)
# 为bottom10_proteins添加一个颜色标记列
volcano_data$Color <- ifelse(volcano_data$Protein %in% bottom10_proteins$Protein,
"Bottom10",
ifelse(volcano_data$Protein %in% top10_proteins$Protein,
"Top10", "Normal"))
# 绘制火山图
ggplot(volcano_data, aes(x = log2FC, y = -log10(p_adj))) +
geom_point(aes(color = Color), alpha = 0.7) + # 根据Color列定义颜色
scale_color_manual(values = c("Top10" = "red", "Bottom10" = "blue", "Normal" = "grey")) +
geom_label_repel(data = highlight_proteins,
aes(label = Protein),
color = "black",
size = 3,
box.padding = 0.5,
point.padding = 0.5) + # 标注蛋白名
labs(title = "Volcano Plot: TA1 vs BPH",
x = "Log2 Fold Change (TA1 / BPH)",
y = "-Log10 Adjusted p-value") +
theme_minimal()
