# Project directory that holds proteomics_data.csv and sample_group.csv.
# The path is specific to the original author's machine, so only switch to it
# when it exists. Otherwise keep the current working directory, which lets the
# script run on another machine when it is started from the data folder.
project_dir <- "C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-12-practice5_Proteomic"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message(
    "Project directory not found; using current working directory: ",
    getwd()
  )
}


# 加载必要的库
library(ggplot2)  # 用于绘制PCA散点图
library(dplyr)    # 数据操作
# Note: attaching dplyr masks stats::filter() and stats::lag(), as well as
# base::intersect(), setdiff(), setequal() and union().
# Load the input data ----
# Protein expression matrix; the first CSV column becomes the row names.
# The PCA step later in the script treats rows as proteins and columns as
# samples.
proteomics_data <- read.csv("proteomics_data.csv", row.names = 1)

# Sample annotation table; it must provide the Sample_ID and Group columns.
sample_group <- read.csv("sample_group.csv")
stopifnot(all(c("Sample_ID", "Group") %in% colnames(sample_group)))

# Force character IDs: a factor Sample_ID (the read.csv() default before
# R 4.0) would index the matrix columns by integer code instead of by name.
sample_group$Sample_ID <- as.character(sample_group$Sample_ID)

# Samples that are annotated but absent from the matrix are dropped below;
# say so instead of losing them silently.
is_missing <- !(sample_group$Sample_ID %in% colnames(proteomics_data))
if (any(is_missing)) {
  warning(
    sum(is_missing), " sample(s) in sample_group.csv have no column in ",
    "proteomics_data.csv and are dropped: ",
    paste(sample_group$Sample_ID[is_missing], collapse = ", "),
    ". read.csv() may have altered the column names (check.names = TRUE).",
    call. = FALSE
  )
}

# Align sample order ----
# Keep only samples present in the expression matrix and sort them by group
# in the order BPH, TA1, TA2.
sample_group <- sample_group %>%
  filter(Sample_ID %in% colnames(proteomics_data)) %>%
  arrange(factor(Group, levels = c("BPH", "TA1", "TA2")))

if (nrow(sample_group) == 0) {
  stop(
    "No Sample_ID in sample_group.csv matches a column of ",
    "proteomics_data.csv.",
    call. = FALSE
  )
}

# Reorder the matrix columns to follow the annotation table. drop = FALSE
# keeps a data frame even when only one sample is left.
proteomics_data <- proteomics_data[, sample_group$Sample_ID, drop = FALSE]

# Row-wise Z-score ----
# PCA is sensitive to scale. The inner t() puts proteins in columns so that
# scale() centres and scales each protein across samples; the outer t()
# restores the protein x sample layout.
normalized_data <- t(scale(t(proteomics_data)))

# prcomp() rejects missing or infinite values. A row becomes non-finite when
# the protein has missing measurements or zero variance (division by sd = 0),
# so such rows are removed before the PCA.
usable_rows <- rowSums(!is.finite(normalized_data)) == 0
if (!all(usable_rows)) {
  message(
    sum(!usable_rows),
    " protein(s) with missing values or zero variance removed before PCA"
  )
}
normalized_data <- normalized_data[usable_rows, , drop = FALSE]

if (nrow(normalized_data) == 0 || ncol(normalized_data) < 2) {
  stop(
    "PCA needs at least two samples and one protein with complete, ",
    "non-constant values.",
    call. = FALSE
  )
}

# Run the PCA ----
# Transpose so that samples are rows and proteins are columns. The data are
# already scaled, hence scale. = FALSE.
pca_result <- prcomp(t(normalized_data), scale. = FALSE)

# Proportion of variance explained by each principal component
# (row 2 of the importance table).
explained_variance <- summary(pca_result)$importance[2, ]

# Assemble the plotting table ----
# Scores on the first two principal components (columns PC1 and PC2 of
# pca_result$x) plus each sample's group label and ID. The annotation columns
# are attached by position; the matrix columns were reordered to match
# sample_group earlier in the script.
pca_data <- data.frame(
  pca_result$x[, 1:2, drop = FALSE],
  Group = sample_group$Group,
  Sample_ID = sample_group$Sample_ID
)

# Draw the PCA scatter plot ----
# One point per sample, coloured by group. The axis labels report the
# percentage of variance explained by each component.
pca_plot <- ggplot(pca_data, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 1.5, alpha = 0.8) +
  labs(
    title = "PCA Plot of Proteomics Data",
    x = paste0(
      "PC1: ", round(explained_variance[1] * 100, 2), "% Variance Explained"
    ),
    y = paste0(
      "PC2: ", round(explained_variance[2] * 100, 2), "% Variance Explained"
    ),
    color = "Group"
  ) +
  theme_minimal() +
  # Fixed colour per group.
  scale_color_manual(
    values = c("BPH" = "#1F78B4", "TA1" = "#E31A1C", "TA2" = "#33A02C")
  )

# Print explicitly: a bare ggplot expression is not auto-printed when the
# script is run through source(), so the plot would never be drawn.
print(pca_plot)