remove(list = ls())
setwd("C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-19-practice6/6")

# 加载必要的R包
library(limma)
library(edgeR)
library(pheatmap)


# 1. 读取数据
# 基因表达矩阵和样本分组文件
expr_matrix <- read.csv("TCGA_LIHC_Tpm_346T50N.csv", row.names = 1, header = TRUE)
sample_group <- read.csv("Sample_Stage_Stage346N50.csv")

# 检查数据维度是否匹配
head(expr_matrix[, 1:5])
##          TCGA.FV.A3I0.01 TCGA.DD.A3A6.01 TCGA.BD.A3ER.01 TCGA.CC.5261.01
## TSPAN6             16693             783            3056            4785
## DPM1                1419             279             633            1908
## SCYL3                543             202             397             975
## C1orf112             132              26              98             451
## FGR                  188             412             237             326
## CFH                  644            1848          112525           28847
##          TCGA.DD.AAVZ.01
## TSPAN6              7408
## DPM1                1010
## SCYL3                885
## C1orf112             279
## FGR                  105
## CFH               102375
head(sample_group)
##        Samples_ID  Stage
## 1 TCGA-FV-A3I0-01 Stage2
## 2 TCGA-DD-A3A6-01 Stage2
## 3 TCGA-BD-A3ER-01 Stage2
## 4 TCGA-CC-5261-01 Stage2
## 5 TCGA-DD-AAVZ-01 Stage1
## 6 TCGA-DD-AADN-01 Stage1
# 确保 Stage 列是因子类型
sample_group$Stage <- factor(sample_group$Stage, levels = c("Normal", "Stage1", "Stage2", "Stage3", "Stage4"))

# 2. 数据预处理
# 确保样本名称一致,并按样本名称匹配
colnames(expr_matrix) <- gsub("\\.", "-", colnames(expr_matrix)) # 确保样本名格式一致,这里需要将 '.' 替换为 '-'
rownames(sample_group) <- sample_group$Samples_ID

# 提取 expr_matrix 中存在的样本
existing_samples <- intersect(colnames(expr_matrix), sample_group$Samples_ID)
expr_matrix <- expr_matrix[, existing_samples]
sample_group <- sample_group[sample_group$Samples_ID %in% existing_samples, ]

# 创建 DGE对象,用于存储差异表达分析所需的数据
dge <- DGEList(counts = expr_matrix, group = sample_group$Stage)

# 过滤低表达基因(CPM值较低的基因过滤),低表达阈值设置:筛选出在两个样本以上的CPM值均大于1的基因
keep_gene <- rowSums(cpm(dge) > 1) >= 2
dge <- dge[keep_gene, , keep.lib.sizes = FALSE]  # 保留符合条件的基因

# TMM标准化(校正测序深度(默认情况下calcNormFactors 使用 TMM算法来计算归一化因子))
dge <- calcNormFactors(dge)

# 定义设计矩阵
design <- model.matrix(~0 + Stage, data = sample_group)
colnames(design) <- levels(sample_group$Stage)

# 创建对比组
contrasts <- makeContrasts(
  Stage1_vs_Normal = Stage1 - Normal,
  Stage2_vs_Normal = Stage2 - Normal,
  Stage3_vs_Normal = Stage3 - Normal,
  Stage4_vs_Normal = Stage4 - Normal,
  levels = design
)

# 差异基因分析:使用 edgeR 对 Stage 1、2、3 与 Normal 组进行差异分析。
# 使用 edgeR 对 Stage 4 进行初步分析,并用 Wilcoxon 检验作为辅助验证。
# 定义函数:差异分析并导出结果
run_edger_analysis <- function(dge, contrast, design, group_name) {
  # 估算三种离散度
  dge <- estimateGLMCommonDisp(dge, design)
  dge <- estimateGLMTrendedDisp(dge, design)
  dge <- estimateGLMTagwiseDisp(dge, design)
  
  # 拟合广义线性模型
  fit <- glmFit(dge, design)
  
  # 进行似然比检验
  lrt <- glmLRT(fit, contrast = contrast)
  
  # 提取显著差异基因
  deg <- topTags(lrt, n = Inf)$table
  deg <- deg[deg$FDR < 0.05 & abs(deg$logFC) > 1.5, ]
  
  # 导出 CSV 文件
  output_file <- paste0("DEG_", group_name, ".csv")
  write.csv(deg, output_file, row.names = TRUE)
  
  return(deg)
}

# 分别对 Stage 1、2、3 进行差异分析
deg_stage1 <- run_edger_analysis(dge, contrasts[,"Stage1_vs_Normal"], design, "Stage1_vs_Normal")
deg_stage2 <- run_edger_analysis(dge, contrasts[,"Stage2_vs_Normal"], design, "Stage2_vs_Normal")
deg_stage3 <- run_edger_analysis(dge, contrasts[,"Stage3_vs_Normal"], design, "Stage3_vs_Normal")

# 对 Stage 4 使用 edgeR 初步筛选
# 估算三种离散度
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

# 拟合广义线性模型
fit <- glmFit(dge, design)

# 进行似然比检验
lrt <- glmLRT(fit, contrast = contrasts[,"Stage4_vs_Normal"])
# 提取显著差异基因
deg_stage4 <- topTags(lrt, n = Inf)$table
deg_stage4 <- deg_stage4[deg_stage4$FDR < 0.05 & abs(deg_stage4$logFC) > 1.5, ]

# 使用 Wilcoxon 检验对 Stage 4 进行辅助验证
normalized_matrix <- cpm(dge, log = TRUE)
group <- sample_group$Stage

# 初始化 results 数据框,确保行名与 deg_stage4 一致
results <- data.frame(Gene = rownames(deg_stage4),
                      p_value = NA,
                      logFC = NA,
                      stringsAsFactors = FALSE)

# 遍历 deg_stage4 的基因
for (i in seq_len(nrow(deg_stage4))) {
  gene <- rownames(deg_stage4)[i] # 逐一获取基因名
  
  # 确保基因存在于 normalized_matrix
  if (gene %in% rownames(normalized_matrix)) {
    gene_expr <- normalized_matrix[gene, ] # 获取该基因的表达矩阵
    
    # 确保 group 分组正常
    group1 <- gene_expr[group == "Normal"]
    group2 <- gene_expr[group == "Stage4"]
    
    # 检查 group1 和 group2 是否存在 NA 值
    if (!any(is.na(group1)) && !any(is.na(group2))) {
      test <- wilcox.test(group1, group2) # 进行 Wilcoxon 检验
      logFC <- mean(group2) - mean(group1) # 计算 logFC
      
      # 赋值到 results 的对应行
      results$p_value[i] <- test$p.value
      results$logFC[i] <- logFC
    }
  }
}

# 检查结果
head(results)
##      Gene      p_value    logFC
## 1   GABRD 0.0002673372 5.385393
## 2   PLVAP 0.0002674056 3.355653
## 3   HAGLR 0.0003305184 5.356068
## 4 COL15A1 0.0004677046 4.121443
## 5    ZIC2 0.0011596868 4.382585
## 6 OLFML2B 0.0002674056 4.046936
# 调整 p 值(FDR 校正)
results$adj_p_value <- p.adjust(results$p_value, method = "BH")
# 筛选显著差异基因
wilcox_deg <- results[results$adj_p_value < 0.05 & abs(results$logFC) > 1.5, ]
# 导出显著差异基因为 CSV 文件
write.csv(wilcox_deg, "Wilcoxon_DEG_Stage4_vs_Normal.csv", row.names = FALSE)

# 最终 Stage 4 差异基因
final_deg_stage4 <- wilcox_deg
# 将第一列 Gene 设置为行名
rownames(final_deg_stage4) <- final_deg_stage4$Gene
final_deg_stage4$Gene <- NULL  # 删除 Gene 列
write.csv(final_deg_stage4, "Final_DEG_Stage4_vs_Normal.csv")

# # 由于R报错关闭,重新读取文件final_deg_stage4作图
# setwd("C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-19-practice6/6")
# final_deg_stage4 <- read.csv("Final_DEG_Stage4_vs_Normal.csv", row.names = 1)

# 按照分组聚类
# 提取 top 10 上调和下调的基因,并按照 logFC 排序
top10_up <- head(final_deg_stage4[order(final_deg_stage4$logFC, decreasing = TRUE), ], 10)
top10_down <- head(final_deg_stage4[order(final_deg_stage4$logFC, decreasing = FALSE), ], 10)

# 合并基因列表,并确保按照 logFC 排序
top_genes <- c(rownames(top10_up), rownames(top10_down))

# 提取热图矩阵
heatmap_matrix <- normalized_matrix[top_genes, ]

# 根据 logFC 对行排序(top10_up 和 top10_down 分别排序)
heatmap_matrix <- heatmap_matrix[c(rownames(top10_up), rownames(top10_down)), ]

# 添加分组注释
annotation_col <- data.frame(Group = group)
rownames(annotation_col) <- colnames(heatmap_matrix)

# 设置列顺序为 (Normal, Stage1, Stage2, Stage3, Stage4)
# 这里通过分组信息重新排序列
sample_order <- c("Normal", "Stage1", "Stage2", "Stage3", "Stage4")
ordered_samples <- order(factor(annotation_col$Group, levels = sample_order))
heatmap_matrix <- heatmap_matrix[, ordered_samples]
annotation_col <- annotation_col[ordered_samples, , drop = FALSE]

# 绘制热图
pheatmap(heatmap_matrix, 
         cluster_rows = FALSE,  # 不对行(基因)进行聚类
         cluster_cols = FALSE,  # 不对列(样本)进行聚类
         annotation_col = annotation_col, 
         scale = "row", 
         show_rownames = TRUE, 
         show_colnames = FALSE,
         main = "Top 10 Upregulated and Downregulated Genes (Stage 4)")

# 如若按照无监督聚类,可使用下面的代码
# 提取 top 10 上调和下调的基因
top10_up <- head(final_deg_stage4[order(final_deg_stage4$logFC, decreasing = TRUE), ], 10)
top10_down <- head(final_deg_stage4[order(final_deg_stage4$logFC, decreasing = FALSE), ], 10)

# 合并基因列表
top_genes <- c(rownames(top10_up), rownames(top10_down))

# 提取热图矩阵
heatmap_matrix <- normalized_matrix[top_genes, ]

# 添加分组注释
annotation_col <- data.frame(Group = group)
rownames(annotation_col) <- colnames(heatmap_matrix)

# 绘制热图
pheatmap(heatmap_matrix, cluster_rows = TRUE, cluster_cols = TRUE,
         annotation_col = annotation_col, 
         scale = "row", show_rownames = TRUE, show_colnames = FALSE,
         main = "Top 10 Upregulated and Downregulated Genes (Stage 4)")