# 加载 ALL 数据集
library(ALL)
data(ALL)
# 筛选出 B 细胞样本和特定分子类型 BCR/ABL 或 NEG 的样本
bcell <- grep("^B", as.character(ALL$BT))
moltyp <- which(as.character(ALL$mol.biol) %in% c("BCR/ABL", "NEG"))
ALL_bcrneg <- ALL[, intersect(bcell, moltyp)]
# 将分子类型列转换为因子,以便进一步分析
ALL_bcrneg$mol.biol <- factor(ALL_bcrneg$mol.biol)
# 使用 genefilter 包进行过滤,去除变异度较低的基因
library(genefilter)
ALLfilt_bcrneg <- nsFilter(ALL_bcrneg, var.cutoff = 0.5)$eset # 过滤后保留变异度较高的基因
dim(ALLfilt_bcrneg) # 查看过滤后的数据维度
## Features Samples
## 4388 79
# 构建 KEGG 基因集合
library(GSEABase)
gsc <- GeneSetCollection(ALLfilt_bcrneg, setType = KEGGCollection()) # 从 KEGG 中创建基因集合
Am <- incidence(gsc) # 获取 KEGG 基因集合的发生矩阵
dim(Am) # 查看发生矩阵的维度
## [1] 221 2058
Am[1:5, 1:5] # 查看前几行和列
## 189_s_at 35894_at 36543_at 36781_at 37310_at
## 04610 1 1 1 1 1
## 00232 0 0 0 0 0
## 00983 0 0 0 0 0
## 01100 0 0 0 0 0
## 00380 0 0 0 0 0
# 筛选基因表达矩阵中与 KEGG 集合矩阵匹配的基因
nsF <- ALLfilt_bcrneg[colnames(Am),]
# 进行 t 检验,比较不同分子类型中的基因表达差异
rtt <- rowttests(nsF, "mol.biol")
rttStat <- rtt$statistic # 提取 t 检验的统计量
# 筛选出参与多个 KEGG 路径的基因集合,以确保路径较为显著
selectRows <- (rowSums(Am) > 10)
Am2 <- Am[selectRows,]
dim(Am2) # 查看筛选后的发生矩阵维度
## [1] 161 2058
# 计算 KEGG 基因集合中显著差异的 t 统计量,并进行标准化
tA <- as.vector(Am2 %*% rttStat) # 计算 t 统计量总和
tAadj <- tA / sqrt(rowSums(Am2)) # 标准化 t 统计量
names(tA) <- names(tAadj) <- rownames(Am2) # 为统计量添加 KEGG 基因集合名称
qqnorm(tAadj) # 绘制标准化 t 统计量的正态 Q-Q 图,以评估其分布
# 加载 KEGG 注释包,查找 KEGG 路径名称
library(KEGG.db)
smPw <- tAadj[tAadj < -5] # 筛选出具有显著差异的 KEGG 路径
smPw # 显示显著的 KEGG 路径及其 t 统计量
## 03040 03030 03010
## -5.199654 -6.328842 -9.537670
# 查看 KEGG 路径 ID 和名称映射表
head(toTable(KEGGPATHID2NAME))
## path_id path_name
## 1 map01100 Metabolic pathways
## 2 map01110 Biosynthesis of secondary metabolites
## 3 map01120 Microbial metabolism in diverse environments
## 4 map01200 Carbon metabolism
## 5 map01210 2-Oxocarboxylic acid metabolism
## 6 map01212 Fatty acid metabolism
# 将 KEGG 路径 ID 添加 "map" 前缀,查找对应路径名称
kegg_ids <- paste0("map", names(smPw))
pwName <- KEGGPATHID2NAME[kegg_ids] # 获取 KEGG 路径名称
toTable(pwName) # 查看显著 KEGG 路径的名称
## path_id path_name
## 1 map03040 Spliceosome
## 2 map03010 Ribosome
## 3 map03030 DNA replication
# 使用 KEGGmnplot 函数绘制基因表达差异的 KEGG 路径图
KEGGmnplot(names(smPw)[2], nsF, "hgu95av2", nsF$mol.biol, pch=16, col="darkblue")
## 41583_at 1376_at 35312_at 33252_at 981_at 982_at 40117_at 947_at
## BCR/ABL 6.962329 7.123143 8.532773 5.577960 4.945757 5.857321 7.667987 8.067253
## NEG 7.162829 7.237553 8.830642 5.725185 5.036721 6.098909 7.832980 8.405091
## 1884_s_at 786_at 1462_s_at 1470_at 1726_at 798_at 36898_r_at
## BCR/ABL 7.599851 5.278474 7.237885 7.426107 6.728127 5.055080 5.050921
## NEG 7.756538 5.412038 7.405221 7.729124 6.776125 5.246665 5.010045
## 1053_at 39269_at 1055_g_at 653_at 38481_at 1119_at 652_g_at
## BCR/ABL 4.953551 3.656517 5.398765 4.376620 6.379486 6.090589 5.388773
## NEG 5.102480 3.795749 5.599139 4.456762 6.460504 6.263615 5.514692
## 39086_g_at 35141_at 37646_at 38702_at 38397_at 35972_at
## BCR/ABL 6.600899 5.855209 6.058494 8.498555 7.982021 5.683269
## NEG 6.712780 5.985984 6.187735 8.463555 7.838099 5.921239
# 为 KEGG2heatmap 准备分组标签,并绘制显著 KEGG 路径的热图
sel <- as.integer(nsF$mol.biol) # 将分子类型转换为整数,便于颜色编码 1为BCR/ABL 2为NEG
KEGG2heatmap(names(smPw)[2], nsF, "hgu95av2",
col=colorRampPalette(c("white", "darkblue"))(256),
ColSideColors=c("black", "white")[sel]) # 绘制热图
# 加载所需的 R 包
library(gage) # gage 包用于基因集富集分析
library(pathview) # pathview 包用于通路可视化
# 加载基因表达数据集 gse16873
data(gse16873)
# 查看数据集的前几行数据,快速了解其结构
head(gse16873)
## HN_1 DCIS_1 HN_2 DCIS_2 HN_3 DCIS_3 HN_4
## 10000 6.765984 6.458339 6.921720 6.774493 7.010564 6.986779 6.958761
## 10001 6.339474 6.755342 7.177369 6.842597 7.392611 6.879474 6.296571
## 10002 6.591755 6.790304 6.735359 6.773255 6.700016 7.041881 6.586285
## 10003 6.822092 6.590539 6.508452 6.411859 6.575640 6.470913 6.896886
## 100048912 7.356051 7.311144 7.385513 7.333481 7.392233 7.428623 7.314579
## 10004 6.941935 6.854373 6.883973 6.833695 6.855043 6.856864 7.021072
## DCIS_4 HN_5 DCIS_5 HN_6 DCIS_6
## 10000 6.888199 6.949912 6.948589 7.220427 7.070159
## 10001 6.130034 7.831222 7.942344 7.665248 7.799255
## 10002 6.501011 6.884931 7.652490 7.342505 7.500791
## 10003 6.848872 6.615143 6.407087 6.618174 6.651619
## 100048912 7.362658 7.370044 7.397249 7.383196 7.437643
## 10004 7.051310 6.767242 6.775276 6.873855 6.805247
# 显示数据集的列名,帮助确认样本信息
colnames(gse16873)
## [1] "HN_1" "DCIS_1" "HN_2" "DCIS_2" "HN_3" "DCIS_3" "HN_4" "DCIS_4"
## [9] "HN_5" "DCIS_5" "HN_6" "DCIS_6"
# 定义对照组和处理组的索引
# control 为奇数索引 (1, 3, 5, ...),即前 12 个样本中的 6 个样本
control <- (1:6)*2 - 1
# treat 为偶数索引 (2, 4, 6, ...),即前 12 个样本中的另外 6 个样本
treat <- (1:6)*2
# 加载 KEGG 基因集数据
data(kegg.gs)
# 查看 KEGG 基因集的前几个子集,了解每个子集的结构和包含的基因
lapply(kegg.gs[1:3], head)
## $`hsa00010 Glycolysis / Gluconeogenesis`
## [1] "10327" "124" "125" "126" "127" "128"
##
## $`hsa00020 Citrate cycle (TCA cycle)`
## [1] "1431" "1737" "1738" "1743" "2271" "3417"
##
## $`hsa00030 Pentose phosphate pathway`
## [1] "2203" "221823" "226" "229" "22934" "230"
# 使用 gage 函数对 gse16873 数据进行基因集富集分析
# 参数说明:
# gsets = kegg.gs 指定 KEGG 基因集
# ref = control 指定对照组
# samp = treat 指定处理组
gse16873.kegg.p <- gage(gse16873, gsets = kegg.gs, ref = control, samp = treat)
# 显示富集分析结果的列表名称,包括“greater”和“less”列表
# “greater” 表示基因集在处理组中上调富集的结果
names(gse16873.kegg.p)
## [1] "greater" "less" "stats"
# 查看上调富集结果的前几行,了解显著的 KEGG 通路
head(gse16873.kegg.p$greater)
## p.geomean stat.mean
## hsa04141 Protein processing in endoplasmic reticulum 0.0002164597 3.517131
## hsa00190 Oxidative phosphorylation 0.0014970021 2.848815
## hsa03050 Proteasome 0.0047708296 2.631099
## hsa04142 Lysosome 0.0037176143 2.541611
## hsa03060 Protein export 0.0186201404 2.118436
## hsa04145 Phagosome 0.0145567304 1.996678
## p.val q.val
## hsa04141 Protein processing in endoplasmic reticulum 9.236559e-18 1.477850e-15
## hsa00190 Oxidative phosphorylation 3.279350e-12 2.623480e-10
## hsa03050 Proteasome 2.108534e-10 1.124551e-08
## hsa04142 Lysosome 4.027638e-10 1.611055e-08
## hsa03060 Protein export 4.404710e-07 1.409507e-05
## hsa04145 Phagosome 6.258738e-07 1.668997e-05
## set.size DCIS_1
## hsa04141 Protein processing in endoplasmic reticulum 144 1.909626e-05
## hsa00190 Oxidative phosphorylation 97 3.475070e-05
## hsa03050 Proteasome 39 8.947511e-04
## hsa04142 Lysosome 108 9.028336e-03
## hsa03060 Protein export 18 9.442471e-04
## hsa04145 Phagosome 132 7.176237e-02
## DCIS_2 DCIS_3
## hsa04141 Protein processing in endoplasmic reticulum 4.500862e-06 4.338235e-03
## hsa00190 Oxidative phosphorylation 2.149291e-04 2.665382e-01
## hsa03050 Proteasome 3.504204e-03 2.374528e-02
## hsa04142 Lysosome 6.278064e-02 4.882085e-06
## hsa03060 Protein export 1.365472e-02 5.193202e-02
## hsa04145 Phagosome 2.369194e-01 4.056498e-05
## DCIS_4 DCIS_5
## hsa04141 Protein processing in endoplasmic reticulum 0.0006625566 0.0008103988
## hsa00190 Oxidative phosphorylation 0.0060485163 0.0019291992
## hsa03050 Proteasome 0.0109516651 0.0005634787
## hsa04142 Lysosome 0.0776453319 0.0007493152
## hsa03060 Protein export 0.0462144533 0.0095604241
## hsa04145 Phagosome 0.0666306128 0.0033630990
## DCIS_6
## hsa04141 Protein processing in endoplasmic reticulum 0.0005137881
## hsa00190 Oxidative phosphorylation 0.0004844961
## hsa03050 Proteasome 0.0256647373
## hsa04142 Lysosome 0.0163971080
## hsa03060 Protein export 0.1408766864
## hsa04145 Phagosome 0.0615631576
# 计算处理组和对照组的基因表达差异,生成差异矩阵
gse16873.d <- gse16873[, treat] - gse16873[, control]
head(gse16873.d)
## DCIS_1 DCIS_2 DCIS_3 DCIS_4 DCIS_5
## 10000 -0.30764480 -0.14722769 -0.023784808 -0.07056193 -0.001323087
## 10001 0.41586805 -0.33477259 -0.513136907 -0.16653712 0.111122223
## 10002 0.19854925 0.03789588 0.341865341 -0.08527420 0.767559264
## 10003 -0.23155297 -0.09659311 -0.104727283 -0.04801404 -0.208056443
## 100048912 -0.04490724 -0.05203146 0.036390376 0.04807823 0.027205816
## 10004 -0.08756237 -0.05027725 0.001821133 0.03023835 0.008034394
## DCIS_6
## 10000 -0.15026813
## 10001 0.13400734
## 10002 0.15828609
## 10003 0.03344448
## 100048912 0.05444739
## 10004 -0.06860749
# 指定要可视化的通路 ID,包含通路名称用于识别
path.ids = c("hsa04110 Cell cycle", "hsa00020 Citrate cycle (TCA cycle)")
# 从通路 ID 中提取编号部分,忽略通路名称
path.ids2 <- substr(path.ids, 4, 8)
# 使用 sapply 函数循环遍历每个提取后的通路编号(path.ids2)
# 在每次循环中调用 pathview 函数对特定通路进行可视化,生成图片
# 参数说明:
# gene.data = gse16873.d[,1:2] 指定使用的基因数据(差异矩阵的前两列)
# pathway.id = pid 指定当前通路的 ID
# species = "hsa" 指定物种为人类
# KEGG API 当下可能访问错误
pv.out.list <- sapply(path.ids2, function(pid) pathview(gene.data = gse16873.d[,1:2], pathway.id = pid, species = "hsa"))
# Graphviz 视图:在非 KEGG 原生视图的设置下重新生成通路图
# 通过设置 `kegg.native=F`,可以使用基于 Graphviz 的绘图方式进行可视化
# sign.pos = "bottomleft" 参数指定了显著性标志在图片左下角的位置
pv.out.list <- sapply(path.ids2, function(pid) pathview(gene.data = gse16873.d[,1:2], pathway.id = pid, species = "hsa", kegg.native=F, sign.pos="bottomleft"))
## [,1] [,2]
## [1,] "9" "300"
## [2,] "9" "306"