数据准备(获取表达矩阵)

环境配置

library(GEOquery)
## 载入需要的程序包:Biobase
## 载入需要的程序包:BiocGenerics
## 
## 载入程序包:'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(ggplot2)
library(limma)
## 
## 载入程序包:'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(stringr)
library(WGCNA)
## 载入需要的程序包:dynamicTreeCut
## 载入需要的程序包:fastcluster
## 
## 载入程序包:'fastcluster'
## The following object is masked from 'package:stats':
## 
##     hclust
## 
## 
## 载入程序包:'WGCNA'
## The following object is masked from 'package:stats':
## 
##     cor
library(reshape2)
rm(list = ls())
options(stringsAsFactors=F)
enableWGCNAThreads()
## Allowing parallel execution with up to 7 working processes.

数据下载

gset<-getGEO('GSE68918',destdir = ".",AnnotGPL = F,getGPL = F)
## Found 1 file(s)
## GSE68918_series_matrix.txt.gz
## Using locally cached version: ./GSE68918_series_matrix.txt.gz
exp<-exprs(gset[[1]])
#获取表达矩阵
pdata<-pData(gset[[1]])
#获取临床信息

创建分组

group_list<-pdata$`agent:ch1`
group_list<-factor(group_list,levels = c("vehicle","10nME2"))

预处理数据

exp_l<-melt(exp) # 拆分数据
colnames(exp_l)=c('probe','sample','value')
exp_l$group=rep(group_list,each=nrow(exp))

p<-ggplot(exp_l,aes(x=sample,y=value,fill=group))+
geom_boxplot()+
ggtitle("before adjustment")+
theme(plot.title=element_text(hjust=0.5))+
theme(axis.text.x = element_text(angle = 90,hjust = 1,size = 7))
print(p)

#绘制处理前数据箱线图
exp=normalizeBetweenArrays(exp) #校正函数
exp_ll<-melt(exp)
colnames(exp_ll)<-c('probe','sample','value')
exp_ll$group<-rep(group_list,each=nrow(exp))

pp<-ggplot(exp_ll,aes(x=sample,y=value,fill=group))+
geom_boxplot()+
ggtitle("after adjustment")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x = element_text(angle = 90,hjust = 1,size = 7))
print(pp)

range(exp) #查看表达矩阵的取值范围,一般而言,范围在20以内的表达值基本已经经过了log对数转换
## [1]  5.157608 15.066452

探针名和基因名转换

1.手动

gpl<-getGEO('GPL10558',destdir = ".")

colnames(Table(gpl))

probe2gene=Table(gpl)[,c(1,x)] #x为需要的列(如gene symbol/entrez id...

probe2gene<-subset(probe2gene,GENE_SYMBOL!='')

ids<-probe2gene

2.使用devtools包里的idmap

library(devtools)
## 载入需要的程序包:usethis
library(idmap3)
## idmap3 v 0.1.0  welcome to use idmap3!
## If you use idmap3 in published research, please acknowledgements:
## We thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.
ids<-idmap3::get_pipe_IDs('GPL10558')

(使用哪一种看具体网络情况)

替换

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
exp <- as.data.frame(exp)

exp <- exp %>%
mutate(probe_id=rownames(exp)) %>%
  #为数据框新加一列
  inner_join(ids,by="probe_id") %>%
  #将 exp 数据框和 ids 数据框根据 probe_id 这一列进行内连接,保留匹配的行
  select(probe_id, symbol, everything())
#重新排列列的顺序,将 probe_id 和 symbol 两列放在前面,其他列用 everything() 保持其原有顺序
exp <- exp[!duplicated(exp$symbol),]
#去除重复
rownames(exp)<-exp$symbol
exp<-exp[,-(1:2)] #去掉第一二列

WGCNA分析

层次聚类树判断和去除离群样品

expr0<-t(exp)
sampleTree = hclust(dist(expr0), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")

expr0<-expr0[row.names(expr0)!="GSM1686444",]

选择softpower值

powers = c(c(1:10), seq(from = 12, to=30, by=2))

sft = pickSoftThreshold(expr0, powerVector=powers,
networkType="signed hybrid", verbose=3)
## pickSoftThreshold: will use block size 1632.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 1632 of 27410
##    ..working on genes 1633 through 3264 of 27410
##    ..working on genes 3265 through 4896 of 27410
##    ..working on genes 4897 through 6528 of 27410
##    ..working on genes 6529 through 8160 of 27410
##    ..working on genes 8161 through 9792 of 27410
##    ..working on genes 9793 through 11424 of 27410
##    ..working on genes 11425 through 13056 of 27410
##    ..working on genes 13057 through 14688 of 27410
##    ..working on genes 14689 through 16320 of 27410
##    ..working on genes 16321 through 17952 of 27410
##    ..working on genes 17953 through 19584 of 27410
##    ..working on genes 19585 through 21216 of 27410
##    ..working on genes 21217 through 22848 of 27410
##    ..working on genes 22849 through 24480 of 27410
##    ..working on genes 24481 through 26112 of 27410
##    ..working on genes 26113 through 27410 of 27410
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1    0.214 -0.827          0.636  4600.0   4450.00   7330
## 2      2    0.768 -1.260          0.914  2250.0   2050.00   4760
## 3      3    0.888 -1.350          0.958  1320.0   1090.00   3500
## 4      4    0.912 -1.370          0.957   865.0    641.00   2760
## 5      5    0.904 -1.390          0.942   610.0    401.00   2280
## 6      6    0.901 -1.380          0.936   453.0    263.00   1930
## 7      7    0.897 -1.370          0.934   350.0    178.00   1670
## 8      8    0.892 -1.360          0.931   279.0    125.00   1460
## 9      9    0.873 -1.360          0.916   227.0     90.10   1300
## 10    10    0.858 -1.360          0.906   189.0     66.40   1160
## 11    12    0.848 -1.340          0.899   137.0     38.00    958
## 12    14    0.845 -1.330          0.897   103.0     23.60    806
## 13    16    0.844 -1.310          0.894    80.8     15.40    690
## 14    18    0.844 -1.290          0.890    64.9     10.60    598
## 15    20    0.854 -1.270          0.894    53.3      7.62    524
## 16    22    0.900 -1.240          0.932    44.5      5.72    464
## 17    24    0.947 -1.200          0.972    37.7      4.39    414
## 18    26    0.945 -1.220          0.977    32.3      3.52    382
## 19    28    0.934 -1.250          0.975    28.0      2.88    358
## 20    30    0.931 -1.280          0.979    24.4      2.43    337

选择标准:1.SFT.R.sq>0.85

2.mean.k.(平均连通性)不太高也不太低,1到100之间比较好

(之前选择了power值为3去构建层次聚类树状图,但特别平坦,检查发现是平均连通性太高了。所以后面选了10为power值)

网络构建

net = blockwiseModules(expr0, power = 10, TOMType = "signed", minModuleSize = 50,reassignThreshold = 0, mergeCutHeight = 1,numericLabels = TRUE, pamRespectsDendro = FALSE, saveTOMs=TRUE, loadTOMs=TRUE,verbose = 3)
##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
##  ....pre-clustering genes to determine blocks..
##    Projective K-means:
##    ..k-means clustering..
##    ..merging smaller clusters...
## Block sizes:
## gBlocks
##    1    2    3    4    5    6 
## 4995 4974 4857 4530 4093 3961 
##  ..Working on block 1 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 1 into file blockwiseTOM-block.1.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##  ..Working on block 2 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 2 into file blockwiseTOM-block.2.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##  ..Working on block 3 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 3 into file blockwiseTOM-block.3.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 1 genes from module 1 because their KME is too low.
##  ..Working on block 4 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 4 into file blockwiseTOM-block.4.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 4 genes from module 2 because their KME is too low.
##      ..removing 1 genes from module 9 because their KME is too low.
##  ..Working on block 5 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 5 into file blockwiseTOM-block.5.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##  ..Working on block 6 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 6 into file blockwiseTOM-block.6.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 2 genes from module 3 because their KME is too low.
##      ..removing 2 genes from module 4 because their KME is too low.
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 1
##        Calculating new MEs...

层次聚类树

moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
pdf(file = "dendrogram_plot.pdf", width = 10, height = 6)
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],"Module colors",dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
dev.off()
## png 
##   2

绘制模块之间相关性图

MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)

pdf(file = "Eigengene_heatmap.pdf", width = 10, height = 8)
plotEigengeneNetworks(
MEs_col,
"Eigengene adjacency heatmap",
marDendro = c(3, 3, 2, 4),
marHeatmap = c(3, 4, 2, 2),
plotDendrograms = TRUE,
xLabelsAngle = 90
)
dev.off()
## png 
##   2

聚类分析输出结果

获取各模块内的基因

library("org.Hs.eg.db")
## 载入需要的程序包:AnnotationDbi
## 载入需要的程序包:stats4
## 载入需要的程序包:IRanges
## 载入需要的程序包:S4Vectors
## 
## 载入程序包:'S4Vectors'
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## 载入程序包:'IRanges'
## The following object is masked from 'package:lubridate':
## 
##     %within%
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:grDevices':
## 
##     windows
## 
## 载入程序包:'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
unique_modulecol<-unique(moduleColors)
module_genes <- list()

for (color in unique_modulecol) {
genes_in_module <- colnames(expr0)[which(moduleColors == color)]
module_genes[[color]] <- genes_in_module}

M1<-module_genes[["blue"]][c(1:200)]
M2<-module_genes[["brown"]][c(1:200)]
M3<-module_genes[["turquoise"]][c(1:200)]

gene_data<-list(m1=M1,m2=M2,m3=M3)

#将基因名称和数据库一一对应
valid_genes <- AnnotationDbi::select(org.Hs.eg.db,
keys = unlist(gene_data),
columns = c("SYMBOL", "ENTREZID"),
keytype = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
#将gene symbol转换为entrez id
gene_data_entrez <- lapply(gene_data, function(genes) {
valid_entrez <- AnnotationDbi::select(org.Hs.eg.db,
keys = genes,
columns = "ENTREZID",
keytype = "SYMBOL")
return(valid_entrez$ENTREZID)
})
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
#去除数据库中不存在的基因
gene_data_entrez[[1]]<-na.omit(gene_data_entrez[[1]])
gene_data_entrez[[2]]<-na.omit(gene_data_entrez[[2]])
gene_data_entrez[[3]]<-na.omit(gene_data_entrez[[3]])
library(clusterProfiler)
## clusterProfiler v4.12.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, doi:10.1038/s41596-024-01020-z
## 
## 载入程序包:'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following object is masked from 'package:stats':
## 
##     filter
gene_enrich<-compareCluster(
gene_data_entrez,
fun="enrichGO",
OrgDb="org.Hs.eg.db",
ont="BP",
pvalueCutoff = 0.05)

可视化

cnetplot(gene_enrich)

dotplot(gene_enrich)

数据来源

ER 阳性乳腺癌细胞系 MCF-7 中 KDM3A 调控基因的鉴定