library(GEOquery)
## 载入需要的程序包:Biobase
## 载入需要的程序包:BiocGenerics
##
## 载入程序包:'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(ggplot2)
library(limma)
##
## 载入程序包:'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(stringr)
library(WGCNA)
## 载入需要的程序包:dynamicTreeCut
## 载入需要的程序包:fastcluster
##
## 载入程序包:'fastcluster'
## The following object is masked from 'package:stats':
##
## hclust
##
##
## 载入程序包:'WGCNA'
## The following object is masked from 'package:stats':
##
## cor
library(reshape2)
rm(list = ls())
options(stringsAsFactors=F)
enableWGCNAThreads()
## Allowing parallel execution with up to 7 working processes.
gset<-getGEO('GSE68918',destdir = ".",AnnotGPL = F,getGPL = F)
## Found 1 file(s)
## GSE68918_series_matrix.txt.gz
## Using locally cached version: ./GSE68918_series_matrix.txt.gz
exp<-exprs(gset[[1]])
#获取表达矩阵
pdata<-pData(gset[[1]])
#获取临床信息
group_list<-pdata$`agent:ch1`
group_list<-factor(group_list,levels = c("vehicle","10nME2"))
exp_l<-melt(exp) # 拆分数据
colnames(exp_l)=c('probe','sample','value')
exp_l$group=rep(group_list,each=nrow(exp))
p<-ggplot(exp_l,aes(x=sample,y=value,fill=group))+
geom_boxplot()+
ggtitle("before adjustment")+
theme(plot.title=element_text(hjust=0.5))+
theme(axis.text.x = element_text(angle = 90,hjust = 1,size = 7))
print(p)
#绘制处理前数据箱线图
exp=normalizeBetweenArrays(exp) #校正函数
exp_ll<-melt(exp)
colnames(exp_ll)<-c('probe','sample','value')
exp_ll$group<-rep(group_list,each=nrow(exp))
pp<-ggplot(exp_ll,aes(x=sample,y=value,fill=group))+
geom_boxplot()+
ggtitle("after adjustment")+
theme(plot.title = element_text(hjust = 0.5))+
theme(axis.text.x = element_text(angle = 90,hjust = 1,size = 7))
print(pp)
range(exp) #查看表达矩阵的取值范围,一般而言,范围在20以内的表达值基本已经经过了log对数转换
## [1] 5.157608 15.066452
gpl<-getGEO('GPL10558',destdir = ".")
colnames(Table(gpl))
probe2gene=Table(gpl)[,c(1,x)] #x为需要的列(如gene symbol/entrez id...
probe2gene<-subset(probe2gene,GENE_SYMBOL!='')
ids<-probe2gene
library(devtools)
## 载入需要的程序包:usethis
library(idmap3)
## idmap3 v 0.1.0 welcome to use idmap3!
## If you use idmap3 in published research, please acknowledgements:
## We thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.
ids<-idmap3::get_pipe_IDs('GPL10558')
(使用哪一种看具体网络情况)
替换
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
exp <- as.data.frame(exp)
exp <- exp %>%
mutate(probe_id=rownames(exp)) %>%
#为数据框新加一列
inner_join(ids,by="probe_id") %>%
#将 exp 数据框和 ids 数据框根据 probe_id 这一列进行内连接,保留匹配的行
select(probe_id, symbol, everything())
#重新排列列的顺序,将 probe_id 和 symbol 两列放在前面,其他列用 everything() 保持其原有顺序
exp <- exp[!duplicated(exp$symbol),]
#去除重复
rownames(exp)<-exp$symbol
exp<-exp[,-(1:2)] #去掉第一二列
expr0<-t(exp)
sampleTree = hclust(dist(expr0), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
expr0<-expr0[row.names(expr0)!="GSM1686444",]
powers = c(c(1:10), seq(from = 12, to=30, by=2))
sft = pickSoftThreshold(expr0, powerVector=powers,
networkType="signed hybrid", verbose=3)
## pickSoftThreshold: will use block size 1632.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 1632 of 27410
## ..working on genes 1633 through 3264 of 27410
## ..working on genes 3265 through 4896 of 27410
## ..working on genes 4897 through 6528 of 27410
## ..working on genes 6529 through 8160 of 27410
## ..working on genes 8161 through 9792 of 27410
## ..working on genes 9793 through 11424 of 27410
## ..working on genes 11425 through 13056 of 27410
## ..working on genes 13057 through 14688 of 27410
## ..working on genes 14689 through 16320 of 27410
## ..working on genes 16321 through 17952 of 27410
## ..working on genes 17953 through 19584 of 27410
## ..working on genes 19585 through 21216 of 27410
## ..working on genes 21217 through 22848 of 27410
## ..working on genes 22849 through 24480 of 27410
## ..working on genes 24481 through 26112 of 27410
## ..working on genes 26113 through 27410 of 27410
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.214 -0.827 0.636 4600.0 4450.00 7330
## 2 2 0.768 -1.260 0.914 2250.0 2050.00 4760
## 3 3 0.888 -1.350 0.958 1320.0 1090.00 3500
## 4 4 0.912 -1.370 0.957 865.0 641.00 2760
## 5 5 0.904 -1.390 0.942 610.0 401.00 2280
## 6 6 0.901 -1.380 0.936 453.0 263.00 1930
## 7 7 0.897 -1.370 0.934 350.0 178.00 1670
## 8 8 0.892 -1.360 0.931 279.0 125.00 1460
## 9 9 0.873 -1.360 0.916 227.0 90.10 1300
## 10 10 0.858 -1.360 0.906 189.0 66.40 1160
## 11 12 0.848 -1.340 0.899 137.0 38.00 958
## 12 14 0.845 -1.330 0.897 103.0 23.60 806
## 13 16 0.844 -1.310 0.894 80.8 15.40 690
## 14 18 0.844 -1.290 0.890 64.9 10.60 598
## 15 20 0.854 -1.270 0.894 53.3 7.62 524
## 16 22 0.900 -1.240 0.932 44.5 5.72 464
## 17 24 0.947 -1.200 0.972 37.7 4.39 414
## 18 26 0.945 -1.220 0.977 32.3 3.52 382
## 19 28 0.934 -1.250 0.975 28.0 2.88 358
## 20 30 0.931 -1.280 0.979 24.4 2.43 337
选择标准:1.SFT.R.sq>0.85
2.mean.k.(平均连通性)不太高也不太低,1到100之间比较好
(之前选择了power值为3去构建层次聚类树状图,但特别平坦,检查发现是平均连通性太高了。所以后面选了10为power值)
net = blockwiseModules(expr0, power = 10, TOMType = "signed", minModuleSize = 50,reassignThreshold = 0, mergeCutHeight = 1,numericLabels = TRUE, pamRespectsDendro = FALSE, saveTOMs=TRUE, loadTOMs=TRUE,verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ....pre-clustering genes to determine blocks..
## Projective K-means:
## ..k-means clustering..
## ..merging smaller clusters...
## Block sizes:
## gBlocks
## 1 2 3 4 5 6
## 4995 4974 4857 4530 4093 3961
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file blockwiseTOM-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..Working on block 2 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 2 into file blockwiseTOM-block.2.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..Working on block 3 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 3 into file blockwiseTOM-block.3.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 1 genes from module 1 because their KME is too low.
## ..Working on block 4 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 4 into file blockwiseTOM-block.4.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 4 genes from module 2 because their KME is too low.
## ..removing 1 genes from module 9 because their KME is too low.
## ..Working on block 5 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 5 into file blockwiseTOM-block.5.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..Working on block 6 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 6 into file blockwiseTOM-block.6.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 2 genes from module 3 because their KME is too low.
## ..removing 2 genes from module 4 because their KME is too low.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 1
## Calculating new MEs...
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
pdf(file = "dendrogram_plot.pdf", width = 10, height = 6)
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],"Module colors",dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
dev.off()
## png
## 2
MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
pdf(file = "Eigengene_heatmap.pdf", width = 10, height = 8)
plotEigengeneNetworks(
MEs_col,
"Eigengene adjacency heatmap",
marDendro = c(3, 3, 2, 4),
marHeatmap = c(3, 4, 2, 2),
plotDendrograms = TRUE,
xLabelsAngle = 90
)
dev.off()
## png
## 2
library("org.Hs.eg.db")
## 载入需要的程序包:AnnotationDbi
## 载入需要的程序包:stats4
## 载入需要的程序包:IRanges
## 载入需要的程序包:S4Vectors
##
## 载入程序包:'S4Vectors'
## The following objects are masked from 'package:lubridate':
##
## second, second<-
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## 载入程序包:'IRanges'
## The following object is masked from 'package:lubridate':
##
## %within%
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:grDevices':
##
## windows
##
## 载入程序包:'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
unique_modulecol<-unique(moduleColors)
module_genes <- list()
for (color in unique_modulecol) {
genes_in_module <- colnames(expr0)[which(moduleColors == color)]
module_genes[[color]] <- genes_in_module}
M1<-module_genes[["blue"]][c(1:200)]
M2<-module_genes[["brown"]][c(1:200)]
M3<-module_genes[["turquoise"]][c(1:200)]
gene_data<-list(m1=M1,m2=M2,m3=M3)
#将基因名称和数据库一一对应
valid_genes <- AnnotationDbi::select(org.Hs.eg.db,
keys = unlist(gene_data),
columns = c("SYMBOL", "ENTREZID"),
keytype = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
#将gene symbol转换为entrez id
gene_data_entrez <- lapply(gene_data, function(genes) {
valid_entrez <- AnnotationDbi::select(org.Hs.eg.db,
keys = genes,
columns = "ENTREZID",
keytype = "SYMBOL")
return(valid_entrez$ENTREZID)
})
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
#去除数据库中不存在的基因
gene_data_entrez[[1]]<-na.omit(gene_data_entrez[[1]])
gene_data_entrez[[2]]<-na.omit(gene_data_entrez[[2]])
gene_data_entrez[[3]]<-na.omit(gene_data_entrez[[3]])
library(clusterProfiler)
## clusterProfiler v4.12.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, doi:10.1038/s41596-024-01020-z
##
## 载入程序包:'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:purrr':
##
## simplify
## The following object is masked from 'package:stats':
##
## filter
gene_enrich<-compareCluster(
gene_data_entrez,
fun="enrichGO",
OrgDb="org.Hs.eg.db",
ont="BP",
pvalueCutoff = 0.05)
cnetplot(gene_enrich)
dotplot(gene_enrich)