Create: Jianming Zeng
Date: 2018-12-20 15:43:52
Email: jmzeng1314@163.com
Blog: http://www.bio-info-trainee.com/
Forum: http://www.biotrainee.com/thread-1376-1-1.html
CAFS/SUSTC/Eli Lilly/University of Macau Update
Log: 2018-12-20 First version
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)#在调用as.data.frame的时,将stringsAsFactors设置为FALSE可以避免character类型自动转化为factor类型
setwd("D:/R/R-4.0.5/bin/project_difGEO")
#Jimmy万能包
#library(devtools)
#install_github("jmzeng1314/GEOmirror")
#source('http://raw.githubusercontent.com/jmzeng1314/GEOmirror/master/R/geoChina.R')
#GSE85446
suppressMessages(library(GEOmirror))
#gSet <- geoChina('GSE85446')
load("D:/R/R-4.0.5/bin/project_difGEO/GSE24673_eSet.Rdata")
a <- gset[[1]]
#表达矩阵
suppressMessages(library(GEOquery))
exprSet <- exprs(a)
dim(exprSet)#看一下exprSet这个矩阵的维度
## [1] 28869 11
#临床数据
pSet <- pData(a) #通过查看说明书知道取对象a里的临床信息用pData
dim(pSet)
## [1] 11 41
#annotation
a@annotation
## [1] "GPL6244"
## [1] "GPL6480"
#其他下载方法,从网址下载并读进去或者使用getGEO()函数
{
#注意查看下载文件的大小,检查数据
f='GSE24673_eSet.Rdata'
suppressMessages(library(GEOquery))
# 这个包需要注意两个配置,一般来说自动化的配置是足够的。
#Setting options('download.file.method.GEOquery'='auto')
#Setting options('GEOquery.inmemory.gpl'=FALSE)
if(!file.exists(f)){
gset <- getGEO('GSE24673', destdir=".",
AnnotGPL = F, ## 注释文件
getGPL = F) ## 平台文件
save(gset,file=f) ## 保存到本地
}
#表达矩阵
load("D:/R/R-4.0.5/bin/project_difGEO/GSE24673_eSet.Rdata")
a <- gset[[1]]
exprSet <- exprs(a)
dim(exprSet)#看一下exprSet这个矩阵的维度
#临床数据
pSet <- pData(a) #通过查看说明书知道取对象a里的临床信息用pData
dim(pSet)
}
## [1] 11 41
#GPL6244
exprSet[1:4,1:4] #查看exprSet这个矩阵的1至4行和1至4列,逗号前为行,逗号后为列
## GSM607938 GSM607939 GSM607940 GSM607941
## 7896736 6.231 6.353 6.702 6.663
## 7896738 6.607 6.895 6.856 8.136
## 7896740 7.191 7.123 7.196 7.845
## 7896742 7.447 6.837 7.220 7.391
boxplot(exprSet,las =2)#las=2样本名称为竖排
group_list <- pSet$source_name_ch1
suppressMessages(library( hugene10sttranscriptcluster.db))#通过谷歌或者链接(http://www.bio-info-trainee.com/1399.html)
keytypes(hugene10sttranscriptcluster.db)#查看所以可以提取的,比较重要的是"ENTREZID","SYMBOL","ENSEMBL"
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROBEID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [26] "UNIGENE" "UNIPROT"
ids <- toTable(hugene10sttranscriptclusterSYMBOL) #toTable这个函数:通过看说明书知道提取probe_id(探针名)和【想要啥要啥】的对应关系的表达矩阵的函数为toTable
head(ids) #head为查看前六行
## probe_id symbol
## 1 7896759 LINC01128
## 2 7896761 SAMD11
## 3 7896779 KLHL17
## 4 7896798 PLEKHN1
## 5 7896817 ISG15
## 6 7896822 AGRN
colnames(ids) <- c('probe_id','gene_id')
ids <- ids[ids$gene_id != '',]#删除空集
ids <- ids[ids$probe_id %in% rownames(exprSet),]#筛一下,只留下我们的表达矩阵里面需要的
exprSet <- exprSet[ids$probe_id,]
#如此一来,注释表就和表达矩阵一一对应了
exprSet[1:4,1:4]
## GSM607938 GSM607939 GSM607940 GSM607941
## 7896759 7.268 7.445 6.837 7.472
## 7896761 8.051 7.840 8.017 7.598
## 7896779 7.474 7.647 7.623 7.693
## 7896798 7.518 7.575 7.575 7.699
ids$median <- apply(exprSet,1,median)#对exprSet这个矩阵的每一行使用函数median;1为行;2为列
ids <- ids[order(ids$gene_id,ids$median,decreasing = T),]#对ids$symbol按照ids$median中位数从大到小排列的顺序排序,将对应的行赋值为一个新的ids
ids <- ids[!duplicated(ids$gene_id),]#将symbol这一列取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果s
exprSet <- exprSet[ids$probe_id,] #新的ids取出probe_id这一列,将dat按照取出的这一列中的每一行组成一个新的dat
dim(exprSet)
## [1] 18754 11
rownames(exprSet) <- ids$gene_id#把ids的symbol这一列中的每一行给dat作为dat的行名
exprSet[1:4,1:4] #保留每个基因ID第一次出现的信息
## GSM607938 GSM607939 GSM607940 GSM607941
## ZZZ3 8.574 8.633 8.627 8.014
## ZZEF1 8.222 8.075 8.045 8.218
## ZYX 7.669 7.432 7.496 7.592
## ZYG11B 8.854 8.662 8.854 8.215
save(exprSet,group_list,file = 'step1_output_GSE24673.Rdata')
rm(list = ls())
load('step1_output_GSE24673.Rdata')
# 看一看内参表达值
exprSet["ACTB",]#常见表达基因GAPDH、ACTB、PPIA
## GSM607938 GSM607939 GSM607940 GSM607941 GSM607942 GSM607943 GSM607944 GSM607945
## 11.645 11.660 11.641 11.943 11.916 11.953 11.780 12.011
## GSM607946 GSM607947 GSM607948
## 12.170 12.024 11.912
boxplot(exprSet,las = 2)
# 看看数据分布图
suppressMessages(library(ggplot2))
suppressMessages(library(reshape2))
#把表达矩阵转化成长list
exprSet_list <- melt(exprSet)
colnames(exprSet_list ) = c('gene','sample','value')
# 设置分组情况
#可以用已有的group数据,或者下载(再GEO的series的sample里面复制)
#或者自己设置,group的情况可以去GEO网页上显示的来命名,这里6个样品,分别是对照和药物处理,各3个重复,这里我只区分对照和处理,就不标明1,2,3了
#group_list <- c(rep('control',3),rep('Vemurafenib',3))
#把长列表的表达矩阵里group一列命名成具体的样品名
exprSet_list$group <- rep(group_list,each = nrow(exprSet))
head(exprSet_list)
## gene sample value group
## 1 ZZZ3 GSM607938 8.574 Retinoblastoma -Chemotreated
## 2 ZZEF1 GSM607938 8.222 Retinoblastoma -Chemotreated
## 3 ZYX GSM607938 7.669 Retinoblastoma -Chemotreated
## 4 ZYG11B GSM607938 8.854 Retinoblastoma -Chemotreated
## 5 ZYG11A GSM607938 5.970 Retinoblastoma -Chemotreated
## 6 ZXDC GSM607938 8.543 Retinoblastoma -Chemotreated
# 开始画图
# 箱型图
ggplot(exprSet_list,aes(x=sample,y= value,fill = group),palette("default"))+
geom_boxplot()
#画小提琴图
ggplot(exprSet_list,aes(x=sample,y= value,fill=group))+
geom_violin()
#画光滑密度图看有无离群值
ggplot(exprSet_list,aes(value,col=group))+
geom_density()+
facet_wrap(~sample,nrow =6)
# 聚类分析,看看分组对不对
exprSet_h <- exprSet
colnames(exprSet_h) <- group_list
hc <- hclust(dist(t(exprSet)))#t指turn转;dist是欧氏距离
plot(hc)
#BiocManager::install("ggfortify")
suppressMessages(library(ggfortify))
df=as.data.frame(t(exprSet))
df$group=group_list
autoplot(prcomp(df[,1:(ncol(df)-1)]),data=df,colour='group')
rm(list = ls())
##获得表达矩阵
suppressMessages(library(limma))
load("D:/R/R-4.0.5/bin/project_visual/visualization/GSE24673/step1_output_GSE24673.Rdata")
dim(exprSet)
## [1] 18754 11
##制作分组矩阵
#把分组变成2组
group_list_dif <- group_list[-c(10,11)]
exprSet_dif <- exprSet[,-c(10,11)]
group_list_dif
## [1] "Retinoblastoma -Chemotreated" "Retinoblastoma -Chemotreated"
## [3] "Retinoblastoma -Chemotreated" "Retinoblastoma -NO Chemotreated"
## [5] "Retinoblastoma -NO Chemotreated" "Retinoblastoma -NO Chemotreated"
## [7] "Retinoblastoma -Chemotreated" "Retinoblastoma -Chemotreated"
## [9] "Retinoblastoma -Chemotreated"
head(exprSet_dif,3)
## GSM607938 GSM607939 GSM607940 GSM607941 GSM607942 GSM607943 GSM607944
## ZZZ3 8.574 8.633 8.627 8.014 8.052 8.279 8.807
## ZZEF1 8.222 8.075 8.045 8.218 8.144 8.409 8.792
## ZYX 7.669 7.432 7.496 7.592 7.485 7.400 7.669
## GSM607945 GSM607946
## ZZZ3 9.122 9.031
## ZZEF1 8.802 8.878
## ZYX 7.714 7.700
#输入分组依据的数据,得到每个样品的对应位置,0代表否,1代表是
group_list_dif <- factor(group_list_dif,levels = c("Retinoblastoma -Chemotreated","Retinoblastoma -NO Chemotreated"),labels = c("prcs","ctrl"))
design <- model.matrix(~0+group_list_dif)
colnames(design) <- levels(group_list_dif)
rownames(design) <- colnames(exprSet_dif)
design
## prcs ctrl
## GSM607938 1 0
## GSM607939 1 0
## GSM607940 1 0
## GSM607941 0 1
## GSM607942 0 1
## GSM607943 0 1
## GSM607944 1 0
## GSM607945 1 0
## GSM607946 1 0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group_list_dif
## [1] "contr.treatment"
##制作比较矩阵
#这一步很关键,makeContrasts(**实验组-对照组**,levels = design)
contrast<-makeContrasts(paste("prcs","ctrl",sep = "-"),levels = design)
contrast ##这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
## Contrasts
## Levels prcs-ctrl
## prcs 1
## ctrl -1
## Contrasts
## Levels prcs-ctrl
## prcs 1一定要记住让实验组做1
## ctrl -1
开始进行差异表达分析(这是具体步骤(省略),可以直接用后面的函数)
fit <- lmFit(exprSet_dif,design)
fit2 <- contrasts.fit(fit, contrast) ##这一步很重要,大家可以自行看看效果 fit2 <- eBayes(fit2) ## default no trend !!! *eBayes() with trend=TRUE
tempOutput = topTable(fit2, coef=1, n=Inf)#原始结果 nrDEG = na.omit(tempOutput) #删除空集 save(nrDEG, file = “GSE37265_dif_whole.Rdata”) head(nrDEG)看看就行了,我也看不懂
DEG <- function(expr42872,design,contrast){
fit = lmFit(expr42872, design)
fit2 = contrasts.fit(fit, contrast)
fit2 = eBayes(fit2)
mtx = topTable(fit2, coef=1, n=Inf)
deg_mtx = na.omit(mtx)
return(deg_mtx)
}
DEG_mtx <- DEG(exprSet_dif,design,contrast)#得到差异分析的结果
##检查差异分析的结果
#随机抽取一个基因检查一下
exprSet_dif["DDX3Y",]
## GSM607938 GSM607939 GSM607940 GSM607941 GSM607942 GSM607943 GSM607944 GSM607945
## 9.905 10.104 9.931 5.796 5.643 5.775 9.965 10.034
## GSM607946
## 10.030
#基因在处理组高表达,,logFC为正值,结果正确
##筛选符合要求的差异表达基因
suppressMessages(library(tidyverse))
dif <- DEG_mtx %>% filter(adj.P.Val <0.05 & logFC > 2)
dim(dif)
## [1] 66 6
#筛选出来66个基因
rownames(dif) <- dif[,1]
dif <- dif[,-1]
write.csv(dif,file="dif_limma_matrix_GSE24673.csv")
save(DEG_mtx,exprSet_dif,group_list_dif,file="dif_limma_output_GSE24673.Rdata")
rm(list = ls())