rm(list=ls())
setwd("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO")
suppressMessages(library(tidyverse))
# 思路
#ID转换相当于翻译
##首先导入需要翻译的数据
##获取翻译索引表
##原始数据处理:根据索引表处理需要翻译的数据——转换成索引表可以识别的模式
##索引表处理:根据原始数据取索引表的子集,要求与原始数据一一对应
##merge一下
## 法1
suppressMessages(library(org.Hs.eg.db))
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" "UNIGENE"
## [26] "UNIPROT"
id <- read.table("id.txt",header = T,sep = "\t")
myid <- id %>% separate(id,c("ENSEMBL","a"))
keys <- myid[,1]
prbe2id <- AnnotationDbi::select(org.Hs.eg.db,keys=keys,c("ENSEMBL","SYMBOL"),keytype="ENSEMBL")
## 'select()' returned 1:1 mapping between keys and columns
## 法二
p2i_ENS <- toTable(org.Hs.egENSEMBL)
p2i_SYM <- toTable(org.Hs.egSYMBOL)
p2i <- merge(p2i_ENS,p2i_SYM)
ids2 <- merge(myid,p2i,by.x = "ENSEMBL",by.y = "ensembl_id")
## 别人处理数据的方法
suppressMessages(library(stringr))
ensembl_id = str_split(id[,1], pattern = "[.]", simplify = T)[,1]
ensembl_id = as.data.frame(ensembl_id)
因为与tidyverse包里的select()重名,直接使用会报错,需要AnnotationDbi::select()
rm(list=ls())
#导入数据
id_2 <- read.table("id2.txt",header = T,sep = "\t")
#数据直接能用,不需要处理
#myid_2 <- separate(id_2,id,c("ida","idb","idc"),fill = "right")
#获取翻译索引表
suppressMessages(library(hgu133a.db))
keytypes(hgu133a.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROBEID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [26] "UNIGENE" "UNIPROT"
p2g <- toTable(hgu133aSYMBOL)
#数据比较小,且原始数据没有重复,可以直接merge
ids <- merge(id_2,p2g,by.x="id", by.y="probe_id")
获得内置数据集“sCLLex”中基因“TP53”的表达量,根据分组信息画分组箱型图
1、获得数据集(对象)的表达矩阵和临床信息 2、获得基因注释表 3、根据注释表找到基因对应的探针 4、回到表达矩阵,找到表达值 5、根据表达值和分组信息绘图
# 获取表达矩阵和临床信息
suppressMessages(library(CLL))
data(sCLLex)
str(sCLLex)
## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. .. ..@ name : chr ""
## .. .. ..@ lab : chr ""
## .. .. ..@ contact : chr ""
## .. .. ..@ title : chr ""
## .. .. ..@ abstract : chr ""
## .. .. ..@ url : chr ""
## .. .. ..@ pubMedIds : chr ""
## .. .. ..@ samples : list()
## .. .. ..@ hybridizations : list()
## .. .. ..@ normControls : list()
## .. .. ..@ preprocessing :List of 2
## .. .. .. ..$ filenames : chr [1:24] "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL10.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL11.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL12.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL13.CEL" ...
## .. .. .. ..$ affyversion: chr NA
## .. .. ..@ other : list()
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 0 0
## ..@ assayData :<environment: 0x000000002accf6f0>
## ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 2 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr [1:2] "Sample ID" "Stable/Progressive"
## .. .. ..@ data :'data.frame': 22 obs. of 2 variables:
## .. .. .. ..$ SampleID: 'AsIs' chr [1:22] "CLL11" "CLL12" "CLL13" "CLL14" ...
## .. .. .. ..$ Disease : Factor w/ 2 levels "progres.","stable": 1 2 1 1 1 1 2 2 1 2 ...
## .. .. ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr(0)
## .. .. ..@ data :'data.frame': 12625 obs. of 0 variables
## .. .. ..@ dimLabels : chr [1:2] "featureNames" "featureColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ annotation : chr "hgu95av2"
## ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. .. ..@ varMetadata :'data.frame': 0 obs. of 1 variable:
## .. .. .. ..$ labelDescription: chr(0)
## .. .. ..@ data :'data.frame': 22 obs. of 0 variables
## .. .. ..@ dimLabels : chr [1:2] "sampleNames" "sampleColumns"
## .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
## .. .. .. .. ..@ .Data:List of 1
## .. .. .. .. .. ..$ : int [1:3] 1 1 0
## ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
## .. .. ..@ .Data:List of 4
## .. .. .. ..$ : int [1:3] 2 10 0
## .. .. .. ..$ : int [1:3] 2 5 5
## .. .. .. ..$ : int [1:3] 1 3 0
## .. .. .. ..$ : int [1:3] 1 0 0
#表达矩阵
expr <- sCLLex@assayData[["exprs"]]#正确的方法是#使用exprs()但这样也行
#临床信息
pd <- pData(sCLLex)
# 获取翻译索引表:获得探针注释信息
#Annotation: hgu95av2
BiocManager::install("hgu95av2.db")
## Bioconductor version 3.12 (BiocManager 1.30.16), R 4.0.5 (2021-03-31)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
## re-install: 'hgu95av2.db'
## Old packages: 'forestplot', 'haven', 'readr', 'tibble', 'utf8', 'vroom', 'xfun'
suppressMessages(library(hgu95av2.db))
keytypes(hgu95av2.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROBEID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [26] "UNIGENE" "UNIPROT"
p2g <- toTable(hgu95av2SYMBOL)
dim(p2g)
## [1] 11457 2
#获得翻译索引表的子集——TP53对应的探针
probe <- p2g[p2g$symbol == "TP53",][,1]
# 开始绘图
## 简单箱型图
expr_TP53 <- expr[probe,]
boxplot(expr_TP53[1,]~pd$Disease)
boxplot(expr_TP53)
#ggplot和ggpubr的输入形式都要求是dataframe里面的列,所以先合成dataframe
d <-cbind(as.data.frame(expr_TP53[1,]),as.data.frame(pd$Disease))
head(d)
## expr_TP53[1, ] pd$Disease
## CLL11.CEL 2.723427 progres.
## CLL12.CEL 2.938367 stable
## CLL13.CEL 2.179530 progres.
## CLL14.CEL 2.762752 progres.
## CLL15.CEL 2.702204 progres.
## CLL16.CEL 2.717181 progres.
colnames(d)<-c("e","Disease")
## ggpubr
suppressMessages(library(ggpubr))
ggboxplot(d, x="Disease", y ="e",
palette = "GEO",#色板"jco";"GEO"
add = "jitter",
color = "Disease",
fill = "Disease")
## ggplot2
suppressMessages(library(ggplot2))
ggplot(d,aes(x=Disease,y=e,fill=Disease))+geom_boxplot()
rm(list = ls())
找到BRCA1基因在TCGA数据库的乳腺癌数据集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表达情况 1、使用接口:http://www.cbioportal.org/ 2、下载数据 3、进行分析
BRCA1 <- read_tsv("plot.txt")
## Rows: 1068 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (4): Sample Id, BRCA1: Putative copy-number alterations from GISTIC, Mut...
## dbl (1): BRCA1: mRNA Expression, RSEM (Batch normalized from Illumina HiSeq_...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(BRCA1)
## [1] "Sample Id"
## [2] "BRCA1: Putative copy-number alterations from GISTIC"
## [3] "BRCA1: mRNA Expression, RSEM (Batch normalized from Illumina HiSeq_RNASeqV2)"
## [4] "Mutations"
## [5] "Copy Number Alterations"
colnames(BRCA1) <- c("id","group","expression","copy number")
suppressPackageStartupMessages(library(ggstatsplot))
ggbetweenstats(data = BRCA1, x="group", y="expression")
suppressPackageStartupMessages(library(ggpubr))
ggboxplot(data=BRCA1, x="group", y="expression", color = "group",add = "point", shape="group")
## Warning: Ignoring unknown aesthetics: show.legend
找到TP53基因在TCGA数据库的乳腺癌数据集的表达量分组看其是否影响生存 1、oncolnc:http://www.oncolnc.org/ 2、下载数据进行分析
TP53 <- read_csv("BRCA_7157_50_50.csv")
## Rows: 1006 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Patient, Status, Group
## dbl (2): Days, Expression
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
suppressPackageStartupMessages(library(ggstatsplot))
ggbetweenstats(data = TP53 , x="Group", y="Expression")
#生存分析
suppressPackageStartupMessages(library(survival))
suppressPackageStartupMessages(library(survminer))
table(TP53$Status)
##
## Alive Dead
## 871 135
## Alive Dead
## 871 135
TP53$Status <- ifelse(TP53$Status=='Dead',1,0)
sfit <- survfit(Surv(Days,Status)~Group,data = TP53)
ggsurvplot(sfit, conf.int = F, pval=T)
rm(list = ls())
下载数据集GSE17215的表达矩阵并且提取下面的基因画热图
# 下载表达矩阵和临床信息
suppressMessages(library(GEOquery))
#gget <- getGEO("GSE17215")
#save(gget,file="step1.Rdata")
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/step1.Rdata")
#表达矩阵
expr <- exprs(gget[[1]])
expr[1:5,1:5]
## GSM431121 GSM431122 GSM431123 GSM431124 GSM431125
## 1007_s_at 481.59980 596.79062 2721.83475 2566.31563 2737.76320
## 1053_at 306.82972 224.97956 246.73288 251.85537 245.55677
## 117_at 30.32576 27.28749 40.12724 40.04009 48.14327
## 121_at 125.04107 147.67525 138.18675 160.26449 151.04978
## 1255_g_at 19.87286 18.08049 19.11944 19.19024 21.11857
# 临床信息
pd <- pData(gget[[1]])
save(expr,pd,file = "step1_expr_pd.Rdata")
## 提取基因
#把大神给的基因导进来
gene <- read.table("gene.txt",header = F,sep = " ")
## Warning in read.table("gene.txt", header = F, sep = " "): incomplete final line
## found by readTableHeader on 'gene.txt'
c_gene <- t(gene)[,1]
#获取转换索引
#看gget得知平台为GPL3921,对应R包hthgu133a
BiocManager::install("hthgu133a.db")
## Bioconductor version 3.12 (BiocManager 1.30.16), R 4.0.5 (2021-03-31)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
## re-install: 'hthgu133a.db'
## Old packages: 'forestplot', 'haven', 'readr', 'tibble', 'utf8', 'vroom', 'xfun'
suppressMessages(library(hthgu133a.db))
ls(hthgu133a.db)#所以ls没有keytypes全
## [1] "conn" "packageName"
keytypes(hthgu133a.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROBEID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [26] "UNIGENE" "UNIPROT"
p2g <- toTable(hthgu133aSYMBOL)
#生成一个db,目标基因对应的探针号
myp2g <- p2g[p2g$symbol %in% c_gene,]
head(myp2g)
## probe_id symbol
## 851 201397_at PHGDH
## 1150 201710_at MYBL2
## 1256 201820_at KRT5
## 1326 201890_at RRM2
## 1416 201983_s_at EGFR
## 1417 201984_s_at EGFR
#目标是基因的热图,不是探针的热图,所以最终用来画图的db行名一定是基因
table(myp2g$symbol)
##
## ACTR3B BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDH3
## 1 2 4 3 4 1 1 1 2 1
## CENPF CEP55 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GRB7
## 3 1 6 2 9 1 2 1 1 1
## KIF2C KRT14 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11
## 2 1 1 4 5 1 1 4 1 4
## MYBL2 MYC NAT1 PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TYMS
## 1 1 1 1 1 1 2 3 2 2
## UBE2C
## 1
#有的基因对应多个探针,我想看一下这几个探针检测的结果差异大不大
#选择一个对应多个探针的基因试一下
a1 <- myp2g[myp2g$symbol == "EGFR",][,1]
#提取这些探针的表达矩阵
b1 <- as.data.frame(expr[a1,])
boxplot(t(b1))
#差异非常明显,所以决定取最大值
rm(a1,b1)
#现在要获取expr的子集,包含目标基因对应的探针,一对多的取最大值
expr_1 <- expr[myp2g$probe_id,]
# 排序
##首先每个探针有6个样,取中位数作为排序依据,把中位数放到探针表新生成的一列中
myp2g$median <- apply(expr_1,1,median)
#如果把新列放到expr中会报错,因为重复运算
#错误代码:myp2g <- myp2g %>% arrange(desc(median))#arrange因为无法分组排序
myp2g_2 <- myp2g[order(myp2g$symbol,myp2g$median,decreasing = T),]
myp2g <- myp2g_2[!duplicated(myp2g_2$symbol),]#生成一个基因对应一个检测值最大的探针的翻译索引
#取expr子集,准备作图的db
expr_1 <- expr_1[myp2g$probe_id,]
rownames(expr_1) <- myp2g$symbol#把名字转换成基因名
#热图
suppressPackageStartupMessages(library(pheatmap))
n = t(scale(t(expr_1)))
pheatmap(n,show_rownames = F,clustering_distance_rows = "correlation")
## 7-分组绘制热图 下载数据集GSE24673的表达矩阵计算样本的相关性并且绘制热图,需要标记上样本分组信息 其实这道题可以不用做ID转换的
#下载数据
suppressMessages(library(GEOquery))
#gget_GSE24673 <- getGEO("GSE24673",getGPL = F)
#save(gget_GSE24673,file="GSE24675_gget.Rdata")
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/GSE24675_gget.Rdata")
expr_24673 <- exprs(gget_GSE24673[[1]])
pd_24673 <- pData(gget_GSE24673[[1]])
#分组信息
gse24673_group <- pd_24673[,"source_name_ch1"]
gse24673_group <- as.data.frame(gse24673_group,row.names = rownames(pd_24673))
colnames(gse24673_group) <- "group_list"
# ID转化————————————————————————————————————————
##注释信息GPL6244
#BiocManager::install("hugene10sttranscriptcluster.db")
#suppressMessages(library(hugene10sttranscriptcluster.db))
#keytypes(hugene10sttranscriptcluster.db)
#p2g <- toTable(hugene10sttranscriptclusterSYMBOL)
#save(p2g,file = "annotation_GPL6244.Rdata")
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/annotation_GPL6244.Rdata")
p2g <- p2g[p2g$probe_id %in% rownames(expr_24673),]
#expr肯定多个探针对应1个基因,我要找到基因对应的最佳探针,取注释表的子集
#给expr排序去重,先生成每列的中位数
#expr和p2g应该一一对应?
expr_1 <- expr_24673[p2g$probe_id,]
dim(expr_1)
## [1] 19610 11
## [1] 19610 11
p2g$median <- apply(expr_1,1,median)
p2g <- p2g %>% arrange(desc(symbol,median))
p2g_1 <- p2g[!duplicated(p2g$symbol),]
expr_1 <- expr_1[p2g_1$probe_id,]
rownames(expr_1) <- p2g_1$symbol
#了解表达矩阵
boxplot(expr_1)
expr_1[c("GAPDH","ACTB"),]
## GSM607938 GSM607939 GSM607940 GSM607941 GSM607942 GSM607943 GSM607944
## GAPDH 12.592 12.706 12.659 12.469 12.335 12.469 12.691
## ACTB 11.645 11.660 11.641 11.943 11.916 11.953 11.780
## GSM607945 GSM607946 GSM607947 GSM607948
## GAPDH 12.941 12.907 13.224 13.576
## ACTB 12.011 12.170 12.024 11.912
#分组箱型图
suppressMessages(library(ggplot2))
suppressMessages(library(reshape2))
exprl <- melt(expr_1)
str(exprl)
## 'data.frame': 206294 obs. of 3 variables:
## $ Var1 : Factor w/ 18754 levels "ZZZ3","ZZEF1",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Var2 : Factor w/ 11 levels "GSM607938","GSM607939",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value: num 8.57 8.22 7.67 8.85 5.97 ...
colnames(exprl) <- c("gene","sample","exp")
#处理分组信息
exprl$group <- rep(gse24673_group[,1],each=nrow(expr_1))
ggplot(exprl,aes(x=sample,y=exp,fill = group))+
geom_boxplot()
#热图
suppressMessages(library(pheatmap))
#expr_heat <- as.data.frame(t(scale(t(expr_1))))
#expr_heat[c("GAPDH","ACTB"),]
#p <- pheatmap(expr_heat,show_rownames = F,clustering_distance_rows = "correlation",annotation_col = gse24673_group)
expr_h <- as.data.frame(expr_1)
pheatmap(expr_h,scale = "row",show_rownames = F, annotation_col = gse24673_group)
rm(list=ls())
找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 对应的R的bioconductor注释包,并且安装它!
#BiocManager::install("hugene10sttranscriptcluster.db")
options()$repos
## CRAN
## "@CRAN@"
options()$BioC_mirror
## NULL
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
#BiocManager::install("请输入自己找到的R包",ask = F,update = F)
options()$repos
## CRAN
## "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"
options()$BioC_mirror
## [1] "https://mirrors.ustc.edu.cn/bioc/"
下载数据集GSE42872的表达矩阵,并且分别挑选出 所有样本的(平均表达量/sd/mad/)最大的探针,并且找到它们对应的基因。
suppressMessages(library(GEOquery))
gget_42872 <- getGEO("GSE42872",getGPL = F)
## Found 1 file(s)
## GSE42872_series_matrix.txt.gz
## Rows: 33297 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## dbl (7): ID_REF, GSM1052615, GSM1052616, GSM1052617, GSM1052618, GSM1052619,...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
expr_42872 <- exprs(gget_42872[[1]])
pd_42872 <- pData(gget_42872[[1]])
save(expr_42872,pd_42872,file = "expr_pd_42872.Rdata")
#排序
sort(apply(expr_42872,1,mean),decreasing = T)[1]
## 7978905
## 14.53288
## 7978905
## 14.53288
sort(apply(expr_42872,1,sd),decreasing = T)[1]
## 8133876
## 3.166429
## 8133876
## 3.166429
sort(apply(expr_42872,1,mad),decreasing = T)[1]
## 8133876
## 4.268561
## 8133876
## 4.268561
#寻找对应基因probe_id = 7978905
suppressMessages(library(hugene10sttranscriptcluster.db))
keytypes(hugene10sttranscriptcluster.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROBEID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [26] "UNIGENE" "UNIPROT"
p2g <- toTable(hugene10sttranscriptclusterSYMBOL)
p2g %>% filter(probe_id == "7978905")
## [1] probe_id symbol
## <0 rows> (or 0-length row.names)
#结果注释文件里面没有这个探针?
p2g %>% filter(probe_id %in% 8133876)
## probe_id symbol
## 1 8133876 CD36
我写的代码复杂多了,因为我不知道合并过程,是一步一步运算的; 另外,apply()这个函数的结果是生成一个向量,里面的元素有自己的名字,所以直接使用[1]即可选出探针+表达量 sort()是简单排序
注释文件里面没有那个探针,这就很尴尬了
下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/expr_pd_42872.Rdata")
group <- pd_42872$title
group <- as.data.frame(group) %>% separate(group,c(1,2,3,"process",5))
group <- group[,4]
group <- factor(group,levels = c("Vemurafenib","Control"))
## ID转换
p2g <- p2g[p2g$probe_id %in% rownames(expr_42872),]
expr_42872 <- expr_42872[p2g$probe_id,]
p2g$median <- apply(expr_42872,1,median)
p2g <- p2g %>% arrange(desc(symbol,median))
p2g <- p2g[!duplicated(p2g$symbol),]
expr_42872 <- expr_42872[p2g$probe_id,]
rownames(expr_42872) <- p2g$symbol
## 了解表达矩阵
### 内参GAPDH,ACTB
expr_42872["GAPDH",]
## GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620
## 14.3187 14.3622 14.3638 14.4085 14.3569 14.3229
expr_42872["ACTB",]
## GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620
## 13.8811 13.9002 13.8822 13.7773 13.6732 13.5363
boxplot(expr_42872)
### 分组箱型图、光滑密度图、聚类分析
library(ggplot2)
library(ggpubr)
library(reshape2)
theme_set(theme_pubclean())
exprl <- melt(expr_42872)
#一个sample被重复了nrow(expr)次
exprl$group <- rep(group,each=nrow(expr_42872))
colnames(exprl)[1:2] <- c("gene","sample")
#箱型图
ggplot(exprl,aes(x=sample,y=value,fill=group))+geom_boxplot()
#密度图
ggplot(exprl,aes(x=value,fill = group))+geom_density()+facet_wrap(exprl$sample)
#聚类分析
exprh <- expr_42872
colnames(exprh) <- group
hc <- hclust(dist(t(exprh)))
plot(hc)
#PCA
#BiocManager::install("ggfortify")
library(ggfortify)
df <- as.data.frame(t(expr_42872))
df$group <- group
autoplot(prcomp(df[,1:(ncol(df)-1)]),data=df,colour='group')
save(expr_42872,group,file = "final_expr_group.Rdata")
## 差异分析
#制作分组矩阵
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
#输入分组依据的数据,得到每个样品的对应位置,0代表否,1代表是
design <- model.matrix(~0+group)
colnames(design)=levels(group)
rownames(design)=colnames(expr_42872)
design
## Vemurafenib Control
## GSM1052615 0 1
## GSM1052616 0 1
## GSM1052617 0 1
## GSM1052618 1 0
## GSM1052619 1 0
## GSM1052620 1 0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
#制作比较矩阵
#这一步很关键,makeContrasts(**实验组-对照组**,levels = design)
contrast.matrix<-makeContrasts(paste("Vemurafenib","Control",sep = "-"),levels = design)
contrast.matrix ##这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
## Contrasts
## Levels Vemurafenib-Control
## Vemurafenib 1
## Control -1
## Contrasts
## Levels progres.-stable
## progres. 1一定要记住让实验组做1
## stable -1
#开始进行差异表达分析
##step 1
fit <- lmFit(expr_42872,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)#原始结果
nrDEG = na.omit(tempOutput) #删除空集
save(nrDEG, file = "GSE42872_dif_whole.Rdata")
head(nrDEG)
## logFC AveExpr t P.Value adj.P.Val B
## CD36 5.780170 7.370282 79.48115 1.137956e-16 2.140837e-12 26.81057
## DUSP6 -4.212683 9.106625 -62.20220 1.725717e-15 1.231191e-11 25.05005
## DCT 5.633027 8.763220 61.48263 1.963308e-15 1.231191e-11 24.95805
## SPRY2 -3.801663 9.726468 -53.77478 8.661682e-15 4.073806e-11 23.84792
## MOXD1 3.263063 10.171635 46.91674 3.922086e-14 1.235966e-10 22.62821
## ETV4 -3.843247 9.667077 -46.89544 3.941847e-14 1.235966e-10 22.62401
##检查
expr_42872[c("CD36","DUSP6"),]
## GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620
## CD36 4.5461 4.4021 4.49239 10.25060 10.2148 10.31570
## DUSP6 11.2531 11.2076 11.17820 6.98791 7.0293 6.98364
##筛选
dif <- tempOutput %>% filter(adj.P.Val <0.01 & abs(logFC) > 2)
dim(dif)
## [1] 135 6
#筛选出来247个基因
write.csv(dif,file="Differential_limma_matrix_GSE42872.csv")
# 画热图
library(pheatmap)
getgene <- head(rownames(dif),25)
expr_heat <- expr_42872[getgene,]
group_heat <- data.frame(group,row.names=colnames(expr_heat))
pheatmap(expr_heat,sacle="row",show_rownames = F,annotation_col = group_heat)
# 画火山图
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/GSE42872_dif_whole.Rdata")
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(ggstatsplot))
#准备数据
nrDEG$change <- as.factor(with(nrDEG,ifelse( adj.P.Val < 0.01,ifelse(logFC > 0, "up", "down"),"not" )))
nrDEG$gene <- rownames(nrDEG)
ggplot(data = nrDEG, aes(x = logFC, y = -log10(adj.P.Val))) +
##选定数据源和x,y坐标
geom_point(size = 1.75, aes(color = change), show.legend = T) +
###描点,设置大小、颜色分类和图示
scale_color_manual(values = c("royalblue", "grey","red")) +
####颜色值
ylim(0, 10) +
######纵坐标的量程
labs(x = 'Log2(fold change)', y = '-log10(adp)')+
#######标签
geom_hline(yintercept = -log10(0.01), linetype = 'dotdash', color = 'grey20') +
#######添加水平参考线,p<0.01
geom_vline(xintercept = c(-2, 2), linetype = 'dotdash', color = 'grey20') +
#添加垂直参考线,log(fold_change)绝对值大于1
geom_text_repel(data = filter(nrDEG, abs(logFC)>2),
###第一步选标签来源数据,利用subset()选子集
size = 2,
aes(label = gene))#映射
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 104 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps