middle

2021-08-06

中级习题

rm(list=ls())
setwd("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO")
suppressMessages(library(tidyverse)) 

1-基因ID转换|org.Hs.eg.db

# 思路
#ID转换相当于翻译
##首先导入需要翻译的数据
##获取翻译索引表
##原始数据处理:根据索引表处理需要翻译的数据——转换成索引表可以识别的模式
##索引表处理:根据原始数据取索引表的子集,要求与原始数据一一对应
##merge一下

## 法1
suppressMessages(library(org.Hs.eg.db)) 
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"       "UNIGENE"     
## [26] "UNIPROT"
id <- read.table("id.txt",header = T,sep = "\t")
myid <- id %>% separate(id,c("ENSEMBL","a"))
keys <- myid[,1]
prbe2id <- AnnotationDbi::select(org.Hs.eg.db,keys=keys,c("ENSEMBL","SYMBOL"),keytype="ENSEMBL")
## 'select()' returned 1:1 mapping between keys and columns
## 法二
p2i_ENS <- toTable(org.Hs.egENSEMBL)
p2i_SYM <- toTable(org.Hs.egSYMBOL)
p2i <- merge(p2i_ENS,p2i_SYM)

ids2 <- merge(myid,p2i,by.x = "ENSEMBL",by.y = "ensembl_id")

## 别人处理数据的方法
suppressMessages(library(stringr))
ensembl_id = str_split(id[,1], pattern = "[.]", simplify = T)[,1]
ensembl_id = as.data.frame(ensembl_id)

注意select()

因为与tidyverse包里的select()重名,直接使用会报错,需要AnnotationDbi::select()

2-探针ID转换|hgu133a.db

rm(list=ls())
#导入数据
id_2 <- read.table("id2.txt",header = T,sep = "\t")
#数据直接能用,不需要处理
#myid_2 <- separate(id_2,id,c("ida","idb","idc"),fill = "right")
#获取翻译索引表
suppressMessages(library(hgu133a.db))
keytypes(hgu133a.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROBEID"      "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [26] "UNIGENE"      "UNIPROT"
p2g <- toTable(hgu133aSYMBOL)
#数据比较小,且原始数据没有重复,可以直接merge
ids <- merge(id_2,p2g,by.x="id", by.y="probe_id")

3-下载数据绘图

获得内置数据集“sCLLex”中基因“TP53”的表达量,根据分组信息画分组箱型图

思路

1、获得数据集(对象)的表达矩阵和临床信息 2、获得基因注释表 3、根据注释表找到基因对应的探针 4、回到表达矩阵,找到表达值 5、根据表达值和分组信息绘图

# 获取表达矩阵和临床信息
suppressMessages(library(CLL))
data(sCLLex)
str(sCLLex)
## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
##   ..@ experimentData   :Formal class 'MIAME' [package "Biobase"] with 13 slots
##   .. .. ..@ name             : chr ""
##   .. .. ..@ lab              : chr ""
##   .. .. ..@ contact          : chr ""
##   .. .. ..@ title            : chr ""
##   .. .. ..@ abstract         : chr ""
##   .. .. ..@ url              : chr ""
##   .. .. ..@ pubMedIds        : chr ""
##   .. .. ..@ samples          : list()
##   .. .. ..@ hybridizations   : list()
##   .. .. ..@ normControls     : list()
##   .. .. ..@ preprocessing    :List of 2
##   .. .. .. ..$ filenames  : chr [1:24] "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL10.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL11.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL12.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL13.CEL" ...
##   .. .. .. ..$ affyversion: chr NA
##   .. .. ..@ other            : list()
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 0 0
##   ..@ assayData        :<environment: 0x000000002accf6f0> 
##   ..@ phenoData        :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  2 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr [1:2] "Sample ID" "Stable/Progressive"
##   .. .. ..@ data             :'data.frame':  22 obs. of  2 variables:
##   .. .. .. ..$ SampleID: 'AsIs' chr [1:22] "CLL11" "CLL12" "CLL13" "CLL14" ...
##   .. .. .. ..$ Disease : Factor w/ 2 levels "progres.","stable": 1 2 1 1 1 1 2 2 1 2 ...
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ featureData      :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  12625 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "featureNames" "featureColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ annotation       : chr "hgu95av2"
##   ..@ protocolData     :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  22 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. ..@ .Data:List of 4
##   .. .. .. ..$ : int [1:3] 2 10 0
##   .. .. .. ..$ : int [1:3] 2 5 5
##   .. .. .. ..$ : int [1:3] 1 3 0
##   .. .. .. ..$ : int [1:3] 1 0 0
#表达矩阵
expr <- sCLLex@assayData[["exprs"]]#正确的方法是#使用exprs()但这样也行
#临床信息
pd <- pData(sCLLex)

# 获取翻译索引表:获得探针注释信息
#Annotation: hgu95av2
BiocManager::install("hgu95av2.db")
## Bioconductor version 3.12 (BiocManager 1.30.16), R 4.0.5 (2021-03-31)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'hgu95av2.db'
## Old packages: 'forestplot', 'haven', 'readr', 'tibble', 'utf8', 'vroom', 'xfun'
suppressMessages(library(hgu95av2.db))
keytypes(hgu95av2.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROBEID"      "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [26] "UNIGENE"      "UNIPROT"
p2g <- toTable(hgu95av2SYMBOL)
dim(p2g)
## [1] 11457     2
#获得翻译索引表的子集——TP53对应的探针
probe <- p2g[p2g$symbol == "TP53",][,1]

# 开始绘图

## 简单箱型图
expr_TP53 <- expr[probe,]
boxplot(expr_TP53[1,]~pd$Disease)

boxplot(expr_TP53)

#ggplot和ggpubr的输入形式都要求是dataframe里面的列,所以先合成dataframe
d <-cbind(as.data.frame(expr_TP53[1,]),as.data.frame(pd$Disease))
head(d)
##           expr_TP53[1, ] pd$Disease
## CLL11.CEL       2.723427   progres.
## CLL12.CEL       2.938367     stable
## CLL13.CEL       2.179530   progres.
## CLL14.CEL       2.762752   progres.
## CLL15.CEL       2.702204   progres.
## CLL16.CEL       2.717181   progres.
colnames(d)<-c("e","Disease")

## ggpubr
suppressMessages(library(ggpubr))
ggboxplot(d, x="Disease", y ="e",
             palette = "GEO",#色板"jco";"GEO"
             add = "jitter",
             color = "Disease",
             fill = "Disease")

## ggplot2
suppressMessages(library(ggplot2))
ggplot(d,aes(x=Disease,y=e,fill=Disease))+geom_boxplot()

rm(list = ls())

4-TCGA数据下载

找到BRCA1基因在TCGA数据库的乳腺癌数据集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表达情况 1、使用接口:http://www.cbioportal.org/ 2、下载数据 3、进行分析

BRCA1 <- read_tsv("plot.txt")
## Rows: 1068 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (4): Sample Id, BRCA1: Putative copy-number alterations from GISTIC, Mut...
## dbl (1): BRCA1: mRNA Expression, RSEM (Batch normalized from Illumina HiSeq_...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(BRCA1)
## [1] "Sample Id"                                                                   
## [2] "BRCA1: Putative copy-number alterations from GISTIC"                         
## [3] "BRCA1: mRNA Expression, RSEM (Batch normalized from Illumina HiSeq_RNASeqV2)"
## [4] "Mutations"                                                                   
## [5] "Copy Number Alterations"
colnames(BRCA1) <- c("id","group","expression","copy number")

suppressPackageStartupMessages(library(ggstatsplot))
ggbetweenstats(data = BRCA1, x="group", y="expression")

suppressPackageStartupMessages(library(ggpubr))
ggboxplot(data=BRCA1, x="group", y="expression", color = "group",add = "point", shape="group")
## Warning: Ignoring unknown aesthetics: show.legend

5-onconlnc数据下载、生存分析

找到TP53基因在TCGA数据库的乳腺癌数据集的表达量分组看其是否影响生存 1、oncolnc:http://www.oncolnc.org/ 2、下载数据进行分析

TP53 <- read_csv("BRCA_7157_50_50.csv")
## Rows: 1006 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Patient, Status, Group
## dbl (2): Days, Expression
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
suppressPackageStartupMessages(library(ggstatsplot))
ggbetweenstats(data = TP53 , x="Group", y="Expression")

#生存分析
suppressPackageStartupMessages(library(survival))
suppressPackageStartupMessages(library(survminer))
table(TP53$Status)
## 
## Alive  Dead 
##   871   135
## Alive Dead
## 871   135

TP53$Status <- ifelse(TP53$Status=='Dead',1,0)
sfit <- survfit(Surv(Days,Status)~Group,data = TP53)
ggsurvplot(sfit, conf.int = F, pval=T)

rm(list = ls())

6-表达矩阵和热图

下载数据集GSE17215的表达矩阵并且提取下面的基因画热图

# 下载表达矩阵和临床信息
suppressMessages(library(GEOquery)) 
#gget <- getGEO("GSE17215")
#save(gget,file="step1.Rdata")
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/step1.Rdata")

#表达矩阵
expr <- exprs(gget[[1]])
expr[1:5,1:5]
##           GSM431121 GSM431122  GSM431123  GSM431124  GSM431125
## 1007_s_at 481.59980 596.79062 2721.83475 2566.31563 2737.76320
## 1053_at   306.82972 224.97956  246.73288  251.85537  245.55677
## 117_at     30.32576  27.28749   40.12724   40.04009   48.14327
## 121_at    125.04107 147.67525  138.18675  160.26449  151.04978
## 1255_g_at  19.87286  18.08049   19.11944   19.19024   21.11857
# 临床信息
pd <- pData(gget[[1]])

save(expr,pd,file = "step1_expr_pd.Rdata")

## 提取基因
#把大神给的基因导进来
gene <- read.table("gene.txt",header = F,sep = " ")
## Warning in read.table("gene.txt", header = F, sep = " "): incomplete final line
## found by readTableHeader on 'gene.txt'
c_gene <- t(gene)[,1]

#获取转换索引
#看gget得知平台为GPL3921,对应R包hthgu133a
BiocManager::install("hthgu133a.db")
## Bioconductor version 3.12 (BiocManager 1.30.16), R 4.0.5 (2021-03-31)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'hthgu133a.db'
## Old packages: 'forestplot', 'haven', 'readr', 'tibble', 'utf8', 'vroom', 'xfun'
suppressMessages(library(hthgu133a.db)) 
ls(hthgu133a.db)#所以ls没有keytypes全
## [1] "conn"        "packageName"
keytypes(hthgu133a.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROBEID"      "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [26] "UNIGENE"      "UNIPROT"
p2g <- toTable(hthgu133aSYMBOL)

#生成一个db,目标基因对应的探针号
myp2g <- p2g[p2g$symbol %in% c_gene,]
head(myp2g)
##         probe_id symbol
## 851    201397_at  PHGDH
## 1150   201710_at  MYBL2
## 1256   201820_at   KRT5
## 1326   201890_at   RRM2
## 1416 201983_s_at   EGFR
## 1417 201984_s_at   EGFR
#目标是基因的热图,不是探针的热图,所以最终用来画图的db行名一定是基因
table(myp2g$symbol)
## 
##  ACTR3B    BAG1    BCL2   BIRC5   BLVRA   CCNB1   CCNE1   CDC20    CDC6    CDH3 
##       1       2       4       3       4       1       1       1       2       1 
##   CENPF   CEP55    EGFR   ERBB2    ESR1    EXO1   FGFR4   FOXA1   FOXC1    GRB7 
##       3       1       6       2       9       1       2       1       1       1 
##   KIF2C   KRT14    KRT5    MAPT    MDM2    MELK     MIA   MKI67    MLPH   MMP11 
##       2       1       1       4       5       1       1       4       1       4 
##   MYBL2     MYC    NAT1     PGR   PHGDH   PTTG1    RRM2   SFRP1 SLC39A6    TYMS 
##       1       1       1       1       1       1       2       3       2       2 
##   UBE2C 
##       1
#有的基因对应多个探针,我想看一下这几个探针检测的结果差异大不大
#选择一个对应多个探针的基因试一下
a1 <- myp2g[myp2g$symbol == "EGFR",][,1]
#提取这些探针的表达矩阵
b1 <- as.data.frame(expr[a1,])
boxplot(t(b1))

#差异非常明显,所以决定取最大值
rm(a1,b1)

#现在要获取expr的子集,包含目标基因对应的探针,一对多的取最大值
expr_1 <- expr[myp2g$probe_id,]
# 排序
##首先每个探针有6个样,取中位数作为排序依据,把中位数放到探针表新生成的一列中
myp2g$median <- apply(expr_1,1,median)
#如果把新列放到expr中会报错,因为重复运算
#错误代码:myp2g <- myp2g %>% arrange(desc(median))#arrange因为无法分组排序
myp2g_2 <- myp2g[order(myp2g$symbol,myp2g$median,decreasing = T),]
myp2g <- myp2g_2[!duplicated(myp2g_2$symbol),]#生成一个基因对应一个检测值最大的探针的翻译索引

#取expr子集,准备作图的db
expr_1 <- expr_1[myp2g$probe_id,]
rownames(expr_1) <- myp2g$symbol#把名字转换成基因名

#热图
suppressPackageStartupMessages(library(pheatmap))
n = t(scale(t(expr_1)))
pheatmap(n,show_rownames = F,clustering_distance_rows = "correlation")

## 7-分组绘制热图 下载数据集GSE24673的表达矩阵计算样本的相关性并且绘制热图,需要标记上样本分组信息 其实这道题可以不用做ID转换的

#下载数据
suppressMessages(library(GEOquery))
#gget_GSE24673 <- getGEO("GSE24673",getGPL = F)
#save(gget_GSE24673,file="GSE24675_gget.Rdata")
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/GSE24675_gget.Rdata")

expr_24673 <- exprs(gget_GSE24673[[1]])
pd_24673 <- pData(gget_GSE24673[[1]])
#分组信息
gse24673_group <- pd_24673[,"source_name_ch1"]
gse24673_group <- as.data.frame(gse24673_group,row.names = rownames(pd_24673))
colnames(gse24673_group) <- "group_list"

# ID转化————————————————————————————————————————
##注释信息GPL6244
#BiocManager::install("hugene10sttranscriptcluster.db")
#suppressMessages(library(hugene10sttranscriptcluster.db))
#keytypes(hugene10sttranscriptcluster.db)
#p2g <- toTable(hugene10sttranscriptclusterSYMBOL)
#save(p2g,file = "annotation_GPL6244.Rdata")
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/annotation_GPL6244.Rdata")

p2g <- p2g[p2g$probe_id %in% rownames(expr_24673),]
#expr肯定多个探针对应1个基因,我要找到基因对应的最佳探针,取注释表的子集

#给expr排序去重,先生成每列的中位数
#expr和p2g应该一一对应?
expr_1 <- expr_24673[p2g$probe_id,]
dim(expr_1)
## [1] 19610    11
## [1] 19610    11
p2g$median <- apply(expr_1,1,median)
p2g <- p2g %>% arrange(desc(symbol,median))
p2g_1 <- p2g[!duplicated(p2g$symbol),]
expr_1 <- expr_1[p2g_1$probe_id,]
rownames(expr_1) <- p2g_1$symbol

#了解表达矩阵
boxplot(expr_1)

expr_1[c("GAPDH","ACTB"),]
##       GSM607938 GSM607939 GSM607940 GSM607941 GSM607942 GSM607943 GSM607944
## GAPDH    12.592    12.706    12.659    12.469    12.335    12.469    12.691
## ACTB     11.645    11.660    11.641    11.943    11.916    11.953    11.780
##       GSM607945 GSM607946 GSM607947 GSM607948
## GAPDH    12.941    12.907    13.224    13.576
## ACTB     12.011    12.170    12.024    11.912
#分组箱型图
suppressMessages(library(ggplot2))
suppressMessages(library(reshape2))
exprl <- melt(expr_1)
str(exprl)
## 'data.frame':    206294 obs. of  3 variables:
##  $ Var1 : Factor w/ 18754 levels "ZZZ3","ZZEF1",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Var2 : Factor w/ 11 levels "GSM607938","GSM607939",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value: num  8.57 8.22 7.67 8.85 5.97 ...
colnames(exprl) <- c("gene","sample","exp")
#处理分组信息
exprl$group <- rep(gse24673_group[,1],each=nrow(expr_1))

ggplot(exprl,aes(x=sample,y=exp,fill = group))+
  geom_boxplot()

#热图
suppressMessages(library(pheatmap))
#expr_heat <- as.data.frame(t(scale(t(expr_1))))
#expr_heat[c("GAPDH","ACTB"),]
#p <- pheatmap(expr_heat,show_rownames = F,clustering_distance_rows = "correlation",annotation_col = gse24673_group)

expr_h <- as.data.frame(expr_1)
pheatmap(expr_h,scale = "row",show_rownames = F, annotation_col = gse24673_group)

rm(list=ls())

分析:

8-当R包难以下载时

找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 对应的R的bioconductor注释包,并且安装它!

#BiocManager::install("hugene10sttranscriptcluster.db")

options()$repos
##     CRAN 
## "@CRAN@"
options()$BioC_mirror 
## NULL
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
#BiocManager::install("请输入自己找到的R包",ask = F,update = F)
options()$repos
##                                         CRAN 
## "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"
options()$BioC_mirror
## [1] "https://mirrors.ustc.edu.cn/bioc/"

9-寻找表达量最大的探针|apply()函数

下载数据集GSE42872的表达矩阵,并且分别挑选出 所有样本的(平均表达量/sd/mad/)最大的探针,并且找到它们对应的基因。

suppressMessages(library(GEOquery))
gget_42872 <- getGEO("GSE42872",getGPL = F)
## Found 1 file(s)
## GSE42872_series_matrix.txt.gz
## Rows: 33297 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## dbl (7): ID_REF, GSM1052615, GSM1052616, GSM1052617, GSM1052618, GSM1052619,...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
expr_42872 <- exprs(gget_42872[[1]])
pd_42872 <- pData(gget_42872[[1]])
save(expr_42872,pd_42872,file = "expr_pd_42872.Rdata")

#排序
sort(apply(expr_42872,1,mean),decreasing = T)[1]
##  7978905 
## 14.53288
## 7978905
## 14.53288

sort(apply(expr_42872,1,sd),decreasing = T)[1]
##  8133876 
## 3.166429
## 8133876
## 3.166429

sort(apply(expr_42872,1,mad),decreasing = T)[1]
##  8133876 
## 4.268561
## 8133876
## 4.268561

#寻找对应基因probe_id = 7978905
suppressMessages(library(hugene10sttranscriptcluster.db)) 
keytypes(hugene10sttranscriptcluster.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROBEID"      "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [26] "UNIGENE"      "UNIPROT"
p2g <- toTable(hugene10sttranscriptclusterSYMBOL)
p2g %>% filter(probe_id == "7978905")
## [1] probe_id symbol  
## <0 rows> (or 0-length row.names)
#结果注释文件里面没有这个探针?
p2g %>% filter(probe_id %in% 8133876)
##   probe_id symbol
## 1  8133876   CD36

分析

我写的代码复杂多了,因为我不知道合并过程,是一步一步运算的; 另外,apply()这个函数的结果是生成一个向量,里面的元素有自己的名字,所以直接使用[1]即可选出探针+表达量 sort()是简单排序

问题

注释文件里面没有那个探针,这就很尴尬了

10-差异分析

下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵

load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/expr_pd_42872.Rdata")
group <- pd_42872$title
group <- as.data.frame(group) %>% separate(group,c(1,2,3,"process",5))
group <- group[,4]
group <- factor(group,levels = c("Vemurafenib","Control"))

## ID转换
p2g <- p2g[p2g$probe_id %in% rownames(expr_42872),]
expr_42872 <- expr_42872[p2g$probe_id,]
p2g$median <- apply(expr_42872,1,median)
p2g <- p2g %>% arrange(desc(symbol,median))
p2g <- p2g[!duplicated(p2g$symbol),]
expr_42872 <- expr_42872[p2g$probe_id,]
rownames(expr_42872) <- p2g$symbol

## 了解表达矩阵
### 内参GAPDH,ACTB
expr_42872["GAPDH",]
## GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620 
##    14.3187    14.3622    14.3638    14.4085    14.3569    14.3229
expr_42872["ACTB",]
## GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620 
##    13.8811    13.9002    13.8822    13.7773    13.6732    13.5363
boxplot(expr_42872)

### 分组箱型图、光滑密度图、聚类分析
library(ggplot2)
library(ggpubr)
library(reshape2)
theme_set(theme_pubclean())

exprl <- melt(expr_42872)
#一个sample被重复了nrow(expr)次
exprl$group <- rep(group,each=nrow(expr_42872))
colnames(exprl)[1:2] <- c("gene","sample")

#箱型图
ggplot(exprl,aes(x=sample,y=value,fill=group))+geom_boxplot()

#密度图
ggplot(exprl,aes(x=value,fill = group))+geom_density()+facet_wrap(exprl$sample)

#聚类分析
exprh <- expr_42872 
colnames(exprh) <- group
hc <- hclust(dist(t(exprh)))
plot(hc)

#PCA
#BiocManager::install("ggfortify")
library(ggfortify)
df <- as.data.frame(t(expr_42872))
df$group <- group
autoplot(prcomp(df[,1:(ncol(df)-1)]),data=df,colour='group')

save(expr_42872,group,file = "final_expr_group.Rdata")

## 差异分析
#制作分组矩阵
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
#输入分组依据的数据,得到每个样品的对应位置,0代表否,1代表是
design <- model.matrix(~0+group)
colnames(design)=levels(group)
rownames(design)=colnames(expr_42872)
design
##            Vemurafenib Control
## GSM1052615           0       1
## GSM1052616           0       1
## GSM1052617           0       1
## GSM1052618           1       0
## GSM1052619           1       0
## GSM1052620           1       0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
#制作比较矩阵
#这一步很关键,makeContrasts(**实验组-对照组**,levels = design)
contrast.matrix<-makeContrasts(paste("Vemurafenib","Control",sep = "-"),levels = design)
contrast.matrix ##这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
##              Contrasts
## Levels        Vemurafenib-Control
##   Vemurafenib                   1
##   Control                      -1
##           Contrasts
## Levels     progres.-stable
##   progres.               1一定要记住让实验组做1
##   stable                -1

#开始进行差异表达分析

##step 1 
fit <- lmFit(expr_42872,design)

##step2 
fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2)  ## default no trend !!!
##eBayes() with trend=TRUE

##step3  
tempOutput = topTable(fit2, coef=1, n=Inf)#原始结果
nrDEG = na.omit(tempOutput) #删除空集
save(nrDEG, file = "GSE42872_dif_whole.Rdata")
head(nrDEG)
##           logFC   AveExpr         t      P.Value    adj.P.Val        B
## CD36   5.780170  7.370282  79.48115 1.137956e-16 2.140837e-12 26.81057
## DUSP6 -4.212683  9.106625 -62.20220 1.725717e-15 1.231191e-11 25.05005
## DCT    5.633027  8.763220  61.48263 1.963308e-15 1.231191e-11 24.95805
## SPRY2 -3.801663  9.726468 -53.77478 8.661682e-15 4.073806e-11 23.84792
## MOXD1  3.263063 10.171635  46.91674 3.922086e-14 1.235966e-10 22.62821
## ETV4  -3.843247  9.667077 -46.89544 3.941847e-14 1.235966e-10 22.62401
##检查
expr_42872[c("CD36","DUSP6"),]
##       GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619 GSM1052620
## CD36      4.5461     4.4021    4.49239   10.25060    10.2148   10.31570
## DUSP6    11.2531    11.2076   11.17820    6.98791     7.0293    6.98364
##筛选
dif <- tempOutput %>% filter(adj.P.Val <0.01 & abs(logFC) > 2)
dim(dif)
## [1] 135   6
#筛选出来247个基因
write.csv(dif,file="Differential_limma_matrix_GSE42872.csv")

# 画热图
library(pheatmap)
getgene <- head(rownames(dif),25)
expr_heat <- expr_42872[getgene,]
group_heat <- data.frame(group,row.names=colnames(expr_heat))
pheatmap(expr_heat,sacle="row",show_rownames = F,annotation_col = group_heat)

# 画火山图
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/GSE42872_dif_whole.Rdata")
suppressPackageStartupMessages(library(cowplot)) 
suppressPackageStartupMessages(library(ggrepel)) 
suppressPackageStartupMessages(library(ggstatsplot)) 

#准备数据
nrDEG$change <- as.factor(with(nrDEG,ifelse( adj.P.Val < 0.01,ifelse(logFC > 0, "up", "down"),"not" )))
nrDEG$gene <- rownames(nrDEG)

ggplot(data = nrDEG, aes(x = logFC, y = -log10(adj.P.Val))) +
  ##选定数据源和x,y坐标
  geom_point(size = 1.75, aes(color = change), show.legend = T) +
  ###描点,设置大小、颜色分类和图示
  scale_color_manual(values = c("royalblue", "grey","red")) + 
  ####颜色值
  ylim(0, 10) + 
  ######纵坐标的量程
  labs(x = 'Log2(fold change)', y = '-log10(adp)')+ 
  #######标签
  geom_hline(yintercept = -log10(0.01), linetype = 'dotdash', color = 'grey20') + 
  #######添加水平参考线,p<0.01
  geom_vline(xintercept = c(-2, 2), linetype = 'dotdash', color = 'grey20') + 
  #添加垂直参考线,log(fold_change)绝对值大于1
  geom_text_repel(data = filter(nrDEG, abs(logFC)>2),
                  ###第一步选标签来源数据,利用subset()选子集
                  size = 2,
                  aes(label = gene))#映射
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 104 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps