20 question

安装一些R包：
数据包： ALL, CLL, pasilla, airway
软件包：limma，DESeq2，clusterProfiler
工具包：reshape2
绘图包：ggplot2
不同领域的R包使用频率不一样，在生物信息学领域，尤其需要掌握bioconductor系列包。

# install package from CRAN
install.package("reshape2") #reshape2

# install package from bioconductor
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("CLL", version = "3.8")

了解ExpressionSet对象，比如CLL包里面就有data(sCLLex)，找到它包含的元素，提取其表达矩阵(使用exprs函数)，查看其大小.
参考：http://www.bio-info-trainee.com/bioconductor_China/software/limma.html
参考：https://github.com/bioconductor-china/basic/blob/master/ExpressionSet.md

library(CLL)

## Loading required package: affy

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

data(sCLLex)
exprSet=exprs(sCLLex)
##sCLLex是依赖于CLL这个package的一个对象
samples=sampleNames(sCLLex)
pdata=pData(sCLLex)
group_list=as.character(pdata[,2])
dim(exprSet)

## [1] 12625    22

exprSet[1:5,1:5]

##           CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
## 1000_at    5.743132  6.219412  5.523328  5.340477  5.229904
## 1001_at    2.285143  2.291229  2.287986  2.295313  2.662170
## 1002_f_at  3.309294  3.318466  3.354423  3.327130  3.365113
## 1003_s_at  1.085264  1.117288  1.084010  1.103217  1.074243
## 1004_at    7.544884  7.671801  7.474025  7.152482  6.902932

了解 str,head,help函数，作用于第二步提取到的表达矩阵

str(sCLLex)

## Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
##   ..@ experimentData   :Formal class 'MIAME' [package "Biobase"] with 13 slots
##   .. .. ..@ name             : chr ""
##   .. .. ..@ lab              : chr ""
##   .. .. ..@ contact          : chr ""
##   .. .. ..@ title            : chr ""
##   .. .. ..@ abstract         : chr ""
##   .. .. ..@ url              : chr ""
##   .. .. ..@ pubMedIds        : chr ""
##   .. .. ..@ samples          : list()
##   .. .. ..@ hybridizations   : list()
##   .. .. ..@ normControls     : list()
##   .. .. ..@ preprocessing    :List of 2
##   .. .. .. ..$ filenames  : chr [1:24] "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL10.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL11.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL12.CEL" "/export/home/rgentlem/RG/Genetics/Sabina/CLL/CLL13.CEL" ...
##   .. .. .. ..$ affyversion: chr NA
##   .. .. ..@ other            : list()
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 0 0
##   ..@ assayData        :<environment: 0x00000000196caec8> 
##   ..@ phenoData        :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  2 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr [1:2] "Sample ID" "Stable/Progressive"
##   .. .. ..@ data             :'data.frame':  22 obs. of  2 variables:
##   .. .. .. ..$ SampleID: 'AsIs' chr [1:22] "CLL11" "CLL12" "CLL13" "CLL14" ...
##   .. .. .. ..$ Disease : Factor w/ 2 levels "progres.","stable": 1 2 1 1 1 1 2 2 1 2 ...
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ featureData      :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  12625 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "featureNames" "featureColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ annotation       : chr "hgu95av2"
##   ..@ protocolData     :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
##   .. .. ..@ varMetadata      :'data.frame':  0 obs. of  1 variable:
##   .. .. .. ..$ labelDescription: chr(0) 
##   .. .. ..@ data             :'data.frame':  22 obs. of  0 variables
##   .. .. ..@ dimLabels        : chr [1:2] "sampleNames" "sampleColumns"
##   .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. .. .. ..@ .Data:List of 1
##   .. .. .. .. .. ..$ : int [1:3] 1 1 0
##   ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
##   .. .. ..@ .Data:List of 4
##   .. .. .. ..$ : int [1:3] 2 10 0
##   .. .. .. ..$ : int [1:3] 2 5 5
##   .. .. .. ..$ : int [1:3] 1 3 0
##   .. .. .. ..$ : int [1:3] 1 0 0

head(sCLLex)

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1 features, 22 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: CLL11.CEL CLL12.CEL ... CLL9.CEL (22 total)
##   varLabels: SampleID Disease
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2

help(sCLLex)

## starting httpd help server ... done

安装并了解 hgu95av2.db 包,看看 ls(“package:hgu95av2.db”) 后显示的那些变量

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("hgu95av2.db", version = "3.8")

library(hgu95av2.db)

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: org.Hs.eg.db

##

##

ls("package:hgu95av2.db")

##  [1] "hgu95av2"              "hgu95av2.db"          
##  [3] "hgu95av2_dbconn"       "hgu95av2_dbfile"      
##  [5] "hgu95av2_dbInfo"       "hgu95av2_dbschema"    
##  [7] "hgu95av2ACCNUM"        "hgu95av2ALIAS2PROBE"  
##  [9] "hgu95av2CHR"           "hgu95av2CHRLENGTHS"   
## [11] "hgu95av2CHRLOC"        "hgu95av2CHRLOCEND"    
## [13] "hgu95av2ENSEMBL"       "hgu95av2ENSEMBL2PROBE"
## [15] "hgu95av2ENTREZID"      "hgu95av2ENZYME"       
## [17] "hgu95av2ENZYME2PROBE"  "hgu95av2GENENAME"     
## [19] "hgu95av2GO"            "hgu95av2GO2ALLPROBES" 
## [21] "hgu95av2GO2PROBE"      "hgu95av2MAP"          
## [23] "hgu95av2MAPCOUNTS"     "hgu95av2OMIM"         
## [25] "hgu95av2ORGANISM"      "hgu95av2ORGPKG"       
## [27] "hgu95av2PATH"          "hgu95av2PATH2PROBE"   
## [29] "hgu95av2PFAM"          "hgu95av2PMID"         
## [31] "hgu95av2PMID2PROBE"    "hgu95av2PROSITE"      
## [33] "hgu95av2REFSEQ"        "hgu95av2SYMBOL"       
## [35] "hgu95av2UNIGENE"       "hgu95av2UNIPROT"

理解 head(toTable(hgu95av2SYMBOL)) 的用法，找到 TP53 基因对应的探针ID

ids=toTable(hgu95av2SYMBOL)
ids[ids$symbol=="TP53",]

##       probe_id symbol
## 966    1939_at   TP53
## 997  1974_s_at   TP53
## 1420  31618_at   TP53

理解探针与基因的对应关系，总共多少个基因，基因最多对应多少个探针，是哪些基因，是不是因为这些基因很长，所以在其上面设计多个探针呢？

length(unique(ids$symbol))

## [1] 8585

tail(sort(table(ids$symbol)))

## 
## YME1L1  GAPDH INPP4A    MYB PTGER3  STAT1 
##      7      8      8      8      8      8

table(sort(table(ids$symbol)))

## 
##    1    2    3    4    5    6    7    8 
## 6555 1428  451  102   22   16    6    5

plot(table(sort(table(ids$symbol))))

第二步提取到的表达矩阵是12625个探针在22个样本的表达量矩阵，找到那些不在 hgu95av2.db 包收录的对应着SYMBOL的探针。提示：有1165个探针是没有对应基因名字的。

str(exprSet)

##  num [1:12625, 1:22] 5.74 2.29 3.31 1.09 7.54 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
##   ..$ : chr [1:22] "CLL11.CEL" "CLL12.CEL" "CLL13.CEL" "CLL14.CEL" ...

dim(exprSet)

## [1] 12625    22

exprSet[1:5,1:5]

##           CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
## 1000_at    5.743132  6.219412  5.523328  5.340477  5.229904
## 1001_at    2.285143  2.291229  2.287986  2.295313  2.662170
## 1002_f_at  3.309294  3.318466  3.354423  3.327130  3.365113
## 1003_s_at  1.085264  1.117288  1.084010  1.103217  1.074243
## 1004_at    7.544884  7.671801  7.474025  7.152482  6.902932

table(rownames(exprSet) %in% ids$probe_id)

## 
## FALSE  TRUE 
##  1165 11460

noMap = setdiff(rownames(exprSet),ids$probe_id)

过滤表达矩阵，删除那1165个没有对应基因名字的探针。

exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]
dim(exprSet)

## [1] 11460    22

整合表达矩阵，多个探针对应一个基因的情况下，只保留在所有样本里面平均表达量最大的那个探针。

提示，理解 tapply,by,aggregate,split 函数 , 首先对每个基因找到最大表达量的探针。
然后根据得到探针去过滤原始表达矩阵

ids=ids[match(rownames(exprSet),ids$probe_id),]
head(ids)

##    probe_id  symbol
## 1   1000_at   MAPK3
## 2   1001_at    TIE1
## 3 1002_f_at CYP2C19
## 4 1003_s_at   CXCR5
## 5   1004_at   CXCR5
## 6   1005_at   DUSP1

exprSet[1:5,1:5]

##           CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
## 1000_at    5.743132  6.219412  5.523328  5.340477  5.229904
## 1001_at    2.285143  2.291229  2.287986  2.295313  2.662170
## 1002_f_at  3.309294  3.318466  3.354423  3.327130  3.365113
## 1003_s_at  1.085264  1.117288  1.084010  1.103217  1.074243
## 1004_at    7.544884  7.671801  7.474025  7.152482  6.902932

tmp = by(exprSet,ids$symbol,function(x) rownames(x)[which.max(rowMeans(x))] )
probes = as.character(tmp)
exprSet=exprSet[rownames(exprSet) %in% probes ,]
dim(exprSet)

## [1] 8585   22

把过滤后的表达矩阵更改行名为基因的symbol，因为这个时候探针和基因是一对一关系了。

rownames(exprSet)=ids[match(rownames(exprSet),ids$probe_id),2]
exprSet[1:5,1:5]

##         CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
## MAPK3    5.743132  6.219412  5.523328  5.340477  5.229904
## TIE1     2.285143  2.291229  2.287986  2.295313  2.662170
## CYP2C19  3.309294  3.318466  3.354423  3.327130  3.365113
## CXCR5    7.544884  7.671801  7.474025  7.152482  6.902932
## DUSP1    5.083793  7.610593  7.631311  6.518594  5.059087

library(reshape2)
exprSet_L=melt(exprSet)
colnames(exprSet_L)=c('probe','sample','value')
exprSet_L$group=rep(group_list,each=nrow(exprSet))

对第10步得到的表达矩阵进行探索，先画第一个样本的所有基因的表达量的boxplot,hist,density ，然后画所有样本的这些图

参考：http://bio-info-trainee.com/tmp/basic_visualization_for_expression_matrix.html
理解ggplot2的绘图语法，数据和图形元素的映射关系

library(ggplot2)
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
print(p)

p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_violin()
print(p)

p=ggplot(exprSet_L,aes(value,fill=group))+geom_histogram(bins = 200)+facet_wrap(~sample, nrow = 4)
print(p)

p=ggplot(exprSet_L,aes(value,col=group))+geom_density()+facet_wrap(~sample, nrow = 4)
print(p)

p=ggplot(exprSet_L,aes(value,col=group))+geom_density()
print(p)

p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
p=p+stat_summary(fun.y="mean",geom="point",shape=23,size=3,fill="red")
p=p+theme_set(theme_set(theme_bw(base_size=20)))
p=p+theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1),axis.title=element_blank())
print(p)

理解统计学指标mean,median,max,min,sd,var,mad并计算出每个基因在所有样本的这些统计学指标，最后按照mad值排序，取top 50 mad值的基因，得到列表。注意：这个题目出的并不合规，请仔细看。

g_mean <- tail(sort(apply(exprSet,1,mean)),50)
g_median <- tail(sort(apply(exprSet,1,median)),50)
g_max <- tail(sort(apply(exprSet,1,max)),50)
g_min <- tail(sort(apply(exprSet,1,min)),50)
g_sd <- tail(sort(apply(exprSet,1,sd)),50)
g_var <- tail(sort(apply(exprSet,1,var)),50)
g_mad <- tail(sort(apply(exprSet,1,mad)),50)
g_mad

##     PFN2   TNFSF9 ARHGAP44   P2RY14  THEMIS2      LPL    ANXA4    DUSP6 
## 1.481294 1.485155 1.488032 1.505107 1.507287 1.532212 1.533327 1.551320 
##    DUSP5     H1FX     FLNA   CLEC2B   TSPYL2   ZNF266   S100A9    NR4A2 
## 1.553782 1.557412 1.574436 1.578055 1.582430 1.587748 1.608285 1.612875 
##    TGFBI     ARF6    APBB2     VCAN    RBM38     CAPG   PLXNC1     RGS2 
## 1.643149 1.654548 1.674443 1.681098 1.703638 1.713747 1.718297 1.770151 
##   RNASE6    VAMP5     CYBB     GNLY     CCL3     OAS1    TRIB2  ZNF804A 
## 1.775430 1.791017 1.811973 1.814364 1.862311 1.883509 1.937294 1.986163 
##      IGH    PCDH9    VIPR1   COBLL1  GUSBP11   S100A8      HBB   LHFPL2 
## 2.090866 2.144223 2.171912 2.179666 2.204212 2.220970 2.267136 2.317045 
##     FCN1    ZAP70    IGLC1   LGALS1      FOS   SLAMF1     TCF7      DMD 
## 2.371590 2.579046 2.590895 2.600604 2.938078 2.944105 2.993352 3.071541 
##  IGF2BP3   FAM30A 
## 3.234011 3.982191

names(g_mad)

##  [1] "PFN2"     "TNFSF9"   "ARHGAP44" "P2RY14"   "THEMIS2"  "LPL"     
##  [7] "ANXA4"    "DUSP6"    "DUSP5"    "H1FX"     "FLNA"     "CLEC2B"  
## [13] "TSPYL2"   "ZNF266"   "S100A9"   "NR4A2"    "TGFBI"    "ARF6"    
## [19] "APBB2"    "VCAN"     "RBM38"    "CAPG"     "PLXNC1"   "RGS2"    
## [25] "RNASE6"   "VAMP5"    "CYBB"     "GNLY"     "CCL3"     "OAS1"    
## [31] "TRIB2"    "ZNF804A"  "IGH"      "PCDH9"    "VIPR1"    "COBLL1"  
## [37] "GUSBP11"  "S100A8"   "HBB"      "LHFPL2"   "FCN1"     "ZAP70"   
## [43] "IGLC1"    "LGALS1"   "FOS"      "SLAMF1"   "TCF7"     "DMD"     
## [49] "IGF2BP3"  "FAM30A"

根据第12步骤得到top 50 mad值的基因列表来取表达矩阵的子集，并且热图可视化子表达矩阵。试试看其它5种热图的包的不同效果。

library(pheatmap)
choose_gene=names(tail(sort(apply(exprSet,1,mad)),50))
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)

14. 取不同统计学指标mean,median,max,mean,sd,var,mad的各top50基因列表，使用UpSetR包来看他们之间的overlap情况。

library(UpSetR)
g_all <- unique(c(names(g_mean),names(g_median),names(g_max),names(g_min),
names(g_sd),names(g_var),names(g_mad) ))
dat=data.frame(g_all=g_all,
g_mean=ifelse(g_all %in% names(g_mean) ,1,0),
g_median=ifelse(g_all %in% names(g_median) ,1,0),
g_max=ifelse(g_all %in% names(g_max) ,1,0),
g_min=ifelse(g_all %in% names(g_min) ,1,0),
g_sd=ifelse(g_all %in% names(g_sd) ,1,0),
g_var=ifelse(g_all %in% names(g_var) ,1,0),
g_mad=ifelse(g_all %in% names(g_mad) ,1,0)
)
upset(dat,nsets = 7)

15. 在第二步的基础上面提取CLL包里面的data(sCLLex) 数据对象的样本的表型数据。

pdata=pData(sCLLex)
group_list=as.character(pdata[,2])
group_list

##  [1] "progres." "stable"   "progres." "progres." "progres." "progres."
##  [7] "stable"   "stable"   "progres." "stable"   "progres." "stable"  
## [13] "progres." "stable"   "stable"   "progres." "progres." "progres."
## [19] "progres." "progres." "progres." "stable"

dim(exprSet)

## [1] 8585   22

exprSet[1:5,1:5]

##         CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
## MAPK3    5.743132  6.219412  5.523328  5.340477  5.229904
## TIE1     2.285143  2.291229  2.287986  2.295313  2.662170
## CYP2C19  3.309294  3.318466  3.354423  3.327130  3.365113
## CXCR5    7.544884  7.671801  7.474025  7.152482  6.902932
## DUSP1    5.083793  7.610593  7.631311  6.518594  5.059087

对所有样本的表达矩阵进行聚类并且绘图，然后添加样本的临床表型数据信息(更改样本名)

pdata=pData(sCLLex)
group_list=as.character(pdata[,2])
group_list

##  [1] "progres." "stable"   "progres." "progres." "progres." "progres."
##  [7] "stable"   "stable"   "progres." "stable"   "progres." "stable"  
## [13] "progres." "stable"   "stable"   "progres." "progres." "progres."
## [19] "progres." "progres." "progres." "stable"

dim(exprSet)

## [1] 8585   22

exprSet[1:5,1:5]

##         CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
## MAPK3    5.743132  6.219412  5.523328  5.340477  5.229904
## TIE1     2.285143  2.291229  2.287986  2.295313  2.662170
## CYP2C19  3.309294  3.318466  3.354423  3.327130  3.365113
## CXCR5    7.544884  7.671801  7.474025  7.152482  6.902932
## DUSP1    5.083793  7.610593  7.631311  6.518594  5.059087

对所有样本的表达矩阵进行PCA分析并且绘图，同样要添加表型信息。

library(ggfortify)
df=as.data.frame(t(exprSet))
df$group=group_list
autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')

18. 根据表达矩阵及样本分组信息进行批量T检验，得到检验结果表格

library(ggfortify)
df=as.data.frame(t(exprSet))
df$group=group_list
autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')

19. 使用limma包对表达矩阵及样本分组信息进行差异分析，得到差异分析表格，重点看logFC和P值，画个火山图(就是logFC和-log10(P值)的散点图。)。

# DEG by limma
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design

##           progres. stable
## CLL11.CEL        1      0
## CLL12.CEL        0      1
## CLL13.CEL        1      0
## CLL14.CEL        1      0
## CLL15.CEL        1      0
## CLL16.CEL        1      0
## CLL17.CEL        0      1
## CLL18.CEL        0      1
## CLL19.CEL        1      0
## CLL20.CEL        0      1
## CLL21.CEL        1      0
## CLL22.CEL        0      1
## CLL23.CEL        1      0
## CLL24.CEL        0      1
## CLL2.CEL         0      1
## CLL3.CEL         1      0
## CLL4.CEL         1      0
## CLL5.CEL         1      0
## CLL6.CEL         1      0
## CLL7.CEL         1      0
## CLL8.CEL         1      0
## CLL9.CEL         0      1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$`factor(group_list)`
## [1] "contr.treatment"

contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
contrast.matrix

##           Contrasts
## Levels     progres.-stable
##   progres.               1
##   stable                -1

##这个矩阵声明，我们要把progres.组跟stable进行差异分析比较
##step1
fit <- lmFit(exprSet,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要，大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)

##              logFC  AveExpr         t      P.Value  adj.P.Val        B
## TBC1D2B -1.0284628 5.620700 -5.837398 8.240961e-06 0.02236713 3.351813
## CLIC1    0.9888221 9.954273  5.772843 9.560006e-06 0.02236713 3.230775
## DLEU1    1.8301554 6.950685  5.740883 1.029092e-05 0.02236713 3.170615
## SH3BP2  -1.3835699 4.463438 -5.735418 1.042149e-05 0.02236713 3.160313
## GPM6A    2.5471980 6.915045  5.043180 5.268833e-05 0.08731397 1.821657
## YTHDC2  -0.5187135 7.602354 -4.873724 7.881207e-05 0.08731397 1.485027

## volcano plot
DEG=nrDEG
logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) )
DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
library(ggplot2)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value), color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)

20. 对T检验结果的P值和limma包差异分析的P值画散点图，看看哪些基因相差很大。

### different P values
head(nrDEG)
head(DEG_t.test)
DEG_t.test=DEG_t.test[rownames(nrDEG),]
plot(DEG_t.test[,3],nrDEG[,1])
plot(DEG_t.test[,4],nrDEG[,4])
plot(-log10(DEG_t.test[,4]),-log10(nrDEG[,4]))

exprSet['GAPDH',]
exprSet['ACTB',]
exprSet['DLEU1',]
library(ggplot2)
library(ggpubr)
my_comparisons <- list(
c("stable", "progres.")
)
dat=data.frame(group=group_list,
sampleID= names(exprSet['DLEU1',]),
values= as.numeric(exprSet['DLEU1',]))
ggboxplot(
dat, x = "group", y = "values",
color = "group",
add = "jitter"
)+
stat_compare_means(comparisons = my_comparisons, method = "t.test")
## heatmap
library(pheatmap)
choose_gene=head(rownames(nrDEG),25)
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)

怎么用for循环计算1+2+3+….+99+100的平均数是多少

sum=0
for(i in 1:100){
  sum=sum+i
}
sum

## [1] 5050