r-20.knit

生信人学习R语言20题

题目：http://www.bio-info-trainee.com/3409.html

答案代码：https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/3-r-20-codes.R

笔记：https://app.yinxiang.com/shard/s38/nl/27509653/7162e7d8-2c05-4dd7-94dd-b987b9a9f13e

2021-8-13

1-R包安装

setwd("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20") 
rm(list=ls()) 
#BiocManager::install("pasilla")
#BiocManager::install("airway")

suppressPackageStartupMessages(library(airway))#气道平滑肌细胞
suppressPackageStartupMessages(library(pasilla))
suppressPackageStartupMessages(library(CLL))#慢性淋巴细胞白血病
suppressPackageStartupMessages(library(ALL))#急性淋巴细胞白血病

suppressMessages(library(limma)) 
suppressMessages(library(DESeq2)) 
suppressMessages(library(clusterProfiler)) 

suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
suppressMessages(library(tidyverse))

2-ExpressionSet 对象

data("sCLLex")
expr_sCLLex <- exprs(sCLLex)
pd_sCLLex <- pData(sCLLex)

3-用来了解表达矩阵的函数

str(expr_sCLLex)

##  num [1:12625, 1:22] 5.74 2.29 3.31 1.09 7.54 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
##   ..$ : chr [1:22] "CLL11.CEL" "CLL12.CEL" "CLL13.CEL" "CLL14.CEL" ...

head(expr_sCLLex)

##           CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL
## 1000_at    5.743132  6.219412  5.523328  5.340477  5.229904  4.920686  5.325348
## 1001_at    2.285143  2.291229  2.287986  2.295313  2.662170  2.278040  2.350796
## 1002_f_at  3.309294  3.318466  3.354423  3.327130  3.365113  3.568353  3.502440
## 1003_s_at  1.085264  1.117288  1.084010  1.103217  1.074243  1.073097  1.091264
## 1004_at    7.544884  7.671801  7.474025  7.152482  6.902932  7.368660  6.456285
## 1005_at    5.083793  7.610593  7.631311  6.518594  5.059087  4.855161  5.176975
##           CLL18.CEL CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL CLL23.CEL CLL24.CEL
## 1000_at    4.826131  5.212387  5.285830  5.581859  6.251678  5.480752  5.216033
## 1001_at    2.325163  2.432635  2.256547  2.348389  2.263849  2.264434  2.344079
## 1002_f_at  3.394410  3.617099  3.279726  3.391734  3.306811  3.341444  3.798335
## 1003_s_at  1.076470  1.132558  1.096870  1.138386  1.061184  1.046188  1.082169
## 1004_at    6.824862  7.304803  8.757756  6.695295  7.372185  7.616749  6.839187
## 1005_at    4.874563  4.097580  9.250585  7.657463  7.683677  8.700667  5.546996
##           CLL2.CEL CLL3.CEL CLL4.CEL CLL5.CEL CLL6.CEL CLL7.CEL CLL8.CEL
## 1000_at   5.966942 5.397508 5.281720 5.414718 5.460626 5.897821 5.253883
## 1001_at   2.350073 2.406846 2.341961 2.372928 2.356978 2.222276 2.254772
## 1002_f_at 3.427736 3.453564 3.412944 3.411922 3.396466 3.247276 3.255148
## 1003_s_at 1.501367 1.191339 1.139510 1.153548 1.135671 1.074631 1.166247
## 1004_at   7.545673 8.802729 7.307752 7.491507 8.063202 7.014543 8.019108
## 1005_at   9.611601 5.732182 6.483191 6.072558 9.919125 5.149411 4.989604
##           CLL9.CEL
## 1000_at   5.214155
## 1001_at   2.358544
## 1002_f_at 3.365746
## 1003_s_at 1.151184
## 1004_at   7.432331
## 1005_at   5.339848

4-注释包

suppressPackageStartupMessages(library(hgu95av2.db))
ls("package:hgu95av2.db")

##  [1] "hgu95av2"              "hgu95av2.db"           "hgu95av2_dbconn"      
##  [4] "hgu95av2_dbfile"       "hgu95av2_dbInfo"       "hgu95av2_dbschema"    
##  [7] "hgu95av2ACCNUM"        "hgu95av2ALIAS2PROBE"   "hgu95av2CHR"          
## [10] "hgu95av2CHRLENGTHS"    "hgu95av2CHRLOC"        "hgu95av2CHRLOCEND"    
## [13] "hgu95av2ENSEMBL"       "hgu95av2ENSEMBL2PROBE" "hgu95av2ENTREZID"     
## [16] "hgu95av2ENZYME"        "hgu95av2ENZYME2PROBE"  "hgu95av2GENENAME"     
## [19] "hgu95av2GO"            "hgu95av2GO2ALLPROBES"  "hgu95av2GO2PROBE"     
## [22] "hgu95av2MAP"           "hgu95av2MAPCOUNTS"     "hgu95av2OMIM"         
## [25] "hgu95av2ORGANISM"      "hgu95av2ORGPKG"        "hgu95av2PATH"         
## [28] "hgu95av2PATH2PROBE"    "hgu95av2PFAM"          "hgu95av2PMID"         
## [31] "hgu95av2PMID2PROBE"    "hgu95av2PROSITE"       "hgu95av2REFSEQ"       
## [34] "hgu95av2SYMBOL"        "hgu95av2UNIGENE"       "hgu95av2UNIPROT"

5-找到TP53对应的探针ID

ids <- toTable(hgu95av2SYMBOL)
ids[ids$symbol=="TP53",]

##       probe_id symbol
## 966    1939_at   TP53
## 997  1974_s_at   TP53
## 1420  31618_at   TP53

6-探针和基因的对应关系

summary(ids)

##    probe_id            symbol         
##  Length:11457       Length:11457      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

#table(ids$symbol)

7&8-找到不在的并过滤掉

notin <- expr_sCLLex[!rownames(expr_sCLLex)%in%ids$probe_id,]
dim(notin)

## [1] 1168   22

expr_sCLLex <- expr_sCLLex[rownames(expr_sCLLex)%in%ids$probe_id,]

9&10—找到每个基因对应的表达量最大的探针

ids <- ids[ids$probe_id%in%rownames(expr_sCLLex),]
head(ids)

##    probe_id  symbol
## 1   1000_at   MAPK3
## 2   1001_at    TIE1
## 3 1002_f_at CYP2C19
## 4 1003_s_at   CXCR5
## 5   1004_at   CXCR5
## 6   1005_at   DUSP1

head(expr_sCLLex)

##           CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL
## 1000_at    5.743132  6.219412  5.523328  5.340477  5.229904  4.920686  5.325348
## 1001_at    2.285143  2.291229  2.287986  2.295313  2.662170  2.278040  2.350796
## 1002_f_at  3.309294  3.318466  3.354423  3.327130  3.365113  3.568353  3.502440
## 1003_s_at  1.085264  1.117288  1.084010  1.103217  1.074243  1.073097  1.091264
## 1004_at    7.544884  7.671801  7.474025  7.152482  6.902932  7.368660  6.456285
## 1005_at    5.083793  7.610593  7.631311  6.518594  5.059087  4.855161  5.176975
##           CLL18.CEL CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL CLL23.CEL CLL24.CEL
## 1000_at    4.826131  5.212387  5.285830  5.581859  6.251678  5.480752  5.216033
## 1001_at    2.325163  2.432635  2.256547  2.348389  2.263849  2.264434  2.344079
## 1002_f_at  3.394410  3.617099  3.279726  3.391734  3.306811  3.341444  3.798335
## 1003_s_at  1.076470  1.132558  1.096870  1.138386  1.061184  1.046188  1.082169
## 1004_at    6.824862  7.304803  8.757756  6.695295  7.372185  7.616749  6.839187
## 1005_at    4.874563  4.097580  9.250585  7.657463  7.683677  8.700667  5.546996
##           CLL2.CEL CLL3.CEL CLL4.CEL CLL5.CEL CLL6.CEL CLL7.CEL CLL8.CEL
## 1000_at   5.966942 5.397508 5.281720 5.414718 5.460626 5.897821 5.253883
## 1001_at   2.350073 2.406846 2.341961 2.372928 2.356978 2.222276 2.254772
## 1002_f_at 3.427736 3.453564 3.412944 3.411922 3.396466 3.247276 3.255148
## 1003_s_at 1.501367 1.191339 1.139510 1.153548 1.135671 1.074631 1.166247
## 1004_at   7.545673 8.802729 7.307752 7.491507 8.063202 7.014543 8.019108
## 1005_at   9.611601 5.732182 6.483191 6.072558 9.919125 5.149411 4.989604
##           CLL9.CEL
## 1000_at   5.214155
## 1001_at   2.358544
## 1002_f_at 3.365746
## 1003_s_at 1.151184
## 1004_at   7.432331
## 1005_at   5.339848

ids$mean <- apply(expr_sCLLex,1,mean)
ids <- arrange(ids,desc(symbol,mean))
ids <- ids[!duplicated(ids$symbol),]
expr_sCLLex <- expr_sCLLex[ids$probe_id,]
rownames(expr_sCLLex) <- ids$symbol
save(expr_sCLLex,pd_sCLLex,file = "expr_pd_sCLLex.Rdata")

11-了解表达矩阵

### 第一个sample
rm(list = ls())
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")
boxplot(expr_sCLLex[,1])

table(pd_sCLLex$Disease)

## 
## progres.   stable 
##       14        8

pd_sCLLex$group <- as.factor(pd_sCLLex$Disease )


exprl <- melt(expr_sCLLex)
colnames(exprl) <- c("gene","sample","expr")
exprl$group <- rep(pd_sCLLex$group,each=nrow(expr_sCLLex))
table(exprl$group)

## 
## progres.   stable 
##   120148    68656

exprl_CLL11 <- exprl %>% filter(sample == "CLL11.CEL")

ggplot(exprl_CLL11,aes(x=expr))+geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(exprl_CLL11,aes(x=expr))+geom_density()

ggplot(exprl,aes(x=sample,y=expr,fill=group))+geom_boxplot()+
  theme(text = element_text(size=13),
        axis.text.x = element_text(angle=135, hjust=1))

ggplot(exprl,aes(x=expr,fill=group))+geom_histogram()+facet_wrap(exprl$sample)+theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(exprl,aes(x=expr,col=group))+geom_density()+facet_wrap(exprl$sample)+theme_bw()

ggplot(exprl,aes(expr,col=group))+geom_density()

suppressMessages(library(ggpubr))
ggviolin(exprl,x="sample",y="expr",fill = "group")+
  theme(text = element_text(size=10),
        axis.text.x = element_text(angle = 90,hjust = 1))

#gpairs
#install.packages("gpairs")
library(gpairs)
expr_pairs <- expr_sCLLex[,1:6]
gpairs(expr_pairs)

#cluster
expr_cluster <- expr_sCLLex
colnames(expr_cluster) <- pd_sCLLex$group
out.dist <- dist(t(expr_cluster),method = "euclidean")
out.hclust <- hclust(out.dist,method = "complete")
plot(out.hclust)

hc <- hclust(dist(t(expr_cluster)))#t指turn转；dist是欧氏距离
#这个代码只是把上面的代码简并了一下，是一样的，method默认是"complete"
plot(hc)

#PCA
pc <- prcomp(t(expr_sCLLex),scale=T)
pcx <- data.frame(pc$x)
pcr <- cbind(samples=rownames(pcx),pd_sCLLex$group,pcx)
ggplot(pcr,aes(PC1,PC2))+geom_point(size=3,aes(color=pd_sCLLex$group))+
  geom_text(aes(label=samples),hjust=-0.1,vjust=-0.3,size=2.5)

#BiocManager::install("ggfortify")
suppressMessages(library(ggfortify)) 
df <- as.data.frame(t(expr_sCLLex))
df$group=pd_sCLLex$group
autoplot(prcomp(df[,1:(ncol(df)-1)]),data=df,colour='group')

save(expr_sCLLex,exprl,pd_sCLLex,file = "expr.Rdata")
rm(list=ls())

热图

load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")
library(pheatmap)

#  差异分析

## 分组设计矩阵
group_list <- factor(pd_sCLLex$Disease)
design <- model.matrix(~0+group_list)
colnames(design) <- levels(group_list)
rownames(design) <- colnames(expr_sCLLex)
design

##           progres. stable
## CLL11.CEL        1      0
## CLL12.CEL        0      1
## CLL13.CEL        1      0
## CLL14.CEL        1      0
## CLL15.CEL        1      0
## CLL16.CEL        1      0
## CLL17.CEL        0      1
## CLL18.CEL        0      1
## CLL19.CEL        1      0
## CLL20.CEL        0      1
## CLL21.CEL        1      0
## CLL22.CEL        0      1
## CLL23.CEL        1      0
## CLL24.CEL        0      1
## CLL2.CEL         0      1
## CLL3.CEL         1      0
## CLL4.CEL         1      0
## CLL5.CEL         1      0
## CLL6.CEL         1      0
## CLL7.CEL         1      0
## CLL8.CEL         1      0
## CLL9.CEL         0      1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group_list
## [1] "contr.treatment"

## 比较矩阵
contrast<-makeContrasts(paste("progres.","stable",sep = "-"),levels = design)
contrast ##这个矩阵声明，我们要把progres.组跟stable进行差异分析比较

##           Contrasts
## Levels     progres.-stable
##   progres.               1
##   stable                -1

DEG <- function(expr42872,design,contrast){
  fit = lmFit(expr42872, design)
  fit2 = contrasts.fit(fit, contrast)
  fit2 = eBayes(fit2)
  mtx = topTable(fit2, coef=1, n=Inf)
  deg_mtx = na.omit(mtx)
  return(deg_mtx)
}
DEG_mtx <- DEG(expr_sCLLex,design,contrast)#得到差异分析的结果
expr_sCLLex["TBC1D2B",]#检查

## CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL CLL18.CEL 
##  5.191501  6.584007  5.161696  5.021954  4.829027  5.473421  6.193658  6.835712 
## CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL CLL23.CEL CLL24.CEL  CLL2.CEL  CLL3.CEL 
##  5.218793  6.617105  5.210030  6.197718  4.783289  5.245601  6.652838  5.247241 
##  CLL4.CEL  CLL5.CEL  CLL6.CEL  CLL7.CEL  CLL8.CEL  CLL9.CEL 
##  5.406573  5.832655  5.447913  4.812068  5.817831  5.874775

## 绘制热图：利用归一化后的表达矩阵
getgene <- head(rownames(DEG_mtx),25)#获取选择的基因列表
getgene_matrix <- expr_sCLLex[getgene,]#还是利用表达矩阵画图
getgene_matrix <- t(scale(t(getgene_matrix)))
group_list <- as.data.frame(group_list)
rownames(group_list) <- colnames(getgene_matrix)
pheatmap(getgene_matrix,
         show_rownames = F,
         annotation_col = group_list,
         border_color=NA)

save(DEG_mtx,group_list,file = "limma_output_sCLLex.Rdata")
rm(list = ls())

火山图

load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/limma_output_sCLLex.Rdata")
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")

## 准备数据
DEG_mtx$change <- as.factor(ifelse(DEG_mtx$P.Value<0.05,
                                   ifelse(DEG_mtx$logFC>0,"up","down"),"not"))#标注方向
DEG_mtx$gene_name <- rownames(DEG_mtx)#准备一下标注列

## 开始绘图
suppressPackageStartupMessages(library(cowplot)) 
suppressPackageStartupMessages(library(ggrepel)) 
suppressPackageStartupMessages(library(ggstatsplot)) 

ggplot(data = DEG_mtx, aes(x = logFC, y = -log10(P.Value))) +
  ##选定数据源和x,y坐标
  geom_point(size = 1.75, aes(color = change), show.legend = T,alpha=0.4) +
  ###描点，设置大小、颜色分类和图示
  scale_color_manual(values = c("royalblue", "grey","red")) + 
  ####颜色值
  ylim(0, 5) + 
  ######纵坐标的量程
  labs(x = 'Log2(fold change)', y = '-log10(p)')+ 
  #######标签
  geom_hline(yintercept = -log10(0.05), linetype = 'dotdash', color = 'grey20') + 
  #######添加水平参考线，p<0.01
  geom_vline(xintercept = c(-1, 1), linetype = 'dotdash', color = 'grey20') + 
  #添加垂直参考线，log(fold_change)绝对值大于1
  geom_text_repel(data = filter(DEG_mtx, abs(logFC)>1 & P.Value < 0.05),
                  ###第一步选标签来源数据，利用subset（）选子集
                  size = 2,
                  aes(label = gene_name))+#映射
  theme_bw()

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text_repel).

## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

12-统计学概念

#mean-算术平均值
mean(expr_sCLLex[1,])

## [1] 6.696363

#var-方差
var(expr_sCLLex[1,])

## [1] 0.1678388

#mad-绝对中位差
mad(expr_sCLLex[1,])

## [1] 0.2591946

#sd-标准差
sd(expr_sCLLex[1,])

## [1] 0.4096813

#median-中位数
median(expr_sCLLex[1,])

## [1] 6.648816

mad <- apply(expr_sCLLex,1,mad)
#getTop50 <- head(sort(mad))
getTop50 <- as.data.frame(cbind(rownames(expr_sCLLex),as.numeric(mad)))
getTop50 <- arrange(getTop50,desc(mad))
Top50 <- head(getTop50$V1,50)

expr_mad <- expr_sCLLex[Top50,]
expr_mad <- expr_mad[,-23]

13-前50个基因4种方式画热图

suppressMessages(library(pheatmap))
pheatmap(expr_mad,scale = "row",show_rownames = F,
         treeheight_row = 15,
         treeheight_col = 15,
         cluster_cols = T,
         annotation_col = group_list)

#2
expr_mad <- t(scale(t(expr_mad)))
pheatmap(expr_mad,show_rownames = F)

#3
heatmap(expr_mad)

#4
expr_madl <- melt(expr_mad)
colnames(expr_madl) <- c("gene","sample","value")
ggplot(expr_madl,aes(sample,gene))+
  geom_tile(aes(fill=value),colour = "white")+
  scale_fill_gradient(low = "white",high = "steelblue")

#5
#install.packages("gplots")
suppressPackageStartupMessages(library(gplots))
#heatmap.2(expr_mad[1:10,],key = T,symkey = F,density.info = "none", trace = "none")

symkey 即图例下面的刻度
trace图片上面的虚线
density.info 图例的虚线

图纸过小会报错：Error in .External.graphics(C_layout, num.rows, num.cols, mat, as.integer(num.figures), :

图形狀態不对;

拉大图纸就能解决了

14-UpsetR集合重叠部分的可视化

#median
STATS_FUN <- function(median,expr_sCLLex,n){
  a <- apply(expr_sCLLex,1,median)
  b <- as.data.frame(cbind(rownames(expr_sCLLex),a))
  b <- arrange(b,desc(a))
  Topgene <- head(b$V1,n)
    return(Topgene)
}

e_median <- STATS_FUN(median,expr_sCLLex,50)
e_mad <- STATS_FUN(mad,expr_sCLLex,50)
e_sd <- STATS_FUN(sd,expr_sCLLex,50)
e_var <- STATS_FUN(var,expr_sCLLex,50)
e_mean <- STATS_FUN(mean,expr_sCLLex,50)
e_min <- STATS_FUN(min,expr_sCLLex,50)
e_max <- STATS_FUN(max,expr_sCLLex,50)

#install.packages("UpSetR")
suppressPackageStartupMessages(library(UpSetR))
suppressPackageStartupMessages(library(magrittr)) 

all <- unique(c(e_mean,e_mad,e_max,e_min,e_sd,e_var,e_median,fromLast=T))#取所有集合的交集

e_all <- data.frame(all ,
                    e_mean=ifelse(all %in% e_mean,1,0),
                    e_median=ifelse(all %in% e_median,1,0),
                    e_max=ifelse(all %in% e_max,1,0),
                    e_min=ifelse(all %in% e_min,1,0),
                    e_sd=ifelse(all %in% e_sd,1,0),
                    e_var=ifelse(all %in% e_var,1,0),
                    e_mad=ifelse(all %in% e_mad,1,0))
head(e_all)#相当于在总体里面分别绘制每个集合

##       all e_mean e_median e_max e_min e_sd e_var e_mad
## 1    ADAR      1        1     0     1    0     0     1
## 2 SEPTIN7      1        1     0     0    0     0     1
## 3   PCBP1      1        0     0     0    0     0     0
## 4   SYPL1      1        1     0     0    0     0     1
## 5    CCT2      1        1     0     0    0     0     1
## 6  DYNLL1      1        0     0     0    0     0     0

upset(e_all,nsets = 7,sets.bar.color = "#BD1111")#绘制韦恩图，查看数据的交集

16-聚类分析

clust <- t(expr_sCLLex)
clust_dist <- dist(clust,method = "euclidean")
hc <- hclust(clust_dist,method = "ward.D")#one of "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC)
#美化
suppressPackageStartupMessages(library(factoextra))
fviz_dend(hc, k = 4 ,cex = 0.5,k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),color_labels_by_k = TRUE, rect = TRUE)

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

17-PCA

法一：ggplot

pc <- prcomp(t(expr_sCLLex),scale=T)
pcx <- data.frame(pc$x)
pcr <- cbind(samples=rownames(pcx),pd_sCLLex$Disease,pcx)
ggplot(pcr,aes(PC1,PC2))+geom_point(size=3,aes(color=pd_sCLLex$Disease))+
  geom_text(aes(label=samples),hjust=-0.1,vjust=-0.3,size=2.5)

法二：ggfortify

# BiocManager::install('ggfortify')
library(ggfortify)
df=as.data.frame(t(expr_sCLLex))
df$group=group_list 
#autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')

18-t 检验

#根据表达矩阵及样本分组信息进行批量T检验，得到检验结果表格 #相当于自己手动做一个差异分析

#把表达矩阵顺序理一下
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")
group_list <- as.factor(pd_sCLLex$Disease)
dat <- expr_sCLLex
group1 = which(group_list == levels(group_list)[1])
group2 = which(group_list == levels(group_list)[2])
dat1 = dat[, group1]
dat2 = dat[, group2]
dat = cbind(dat1, dat2)

#看一下是不是排好了
pd <- pd_sCLLex[colnames(dat),]
pd$Disease

##  [1] progres. progres. progres. progres. progres. progres. progres. progres.
##  [9] progres. progres. progres. progres. progres. progres. stable   stable  
## [17] stable   stable   stable   stable   stable   stable  
## Levels: progres. stable

datl <- melt(dat)
colnames(datl) <- c("gene","sample","value")
datl$group <- rep(pd$Disease,each=nrow(dat))
ggplot(datl,aes(x=sample,y=value,color=group))+geom_boxplot()

pvals <- apply(expr_sCLLex, 1, function(x){
  t.test(as.numeric(x)~group_list)$p.value
})

#说明
t.test(as.numeric(expr_sCLLex[1,])~group_list)#t.test的结果是一个对象

## 
##  Welch Two Sample t-test
## 
## data:  as.numeric(expr_sCLLex[1, ]) by group_list
## t = -0.18376, df = 10.212, p-value = 0.8578
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5073847  0.4298704
## sample estimates:
## mean in group progres.   mean in group stable 
##               6.682270               6.721027

t.test(as.numeric(expr_sCLLex[1,])~group_list)$p.value#提取其中的p.value

## [1] 0.8578006

#调整P值
p.adj = p.adjust(pvals, method = "BH")
avg_1 = rowMeans(dat1)#每行取均值
## ZZZ3      ZZEF1        ZYX      ZWINT       ZW10     ZSWIM8 
## 6.6822698  5.9423115  4.0781838  3.1241724  4.4772748  4.1420219 
## ZSCAN9    ZSCAN26    ZSCAN12      ZRSR2       ZPR1       ZPBP 
## 4.8713534  3.5238623  5.4831598  6.6684662  5.9943004  3.6880121
avg_2 = rowMeans(dat2)

log2FC = avg_2-avg_1#原来这就是logFC的意义

DEG_t.test = cbind(avg_1, avg_2, log2FC, pvals, p.adj)
DEG_t.test = DEG_t.test[order(DEG_t.test[,4]),]
DEG_t.test = as.data.frame(DEG_t.test)
head(DEG_t.test)

##           avg_1    avg_2     log2FC        pvals     p.adj
## SGSM2  7.875615 8.791753  0.9161377 1.629755e-05 0.1398656
## PDE8A  6.622749 7.965007  1.3422581 4.058944e-05 0.1656022
## DLEU1  7.616197 5.786041 -1.8301554 6.965416e-05 0.1656022
## LDOC1  4.456446 2.152471 -2.3039752 8.993339e-05 0.1656022
## USP6NL 5.988866 7.058738  1.0698718 9.648226e-05 0.1656022
## COMMD4 4.157971 3.407405 -0.7505660 2.454557e-04 0.2586085

20-比较自己t-检验和limma的区别

对T检验结果的P值和limma包差异分析的P值画散点图，看看哪些基因相差很大。

head(DEG_mtx)

##              logFC  AveExpr         t      P.Value  adj.P.Val        B change
## TBC1D2B -1.0284628 5.620700 -5.835358 8.339552e-06 0.02269911 3.317813   down
## CLIC1    0.9888221 9.954273  5.771016 9.668143e-06 0.02269911 3.198031     up
## DLEU1    1.8301554 6.950685  5.736605 1.046549e-05 0.02269911 3.133707     up
## SH3BP2  -1.3835699 4.463438 -5.731889 1.057987e-05 0.02269911 3.124878   down
## GPM6A    2.5471980 6.915045  5.038894 5.350441e-05 0.08807806 1.793442     up
## YTHDC2  -0.5187135 7.602354 -4.876863 7.860368e-05 0.08807806 1.473560   down
##         gene_name
## TBC1D2B   TBC1D2B
## CLIC1       CLIC1
## DLEU1       DLEU1
## SH3BP2     SH3BP2
## GPM6A       GPM6A
## YTHDC2     YTHDC2

head(DEG_t.test)

##           avg_1    avg_2     log2FC        pvals     p.adj
## SGSM2  7.875615 8.791753  0.9161377 1.629755e-05 0.1398656
## PDE8A  6.622749 7.965007  1.3422581 4.058944e-05 0.1656022
## DLEU1  7.616197 5.786041 -1.8301554 6.965416e-05 0.1656022
## LDOC1  4.456446 2.152471 -2.3039752 8.993339e-05 0.1656022
## USP6NL 5.988866 7.058738  1.0698718 9.648226e-05 0.1656022
## COMMD4 4.157971 3.407405 -0.7505660 2.454557e-04 0.2586085

DEG_t.test=DEG_t.test[rownames(DEG_mtx),]#排序使其一一对应
plot(DEG_t.test[,3],DEG_mtx[,1]) ## 可以看到logFC是相反的

plot(DEG_t.test[,4],DEG_mtx[,4]) # 可以看到使用limma包和t.test本身的p值差异尚可接受

plot(-log10(DEG_t.test[,4]),-log10(DEG_mtx[,4]))

library(ggplot2)
library(ggpubr)
my_comparisons <- list(
  c("stable", "progres.")
)
dat=data.frame(group=group_list,
               sampleID= names(expr_sCLLex['DLEU1',]),
               values= as.numeric(expr_sCLLex['DLEU1',]))
ggboxplot(
  dat, x = "group", y = "values",
  color = "group",
  add = "jitter"
)+
  stat_compare_means(comparisons = my_comparisons, method = "t.test")