题目:http://www.bio-info-trainee.com/3409.html
答案代码:https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/3-r-20-codes.R
笔记:https://app.yinxiang.com/shard/s38/nl/27509653/7162e7d8-2c05-4dd7-94dd-b987b9a9f13e
2021-8-13
setwd("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20")
rm(list=ls())
#BiocManager::install("pasilla")
#BiocManager::install("airway")
suppressPackageStartupMessages(library(airway))#气道平滑肌细胞
suppressPackageStartupMessages(library(pasilla))
suppressPackageStartupMessages(library(CLL))#慢性淋巴细胞白血病
suppressPackageStartupMessages(library(ALL))#急性淋巴细胞白血病
suppressMessages(library(limma))
suppressMessages(library(DESeq2))
suppressMessages(library(clusterProfiler))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
suppressMessages(library(tidyverse))
data("sCLLex")
expr_sCLLex <- exprs(sCLLex)
pd_sCLLex <- pData(sCLLex)
str(expr_sCLLex)
## num [1:12625, 1:22] 5.74 2.29 3.31 1.09 7.54 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
## ..$ : chr [1:22] "CLL11.CEL" "CLL12.CEL" "CLL13.CEL" "CLL14.CEL" ...
head(expr_sCLLex)
## CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL
## 1000_at 5.743132 6.219412 5.523328 5.340477 5.229904 4.920686 5.325348
## 1001_at 2.285143 2.291229 2.287986 2.295313 2.662170 2.278040 2.350796
## 1002_f_at 3.309294 3.318466 3.354423 3.327130 3.365113 3.568353 3.502440
## 1003_s_at 1.085264 1.117288 1.084010 1.103217 1.074243 1.073097 1.091264
## 1004_at 7.544884 7.671801 7.474025 7.152482 6.902932 7.368660 6.456285
## 1005_at 5.083793 7.610593 7.631311 6.518594 5.059087 4.855161 5.176975
## CLL18.CEL CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL CLL23.CEL CLL24.CEL
## 1000_at 4.826131 5.212387 5.285830 5.581859 6.251678 5.480752 5.216033
## 1001_at 2.325163 2.432635 2.256547 2.348389 2.263849 2.264434 2.344079
## 1002_f_at 3.394410 3.617099 3.279726 3.391734 3.306811 3.341444 3.798335
## 1003_s_at 1.076470 1.132558 1.096870 1.138386 1.061184 1.046188 1.082169
## 1004_at 6.824862 7.304803 8.757756 6.695295 7.372185 7.616749 6.839187
## 1005_at 4.874563 4.097580 9.250585 7.657463 7.683677 8.700667 5.546996
## CLL2.CEL CLL3.CEL CLL4.CEL CLL5.CEL CLL6.CEL CLL7.CEL CLL8.CEL
## 1000_at 5.966942 5.397508 5.281720 5.414718 5.460626 5.897821 5.253883
## 1001_at 2.350073 2.406846 2.341961 2.372928 2.356978 2.222276 2.254772
## 1002_f_at 3.427736 3.453564 3.412944 3.411922 3.396466 3.247276 3.255148
## 1003_s_at 1.501367 1.191339 1.139510 1.153548 1.135671 1.074631 1.166247
## 1004_at 7.545673 8.802729 7.307752 7.491507 8.063202 7.014543 8.019108
## 1005_at 9.611601 5.732182 6.483191 6.072558 9.919125 5.149411 4.989604
## CLL9.CEL
## 1000_at 5.214155
## 1001_at 2.358544
## 1002_f_at 3.365746
## 1003_s_at 1.151184
## 1004_at 7.432331
## 1005_at 5.339848
suppressPackageStartupMessages(library(hgu95av2.db))
ls("package:hgu95av2.db")
## [1] "hgu95av2" "hgu95av2.db" "hgu95av2_dbconn"
## [4] "hgu95av2_dbfile" "hgu95av2_dbInfo" "hgu95av2_dbschema"
## [7] "hgu95av2ACCNUM" "hgu95av2ALIAS2PROBE" "hgu95av2CHR"
## [10] "hgu95av2CHRLENGTHS" "hgu95av2CHRLOC" "hgu95av2CHRLOCEND"
## [13] "hgu95av2ENSEMBL" "hgu95av2ENSEMBL2PROBE" "hgu95av2ENTREZID"
## [16] "hgu95av2ENZYME" "hgu95av2ENZYME2PROBE" "hgu95av2GENENAME"
## [19] "hgu95av2GO" "hgu95av2GO2ALLPROBES" "hgu95av2GO2PROBE"
## [22] "hgu95av2MAP" "hgu95av2MAPCOUNTS" "hgu95av2OMIM"
## [25] "hgu95av2ORGANISM" "hgu95av2ORGPKG" "hgu95av2PATH"
## [28] "hgu95av2PATH2PROBE" "hgu95av2PFAM" "hgu95av2PMID"
## [31] "hgu95av2PMID2PROBE" "hgu95av2PROSITE" "hgu95av2REFSEQ"
## [34] "hgu95av2SYMBOL" "hgu95av2UNIGENE" "hgu95av2UNIPROT"
ids <- toTable(hgu95av2SYMBOL)
ids[ids$symbol=="TP53",]
## probe_id symbol
## 966 1939_at TP53
## 997 1974_s_at TP53
## 1420 31618_at TP53
summary(ids)
## probe_id symbol
## Length:11457 Length:11457
## Class :character Class :character
## Mode :character Mode :character
#table(ids$symbol)
notin <- expr_sCLLex[!rownames(expr_sCLLex)%in%ids$probe_id,]
dim(notin)
## [1] 1168 22
expr_sCLLex <- expr_sCLLex[rownames(expr_sCLLex)%in%ids$probe_id,]
ids <- ids[ids$probe_id%in%rownames(expr_sCLLex),]
head(ids)
## probe_id symbol
## 1 1000_at MAPK3
## 2 1001_at TIE1
## 3 1002_f_at CYP2C19
## 4 1003_s_at CXCR5
## 5 1004_at CXCR5
## 6 1005_at DUSP1
head(expr_sCLLex)
## CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL
## 1000_at 5.743132 6.219412 5.523328 5.340477 5.229904 4.920686 5.325348
## 1001_at 2.285143 2.291229 2.287986 2.295313 2.662170 2.278040 2.350796
## 1002_f_at 3.309294 3.318466 3.354423 3.327130 3.365113 3.568353 3.502440
## 1003_s_at 1.085264 1.117288 1.084010 1.103217 1.074243 1.073097 1.091264
## 1004_at 7.544884 7.671801 7.474025 7.152482 6.902932 7.368660 6.456285
## 1005_at 5.083793 7.610593 7.631311 6.518594 5.059087 4.855161 5.176975
## CLL18.CEL CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL CLL23.CEL CLL24.CEL
## 1000_at 4.826131 5.212387 5.285830 5.581859 6.251678 5.480752 5.216033
## 1001_at 2.325163 2.432635 2.256547 2.348389 2.263849 2.264434 2.344079
## 1002_f_at 3.394410 3.617099 3.279726 3.391734 3.306811 3.341444 3.798335
## 1003_s_at 1.076470 1.132558 1.096870 1.138386 1.061184 1.046188 1.082169
## 1004_at 6.824862 7.304803 8.757756 6.695295 7.372185 7.616749 6.839187
## 1005_at 4.874563 4.097580 9.250585 7.657463 7.683677 8.700667 5.546996
## CLL2.CEL CLL3.CEL CLL4.CEL CLL5.CEL CLL6.CEL CLL7.CEL CLL8.CEL
## 1000_at 5.966942 5.397508 5.281720 5.414718 5.460626 5.897821 5.253883
## 1001_at 2.350073 2.406846 2.341961 2.372928 2.356978 2.222276 2.254772
## 1002_f_at 3.427736 3.453564 3.412944 3.411922 3.396466 3.247276 3.255148
## 1003_s_at 1.501367 1.191339 1.139510 1.153548 1.135671 1.074631 1.166247
## 1004_at 7.545673 8.802729 7.307752 7.491507 8.063202 7.014543 8.019108
## 1005_at 9.611601 5.732182 6.483191 6.072558 9.919125 5.149411 4.989604
## CLL9.CEL
## 1000_at 5.214155
## 1001_at 2.358544
## 1002_f_at 3.365746
## 1003_s_at 1.151184
## 1004_at 7.432331
## 1005_at 5.339848
ids$mean <- apply(expr_sCLLex,1,mean)
ids <- arrange(ids,desc(symbol,mean))
ids <- ids[!duplicated(ids$symbol),]
expr_sCLLex <- expr_sCLLex[ids$probe_id,]
rownames(expr_sCLLex) <- ids$symbol
save(expr_sCLLex,pd_sCLLex,file = "expr_pd_sCLLex.Rdata")
### 第一个sample
rm(list = ls())
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")
boxplot(expr_sCLLex[,1])
table(pd_sCLLex$Disease)
##
## progres. stable
## 14 8
pd_sCLLex$group <- as.factor(pd_sCLLex$Disease )
exprl <- melt(expr_sCLLex)
colnames(exprl) <- c("gene","sample","expr")
exprl$group <- rep(pd_sCLLex$group,each=nrow(expr_sCLLex))
table(exprl$group)
##
## progres. stable
## 120148 68656
exprl_CLL11 <- exprl %>% filter(sample == "CLL11.CEL")
ggplot(exprl_CLL11,aes(x=expr))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(exprl_CLL11,aes(x=expr))+geom_density()
ggplot(exprl,aes(x=sample,y=expr,fill=group))+geom_boxplot()+
theme(text = element_text(size=13),
axis.text.x = element_text(angle=135, hjust=1))
ggplot(exprl,aes(x=expr,fill=group))+geom_histogram()+facet_wrap(exprl$sample)+theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(exprl,aes(x=expr,col=group))+geom_density()+facet_wrap(exprl$sample)+theme_bw()
ggplot(exprl,aes(expr,col=group))+geom_density()
suppressMessages(library(ggpubr))
ggviolin(exprl,x="sample",y="expr",fill = "group")+
theme(text = element_text(size=10),
axis.text.x = element_text(angle = 90,hjust = 1))
#gpairs
#install.packages("gpairs")
library(gpairs)
expr_pairs <- expr_sCLLex[,1:6]
gpairs(expr_pairs)
#cluster
expr_cluster <- expr_sCLLex
colnames(expr_cluster) <- pd_sCLLex$group
out.dist <- dist(t(expr_cluster),method = "euclidean")
out.hclust <- hclust(out.dist,method = "complete")
plot(out.hclust)
hc <- hclust(dist(t(expr_cluster)))#t指turn转;dist是欧氏距离
#这个代码只是把上面的代码简并了一下,是一样的,method默认是"complete"
plot(hc)
#PCA
pc <- prcomp(t(expr_sCLLex),scale=T)
pcx <- data.frame(pc$x)
pcr <- cbind(samples=rownames(pcx),pd_sCLLex$group,pcx)
ggplot(pcr,aes(PC1,PC2))+geom_point(size=3,aes(color=pd_sCLLex$group))+
geom_text(aes(label=samples),hjust=-0.1,vjust=-0.3,size=2.5)
#BiocManager::install("ggfortify")
suppressMessages(library(ggfortify))
df <- as.data.frame(t(expr_sCLLex))
df$group=pd_sCLLex$group
autoplot(prcomp(df[,1:(ncol(df)-1)]),data=df,colour='group')
save(expr_sCLLex,exprl,pd_sCLLex,file = "expr.Rdata")
rm(list=ls())
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")
library(pheatmap)
# 差异分析
## 分组设计矩阵
group_list <- factor(pd_sCLLex$Disease)
design <- model.matrix(~0+group_list)
colnames(design) <- levels(group_list)
rownames(design) <- colnames(expr_sCLLex)
design
## progres. stable
## CLL11.CEL 1 0
## CLL12.CEL 0 1
## CLL13.CEL 1 0
## CLL14.CEL 1 0
## CLL15.CEL 1 0
## CLL16.CEL 1 0
## CLL17.CEL 0 1
## CLL18.CEL 0 1
## CLL19.CEL 1 0
## CLL20.CEL 0 1
## CLL21.CEL 1 0
## CLL22.CEL 0 1
## CLL23.CEL 1 0
## CLL24.CEL 0 1
## CLL2.CEL 0 1
## CLL3.CEL 1 0
## CLL4.CEL 1 0
## CLL5.CEL 1 0
## CLL6.CEL 1 0
## CLL7.CEL 1 0
## CLL8.CEL 1 0
## CLL9.CEL 0 1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group_list
## [1] "contr.treatment"
## 比较矩阵
contrast<-makeContrasts(paste("progres.","stable",sep = "-"),levels = design)
contrast ##这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
## Contrasts
## Levels progres.-stable
## progres. 1
## stable -1
DEG <- function(expr42872,design,contrast){
fit = lmFit(expr42872, design)
fit2 = contrasts.fit(fit, contrast)
fit2 = eBayes(fit2)
mtx = topTable(fit2, coef=1, n=Inf)
deg_mtx = na.omit(mtx)
return(deg_mtx)
}
DEG_mtx <- DEG(expr_sCLLex,design,contrast)#得到差异分析的结果
expr_sCLLex["TBC1D2B",]#检查
## CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL CLL18.CEL
## 5.191501 6.584007 5.161696 5.021954 4.829027 5.473421 6.193658 6.835712
## CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL CLL23.CEL CLL24.CEL CLL2.CEL CLL3.CEL
## 5.218793 6.617105 5.210030 6.197718 4.783289 5.245601 6.652838 5.247241
## CLL4.CEL CLL5.CEL CLL6.CEL CLL7.CEL CLL8.CEL CLL9.CEL
## 5.406573 5.832655 5.447913 4.812068 5.817831 5.874775
## 绘制热图:利用归一化后的表达矩阵
getgene <- head(rownames(DEG_mtx),25)#获取选择的基因列表
getgene_matrix <- expr_sCLLex[getgene,]#还是利用表达矩阵画图
getgene_matrix <- t(scale(t(getgene_matrix)))
group_list <- as.data.frame(group_list)
rownames(group_list) <- colnames(getgene_matrix)
pheatmap(getgene_matrix,
show_rownames = F,
annotation_col = group_list,
border_color=NA)
save(DEG_mtx,group_list,file = "limma_output_sCLLex.Rdata")
rm(list = ls())
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/limma_output_sCLLex.Rdata")
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")
## 准备数据
DEG_mtx$change <- as.factor(ifelse(DEG_mtx$P.Value<0.05,
ifelse(DEG_mtx$logFC>0,"up","down"),"not"))#标注方向
DEG_mtx$gene_name <- rownames(DEG_mtx)#准备一下标注列
## 开始绘图
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(ggstatsplot))
ggplot(data = DEG_mtx, aes(x = logFC, y = -log10(P.Value))) +
##选定数据源和x,y坐标
geom_point(size = 1.75, aes(color = change), show.legend = T,alpha=0.4) +
###描点,设置大小、颜色分类和图示
scale_color_manual(values = c("royalblue", "grey","red")) +
####颜色值
ylim(0, 5) +
######纵坐标的量程
labs(x = 'Log2(fold change)', y = '-log10(p)')+
#######标签
geom_hline(yintercept = -log10(0.05), linetype = 'dotdash', color = 'grey20') +
#######添加水平参考线,p<0.01
geom_vline(xintercept = c(-1, 1), linetype = 'dotdash', color = 'grey20') +
#添加垂直参考线,log(fold_change)绝对值大于1
geom_text_repel(data = filter(DEG_mtx, abs(logFC)>1 & P.Value < 0.05),
###第一步选标签来源数据,利用subset()选子集
size = 2,
aes(label = gene_name))+#映射
theme_bw()
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#mean-算术平均值
mean(expr_sCLLex[1,])
## [1] 6.696363
#var-方差
var(expr_sCLLex[1,])
## [1] 0.1678388
#mad-绝对中位差
mad(expr_sCLLex[1,])
## [1] 0.2591946
#sd-标准差
sd(expr_sCLLex[1,])
## [1] 0.4096813
#median-中位数
median(expr_sCLLex[1,])
## [1] 6.648816
mad <- apply(expr_sCLLex,1,mad)
#getTop50 <- head(sort(mad))
getTop50 <- as.data.frame(cbind(rownames(expr_sCLLex),as.numeric(mad)))
getTop50 <- arrange(getTop50,desc(mad))
Top50 <- head(getTop50$V1,50)
expr_mad <- expr_sCLLex[Top50,]
expr_mad <- expr_mad[,-23]
suppressMessages(library(pheatmap))
pheatmap(expr_mad,scale = "row",show_rownames = F,
treeheight_row = 15,
treeheight_col = 15,
cluster_cols = T,
annotation_col = group_list)
#2
expr_mad <- t(scale(t(expr_mad)))
pheatmap(expr_mad,show_rownames = F)
#3
heatmap(expr_mad)
#4
expr_madl <- melt(expr_mad)
colnames(expr_madl) <- c("gene","sample","value")
ggplot(expr_madl,aes(sample,gene))+
geom_tile(aes(fill=value),colour = "white")+
scale_fill_gradient(low = "white",high = "steelblue")
#5
#install.packages("gplots")
suppressPackageStartupMessages(library(gplots))
#heatmap.2(expr_mad[1:10,],key = T,symkey = F,density.info = "none", trace = "none")
symkey 即图例下面的刻度
trace图片上面的虚线
density.info 图例的虚线
图纸过小会报错:Error in .External.graphics(C_layout, num.rows, num.cols, mat, as.integer(num.figures), :
图形狀態不对;
拉大图纸就能解决了
#median
STATS_FUN <- function(median,expr_sCLLex,n){
a <- apply(expr_sCLLex,1,median)
b <- as.data.frame(cbind(rownames(expr_sCLLex),a))
b <- arrange(b,desc(a))
Topgene <- head(b$V1,n)
return(Topgene)
}
e_median <- STATS_FUN(median,expr_sCLLex,50)
e_mad <- STATS_FUN(mad,expr_sCLLex,50)
e_sd <- STATS_FUN(sd,expr_sCLLex,50)
e_var <- STATS_FUN(var,expr_sCLLex,50)
e_mean <- STATS_FUN(mean,expr_sCLLex,50)
e_min <- STATS_FUN(min,expr_sCLLex,50)
e_max <- STATS_FUN(max,expr_sCLLex,50)
#install.packages("UpSetR")
suppressPackageStartupMessages(library(UpSetR))
suppressPackageStartupMessages(library(magrittr))
all <- unique(c(e_mean,e_mad,e_max,e_min,e_sd,e_var,e_median,fromLast=T))#取所有集合的交集
e_all <- data.frame(all ,
e_mean=ifelse(all %in% e_mean,1,0),
e_median=ifelse(all %in% e_median,1,0),
e_max=ifelse(all %in% e_max,1,0),
e_min=ifelse(all %in% e_min,1,0),
e_sd=ifelse(all %in% e_sd,1,0),
e_var=ifelse(all %in% e_var,1,0),
e_mad=ifelse(all %in% e_mad,1,0))
head(e_all)#相当于在总体里面分别绘制每个集合
## all e_mean e_median e_max e_min e_sd e_var e_mad
## 1 ADAR 1 1 0 1 0 0 1
## 2 SEPTIN7 1 1 0 0 0 0 1
## 3 PCBP1 1 0 0 0 0 0 0
## 4 SYPL1 1 1 0 0 0 0 1
## 5 CCT2 1 1 0 0 0 0 1
## 6 DYNLL1 1 0 0 0 0 0 0
upset(e_all,nsets = 7,sets.bar.color = "#BD1111")#绘制韦恩图,查看数据的交集
clust <- t(expr_sCLLex)
clust_dist <- dist(clust,method = "euclidean")
hc <- hclust(clust_dist,method = "ward.D")#one of "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC)
#美化
suppressPackageStartupMessages(library(factoextra))
fviz_dend(hc, k = 4 ,cex = 0.5,k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),color_labels_by_k = TRUE, rect = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
pc <- prcomp(t(expr_sCLLex),scale=T)
pcx <- data.frame(pc$x)
pcr <- cbind(samples=rownames(pcx),pd_sCLLex$Disease,pcx)
ggplot(pcr,aes(PC1,PC2))+geom_point(size=3,aes(color=pd_sCLLex$Disease))+
geom_text(aes(label=samples),hjust=-0.1,vjust=-0.3,size=2.5)
# BiocManager::install('ggfortify')
library(ggfortify)
df=as.data.frame(t(expr_sCLLex))
df$group=group_list
#autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')
#根据表达矩阵及样本分组信息进行批量T检验,得到检验结果表格 #相当于自己手动做一个差异分析
#把表达矩阵顺序理一下
load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")
group_list <- as.factor(pd_sCLLex$Disease)
dat <- expr_sCLLex
group1 = which(group_list == levels(group_list)[1])
group2 = which(group_list == levels(group_list)[2])
dat1 = dat[, group1]
dat2 = dat[, group2]
dat = cbind(dat1, dat2)
#看一下是不是排好了
pd <- pd_sCLLex[colnames(dat),]
pd$Disease
## [1] progres. progres. progres. progres. progres. progres. progres. progres.
## [9] progres. progres. progres. progres. progres. progres. stable stable
## [17] stable stable stable stable stable stable
## Levels: progres. stable
datl <- melt(dat)
colnames(datl) <- c("gene","sample","value")
datl$group <- rep(pd$Disease,each=nrow(dat))
ggplot(datl,aes(x=sample,y=value,color=group))+geom_boxplot()
pvals <- apply(expr_sCLLex, 1, function(x){
t.test(as.numeric(x)~group_list)$p.value
})
#说明
t.test(as.numeric(expr_sCLLex[1,])~group_list)#t.test的结果是一个对象
##
## Welch Two Sample t-test
##
## data: as.numeric(expr_sCLLex[1, ]) by group_list
## t = -0.18376, df = 10.212, p-value = 0.8578
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5073847 0.4298704
## sample estimates:
## mean in group progres. mean in group stable
## 6.682270 6.721027
t.test(as.numeric(expr_sCLLex[1,])~group_list)$p.value#提取其中的p.value
## [1] 0.8578006
#调整P值
p.adj = p.adjust(pvals, method = "BH")
avg_1 = rowMeans(dat1)#每行取均值
## ZZZ3 ZZEF1 ZYX ZWINT ZW10 ZSWIM8
## 6.6822698 5.9423115 4.0781838 3.1241724 4.4772748 4.1420219
## ZSCAN9 ZSCAN26 ZSCAN12 ZRSR2 ZPR1 ZPBP
## 4.8713534 3.5238623 5.4831598 6.6684662 5.9943004 3.6880121
avg_2 = rowMeans(dat2)
log2FC = avg_2-avg_1#原来这就是logFC的意义
DEG_t.test = cbind(avg_1, avg_2, log2FC, pvals, p.adj)
DEG_t.test = DEG_t.test[order(DEG_t.test[,4]),]
DEG_t.test = as.data.frame(DEG_t.test)
head(DEG_t.test)
## avg_1 avg_2 log2FC pvals p.adj
## SGSM2 7.875615 8.791753 0.9161377 1.629755e-05 0.1398656
## PDE8A 6.622749 7.965007 1.3422581 4.058944e-05 0.1656022
## DLEU1 7.616197 5.786041 -1.8301554 6.965416e-05 0.1656022
## LDOC1 4.456446 2.152471 -2.3039752 8.993339e-05 0.1656022
## USP6NL 5.988866 7.058738 1.0698718 9.648226e-05 0.1656022
## COMMD4 4.157971 3.407405 -0.7505660 2.454557e-04 0.2586085
对T检验结果的P值和limma包差异分析的P值画散点图,看看哪些基因相差很大。
head(DEG_mtx)
## logFC AveExpr t P.Value adj.P.Val B change
## TBC1D2B -1.0284628 5.620700 -5.835358 8.339552e-06 0.02269911 3.317813 down
## CLIC1 0.9888221 9.954273 5.771016 9.668143e-06 0.02269911 3.198031 up
## DLEU1 1.8301554 6.950685 5.736605 1.046549e-05 0.02269911 3.133707 up
## SH3BP2 -1.3835699 4.463438 -5.731889 1.057987e-05 0.02269911 3.124878 down
## GPM6A 2.5471980 6.915045 5.038894 5.350441e-05 0.08807806 1.793442 up
## YTHDC2 -0.5187135 7.602354 -4.876863 7.860368e-05 0.08807806 1.473560 down
## gene_name
## TBC1D2B TBC1D2B
## CLIC1 CLIC1
## DLEU1 DLEU1
## SH3BP2 SH3BP2
## GPM6A GPM6A
## YTHDC2 YTHDC2
head(DEG_t.test)
## avg_1 avg_2 log2FC pvals p.adj
## SGSM2 7.875615 8.791753 0.9161377 1.629755e-05 0.1398656
## PDE8A 6.622749 7.965007 1.3422581 4.058944e-05 0.1656022
## DLEU1 7.616197 5.786041 -1.8301554 6.965416e-05 0.1656022
## LDOC1 4.456446 2.152471 -2.3039752 8.993339e-05 0.1656022
## USP6NL 5.988866 7.058738 1.0698718 9.648226e-05 0.1656022
## COMMD4 4.157971 3.407405 -0.7505660 2.454557e-04 0.2586085
DEG_t.test=DEG_t.test[rownames(DEG_mtx),]#排序使其一一对应
plot(DEG_t.test[,3],DEG_mtx[,1]) ## 可以看到logFC是相反的
plot(DEG_t.test[,4],DEG_mtx[,4]) # 可以看到使用limma包和t.test本身的p值差异尚可接受
plot(-log10(DEG_t.test[,4]),-log10(DEG_mtx[,4]))
library(ggplot2)
library(ggpubr)
my_comparisons <- list(
c("stable", "progres.")
)
dat=data.frame(group=group_list,
sampleID= names(expr_sCLLex['DLEU1',]),
values= as.numeric(expr_sCLLex['DLEU1',]))
ggboxplot(
dat, x = "group", y = "values",
color = "group",
add = "jitter"
)+
stat_compare_means(comparisons = my_comparisons, method = "t.test")