setwd("D:/R/R-4.0.5/bin/project_visual/visualization/scrip")
rm(list=ls())

suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
suppressMessages(library(tidyverse))
suppressMessages(library(clusterProfiler))

load("D:/R/R-4.0.5/bin/project_practise/SXtree/SXtree_beforeGEO/R-20/expr_pd_sCLLex.Rdata")

单一样本

箱型图

#箱型图
boxplot(expr_sCLLex[,1])

直方图和密度图

#ggplot()绘图数据准备
exprl <- melt(expr_sCLLex)
colnames(exprl) <- c("gene","sample","expr")
exprl$group <- rep(pd_sCLLex$Disease,each=nrow(expr_sCLLex))

exprl_CLL11 <- exprl %>% filter(sample == "CLL11.CEL")

#直方图和密度图
ggplot(exprl_CLL11,aes(x=expr))+geom_histogram()+theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(exprl_CLL11,aes(x=expr))+geom_density()+theme_bw()

分组绘制箱型图、直方图、密度图

#数据准备
table(pd_sCLLex$Disease)
## 
## progres.   stable 
##       14        8
pd_sCLLex$group <- as.factor(pd_sCLLex$Disease )

#分组箱型图
ggplot(exprl,aes(x=sample,y=expr,fill=group))+geom_boxplot()+
  theme(text = element_text(size=13),
        axis.text.x = element_text(angle=135, hjust=1))

#直方图
ggplot(exprl,aes(x=expr,fill=group))+geom_histogram()+facet_wrap(exprl$sample)+theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#密度图
ggplot(exprl,aes(x=expr,col=group))+geom_density()+facet_wrap(exprl$sample)+theme_bw()

ggplot(exprl,aes(expr,col=group))+geom_density()

suppressMessages(library(ggpubr))

#小提琴图
ggviolin(exprl,x="sample",y="expr",fill = "group")+
  theme(text = element_text(size=10),
        axis.text.x = element_text(angle = 90,hjust = 1))

gpairs

gpairs——多变量的相关关系,即多个变量两两画散点图

#install.packages("gpairs")
library(gpairs)
expr_pairs <- expr_sCLLex[,1:6]
gpairs(expr_pairs)

cluster聚类分析

expr_cluster <- expr_sCLLex
colnames(expr_cluster) <- pd_sCLLex$group
#out.dist <- dist(t(expr_cluster),method = "euclidean")
#out.hclust <- hclust(out.dist,method = "complete")
hc <- hclust(dist(t(expr_cluster)))#这个代码只是把上面的代码简并了一下,是一样的,method默认是"complete"
plot(hc)

高级树状聚类分析图的画法

数据准备hclust()

hc <- hclust(dist(t(expr_sCLLex),method = "euclidean"),
             method = "ward.D")
  • 聚类方法可以选:“ward.D”, “ward.D2”, “single”, “complete”, “average” (= UPGMA), “mcquitty” (= WPGMA), “median” (= WPGMC) or “centroid” (= UPGMC)其中之一,具体数学原理省略,可以试试看哪种结果比较好;
  • 输入样本间的距离、计算距离的方法、聚类方法;输出一个封装好的列表,直接plot列表就能出图

绘图fviz_dend()

suppressPackageStartupMessages(library(factoextra))#高级可视化的包
fviz_dend(hc, k = 4 ,cex = 0.5,k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),color_labels_by_k = T, rect = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

参数说明

  • k为分组
  • cex 字体大小
  • color_labels_by_k 是否同时给标签上色
  • rect 是否要外面的矩形方框

PCA

方法一:用ggplot()

准备数据

  • 最终结果:样本名+分组+分析结果

  • 使用运算函数:prcomp()结果输出一个对象,其中的x为主成分分析结果

pc <- prcomp(t(expr_sCLLex),scale=T)
class(pc)#pc是一个封装好的对象,其中的x是主成分 分析的结果
## [1] "prcomp"
pcx <- data.frame(pc$x)
head(pcx,3)
##                 PC1        PC2       PC3       PC4        PC5       PC6
## CLL11.CEL -46.85689  14.463746  17.26379  -1.77114   6.923367 -9.006708
## CLL12.CEL -22.03909   8.439319 -31.50484 -48.12062  35.350428 -2.473449
## CLL13.CEL -61.72712 -10.386196  28.51325  12.07738 -26.940805 -3.192176
##                PC7       PC8       PC9       PC10       PC11       PC12
## CLL11.CEL 14.73949 -6.934635 13.581109  -4.859836   4.268705  -1.111283
## CLL12.CEL -5.60240 -9.688525  5.002439   8.983478  -6.131755  -8.681008
## CLL13.CEL 13.45881  1.909465  4.927676 -17.793132 -23.149501 -17.621971
##                 PC13      PC14         PC15      PC16       PC17       PC18
## CLL11.CEL -6.7781881  17.37202  -4.35242499 18.496281   0.178317  17.884427
## CLL12.CEL  0.6323493 -25.15555  -0.07925103 22.019632 -13.905205  -7.864069
## CLL13.CEL -8.8274219 -19.33089 -23.03844446 -2.220767   9.475638 -13.662748
##                PC19       PC20      PC21         PC22
## CLL11.CEL 27.642539 -20.105829  3.584657 1.376243e-13
## CLL12.CEL  2.206418   5.190389 -2.154347 1.009271e-13
## CLL13.CEL  4.941798   5.728124 -3.498910 6.398181e-14
pcr <- cbind(samples=rownames(pcx),pd_sCLLex$Disease,pcx)#加上样本和分组信息
head(pcr)
##             samples pd_sCLLex$Disease        PC1        PC2        PC3
## CLL11.CEL CLL11.CEL          progres. -46.856885  14.463746  17.263794
## CLL12.CEL CLL12.CEL            stable -22.039089   8.439319 -31.504845
## CLL13.CEL CLL13.CEL          progres. -61.727121 -10.386196  28.513253
## CLL14.CEL CLL14.CEL          progres. -49.054707 -10.202203 -16.721321
## CLL15.CEL CLL15.CEL          progres.   1.141835  18.773784  28.301385
## CLL16.CEL CLL16.CEL          progres.  32.653663  51.063530  -6.182675
##                  PC4        PC5       PC6        PC7        PC8        PC9
## CLL11.CEL  -1.771140   6.923367 -9.006708  14.739488  -6.934635  13.581109
## CLL12.CEL -48.120623  35.350428 -2.473449  -5.602400  -9.688525   5.002439
## CLL13.CEL  12.077384 -26.940805 -3.192176  13.458808   1.909465   4.927676
## CLL14.CEL   7.546233   1.589274 14.297332  -1.719354 -28.400677 -21.958378
## CLL15.CEL  14.248706 -12.115247 19.570496   4.796330  -3.067718 -34.120146
## CLL16.CEL  34.931700 -32.103064  1.710500 -20.257911   2.909489  -4.537663
##                 PC10       PC11        PC12        PC13       PC14         PC15
## CLL11.CEL  -4.859836   4.268705  -1.1112834  -6.7781881  17.372020  -4.35242499
## CLL12.CEL   8.983478  -6.131755  -8.6810079   0.6323493 -25.155551  -0.07925103
## CLL13.CEL -17.793132 -23.149501 -17.6219714  -8.8274219 -19.330890 -23.03844446
## CLL14.CEL  15.760874  12.899864  -0.6006821 -15.6392494  -1.658030  17.14739249
## CLL15.CEL -24.816528   8.868848  23.4662184  14.1284253 -14.504122   6.88664102
## CLL16.CEL  30.083816  -3.387800 -25.7013128  18.6407371  -1.587664   1.86653584
##                 PC16       PC17       PC18      PC19       PC20      PC21
## CLL11.CEL  18.496281   0.178317  17.884427 27.642539 -20.105829  3.584657
## CLL12.CEL  22.019632 -13.905205  -7.864069  2.206418   5.190389 -2.154347
## CLL13.CEL  -2.220767   9.475638 -13.662748  4.941798   5.728124 -3.498910
## CLL14.CEL -15.622991  14.835333 -13.188203 19.668413   3.264009 -4.113575
## CLL15.CEL  13.827600 -14.098071   3.963714  3.284001   5.522894 -1.377382
## CLL16.CEL   7.880383  -6.792640   1.371190  2.861844  -6.285913 -3.505023
##                   PC22
## CLL11.CEL 1.376243e-13
## CLL12.CEL 1.009271e-13
## CLL13.CEL 6.398181e-14
## CLL14.CEL 1.254760e-13
## CLL15.CEL 1.155465e-13
## CLL16.CEL 6.083935e-14

画图

PC2~PC1,color ~ group

ggplot(pcr,aes(PC1,PC2))+geom_point(size=3,aes(color=pd_sCLLex$Disease))+
  geom_text(aes(label=samples),hjust=-0.1,vjust=-0.3,size=2.5)

方法二:R包ggfortify

缺点在于没办法标注

#BiocManager::install("ggfortify")
suppressMessages(library(ggfortify)) 
df <- as.data.frame(t(expr_sCLLex))
df$group=pd_sCLLex$group#df就是转了一下再加上group
autoplot(prcomp(df[,1:(ncol(df)-1)]),data=df,colour='group')#也是使用prcomp

方法三:椭圆PCA

library("FactoMineR")#画主成分分析图需要加载这两个包
library("factoextra") 
exprSet <- expr_sCLLex
group_list <- as.factor(pd_sCLLex$Disease)
df=as.data.frame(t(exprSet))
dat.pca <- PCA(df, graph = FALSE)#现在dat最后一列是group_list,需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
fviz_pca_ind(dat.pca,
             geom.ind = "point", # show points only (nbut not "text")
             col.ind = group_list, # color by groups
             # palette = c("#00AFBB", "#E7B800"),
             addEllipses = TRUE, # Concentration ellipses
             legend.title = "Groups")

## UpsetR集合交集可视化

数据准备

STATS_FUN <- function(median,expr_sCLLex,n){
  a <- apply(expr_sCLLex,1,median)
  b <- as.data.frame(cbind(rownames(expr_sCLLex),a))
  b <- arrange(b,desc(a))
  Topgene <- head(b$V1,n)
    return(Topgene)
}

e_median <- STATS_FUN(median,expr_sCLLex,50)
e_mad <- STATS_FUN(mad,expr_sCLLex,50)
e_sd <- STATS_FUN(sd,expr_sCLLex,50)
e_var <- STATS_FUN(var,expr_sCLLex,50)
e_mean <- STATS_FUN(mean,expr_sCLLex,50)
e_min <- STATS_FUN(min,expr_sCLLex,50)
e_max <- STATS_FUN(max,expr_sCLLex,50)

all <- unique(c(e_mean,e_mad,e_max,e_min,e_sd,e_var,e_median,fromLast=T))#取所有集合的交集

e_all <- data.frame(all ,
                    e_mean=ifelse(all %in% e_mean,1,0),
                    e_median=ifelse(all %in% e_median,1,0),
                    e_max=ifelse(all %in% e_max,1,0),
                    e_min=ifelse(all %in% e_min,1,0),
                    e_sd=ifelse(all %in% e_sd,1,0),
                    e_var=ifelse(all %in% e_var,1,0),
                    e_mad=ifelse(all %in% e_mad,1,0))##相当于在总体里面分别绘制每个集合

head(e_all)#总集+每个个体的坐标
##       all e_mean e_median e_max e_min e_sd e_var e_mad
## 1    ADAR      1        1     0     1    0     0     0
## 2 SEPTIN7      1        1     0     0    0     0     0
## 3   PCBP1      1        0     0     0    0     0     0
## 4   SYPL1      1        1     0     0    0     0     0
## 5    CCT2      1        1     0     0    0     0     0
## 6  DYNLL1      1        0     0     0    0     0     0

绘图

#install.packages("UpSetR")
suppressPackageStartupMessages(library(UpSetR))
suppressPackageStartupMessages(library(magrittr))

upset(e_all,nsets = 7,sets.bar.color = "#BD1111")#绘制韦恩图,查看数据的交集