ggplot绘制箱线图,封装为函数实现批量绘图

载入数据

load(file = "F:/Bioinfor_project/Breast/AS_research/AS/result/hubgene.Rdata")
head(data)
##                ID group    CCL14     HBA1    CCL16    TUBB3 PAM50 Os_time
## 1 TCGA.A1.A0SK.01  TNBC 3.372704 0.000000 0.000000 5.936486 Basal     967
## 2 TCGA.A1.A0SP.01  TNBC 4.970021 3.394930 0.000000 6.001432 Basal     584
## 3 TCGA.A2.A04U.01  TNBC 4.766927 0.000000 0.000000 5.043662 Basal    2654
## 4 TCGA.A2.A0CM.01  TNBC 4.192953 3.738738 2.842951 5.127705 Basal     754
## 5 TCGA.A2.A0D0.01  TNBC 0.000000 0.000000 0.000000 5.653455 Basal    2048
## 6 TCGA.A2.A0D2.01  TNBC 5.338654 0.000000 4.069236 6.043691 Basal    1027
##   OS_event RFS_time RFS_event age       ER       PR gender     HER2
## 1        1       NA        NA  54 Negative Negative FEMALE Negative
## 2        0       NA        NA  40 Negative Negative FEMALE Negative
## 3        0       NA        NA  47 Negative Negative FEMALE Negative
## 4        1       NA        NA  40 Negative Negative FEMALE Negative
## 5        0     2048         0  60 Negative Negative FEMALE Negative
## 6        0     1027         0  45 Negative Negative FEMALE Negative
##   Margin_status Node M_stage N_stage T_stage Pathologic stage
## 1      Positive    0      M0      N0      T2        Stage_IIA
## 2      Negative    0      M0      N0      T2        Stage_IIA
## 3      Negative    0      M0      N0      T2        Stage_IIA
## 4      Negative    0      M0      N0      T2        Stage_IIA
## 5      Negative    0      M0      N0      T2        Stage_IIA
## 6      Negative    0      M0      N0      T2        Stage_IIA
require(cowplot)
## Loading required package: cowplot
## Loading required package: ggplot2
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## √ tibble  2.1.3     √ purrr   0.3.2
## √ tidyr   0.8.3     √ dplyr   0.8.3
## √ readr   1.3.1     √ stringr 1.4.0
## √ tibble  2.1.3     √ forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.6.1
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()   masks stats::filter()
## x cowplot::ggsave() masks ggplot2::ggsave()
## x dplyr::lag()      masks stats::lag()
require(ggplot2)
require(ggsci)
## Loading required package: ggsci
mydata<-data %>% 
  ## 基因表达数据gather,gather的范围应调整
  gather(key="gene",value="Expression",CCL14:TUBB3) %>% 
  ##
  dplyr::select(ID,gene,Expression,everything()) 
head(mydata)  
## # A tibble: 6 x 20
##   ID    gene  Expression group PAM50 Os_time OS_event RFS_time RFS_event
##   <chr> <chr>      <dbl> <chr> <fct>   <int>    <int>    <int>     <int>
## 1 TCGA~ CCL14       3.37 TNBC  Basal     967        1       NA        NA
## 2 TCGA~ CCL14       4.97 TNBC  Basal     584        0       NA        NA
## 3 TCGA~ CCL14       4.77 TNBC  Basal    2654        0       NA        NA
## 4 TCGA~ CCL14       4.19 TNBC  Basal     754        1       NA        NA
## 5 TCGA~ CCL14       0    TNBC  Basal    2048        0     2048         0
## 6 TCGA~ CCL14       5.34 TNBC  Basal    1027        0     1027         0
## # ... with 11 more variables: age <int>, ER <fct>, PR <fct>, gender <fct>,
## #   HER2 <fct>, Margin_status <fct>, Node <int>, M_stage <fct>,
## #   N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>

包装成绘图函数

输入感兴趣的基因,即绘制出箱线图 封装ggstatsplot

#if(F){
## 参数1:genename为基因名
## 参数2:data为清洁数据框,列为观测,行为sample
## 参数3:group为分组
## function()中data=data表示data默认为data,如果在该位置出现其它数据则为其它
head(data)
## # A tibble: 6 x 22
##   ID    group CCL14  HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
##   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct>   <int>    <int>    <int>
## 1 TCGA~ TNBC   3.37  0     0     5.94 Basal     967        1       NA
## 2 TCGA~ TNBC   4.97  3.39  0     6.00 Basal     584        0       NA
## 3 TCGA~ TNBC   4.77  0     0     5.04 Basal    2654        0       NA
## 4 TCGA~ TNBC   4.19  3.74  2.84  5.13 Basal     754        1       NA
## 5 TCGA~ TNBC   0     0     0     5.65 Basal    2048        0     2048
## 6 TCGA~ TNBC   5.34  0     4.07  6.04 Basal    1027        0     1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## #   PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## #   M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
colnames(data)
##  [1] "ID"               "group"            "CCL14"           
##  [4] "HBA1"             "CCL16"            "TUBB3"           
##  [7] "PAM50"            "Os_time"          "OS_event"        
## [10] "RFS_time"         "RFS_event"        "age"             
## [13] "ER"               "PR"               "gender"          
## [16] "HER2"             "Margin_status"    "Node"            
## [19] "M_stage"          "N_stage"          "T_stage"         
## [22] "Pathologic stage"
mydata<-data

### ggstatsplot封装为函数gene_plot
gene_staplot<-function(gene,data=mydata){
  require(ggstatsplot)
  ggbetweenstats(data = data, 
                 x = group, 
                 y = gene,
                 ylab = paste0("Expression of ",gene))
}
## 绘制感兴趣的基因的boxplot
gene_staplot("CCL14")
## Loading required package: ggstatsplot
## Warning: package 'ggstatsplot' was built under R version 3.6.1
## Registered S3 methods overwritten by 'broom.mixed':
##   method         from 
##   augment.lme    broom
##   augment.merMod broom
##   glance.lme     broom
##   glance.merMod  broom
##   glance.stanreg broom
##   tidy.brmsfit   broom
##   tidy.gamlss    broom
##   tidy.lme       broom
##   tidy.merMod    broom
##   tidy.rjags     broom
##   tidy.stanfit   broom
##   tidy.stanreg   broom
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Note: Shapiro-Wilk Normality Test for Expression of CCL14 : p-value = < 0.001
## 
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
## 

#ggsave("ggstatsplot.pdf")
## 批量绘图
require(tidyverse)
require(gridExtra)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## 基因名的范围
gene<-colnames(data)[3:6]
p2<-lapply(gene, gene_staplot)
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度

## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Note: Shapiro-Wilk Normality Test for Expression of CCL14 : p-value = < 0.001
## 
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
## 
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度

## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Note: Shapiro-Wilk Normality Test for Expression of HBA1 : p-value = < 0.001
## 
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
## 
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度

## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Note: Shapiro-Wilk Normality Test for Expression of CCL16 : p-value = < 0.001
## 
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
## 
## Note: Shapiro-Wilk Normality Test for Expression of TUBB3 : p-value = < 0.001
## 
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
## 
p2<-do.call(grid.arrange, c(p2, ncol=2))

p2
## TableGrob (2 x 2) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
#ggsave("F:/Bioinfor_project/Breast/AS_research/AS/result/four_gene_statplot.pdf",plot = p2,width =12,height = 10)

ggplot2绘制箱线图

head(mydata)
## # A tibble: 6 x 22
##   ID    group CCL14  HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
##   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct>   <int>    <int>    <int>
## 1 TCGA~ TNBC   3.37  0     0     5.94 Basal     967        1       NA
## 2 TCGA~ TNBC   4.97  3.39  0     6.00 Basal     584        0       NA
## 3 TCGA~ TNBC   4.77  0     0     5.04 Basal    2654        0       NA
## 4 TCGA~ TNBC   4.19  3.74  2.84  5.13 Basal     754        1       NA
## 5 TCGA~ TNBC   0     0     0     5.65 Basal    2048        0     2048
## 6 TCGA~ TNBC   5.34  0     4.07  6.04 Basal    1027        0     1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## #   PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## #   M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
gene_plot<-function(gene,data=mydata){
 y<-as.numeric(as.matrix(mydata[,gene]))##
ggplot(data=data, aes(x=group, y=y,fill=group)) + 
            geom_boxplot(alpha=0.7) +
            scale_y_continuous(name = "Expression")+
            scale_x_discrete(name = "group") +
            ggtitle(sprintf("Boxplot of %s ",gene)) +
            theme_bw() +
            theme(plot.title = element_text(size = 14, face =  "bold"),
            text = element_text(size = 12),
            axis.title = element_text(face="bold"),
            axis.text.x=element_text(size = 11))+
            scale_fill_lancet()
}

gene_plot("CCL14")

## 批量绘图
require(tidyverse)
require(gridExtra)
## 基因名的范围
gene<-colnames(data)[3:6]
p3<-lapply(gene, gene_plot)
marrangeGrob(p3,ncol = 2,nrow = 2)

ggpubr-添加pvalue等标签

require(ggpubr)
## Loading required package: ggpubr
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
## 
##     get_legend
head(data)
## # A tibble: 6 x 22
##   ID    group CCL14  HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
##   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct>   <int>    <int>    <int>
## 1 TCGA~ TNBC   3.37  0     0     5.94 Basal     967        1       NA
## 2 TCGA~ TNBC   4.97  3.39  0     6.00 Basal     584        0       NA
## 3 TCGA~ TNBC   4.77  0     0     5.04 Basal    2654        0       NA
## 4 TCGA~ TNBC   4.19  3.74  2.84  5.13 Basal     754        1       NA
## 5 TCGA~ TNBC   0     0     0     5.65 Basal    2048        0     2048
## 6 TCGA~ TNBC   5.34  0     4.07  6.04 Basal    1027        0     1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## #   PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## #   M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
mydata<-data %>% 
  gather(key = "gene",value = "Expression",3:6) %>% 
  select(ID,group,gene,Expression,everything())

## 为了展示清楚数据格式的变化,让代码间断一下
head(mydata)
## # A tibble: 6 x 20
##   ID    group gene  Expression PAM50 Os_time OS_event RFS_time RFS_event
##   <chr> <chr> <chr>      <dbl> <fct>   <int>    <int>    <int>     <int>
## 1 TCGA~ TNBC  CCL14       3.37 Basal     967        1       NA        NA
## 2 TCGA~ TNBC  CCL14       4.97 Basal     584        0       NA        NA
## 3 TCGA~ TNBC  CCL14       4.77 Basal    2654        0       NA        NA
## 4 TCGA~ TNBC  CCL14       4.19 Basal     754        1       NA        NA
## 5 TCGA~ TNBC  CCL14       0    Basal    2048        0     2048         0
## 6 TCGA~ TNBC  CCL14       5.34 Basal    1027        0     1027         0
## # ... with 11 more variables: age <int>, ER <fct>, PR <fct>, gender <fct>,
## #   HER2 <fct>, Margin_status <fct>, Node <int>, M_stage <fct>,
## #   N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
##  多个基因绘制在一起
p <- ggboxplot(mydata, x = "gene", y = "Expression",
          color = "group", # 边框
          palette = "lancet",#颜色
          add = "jitter")# 抖动
p+stat_compare_means(aes(group = group), label = "p.format")## p.format添加p

## 分面绘制
## x则调整为group
p <- ggboxplot(mydata, x = "group", y = "Expression",
          color = "group", # 边框
          palette = "lancet",#颜色
          add = "jitter",#抖动
          facet.by = "gene") 
p+stat_compare_means(aes(group = group), label = "p.format")## p.format添加p

封装为函数命名为group_box

head(mydata)
## # A tibble: 6 x 20
##   ID    group gene  Expression PAM50 Os_time OS_event RFS_time RFS_event
##   <chr> <chr> <chr>      <dbl> <fct>   <int>    <int>    <int>     <int>
## 1 TCGA~ TNBC  CCL14       3.37 Basal     967        1       NA        NA
## 2 TCGA~ TNBC  CCL14       4.97 Basal     584        0       NA        NA
## 3 TCGA~ TNBC  CCL14       4.77 Basal    2654        0       NA        NA
## 4 TCGA~ TNBC  CCL14       4.19 Basal     754        1       NA        NA
## 5 TCGA~ TNBC  CCL14       0    Basal    2048        0     2048         0
## 6 TCGA~ TNBC  CCL14       5.34 Basal    1027        0     1027         0
## # ... with 11 more variables: age <int>, ER <fct>, PR <fct>, gender <fct>,
## #   HER2 <fct>, Margin_status <fct>, Node <int>, M_stage <fct>,
## #   N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
group_box<-function(group=group,data=mydata){
        p <- ggboxplot(data, x = "gene", y = "Expression",
          color = group, 
          palette = "nejm",
          add = "jitter")
p + stat_compare_means(aes(group = group))
}

## 输入感兴趣的分组,更换自己的数据mydata
group_box(group="PAM50",data = mydata)
## Warning: Removed 132 rows containing missing values (geom_point).

封装函数gene_box

head(data)
## # A tibble: 6 x 22
##   ID    group CCL14  HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
##   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct>   <int>    <int>    <int>
## 1 TCGA~ TNBC   3.37  0     0     5.94 Basal     967        1       NA
## 2 TCGA~ TNBC   4.97  3.39  0     6.00 Basal     584        0       NA
## 3 TCGA~ TNBC   4.77  0     0     5.04 Basal    2654        0       NA
## 4 TCGA~ TNBC   4.19  3.74  2.84  5.13 Basal     754        1       NA
## 5 TCGA~ TNBC   0     0     0     5.65 Basal    2048        0     2048
## 6 TCGA~ TNBC   5.34  0     4.07  6.04 Basal    1027        0     1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## #   PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## #   M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
usedata<-data
## 封装函数
gene_box<-function(gene="CCL14",group="group",data=usedata){
p <- ggboxplot(data, x = group, y = gene,
          ylab = sprintf("Expression of %s",gene),
          xlab = group,
          color = group, 
          palette = "nejm",
          add = "jitter")
p + stat_compare_means(aes(group = group),label = "p.format")
}
## 输入自己感兴趣的基因
gene_box(gene="CCL14")

## 输入自己感兴趣的基因+分组
gene_box(gene="CCL16",group="PAM50")
## Warning: Removed 33 rows containing missing values (geom_point).

批量绘制

require(gridExtra)
head(data)
## # A tibble: 6 x 22
##   ID    group CCL14  HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
##   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct>   <int>    <int>    <int>
## 1 TCGA~ TNBC   3.37  0     0     5.94 Basal     967        1       NA
## 2 TCGA~ TNBC   4.97  3.39  0     6.00 Basal     584        0       NA
## 3 TCGA~ TNBC   4.77  0     0     5.04 Basal    2654        0       NA
## 4 TCGA~ TNBC   4.19  3.74  2.84  5.13 Basal     754        1       NA
## 5 TCGA~ TNBC   0     0     0     5.65 Basal    2048        0     2048
## 6 TCGA~ TNBC   5.34  0     4.07  6.04 Basal    1027        0     1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## #   PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## #   M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
## 需要批量绘制的基因名
name<-colnames(data)[3:6]
## 批量绘图
p<-lapply(name,gene_box,group = "T_stage")
## 组图
do.call(grid.arrange,c(p,ncol=2))

函数再优化,调节box,颜色设置等

head(data)
## # A tibble: 6 x 22
##   ID    group CCL14  HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
##   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct>   <int>    <int>    <int>
## 1 TCGA~ TNBC   3.37  0     0     5.94 Basal     967        1       NA
## 2 TCGA~ TNBC   4.97  3.39  0     6.00 Basal     584        0       NA
## 3 TCGA~ TNBC   4.77  0     0     5.04 Basal    2654        0       NA
## 4 TCGA~ TNBC   4.19  3.74  2.84  5.13 Basal     754        1       NA
## 5 TCGA~ TNBC   0     0     0     5.65 Basal    2048        0     2048
## 6 TCGA~ TNBC   5.34  0     4.07  6.04 Basal    1027        0     1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## #   PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## #   M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
usedata<-data
## 封装函数
gene_box<-function(gene="CCL14",group="group",fill=T,p.signif=T,col="nejm",data=usedata){
  if (fill==T){
     if(p.signif==T){
p <- ggboxplot(data, x = group, y = gene,
          ylab = sprintf("Expression of %s",gene),
          xlab = group,
          fill = group, 
          palette = col,
          add = "jitter")
p + stat_compare_means(aes(group = group),label="p.signif")}else{
     p <- ggboxplot(data, x = group, y = gene,
          ylab = sprintf("Expression of %s",gene),
          xlab = group,
          fill = group, 
          palette = col,
          add = "jitter")
   p + stat_compare_means(aes(group = group),label= "p.format")} 
   }
  else {
    if(p.signif==T){
      p <- ggboxplot(data, x = group, y = gene,
          ylab = sprintf("Expression of %s",gene),
          xlab = group,
          color = group, 
          palette = col,
          add = "jitter")
      p + stat_compare_means(aes(group = group),label="p.signif")}else{p <- ggboxplot(data, x = group, y = gene,
          ylab = sprintf("Expression of %s",gene),
          xlab = group,
          color = group, 
          palette = col,
          add = "jitter")
      p + stat_compare_means(aes(group = group),label= "p.format")} 
      }
}  

## 输入自己感兴趣的基因
gene_box(gene="CCL14")

## 输入自己感兴趣的基因+分组
gene_box(gene="CCL16",group="PAM50",fill=T,p.signif = F,col = "jama")