load(file = "F:/Bioinfor_project/Breast/AS_research/AS/result/hubgene.Rdata")
head(data)
## ID group CCL14 HBA1 CCL16 TUBB3 PAM50 Os_time
## 1 TCGA.A1.A0SK.01 TNBC 3.372704 0.000000 0.000000 5.936486 Basal 967
## 2 TCGA.A1.A0SP.01 TNBC 4.970021 3.394930 0.000000 6.001432 Basal 584
## 3 TCGA.A2.A04U.01 TNBC 4.766927 0.000000 0.000000 5.043662 Basal 2654
## 4 TCGA.A2.A0CM.01 TNBC 4.192953 3.738738 2.842951 5.127705 Basal 754
## 5 TCGA.A2.A0D0.01 TNBC 0.000000 0.000000 0.000000 5.653455 Basal 2048
## 6 TCGA.A2.A0D2.01 TNBC 5.338654 0.000000 4.069236 6.043691 Basal 1027
## OS_event RFS_time RFS_event age ER PR gender HER2
## 1 1 NA NA 54 Negative Negative FEMALE Negative
## 2 0 NA NA 40 Negative Negative FEMALE Negative
## 3 0 NA NA 47 Negative Negative FEMALE Negative
## 4 1 NA NA 40 Negative Negative FEMALE Negative
## 5 0 2048 0 60 Negative Negative FEMALE Negative
## 6 0 1027 0 45 Negative Negative FEMALE Negative
## Margin_status Node M_stage N_stage T_stage Pathologic stage
## 1 Positive 0 M0 N0 T2 Stage_IIA
## 2 Negative 0 M0 N0 T2 Stage_IIA
## 3 Negative 0 M0 N0 T2 Stage_IIA
## 4 Negative 0 M0 N0 T2 Stage_IIA
## 5 Negative 0 M0 N0 T2 Stage_IIA
## 6 Negative 0 M0 N0 T2 Stage_IIA
require(cowplot)
## Loading required package: cowplot
## Loading required package: ggplot2
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## √ tibble 2.1.3 √ purrr 0.3.2
## √ tidyr 0.8.3 √ dplyr 0.8.3
## √ readr 1.3.1 √ stringr 1.4.0
## √ tibble 2.1.3 √ forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.6.1
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x cowplot::ggsave() masks ggplot2::ggsave()
## x dplyr::lag() masks stats::lag()
require(ggplot2)
require(ggsci)
## Loading required package: ggsci
mydata<-data %>%
## 基因表达数据gather,gather的范围应调整
gather(key="gene",value="Expression",CCL14:TUBB3) %>%
##
dplyr::select(ID,gene,Expression,everything())
head(mydata)
## # A tibble: 6 x 20
## ID gene Expression group PAM50 Os_time OS_event RFS_time RFS_event
## <chr> <chr> <dbl> <chr> <fct> <int> <int> <int> <int>
## 1 TCGA~ CCL14 3.37 TNBC Basal 967 1 NA NA
## 2 TCGA~ CCL14 4.97 TNBC Basal 584 0 NA NA
## 3 TCGA~ CCL14 4.77 TNBC Basal 2654 0 NA NA
## 4 TCGA~ CCL14 4.19 TNBC Basal 754 1 NA NA
## 5 TCGA~ CCL14 0 TNBC Basal 2048 0 2048 0
## 6 TCGA~ CCL14 5.34 TNBC Basal 1027 0 1027 0
## # ... with 11 more variables: age <int>, ER <fct>, PR <fct>, gender <fct>,
## # HER2 <fct>, Margin_status <fct>, Node <int>, M_stage <fct>,
## # N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
输入感兴趣的基因,即绘制出箱线图 封装ggstatsplot
#if(F){
## 参数1:genename为基因名
## 参数2:data为清洁数据框,列为观测,行为sample
## 参数3:group为分组
## function()中data=data表示data默认为data,如果在该位置出现其它数据则为其它
head(data)
## # A tibble: 6 x 22
## ID group CCL14 HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct> <int> <int> <int>
## 1 TCGA~ TNBC 3.37 0 0 5.94 Basal 967 1 NA
## 2 TCGA~ TNBC 4.97 3.39 0 6.00 Basal 584 0 NA
## 3 TCGA~ TNBC 4.77 0 0 5.04 Basal 2654 0 NA
## 4 TCGA~ TNBC 4.19 3.74 2.84 5.13 Basal 754 1 NA
## 5 TCGA~ TNBC 0 0 0 5.65 Basal 2048 0 2048
## 6 TCGA~ TNBC 5.34 0 4.07 6.04 Basal 1027 0 1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## # PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## # M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
colnames(data)
## [1] "ID" "group" "CCL14"
## [4] "HBA1" "CCL16" "TUBB3"
## [7] "PAM50" "Os_time" "OS_event"
## [10] "RFS_time" "RFS_event" "age"
## [13] "ER" "PR" "gender"
## [16] "HER2" "Margin_status" "Node"
## [19] "M_stage" "N_stage" "T_stage"
## [22] "Pathologic stage"
mydata<-data
### ggstatsplot封装为函数gene_plot
gene_staplot<-function(gene,data=mydata){
require(ggstatsplot)
ggbetweenstats(data = data,
x = group,
y = gene,
ylab = paste0("Expression of ",gene))
}
## 绘制感兴趣的基因的boxplot
gene_staplot("CCL14")
## Loading required package: ggstatsplot
## Warning: package 'ggstatsplot' was built under R version 3.6.1
## Registered S3 methods overwritten by 'broom.mixed':
## method from
## augment.lme broom
## augment.merMod broom
## glance.lme broom
## glance.merMod broom
## glance.stanreg broom
## tidy.brmsfit broom
## tidy.gamlss broom
## tidy.lme broom
## tidy.merMod broom
## tidy.rjags broom
## tidy.stanfit broom
## tidy.stanreg broom
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Note: Shapiro-Wilk Normality Test for Expression of CCL14 : p-value = < 0.001
##
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
##
#ggsave("ggstatsplot.pdf")
## 批量绘图
require(tidyverse)
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## 基因名的范围
gene<-colnames(data)[3:6]
p2<-lapply(gene, gene_staplot)
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Note: Shapiro-Wilk Normality Test for Expression of CCL14 : p-value = < 0.001
##
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
##
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Note: Shapiro-Wilk Normality Test for Expression of HBA1 : p-value = < 0.001
##
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
##
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Warning in stats::pt(q = tvalue, df = dfvalue, ncp = x): 'pnt{final}'可能达
## 不到全精度
## Note: Shapiro-Wilk Normality Test for Expression of CCL16 : p-value = < 0.001
##
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
##
## Note: Shapiro-Wilk Normality Test for Expression of TUBB3 : p-value = < 0.001
##
## Note: Bartlett's test for homogeneity of variances for factor group : p-value = < 0.001
##
p2<-do.call(grid.arrange, c(p2, ncol=2))
p2
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
#ggsave("F:/Bioinfor_project/Breast/AS_research/AS/result/four_gene_statplot.pdf",plot = p2,width =12,height = 10)
head(mydata)
## # A tibble: 6 x 22
## ID group CCL14 HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct> <int> <int> <int>
## 1 TCGA~ TNBC 3.37 0 0 5.94 Basal 967 1 NA
## 2 TCGA~ TNBC 4.97 3.39 0 6.00 Basal 584 0 NA
## 3 TCGA~ TNBC 4.77 0 0 5.04 Basal 2654 0 NA
## 4 TCGA~ TNBC 4.19 3.74 2.84 5.13 Basal 754 1 NA
## 5 TCGA~ TNBC 0 0 0 5.65 Basal 2048 0 2048
## 6 TCGA~ TNBC 5.34 0 4.07 6.04 Basal 1027 0 1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## # PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## # M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
gene_plot<-function(gene,data=mydata){
y<-as.numeric(as.matrix(mydata[,gene]))##
ggplot(data=data, aes(x=group, y=y,fill=group)) +
geom_boxplot(alpha=0.7) +
scale_y_continuous(name = "Expression")+
scale_x_discrete(name = "group") +
ggtitle(sprintf("Boxplot of %s ",gene)) +
theme_bw() +
theme(plot.title = element_text(size = 14, face = "bold"),
text = element_text(size = 12),
axis.title = element_text(face="bold"),
axis.text.x=element_text(size = 11))+
scale_fill_lancet()
}
gene_plot("CCL14")
## 批量绘图
require(tidyverse)
require(gridExtra)
## 基因名的范围
gene<-colnames(data)[3:6]
p3<-lapply(gene, gene_plot)
marrangeGrob(p3,ncol = 2,nrow = 2)
require(ggpubr)
## Loading required package: ggpubr
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
##
## get_legend
head(data)
## # A tibble: 6 x 22
## ID group CCL14 HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct> <int> <int> <int>
## 1 TCGA~ TNBC 3.37 0 0 5.94 Basal 967 1 NA
## 2 TCGA~ TNBC 4.97 3.39 0 6.00 Basal 584 0 NA
## 3 TCGA~ TNBC 4.77 0 0 5.04 Basal 2654 0 NA
## 4 TCGA~ TNBC 4.19 3.74 2.84 5.13 Basal 754 1 NA
## 5 TCGA~ TNBC 0 0 0 5.65 Basal 2048 0 2048
## 6 TCGA~ TNBC 5.34 0 4.07 6.04 Basal 1027 0 1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## # PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## # M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
mydata<-data %>%
gather(key = "gene",value = "Expression",3:6) %>%
select(ID,group,gene,Expression,everything())
## 为了展示清楚数据格式的变化,让代码间断一下
head(mydata)
## # A tibble: 6 x 20
## ID group gene Expression PAM50 Os_time OS_event RFS_time RFS_event
## <chr> <chr> <chr> <dbl> <fct> <int> <int> <int> <int>
## 1 TCGA~ TNBC CCL14 3.37 Basal 967 1 NA NA
## 2 TCGA~ TNBC CCL14 4.97 Basal 584 0 NA NA
## 3 TCGA~ TNBC CCL14 4.77 Basal 2654 0 NA NA
## 4 TCGA~ TNBC CCL14 4.19 Basal 754 1 NA NA
## 5 TCGA~ TNBC CCL14 0 Basal 2048 0 2048 0
## 6 TCGA~ TNBC CCL14 5.34 Basal 1027 0 1027 0
## # ... with 11 more variables: age <int>, ER <fct>, PR <fct>, gender <fct>,
## # HER2 <fct>, Margin_status <fct>, Node <int>, M_stage <fct>,
## # N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
## 多个基因绘制在一起
p <- ggboxplot(mydata, x = "gene", y = "Expression",
color = "group", # 边框
palette = "lancet",#颜色
add = "jitter")# 抖动
p+stat_compare_means(aes(group = group), label = "p.format")## p.format添加p
## 分面绘制
## x则调整为group
p <- ggboxplot(mydata, x = "group", y = "Expression",
color = "group", # 边框
palette = "lancet",#颜色
add = "jitter",#抖动
facet.by = "gene")
p+stat_compare_means(aes(group = group), label = "p.format")## p.format添加p
head(mydata)
## # A tibble: 6 x 20
## ID group gene Expression PAM50 Os_time OS_event RFS_time RFS_event
## <chr> <chr> <chr> <dbl> <fct> <int> <int> <int> <int>
## 1 TCGA~ TNBC CCL14 3.37 Basal 967 1 NA NA
## 2 TCGA~ TNBC CCL14 4.97 Basal 584 0 NA NA
## 3 TCGA~ TNBC CCL14 4.77 Basal 2654 0 NA NA
## 4 TCGA~ TNBC CCL14 4.19 Basal 754 1 NA NA
## 5 TCGA~ TNBC CCL14 0 Basal 2048 0 2048 0
## 6 TCGA~ TNBC CCL14 5.34 Basal 1027 0 1027 0
## # ... with 11 more variables: age <int>, ER <fct>, PR <fct>, gender <fct>,
## # HER2 <fct>, Margin_status <fct>, Node <int>, M_stage <fct>,
## # N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
group_box<-function(group=group,data=mydata){
p <- ggboxplot(data, x = "gene", y = "Expression",
color = group,
palette = "nejm",
add = "jitter")
p + stat_compare_means(aes(group = group))
}
## 输入感兴趣的分组,更换自己的数据mydata
group_box(group="PAM50",data = mydata)
## Warning: Removed 132 rows containing missing values (geom_point).
head(data)
## # A tibble: 6 x 22
## ID group CCL14 HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct> <int> <int> <int>
## 1 TCGA~ TNBC 3.37 0 0 5.94 Basal 967 1 NA
## 2 TCGA~ TNBC 4.97 3.39 0 6.00 Basal 584 0 NA
## 3 TCGA~ TNBC 4.77 0 0 5.04 Basal 2654 0 NA
## 4 TCGA~ TNBC 4.19 3.74 2.84 5.13 Basal 754 1 NA
## 5 TCGA~ TNBC 0 0 0 5.65 Basal 2048 0 2048
## 6 TCGA~ TNBC 5.34 0 4.07 6.04 Basal 1027 0 1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## # PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## # M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
usedata<-data
## 封装函数
gene_box<-function(gene="CCL14",group="group",data=usedata){
p <- ggboxplot(data, x = group, y = gene,
ylab = sprintf("Expression of %s",gene),
xlab = group,
color = group,
palette = "nejm",
add = "jitter")
p + stat_compare_means(aes(group = group),label = "p.format")
}
## 输入自己感兴趣的基因
gene_box(gene="CCL14")
## 输入自己感兴趣的基因+分组
gene_box(gene="CCL16",group="PAM50")
## Warning: Removed 33 rows containing missing values (geom_point).
require(gridExtra)
head(data)
## # A tibble: 6 x 22
## ID group CCL14 HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct> <int> <int> <int>
## 1 TCGA~ TNBC 3.37 0 0 5.94 Basal 967 1 NA
## 2 TCGA~ TNBC 4.97 3.39 0 6.00 Basal 584 0 NA
## 3 TCGA~ TNBC 4.77 0 0 5.04 Basal 2654 0 NA
## 4 TCGA~ TNBC 4.19 3.74 2.84 5.13 Basal 754 1 NA
## 5 TCGA~ TNBC 0 0 0 5.65 Basal 2048 0 2048
## 6 TCGA~ TNBC 5.34 0 4.07 6.04 Basal 1027 0 1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## # PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## # M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
## 需要批量绘制的基因名
name<-colnames(data)[3:6]
## 批量绘图
p<-lapply(name,gene_box,group = "T_stage")
## 组图
do.call(grid.arrange,c(p,ncol=2))
head(data)
## # A tibble: 6 x 22
## ID group CCL14 HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct> <int> <int> <int>
## 1 TCGA~ TNBC 3.37 0 0 5.94 Basal 967 1 NA
## 2 TCGA~ TNBC 4.97 3.39 0 6.00 Basal 584 0 NA
## 3 TCGA~ TNBC 4.77 0 0 5.04 Basal 2654 0 NA
## 4 TCGA~ TNBC 4.19 3.74 2.84 5.13 Basal 754 1 NA
## 5 TCGA~ TNBC 0 0 0 5.65 Basal 2048 0 2048
## 6 TCGA~ TNBC 5.34 0 4.07 6.04 Basal 1027 0 1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## # PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## # M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
usedata<-data
## 封装函数
gene_box<-function(gene="CCL14",group="group",fill=T,p.signif=T,col="nejm",data=usedata){
if (fill==T){
if(p.signif==T){
p <- ggboxplot(data, x = group, y = gene,
ylab = sprintf("Expression of %s",gene),
xlab = group,
fill = group,
palette = col,
add = "jitter")
p + stat_compare_means(aes(group = group),label="p.signif")}else{
p <- ggboxplot(data, x = group, y = gene,
ylab = sprintf("Expression of %s",gene),
xlab = group,
fill = group,
palette = col,
add = "jitter")
p + stat_compare_means(aes(group = group),label= "p.format")}
}
else {
if(p.signif==T){
p <- ggboxplot(data, x = group, y = gene,
ylab = sprintf("Expression of %s",gene),
xlab = group,
color = group,
palette = col,
add = "jitter")
p + stat_compare_means(aes(group = group),label="p.signif")}else{p <- ggboxplot(data, x = group, y = gene,
ylab = sprintf("Expression of %s",gene),
xlab = group,
color = group,
palette = col,
add = "jitter")
p + stat_compare_means(aes(group = group),label= "p.format")}
}
}
## 输入自己感兴趣的基因
gene_box(gene="CCL14")
## 输入自己感兴趣的基因+分组
gene_box(gene="CCL16",group="PAM50",fill=T,p.signif = F,col = "jama")