1 引言

一般基础R语言开展的统计分析结果不是整洁的表现形式。Kassambara,也就是ggpubr的作者,创造了rstatix。该包基于tidy架构和理念,可以方便的计算并呈现规整的统计结果,配合ggpubr包可以华丽地制作publication ready plot。

可以用以下形式安装

if(!require(devtools)) install.packages("devtools")
devtools::install_github("kassambara/rstatix") #或者
install.packages("rstatix")
library(rstatix)  
library(ggpubr)

2 描述性分析

2.1 常见统计量

  • 可以通过改变type参数选择统计量的类型。
    • type = c(“full”, “common”, “robust”, “five_number”, “mean_sd”, “mean_se”, “mean_ci”, “median_iqr”, “median_mad”, “quantile”, “mean”, “median”, “min”, “max”)
ToothGrowth %>% 
  get_summary_stats(len, type = "common")
## # A tibble: 1 x 10
##   variable     n   min   max median   iqr  mean    sd    se    ci
##   <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 len         60   4.2  33.9   19.2  12.2  18.8  7.65 0.988  1.98
  • 也可以分组汇总
ToothGrowth %>% 
  group_by(dose) %>% 
  get_summary_stats(type = "full")
## # A tibble: 3 x 14
##    dose variable     n   min   max median    q1    q3   iqr   mad  mean    sd
##   <dbl> <chr>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   0.5 len         20   4.2  21.5   9.85  7.22  12.2  5.03  4.00  10.6  4.5 
## 2   1   len         20  13.6  27.3  19.2  16.2   23.4  7.12  5.78  19.7  4.42
## 3   2   len         20  18.5  33.9  26.0  23.5   27.8  4.3   3.71  26.1  3.77
## # ... with 2 more variables: se <dbl>, ci <dbl>

3 比较均数

3.1 正态分布检验Shapiro-Wilk’s method

3.1.1 总体分布

df <- ToothGrowth %>% 
  mutate(dose = as.factor(dose))

hist(df$len)

df %>% ggplot(aes(len)) + 
  geom_histogram(boundary = 0, binwidth = 5) +
  scale_x_continuous(breaks = seq(0, 35, 5)) +
  scale_y_continuous(breaks = seq(0, 15, 1)) +
  theme_classic2()

# {ggpubr}Density plot
ggdensity(ToothGrowth$len, fill = "lightgray")

# {ggpubr}QQ plot
ggqqplot(ToothGrowth$len)

# {rstatix}正态性检验
df %>% shapiro_test(len)
## # A tibble: 1 x 3
##   variable statistic     p
##   <chr>        <dbl> <dbl>
## 1 len          0.967 0.109
## {stats}正态性检验,结果一致
df %$% shapiro.test(len)
## 
##  Shapiro-Wilk normality test
## 
## data:  len
## W = 0.96743, p-value = 0.1091

3.1.2 组内分布

# Shapiro Wilk normality test for one variable

df %>% group_by(supp) %>% shapiro_test(len)
## # A tibble: 2 x 4
##   supp  variable statistic      p
##   <fct> <chr>        <dbl>  <dbl>
## 1 OJ    len          0.918 0.0236
## 2 VC    len          0.966 0.428

3.2 统计分析

3.2.1 单样本t检验(One-sample t-test)

df %>% ggplot(aes(x = 1L, y = len))+ geom_violin() + geom_jitter() 

df %>% rstatix::t_test(len ~ 1, mu = 10)
## # A tibble: 1 x 7
##   .y.   group1 group2         n statistic    df        p
## * <chr> <chr>  <chr>      <int>     <dbl> <dbl>    <dbl>
## 1 len   1      null model    60      8.92    59 1.53e-12

3.2.2 独立双样本t检验(Two-sample unparied test)

df %>% rstatix::t_test(len ~ supp)
## # A tibble: 1 x 8
##   .y.   group1 group2    n1    n2 statistic    df      p
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>  <dbl>
## 1 len   OJ     VC        30    30      1.92  55.3 0.0606
## BaseR

library(magrittr)
ToothGrowth %>% 
  as_tibble() %$% 
  t.test(len ~ supp) %>% 
  broom::tidy()
## # A tibble: 1 x 10
##   estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
##      <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl>
## 1     3.70      20.7      17.0      1.92  0.0606      55.3   -0.171      7.57
## # ... with 2 more variables: method <chr>, alternative <chr>

3.2.3 配对样本t检验(Two-sample paried test)

df %>% rstatix::t_test(len ~ supp, paired = TRUE)
## # A tibble: 1 x 8
##   .y.   group1 group2    n1    n2 statistic    df       p
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>
## 1 len   OJ     VC        30    30      3.30    29 0.00255

3.2.4 分组双样本(Two-sample grouped)

df %>%
  group_by(dose) %>%
  t_test(data =., len ~ supp) %>%
  adjust_pvalue(method = "bonferroni") %>%
  add_significance("p.adj")
## # A tibble: 3 x 11
##   dose  .y.   group1 group2    n1    n2 statistic    df       p   p.adj
##   <fct> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>   <dbl>
## 1 0.5   len   OJ     VC        10    10    3.17    15.0 0.00636 0.0191 
## 2 1     len   OJ     VC        10    10    4.03    15.4 0.00104 0.00312
## 3 2     len   OJ     VC        10    10   -0.0461  14.0 0.964   1      
## # ... with 1 more variable: p.adj.signif <chr>

3.2.5 两两比较(Pairwise comparisons)

# 交互比较
df %>% t_test(len ~ dose)
## # A tibble: 3 x 10
##   .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
## 1 len   0.5    1         20    20     -6.48  38.0 1.27e- 7 2.54e- 7 ****        
## 2 len   0.5    2         20    20    -11.8   36.9 4.40e-14 1.32e-13 ****        
## 3 len   1      2         20    20     -4.90  37.1 1.91e- 5 1.91e- 5 ****
# 参照组比较Comparison against reference group

df %>% t_test(len ~ dose, ref.group = "0.5")
## # A tibble: 2 x 10
##   .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
## 1 len   0.5    1         20    20     -6.48  38.0 1.27e- 7 1.27e- 7 ****        
## 2 len   0.5    2         20    20    -11.8   36.9 4.40e-14 8.80e-14 ****
# 分别与整体比较 Comparison against all

df %>% t_test(len ~ dose, ref.group = "all")
## # A tibble: 3 x 10
##   .y.   group1 group2    n1    n2 statistic    df         p   p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>     <dbl>   <dbl> <chr>       
## 1 len   all    0.5       60    20     5.82   56.4   2.90e-7 8.70e-7 ****        
## 2 len   all    1         60    20    -0.660  57.5   5.12e-1 5.12e-1 ns          
## 3 len   all    2         60    20    -5.61   66.5   4.25e-7 8.70e-7 ****

3.3 可视化呈现统计分析

3.3.1 基本实现

# 作图
p <- df %>%  ggboxplot(x = "supp", y = "len", fill = "supp", width = 0.5)
p

# 统计分析
stat.ttest <- df %>% 
  rstatix::t_test(len ~ supp)
stat.ttest
## # A tibble: 1 x 8
##   .y.   group1 group2    n1    n2 statistic    df      p
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>  <dbl>
## 1 len   OJ     VC        30    30      1.92  55.3 0.0606
# 添加统计结果
p +  ggpubr::stat_pvalue_manual(stat.ttest, 
                                # label = "p",
                                label = "T-test, p = {p}", 
                                y.position = 35,
                                tip.length = 0)

3.3.2 分组比较

# 作图
p <- df %>%  ggboxplot(x = "supp", y = "len", width = 0.2,
                       ylim = c(0,40),
                       fill = "supp", facet.by = "dose")
# 统计分析
stat.ttest <- df %>% group_by(dose) %>% 
  rstatix::t_test(len ~ supp)
stat.ttest
## # A tibble: 3 x 9
##   dose  .y.   group1 group2    n1    n2 statistic    df       p
## * <fct> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>
## 1 0.5   len   OJ     VC        10    10    3.17    15.0 0.00636
## 2 1     len   OJ     VC        10    10    4.03    15.4 0.00104
## 3 2     len   OJ     VC        10    10   -0.0461  14.0 0.964
# 添加统计结果
p +  ggpubr::stat_pvalue_manual(stat.ttest, 
                                # label = "p",
                                label = "T-test, p = {p}", 
                                y.position = 35,
                                tip.length = 0)

## 还可根据各分面设置LABEL位置
stat.ttest1 <- df %>% group_by(dose) %>% 
  rstatix::t_test(len ~ supp) %>% 
  add_xy_position()
stat.ttest1
## # A tibble: 3 x 13
##   dose  .y.   group1 group2    n1    n2 statistic    df       p y.position
##   <fct> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>      <dbl>
## 1 0.5   len   OJ     VC        10    10    3.17    15.0 0.00636       24.2
## 2 1     len   OJ     VC        10    10    4.03    15.4 0.00104       30.0
## 3 2     len   OJ     VC        10    10   -0.0461  14.0 0.964         36.6
## # ... with 3 more variables: groups <named list>, xmin <int>, xmax <int>
p + stat_pvalue_manual(stat.ttest1, tip.length = 0)

3.3.3 两两比较

# 作图
p <- df %>% 
  ggboxplot(x = "dose", y = "len", fill = "dose", 
            width = .4, ylim = c(0,45))
p

# 统计分析
stat.ttest <- df %>% 
  t_test(len ~ dose)
stat.ttest
## # A tibble: 3 x 10
##   .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
## 1 len   0.5    1         20    20     -6.48  38.0 1.27e- 7 2.54e- 7 ****        
## 2 len   0.5    2         20    20    -11.8   36.9 4.40e-14 1.32e-13 ****        
## 3 len   1      2         20    20     -4.90  37.1 1.91e- 5 1.91e- 5 ****
# 添加统计结果
p + stat_pvalue_manual(stat.ttest,
                       label = "p = {p}",
                       # label = "{p.adj.signif}",
                       y.position = c(35, 39, 43),
                       tip.length = 0)

还可以

com <- list(c("0.5", "1"), c("0.5", "2"), c("1", "2"))
df %>% ggboxplot(x = "dose", y = "len", fill = "dose", 
            width = .4, ylim = c(0,45))+
  stat_compare_means(label.y = 43, method = "anova")+
  stat_compare_means(comparisons = com, method =  "t.test")

3.3.4 与整体比较

# T-test
stat.test <- df %>% t_test(len ~ dose, ref.group = "all")
stat.test
## # A tibble: 3 x 10
##   .y.   group1 group2    n1    n2 statistic    df         p   p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>     <dbl>   <dbl> <chr>       
## 1 len   all    0.5       60    20     5.82   56.4   2.90e-7 8.70e-7 ****        
## 2 len   all    1         60    20    -0.660  57.5   5.12e-1 5.12e-1 ns          
## 3 len   all    2         60    20    -5.61   66.5   4.25e-7 8.70e-7 ****
# Box plot with horizontal mean line
ggboxplot(df, x = "dose", y = "len", fill = 'dose', width = .3) +
  stat_pvalue_manual(
    stat.test, label = "p.adj.signif", 
    y.position = 35,
    remove.bracket = TRUE
    ) +
  geom_hline(yintercept = mean(df$len), linetype = 2)

Reference:Alboukadel Kassambara (2020). rstatix: Pipe-Friendly Framework for Basic Statistical Tests. R package version 0.6.0.999. https://rpkgs.datanovia.com/rstatix/