1 引言

t检验是健康数据科学中十分常见的操作。如何对多组样本一次性批量开展t检验呢?本文介绍如何用R语言快速批量开展多组两独立样本t检验。

2 准备工作

2.1 加载R包

library(tidyverse)
library(rstatix)

2.2 导入数据

以经典数据集iris为例。

# Auto-print the iris data frame that serves as the running example.
iris

3 简单两独立样本t检验

任务01:对iris数据集setosa和virginica物种间的Sepal.Length开展检验分析。

3.1 Base R

3.1.1 正态性检验

# Shapiro-Wilk normality test on Sepal.Length for the setosa rows only.
shapiro.test(iris$Sepal.Length[iris$Species == "setosa"])
## 
##  Shapiro-Wilk normality test
## 
## data:  iris$Sepal.Length[iris$Species == "setosa"]
## W = 0.9777, p-value = 0.4595
# Same test for the virginica rows. In the recorded output both p-values are
# above 0.05, so normality is not rejected for either group.
shapiro.test(iris$Sepal.Length[iris$Species == "virginica"])
## 
##  Shapiro-Wilk normality test
## 
## data:  iris$Sepal.Length[iris$Species == "virginica"]
## W = 0.97118, p-value = 0.2583

3.1.2 方差齐性检验

# F test comparing the Sepal.Length variances of setosa (x) and virginica (y).
# The recorded p-value (6.366e-05) rejects equal variances.
var.test(iris$Sepal.Length[iris$Species == "setosa"],
         iris$Sepal.Length[iris$Species == "virginica"])
## 
##  F test to compare two variances
## 
## data:  iris$Sepal.Length[iris$Species == "setosa"] and iris$Sepal.Length[iris$Species == "virginica"]
## F = 0.30729, num df = 49, denom df = 49, p-value = 6.366e-05
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.1743776 0.5414962
## sample estimates:
## ratio of variances 
##          0.3072862

3.1.3 t-test

# Both groups look normal but their variances differ, so use Welch's t-test.
# No var.equal argument is passed; the recorded output header confirms that
# the Welch variant was run.
ttest0 <- t.test(iris$Sepal.Length[iris$Species == "setosa"],
       iris$Sepal.Length[iris$Species == "virginica"])
# Print the full htest report.
ttest0
## 
##  Welch Two Sample t-test
## 
## data:  iris$Sepal.Length[iris$Species == "setosa"] and iris$Sepal.Length[iris$Species == "virginica"]
## t = -15.386, df = 76.516, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.78676 -1.37724
## sample estimates:
## mean of x mean of y 
##     5.006     6.588
# Flatten the test result into a tidy table via broom.
ttest0 %>% 
  broom::tidy()

3.2 tidyverse R

3.2.1 正态性检验

# Shapiro-Wilk normality test of Sepal.Length for BOTH species compared in
# Task 01 (setosa and virginica). Grouping by Species makes shapiro_test()
# run once per group, mirroring the two base-R calls in section 3.1.1.
iris %>%
  filter(Species %in% c("setosa", "virginica")) %>%
  group_by(Species) %>%
  shapiro_test(Sepal.Length)

3.2.2 方差齐性检验

# F test for equal variances of Sepal.Length between setosa and virginica,
# using the formula interface on the two-species subset of iris.
var.test(
  Sepal.Length ~ Species,
  data = filter(iris, Species %in% c("setosa", "virginica"))
)
## 
##  F test to compare two variances
## 
## data:  Sepal.Length by Species
## F = 0.30729, num df = 49, denom df = 49, p-value = 6.366e-05
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.1743776 0.5414962
## sample estimates:
## ratio of variances 
##          0.3072862

3.2.3 t-test

# Two-sample t-test of Sepal.Length between setosa and virginica.
# filter() removes the versicolor rows but the Species factor still carries
# the unused "versicolor" level; droplevels() removes it so that exactly two
# groups remain. The grouping variable itself may stay a factor.
iris %>%
  filter(Species %in% c("setosa", "virginica")) %>%
  droplevels() %>%
  rstatix::t_test(Sepal.Length ~ Species)
# Pass `detailed = TRUE` to t_test() to request the more detailed output.

4 单变量批量两独立样本t检验

任务02:对iris数据集各两两物种间的Sepal.Length开展检验分析。

4.1 Base R

4.1.1 批量正态性检验

# One Shapiro-Wilk test of Sepal.Length per species; each htest result is
# flattened by broom::tidy() so the summary has one row per species.
iris %>%
  group_by(Species) %>%
  summarise(broom::tidy(shapiro.test(Sepal.Length)))

4.1.2 生成两独立样本比对组

# Build every unordered pair of species. Each column of `two_groups` holds one
# pair (two species names) and is named "<first>_<second>". The names are
# derived from the pairs themselves instead of being typed in by hand, so they
# stay correct for any set or order of species.
two_groups <- iris %>%
  distinct(Species) %>%
  pull(Species) %>%
  as.character() %>%
  combn(2) %>%
  as.data.frame() %>%
  set_names(map_chr(., paste, collapse = "_"))

4.1.3 批量两独立样本方差齐性检验

# Pairwise F tests for equal variances of Sepal.Length.
# For each column of `two_groups` (one species pair) keep only the rows of
# those two species, run var.test() and tidy the result into a single row.
# The column name of the pair is recorded in the `group` column.
two_groups %>%
  map_dfr(
    function(pair) {
      pair_rows <- filter(iris, Species %in% pair)
      summarise(
        pair_rows,
        var.test(Sepal.Length ~ Species) %>% broom::tidy()
      )
    },
    .id = "group"
  )

4.1.4 批量两独立样本t检验

# Pairwise Welch t-tests (var.equal = FALSE) of Sepal.Length.
# Each species pair from `two_groups` yields one tidy row; the rows are then
# stacked, with the pair label stored in the `group` column.
two_groups %>%
  map(function(pair) {
    iris %>%
      filter(Species %in% pair) %>%
      summarise(
        t.test(Sepal.Length ~ Species, var.equal = FALSE) %>%
          broom::tidy()
      )
  }) %>%
  bind_rows(.id = "group")

4.2 tidyverse R

4.2.1 批量两独立样本正态性检验

# Shapiro-Wilk test of Sepal.Length within each species via rstatix.
shapiro_test(group_by(iris, Species), Sepal.Length)

4.2.2 批量两独立样本方差齐性检验

# Build every unordered pair of species. Each column of `two_groups` holds one
# pair (two species names) and is named "<first>_<second>". The names are
# derived from the pairs themselves instead of being typed in by hand, so they
# stay correct for any set or order of species.
two_groups <- iris %>%
  distinct(Species) %>%
  pull(Species) %>%
  as.character() %>%
  combn(2) %>%
  as.data.frame() %>%
  set_names(map_chr(., paste, collapse = "_"))
# Variance-homogeneity test for every species pair in `two_groups`.
# Alternative noted by the author: bartlett.test(Sepal.Length ~ Species)
# in place of var.test().
two_groups %>%
  map_dfr(
    function(pair) {
      iris %>%
        filter(Species %in% pair) %>%
        summarise(
          var.test(Sepal.Length ~ Species) %>%
            broom::tidy()
        )
    },
    .id = "group"
  )

4.2.3 批量两独立样本t检验

# t-tests of Sepal.Length across the species in iris, run in a single
# rstatix call.
t_test(iris, Sepal.Length ~ Species)

5 多变量批量两独立样本t检验

任务03:对iris数据集各两两物种间的Sepal.Length、Sepal.Width、Petal.Length及Petal.Width开展检验分析。

5.1 tidyverse R

5.1.1 数据长宽转换

# Reshape iris from wide to long: columns 1 to 4 are stacked into a name
# column `items` and a value column `cm`; the remaining column is kept as-is.
iris_long <- pivot_longer(
  iris,
  cols = 1:4,
  names_to = "items",
  values_to = "cm"
)

5.1.2 多变量批量正态性检验

# Shapiro-Wilk test of `cm` for every combination of measured item and species.
shapiro_test(group_by(iris_long, items, Species), cm)

5.1.3 多变量批量两独立样本方差齐性检验

# Build every unordered pair of species. Each column of `two_groups` holds one
# pair (two species names) and is named "<first>_<second>". The names are
# derived from the pairs themselves instead of being typed in by hand, so they
# stay correct for any set or order of species.
two_groups <- iris %>%
  distinct(Species) %>%
  pull(Species) %>%
  as.character() %>%
  combn(2) %>%
  as.data.frame() %>%
  set_names(map_chr(., paste, collapse = "_"))

# For every species pair, run one F test for equal variances per measured
# item: restrict the long data to the pair, group by `items`, and tidy each
# var.test() result into a row. The pair label goes into the `group` column.
two_groups %>%
  map(function(pair) {
    pair_rows <- filter(iris_long, Species %in% pair)
    pair_rows %>%
      group_by(items) %>%
      summarise(
        var.test(cm ~ Species) %>%
          broom::tidy()
      )
  }) %>%
  bind_rows(.id = "group")

5.1.4 多变量批量两独立样本t检验

# t-tests of `cm` between species, run separately for each measured item.
t_test(group_by(iris_long, items), cm ~ Species)

参考文献

1 Kassambara A. Rstatix: Pipe-friendly framework for basic statistical tests. 2021 https://CRAN.R-project.org/package=rstatix.

2 可我的家里有cy. 用Rmarkdown 写论文——解决参考文献与交叉引用. https://sspai.com/post/53998.