Performing multiple t-tests on different variables between the same two groups

Format RMarkdown

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
## opts_chunk is namespace-qualified so this chunk does not depend on knitr
## having been attached with library(knitr) beforehand.
knitr::opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE,
    tidy = FALSE, fig.width = 8, fig.height = 7)
## Wide console output; scipen = 10 makes R prefer fixed over scientific notation
## when printing small p-values.
options(width = 116, scipen = 10)

Background

It is often necessary to perform multiple t-tests (or Wilcoxon tests) on different variables (age, height, weight, duration of disease) between the same groups (treatment arm vs. placebo arm). However, to my knowledge, there is no function to help with this.

Update 2012-08-19

Ura R-jp Wiki, a “critique of the literature” site for R code (http://blog.goo.ne.jp/r-de-r), kindly provided much simpler solutions.

library(survival)
data(kidney)

## Multiple t-tests for time, age, frail between genders.
## lapply() hands each selected column to the anonymous function as x, which
## tests it against kidney$sex; var.equal = TRUE requests the pooled-variance
## (classical two-sample) t-test. The result is a list with one test per column.
lapply(kidney[,c("time", "age", "frail")], function(x) t.test(x ~ kidney$sex, var.equal = TRUE))
$time

    Two Sample t-test

data:  x by kidney$sex 
t = -1.706, df = 74, p-value = 0.09221
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -124.551    9.651 
sample estimates:
mean in group 1 mean in group 2 
           59.3           116.8 


$age

    Two Sample t-test

data:  x by kidney$sex 
t = -0.0693, df = 74, p-value = 0.9449
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -7.970  7.434 
sample estimates:
mean in group 1 mean in group 2 
          43.50           43.77 


$frail

    Two Sample t-test

data:  x by kidney$sex 
t = 0.9503, df = 74, p-value = 0.345
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -0.1872  0.5287 
sample estimates:
mean in group 1 mean in group 2 
          1.310           1.139 


## Multiple Wilcoxon rank sum tests for time, age, frail between genders.
## Same pattern as the t-tests: each selected column is passed as x and
## compared between the two levels of kidney$sex; one result per column.
lapply(kidney[,c("time", "age", "frail")], function(x) wilcox.test(x ~ kidney$sex))
$time

    Wilcoxon rank sum test with continuity correction

data:  x by kidney$sex 
W = 308.5, p-value = 0.003062
alternative hypothesis: true location shift is not equal to 0 


$age

    Wilcoxon rank sum test with continuity correction

data:  x by kidney$sex 
W = 563.5, p-value = 0.9717
alternative hypothesis: true location shift is not equal to 0 


$frail

    Wilcoxon rank sum test with continuity correction

data:  x by kidney$sex 
W = 618, p-value = 0.4965
alternative hypothesis: true location shift is not equal to 0 


## Multiple ANOVA for time, age, frail between diseases.
## For each selected column a linear model x ~ kidney$disease is fitted and
## its ANOVA table is returned; the result is a list with one table per column.
lapply(kidney[,c("time", "age", "frail")], function(x) anova(lm(x ~ kidney$disease)))
$time
Analysis of Variance Table

Response: x
               Df  Sum Sq Mean Sq F value Pr(>F)
kidney$disease  3   53745   17915    1.05   0.38
Residuals      72 1231643   17106               

$age
Analysis of Variance Table

Response: x
               Df Sum Sq Mean Sq F value     Pr(>F)    
kidney$disease  3   6152    2051    14.6 0.00000017 ***
Residuals      72  10144     141                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

$frail
Analysis of Variance Table

Response: x
               Df Sum Sq Mean Sq F value Pr(>F)
kidney$disease  3    1.8   0.603    1.28   0.29
Residuals      72   33.8   0.470               


## Multiple Kruskal-Wallis tests for time, age, frail between diseases.
## Each selected column is passed as x and tested across the levels of
## kidney$disease; the result is a list with one test per column.
lapply(kidney[,c("time", "age", "frail")], function(x) kruskal.test(x ~ kidney$disease))
$time

    Kruskal-Wallis rank sum test

data:  x by kidney$disease 
Kruskal-Wallis chi-squared = 1.186, df = 3, p-value = 0.7563


$age

    Kruskal-Wallis rank sum test

data:  x by kidney$disease 
Kruskal-Wallis chi-squared = 28.45, df = 3, p-value = 0.000002917


$frail

    Kruskal-Wallis rank sum test

data:  x by kidney$disease 
Kruskal-Wallis chi-squared = 1.234, df = 3, p-value = 0.7449

Define a higher-order function that controls testing functions (t.test, etc.)

This is more complex, but the output shows which comparison was performed, e.g., “time by disease”.

## Apply one formula-based function to several outcome variables, each against
## the same grouping variable.
##
## Arguments:
##   fun        function that accepts a formula plus a `data` argument
##              (default: t.test)
##   df         data frame supplied to `fun` as `data`
##   vars       character vector of outcome variable names
##   group.var  name of the grouping variable (right-hand side of the formula)
##   ...        further arguments forwarded unchanged to `fun`
##
## Value: a list holding one result of `fun` per element of `vars`; when `vars`
## is an unnamed character vector the elements are named after the variables.
multi.tests <- function(fun = t.test, df, vars, group.var, ...) {
    ## Build "<outcome> ~ <group.var>" for a single outcome and hand it to `fun`.
    run.one <- function(outcome) {
        formula <- as.formula(paste(outcome, "~", group.var))
        fun(data = df, formula, ...)
    }

    results <- lapply(vars, run.one)

    ## Label the results by variable name unless lapply() already carried names over.
    if (is.character(vars) && is.null(names(results))) {
        names(results) <- vars
    }
    results
}

Load dataset kidney

library(survival)
data(kidney)
head(kidney)
  id time status age sex disease frail
1  1    8      1  28   1   Other   2.3
2  1   16      1  28   1   Other   2.3
3  2   23      1  48   2      GN   1.9
4  2   13      0  48   2      GN   1.9
5  3   22      1  32   1   Other   1.2
6  3   28      1  32   1   Other   1.2
str(kidney)
'data.frame':   76 obs. of  7 variables:
 $ id     : num  1 1 2 2 3 3 4 4 5 5 ...
 $ time   : num  8 16 23 13 22 28 447 318 30 12 ...
 $ status : num  1 1 1 0 1 1 1 1 1 1 ...
 $ age    : num  28 28 48 48 32 32 31 32 10 10 ...
 $ sex    : num  1 1 2 2 1 1 2 2 1 1 ...
 $ disease: Factor w/ 4 levels "Other","GN","AN",..: 1 1 2 2 1 1 1 1 1 1 ...
 $ frail  : num  2.3 2.3 1.9 1.9 1.2 1.2 0.5 0.5 1.5 1.5 ...

## Variable name extractor
## Prints the names of DATA on a single line, each wrapped in double quotes and
## separated by commas, so the line can be pasted into a c(...) call.
ListVariableNames <- function(DATA) {
    joined.names <- paste(names(DATA), collapse = '","')
    cat(paste0('"', joined.names, '"\n'))
}
ListVariableNames(kidney)
"id","time","status","age","sex","disease","frail"

Multiple t-tests (add var.equal = TRUE for the traditional equal-variance t-test)

## t-tests of time, age and frail between the two sex groups.
## var.equal = TRUE is not an argument of multi.tests itself; it travels
## through ... to t.test.
res.multi.t.tests <-
    multi.tests(fun = t.test,
                df = kidney,
                vars = c("time","age","frail"),
                group.var = "sex",
                var.equal = TRUE)
## Print the list of test results, one element per variable
res.multi.t.tests
$time

    Two Sample t-test

data:  time by sex 
t = -1.706, df = 74, p-value = 0.09221
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -124.551    9.651 
sample estimates:
mean in group 1 mean in group 2 
           59.3           116.8 


$age

    Two Sample t-test

data:  age by sex 
t = -0.0693, df = 74, p-value = 0.9449
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -7.970  7.434 
sample estimates:
mean in group 1 mean in group 2 
          43.50           43.77 


$frail

    Two Sample t-test

data:  frail by sex 
t = 0.9503, df = 74, p-value = 0.345
alternative hypothesis: true difference in means is not equal to 0 
95 percent confidence interval:
 -0.1872  0.5287 
sample estimates:
mean in group 1 mean in group 2 
          1.310           1.139 


## p-values can be extracted from the result object
## vapply() with numeric(1) guarantees exactly one numeric p-value per test
## (sapply() would silently change its return type on unexpected input);
## the list names become the row names of the data frame.
data.frame(p.value = vapply(res.multi.t.tests, getElement, numeric(1), name = "p.value"))
      p.value
time  0.09221
age   0.94494
frail 0.34504

Multiple Wilcoxon rank sum tests

## Wilcoxon rank sum tests of time, age and frail between the two sex groups;
## no extra arguments are forwarded, so wilcox.test runs with its defaults.
res.multi.wilcox.tests <-
    multi.tests(fun = wilcox.test,
                df = kidney,
                vars = c("time","age","frail"),
                group.var = "sex")
## Print the list of test results, one element per variable
res.multi.wilcox.tests
$time

    Wilcoxon rank sum test with continuity correction

data:  time by sex 
W = 308.5, p-value = 0.003062
alternative hypothesis: true location shift is not equal to 0 


$age

    Wilcoxon rank sum test with continuity correction

data:  age by sex 
W = 563.5, p-value = 0.9717
alternative hypothesis: true location shift is not equal to 0 


$frail

    Wilcoxon rank sum test with continuity correction

data:  frail by sex 
W = 618, p-value = 0.4965
alternative hypothesis: true location shift is not equal to 0 

Multiple ANOVA: Make sure the grouping variable is a factor (categorical variable in R)

## One-way ANOVA of time, age and frail across the disease groups
## (disease is a factor in kidney). var.equal = TRUE is forwarded through ...
## to oneway.test so that equal group variances are assumed.
res.multi.anova <-
    multi.tests(fun = oneway.test,
                df = kidney,
                vars = c("time","age","frail"),
                group.var = "disease",
                var.equal = TRUE)
## Print the list of test results, one element per variable
res.multi.anova
$time

    One-way analysis of means

data:  time and disease 
F = 1.047, num df = 3, denom df = 72, p-value = 0.377


$age

    One-way analysis of means

data:  age and disease 
F = 14.55, num df = 3, denom df = 72, p-value = 0.0000001666


$frail

    One-way analysis of means

data:  frail and disease 
F = 1.285, num df = 3, denom df = 72, p-value = 0.2862

Multiple Kruskal-Wallis tests

## Kruskal-Wallis rank sum tests of time, age and frail across the disease groups.
res.multi.kruskal.tests <-
    multi.tests(fun = kruskal.test,
                df = kidney,
                vars = c("time","age","frail"),
                group.var = "disease")
## Print the list of test results, one element per variable
res.multi.kruskal.tests
$time

    Kruskal-Wallis rank sum test

data:  time by disease 
Kruskal-Wallis chi-squared = 1.186, df = 3, p-value = 0.7563


$age

    Kruskal-Wallis rank sum test

data:  age by disease 
Kruskal-Wallis chi-squared = 28.45, df = 3, p-value = 0.000002917


$frail

    Kruskal-Wallis rank sum test

data:  frail by disease 
Kruskal-Wallis chi-squared = 1.234, df = 3, p-value = 0.7449

Multiple plotting

Any function that accepts a continuous variable ~ categorical variable formula works.

## Passing plot as the function draws one plot per outcome variable against disease.
## The returned list is stored in junk, presumably only to keep it from being
## printed; it is not used afterwards in the code shown here.
junk <- multi.tests(fun = plot,
                    df = kidney,
                    vars = c("time","age","frail"),
                    group.var = "disease")

plot of chunk unnamed-chunk-9 plot of chunk unnamed-chunk-9 plot of chunk unnamed-chunk-9