table1 package

https://cran.r-project.org/web/packages/table1/vignettes/table1-examples.html

https://www.bioinfo-scrounger.com/archives/855/

1.特征表

例1

library(boot)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.7     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
melanoma2 <- melanoma
melanoma2$status <- 
  factor(melanoma2$status,
         levels = c(2,1,3),
         labels = c("Alive",#reference 注意顺序
                    "Melanoma death",
                    "Non-melanoma death"))

利用table1 执行操作:

library(table1)
## 
## 载入程辑包:'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~factor(sex)+age+factor(ulcer)+thickness|status,data = melanoma2)
Alive
(N=134)
Melanoma death
(N=57)
Non-melanoma death
(N=14)
Overall
(N=205)
factor(sex)
0 91 (67.9%) 28 (49.1%) 7 (50.0%) 126 (61.5%)
1 43 (32.1%) 29 (50.9%) 7 (50.0%) 79 (38.5%)
age
Mean (SD) 50.0 (15.9) 55.1 (17.9) 65.3 (10.9) 52.5 (16.7)
Median [Min, Max] 52.0 [4.00, 84.0] 56.0 [14.0, 95.0] 65.0 [49.0, 86.0] 54.0 [4.00, 95.0]
factor(ulcer)
0 92 (68.7%) 16 (28.1%) 7 (50.0%) 115 (56.1%)
1 42 (31.3%) 41 (71.9%) 7 (50.0%) 90 (43.9%)
thickness
Mean (SD) 2.24 (2.33) 4.31 (3.57) 3.72 (3.63) 2.92 (2.96)
Median [Min, Max] 1.36 [0.100, 12.9] 3.54 [0.320, 17.4] 2.26 [0.160, 12.6] 1.94 [0.100, 17.4]

我们会发现,有些分类变量的生成不是很完美,当然可以在excel里面手动修改。其实原因在于没有很好的变量标签和类别,为连续性变量增加单位;把总计一列放到最左侧,。如何实现呢?

melanoma2$sex <- 
  factor(melanoma2$sex, levels=c(1,0),
         labels=c("Male", 
                  "Female"))
 
melanoma2$ulcer <- 
  factor(melanoma2$ulcer, levels=c(0,1),
         labels=c("Absent", 
                  "Present"))

label(melanoma2$sex)       <- "Sex"
label(melanoma2$age)       <- "Age"
label(melanoma2$ulcer)     <- "Ulceration"
label(melanoma2$thickness) <- "Thicknessᵃ"

units(melanoma2$age)       <- "years"
units(melanoma2$thickness) <- "mm"

caption  <- "Basic stats"
footnote <- "ᵃ Also known as Breslow thickness"

table1(~ sex + age + ulcer + thickness | status, data=melanoma2,
    overall=c(left="Total"), caption=caption, footnote=footnote)
Basic stats
Total
(N=205)
Alive
(N=134)
Melanoma death
(N=57)
Non-melanoma death
(N=14)

Also known as Breslow thickness

Sex
Male 79 (38.5%) 43 (32.1%) 29 (50.9%) 7 (50.0%)
Female 126 (61.5%) 91 (67.9%) 28 (49.1%) 7 (50.0%)
Age (years)
Mean (SD) 52.5 (16.7) 50.0 (15.9) 55.1 (17.9) 65.3 (10.9)
Median [Min, Max] 54.0 [4.00, 95.0] 52.0 [4.00, 84.0] 56.0 [14.0, 95.0] 65.0 [49.0, 86.0]
Ulceration
Absent 115 (56.1%) 92 (68.7%) 16 (28.1%) 7 (50.0%)
Present 90 (43.9%) 42 (31.3%) 41 (71.9%) 7 (50.0%)
Thickness (mm)
Mean (SD) 2.92 (2.96) 2.24 (2.33) 4.31 (3.57) 3.72 (3.63)
Median [Min, Max] 1.94 [0.100, 17.4] 1.36 [0.100, 12.9] 3.54 [0.320, 17.4] 2.26 [0.160, 12.6]

2.新的问题 两个“死亡”层(黑色素瘤和非黑色素瘤)应该是 在一个共同的标题下分组;连续变量年龄 和厚度仅显示平均值 (SD)(带±),而不显示中位数 [最小值、最大值] ; 使用两个有效数字而不是三个。我们需要进一步自定义输出 首先,我们使用列表以不同的方式设置标签:

labels <- list(
    variables=list(sex="Sex",
                   age="Age (years)",
                   ulcer="Ulceration",
                   thickness="Thicknessᵃ (mm)"),
    groups=list("", "", "Death"))

# 将"death"这个词从标签中移除, 因为上面groups定义了
levels(melanoma2$status) <- c("Alive", "Melanoma", "Non-melanoma")

接下来,我们定义分层或者说是列,按照我们希望的顺序显示

strata <- c(list(Total=melanoma2), split(melanoma2, melanoma2$status))

最后,我们可以使用自定义渲染器自定义内容。一个 自定义渲染可以是将向量作为第一个参数的函数 并返回一个(命名的)字符向量。还有一种更简单的方法 使用缩写代码语法而不是 一个渲染函数,但它允许较少控制舍入(见下文)。 例如,在这里,我们为连续和分类变量如下

#连续变量
my.render.cont <- function(x) {
    with(stats.apply.rounding(stats.default(x), digits=2), c("",
        "Mean (SD)"=sprintf("%s (&plusmn; %s)", MEAN, SD)))
}

#分类变量
my.render.cat <- function(x) {
    c("", sapply(stats.default(x), function(y) with(y,
        sprintf("%d (%0.0f %%)", FREQ, PCT))))
}

展示结果

table1(strata, labels, groupspan=c(1, 1, 2), caption=caption, footnote=footnote,
       render.continuous=my.render.cont, render.categorical=my.render.cat)
Basic stats
Death
Total
(N=205)
Alive
(N=134)
Melanoma
(N=57)
Non-melanoma
(N=14)

Also known as Breslow thickness

Sex
Male 79 (39 %) 43 (32 %) 29 (51 %) 7 (50 %)
Female 126 (61 %) 91 (68 %) 28 (49 %) 7 (50 %)
Age (years)
Mean (SD) 52 (± 17) 50 (± 16) 55 (± 18) 65 (± 11)
Ulceration
Absent 115 (56 %) 92 (69 %) 16 (28 %) 7 (50 %)
Present 90 (44 %) 42 (31 %) 41 (72 %) 7 (50 %)
Thickness (mm)
Mean (SD) 2.9 (± 3.0) 2.2 (± 2.3) 4.3 (± 3.6) 3.7 (± 3.6)

2.p和统计量.

pvalue <- function(x, ...) {
  # 构建数据y的向量,和分组变量g 
  y <- unlist(x)
  g <- factor(rep(1:length(x), times=sapply(x, length)))
  if (is.numeric(y)) {
    # 数值型变量,采用标准的t检验(两组比较)
    p <- t.test(y ~ g)$p.value
  } else {
    # 分类变量,此案用卡方检验
    p <- chisq.test(table(y, g))$p.value
  }
  # Format the p-value, using an HTML entity for the less-than sign.
  # The initial empty string places the output on the line below the variable label.
  c("", sub("<", "&lt;", format.pval(p, digits=3, eps=0.001)))
}

#统计量
stats <- function(x, ...) {
  y <- unlist(x)
  g <- factor(rep(1:length(x), times=sapply(x, length)))
  if (is.numeric(y)) {
    s <- t.test(y ~ g)$statistic
  } else {
    s <- chisq.test(table(y, g))$statistic
  }
  c("", sprintf("%.4f", s))
}

用两组的进行分析

library(MatchIt)

data(lalonde)

lalonde$treat    <- factor(lalonde$treat, levels=c(0, 1), labels=c("Control", "Treatment"))
lalonde$married  <- as.logical(lalonde$married == 1)
lalonde$nodegree <- as.logical(lalonde$nodegree == 1)
lalonde$race     <- factor(lalonde$race, levels=c("white", "black", "hispan"),
                           labels=c("White", "Black", "Hispanic"))

label(lalonde$race)     <- "Race"
label(lalonde$married)  <- "Married"
label(lalonde$nodegree) <- "No high school diploma"
label(lalonde$age)      <- "Age"
label(lalonde$re74)     <- "1974 Income"
label(lalonde$re75)     <- "1975 Income"
label(lalonde$re78)     <- "1978 Income"
units(lalonde$age)      <- "years"

结果显示

table1(~ age + race + married + nodegree + re74 + re75 + re78 | treat,
       data=lalonde, overall=F, extra.col=list(`P-value`=pvalue, `statistic`=stats))
Control
(N=429)
Treatment
(N=185)
P-value statistic
Age (years)
Mean (SD) 28.0 (10.8) 25.8 (7.16) 0.00291 2.9911
Median [Min, Max] 25.0 [16.0, 55.0] 25.0 [17.0, 48.0]
Race
White 281 (65.5%) 18 (9.7%) <0.001 224.0708
Black 87 (20.3%) 156 (84.3%)
Hispanic 61 (14.2%) 11 (5.9%)
Married
Yes 220 (51.3%) 35 (18.9%) <0.001 54.4276
No 209 (48.7%) 150 (81.1%)
No high school diploma
Yes 256 (59.7%) 131 (70.8%) 0.0113 6.4107
No 173 (40.3%) 54 (29.2%)
1974 Income
Mean (SD) 5620 (6790) 2100 (4890) <0.001 7.2456
Median [Min, Max] 2550 [0, 25900] 0 [0, 35000]
1975 Income
Mean (SD) 2470 (3290) 1530 (3220) 0.00115 3.2776
Median [Min, Max] 1090 [0, 18300] 0 [0, 25100]
1978 Income
Mean (SD) 6980 (7290) 6350 (7870) 0.349 0.9377
Median [Min, Max] 4980 [0, 25600] 4230 [0, 60300]
x <- table1(~ age + race + married + nodegree + re74 + re75 + re78 | treat,
       data=lalonde, overall=F, extra.col=list(`P-value`=pvalue, `statistic`=stats))
as.data.frame(x)
##                                     Control         Treatment   P-value
## 1                                   (N=429)           (N=185)          
## 2             Age (years)                                              
## 3               Mean (SD)       28.0 (10.8)       25.8 (7.16)   0.00291
## 4       Median [Min, Max] 25.0 [16.0, 55.0] 25.0 [17.0, 48.0]          
## 5                    Race                                              
## 6                   White       281 (65.5%)         18 (9.7%) &lt;0.001
## 7                   Black        87 (20.3%)       156 (84.3%)          
## 8                Hispanic        61 (14.2%)         11 (5.9%)          
## 9                 Married                                              
## 10                    Yes       220 (51.3%)        35 (18.9%) &lt;0.001
## 11                     No       209 (48.7%)       150 (81.1%)          
## 12 No high school diploma                                              
## 13                    Yes       256 (59.7%)       131 (70.8%)    0.0113
## 14                     No       173 (40.3%)        54 (29.2%)          
## 15            1974 Income                                              
## 16              Mean (SD)       5620 (6790)       2100 (4890) &lt;0.001
## 17      Median [Min, Max]   2550 [0, 25900]      0 [0, 35000]          
## 18            1975 Income                                              
## 19              Mean (SD)       2470 (3290)       1530 (3220)   0.00115
## 20      Median [Min, Max]   1090 [0, 18300]      0 [0, 25100]          
## 21            1978 Income                                              
## 22              Mean (SD)       6980 (7290)       6350 (7870)     0.349
## 23      Median [Min, Max]   4980 [0, 25600]   4230 [0, 60300]          
##    statistic
## 1           
## 2           
## 3     2.9911
## 4           
## 5           
## 6   224.0708
## 7           
## 8           
## 9           
## 10   54.4276
## 11          
## 12          
## 13    6.4107
## 14          
## 15          
## 16    7.2456
## 17          
## 18          
## 19    3.2776
## 20          
## 21          
## 22    0.9377
## 23
knitr::kable(as.data.frame(x),booktabs=TRUE)
Control Treatment P-value statistic
(N=429) (N=185)
Age (years)
Mean (SD) 28.0 (10.8) 25.8 (7.16) 0.00291 2.9911
Median [Min, Max] 25.0 [16.0, 55.0] 25.0 [17.0, 48.0]
Race
White 281 (65.5%) 18 (9.7%) <0.001 224.0708
Black 87 (20.3%) 156 (84.3%)
Hispanic 61 (14.2%) 11 (5.9%)
Married
Yes 220 (51.3%) 35 (18.9%) <0.001 54.4276
No 209 (48.7%) 150 (81.1%)
No high school diploma
Yes 256 (59.7%) 131 (70.8%) 0.0113 6.4107
No 173 (40.3%) 54 (29.2%)
1974 Income
Mean (SD) 5620 (6790) 2100 (4890) <0.001 7.2456
Median [Min, Max] 2550 [0, 25900] 0 [0, 35000]
1975 Income
Mean (SD) 2470 (3290) 1530 (3220) 0.00115 3.2776
Median [Min, Max] 1090 [0, 18300] 0 [0, 25100]
1978 Income
Mean (SD) 6980 (7290) 6350 (7870) 0.349 0.9377
Median [Min, Max] 4980 [0, 25600] 4230 [0, 60300]
t1kable(x)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 2 > 1' in coercion to 'logical(1)'
  Control Treatment P-value statistic
(N=429) (N=185)
Age (years)
Mean (SD) 28.0 (10.8) 25.8 (7.16) 0.00291 2.9911
Median [Min, Max] 25.0 [16.0, 55.0] 25.0 [17.0, 48.0]
Race
White 281 (65.5%) 18 (9.7%) &lt;0.001 224.0708
Black 87 (20.3%) 156 (84.3%)
Hispanic 61 (14.2%) 11 (5.9%)
Married
Yes 220 (51.3%) 35 (18.9%) &lt;0.001 54.4276
No 209 (48.7%) 150 (81.1%)
No high school diploma
Yes 256 (59.7%) 131 (70.8%) 0.0113 6.4107
No 173 (40.3%) 54 (29.2%)
1974 Income
Mean (SD) 5620 (6790) 2100 (4890) &lt;0.001 7.2456
Median [Min, Max] 2550 [0, 25900] 0 [0, 35000]
1975 Income
Mean (SD) 2470 (3290) 1530 (3220) 0.00115 3.2776
Median [Min, Max] 1090 [0, 18300] 0 [0, 25100]
1978 Income
Mean (SD) 6980 (7290) 6350 (7870) 0.349 0.9377
Median [Min, Max] 4980 [0, 25600] 4230 [0, 60300]