# tbl_summary(
# data,
# by = NULL,
# label = NULL, # 默认标签
# statistic = list(all_continuous() ~ "{median} ({p25}, {p75})", all_categorical() ~"{n} ({p}%)"), # 统计汇总信息显示
# digits = NULL, # 小数位数
# type = NULL, # c("continuous", "continuous2", "categorical", "dichotomous")
# value = NULL,
# missing = c("ifany", "no", "always"),
# missing_text = "Unknown",
# missing_stat = "{N_miss}",
# sort = all_categorical(FALSE) ~ "alphanumeric",# c("alphanumeric", "frequency")
# percent = c("column", "row", "cell"),
# include = everything()
# )gtsummary包
{gtsummary}包
{gtsummary} 包 (Package index) 提供了一种优雅而灵活的方法,可以使用R编程语言创建可发布的分析和汇总表。{gtsummary} 包使用具有高度可定制功能的合理默认值汇总数据集、回归模型等。
1.{tbl_summary()} 绘制 Table 1
tbl_summary()
"dichotomous"(二分)分类变量显示在单行上,而不是每个变量水平显示一行。编码为TRUE/FALSE、0/1或yes/no的变量被假定为二分的,并且显示TRUE、1和yes行。否则,必须在value参数中指定要显示的值,例如value = list(varname ~ "level to show")
在R中轻松总结数据帧或数据块。非常适合呈现描述性统计数据,比较群体人口统计数据(例如为医学期刊创建Table 1)等。自动检测数据集中的连续变量、分类变量和二分变量,计算适当的描述性统计量,还包括每个变量中的缺失量。
tbl_summary()函数计算R中连续、分类和二分变量的描述性统计量,并将结果显示在一个漂亮的、可定制的汇总表中,以供发布(例如,Table 1或 demographic tables.)
1.1 Set up 设置
# install.packages("gtsummary")
library(gtsummary)1.2 Example data set 示例数据集
该数据集包含来自200名接受两种类型化疗(药物A或药物B)之一的患者的数据。结果是肿瘤缓解和死亡。
数据框中的每个变量都已通过 {labelled} 包分配了属性标签(即 attr(trial$trt, "label") == "Chemotherapy Treatment")。默认情况下,这些标签显示在{gtsummary}输出表中。在没有标签的数据框上使用{gtsummary}只会打印变量名称来代替变量标签;还有一个选项可以在以后添加标签。
head(trial)# A tibble: 6 × 8
trt age marker stage grade response death ttdeath
<chr> <dbl> <dbl> <fct> <fct> <int> <int> <dbl>
1 Drug A 23 0.16 T1 II 0 0 24
2 Drug B 9 1.11 T2 I 1 0 24
3 Drug A 31 0.277 T1 II 0 0 24
4 Drug A NA 2.07 T3 III 1 1 17.6
5 Drug A 51 2.77 T4 III 1 1 16.4
6 Drug B 39 0.613 T4 I 0 1 15.6
为了简洁起见,在本教程中,我们将使用试验数据集中的一个变量子集。
trial2 <- trial |> select(trt, age, grade)1.3 Basic Usage 基础用法
从试验数据集创建一个汇总统计表。tbl_summary()函数至少可以将数据框作为唯一输入,并返回数据框中每列的描述性统计信息。
trial2 |> tbl_summary()| Characteristic | N = 2001 |
|---|---|
| Chemotherapy Treatment | |
| Drug A | 98 (49%) |
| Drug B | 102 (51%) |
| Age | 47 (38, 57) |
| Unknown | 11 |
| Grade | |
| I | 68 (34%) |
| II | 68 (34%) |
| III | 64 (32%) |
| 1 n (%); Median (Q1, Q3) | |
注意这个基本用法的合理默认值;每个默认值都可以自定义。
自动检测变量类型,以便计算适当的描述性统计量。
数据集中的标签属性将自动打印。
缺失值在表中列为“Unknown”。
变量水平(variable levels)缩进显示,并添加脚注。
对于本研究数据,应按治疗组划分汇总统计量,可使用by= 参数进行划分。若要比较两个或多个组,请在函数调用中包含add_p(),它会检测变量类型并使用适当的统计测试。
trial2 |>
tbl_summary(by = trt) |>
add_p()| Characteristic | Drug A N = 981 |
Drug B N = 1021 |
p-value2 |
|---|---|---|---|
| Age | 46 (37, 60) | 48 (39, 56) | 0.7 |
| Unknown | 7 | 4 | |
| Grade | 0.9 | ||
| I | 35 (36%) | 33 (32%) | |
| II | 32 (33%) | 36 (35%) | |
| III | 31 (32%) | 33 (32%) | |
| 1 Median (Q1, Q3); n (%) | |||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | |||
1.4 Customize Output 自定义输出
有四种主要方法可以自定义汇总表的输出
- 使用 tbl_summary() 函数参数
- 使用 add_*() 函数向汇总表中添加其他数据/信息
- 使用 {gtsummary} 函数修改汇总表外观
- 使用 {gt} 包函数修改表外观
1.4.1 修改 tbl_summary() 函数参数
tbl_summary()函数包含许多用于修改外观的输入选项。
修改tbl_summary()参数的示例
trial2 |>
tbl_summary(
by = trt, # 分组
statistic = list( # 统计量显示格式
all_continuous() ~ "{mean} ({sd})", # 连续型变量
all_categorical() ~ "{n} / {N} ({p}%)" # 分类型变量
),
digits = all_continuous() ~ 2, # 小数位数
label = grade ~ "Tumor Grade", # 变量重命名
missing_text = "(Missing)"# 缺失值
)| Characteristic | Drug A N = 981 |
Drug B N = 1021 |
|---|---|---|
| Age | 47.01 (14.71) | 47.45 (14.01) |
| (Missing) | 7 | 4 |
| Tumor Grade | ||
| I | 35 / 98 (36%) | 33 / 102 (32%) |
| II | 32 / 98 (33%) | 36 / 102 (35%) |
| III | 31 / 98 (32%) | 33 / 102 (32%) |
| 1 Mean (SD); n / N (%) | ||
有多种方法可以使用单个公式、公式列表和命名列表来指定 statistic= 参数。下面列出了为连续变量 age 和 marker 指定均值统计量的几种等效写法。任何接受公式的 {gtsummary} 函数参数都接受以下每一种写法。
Select with Helpers eg:
all_continuous() ~ "{mean}"Select by Variable Name eg:
c(age, marker) ~ "{mean}"Select with Named List eg:
list(age = "{mean}", marker = "{mean}")
1.4.2 {gtsummary}函数添加信息
{gtsummary}包具有向tbl_summary()表添加信息或统计信息的函数。
# add_overall(
# x, # gtsummary
# last = FALSE, # 是否显示在最后一列
# col_label = "**Overall** \nN = {style_number(N)}", # 汇总列显示的列名,默认
# statistic = NULL, # 调用的统计参数,默认为 NULL/ ~"{p}% (n={n})" ~ c(1,0)
# digits = NULL, # 小数位数
# ...
# )1.4.3 {gtsummary} 用于格式化表格的函数
{gtsummary}包附带了专门用于修改和格式化汇总表的函数。
添加tbl_summary()系列函数的示例
trial2 |>
tbl_summary(by = trt) |> # 分组变量
add_p(pvalue_fun = label_style_pvalue(digits = 2)) |>
add_overall() |> # 添加汇总列
add_n() |> # 添加记数列
modify_header(label ~ "**Variable**") |> # 表头
modify_spanning_header(c("stat_1", "stat_2") ~ "**Treatment Received**") |>
modify_footnote( # 脚注
all_stat_cols() ~ "Median (IQR) or Frequency (%)"
) |>
modify_caption("**Table 1. Patient Characteristics**") |> # 图注
bold_labels() # 变量标签粗体| Variable | N | Overall N = 2001 |
Treatment Received
|
p-value2 | |
|---|---|---|---|---|---|
| Drug A N = 981 |
Drug B N = 1021 |
||||
| Age | 189 | 47 (38, 57) | 46 (37, 60) | 48 (39, 56) | 0.72 |
| Unknown | 11 | 7 | 4 | ||
| Grade | 200 | 0.87 | |||
| I | 68 (34%) | 35 (36%) | 33 (32%) | ||
| II | 68 (34%) | 32 (33%) | 36 (35%) | ||
| III | 64 (32%) | 31 (32%) | 33 (32%) | ||
| 1 Median (IQR) or Frequency (%) | |||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | |||||
1.4.4 使用{gt}包函数修改表外观
{gt} 包包含了许多修改表格输出的强大功能。
要将 {gt} 包函数用于 {gtsummary} 表,必须先将汇总表转换为 gt 对象。为此,请在使用 {gtsummary} 函数完成修改后使用 as_gt() 函数。
trial2 |>
tbl_summary(by = trt, missing = "no") |>
add_n() |>
as_gt() |>
gt::tab_source_note(gt::md("*This data is simulated*"))| Characteristic | N | Drug A N = 981 |
Drug B N = 1021 |
|---|---|---|---|
| Age | 189 | 46 (37, 60) | 48 (39, 56) |
| Grade | 200 | ||
| I | 35 (36%) | 33 (32%) | |
| II | 32 (33%) | 36 (35%) | |
| III | 31 (32%) | 33 (32%) | |
| This data is simulated | |||
| 1 Median (Q1, Q3); n (%) | |||
1.5 Select Helpers 选择助手
- 整个 tidyverse 中可用的所有 {tidyselect} 帮助程序,例如 starts_with()、contains() 和 everything()(即任何可以与 dplyr::select() 函数一起使用的东西),都可以与 {gtsummary} 一起使用。
- {gtsummary} 包中包含的其他选择器,用于补充 tidyselect 功能:
Summary type
# all_continuous() # all_categorical()
1.6 Multi-line Continuous Summaries 多行连续摘要
连续变量也可以在多行上进行汇总——这是某些期刊中的常见格式。要让连续变量在多行上汇总,请将汇总类型更新为 "continuous2"(用于两行或多行上的汇总)。
trial2 |>
select(age, trt) |> # 选择age,trt列
tbl_summary(
by = trt, # 分组变量
type = all_continuous() ~ "continuous2", # 连续型变量在多行上汇总
statistic = all_continuous() ~ c( # 统计量显示
"{N_nonmiss}",
"{median} ({p25}, {p75})",
"{min}, {max}"
),
missing = "no" # 缺失值
) |>
add_p(pvalue_fun = label_style_pvalue(digits = 2)) #P值| Characteristic | Drug A N = 98 |
Drug B N = 102 |
p-value1 |
|---|---|---|---|
| Age | 0.72 | ||
| N Non-missing | 91 | 98 | |
| Median (Q1, Q3) | 46 (37, 60) | 48 (39, 56) | |
| Min, Max | 6, 78 | 9, 83 | |
| 1 Wilcoxon rank sum test | |||
1.7 Advanced Customization 高级定制
适用于所有{gtsummary}对象
{gtsummary}表有两个重要的内部对象:.$table_body 和 .$table_styling。
当您将 tbl_summary() 函数的输出打印到R控制台或 R markdown文档中时,.$table_body 数据帧使用.$table_styling中列出的说明进行格式化。默认输出使用 as_gt(),通过在.$table_body 上执行的一系列{gt}命令将{gtsummary}对象转换为{gt}对象。以下是使用tbl_summary() 保存的前 4 个{gt}调用:
tbl_summary(trial2) |>
as_gt(return_calls = TRUE) |>
head(n = 4)$gt
gt::gt(data = x$table_body, groupname_col = NULL, caption = NULL)
$fmt_missing
$fmt_missing[[1]]
gt::sub_missing(columns = gt::everything(), missing_text = "")
$cols_merge
list()
$cols_align
$cols_align[[1]]
gt::cols_align(columns = c("variable", "var_type", "row_type",
"var_label", "stat_0"), align = "center")
$cols_align[[2]]
gt::cols_align(columns = "label", align = "left")
#> $gt
#> gt::gt(data = x$table_body, groupname_col = NULL, caption = NULL)
#>
#> $fmt_missing
#> $fmt_missing[[1]]
#> gt::sub_missing(columns = gt::everything(), missing_text = "")
#>
#>
#> $cols_merge
#> list()
#>
#> $cols_align
#> $cols_align[[1]]
#> gt::cols_align(columns = c("variable", "var_type", "row_type",
#> "var_label", "stat_0"), align = "center")
#>
#> $cols_align[[2]]
#> gt::cols_align(columns = "label", align = "left"){gt}函数按它们出现的顺序调用,从 gt::gt()开始。
如果不希望运行特定的{gt}函数(即希望更改默认输出格式),则可以在as_gt()函数中排除任何{gt}调用。在下面的示例中,将恢复默认对齐方式。
运行as_gt()函数后,可以使用{gt}函数向表中添加其他格式。在下面的示例中,源注释被添加到表中:
tbl_summary(trial2, by = trt) |>
as_gt(include = -cols_align) |>
gt::tab_source_note(gt::md("*This data is simulated*"))| Characteristic | Drug A N = 981 |
Drug B N = 1021 |
|---|---|---|
| Age | 46 (37, 60) | 48 (39, 56) |
| Unknown | 7 | 4 |
| Grade | ||
| I | 35 (36%) | 33 (32%) |
| II | 32 (33%) | 36 (35%) |
| III | 31 (32%) | 33 (32%) |
| This data is simulated | ||
| 1 Median (Q1, Q3); n (%) | ||
1.8 Set Default Options with Themes 使用主题设置默认选项
# set_gtsummary_theme(x, quiet)
#
# reset_gtsummary_theme()
#
# get_gtsummary_theme()
#
# with_gtsummary_theme(
# x,
# expr,
# env = rlang::caller_env(),
# msg_ignored_elements = NULL
# )
#
# check_gtsummary_theme(x)# # Setting JAMA theme for gtsummary
# set_gtsummary_theme(theme_gtsummary_journal("jama"))
# # Themes can be combined by including more than one
# set_gtsummary_theme(theme_gtsummary_compact())
#
# set_gtsummary_theme_ex1 <-
# trial |>
# tbl_summary(by = trt, include = c(age, grade, trt)) |>
# add_stat_label() |>
# as_gt()
#
# # reset gtsummary theme
# reset_gtsummary_theme()1.9 Survey Data 调查数据
{gtsummary}包还通过tbl_svysummary()函数支持调查数据(使用{survey}包创建的对象)。tbl_svysummary()和tbl_summary()的语法几乎相同,上面的示例也适用于调查摘要。(详情可见 :tbl_svysummary包 )
# tbl_svysummary(
# data,
# by = NULL,
# label = NULL,
# statistic = list(all_continuous() ~ "{median} ({p25}, {p75})", all_categorical() ~
# "{n} ({p}%)"),
# digits = NULL,
# type = NULL,
# value = NULL,
# missing = c("ifany", "no", "always"),
# missing_text = "Unknown",
# missing_stat = "{N_miss}",
# sort = all_categorical(FALSE) ~ "alphanumeric",
# percent = c("column", "row", "cell"),
# include = everything()
# )要开始,请安装{survey}软件包并加载apiclus1数据集。
# install.packages("survey")# loading the api data set
data(api, package = "survey")在我们开始之前,我们将数据框转换为调查对象,注册ID和权重列,并设置有限总体校正列。
svy_apiclus1 <-
survey::svydesign(
id = ~dnum,
weights = ~pw,
data = apiclus1,
fpc = ~fpc
)创建survey对象后,我们现在可以使用tbl_svysummary()像汇总标准数据框一样对其进行汇总。与tbl_summary()类似,tbl_svysummary() 接受 by= 参数,并与 add_p()和add_overall()函数一起工作。
无法将自定义函数传递给 tbl_svysummary()的statistic=参数。您必须使用一个预定义的汇总统计函数(例如{mean}、{median}),这些函数利用{survey}包中的函数来计算加权统计。
svy_apiclus1 |>
tbl_svysummary(
# stratify summary statistics by the "both" column
by = both,
# summarize a subset of the columns
include = c(api00, api99, both),
# adding labels to table
label = list(api00 = "API in 2000",
api99 = "API in 1999")
) |>
add_p() |> # comparing values by "both" column
add_overall() |>
# adding spanning header
modify_spanning_header(c("stat_1", "stat_2") ~ "**Met Both Targets**")| Characteristic | Overall N = 6,1941 |
Met Both Targets
|
p-value2 | |
|---|---|---|---|---|
| No N = 1,6921 |
Yes N = 4,5021 |
|||
| API in 2000 | 652 (552, 719) | 631 (559, 710) | 655 (551, 723) | 0.4 |
| API in 1999 | 615 (512, 692) | 632 (550, 701) | 613 (499, 687) | 0.2 |
| 1 Median (Q1, Q3) | ||||
| 2 Design-based KruskalWallis test | ||||
tbl_svysummary() 还可以处理加权调查数据,其中每行表示多个个体:
Titanic |>
as_tibble() |>
survey::svydesign(data = _, ids = ~1, weights = ~n) |>
tbl_svysummary(include = c(Age, Survived))| Characteristic | N = 2,2011 |
|---|---|
| Age | |
| Adult | 2,092 (95%) |
| Child | 109 (5.0%) |
| Survived | 711 (32%) |
| 1 n (%) | |
1.10 Cross Tables 交叉表
# tbl_cross(
# data,
# row = 1L,
# col = 2L,
# label = NULL,
# statistic = ifelse(percent == "none", "{n}", "{n} ({p}%)"),
# digits = NULL,
# percent = c("none", "column", "row", "cell"),
# margin = c("column", "row"),
# missing = c("ifany", "always", "no"),
# missing_text = "Unknown",
# margin_text = "Total"
# )使用tbl_cross() 比较数据中的两个分类变量。tbl_cross() 是tbl_summary() 的包装器,它:
- 自动向表中添加具有比较变量的名称或标签的跨越标头
- percent 参数默认为 "none"(下面的示例中显式设置了 percent = "cell")
- 添加行和列边距合计(可通过 margin 参数自定义)
- 显示行变量和列变量中缺少的数据(可通过 missing 参数进行自定义)
trial |>
tbl_cross(
row = stage,
col = trt,
percent = "cell"
) |>
add_p()
Chemotherapy Treatment
|
Total | p-value1 | ||
|---|---|---|---|---|
| Drug A | Drug B | |||
| T Stage | 0.9 | |||
| T1 | 28 (14%) | 25 (13%) | 53 (27%) | |
| T2 | 25 (13%) | 29 (15%) | 54 (27%) | |
| T3 | 22 (11%) | 21 (11%) | 43 (22%) | |
| T4 | 23 (12%) | 27 (14%) | 50 (25%) | |
| Total | 98 (49%) | 102 (51%) | 200 (100%) | |
| 1 Pearson’s Chi-squared test | ||||
2. tbl_regression() 绘制回归分析结果
# tbl_regression(x, ...)
#
# # Default S3 method
# tbl_regression(
# x, # 回归模型对象
# label = NULL, # 变量命名,如:list(age = "Age", stage = "Path T Stage")
# exponentiate = FALSE, # 是否对系数估计值取幂的逻辑,默认为FALSE
# include = everything(), # 要包含在输出中的变量,默认为All
# show_single_row = NULL, # 默认情况下,分类变量打印在多行上。如果一个变量是二分的(例如是/否),并且您希望打印 回归系数,在这里包括变量名称。
# conf.level = 0.95, # 置信区间/可信区间的置信水平
# intercept = FALSE, # 指示是否在输出中包括截距
# estimate_fun = ifelse(exponentiate, label_style_ratio(), label_style_sigfig()), # 函数对系数估计值进行舍入和格式化
# pvalue_fun = label_style_pvalue(digits = 1),
# tidy_fun = broom.helpers::tidy_with_broom_or_parameters,
# add_estimate_to_reference_rows = FALSE, # 添加参考值
# conf.int = TRUE,
# ...
# )2.1 Setup 设置
# install.packages("gtsummary")
library(gtsummary)2.2 Example data set 示例数据集
trial 数据框中的每个变量都被分配了一个属性标签(labelled 包)。
2.3 Basic Usage 基础用法
让我们首先创建一个逻辑回归模型,使用trial数据集的变量年龄(age)和分期(stage)来预测肿瘤反应。
# build logistic regression model
m1 <- glm(response ~ age + stage, trial, family = binomial)
# view raw model results
summary(m1)$coefficients Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.48622424 0.62022844 -2.3962530 0.01656365
age 0.01939109 0.01146813 1.6908683 0.09086195
stageT2 -0.54142643 0.44000267 -1.2305071 0.21850725
stageT3 -0.05953479 0.45042027 -0.1321761 0.89484501
stageT4 -0.23108633 0.44822835 -0.5155549 0.60616530
然后,我们将使用一个回归模型表来总结并显示这些结果,只需使用{gtsummary}中的一行代码。
tbl_regression(m1, exponentiate = TRUE)| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Age | 1.02 | 1.00, 1.04 | 0.091 |
| T Stage | |||
| T1 | — | — | |
| T2 | 0.58 | 0.24, 1.37 | 0.2 |
| T3 | 0.94 | 0.39, 2.28 | 0.9 |
| T4 | 0.79 | 0.33, 1.90 | 0.6 |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
该模型被识别为logistic回归,系数取幂,因此标题显示比值比为“OR”
系统会自动检测变量类型,并为分类变量添加Reference行
对模型估计值和置信区间进行四舍五入和格式化
由于数据集中的变量已被标记,因此这些标记将被带入
{gtsummary}输出表。如果数据没有被标记,默认是显示变量名。变量水平缩进并添加脚注
2.4 Customize Output 自定义输出
2.4.1 Modifying function arguments 修改函数参数
2.4.2 {gtsummary} functions to add information
2.4.3 {gtsummary} functions to format table {gtsummary}用于格式化表格的函数
2.4.4 {gt} functions to format table {gt}函数格式化表格
m1 |>
tbl_regression(exponentiate = TRUE) |>
as_gt() |>
gt::tab_source_note(gt::md("*This data is simulated*"))| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Age | 1.02 | 1.00, 1.04 | 0.091 |
| T Stage | |||
| T1 | — | — | |
| T2 | 0.58 | 0.24, 1.37 | 0.2 |
| T3 | 0.94 | 0.39, 2.28 | 0.9 |
| T4 | 0.79 | 0.33, 1.90 | 0.6 |
| This data is simulated | |||
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
例如:
对系数取幂以给出比值比
报告阶段的总体p值-较大的p值四舍五入至两位小数
小于0.10的P值为粗体-变量标签为粗体
变量水平以斜体表示
# format results into data frame with global p-values
m1 |>
tbl_regression(
exponentiate = TRUE,
pvalue_fun = label_style_pvalue(digits = 2),
) |>
add_global_p() |>
bold_p(t = 0.10) |>
bold_labels() |>
italicize_levels()| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Age | 1.02 | 1.00, 1.04 | 0.087 |
| T Stage | 0.62 | ||
| T1 | — | — | |
| T2 | 0.58 | 0.24, 1.37 | |
| T3 | 0.94 | 0.39, 2.28 | |
| T4 | 0.79 | 0.33, 1.90 | |
| 1 OR = Odds Ratio, CI = Confidence Interval | |||
2.5 Univariate Regression 单变量回归
# tbl_uvregression(data, ...)
#
# # S3 method for class 'data.frame'
# tbl_uvregression(
# data, # 数据框
# y = NULL, # 模型结局(例如,y=复发或y=Surv(时间,复发))
# x = NULL, # 协变量(例如,x=trt 在include中指定的所有其他列将针对常数y或x进行回归
# method, # 回归方法或函数,lm、glm、survival::coxph、survey::svyglm
# method.args = list(),
# exponentiate = FALSE, # 是否对系数估计值取幂
# label = NULL, # 变量重命名
# include = everything(), # 包含在输出中的变量
# tidy_fun = broom.helpers::tidy_with_broom_or_parameters, # 默认使用broom::tidy()。 如果发生错误,则尝试使用 parameters::model_parameters(),如果已安装。
# hide_n = FALSE, # 隐藏N列
# show_single_row = NULL, # 默认情况下,分类变量打印在多行上
# conf.level = 0.95, # 置信区间/可信区间的置信水平
# estimate_fun = ifelse(exponentiate, label_style_ratio(), label_style_sigfig()),
# pvalue_fun = label_style_pvalue(digits = 1),
# formula = "{y} ~ {x}",# 模型公式的字符串
# add_estimate_to_reference_rows = FALSE,
# conf.int = TRUE,
# ...
# )
#
# # S3 method for class 'survey.design'
# tbl_uvregression(
# data,
# y = NULL,
# x = NULL,
# method,
# method.args = list(),
# exponentiate = FALSE,
# label = NULL,
# include = everything(),
# tidy_fun = broom.helpers::tidy_with_broom_or_parameters,
# hide_n = FALSE,
# show_single_row = NULL,
# conf.level = 0.95,
# estimate_fun = ifelse(exponentiate, label_style_ratio(), label_style_sigfig()),
# pvalue_fun = label_style_pvalue(digits = 1),
# formula = "{y} ~ {x}",
# add_estimate_to_reference_rows = FALSE,
# conf.int = TRUE,
# ...
# )tbl_uvregression()函数的作用是:生成一个单变量回归模型表。该函数是tbl_regression()的包装器,因此接受几乎相同的函数参数。可以用与tbl_regression()类似的方式修改函数的结果
# Example 1 ----------------------------------
tbl_uvregression(
trial,
method = glm,
y = response,
method.args = list(family = binomial),
exponentiate = TRUE,
include = c("age", "grade")
)| Characteristic | N | OR1 | 95% CI1 | p-value |
|---|---|---|---|---|
| Age | 183 | 1.02 | 1.00, 1.04 | 0.10 |
| Grade | 193 | |||
| I | — | — | ||
| II | 0.95 | 0.45, 2.00 | 0.9 | |
| III | 1.10 | 0.52, 2.29 | 0.8 | |
| 1 OR = Odds Ratio, CI = Confidence Interval | ||||
# Example 2 ----------------------------------
# rounding pvalues to 2 decimal places
library(survival)
tbl_uvregression(
trial,
method = coxph,
y = Surv(ttdeath, death),
exponentiate = TRUE,
include = c("age", "grade", "response"),
pvalue_fun = label_style_pvalue(digits = 2)
)| Characteristic | N | HR1 | 95% CI1 | p-value |
|---|---|---|---|---|
| Age | 189 | 1.01 | 0.99, 1.02 | 0.33 |
| Grade | 200 | |||
| I | — | — | ||
| II | 1.28 | 0.80, 2.05 | 0.31 | |
| III | 1.69 | 1.07, 2.66 | 0.024 | |
| Tumor Response | 193 | 0.50 | 0.31, 0.78 | 0.003 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | ||||
trial |>
tbl_uvregression(
method = glm,
y = response,
include = c(age, grade),
method.args = list(family = binomial),
exponentiate = TRUE,
pvalue_fun = label_style_pvalue(digits = 2)
) |>
add_global_p() |> # add global p-value
add_nevent() |> # add number of events of the outcome
add_q() |> # adjusts global p-values for multiple testing
bold_p() |> # bold p-values under a given threshold (default 0.05)
bold_p(t = 0.10, q = TRUE) |> # now bold q-values under the threshold of 0.10
bold_labels()| Characteristic | N | Event N | OR1 | 95% CI1 | p-value | q-value2 |
|---|---|---|---|---|---|---|
| Age | 183 | 58 | 1.02 | 1.00, 1.04 | 0.091 | 0.18 |
| Grade | 193 | 61 | 0.93 | 0.93 | ||
| I | — | — | ||||
| II | 0.95 | 0.45, 2.00 | ||||
| III | 1.10 | 0.52, 2.29 | ||||
| 1 OR = Odds Ratio, CI = Confidence Interval | ||||||
| 2 False discovery rate correction for multiple testing | ||||||
2.6 Setting Default Options 设置默认选项
2.7 Supported Models 已支持的模型
3. Frequently Asked Questions 常见问题
FAQ + Gallery: FAQ
3.1 Summary Tables 汇总表
3.1.1 组列上添加跨越标题以增加清晰度
modify_spanning_header(all_stat_cols() ~ "**Chemotherapy Treatment**")
trial |>
tbl_summary(
by = trt,
include = c(age, grade),
missing = "no",
statistic = all_continuous() ~ "{median} ({p25}, {p75})"
) |>
modify_header(all_stat_cols() ~ "**{level}** \nN = {n} ({style_percent(p)}%)") |>
add_n() |>
bold_labels() |>
modify_spanning_header(all_stat_cols() ~ "**Chemotherapy Treatment**")| Characteristic | N |
Chemotherapy Treatment
|
|
|---|---|---|---|
| Drug A N = 98 (49%)1 |
Drug B N = 102 (51%)1 |
||
| Age | 189 | 46 (37, 60) | 48 (39, 56) |
| Grade | 200 | ||
| I | 35 (36%) | 33 (32%) | |
| II | 32 (33%) | 36 (35%) | |
| III | 31 (32%) | 33 (32%) | |
| 1 Median (Q1, Q3); n (%) | |||
3.1.2 在多行上显示
trial |>
tbl_summary(
by = trt,
include = c(age, marker),
type = all_continuous() ~ "continuous2",
statistic =
all_continuous() ~ c("{N_nonmiss}",
"{mean} ({sd})",
"{median} ({p25}, {p75})",
"{min}, {max}"),
missing = "no"
) |>
italicize_levels()| Characteristic | Drug A N = 98 |
Drug B N = 102 |
|---|---|---|
| Age | ||
| N Non-missing | 91 | 98 |
| Mean (SD) | 47 (15) | 47 (14) |
| Median (Q1, Q3) | 46 (37, 60) | 48 (39, 56) |
| Min, Max | 6, 78 | 9, 83 |
| Marker Level (ng/mL) | ||
| N Non-missing | 92 | 98 |
| Mean (SD) | 1.02 (0.89) | 0.82 (0.83) |
| Median (Q1, Q3) | 0.84 (0.23, 1.60) | 0.52 (0.18, 1.21) |
| Min, Max | 0.00, 3.87 | 0.01, 3.64 |
3.1.3 修改格式化p值的函数,更改变量标签
trial |>
mutate(response = factor(response, labels = c("No Tumor Response", "Tumor Responded"))) |>
tbl_summary(
by = response,
include = c(age, grade),
missing = "no",
label = list(age ~ "Patient Age", grade ~ "Tumor Grade")
) |>
add_p(pvalue_fun = label_style_pvalue(digits = 2)) |>
add_q()7 missing rows in the "response" column have been removed.
| Characteristic | No Tumor Response N = 1321 |
Tumor Responded N = 611 |
p-value2 | q-value3 |
|---|---|---|---|---|
| Patient Age | 46 (36, 55) | 49 (43, 59) | 0.091 | 0.18 |
| Tumor Grade | 0.93 | 0.93 | ||
| I | 46 (35%) | 21 (34%) | ||
| II | 44 (33%) | 19 (31%) | ||
| III | 42 (32%) | 21 (34%) | ||
| 1 Median (Q1, Q3); n (%) | ||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | ||||
| 3 False discovery rate correction for multiple testing | ||||
3.1.4 使用 forcats::fct_na_value_to_level将缺失值纳入
trial |>
mutate(
response =
factor(response, labels = c("No Tumor Response", "Tumor Responded")) |>
forcats::fct_na_value_to_level(level = "Missing Response Status")
) |>
tbl_summary(
by = response,
include = c(age, grade),
label = list(age ~ "Patient Age", grade ~ "Tumor Grade")
)| Characteristic | No Tumor Response N = 1321 |
Tumor Responded N = 611 |
Missing Response Status N = 71 |
|---|---|---|---|
| Patient Age | 46 (36, 55) | 49 (43, 59) | 52 (42, 57) |
| Unknown | 7 | 3 | 1 |
| Tumor Grade | |||
| I | 46 (35%) | 21 (34%) | 1 (14%) |
| II | 44 (33%) | 19 (31%) | 5 (71%) |
| III | 42 (32%) | 21 (34%) | 1 (14%) |
| 1 Median (Q1, Q3); n (%) | |||
3.1.5 报告两组之间的治疗差异
trial |>
tbl_summary(
by = trt,
include = c(response, marker),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{p}%"
),
missing = "no"
) |>
add_difference() |>
add_n() |>
modify_header(all_stat_cols() ~ "**{level}**")| Characteristic | N | Drug A1 | Drug B1 | Difference2 | 95% CI2,3 | p-value2 |
|---|---|---|---|---|---|---|
| Tumor Response | 193 | 29% | 34% | -4.2% | -18%, 9.9% | 0.6 |
| Marker Level (ng/mL) | 190 | 1.02 (0.89) | 0.82 (0.83) | 0.20 | -0.05, 0.44 | 0.12 |
| 1 | ||||||
| 2 2-sample test for equality of proportions with continuity correction; Welch Two Sample t-test | ||||||
| 3 CI = Confidence Interval | ||||||
3.1.6 Paired t-test and McNemar’s test.
# imagine that each patient received Drug A and Drug B (adding ID showing their paired measurements)
trial_paired <-
trial |>
select(trt, marker, response) |>
mutate(.by = trt, id = dplyr::row_number())
# you must first delete incomplete pairs from the data, then you can build the table
trial_paired |>
# delete missing values
tidyr::drop_na() |>
# keep IDs with both measurements
dplyr::filter(.by = id, dplyr::n() == 2) |>
# summarize data
tbl_summary(by = trt, include = -id) |>
add_p(
test = list(marker ~ "paired.t.test",
response ~ "mcnemar.test"),
group = id
)| Characteristic | Drug A N = 831 |
Drug B N = 831 |
p-value2 |
|---|---|---|---|
| Marker Level (ng/mL) | 0.82 (0.22, 1.71) | 0.53 (0.17, 1.31) | 0.2 |
| Tumor Response | 21 (25%) | 28 (34%) | 0.3 |
| 1 Median (Q1, Q3); n (%) | |||
| 2 Paired t-test; McNemar’s Chi-squared test with continuity correction | |||
3.1.7 将所有组与单个参考组进行比较的p值
# table summarizing data with no p-values
small_trial <- trial |> select(grade, age, response)
t0 <- small_trial |>
tbl_summary(by = grade, missing = "no") |>
modify_header(all_stat_cols() ~ "**{level}**")
# table comparing grade I and II
t1 <- small_trial |>
dplyr::filter(grade %in% c("I", "II")) |>
tbl_summary(by = grade, missing = "no") |>
add_p() |>
modify_header(p.value ~ "**I vs. II**") |>
# hide summary stat columns
modify_column_hide(all_stat_cols())
# table comparing grade I and III
t2 <- small_trial |>
dplyr::filter(grade %in% c("I", "III")) |>
tbl_summary(by = grade, missing = "no") |>
add_p() |>
modify_header(p.value = "**I vs. III**") |>
# hide summary stat columns
modify_column_hide(all_stat_cols())
# merging the 3 tables together, and adding additional gt formatting
tbl_merge(list(t0, t1, t2)) |>
modify_spanning_header(
all_stat_cols() ~ "**Tumor Grade**",
starts_with("p.value") ~ "**p-values**"
)| Characteristic |
Tumor Grade
|
p-values
|
|||
|---|---|---|---|---|---|
| I1 | II1 | III1 | I vs. II2 | I vs. III2 | |
| Age | 47 (37, 56) | 49 (37, 57) | 47 (38, 58) | 0.7 | 0.5 |
| Tumor Response | 21 (31%) | 19 (30%) | 21 (33%) | >0.9 | 0.9 |
| 1 Median (Q1, Q3); n (%) | |||||
| 2 Wilcoxon rank sum test; Fisher’s exact test | |||||
3.1.8 多个变量分层的汇总表
trial |>
select(trt, grade, age, stage) |>
mutate(grade = paste("Grade", grade)) |>
tbl_strata(
strata = grade,
~ .x |>
tbl_summary(by = trt, missing = "no") |>
modify_header(all_stat_cols() ~ "**{level}**")
)| Characteristic |
Grade I
|
Grade II
|
Grade III
|
|||
|---|---|---|---|---|---|---|
| Drug A1 | Drug B1 | Drug A1 | Drug B1 | Drug A1 | Drug B1 | |
| Age | 46 (36, 60) | 48 (42, 55) | 45 (31, 55) | 51 (42, 58) | 52 (42, 61) | 45 (36, 52) |
| T Stage | ||||||
| T1 | 8 (23%) | 9 (27%) | 14 (44%) | 9 (25%) | 6 (19%) | 7 (21%) |
| T2 | 8 (23%) | 10 (30%) | 8 (25%) | 9 (25%) | 9 (29%) | 10 (30%) |
| T3 | 11 (31%) | 7 (21%) | 5 (16%) | 6 (17%) | 6 (19%) | 8 (24%) |
| T4 | 8 (23%) | 7 (21%) | 5 (16%) | 12 (33%) | 10 (32%) | 8 (24%) |
| 1 Median (Q1, Q3); n (%) | ||||||
3.2 Regression Tables 回归表
3.2.1 单变量回归表中包括观察数和事件数
trial |>
tbl_uvregression(
method = glm,
y = response,
include = c(age, grade),
method.args = list(family = binomial),
exponentiate = TRUE
) |>
add_nevent()| Characteristic | N | Event N | OR1 | 95% CI1 | p-value |
|---|---|---|---|---|---|
| Age | 183 | 58 | 1.02 | 1.00, 1.04 | 0.10 |
| Grade | 193 | 61 | |||
| I | — | — | |||
| II | 0.95 | 0.45, 2.00 | 0.9 | ||
| III | 1.10 | 0.52, 2.29 | 0.8 | ||
| 1 OR = Odds Ratio, CI = Confidence Interval | |||||
3.2.2 包括两个相关的模型并排与描述性统计
gt_r1 <- glm(response ~ trt + grade, trial, family = binomial) |>
tbl_regression(exponentiate = TRUE)
gt_r2 <- survival::coxph(survival::Surv(ttdeath, death) ~ trt + grade, trial) |>
tbl_regression(exponentiate = TRUE)
gt_t1 <- trial |>
tbl_summary(include = c(trt, grade), missing = "no") |>
add_n() |>
modify_header(stat_0 = "**n (%)**") |>
modify_footnote(stat_0 = NA_character_)
theme_gtsummary_compact()Setting theme "Compact"
tbl_merge(
list(gt_t1, gt_r1, gt_r2),
tab_spanner = c(NA_character_, "**Tumor Response**", "**Time to Death**")
)| Characteristic | N | n (%) |
Tumor Response
|
Time to Death
|
||||
|---|---|---|---|---|---|---|---|---|
| OR1 | 95% CI1 | p-value | HR1 | 95% CI1 | p-value | |||
| Chemotherapy Treatment | 200 | |||||||
| Drug A | 98 (49%) | — | — | — | — | |||
| Drug B | 102 (51%) | 1.21 | 0.66, 2.24 | 0.5 | 1.25 | 0.86, 1.81 | 0.2 | |
| Grade | 200 | |||||||
| I | 68 (34%) | — | — | — | — | |||
| II | 68 (34%) | 0.94 | 0.44, 1.98 | 0.9 | 1.28 | 0.80, 2.06 | 0.3 | |
| III | 64 (32%) | 1.09 | 0.52, 2.27 | 0.8 | 1.69 | 1.07, 2.66 | 0.024 | |
| 1 OR = Odds Ratio, CI = Confidence Interval, HR = Hazard Ratio | ||||||||
3.2.4 包括分类预测因子每个水平的事件数量
trial |>
tbl_uvregression(
method = survival::coxph,
y = survival::Surv(ttdeath, death),
include = c(stage, grade),
exponentiate = TRUE,
hide_n = TRUE
) |>
add_nevent(location = "level")| Characteristic | Event N | HR1 | 95% CI1 | p-value |
|---|---|---|---|---|
| T Stage | ||||
| T1 | 24 | — | — | |
| T2 | 27 | 1.18 | 0.68, 2.04 | 0.6 |
| T3 | 22 | 1.23 | 0.69, 2.20 | 0.5 |
| T4 | 39 | 2.48 | 1.49, 4.14 | <0.001 |
| Grade | ||||
| I | 33 | — | — | |
| II | 36 | 1.28 | 0.80, 2.05 | 0.3 |
| III | 43 | 1.69 | 1.07, 2.66 | 0.024 |
| 1 HR = Hazard Ratio, CI = Confidence Interval | ||||
3.2.5 回归模型,其中协变量保持不变,结果发生变化
trial |>
tbl_uvregression(
method = lm,
x = trt,
show_single_row = "trt",
hide_n = TRUE,
include = c(age, marker)
) |>
modify_header(label = "**Model Outcome**",
estimate = "**Treatment Coef.**") |>
modify_footnote(estimate = "Values larger than 0 indicate larger values in the Drug B group.")| Model Outcome | Treatment Coef.1 | 95% CI2 | p-value |
|---|---|---|---|
| Age | 0.44 | -3.7, 4.6 | 0.8 |
| Marker Level (ng/mL) | -0.20 | -0.44, 0.05 | 0.12 |
| 1 Values larger than 0 indicate larger values in the Drug B group. | |||
| 2 CI = Confidence Interval | |||
3.2.6 在p值较低的估计值上使用显著性星号
trial |>
tbl_uvregression(
method = survival::coxph,
y = survival::Surv(ttdeath, death),
include = c(stage, grade),
exponentiate = TRUE,
) |>
add_significance_stars()| Characteristic | N | HR1,2 | SE2 |
|---|---|---|---|
| T Stage | 200 | ||
| T1 | — | — | |
| T2 | 1.18 | 0.281 | |
| T3 | 1.23 | 0.295 | |
| T4 | 2.48*** | 0.260 | |
| Grade | 200 | ||
| I | — | — | |
| II | 1.28 | 0.241 | |
| III | 1.69* | 0.232 | |
| 1 *p<0.05; **p<0.01; ***p<0.001 | |||
| 2 HR = Hazard Ratio, SE = Standard Error | |||