setup

library(gtsummary)
library(tidyverse)

# Load the cleaned loan data and store loan_status as a factor so that it is
# summarised and modelled as a categorical outcome.
data <- mutate(
  read.csv("loan_data_cleaned.csv"),
  loan_status = as.factor(loan_status)
)

# Keep only the columns shown in the summary tables.
tab <- select(data, loan_status, loan_amnt, age, home_ownership)

output 1

# Default summary table: every column of `tab` with gtsummary's defaults.
tbl_summary(tab)

Characteristic	N = 29,091¹
loan_status
0	25,864 (89%)
1	3,227 (11%)
loan_amnt	8,000 (5,000, 12,250)
age	26 (23, 30)
home_ownership
MORTGAGE	12,001 (41%)
OTHER	97 (0.3%)
OWN	2,301 (7.9%)
RENT	14,692 (51%)
¹ n (%); Median (IQR)

summaries

Four types of summaries: continuous, continuous2, categorical, and dichotomous
Statistics are median (IQR) for continuous, n (%) for categorical/dichotomous
Variables coded 0/1, TRUE/FALSE, Yes/No are treated as dichotomous
Lists NA values under “Unknown”
Label attributes are printed automatically

Customize tbl_summary()

# Summary table split into one statistics column per loan_status level.
# (No trailing comma after the last argument: it would pass an empty argument.)
tb <- tbl_summary(
  tab,
  by = loan_status
)

output 2

tb

Characteristic	0, N = 25,864¹	1, N = 3,227¹
loan_amnt	8,000 (5,000, 12,250)	8,000 (4,850, 12,000)
age	26 (23, 30)	26 (23, 30)
home_ownership
MORTGAGE	10,820 (42%)	1,181 (37%)
OTHER	80 (0.3%)	17 (0.5%)
OWN	2,049 (7.9%)	252 (7.8%)
RENT	12,915 (50%)	1,777 (55%)
¹ Median (IQR); n (%)

Notes

by: specify a column variable for cross-tabulation
type: specify the summary type
statistic: customize the reported statistics

Customize tbl_summary()


# Summary by loan_status with customised statistics:
#   * age is typed "continuous2" so it can show several statistic rows
#     (mean (SD) on one row, min/max on the next);
#   * home_ownership reports count / column total (percent).
# Variables not listed in `statistic` keep their default statistics.
tb2 <- tbl_summary(
  tab,
  by = loan_status,
  type = age ~ "continuous2",
  statistic = list(
    age ~ c("{mean} ({sd})", "{min}, {max}"),
    home_ownership ~ "{n} / {N} ({p}%)"
  )
)

output 3

tb2

Characteristic	0, N = 25,864¹	1, N = 3,227¹
loan_amnt	8,000 (5,000, 12,250)	8,000 (4,850, 12,000)
age
Mean (SD)	28 (6)	27 (6)
Range	20, 94	20, 70
home_ownership
MORTGAGE	10,820 / 25,864 (42%)	1,181 / 3,227 (37%)
OTHER	80 / 25,864 (0.3%)	17 / 3,227 (0.5%)
OWN	2,049 / 25,864 (7.9%)	252 / 3,227 (7.8%)
RENT	12,915 / 25,864 (50%)	1,777 / 3,227 (55%)
¹ Median (IQR); n / N (%)

renaming


# Same customised summary by loan_status (multi-row statistics for age,
# n / N (%) for home_ownership), with the home_ownership row relabelled
# as "living status" via `label`.
tb3 <- tbl_summary(
  tab,
  by = loan_status,
  type = age ~ "continuous2",
  statistic = list(
    age ~ c("{mean} ({sd})", "{min}, {max}"),
    home_ownership ~ "{n} / {N} ({p}%)"
  ),
  label = home_ownership ~ "living status"
)

output 4

tb3

Characteristic	0, N = 25,864¹	1, N = 3,227¹
loan_amnt	8,000 (5,000, 12,250)	8,000 (4,850, 12,000)
age
Mean (SD)	28 (6)	27 (6)
Range	20, 94	20, 70
living status
MORTGAGE	10,820 / 25,864 (42%)	1,181 / 3,227 (37%)
OTHER	80 / 25,864 (0.3%)	17 / 3,227 (0.5%)
OWN	2,049 / 25,864 (7.9%)	252 / 3,227 (7.8%)
RENT	12,915 / 25,864 (50%)	1,777 / 3,227 (55%)
¹ Median (IQR); n / N (%)

few notes

Named lists are OK too, e.g. label = list(age = "Patient Age")

Add-on functions in {gtsummary}

tbl_summary() objects can also be updated using related functions.

add_*() add additional columns of statistics or information, e.g. p-values, q-values, overall statistics, treatment differences, N obs., and more
modify_*() modify table headers, spanning headers, footnotes, and more
bold_*()/italicize_*() style labels, variable levels, significant p-values

Update tbl_summary() with add_*()


# Summary by loan_status with a p-value column comparing the two groups and
# a q-value column (p-values adjusted with the "fdr" method).
tb5 <- tbl_summary(tab, by = loan_status) |>
  add_p() |>
  add_q(method = "fdr")

output

tb5

Characteristic	0, N = 25,864¹	1, N = 3,227¹	p-value²	q-value³
loan_amnt	8,000 (5,000, 12,250)	8,000 (4,850, 12,000)	0.001	0.001
age	26 (23, 30)	26 (23, 30)	<0.001	<0.001
home_ownership			<0.001	<0.001
MORTGAGE	10,820 (42%)	1,181 (37%)
OTHER	80 (0.3%)	17 (0.5%)
OWN	2,049 (7.9%)	252 (7.8%)
RENT	12,915 (50%)	1,777 (55%)
¹ Median (IQR); n (%)
² Wilcoxon rank sum test; Pearson's Chi-squared test
³ False discovery rate correction for multiple testing

some notes

add_p(): adds a column of p-values
add_q(): adds a column of p-values adjusted for multiple comparisons through a call to p.adjust()

Update tbl_summary() with add_*()

# Summary by loan_status with missing-value rows suppressed, plus an
# "Overall" column covering all observations.
mt <- tbl_summary(tab, by = loan_status, missing = "no") |>
  add_overall()

output

mt

Characteristic	Overall, N = 29,091¹	0, N = 25,864¹	1, N = 3,227¹
loan_amnt	8,000 (5,000, 12,250)	8,000 (5,000, 12,250)	8,000 (4,850, 12,000)
age	26 (23, 30)	26 (23, 30)	26 (23, 30)
home_ownership
MORTGAGE	12,001 (41%)	10,820 (42%)	1,181 (37%)
OTHER	97 (0.3%)	80 (0.3%)	17 (0.5%)
OWN	2,301 (7.9%)	2,049 (7.9%)	252 (7.8%)
RENT	14,692 (51%)	12,915 (50%)	1,777 (55%)
¹ Median (IQR); n (%)

add_overall(): adds a column of overall statistics

Update tbl_summary() with add_*()

# Summary by loan_status (missing-value rows suppressed) with an "Overall"
# column and an N column giving the number of observations per variable.
tbl_summary(tab, by = loan_status, missing = "no") |>
  add_overall() |>
  add_n()

Characteristic	N	Overall, N = 29,091¹	0, N = 25,864¹	1, N = 3,227¹
loan_amnt	29,091	8,000 (5,000, 12,250)	8,000 (5,000, 12,250)	8,000 (4,850, 12,000)
age	29,091	26 (23, 30)	26 (23, 30)	26 (23, 30)
home_ownership	29,091
MORTGAGE		12,001 (41%)	10,820 (42%)	1,181 (37%)
OTHER		97 (0.3%)	80 (0.3%)	17 (0.5%)
OWN		2,301 (7.9%)	2,049 (7.9%)	252 (7.8%)
RENT		14,692 (51%)	12,915 (50%)	1,777 (55%)
¹ Median (IQR); n (%)

Update with bold_*()/italicize_*()

# Summary by loan_status with p-values and text styling:
# variable labels in bold, factor levels in italics, and p-values at or
# below the threshold t in bold (t = 0.8 here, so nearly any p-value is bolded).
mis <- tbl_summary(tab, by = loan_status) |>
  add_p() |>
  bold_labels() |>
  italicize_levels() |>
  bold_p(t = 0.8)

output

mis

Characteristic	0, N = 25,864¹	1, N = 3,227¹	p-value²
loan_amnt	8,000 (5,000, 12,250)	8,000 (4,850, 12,000)	0.001
age	26 (23, 30)	26 (23, 30)	<0.001
home_ownership			<0.001
MORTGAGE	10,820 (42%)	1,181 (37%)
OTHER	80 (0.3%)	17 (0.5%)
OWN	2,049 (7.9%)	252 (7.8%)
RENT	12,915 (50%)	1,777 (55%)
¹ Median (IQR); n (%)
² Wilcoxon rank sum test; Pearson's Chi-squared test

some notes

bold_labels(): bold the variable labels
italicize_levels(): italicize the variable levels
bold_p(): bold p-values according to a specified threshold

Update tbl_summary() with modify_*()

# Summary by loan_status with custom column headers, a spanning header over
# the statistic columns, and a custom footnote.
#
# stat_1 / stat_2 follow the level order of `by`: stat_1 is loan_status == 0
# and stat_2 is loan_status == 1. Level 0 is the large majority of loans, so
# it is labelled "not defaulted" and level 1 "defaulted".
# NOTE(review): assumes loan_status == 1 codes a default -- confirm against
# the data dictionary.
tbl <-
  tab |>
  tbl_summary(
    by = loan_status,
    missing = "no"
  ) |>
  modify_header(
    stat_1 ~ "**not defaulted**",
    stat_2 ~ "**defaulted**"
  ) |>
  modify_spanning_header(
    all_stat_cols() ~ "**loan status**"
  ) |>
  modify_footnote(
    all_stat_cols() ~
      paste(
        "median (IQR) for continuous;",
        "n (%) for categorical"
      )
  )

output

tbl

Characteristic	loan status
Characteristic	defaulted¹	not defaulted¹
loan_amnt	8,000 (5,000, 12,250)	8,000 (4,850, 12,000)
age	26 (23, 30)	26 (23, 30)
home_ownership
MORTGAGE	10,820 (42%)	1,181 (37%)
OTHER	80 (0.3%)	17 (0.5%)
OWN	2,049 (7.9%)	252 (7.8%)
RENT	12,915 (50%)	1,777 (55%)
¹ median (IQR) for continuous; n (%) for categorical

MODELING

# Logistic regression of loan_status on borrower age and loan grade,
# fitted on the full data set (binomial family, logit link).
m1 <- glm(
  formula = loan_status ~ age + grade,
  family = binomial(link = "logit"),
  data = data
)

default output

summary(m1)
#> 
#> Call:
#> glm(formula = loan_status ~ age + grade, family = binomial(link = "logit"), 
#>     data = data)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -0.9947  -0.5549  -0.4668  -0.3482   2.5082  
#> 
#> Coefficients:
#>              Estimate Std. Error z value Pr(>|z|)    
#> (Intercept) -2.503115   0.097180 -25.758  < 2e-16 ***
#> age         -0.009976   0.003181  -3.136  0.00171 ** 
#> gradeB       0.640533   0.054916  11.664  < 2e-16 ***
#> gradeC       1.019223   0.057183  17.824  < 2e-16 ***
#> gradeD       1.259926   0.063114  19.963  < 2e-16 ***
#> gradeE       1.410418   0.094931  14.857  < 2e-16 ***
#> gradeF       1.765472   0.161902  10.905  < 2e-16 ***
#> gradeG       2.276245   0.279516   8.144 3.84e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 20274  on 29090  degrees of freedom
#> Residual deviance: 19614  on 29083  degrees of freedom
#> AIC: 19630
#> 
#> Number of Fisher Scoring iterations: 5

Customize tbl_regression() output

# Regression table for m1: exponentiated coefficients (odds ratios),
# one global p-value per term, and model-level statistics appended as rows.
# NOTE(review): this assignment replaces the `tab` data frame built in the
# setup step, so earlier snippets cannot be re-run after it -- consider a
# distinct name.
tab <- m1 |>
  tbl_regression(exponentiate = TRUE) |>
  add_global_p() |>
  add_glance_table(include = c(nobs, logLik, AIC, BIC))

output

tab

Characteristic	OR¹	95% CI¹	p-value
age	0.99	0.98, 1.00	0.001
grade			<0.001
A	—	—
B	1.90	1.70, 2.11
C	2.77	2.48, 3.10
D	3.53	3.12, 3.99
E	4.10	3.40, 4.93
F	5.84	4.22, 7.98
G	9.74	5.55, 16.7
No. Obs.	29,091
Log-likelihood	-9,807
AIC	19,630
BIC	19,697
¹ OR = Odds Ratio, CI = Confidence Interval

creating neat tables

creating neat tables

setup

output 1

summaries

Customize tbl_summary()

output 2

Notes

Customize tbl_summary()

output 3

renaming

output 4

few notes

Add-on functions in {gtsummary}

Update tbl_summary() with add_*()

output

some notes

Update tbl_summary() with add_*()

output

Update tbl_summary() with add_*()

Update with bold_*()/italicize_*()

output

some notes

Update tbl_summary() with modify_*()

output

MODELING

default output

Customize tbl_regression() output

output

creating neat tables

creating neat tables

setup

output 1

summaries

Customize tbl_summary()

output 2

Notes

Customize tbl_summary()

output 3

renaming

output 4

few notes

Add-on functions in {gtsummary}

Update tbl_summary() with add_*()

output

some notes

Update tbl_summary() with add_*()

output

Update tbl_summary() with add_*()

Update with bold_*()/italicize_*()

output

some notes

Update tbl_summary() with modify_*()

output

MODELING

default output

Customize tbl_regression() output

output

Update with bold_*()/italicize_*()