creating neat tables

setup

library(gtsummary)
library(tidyverse)

# Load the cleaned loan data, then store the outcome column as a factor
data <- read.csv("loan_data_cleaned.csv")
data <- mutate(data, loan_status = as.factor(loan_status))

# Keep only the columns that are summarised in the tables below
tab <- select(
  data,
  loan_status,
  loan_amnt,
  age,
  home_ownership
)

output 1

# Default summary table: every column of `tab`, no stratification
tbl_summary(tab)
Characteristic N = 29,091¹
loan_status
    0 25,864 (89%)
    1 3,227 (11%)
loan_amnt 8,000 (5,000, 12,250)
age 26 (23, 30)
home_ownership
    MORTGAGE 12,001 (41%)
    OTHER 97 (0.3%)
    OWN 2,301 (7.9%)
    RENT 14,692 (51%)
1 n (%); Median (IQR)

summaries

  • Four types of summaries: continuous, continuous2, categorical, and dichotomous

  • Statistics are median (IQR) for continuous, n (%) for categorical/dichotomous

  • Variables coded 0/1, TRUE/FALSE, Yes/No treated as dichotomous

  • Lists NA values under “Unknown”

  • Label attributes are printed automatically

Customize tbl_summary()

# One statistics column per level of loan_status
tb <- tab |>
  tbl_summary(by = loan_status)

output 2

tb
Characteristic 0, N = 25,864¹ 1, N = 3,227¹
loan_amnt 8,000 (5,000, 12,250) 8,000 (4,850, 12,000)
age 26 (23, 30) 26 (23, 30)
home_ownership
    MORTGAGE 10,820 (42%) 1,181 (37%)
    OTHER 80 (0.3%) 17 (0.5%)
    OWN 2,049 (7.9%) 252 (7.8%)
    RENT 12,915 (50%) 1,777 (55%)
1 Median (IQR); n (%)

Notes

  • by: specify a column variable for cross-tabulation

  • type: specify the summary type

  • statistic: customize the reported statistics

Customize tbl_summary()


# Stratified summary with custom statistics:
#   - age uses the multi-row "continuous2" type (mean/SD row and min/max row)
#   - home_ownership reports n / N (%) instead of the default n (%)
tb2 <- tab |>
  tbl_summary(
    by = loan_status,
    type = list(age ~ "continuous2"),
    statistic = list(
      age ~ c("{mean} ({sd})", "{min}, {max}"),
      home_ownership ~ "{n} / {N} ({p}%)"
    )
  )

output 3

tb2
Characteristic 0, N = 25,864¹ 1, N = 3,227¹
loan_amnt 8,000 (5,000, 12,250) 8,000 (4,850, 12,000)
age
    Mean (SD) 28 (6) 27 (6)
    Range 20, 94 20, 70
home_ownership
    MORTGAGE 10,820 / 25,864 (42%) 1,181 / 3,227 (37%)
    OTHER 80 / 25,864 (0.3%) 17 / 3,227 (0.5%)
    OWN 2,049 / 25,864 (7.9%) 252 / 3,227 (7.8%)
    RENT 12,915 / 25,864 (50%) 1,777 / 3,227 (55%)
1 Median (IQR); n / N (%)

renaming


# Same table as tb2, with home_ownership displayed under a custom label
tb3 <- tab |>
  tbl_summary(
    by = loan_status,
    type = list(age ~ "continuous2"),
    statistic = list(
      age ~ c("{mean} ({sd})", "{min}, {max}"),
      home_ownership ~ "{n} / {N} ({p}%)"
    ),
    label = list(home_ownership ~ "living status")
  )

output 4

tb3
Characteristic 0, N = 25,864¹ 1, N = 3,227¹
loan_amnt 8,000 (5,000, 12,250) 8,000 (4,850, 12,000)
age
    Mean (SD) 28 (6) 27 (6)
    Range 20, 94 20, 70
living status
    MORTGAGE 10,820 / 25,864 (42%) 1,181 / 3,227 (37%)
    OTHER 80 / 25,864 (0.3%) 17 / 3,227 (0.5%)
    OWN 2,049 / 25,864 (7.9%) 252 / 3,227 (7.8%)
    RENT 12,915 / 25,864 (50%) 1,777 / 3,227 (55%)
1 Median (IQR); n / N (%)

few notes

Named lists are OK too! label = list(age = "Patient Age")

Add-on functions in {gtsummary}

tbl_summary() objects can also be updated using related functions.

  • add_*() add additional columns of statistics or information, e.g. p-values, q-values, overall statistics, treatment differences, N obs., and more

  • modify_*() modify table headers, spanning headers, footnotes, and more

  • bold_*()/italicize_*() style labels, variable levels, significant p-values

Update tbl_summary() with add_*()


# Stratified summary with a p-value column and an FDR-adjusted q-value column
tb5 <- add_q(
  add_p(
    tbl_summary(tab, by = loan_status)
  ),
  method = "fdr"
)

output

tb5
Characteristic 0, N = 25,864¹ 1, N = 3,227¹ p-value² q-value³
loan_amnt 8,000 (5,000, 12,250) 8,000 (4,850, 12,000) 0.001 0.001
age 26 (23, 30) 26 (23, 30) <0.001 <0.001
home_ownership <0.001 <0.001
    MORTGAGE 10,820 (42%) 1,181 (37%)
    OTHER 80 (0.3%) 17 (0.5%)
    OWN 2,049 (7.9%) 252 (7.8%)
    RENT 12,915 (50%) 1,777 (55%)
1 Median (IQR); n (%)
2 Wilcoxon rank sum test; Pearson's Chi-squared test
3 False discovery rate correction for multiple testing

some notes

  • add_p(): adds a column of p-values

  • add_q(): adds a column of p-values adjusted for multiple comparisons through a call to p.adjust()

Update tbl_summary() with add_*()

# Stratified summary (missing-value rows suppressed) plus an "Overall" column
mt <- add_overall(
  tbl_summary(tab, by = loan_status, missing = "no")
)

output

mt
Characteristic Overall, N = 29,091¹ 0, N = 25,864¹ 1, N = 3,227¹
loan_amnt 8,000 (5,000, 12,250) 8,000 (5,000, 12,250) 8,000 (4,850, 12,000)
age 26 (23, 30) 26 (23, 30) 26 (23, 30)
home_ownership
    MORTGAGE 12,001 (41%) 10,820 (42%) 1,181 (37%)
    OTHER 97 (0.3%) 80 (0.3%) 17 (0.5%)
    OWN 2,301 (7.9%) 2,049 (7.9%) 252 (7.8%)
    RENT 14,692 (51%) 12,915 (50%) 1,777 (55%)
1 Median (IQR); n (%)
  • add_overall(): adds a column of overall statistics

Update tbl_summary() with add_*()

# Stratified summary with an "Overall" column and an N column
add_n(
  add_overall(
    tbl_summary(tab, by = loan_status, missing = "no")
  )
)
Characteristic N Overall, N = 29,091¹ 0, N = 25,864¹ 1, N = 3,227¹
loan_amnt 29,091 8,000 (5,000, 12,250) 8,000 (5,000, 12,250) 8,000 (4,850, 12,000)
age 29,091 26 (23, 30) 26 (23, 30) 26 (23, 30)
home_ownership 29,091
    MORTGAGE 12,001 (41%) 10,820 (42%) 1,181 (37%)
    OTHER 97 (0.3%) 80 (0.3%) 17 (0.5%)
    OWN 2,301 (7.9%) 2,049 (7.9%) 252 (7.8%)
    RENT 14,692 (51%) 12,915 (50%) 1,777 (55%)
1 Median (IQR); n (%)

Update with bold_*()/italicize_*()

# Stratified summary with p-values and text styling:
# bold variable labels, italic level rows, bold significant p-values.
mis <- tab |>
  tbl_summary(by = loan_status) |>
  add_p() |>
  bold_labels() |>
  italicize_levels() |>
  # Bold only p-values below the conventional 0.05 level; the previous
  # threshold of 0.8 would bold nearly any p-value regardless of significance.
  bold_p(t = 0.05)

output

mis
Characteristic 0, N = 25,864¹ 1, N = 3,227¹ p-value²
loan_amnt 8,000 (5,000, 12,250) 8,000 (4,850, 12,000) 0.001
age 26 (23, 30) 26 (23, 30) <0.001
home_ownership <0.001
    MORTGAGE 10,820 (42%) 1,181 (37%)
    OTHER 80 (0.3%) 17 (0.5%)
    OWN 2,049 (7.9%) 252 (7.8%)
    RENT 12,915 (50%) 1,777 (55%)
1 Median (IQR); n (%)
2 Wilcoxon rank sum test; Pearson's Chi-squared test

some notes

  • bold_labels(): bold the variable labels
  • italicize_levels(): italicize the variable levels
  • bold_p(): bold p-values according to a specified threshold

Update tbl_summary() with modify_*()

# Stratified summary with custom column headers, a spanning header and a
# custom footnote.
# stat_1 / stat_2 follow the level order of the `by` variable, so stat_1 is
# loan_status == 0 and stat_2 is loan_status == 1. The labels were previously
# swapped. NOTE(review): assumes loan_status 1 = default (the 11% minority
# class) - confirm against the data dictionary.
tbl <-
  tab |>
  tbl_summary(
    by = loan_status,
    missing = "no"
  ) |>
  modify_header(
    stat_1 ~ "**not defaulted**",
    stat_2 ~ "**defaulted**"
  ) |>
  modify_spanning_header(
    all_stat_cols() ~ "**loan status**"
  ) |>
  modify_footnote(
    all_stat_cols() ~
      paste(
        "median (IQR) for continuous;",
        "n (%) for categorical"
      )
  )

output

tbl
Characteristic loan status
not defaulted¹ defaulted¹
loan_amnt 8,000 (5,000, 12,250) 8,000 (4,850, 12,000)
age 26 (23, 30) 26 (23, 30)
home_ownership
    MORTGAGE 10,820 (42%) 1,181 (37%)
    OTHER 80 (0.3%) 17 (0.5%)
    OWN 2,049 (7.9%) 252 (7.8%)
    RENT 12,915 (50%) 1,777 (55%)
1 median (IQR) for continuous; n (%) for categorical

MODELING

# Logistic regression of loan_status on age and grade
m1 <- glm(
  formula = loan_status ~ age + grade,
  family = binomial(link = "logit"),
  data = data
)

default output

summary(m1)
#> 
#> Call:
#> glm(formula = loan_status ~ age + grade, family = binomial(link = "logit"), 
#>     data = data)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -0.9947  -0.5549  -0.4668  -0.3482   2.5082  
#> 
#> Coefficients:
#>              Estimate Std. Error z value Pr(>|z|)    
#> (Intercept) -2.503115   0.097180 -25.758  < 2e-16 ***
#> age         -0.009976   0.003181  -3.136  0.00171 ** 
#> gradeB       0.640533   0.054916  11.664  < 2e-16 ***
#> gradeC       1.019223   0.057183  17.824  < 2e-16 ***
#> gradeD       1.259926   0.063114  19.963  < 2e-16 ***
#> gradeE       1.410418   0.094931  14.857  < 2e-16 ***
#> gradeF       1.765472   0.161902  10.905  < 2e-16 ***
#> gradeG       2.276245   0.279516   8.144 3.84e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 20274  on 29090  degrees of freedom
#> Residual deviance: 19614  on 29083  degrees of freedom
#> AIC: 19630
#> 
#> Number of Fisher Scoring iterations: 5

Customize tbl_regression() output

# Regression table for m1 with exponentiated coefficients, global p-values
# per term, and model-level statistics appended below the coefficients.
# NOTE(review): this reassigns `tab`, which earlier held the selected data
# frame; re-running the summary-table snippets after this point requires
# re-creating that data frame first.
tab <- m1 |>
  tbl_regression(exponentiate = TRUE) |>
  add_global_p() |>
  add_glance_table(
    include = c(nobs, logLik, AIC, BIC)
  )

output

tab
Characteristic OR¹ 95% CI¹ p-value
age 0.99 0.98, 1.00 0.001
grade <0.001
    A
    B 1.90 1.70, 2.11
    C 2.77 2.48, 3.10
    D 3.53 3.12, 3.99
    E 4.10 3.40, 4.93
    F 5.84 4.22, 7.98
    G 9.74 5.55, 16.7
No. Obs. 29,091
Log-likelihood -9,807
AIC 19,630
BIC 19,697
1 OR = Odds Ratio, CI = Confidence Interval