setup
library(gtsummary)
library(tidyverse)
# Read the cleaned loan data and store loan_status as a factor.
data <- read.csv("loan_data_cleaned.csv") |>
  mutate(loan_status = as.factor(loan_status))

# Keep only the four columns used in the summary tables.
tab <- select(data, loan_status, loan_amnt, age, home_ownership)
output 1
Characteristic |
N = 29,091 |
loan_status |
|
0 |
25,864 (89%) |
1 |
3,227 (11%) |
loan_amnt |
8,000 (5,000, 12,250) |
age |
26 (23, 30) |
home_ownership |
|
MORTGAGE |
12,001 (41%) |
OTHER |
97 (0.3%) |
OWN |
2,301 (7.9%) |
RENT |
14,692 (51%) |
summaries
Four types of summaries: continuous, continuous2, categorical, and dichotomous
Statistics are median (IQR) for continuous, n (%) for categorical/dichotomous
Variables coded 0/1, TRUE/FALSE, Yes/No are treated as dichotomous
Lists NA values under “Unknown”
Label attributes are printed automatically
Customize tbl_summary()
# Summarise every column of `tab`, with one statistics column per
# loan_status level.
tb <- tab |>
  tbl_summary(by = loan_status)
output 2
Characteristic |
0, N = 25,864 |
1, N = 3,227 |
loan_amnt |
8,000 (5,000, 12,250) |
8,000 (4,850, 12,000) |
age |
26 (23, 30) |
26 (23, 30) |
home_ownership |
|
|
MORTGAGE |
10,820 (42%) |
1,181 (37%) |
OTHER |
80 (0.3%) |
17 (0.5%) |
OWN |
2,049 (7.9%) |
252 (7.8%) |
RENT |
12,915 (50%) |
1,777 (55%) |
Notes
by: specify a column variable for cross-tabulation
type: specify the summary type
statistic: customize the reported statistics
Customize tbl_summary()
# Split summary with customised statistics:
# - age is summarised as "continuous2", i.e. one row per statistic string
#   (mean (SD) on one row, min/max on the next);
# - home_ownership cells show count / group size (percent).
tb2 <- tab |>
  tbl_summary(
    by = loan_status,
    type = age ~ "continuous2",
    statistic = list(
      age ~ c("{mean} ({sd})", "{min}, {max}"),
      home_ownership ~ "{n} / {N} ({p}%)"
    )
  )
output 3
Characteristic |
0, N = 25,864 |
1, N = 3,227 |
loan_amnt |
8,000 (5,000, 12,250) |
8,000 (4,850, 12,000) |
age |
|
|
Mean (SD) |
28 (6) |
27 (6) |
Range |
20, 94 |
20, 70 |
home_ownership |
|
|
MORTGAGE |
10,820 / 25,864 (42%) |
1,181 / 3,227 (37%) |
OTHER |
80 / 25,864 (0.3%) |
17 / 3,227 (0.5%) |
OWN |
2,049 / 25,864 (7.9%) |
252 / 3,227 (7.8%) |
RENT |
12,915 / 25,864 (50%) |
1,777 / 3,227 (55%) |
renaming
# Same table as tb2, with the home_ownership row relabelled
# "living status" via the `label` argument.
tb3 <- tab |>
  tbl_summary(
    by = loan_status,
    type = age ~ "continuous2",
    statistic = list(
      age ~ c("{mean} ({sd})", "{min}, {max}"),
      home_ownership ~ "{n} / {N} ({p}%)"
    ),
    label = home_ownership ~ "living status"
  )
output 4
Characteristic |
0, N = 25,864 |
1, N = 3,227 |
loan_amnt |
8,000 (5,000, 12,250) |
8,000 (4,850, 12,000) |
age |
|
|
Mean (SD) |
28 (6) |
27 (6) |
Range |
20, 94 |
20, 70 |
living status |
|
|
MORTGAGE |
10,820 / 25,864 (42%) |
1,181 / 3,227 (37%) |
OTHER |
80 / 25,864 (0.3%) |
17 / 3,227 (0.5%) |
OWN |
2,049 / 25,864 (7.9%) |
252 / 3,227 (7.8%) |
RENT |
12,915 / 25,864 (50%) |
1,777 / 3,227 (55%) |
few notes
Named lists are OK too!
label = list(age = "Patient Age")
Add-on functions in {gtsummary}
tbl_summary() objects can also be updated using related functions.
add_*(): add additional columns of statistics or information, e.g. p-values, q-values, overall statistics, treatment differences, N obs., and more
modify_*(): modify table headers, spanning headers, footnotes, and more
bold_*()/italicize_*(): style labels, variable levels, significant p-values
Update tbl_summary() with add_*()
# Split summary with a p-value column, followed by a q-value column
# computed with the "fdr" adjustment method.
tb5 <- tbl_summary(tab, by = loan_status) |>
  add_p() |>
  add_q(method = "fdr")
output
Characteristic |
0, N = 25,864 |
1, N = 3,227 |
p-value |
q-value |
loan_amnt |
8,000 (5,000, 12,250) |
8,000 (4,850, 12,000) |
0.001 |
0.001 |
age |
26 (23, 30) |
26 (23, 30) |
<0.001 |
<0.001 |
home_ownership |
|
|
<0.001 |
<0.001 |
MORTGAGE |
10,820 (42%) |
1,181 (37%) |
|
|
OTHER |
80 (0.3%) |
17 (0.5%) |
|
|
OWN |
2,049 (7.9%) |
252 (7.8%) |
|
|
RENT |
12,915 (50%) |
1,777 (55%) |
|
|
Update tbl_summary() with add_*()
# Split summary without a missing-value row (missing = "no"), plus an
# "Overall" column covering all observations.
mt <- tbl_summary(tab, by = loan_status, missing = "no") |>
  add_overall()
output
Characteristic |
Overall, N = 29,091 |
0, N = 25,864 |
1, N = 3,227 |
loan_amnt |
8,000 (5,000, 12,250) |
8,000 (5,000, 12,250) |
8,000 (4,850, 12,000) |
age |
26 (23, 30) |
26 (23, 30) |
26 (23, 30) |
home_ownership |
|
|
|
MORTGAGE |
12,001 (41%) |
10,820 (42%) |
1,181 (37%) |
OTHER |
97 (0.3%) |
80 (0.3%) |
17 (0.5%) |
OWN |
2,301 (7.9%) |
2,049 (7.9%) |
252 (7.8%) |
RENT |
14,692 (51%) |
12,915 (50%) |
1,777 (55%) |
add_overall(): adds a column of overall statistics
Update tbl_summary() with add_*()
# Split summary with an "Overall" column and an N column giving the number
# of observations per variable. The result is not assigned, so it prints.
tbl_summary(tab, by = loan_status, missing = "no") |>
  add_overall() |>
  add_n()
Characteristic |
N |
Overall, N = 29,091 |
0, N = 25,864 |
1, N = 3,227 |
loan_amnt |
29,091 |
8,000 (5,000, 12,250) |
8,000 (5,000, 12,250) |
8,000 (4,850, 12,000) |
age |
29,091 |
26 (23, 30) |
26 (23, 30) |
26 (23, 30) |
home_ownership |
29,091 |
|
|
|
MORTGAGE |
|
12,001 (41%) |
10,820 (42%) |
1,181 (37%) |
OTHER |
|
97 (0.3%) |
80 (0.3%) |
17 (0.5%) |
OWN |
|
2,301 (7.9%) |
2,049 (7.9%) |
252 (7.8%) |
RENT |
|
14,692 (51%) |
12,915 (50%) |
1,777 (55%) |
Update with bold_*()/italicize_*()
# Split summary with p-values and text styling: bold variable labels,
# italic level names, and bold p-values under the threshold `t`.
# NOTE(review): t = 0.8 is a very lenient threshold, so every p-value in
# this table ends up bold — confirm this is intended for the demo.
mis <- tbl_summary(tab, by = loan_status) |>
  add_p() |>
  bold_labels() |>
  italicize_levels() |>
  bold_p(t = 0.8)
output
Characteristic |
0, N = 25,864 |
1, N = 3,227 |
p-value |
loan_amnt |
8,000 (5,000, 12,250) |
8,000 (4,850, 12,000) |
0.001 |
age |
26 (23, 30) |
26 (23, 30) |
<0.001 |
home_ownership |
|
|
<0.001 |
MORTGAGE |
10,820 (42%) |
1,181 (37%) |
|
OTHER |
80 (0.3%) |
17 (0.5%) |
|
OWN |
2,049 (7.9%) |
252 (7.8%) |
|
RENT |
12,915 (50%) |
1,777 (55%) |
|
some notes
bold_labels(): bold the variable labels
italicize_levels(): italicize the variable levels
bold_p(): bold p-values according to a specified threshold
Update tbl_summary() with modify_*()
# Relabel the two statistic columns, add a spanning header above them, and
# replace their footnote.
# tbl_summary() names the `by` columns stat_1, stat_2 in factor-level order,
# so stat_1 is loan_status == 0 and stat_2 is loan_status == 1.
# NOTE(review): assumes loan_status 1 = defaulted and 0 = not defaulted; the
# headers were previously assigned the other way round — confirm against the
# data dictionary.
tbl <-
  tab |>
  tbl_summary(
    by = loan_status,
    missing = "no"
  ) |>
  modify_header(
    stat_1 ~ "**not defaulted**",
    stat_2 ~ "**defaulted**"
  ) |>
  modify_spanning_header(
    all_stat_cols() ~ "**loan status**"
  ) |>
  modify_footnote(
    all_stat_cols() ~
      paste(
        "median (IQR) for continuous;",
        "n (%) for categorical"
      )
  )
output
Characteristic |
loan status
|
not defaulted |
defaulted |
loan_amnt |
8,000 (5,000, 12,250) |
8,000 (4,850, 12,000) |
age |
26 (23, 30) |
26 (23, 30) |
home_ownership |
|
|
MORTGAGE |
10,820 (42%) |
1,181 (37%) |
OTHER |
80 (0.3%) |
17 (0.5%) |
OWN |
2,049 (7.9%) |
252 (7.8%) |
RENT |
12,915 (50%) |
1,777 (55%) |
MODELING
# Logistic regression (binomial family, logit link) of loan_status on
# age and grade, fitted on the full `data` frame.
m1 <- glm(
  formula = loan_status ~ age + grade,
  family = binomial(link = "logit"),
  data = data
)
default output
# Print the default glm summary for m1 (coefficient table, deviances, AIC).
summary(m1)
#>
#> Call:
#> glm(formula = loan_status ~ age + grade, family = binomial(link = "logit"),
#> data = data)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -0.9947 -0.5549 -0.4668 -0.3482 2.5082
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -2.503115 0.097180 -25.758 < 2e-16 ***
#> age -0.009976 0.003181 -3.136 0.00171 **
#> gradeB 0.640533 0.054916 11.664 < 2e-16 ***
#> gradeC 1.019223 0.057183 17.824 < 2e-16 ***
#> gradeD 1.259926 0.063114 19.963 < 2e-16 ***
#> gradeE 1.410418 0.094931 14.857 < 2e-16 ***
#> gradeF 1.765472 0.161902 10.905 < 2e-16 ***
#> gradeG 2.276245 0.279516 8.144 3.84e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 20274 on 29090 degrees of freedom
#> Residual deviance: 19614 on 29083 degrees of freedom
#> AIC: 19630
#>
#> Number of Fisher Scoring iterations: 5
Customize tbl_regression() output
# Formatted regression table for m1:
# - exponentiate = TRUE reports exponentiated coefficients (odds ratios);
# - add_global_p() adds one overall p-value per variable;
# - add_glance_table() appends the listed model-level statistics as rows.
# NOTE(review): this assignment reuses the name `tab`, replacing the data
# frame that the tbl_summary() examples read from — consider a distinct
# name if those examples may be re-run afterwards.
tab <- m1 |>
  tbl_regression(exponentiate = TRUE) |>
  add_global_p() |>
  add_glance_table(include = c(nobs, logLik, AIC, BIC))
output
Characteristic |
OR |
95% CI |
p-value |
age |
0.99 |
0.98, 1.00 |
0.001 |
grade |
|
|
<0.001 |
A |
— |
— |
|
B |
1.90 |
1.70, 2.11 |
|
C |
2.77 |
2.48, 3.10 |
|
D |
3.53 |
3.12, 3.99 |
|
E |
4.10 |
3.40, 4.93 |
|
F |
5.84 |
4.22, 7.98 |
|
G |
9.74 |
5.55, 16.7 |
|
No. Obs. |
29,091 |
|
|
Log-likelihood |
-9,807 |
|
|
AIC |
19,630 |
|
|
BIC |
19,697 |
|
|