Pulling Underlying Equation Corresponding to a Statistical Model
using Package Equatiomatic
lr <- glm(sex ~ species * bill_length_mm, data = penguins,family = binomial(link = "logit"))
extract_eq(lr, wrap = TRUE)
\[
\begin{aligned}
\log\left[ \frac { P( \operatorname{sex} = \operatorname{male} ) }{ 1 -
P( \operatorname{sex} = \operatorname{male} ) } \right] &= \alpha +
\beta_{1}(\operatorname{species}_{\operatorname{Chinstrap}}) +
\beta_{2}(\operatorname{species}_{\operatorname{Gentoo}}) +
\beta_{3}(\operatorname{bill\_length\_mm})\ + \\
&\quad \beta_{4}(\operatorname{species}_{\operatorname{Chinstrap}}
\times \operatorname{bill\_length\_mm}) +
\beta_{5}(\operatorname{species}_{\operatorname{Gentoo}} \times
\operatorname{bill\_length\_mm})
\end{aligned}
\]
extract_eq(lr, wrap = TRUE, show_distribution = TRUE) # show the how the data are assumed to be distributed.
\[
\begin{aligned}
\operatorname{sex} &\sim
Bernoulli\left(\operatorname{prob}_{\operatorname{sex} =
\operatorname{male}}= \hat{P}\right) \\
\log\left[ \frac { \hat{P} }{ 1 - \hat{P} } \right]
&= \alpha +
\beta_{1}(\operatorname{species}_{\operatorname{Chinstrap}}) +
\beta_{2}(\operatorname{species}_{\operatorname{Gentoo}}) +
\beta_{3}(\operatorname{bill\_length\_mm})\ + \\
&\quad \beta_{4}(\operatorname{species}_{\operatorname{Chinstrap}}
\times \operatorname{bill\_length\_mm}) +
\beta_{5}(\operatorname{species}_{\operatorname{Gentoo}} \times
\operatorname{bill\_length\_mm})
\end{aligned}
\]
## GLMM with individual-level variability (accounting for overdispersion)
## For this data set the model is the same as one allowing for a period:herd
## interaction, which the plot indicates could be needed.
cbpp$obs <- 1:nrow(cbpp)
gm2 <- glmer(cbind(incidence, size - incidence) ~ period +(1 | herd) + (1|obs),family = binomial, data = cbpp)
extract_eq(gm2,wrap=TRUE)
\[
\begin{aligned}
\operatorname{incidence}_{i} &\sim \operatorname{Binomial}(n = 1,
\operatorname{prob}_{\operatorname{incidence} = 1} = \widehat{P}) \\
\log\left[\frac{\hat{P}}{1 - \hat{P}} \right]
&=\alpha_{j[i],k[i]} \\
\alpha_{j} &\sim N \left(\gamma_{0}^{\alpha} +
\gamma_{1}^{\alpha}(\operatorname{period}_{\operatorname{2}}) +
\gamma_{2}^{\alpha}(\operatorname{period}_{\operatorname{3}}) +
\gamma_{3}^{\alpha}(\operatorname{period}_{\operatorname{4}}),
\sigma^2_{\alpha_{j}} \right)
\text{, for obs j = 1,} \dots \text{,J} \\
\alpha_{k} &\sim N \left(\mu_{\alpha_{k}},
\sigma^2_{\alpha_{k}} \right)
\text{, for herd k = 1,} \dots \text{,K}
\end{aligned}
\]
Using gtsummary to Create Publication Ready Output Tables from
Fitted Models
-Start by creating a logistic regression model to predict tumor
response using the variables age and grade from the trial data set.
tr<-head(trial)
tr %>%
kbl(caption = "Simulated Trial Data") %>%
kable_classic(full_width = F, html_font = "Cambria")
Simulated Trial Data
|
trt
|
age
|
marker
|
stage
|
grade
|
response
|
death
|
ttdeath
|
|
Drug A
|
23
|
0.160
|
T1
|
II
|
0
|
0
|
24.00
|
|
Drug B
|
9
|
1.107
|
T2
|
I
|
1
|
0
|
24.00
|
|
Drug A
|
31
|
0.277
|
T1
|
II
|
0
|
0
|
24.00
|
|
Drug A
|
NA
|
2.067
|
T3
|
III
|
1
|
1
|
17.64
|
|
Drug A
|
51
|
2.767
|
T4
|
III
|
1
|
1
|
16.43
|
|
Drug B
|
39
|
0.613
|
T4
|
I
|
0
|
1
|
15.64
|
m1 <- glm(response ~ age + stage, trial, family = binomial)
tbl_regression(m1, exponentiate = TRUE)
| Characteristic |
OR |
95% CI |
p-value |
| Age |
1.02 |
1.00, 1.04 |
0.091 |
| T Stage |
|
|
|
| T1 |
— |
— |
|
| T2 |
0.58 |
0.24, 1.37 |
0.2 |
| T3 |
0.94 |
0.39, 2.28 |
0.9 |
| T4 |
0.79 |
0.33, 1.90 |
0.6 |
Adding False Dsicovery Rates
trial %>%
select(response, age, grade) %>%
tbl_uvregression(
method = glm,
y = response,
method.args = list(family = binomial),
exponentiate = TRUE,
pvalue_fun = ~style_pvalue(.x, digits = 2)
) %>%
add_global_p() %>% # add global p-value
add_nevent() %>% # add number of events of the outcome
add_q() %>% # adjusts global p-values for multiple testing
bold_p() %>% # bold p-values under a given threshold (default 0.05)
bold_p(t = 0.10, q = TRUE) %>% # now bold q-values under the threshold of 0.10
bold_labels()
## add_q: Adjusting p-values with
## `stats::p.adjust(x$table_body$p.value, method = "fdr")`
| Characteristic |
N |
Event N |
OR |
95% CI |
p-value |
q-value |
| Age |
183 |
58 |
1.02 |
1.00, 1.04 |
0.091 |
0.18 |
| Grade |
193 |
61 |
|
|
0.93 |
0.93 |
| I |
|
|
— |
— |
|
|
| II |
|
|
0.95 |
0.45, 2.00 |
|
|
| III |
|
|
1.10 |
0.52, 2.29 |
|
|
Change Summary Statistics Globally
By default it summarize continuous variables with: Number of
missing values, Mean (SD), 25th - 75th quantiles, and Minimum-Maximum
values with an ANOVA (t-test with equal variances) p-value.
For categorical variables the default is to show: Number of
missing values and count (column percent) with a chi-square
p-value.
This default can be modified using the tableby.control function.
Note that test=FALSE and total=FALSE results in the total column and
p-value column not being printed.
mycontrols <- tableby.control(test=FALSE, total=FALSE,
numeric.test="kwt", cat.test="chisq",
numeric.stats=c("N", "median", "q1q3"),
cat.stats=c("countpct"),
stats.labels=list(N='Count', median='Median', q1q3='Q1,Q3'))
tab2 <- tableby(arm ~ sex + age, data=mockstudy, control=mycontrols)
summary(tab2)
| Gender |
|
|
|
| Male |
277 (64.7%) |
411 (59.5%) |
228 (60.0%) |
| Female |
151 (35.3%) |
280 (40.5%) |
152 (40.0%) |
| Age, yrs |
|
|
|
| Count |
428 |
691 |
380 |
| Median |
61.000 |
61.000 |
61.000 |
| Q1,Q3 |
53.000, 68.000 |
52.000, 69.000 |
52.000, 68.000 |
Modifying Summary Statistics
- Summary statistics for any individual variable can also be modified,
but it must be done as secondary arguments to the test function.
- The function names must be strings that are functions already
written for tableby.
summary(tab.test, pfootnote=TRUE) # default summary table with footnote display option TRUE
| Age, yrs |
|
|
|
|
0.6391 |
| Mean (SD) |
59.673 (11.365) |
60.301 (11.632) |
59.763 (11.499) |
59.985 (11.519) |
|
| Range |
27.000 - 88.000 |
19.000 - 88.000 |
26.000 - 85.000 |
19.000 - 88.000 |
|
| Body Mass Index (kg/m^2) |
|
|
|
|
0.8922 |
| N-Miss |
9 |
20 |
4 |
33 |
|
| Mean (SD) |
27.290 (5.552) |
27.210 (5.173) |
27.106 (5.751) |
27.206 (5.432) |
|
| Range |
14.053 - 53.008 |
16.649 - 49.130 |
15.430 - 60.243 |
14.053 - 60.243 |
|
| ast |
|
|
|
|
|
| N-Miss |
69 |
141 |
56 |
266 |
|
| Mean (SD) |
37.292 (28.036) |
35.202 (26.659) |
35.670 (25.807) |
35.933 (26.843) |
|
| Range |
10.000 - 205.000 |
7.000 - 174.000 |
5.000 - 176.000 |
5.000 - 205.000 |
|
- Kruskal-Wallis rank sum test
- Linear Model ANOVA
tab.test1 <- tableby(arm ~ kwt(ast, "Nmiss2","median") + anova(age, "N","mean") +
notest(bmi, "Nmiss","median"), data=mockstudy)
summary(tab.test1)
| ast |
|
|
|
|
0.039 |
| N-Miss |
69 |
141 |
56 |
266 |
|
| Median |
29.000 |
25.500 |
27.000 |
27.000 |
|
| Age, yrs |
|
|
|
|
0.614 |
| N |
428 |
691 |
380 |
1499 |
|
| Mean |
59.673 |
60.301 |
59.763 |
59.985 |
|
| Body Mass Index (kg/m^2) |
|
|
|
|
|
| N-Miss |
9 |
20 |
4 |
33 |
|
| Median |
26.234 |
26.525 |
25.978 |
26.325 |
|
Categorical Tests (Chisq and Fisher’s)
The formal tests for categorical variables against the levels of
the by variable, chisq and FE, have options to simulate p-values. We
show how to turn on the simulations for these with 500 replicates for
the Fisher’s test (FE).
The chi-square test on 2x2 tables applies Yates’ continuity
correction by default,with and without the correction that is applied to
treatment arm by sex, if we use subset to ignore one of the three
treatment arms.
set.seed(100)
tab.catsim <- tableby(arm ~ sex + race, cat.test="fe", simulate.p.value=TRUE, B=500, data=mockstudy)
t1<-tests(tab.catsim)
t1 %>%
kbl(caption = "Fisher's Exact Test for Count Data with simulated p-value") %>%
kable_classic(full_width = F, html_font = "Cambria")
Fisher’s Exact Test for Count Data with simulated p-value
|
Group
|
Variable
|
p.value
|
Method
|
|
Treatment Arm
|
sex
|
0.2195609
|
Fisher’s Exact Test for Count Data with simulated p-value (based on 500
replicates)
|
|
Treatment Arm
|
race
|
0.3093812
|
Fisher’s Exact Test for Count Data with simulated p-value (based on 500
replicates)
|
cat.correct <- tableby(arm ~ sex + race, cat.test="chisq", subset = !grepl("^F", arm), data=mockstudy)
t2<-tests(cat.correct)
t2 %>%
kbl(caption = "Chi-Square Test with correction for Count Data with simulated p-value") %>%
kable_classic(full_width = F, html_font = "Cambria")
Chi-Square Test with correction for Count Data with simulated p-value
|
Group
|
Variable
|
p.value
|
Method
|
|
Treatment Arm
|
sex
|
0.1666280
|
Pearson’s Chi-squared test
|
|
Treatment Arm
|
race
|
0.8108543
|
Pearson’s Chi-squared test
|
cat.nocorrect <- tableby(arm ~ sex + race, cat.test="chisq", subset = !grepl("^F", arm),chisq.correct=FALSE, data=mockstudy)
t3<-tests(cat.nocorrect)
t3 %>%
kbl(caption = "Chi-Square Test without correction for Count Data with simulated p-value") %>%
kable_classic(full_width = F, html_font = "Cambria")
Chi-Square Test without correction for Count Data with simulated p-value
|
Group
|
Variable
|
p.value
|
Method
|
|
Treatment Arm
|
sex
|
0.1666280
|
Pearson’s Chi-squared test
|
|
Treatment Arm
|
race
|
0.8108543
|
Pearson’s Chi-squared test
|
Summarize a Survival Variable
- Information presented by the survfit() function,can be seen with
tableby.
- The default is to show the median survival (time at which the
probability of survival = 50%).
- It is also possible to obtain summaries of the % survival at certain
time points (say the probability of surviving 1-year).
survfit(Surv(fu.time, fu.stat)~sex, data=mockstudy) # first look at info from survfit func
## Call: survfit(formula = Surv(fu.time, fu.stat) ~ sex, data = mockstudy)
##
## n events median 0.95LCL 0.95UCL
## sex=Male 916 829 550 515 590
## sex=Female 583 527 543 511 575
survdiff(Surv(fu.time, fu.stat)~sex, data=mockstudy)
## Call:
## survdiff(formula = Surv(fu.time, fu.stat) ~ sex, data = mockstudy)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=Male 916 829 830 0.000370 0.000956
## sex=Female 583 527 526 0.000583 0.000956
##
## Chisq= 0 on 1 degrees of freedom, p= 1
# now apply tableby
summary(tableby(sex ~ Surv(fu.time, fu.stat), data=mockstudy))
| Surv(fu.time, fu.stat) |
|
|
|
0.975 |
| Events |
829 |
527 |
1356 |
|
| Median Survival |
550.000 |
543.000 |
546.000 |
|
summary(survfit(Surv(fu.time/365.25, fu.stat)~sex, data=mockstudy), times=1:5) # using survfit
## Call: survfit(formula = Surv(fu.time/365.25, fu.stat) ~ sex, data = mockstudy)
##
## sex=Male
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 626 286 0.6870 0.0153 0.6576 0.7177
## 2 309 311 0.3437 0.0158 0.3142 0.3761
## 3 152 151 0.1748 0.0127 0.1516 0.2015
## 4 57 61 0.0941 0.0104 0.0759 0.1168
## 5 24 16 0.0628 0.0095 0.0467 0.0844
##
## sex=Female
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 380 202 0.6531 0.0197 0.6155 0.693
## 2 190 189 0.3277 0.0195 0.2917 0.368
## 3 95 90 0.1701 0.0157 0.1420 0.204
## 4 51 32 0.1093 0.0133 0.0861 0.139
## 5 18 12 0.0745 0.0126 0.0534 0.104
summary(tableby(sex ~ Surv(fu.time/365.25, fu.stat), data=mockstudy, times=1:5, surv.stats=c("NeventsSurv","NriskSurv"))) # using tableby
| Surv(fu.time/365.25, fu.stat) |
|
|
|
0.975 |
| time = 1 |
286 (68.7) |
202 (65.3) |
488 (67.4) |
|
| time = 2 |
597 (34.4) |
391 (32.8) |
988 (33.7) |
|
| time = 3 |
748 (17.5) |
481 (17.0) |
1229 (17.3) |
|
| time = 4 |
809 (9.4) |
513 (10.9) |
1322 (10.1) |
|
| time = 5 |
825 (6.3) |
525 (7.4) |
1350 (6.8) |
|
| time = 1 |
626 (68.7) |
380 (65.3) |
1006 (67.4) |
|
| time = 2 |
309 (34.4) |
190 (32.8) |
499 (33.7) |
|
| time = 3 |
152 (17.5) |
95 (17.0) |
247 (17.3) |
|
| time = 4 |
57 (9.4) |
51 (10.9) |
108 (10.1) |
|
| time = 5 |
24 (6.3) |
18 (7.4) |
42 (6.8) |
|
Create Combinations of Variables
## create a variable combining the levels of mdquality.s and sex
with(mockstudy, table(interaction(mdquality.s,sex)))
0.Male 1.Male 0.Female 1.Female 77 686 47 437
summary(tableby(arm ~ interaction(mdquality.s,sex), data=mockstudy))
| interaction(mdquality.s, sex) |
|
|
|
|
0.493 |
| N-Miss |
55 |
156 |
41 |
252 |
|
| 0.Male |
29 (7.8%) |
31 (5.8%) |
17 (5.0%) |
77 (6.2%) |
|
| 1.Male |
214 (57.4%) |
285 (53.3%) |
187 (55.2%) |
686 (55.0%) |
|
| 0.Female |
12 (3.2%) |
21 (3.9%) |
14 (4.1%) |
47 (3.8%) |
|
| 1.Female |
118 (31.6%) |
198 (37.0%) |
121 (35.7%) |
437 (35.0%) |
|
## create a new grouping variable with combined levels of arm and sex
summary(tableby(interaction(mdquality.s, sex) ~ age + bmi, data=mockstudy, subset=arm=="F: FOLFOX"))
| Age, yrs |
|
|
|
|
|
0.190 |
| Mean (SD) |
63.065 (11.702) |
60.653 (11.833) |
60.810 (10.103) |
58.924 (11.366) |
60.159 (11.612) |
|
| Range |
41.000 - 82.000 |
19.000 - 88.000 |
42.000 - 81.000 |
29.000 - 83.000 |
19.000 - 88.000 |
|
| Body Mass Index (kg/m^2) |
|
|
|
|
|
0.894 |
| N-Miss |
0 |
6 |
1 |
5 |
12 |
|
| Mean (SD) |
26.633 (5.094) |
27.387 (4.704) |
27.359 (4.899) |
27.294 (5.671) |
27.307 (5.100) |
|
| Range |
20.177 - 41.766 |
17.927 - 47.458 |
19.801 - 39.369 |
16.799 - 44.841 |
16.799 - 47.458 |
|
Subsetting Tables
mytab <- tableby(arm ~ sex + alk.phos + age, data=mockstudy)
mytab2 <- mytab[c('age','sex','alk.phos')] #Change the ordering of the variables
summary(mytab2)
| Age, yrs |
|
|
|
|
0.614 |
| Mean (SD) |
59.673 (11.365) |
60.301 (11.632) |
59.763 (11.499) |
59.985 (11.519) |
|
| Range |
27.000 - 88.000 |
19.000 - 88.000 |
26.000 - 85.000 |
19.000 - 88.000 |
|
| Gender |
|
|
|
|
0.190 |
| Male |
277 (64.7%) |
411 (59.5%) |
228 (60.0%) |
916 (61.1%) |
|
| Female |
151 (35.3%) |
280 (40.5%) |
152 (40.0%) |
583 (38.9%) |
|
| alk.phos |
|
|
|
|
0.226 |
| N-Miss |
69 |
141 |
56 |
266 |
|
| Mean (SD) |
175.577 (128.608) |
161.984 (121.978) |
173.506 (138.564) |
168.969 (128.492) |
|
| Range |
11.000 - 858.000 |
10.000 - 1014.000 |
7.000 - 982.000 |
7.000 - 1014.000 |
|
summary(mytab[c(3,1)], digits = 3) # delete a variable
| Age, yrs |
|
|
|
|
0.614 |
| Mean (SD) |
59.673 (11.365) |
60.301 (11.632) |
59.763 (11.499) |
59.985 (11.519) |
|
| Range |
27.000 - 88.000 |
19.000 - 88.000 |
26.000 - 85.000 |
19.000 - 88.000 |
|
| Gender |
|
|
|
|
0.190 |
| Male |
277 (64.7%) |
411 (59.5%) |
228 (60.0%) |
916 (61.1%) |
|
| Female |
151 (35.3%) |
280 (40.5%) |
152 (40.0%) |
583 (38.9%) |
|
summary(mytab[mytab < 0.5]) # filter p-value<0.5
| Gender |
|
|
|
|
0.190 |
| Male |
277 (64.7%) |
411 (59.5%) |
228 (60.0%) |
916 (61.1%) |
|
| Female |
151 (35.3%) |
280 (40.5%) |
152 (40.0%) |
583 (38.9%) |
|
| alk.phos |
|
|
|
|
0.226 |
| N-Miss |
69 |
141 |
56 |
266 |
|
| Mean (SD) |
175.577 (128.608) |
161.984 (121.978) |
173.506 (138.564) |
168.969 (128.492) |
|
| Range |
11.000 - 858.000 |
10.000 - 1014.000 |
7.000 - 982.000 |
7.000 - 1014.000 |
|
summary(sort(mytab, decreasing = TRUE)) # sort p-value
| Age, yrs |
|
|
|
|
0.614 |
| Mean (SD) |
59.673 (11.365) |
60.301 (11.632) |
59.763 (11.499) |
59.985 (11.519) |
|
| Range |
27.000 - 88.000 |
19.000 - 88.000 |
26.000 - 85.000 |
19.000 - 88.000 |
|
| alk.phos |
|
|
|
|
0.226 |
| N-Miss |
69 |
141 |
56 |
266 |
|
| Mean (SD) |
175.577 (128.608) |
161.984 (121.978) |
173.506 (138.564) |
168.969 (128.492) |
|
| Range |
11.000 - 858.000 |
10.000 - 1014.000 |
7.000 - 982.000 |
7.000 - 1014.000 |
|
| Gender |
|
|
|
|
0.190 |
| Male |
277 (64.7%) |
411 (59.5%) |
228 (60.0%) |
916 (61.1%) |
|
| Female |
151 (35.3%) |
280 (40.5%) |
152 (40.0%) |
583 (38.9%) |
|
Add a Title to the Table
- When creating a pdf the tables are automatically numbered and the
title appears below the table. In Word and HTML, the titles appear
un-numbered and above the table.
t1 <- tableby(arm ~ sex + age, data=mockstudy)
summary(t1, title='Demographics')
Demographics
| Gender |
|
|
|
|
0.190 |
| Male |
277 (64.7%) |
411 (59.5%) |
228 (60.0%) |
916 (61.1%) |
|
| Female |
151 (35.3%) |
280 (40.5%) |
152 (40.0%) |
583 (38.9%) |
|
| Age, yrs |
|
|
|
|
0.614 |
| Mean (SD) |
59.673 (11.365) |
60.301 (11.632) |
59.763 (11.499) |
59.985 (11.519) |
|
| Range |
27.000 - 88.000 |
19.000 - 88.000 |
26.000 - 85.000 |
19.000 - 88.000 |
|
#With multiple left-hand sides, you can pass a vector or list to determine labels for each table:
summary(tableby(list(arm, sex) ~ age, data = mockstudy), title = c("arm table", "sex table"))
arm table
| Age, yrs |
|
|
|
|
0.614 |
| Mean (SD) |
59.673 (11.365) |
60.301 (11.632) |
59.763 (11.499) |
59.985 (11.519) |
|
| Range |
27.000 - 88.000 |
19.000 - 88.000 |
26.000 - 85.000 |
19.000 - 88.000 |
|
sex table
| Age, yrs |
|
|
|
0.048 |
| Mean (SD) |
60.455 (11.369) |
59.247 (11.722) |
59.985 (11.519) |
|
| Range |
19.000 - 88.000 |
22.000 - 88.000 |
19.000 - 88.000 |
|