We use the standard formula for sample size estimation in proportion studies:
\[ n = \frac{z^2 \cdot p \cdot q}{d^2} \]
where:
- z = z-value for a 95% confidence level (1.96)
- p = estimated proportion (0.5 used as a conservative estimate)
- q = 1 - p
- d = margin of error (0.05)
# Sample Size Calculation
# Inputs to n = z^2 * p * q / d^2 for estimating a single proportion.
z <- 1.96  # z-value used for the 95% level
p <- 0.5   # assumed proportion; 0.5 maximizes p * (1 - p)
q <- 1 - p # complement of p
d <- 0.05  # margin of error

# Same left-to-right arithmetic as the formula: (z^2 * p * q) / d^2.
n <- (z^2 * p * q) / d^2
print(n)
## [1] 384.16
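The required sample size is approximately 384; because a sample size must be a whole number and rounding down would slightly exceed the chosen margin of error, round up to 385 in practice.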
gtsummary
# install.packages("gtsummary") # Uncomment if not installed
library(gtsummary)
library(dplyr)
We use the built-in trial dataset from gtsummary:
# Preview the first six rows of the trial data (six is head()'s default).
head(trial, n = 6L)
## # A tibble: 6 × 8
## trt age marker stage grade response death ttdeath
## <chr> <dbl> <dbl> <fct> <fct> <int> <int> <dbl>
## 1 Drug A 23 0.16 T1 II 0 0 24
## 2 Drug B 9 1.11 T2 I 1 0 24
## 3 Drug A 31 0.277 T1 II 0 0 24
## 4 Drug A NA 2.07 T3 III 1 1 17.6
## 5 Drug A 51 2.77 T4 III 1 1 16.4
## 6 Drug B 39 0.613 T4 I 0 1 15.6
# Default summary of every column in trial: n (%) for categorical variables
# and median (Q1, Q3) for continuous ones, with missing counts as "Unknown".
tbl_summary(trial)
| Characteristic | N = 200¹ |
|---|---|
| Chemotherapy Treatment | |
| Drug A | 98 (49%) |
| Drug B | 102 (51%) |
| Age | 47 (38, 57) |
| Unknown | 11 |
| Marker Level (ng/mL) | 0.64 (0.22, 1.41) |
| Unknown | 10 |
| T Stage | |
| T1 | 53 (27%) |
| T2 | 54 (27%) |
| T3 | 43 (22%) |
| T4 | 50 (25%) |
| Grade | |
| I | 68 (34%) |
| II | 68 (34%) |
| III | 64 (32%) |
| Tumor Response | 61 (32%) |
| Unknown | 7 |
| Patient Died | 112 (56%) |
| Months to Death/Censor | 22.4 (15.9, 24.0) |
| 1 n (%); Median (Q1, Q3) | |
# Reduced data set for the shorter examples: treatment arm, age, and grade.
trial2 <- select(trial, trt, age, grade)

# Default summary table of the three retained columns.
tbl_summary(trial2)
| Characteristic | N = 200¹ |
|---|---|
| Chemotherapy Treatment | |
| Drug A | 98 (49%) |
| Drug B | 102 (51%) |
| Age | 47 (38, 57) |
| Unknown | 11 |
| Grade | |
| I | 68 (34%) |
| II | 68 (34%) |
| III | 64 (32%) |
| 1 n (%); Median (Q1, Q3) | |
# Same summary, split into one column per treatment arm.
tbl_summary(trial, by = trt)
| Characteristic | Drug A N = 98¹ | Drug B N = 102¹ |
|---|---|---|
| Age | 46 (37, 60) | 48 (39, 56) |
| Unknown | 7 | 4 |
| Marker Level (ng/mL) | 0.84 (0.23, 1.60) | 0.52 (0.18, 1.21) |
| Unknown | 6 | 4 |
| T Stage | ||
| T1 | 28 (29%) | 25 (25%) |
| T2 | 25 (26%) | 29 (28%) |
| T3 | 22 (22%) | 21 (21%) |
| T4 | 23 (23%) | 27 (26%) |
| Grade | ||
| I | 35 (36%) | 33 (32%) |
| II | 32 (33%) | 36 (35%) |
| III | 31 (32%) | 33 (32%) |
| Tumor Response | 28 (29%) | 33 (34%) |
| Unknown | 3 | 4 |
| Patient Died | 52 (53%) | 60 (59%) |
| Months to Death/Censor | 23.5 (17.4, 24.0) | 21.2 (14.5, 24.0) |
| 1 Median (Q1, Q3); n (%) | ||
# Stratify by treatment arm, then append a p-value column comparing the arms.
add_p(tbl_summary(trial, by = trt))
| Characteristic | Drug A N = 98¹ | Drug B N = 102¹ | p-value² |
|---|---|---|---|
| Age | 46 (37, 60) | 48 (39, 56) | 0.7 |
| Unknown | 7 | 4 | |
| Marker Level (ng/mL) | 0.84 (0.23, 1.60) | 0.52 (0.18, 1.21) | 0.085 |
| Unknown | 6 | 4 | |
| T Stage | | | 0.9 |
| T1 | 28 (29%) | 25 (25%) | |
| T2 | 25 (26%) | 29 (28%) | |
| T3 | 22 (22%) | 21 (21%) | |
| T4 | 23 (23%) | 27 (26%) | |
| Grade | | | 0.9 |
| I | 35 (36%) | 33 (32%) | |
| II | 32 (33%) | 36 (35%) | |
| III | 31 (32%) | 33 (32%) | |
| Tumor Response | 28 (29%) | 33 (34%) | 0.5 |
| Unknown | 3 | 4 | |
| Patient Died | 52 (53%) | 60 (59%) | 0.4 |
| Months to Death/Censor | 23.5 (17.4, 24.0) | 21.2 (14.5, 24.0) | 0.14 |
| ¹ Median (Q1, Q3); n (%) | | | |
| ² Wilcoxon rank sum test; Pearson’s Chi-squared test | | | |
# Per-arm summary with custom display templates: mean (SD) for continuous
# variables and "n / N (p%)" for categorical ones.
tbl_summary(
  trial,
  by = trt,
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} / {N} ({p}%)"
  )
)
| Characteristic | Drug A N = 98¹ | Drug B N = 102¹ |
|---|---|---|
| Age | 47 (15) | 47 (14) |
| Unknown | 7 | 4 |
| Marker Level (ng/mL) | 1.02 (0.89) | 0.82 (0.83) |
| Unknown | 6 | 4 |
| T Stage | ||
| T1 | 28 / 98 (29%) | 25 / 102 (25%) |
| T2 | 25 / 98 (26%) | 29 / 102 (28%) |
| T3 | 22 / 98 (22%) | 21 / 102 (21%) |
| T4 | 23 / 98 (23%) | 27 / 102 (26%) |
| Grade | ||
| I | 35 / 98 (36%) | 33 / 102 (32%) |
| II | 32 / 98 (33%) | 36 / 102 (35%) |
| III | 31 / 98 (32%) | 33 / 102 (32%) |
| Tumor Response | 28 / 95 (29%) | 33 / 98 (34%) |
| Unknown | 3 | 4 |
| Patient Died | 52 / 98 (53%) | 60 / 102 (59%) |
| Months to Death/Censor | 20.2 (5.0) | 19.0 (5.5) |
| 1 Mean (SD); n / N (%) | ||
# Stage 1: summary by treatment arm, then the p-value, overall, and N columns.
# The add_p -> add_overall -> add_n order of the original chain is preserved.
table_one <- trial2 %>%
  tbl_summary(by = trt) %>%
  add_p() %>%
  add_overall() %>%
  add_n()

# Stage 2: presentation only -- relabel the first column, group the two arm
# columns under one spanning header, replace the statistic footnote, add a
# caption, and bold the variable labels.
table_one %>%
  modify_header(label ~ "**Variable**") %>%
  modify_spanning_header(
    c("stat_1", "stat_2") ~ "**Treatment Received**"
  ) %>%
  modify_footnote(
    all_stat_cols() ~ "Median (IQR) or Frequency (%)"
  ) %>%
  modify_caption("**Table 1. Patient Characteristics**") %>%
  bold_labels()
| Variable | N | Overall N = 200¹ | Treatment Received: Drug A N = 98¹ | Treatment Received: Drug B N = 102¹ | p-value² |
|---|---|---|---|---|---|
| Age | 189 | 47 (38, 57) | 46 (37, 60) | 48 (39, 56) | 0.7 |
| Unknown | | 11 | 7 | 4 | |
| Grade | 200 | | | | 0.9 |
| I | | 68 (34%) | 35 (36%) | 33 (32%) | |
| II | | 68 (34%) | 32 (33%) | 36 (35%) | |
| III | | 64 (32%) | 31 (32%) | 33 (32%) | |
| ¹ Median (IQR) or Frequency (%) | | | | | |
| ² Wilcoxon rank sum test; Pearson’s Chi-squared test | | | | | |
# Per-arm summary without the "Unknown" rows, plus an N column; converted to
# a gt table so a gt source note can be attached underneath.
gt::tab_source_note(
  as_gt(
    add_n(
      tbl_summary(trial2, by = trt, missing = "no")
    )
  ),
  gt::md("*This data is simulated*")
)
| Characteristic | N | Drug A N = 98¹ | Drug B N = 102¹ |
|---|---|---|---|
| Age | 189 | 46 (37, 60) | 48 (39, 56) |
| Grade | 200 | | |
| I | | 35 (36%) | 33 (32%) |
| II | | 32 (33%) | 36 (35%) |
| III | | 31 (32%) | 33 (32%) |
| This data is simulated | | | |
| ¹ Median (Q1, Q3); n (%) | | | |
# Multi-row summary of age by treatment arm: the "continuous2" type gives one
# output row per statistic template. Missing-value rows are suppressed, and
# the p-value is formatted through style_pvalue() with digits = 2.
add_p(
  tbl_summary(
    select(trial2, age, trt),
    by = trt,
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c(
      "{N_nonmiss}",
      "{median} ({p25}, {p75})",
      "{min}, {max}"
    ),
    missing = "no"
  ),
  pvalue_fun = function(x) style_pvalue(x, digits = 2)
)
| Characteristic | Drug A N = 98 | Drug B N = 102 | p-value¹ |
|---|---|---|---|
| Age | | | 0.72 |
| N Non-missing | 91 | 98 | |
| Median (Q1, Q3) | 46 (37, 60) | 48 (39, 56) | |
| Min, Max | 6, 78 | 9, 83 | |
| ¹ Wilcoxon rank sum test | | | |
Using the gtsummary package, we generated descriptive statistics, cross tables, p-values, and well-formatted summary tables.