Sample Size Calculation

We use the standard formula for sample size estimation in proportion studies:

\[ n = \frac{z^2 \cdot p \cdot q}{d^2} \]

where:
- z = standard normal critical value for a 95% confidence level (1.96)
- p = estimated proportion (0.5 used as conservative estimate)
- q = (1 - p)
- d = margin of error (0.05)

# Sample size for estimating a single proportion: n = z^2 * p * (1 - p) / d^2
d <- 0.05  # margin of error
p <- 0.5   # estimated proportion (conservative choice)
z <- 1.96  # z-value for a 95% confidence level
n <- (z^2 * p * (1 - p)) / d^2
print(n)
## [1] 384.16

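The formula gives n ≈ 384.16, so at least 385 participants are required (sample sizes are rounded up so that the margin of error is not exceeded).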


Data Analysis with gtsummary

Load Packages

# install.packages("gtsummary") # Uncomment if not installed
library(gtsummary)
library(dplyr)

Example Data Set

We use the built-in trial dataset from gtsummary:

# Preview the first six rows of the example data
head(trial, n = 6)
## # A tibble: 6 × 8
##   trt      age marker stage grade response death ttdeath
##   <chr>  <dbl>  <dbl> <fct> <fct>    <int> <int>   <dbl>
## 1 Drug A    23  0.16  T1    II           0     0    24  
## 2 Drug B     9  1.11  T2    I            1     0    24  
## 3 Drug A    31  0.277 T1    II           0     0    24  
## 4 Drug A    NA  2.07  T3    III          1     1    17.6
## 5 Drug A    51  2.77  T4    III          1     1    16.4
## 6 Drug B    39  0.613 T4    I            0     1    15.6

Basic Descriptive Table

# Summarise every column of `trial` with the default statistics
tbl_summary(data = trial)
Characteristic N = 200¹
Chemotherapy Treatment
    Drug A 98 (49%)
    Drug B 102 (51%)
Age 47 (38, 57)
    Unknown 11
Marker Level (ng/mL) 0.64 (0.22, 1.41)
    Unknown 10
T Stage
    T1 53 (27%)
    T2 54 (27%)
    T3 43 (22%)
    T4 50 (25%)
Grade
    I 68 (34%)
    II 68 (34%)
    III 64 (32%)
Tumor Response 61 (32%)
    Unknown 7
Patient Died 112 (56%)
Months to Death/Censor 22.4 (15.9, 24.0)
1 n (%); Median (Q1, Q3)

Select Variables

# Keep only treatment, age and grade for the smaller examples that follow
trial2 <- select(trial, trt, age, grade)

# Default summary of the reduced data set
tbl_summary(data = trial2)
Characteristic N = 200¹
Chemotherapy Treatment
    Drug A 98 (49%)
    Drug B 102 (51%)
Age 47 (38, 57)
    Unknown 11
Grade
    I 68 (34%)
    II 68 (34%)
    III 64 (32%)
1 n (%); Median (Q1, Q3)

Cross Table Summary (by treatment)

# One statistics column per treatment arm
tbl_summary(data = trial, by = trt)
Characteristic Drug A, N = 98¹ Drug B, N = 102¹
Age 46 (37, 60) 48 (39, 56)
    Unknown 7 4
Marker Level (ng/mL) 0.84 (0.23, 1.60) 0.52 (0.18, 1.21)
    Unknown 6 4
T Stage

    T1 28 (29%) 25 (25%)
    T2 25 (26%) 29 (28%)
    T3 22 (22%) 21 (21%)
    T4 23 (23%) 27 (26%)
Grade

    I 35 (36%) 33 (32%)
    II 32 (33%) 36 (35%)
    III 31 (32%) 33 (32%)
Tumor Response 28 (29%) 33 (34%)
    Unknown 3 4
Patient Died 52 (53%) 60 (59%)
Months to Death/Censor 23.5 (17.4, 24.0) 21.2 (14.5, 24.0)
1 Median (Q1, Q3); n (%)

Add p-values

# Summarise by treatment arm, then append a p-value column comparing the arms
tbl_summary(data = trial, by = trt) %>%
  add_p()
Characteristic Drug A, N = 98¹ Drug B, N = 102¹ p-value²
Age 46 (37, 60) 48 (39, 56) 0.7
    Unknown 7 4
Marker Level (ng/mL) 0.84 (0.23, 1.60) 0.52 (0.18, 1.21) 0.085
    Unknown 6 4
T Stage 0.9
    T1 28 (29%) 25 (25%)
    T2 25 (26%) 29 (28%)
    T3 22 (22%) 21 (21%)
    T4 23 (23%) 27 (26%)
Grade 0.9
    I 35 (36%) 33 (32%)
    II 32 (33%) 36 (35%)
    III 31 (32%) 33 (32%)
Tumor Response 28 (29%) 33 (34%) 0.5
    Unknown 3 4
Patient Died 52 (53%) 60 (59%) 0.4
Months to Death/Censor 23.5 (17.4, 24.0) 21.2 (14.5, 24.0) 0.14
1 Median (Q1, Q3); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

Custom Statistics (Mean/SD and Percentages)

# Override the default statistics:
#   continuous variables  -> mean (SD)
#   categorical variables -> n / N (percent)
tbl_summary(
  data = trial,
  by = trt,
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} / {N} ({p}%)"
  )
)
Characteristic Drug A, N = 98¹ Drug B, N = 102¹
Age 47 (15) 47 (14)
    Unknown 7 4
Marker Level (ng/mL) 1.02 (0.89) 0.82 (0.83)
    Unknown 6 4
T Stage

    T1 28 / 98 (29%) 25 / 102 (25%)
    T2 25 / 98 (26%) 29 / 102 (28%)
    T3 22 / 98 (22%) 21 / 102 (21%)
    T4 23 / 98 (23%) 27 / 102 (26%)
Grade

    I 35 / 98 (36%) 33 / 102 (32%)
    II 32 / 98 (33%) 36 / 102 (35%)
    III 31 / 98 (32%) 33 / 102 (32%)
Tumor Response 28 / 95 (29%) 33 / 98 (34%)
    Unknown 3 4
Patient Died 52 / 98 (53%) 60 / 102 (59%)
Months to Death/Censor 20.2 (5.0) 19.0 (5.5)
1 Mean (SD); n / N (%)

Enhanced Table with Labels, Spanning Header, Footnote, Caption

# Table 1: summarise trial2 by treatment arm, then decorate the table.
tbl_summary(data = trial2, by = trt) %>%
  # p-value column comparing the two arms
  add_p() %>%
  # Column for all patients combined
  add_overall() %>%
  # Column with the number of non-missing observations per variable
  add_n() %>%
  # Rename the label column header (markdown bold)
  modify_header(label ~ "**Variable**") %>%
  # Group the two arm columns under one spanning header
  modify_spanning_header(
    c("stat_1", "stat_2") ~ "**Treatment Received**"
  ) %>%
  # Custom footnote on every statistics column
  modify_footnote(
    all_stat_cols() ~ "Median (IQR) or Frequency (%)"
  ) %>%
  modify_caption("**Table 1. Patient Characteristics**") %>%
  bold_labels()
Table 1. Patient Characteristics
Treatment Received (spanning header over Drug A and Drug B)
Variable N Overall, N = 200¹ Drug A, N = 98¹ Drug B, N = 102¹ p-value²
Age 189 47 (38, 57) 46 (37, 60) 48 (39, 56) 0.7
    Unknown 11 7 4
Grade 200 0.9
    I 68 (34%) 35 (36%) 33 (32%)
    II 68 (34%) 32 (33%) 36 (35%)
    III 64 (32%) 31 (32%) 33 (32%)
1 Median (IQR) or Frequency (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

Add Source Note

# Summary without "Unknown" rows, converted to a gt table so that a
# source note can be attached underneath it.
tbl_summary(data = trial2, by = trt, missing = "no") %>%
  add_n() %>%
  as_gt() %>%
  gt::tab_source_note(
    source_note = gt::md("*This data is simulated*")
  )
Characteristic N Drug A, N = 98¹ Drug B, N = 102¹
Age 189 46 (37, 60) 48 (39, 56)
Grade 200
    I 35 (36%) 33 (32%)
    II 32 (33%) 36 (35%)
    III 31 (32%) 33 (32%)
This data is simulated
1 Median (Q1, Q3); n (%)

Multi-layer Statistics

# Show age on several rows per arm (non-missing N, median with quartiles,
# range) and format the p-value with two digits.
select(trial2, age, trt) %>%
  tbl_summary(
    by = trt,
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c(
      "{N_nonmiss}",
      "{median} ({p25}, {p75})",
      "{min}, {max}"
    ),
    missing = "no"
  ) %>%
  add_p(
    pvalue_fun = function(x) style_pvalue(x, digits = 2)
  )
Characteristic Drug A, N = 98 Drug B, N = 102 p-value¹
Age 0.72
    N Non-missing 91 98
    Median (Q1, Q3) 46 (37, 60) 48 (39, 56)
    Min, Max 6, 78 9, 83
1 Wilcoxon rank sum test

Conclusion