Sample Size Calculation

We use the standard formula for sample size estimation in proportion studies:

\[ n = \frac{z^2 \cdot p \cdot q}{d^2} \]

where:
- z = z-value (standard normal quantile) for a 95% confidence level (1.96)
- p = estimated proportion (0.5 used as conservative estimate)
- q = (1 - p)
- d = margin of error (0.05)

# Sample size for estimating a proportion: n = z^2 * p * q / d^2
z <- 1.96  # z-value for a 95% confidence level
p <- 0.5   # assumed proportion; 0.5 maximises p * q, hence the largest n
d <- 0.05  # margin of error
q <- 1 - p

# Same operation order as the formula above, so the printed value is unchanged.
numerator <- z^2 * p * q
n <- numerator / d^2
n
## [1] 384.16

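The computed value is 384.16, so the required sample size is approximately 384 (385 if rounded up to the next whole participant, as is conventional for sample sizes).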


Data Analysis with gtsummary

Load Packages

library(gtsummary)
library(dplyr)
library(ggplot2)   # for diamonds dataset

Dataset 1: mtcars

# Summarise mpg and gear within each cylinder group, then append a
# p-value column comparing the groups.
mtcars_vars <- select(mtcars, mpg, cyl, gear)
mtcars_tbl <- tbl_summary(mtcars_vars, by = cyl)
add_p(mtcars_tbl)
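| Characteristic | 4, N = 11¹ | 6, N = 7¹ | 8, N = 14¹ | p-value² |
|---|---|---|---|---|
| mpg | 26.0 (22.8, 30.4) | 19.7 (18.1, 21.0) | 15.2 (14.3, 16.4) | <0.001 |
| gear | | | | <0.001 |
| &nbsp;&nbsp;&nbsp;&nbsp;3 | 1 (9.1%) | 2 (29%) | 12 (86%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;4 | 8 (73%) | 4 (57%) | 0 (0%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;5 | 2 (18%) | 1 (14%) | 2 (14%) | |

¹ Median (Q1, Q3); n (%)
² Kruskal-Wallis rank sum test; Fisher’s exact test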

Dataset 2: iris

# Summarise sepal length and width per species, then append a
# p-value column comparing the species.
iris_vars <- select(iris, Sepal.Length, Sepal.Width, Species)
iris_tbl <- tbl_summary(iris_vars, by = Species)
add_p(iris_tbl)
| Characteristic | setosa, N = 50¹ | versicolor, N = 50¹ | virginica, N = 50¹ | p-value² |
|---|---|---|---|---|
| Sepal.Length | 5.00 (4.80, 5.20) | 5.90 (5.60, 6.30) | 6.50 (6.20, 6.90) | <0.001 |
| Sepal.Width | 3.40 (3.20, 3.70) | 2.80 (2.50, 3.00) | 3.00 (2.80, 3.20) | <0.001 |

¹ Median (Q1, Q3)
² Kruskal-Wallis rank sum test

Dataset 3: diamonds (ggplot2)

# Summarise price and color within each cut grade, then append a
# p-value column comparing the cut grades.
diamonds_vars <- select(diamonds, price, cut, color)
diamonds_tbl <- tbl_summary(diamonds_vars, by = cut)
add_p(diamonds_tbl)
| Characteristic | Fair, N = 1,610¹ | Good, N = 4,906¹ | Very Good, N = 12,082¹ | Premium, N = 13,791¹ | Ideal, N = 21,551¹ | p-value² |
|---|---|---|---|---|---|---|
| price | 3,282 (2,050, 5,208) | 3,051 (1,145, 5,028) | 2,648 (912, 5,373) | 3,185 (1,046, 6,296) | 1,810 (878, 4,679) | <0.001 |
| color | | | | | | <0.001 |
| &nbsp;&nbsp;&nbsp;&nbsp;D | 163 (10%) | 662 (13%) | 1,513 (13%) | 1,603 (12%) | 2,834 (13%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;E | 224 (14%) | 933 (19%) | 2,400 (20%) | 2,337 (17%) | 3,903 (18%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;F | 312 (19%) | 909 (19%) | 2,164 (18%) | 2,331 (17%) | 3,826 (18%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;G | 314 (20%) | 871 (18%) | 2,299 (19%) | 2,924 (21%) | 4,884 (23%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;H | 303 (19%) | 702 (14%) | 1,824 (15%) | 2,360 (17%) | 3,115 (14%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;I | 175 (11%) | 522 (11%) | 1,204 (10.0%) | 1,428 (10%) | 2,093 (9.7%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;J | 119 (7.4%) | 307 (6.3%) | 678 (5.6%) | 808 (5.9%) | 896 (4.2%) | |

¹ Median (Q1, Q3); n (%)
² Kruskal-Wallis rank sum test; Pearson’s Chi-squared test

Dataset 4: ToothGrowth

# Summarise tooth length and dose within each supplement type, then
# append a p-value column comparing the two supplements.
tooth_vars <- select(ToothGrowth, len, supp, dose)
tooth_tbl <- tbl_summary(tooth_vars, by = supp)
add_p(tooth_tbl)
## The following warnings were returned during `add_p()`:
## ! For variable `len` (`supp`) and "estimate", "statistic", "p.value",
##   "conf.low", and "conf.high" statistics: cannot compute exact p-value with
##   ties
## ! For variable `len` (`supp`) and "estimate", "statistic", "p.value",
##   "conf.low", and "conf.high" statistics: cannot compute exact confidence
##   intervals with ties
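| Characteristic | OJ, N = 30¹ | VC, N = 30¹ | p-value² |
|---|---|---|---|
| len | 23 (15, 26) | 17 (11, 23) | 0.064 |
| dose | | | >0.9 |
| &nbsp;&nbsp;&nbsp;&nbsp;0.5 | 10 (33%) | 10 (33%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;1 | 10 (33%) | 10 (33%) | |
| &nbsp;&nbsp;&nbsp;&nbsp;2 | 10 (33%) | 10 (33%) | |

¹ Median (Q1, Q3); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test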

Dataset 5: PlantGrowth

# Summarise plant weight within each treatment group, then append a
# p-value column comparing the groups.
plant_vars <- select(PlantGrowth, weight, group)
plant_tbl <- tbl_summary(plant_vars, by = group)
add_p(plant_tbl)
| Characteristic | ctrl, N = 10¹ | trt1, N = 10¹ | trt2, N = 10¹ | p-value² |
|---|---|---|---|---|
| weight | 5.16 (4.53, 5.33) | 4.55 (4.17, 4.89) | 5.44 (5.26, 5.80) | 0.018 |

¹ Median (Q1, Q3)
² Kruskal-Wallis rank sum test

# Summarise every column of `trial` by treatment arm (`trt`), overriding
# the default statistics: mean (SD) for continuous variables and
# n / N (%) for categorical variables.
trial_statistics <- list(
  all_continuous() ~ "{mean} ({sd})",
  all_categorical() ~ "{n} / {N} ({p}%)"
)
tbl_summary(trial, by = trt, statistic = trial_statistics)
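| Characteristic | Drug A, N = 98¹ | Drug B, N = 102¹ |
|---|---|---|
| Age | 47 (15) | 47 (14) |
| &nbsp;&nbsp;&nbsp;&nbsp;Unknown | 7 | 4 |
| Marker Level (ng/mL) | 1.02 (0.89) | 0.82 (0.83) |
| &nbsp;&nbsp;&nbsp;&nbsp;Unknown | 6 | 4 |
| T Stage | | |
| &nbsp;&nbsp;&nbsp;&nbsp;T1 | 28 / 98 (29%) | 25 / 102 (25%) |
| &nbsp;&nbsp;&nbsp;&nbsp;T2 | 25 / 98 (26%) | 29 / 102 (28%) |
| &nbsp;&nbsp;&nbsp;&nbsp;T3 | 22 / 98 (22%) | 21 / 102 (21%) |
| &nbsp;&nbsp;&nbsp;&nbsp;T4 | 23 / 98 (23%) | 27 / 102 (26%) |
| Grade | | |
| &nbsp;&nbsp;&nbsp;&nbsp;I | 35 / 98 (36%) | 33 / 102 (32%) |
| &nbsp;&nbsp;&nbsp;&nbsp;II | 32 / 98 (33%) | 36 / 102 (35%) |
| &nbsp;&nbsp;&nbsp;&nbsp;III | 31 / 98 (32%) | 33 / 102 (32%) |
| Tumor Response | 28 / 95 (29%) | 33 / 98 (34%) |
| &nbsp;&nbsp;&nbsp;&nbsp;Unknown | 3 | 4 |
| Patient Died | 52 / 98 (53%) | 60 / 102 (59%) |
| Months to Death/Censor | 20.2 (5.0) | 19.0 (5.5) |

¹ Mean (SD); n / N (%)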

Conclusion