Preparation

setwd("/Users/superclumsy/Library/CloudStorage/GoogleDrive-nguyenkhoiquan@gmail.com/My Drive/01 JHU MPH/140.623 STAT 3/Problem Set")

library(tidyverse)
library(dplyr)
library(skimr)
library(readr)

Guoshu Kong function ()

nepal621 = read_csv("nepal621.csv")

head(nepal621)
## # A tibble: 6 × 4
##   sex   age   trt     status
##   <chr> <chr> <chr>   <chr> 
## 1 Male  3-4   Placebo Alive 
## 2 Male  1-2   Vit A   Alive 
## 3 Male  3-4   Placebo Alive 
## 4 Male  3-4   Vit A   Alive 
## 5 Male  <1    Vit A   Alive 
## 6 Male  3-4   Placebo Alive

1a. Estimate sample size within 0.5% range

Step 1: Convert status to binary: 1 = Dead, 0 = Alive

nepal621 <- nepal621 %>%
  mutate(status = ifelse(status == "Alive", 0, 1))
table(nepal621$status)
## 
##     0     1 
## 26598   523

Step 2: Calculate mortality rate for groups who are younger than 3 years old

table(nepal621$age)
## 
##    <1   1-2   3-4 
##  5386 11157 10578
nepal_under3 <- nepal621 %>% filter(age %in% c("<1", "1-2"))
print(mbelow3 <- mean(nepal_under3$status))
## [1] 0.02689959

Step 3: Estimate Sample size using the mortality rate calculated.

# Define function to calculate sample size #

sample_size_prop <- function(p, error = 0.005, confidence = 0.95) {
  Z <- qnorm(1 - (1 - confidence) / 2)  # Z-score for confidence level
  n <- (Z^2 * p * (1 - p)) / (error^2)  # Sample size formula
  return(ceiling(n))  # Round up to the next integer
}

n_nepal <- sample_size_prop(mbelow3)

# Print results
cat("Estimated 16-month mortality rate from Nepal data:", round(mbelow3, 4), "\n")
## Estimated 16-month mortality rate from Nepal data: 0.0269
cat("Required sample size to estimate within ±0.5%:", n_nepal, "\n")
## Required sample size to estimate within ±0.5%: 4023

1b. Compute sample sizes for a range of plausible mortality rates

Because we don’t know the mortality rates, we try to calculate a range of sample size for different mortality rates, ranging from 0.01 to 0.10.

mrate <- seq(0.01, 0.10, by = 0.01)
sample_sizes <- sapply(mrate, sample_size_prop)

sample_size_table <- data.frame(Mortality_Rate = mrate, Required_Sample_Size = sample_sizes)

print(sample_size_table)
##    Mortality_Rate Required_Sample_Size
## 1            0.01                 1522
## 2            0.02                 3012
## 3            0.03                 4472
## 4            0.04                 5901
## 5            0.05                 7299
## 6            0.06                 8667
## 7            0.07                10004
## 8            0.08                11310
## 9            0.09                12585
## 10           0.10                13830

2. Mortality rate in Vit_A & placebo group in age < 3 years old

mortality_rates <- nepal_under3 %>%
  group_by(trt) %>%
  summarise(mortality_rate = mean(status))

print(mortality_rates)
## # A tibble: 2 × 2
##   trt     mortality_rate
##   <chr>            <dbl>
## 1 Placebo         0.0294
## 2 Vit A           0.0245
power.prop.test(
  p1 = 0.0294,  # Placebo mortality
  p2 = 0.0245,  # Vitamin A mortality
  power = 0.8,  # 80% power
  sig.level = 0.05,  # sig 0.05
  alternative = "two.sided"
)
## 
##      Two-sample comparison of proportions power calculation 
## 
##               n = 17143.9
##              p1 = 0.0294
##              p2 = 0.0245
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

3. Confirm with hand calculation

\[ n_1 = n_2 = \frac{\left( Z_{\alpha/2} \sqrt{2 \bar{p} \bar{q}} + Z_{\beta} \sqrt{p_1 q_1 + p_2 q_2} \right)^2}{\Delta^2} \] Where:

\[ \begin{aligned} p_1 & = 0.0294, & \text{Placebo mortality} \\ p_2 & = 0.0245, & \text{Vitamin A mortality} \\ Z_{\beta} & = 0.84 & \text{at } \beta = 0.2 \text{ (power = 80%)} \\ Z_{\alpha/2} & = 1.96 & \text{at } \alpha = 0.05 \\ \bar{p} & = \frac{p_1 + p_2}{2} = 0.02695 \\ \bar{q} & = 1 - \bar{p} = 0.97305 \end{aligned} \]

The sample size for each group is \(n_1 = n_2 = 17,125\) participants, which is approximately close to the estimate by the power.prop.test() function at \(17,144\) participants in question 2.

4. Define function to calculate sample size for two proportions by various assumptions

sample_size_rr <- function(p1, rr, power = 0.8, sig.level = 0.05) {
  p2 <- p1 / rr  #Treatment group mortality rate, given the RR for CONTROL group.
  
  result <- power.prop.test(p1 = p1, p2 = p2, power = power, sig.level = sig.level, alternative = "two.sided")
  return(ceiling(result$n) * 2)  # Total sample size (both groups)
}

Define assumptions

p1_values <- c(0.0294, 0.0294 - 0.005, 0.0294 + 0.005)  # Control group mortality rates
rr_values <- c(1.2, 1.5, 1.75)  # Relative risk values

Compute sample sizes

sample_size_matrix <- expand.grid(Control_Mortality = p1_values, Relative_Risk = rr_values)
sample_size_matrix$Total_Sample_Size <- mapply(sample_size_rr, sample_size_matrix$Control_Mortality, sample_size_matrix$Relative_Risk)

Display table

library(dplyr)
sample_size_table <- sample_size_matrix %>%
  arrange(Control_Mortality, Relative_Risk)

print(sample_size_table)
##   Control_Mortality Relative_Risk Total_Sample_Size
## 1            0.0244          1.20             41510
## 2            0.0244          1.50              9452
## 3            0.0244          1.75              5398
## 4            0.0294          1.20             34288
## 5            0.0294          1.50              7812
## 6            0.0294          1.75              4462
## 7            0.0344          1.20             29166
## 8            0.0344          1.50              6648
## 9            0.0344          1.75              3798

What I observe from the table is that when the mortality in the control group increases, the total sample size estimate reduces. Similarly, when the relative risk for the control group increases, the total sample size estimate also reduces.

5. Define function to calculate sample size when RR > 1 with 90% power

sample_size_rr_90 <- function(p1, rr, power = 0.9, sig.level = 0.05) {
  p2 <- p1 / rr  #Treatment group mortality rate, given the RR for CONTROL group.
  
  result <- power.prop.test(p1 = p1, p2 = p2, power = power, sig.level = sig.level, alternative = "two.sided")
  return(ceiling(result$n) * 2)  # Total sample size (both groups)
}

Define assumptions

p1_values <- c(0.0294, 0.0294 - 0.005, 0.0294 + 0.005)  # Control group mortality rates
rr_values <- c(1.2, 1.5, 1.75)  # Relative risk values

Compute sample sizes with 90% power

sample_size_matrix_90 <- expand.grid(Control_Mortality = p1_values, Relative_Risk = rr_values)
sample_size_matrix_90$Total_Sample_Size <- mapply(sample_size_rr_90, sample_size_matrix_90$Control_Mortality, sample_size_matrix_90$Relative_Risk)

Display table

sample_size_table_90 <- sample_size_matrix_90 %>%
  arrange(Control_Mortality, Relative_Risk)

print(sample_size_table_90)
##   Control_Mortality Relative_Risk Total_Sample_Size
## 1            0.0244          1.20             55568
## 2            0.0244          1.50             12654
## 3            0.0244          1.75              7224
## 4            0.0294          1.20             45902
## 5            0.0294          1.50             10456
## 6            0.0294          1.75              5972
## 7            0.0344          1.20             39044
## 8            0.0344          1.50              8898
## 9            0.0344          1.75              5082

Beyond what had been observed in question 4, this example expands my understanding in sample size estimation when power increases. In this scenario, increasing the power from 80% to 90% while keeping all other assumptions unchanged leads to a substantial increase in the required total sample size.

This observation is consistent with the general principle of statistical power analysis: higher power requires larger sample sizes.

6. Select a design

In my opinion, the findings from parts 4 and 5 suggest that assuming a control group mortality rate of \(0.0294\) (consistent with the Nepal placebo group) and a relative risk of 1.2 is preferred This combination, while requiring a larger sample size of \(34,288\) participants for 80% power (or \(45,902\) for 90% power), allows for a more robust study design.

However, the limit of funding and resources may not allow us to conduct such a study. In those scenarios, a less conservative approach can be of interest, where we assume a relative risk of 1.5. This change can significantly reduce the required sample size. With the same control group mortality rate (regarding the Nepal placebo group), the total sample size drops to \(7,812\) for 80% power – less than one-fourth that needed for a relative risk of 1.2.

This trade-off between precision and feasibility should be carefully considered based on the study’s specific constraints and objectives.