setwd("/Users/superclumsy/Library/CloudStorage/GoogleDrive-nguyenkhoiquan@gmail.com/My Drive/01 JHU MPH/140.623 STAT 3/Problem Set")
library(tidyverse)
library(dplyr)
library(skimr)
library(readr)
Guoshu Kong function ()
nepal621 = read_csv("nepal621.csv")
head(nepal621)
## # A tibble: 6 × 4
## sex age trt status
## <chr> <chr> <chr> <chr>
## 1 Male 3-4 Placebo Alive
## 2 Male 1-2 Vit A Alive
## 3 Male 3-4 Placebo Alive
## 4 Male 3-4 Vit A Alive
## 5 Male <1 Vit A Alive
## 6 Male 3-4 Placebo Alive
Step 1: Convert status to binary: 1 = Dead, 0 = Alive
nepal621 <- nepal621 %>%
mutate(status = ifelse(status == "Alive", 0, 1))
table(nepal621$status)
##
## 0 1
## 26598 523
Step 2: Calculate mortality rate for groups who are younger than 3 years old
table(nepal621$age)
##
## <1 1-2 3-4
## 5386 11157 10578
nepal_under3 <- nepal621 %>% filter(age %in% c("<1", "1-2"))
print(mbelow3 <- mean(nepal_under3$status))
## [1] 0.02689959
Step 3: Estimate Sample size using the mortality rate calculated.
# Define function to calculate sample size #
sample_size_prop <- function(p, error = 0.005, confidence = 0.95) {
Z <- qnorm(1 - (1 - confidence) / 2) # Z-score for confidence level
n <- (Z^2 * p * (1 - p)) / (error^2) # Sample size formula
return(ceiling(n)) # Round up to the next integer
}
n_nepal <- sample_size_prop(mbelow3)
# Print results
cat("Estimated 16-month mortality rate from Nepal data:", round(mbelow3, 4), "\n")
## Estimated 16-month mortality rate from Nepal data: 0.0269
cat("Required sample size to estimate within ±0.5%:", n_nepal, "\n")
## Required sample size to estimate within ±0.5%: 4023
Because we don’t know the mortality rates, we try to calculate a range of sample size for different mortality rates, ranging from 0.01 to 0.10.
mrate <- seq(0.01, 0.10, by = 0.01)
sample_sizes <- sapply(mrate, sample_size_prop)
sample_size_table <- data.frame(Mortality_Rate = mrate, Required_Sample_Size = sample_sizes)
print(sample_size_table)
## Mortality_Rate Required_Sample_Size
## 1 0.01 1522
## 2 0.02 3012
## 3 0.03 4472
## 4 0.04 5901
## 5 0.05 7299
## 6 0.06 8667
## 7 0.07 10004
## 8 0.08 11310
## 9 0.09 12585
## 10 0.10 13830
mortality_rates <- nepal_under3 %>%
group_by(trt) %>%
summarise(mortality_rate = mean(status))
print(mortality_rates)
## # A tibble: 2 × 2
## trt mortality_rate
## <chr> <dbl>
## 1 Placebo 0.0294
## 2 Vit A 0.0245
power.prop.test(
p1 = 0.0294, # Placebo mortality
p2 = 0.0245, # Vitamin A mortality
power = 0.8, # 80% power
sig.level = 0.05, # sig 0.05
alternative = "two.sided"
)
##
## Two-sample comparison of proportions power calculation
##
## n = 17143.9
## p1 = 0.0294
## p2 = 0.0245
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
\[ n_1 = n_2 = \frac{\left( Z_{\alpha/2} \sqrt{2 \bar{p} \bar{q}} + Z_{\beta} \sqrt{p_1 q_1 + p_2 q_2} \right)^2}{\Delta^2} \] Where:
\[ \begin{aligned} p_1 & = 0.0294, & \text{Placebo mortality} \\ p_2 & = 0.0245, & \text{Vitamin A mortality} \\ Z_{\beta} & = 0.84 & \text{at } \beta = 0.2 \text{ (power = 80%)} \\ Z_{\alpha/2} & = 1.96 & \text{at } \alpha = 0.05 \\ \bar{p} & = \frac{p_1 + p_2}{2} = 0.02695 \\ \bar{q} & = 1 - \bar{p} = 0.97305 \end{aligned} \]
The sample size for each group is \(n_1 = n_2 = 17,125\) participants, which is
approximately close to the estimate by the
power.prop.test() function at \(17,144\) participants in question 2.
sample_size_rr <- function(p1, rr, power = 0.8, sig.level = 0.05) {
p2 <- p1 / rr #Treatment group mortality rate, given the RR for CONTROL group.
result <- power.prop.test(p1 = p1, p2 = p2, power = power, sig.level = sig.level, alternative = "two.sided")
return(ceiling(result$n) * 2) # Total sample size (both groups)
}
Define assumptions
p1_values <- c(0.0294, 0.0294 - 0.005, 0.0294 + 0.005) # Control group mortality rates
rr_values <- c(1.2, 1.5, 1.75) # Relative risk values
Compute sample sizes
sample_size_matrix <- expand.grid(Control_Mortality = p1_values, Relative_Risk = rr_values)
sample_size_matrix$Total_Sample_Size <- mapply(sample_size_rr, sample_size_matrix$Control_Mortality, sample_size_matrix$Relative_Risk)
Display table
library(dplyr)
sample_size_table <- sample_size_matrix %>%
arrange(Control_Mortality, Relative_Risk)
print(sample_size_table)
## Control_Mortality Relative_Risk Total_Sample_Size
## 1 0.0244 1.20 41510
## 2 0.0244 1.50 9452
## 3 0.0244 1.75 5398
## 4 0.0294 1.20 34288
## 5 0.0294 1.50 7812
## 6 0.0294 1.75 4462
## 7 0.0344 1.20 29166
## 8 0.0344 1.50 6648
## 9 0.0344 1.75 3798
What I observe from the table is that when the mortality in the control group increases, the total sample size estimate reduces. Similarly, when the relative risk for the control group increases, the total sample size estimate also reduces.
sample_size_rr_90 <- function(p1, rr, power = 0.9, sig.level = 0.05) {
p2 <- p1 / rr #Treatment group mortality rate, given the RR for CONTROL group.
result <- power.prop.test(p1 = p1, p2 = p2, power = power, sig.level = sig.level, alternative = "two.sided")
return(ceiling(result$n) * 2) # Total sample size (both groups)
}
Define assumptions
p1_values <- c(0.0294, 0.0294 - 0.005, 0.0294 + 0.005) # Control group mortality rates
rr_values <- c(1.2, 1.5, 1.75) # Relative risk values
Compute sample sizes with 90% power
sample_size_matrix_90 <- expand.grid(Control_Mortality = p1_values, Relative_Risk = rr_values)
sample_size_matrix_90$Total_Sample_Size <- mapply(sample_size_rr_90, sample_size_matrix_90$Control_Mortality, sample_size_matrix_90$Relative_Risk)
Display table
sample_size_table_90 <- sample_size_matrix_90 %>%
arrange(Control_Mortality, Relative_Risk)
print(sample_size_table_90)
## Control_Mortality Relative_Risk Total_Sample_Size
## 1 0.0244 1.20 55568
## 2 0.0244 1.50 12654
## 3 0.0244 1.75 7224
## 4 0.0294 1.20 45902
## 5 0.0294 1.50 10456
## 6 0.0294 1.75 5972
## 7 0.0344 1.20 39044
## 8 0.0344 1.50 8898
## 9 0.0344 1.75 5082
Beyond what had been observed in question 4, this example expands my understanding in sample size estimation when power increases. In this scenario, increasing the power from 80% to 90% while keeping all other assumptions unchanged leads to a substantial increase in the required total sample size.
This observation is consistent with the general principle of statistical power analysis: higher power requires larger sample sizes.
In my opinion, the findings from parts 4 and 5 suggest that assuming a control group mortality rate of \(0.0294\) (consistent with the Nepal placebo group) and a relative risk of 1.2 is preferred This combination, while requiring a larger sample size of \(34,288\) participants for 80% power (or \(45,902\) for 90% power), allows for a more robust study design.
However, the limit of funding and resources may not allow us to conduct such a study. In those scenarios, a less conservative approach can be of interest, where we assume a relative risk of 1.5. This change can significantly reduce the required sample size. With the same control group mortality rate (regarding the Nepal placebo group), the total sample size drops to \(7,812\) for 80% power – less than one-fourth that needed for a relative risk of 1.2.
This trade-off between precision and feasibility should be carefully considered based on the study’s specific constraints and objectives.