SFSA-Simulation-for-Selecting-Best-Sample-Size.R

#MONTE CARLO Simulation#
library(haven)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library (rsconnect)
library(rmarkdown)
library(knitr)
#--------------------------------------------------
# 1. Load data
#--------------------------------------------------

Household_Dataset_Final_Nov <- read_sav(
  "C:/Users/ahmad.khalid/OneDrive - World Food Programme/Ahmad Khalid/EFSA/SFSA 2025/Syntax creation/Household Dataset - Final_Nov.sav"
)

#--------------------------------------------------
# 2. Prepare variables (factors + binary indicators)
#--------------------------------------------------

Household_Dataset_Final_Nov <- Household_Dataset_Final_Nov %>%
  mutate(
    Province = as_factor(Prov),
    CommType_label = as_factor(CommType),
    Prov_Comm = paste(Province, CommType_label),
    
    # Binary indicators
    PoorFCS = ifelse(FCS < 28, 1, 0),
    Vulnerable = ifelse(Vul == 2, 1, 0)
  )

#--------------------------------------------------
# 3. PRE-SPLIT DATA
#--------------------------------------------------

domain_list <- split(Household_Dataset_Final_Nov, 
                     Household_Dataset_Final_Nov$Prov_Comm)

#--------------------------------------------------
# 4. MONTE CARLO FUNCTION
#--------------------------------------------------

simulate_domain_fast <- function(target_n, n_sim = 300) {
  
  results <- data.frame(
    FCS = numeric(n_sim),
    PoorFCS = numeric(n_sim),
    Vul = numeric(n_sim)
  )
  
  for (i in 1:n_sim) {
    
    total_fcs <- 0
    total_poor <- 0
    total_vul <- 0
    total_n <- 0
    
    for (data_d in domain_list) {
      
      n_take <- min(target_n, nrow(data_d))
      idx <- sample.int(nrow(data_d), n_take)
      
     
      fcs_vals <- data_d$FCS[idx]
      poor_vals <- data_d$PoorFCS[idx]
      vul_vals <- data_d$Vulnerable[idx]
      
      total_fcs <- total_fcs + sum(fcs_vals, na.rm = TRUE)
      total_poor <- total_poor + sum(poor_vals, na.rm = TRUE)
      total_vul <- total_vul + sum(vul_vals, na.rm = TRUE)
      
      total_n <- total_n + length(fcs_vals)
    }
    
  
    results$FCS[i] <- total_fcs / total_n
    results$PoorFCS[i] <- total_poor / total_n
    results$Vul[i] <- total_vul / total_n
  }
  
  return(results)
}

#--------------------------------------------------
# 5. TEST RUN 
#--------------------------------------------------

sim_test <- simulate_domain_fast(480, n_sim = 50)

#--------------------------------------------------
# 6. FINAL SIMULATIONS
#--------------------------------------------------

sim_480 <- simulate_domain_fast(480)
sim_369 <- simulate_domain_fast(369)
sim_350 <- simulate_domain_fast(350)
sim_250 <- simulate_domain_fast(250)
sim_150 <- simulate_domain_fast(150)

#--------------------------------------------------
# 7. RESULTS – MEANS
#--------------------------------------------------

colMeans(sim_480)

##        FCS    PoorFCS        Vul 
## 36.7849427  0.2786748  0.7643124

colMeans(sim_369)

##        FCS    PoorFCS        Vul 
## 36.7856404  0.2787753  0.7643676

colMeans(sim_350)

##        FCS    PoorFCS        Vul 
## 36.7838767  0.2786256  0.7643456

colMeans(sim_250)

##        FCS    PoorFCS        Vul 
## 36.7834702  0.2787748  0.7644741

colMeans(sim_150)

##        FCS    PoorFCS        Vul 
## 36.7775440  0.2785793  0.7647373

#--------------------------------------------------
# 8. VARIABILITY (STANDARD DEVIATION)
#--------------------------------------------------

apply(sim_480, 2, sd)

##          FCS      PoorFCS          Vul 
## 0.0087240631 0.0002461298 0.0002814686

apply(sim_369, 2, sd)

##         FCS     PoorFCS         Vul 
## 0.044186807 0.001480904 0.001512486

apply(sim_350, 2, sd)

##         FCS     PoorFCS         Vul 
## 0.052299535 0.001689416 0.001779150

apply(sim_250, 2, sd)

##         FCS     PoorFCS         Vul 
## 0.076476362 0.002654604 0.002574442

apply(sim_150, 2, sd)

##         FCS     PoorFCS         Vul 
## 0.120380903 0.004082568 0.004082408

#--------------------------------------------------
# 9. CONFIDENCE INTERVALS (PoorFCS)
#--------------------------------------------------

quantile(sim_480$PoorFCS, c(0.025, 0.975))

##      2.5%     97.5% 
## 0.2782073 0.2791796

quantile(sim_350$PoorFCS, c(0.025, 0.975))

##      2.5%     97.5% 
## 0.2752651 0.2818111

quantile(sim_250$PoorFCS, c(0.025, 0.975))

##      2.5%     97.5% 
## 0.2733667 0.2837800

quantile(sim_150$PoorFCS, c(0.025, 0.975))

##      2.5%     97.5% 
## 0.2706667 0.2862222

#--------------------------------------------------
# 10. VISUALIZATION
#--------------------------------------------------

# WFP colors
wfp_blue <- "#0072CE"
wfp_light_blue <- "#6BAED6"

# Global font size settings
par(mfrow = c(1,3),
    mar = c(6,5,4,2),        # margins (bottom, left, top, right)
    cex.main = 1.4,          # title size
    cex.lab = 1.3,           # axis labels
    cex.axis = 1.2,          # axis numbers
    bg = "white")

#-------------------------------
# 1. PoorFCS
#-------------------------------
boxplot(sim_480$PoorFCS,
        sim_369$PoorFCS,
        sim_350$PoorFCS,
        sim_250$PoorFCS,
        sim_150$PoorFCS,
        names = c("480","369","350","250","150"),
        col = adjustcolor(wfp_light_blue, 0.7),
        border = wfp_blue,
        main = "Poor Food Consumption",
        ylab = "Proportion",
        frame = FALSE)

abline(h = mean(sim_480$PoorFCS), col = wfp_blue, lwd = 2, lty = 2)

#-------------------------------
# 2. Mean FCS
#-------------------------------
boxplot(sim_480$FCS,
        sim_369$FCS,
        sim_350$FCS,
        sim_250$FCS,
        sim_150$FCS,
        names = c("480","369","350","250","150"),
        col = adjustcolor(wfp_light_blue, 0.7),
        border = wfp_blue,
        main = "Mean Food Consumption Score",
        ylab = "Mean FCS",
        frame = FALSE)

abline(h = mean(sim_480$FCS), col = wfp_blue, lwd = 2, lty = 2)

#-------------------------------
# 3. Vulnerability
#-------------------------------
boxplot(sim_480$Vul,
        sim_369$Vul,
        sim_350$Vul,
        sim_250$Vul,
        sim_150$Vul,
        names = c("480","369","350","250","150"),
        col = adjustcolor(wfp_light_blue, 0.7),
        border = wfp_blue,
        main = "Vulnerability (CARI)",
        ylab = "Proportion",
        frame = FALSE)

abline(h = mean(sim_480$Vul), col = wfp_blue, lwd = 2, lty = 2)

# PoorFCS
boxplot(sim_480$PoorFCS,
        sim_369$PoorFCS,
        sim_350$PoorFCS,
        sim_250$PoorFCS,
        sim_150$PoorFCS,
        names = c("480","369","350","250","150"),
        main = "Impact of Sample Size on Poor FCS",
        ylab = "Proportion")

# FCS
boxplot(sim_480$FCS,
        sim_369$FCS,
        sim_350$FCS,
        sim_250$FCS,
        sim_150$FCS,
        names = c("480","369","350","250","150"),
        main = "Impact of Sample Size on Mean FCS",
        ylab = "Mean FCS")

# Vulnerability
boxplot(sim_480$Vul,
        sim_369$Vul,
        sim_350$Vul,
        sim_250$Vul,
        sim_150$Vul,
        names = c("480","369","350","250","150"),
        main = "Impact of Sample Size on Vulnerability",
        ylab = "Proportion")

#Function to compute summary (ME + CI)
#--------------------------------------------------

sim_list <- list(sim_480, sim_369, sim_350, sim_250, sim_150)
sample_sizes <- c(480, 369, 350, 250, 150)

#--------------------------------------------------
# 6. Function to compute statistics (clean output)
#--------------------------------------------------

summarize_sim <- function(sim_data, var_name) {
  
  x <- sim_data[[var_name]]
  
  mean_val <- mean(x)
  sd_val <- sd(x)
  
  ci_lower <- quantile(x, 0.025)
  ci_upper <- quantile(x, 0.975)
  
  ci_width <- ci_upper - ci_lower
  me <- ci_width / 2
  
  return(c(mean_val, sd_val, ci_lower, ci_upper, ci_width, me))
}

#--------------------------------------------------
# 7. Build clean table 
#--------------------------------------------------

build_table <- function(sim_list, indicator_name, percent = FALSE) {
  
  results <- data.frame(
    Sample_Size = numeric(),
    Mean = numeric(),
    SD = numeric(),
    CI_Lower = numeric(),
    CI_Upper = numeric(),
    CI_Width = numeric(),
    Margin_Error = numeric()
  )
  
  for (i in seq_along(sim_list)) {
    
    stats <- summarize_sim(sim_list[[i]], indicator_name)
    
    results <- rbind(results, data.frame(
      Sample_Size = sample_sizes[i],
      Mean = stats[1],
      SD = stats[2],
      CI_Lower = stats[3],
      CI_Upper = stats[4],
      CI_Width = stats[5],
      Margin_Error = stats[6]
    ))
  }
  
  # Convert to %
  if (percent) {
    results[, -1] <- results[, -1] * 100
  }
  
  rownames(results) <- NULL 
  
  return(results)
}

#--------------------------------------------------
# 8. Generate final tables
#--------------------------------------------------

table_FCS <- build_table(sim_list, "FCS", percent = FALSE)
table_PoorFCS <- build_table(sim_list, "PoorFCS", percent = TRUE)
table_Vul <- build_table(sim_list, "Vul", percent = TRUE)

#--------------------------------------------------
# 9. Round for reporting
#--------------------------------------------------

table_FCS <- round(table_FCS, 3)
table_PoorFCS <- round(table_PoorFCS, 2)
table_Vul <- round(table_Vul, 2)

#--------------------------------------------------
# 10. View results
#--------------------------------------------------

table_FCS

##   Sample_Size   Mean    SD CI_Lower CI_Upper CI_Width Margin_Error
## 1         480 36.785 0.009   36.766   36.801    0.035        0.017
## 2         369 36.786 0.044   36.696   36.875    0.179        0.089
## 3         350 36.784 0.052   36.684   36.885    0.201        0.101
## 4         250 36.783 0.076   36.648   36.938    0.289        0.145
## 5         150 36.778 0.120   36.584   37.022    0.438        0.219

table_PoorFCS

##   Sample_Size  Mean   SD CI_Lower CI_Upper CI_Width Margin_Error
## 1         480 27.87 0.02    27.82    27.92     0.10         0.05
## 2         369 27.88 0.15    27.57    28.15     0.57         0.29
## 3         350 27.86 0.17    27.53    28.18     0.65         0.33
## 4         250 27.88 0.27    27.34    28.38     1.04         0.52
## 5         150 27.86 0.41    27.07    28.62     1.56         0.78

table_Vul

##   Sample_Size  Mean   SD CI_Lower CI_Upper CI_Width Margin_Error
## 1         480 76.43 0.03    76.37    76.49     0.11         0.06
## 2         369 76.44 0.15    76.13    76.73     0.60         0.30
## 3         350 76.43 0.18    76.13    76.78     0.65         0.32
## 4         250 76.45 0.26    75.91    76.93     1.03         0.51
## 5         150 76.47 0.41    75.70    77.16     1.46         0.73

SFSA-Simulation-for-Selecting-Best-Sample-Size.R

Ahmad Khalid.Amarkha

2026-06-08