This notebook contains organised code for each step of the weighting process for the CODPHIA data.

The major steps and progress (at 30/04/2025) are as follows:

  1. [done] Load and organise/join the required datasets
  2. [done] Calculate PSU weights
  3. [not started] Calculate household weights
  4. [not started] Calculate person-level interview weights
  5. [not started] Calculate person-level blood test weights
  6. [started] Create jackknife replicate weights

1 Calculate EA weights

1.1 What Are Base Weights?

Base weights compensate for unequal selection probabilities due to multi-stage sampling designs.
They are the inverse of the probability of selection at each stage. Make reference to the CODPHIA second stage sample, the base weight is psuwt0 = column(ES) = ADJ_PSU_WT, while the ADJ_PSU_WT = 1/column(ER) = 1/ADJ_PROB2

\[ \text{PSU weight} = \frac{1}{\text{Selection Probability}} \]

1.2 Loading data to calculate base weights

load second stage sample, remove duplicates in terms of EAs so that you are left with 396 EAs. The reason for using second stage sample instead of 1st stage sample is that the 2nd stage sample has all information from 1st stage and from the household listing

# This section is to import packages
# and prepare for importing data
rm(list = ls())



# clean/remove everything
#install.packages("pacman")
# we use pacman to import all packages
pacman::p_load(tidyverse,
               haven,
               rio, 
               ggplot2, 
               janitor, 
               survey, 
               lubridate, 
               readxl,
               BART,
               future,
               furrr,
               progressr,
               caret,
               progressr
               
)


# create path to the datasets
drc_sample_dir <- "C:/Users/Kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/DRC/Sampling/2nd_stage_sampling/"



###################################################################

# read_excel comes from the package readxl, str_c means concatenate 2 strings as shown below
# if you press shift+ctr+M you will get a pipe = %>% 
# if you press ctr+s you will save your script, its a shortcut for saving 
# you can google shortcuts for Rstudio, it will give you a whole list of all these shortcuts
# clean_name comes from the package janitor, the purpose of clean_name is to make your variable names proper
drc_zd_in <- read_excel(str_c(drc_sample_dir,"CODPHIA_second_stage_sampling_09_08_2024.xlsx")) %>%
  clean_names() %>% 
  distinct(ea_id, .keep_all=TRUE) 

1.2.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.1

2 calculating base weights

2.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.1

psu_base_weights <- drc_zd_in |> 
  mutate(segment_adjustment = if_else(is.na(segment), 1, segmentsize / easize),
         segadj = adj_prob2 / adj_prob1,
         psuwt0 = 1/(adj_prob1 * segment_adjustment))

# Check
psu_base_weights |> 
  ggplot(aes(x = adj_psu_wt, y = psuwt0)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'

psu_base_weights |> 
  ggplot(aes(x = factor(strata),
             y = psuwt0)) +
  geom_boxplot() +
  labs(x = "Stratum/Region",
       y = "PSU base weight") +
  theme_bw()

psu_base_weights %>% 
  group_by(province) |> 
  mutate(psu_num_instrata = row_number()) |> 
  ggplot(aes(x = psu_num_instrata,
             y = psuwt0, 
             fill = factor(type_secteur, levels = c(1,2), 
                           labels = c("Urban", "Rural")))) +
  geom_col() +
  facet_wrap(~province) +
  theme_bw() +
  labs(fill = "Urban/Rural",
       x = "PSU number within stratum",
       y = "PSU base weight")

3 Household base weights

There are several steps for the household weights:

3.1 Calculate the within-EA household sampling weight.

We need the counts of households listed and selected to compute the household selection probability and weight.

The sample design was to take a systematic random sample of an average of 35 households from each EA. The household selection probability, \(p_{hh,i}\) in EA \(i\) is then

\[ p_{hh,i} = \frac{35}{N_{\textrm{elig},i}}, \] where \(N_{\textrm{elig},i}\) is the number of eligible households in EA \(i\).

# sampled <- read_excel(str_c(drc_sample_dir,"CODPHIA_EAs_with_sampled_HHs_HK.xlsx")) %>% 
#   select(NOM_COURT,Sampled_HH)
# listing <- read_excel(str_c(drc_sample_dir,"CODPHIA_HKL_HHL_080624.xlsx")) %>% 
#   left_join(sampled)
#   
# # create the flag of hhs that were sampled
# hh_sample_file_in <- listing %>%
#   mutate(hh_smpflg = ifelse(is.na(Sampled_HH), 0, 1))
# # hh_smpglg, create a household flag of the those EAs that were selected
# 
# 
# # creer hhs probs
# hh_selection_probs <- hh_sample_file_in |> 
#   group_by(strata) |> 
#   summarise(hhcount = n(),
#             selected_hh_count = sum(hh_smpflg == "1")) |> 
#   mutate(hh_selection_prob = selected_hh_count / hhcount)
# 
# # summary(hh_selection_probs$hh_selection_prob)
# # hist(hh_selection_probs$hh_selection_prob, breaks = 40)
# 
# 
# 
# selected_hh <- hh_sample_file_in |> 
#   filter(hh_smpflg == "1") |> 
#   select(cod_distric, nom_distric, strata, grappe, eahhid, stobs_d, stvac_d, resyn_d, hh_smpflg) |> 
#   left_join(
#     psu_base_weights |> select(-cod_distric, -nom_distric, -strata, -eahhid, -stobs_d, -stvac_d, -resyn_d),
#     by = "grappe"
#   ) |> 
#   select(-c(totalsegment, segmentselect, segmented, segmos, 
#             adj_prob1, adj_prob2)) |> 
#   left_join(hh_selection_probs) |> 
#   mutate(hhbwt0 = 1/( (1/psuwt0) * hh_selection_prob)) %>% 
#   ungroup() %>% 
#   arrange(grappe)
# 
# 
# summary(selected_hh$hhbwt0)
# hist(selected_hh$hhbwt0, breaks = 40)
# 
# selected_hh |> 
#   group_by(strata) |> 
#   summarise(min = min(hhbwt0),
#             low_Q = quantile(hhbwt0, 0.25),
#             medianwt = median(hhbwt0),
#             meanwt = mean(hhbwt0),
#             upp_Q = quantile(hhbwt0, 0.75),
#             max = max(hhbwt0))
# 
# 
# selected_hh |> 
#   ggplot(aes(y = hhbwt0)) +
#   facet_wrap(~strata)+
#   geom_boxplot() +
#   theme_bw() +
#   labs(x = "Household Base Weight",
#        y = "Region")