This notebook contains organised code for each step of the weighting process for the CIPHIA2 data.

The major steps and progress (at 20/03/2022) are as follows:

  1. [done] Load and organise/join the required datasets
  2. [done] Calculate PSU weights
  3. [done] Calculate household weights
  4. [started] Calculate person-level interview weights
  5. [not started] Calculate person-level blood test weights
  6. [started] Create jackknife replicate weights

1 Calculate EA weights

1.1 What Are Base Weights?

Base weights compensate for unequal selection probabilities due to multi-stage sampling designs.
They are the inverse of the probability of selection at each stage. Make reference to the CIPHIA2 second stage sample, the base weight is psuwt0 = column(ES) = ADJ_PSU_WT, while the ADJ_PSU_WT = 1/column(ER) = 1/ADJ_PROB2

\[ \text{PSU weight} = \frac{1}{\text{Selection Probability}} \]

1.2 Loading data to calculate base weights

load second stage sample, remove duplicates in terms of EAs so that you are left with 396 EAs. The reason for using second stage sample instead of 1st stage sample is that the 2nd stage sample has all information from 1st stage and from the household listing

# This section is to import packages
# and prepare for importing data
rm(list = ls())



# clean/remove everything
#install.packages("pacman")
# we use pacman to import all packages
pacman::p_load(tidyverse,
               haven,
               rio, 
               ggplot2, 
               janitor, 
               survey, 
               lubridate, 
               readxl,
               BART,
               future,
               furrr,
               progressr,
               caret
               
)

# create path to the datasets
civ2_sample_dir <- "C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Sampling/2nd_stage_sampling/"

###################################################################

# read_excel comes from the package readxl, str_c means concatenate 2 strings as shown below
# if you press shift+ctr+M you will get a pipe = %>% 
# if you press ctr+s you will save your script, its a shortcut for saving 
# you can google shortcuts for Rstudio, it will give you a whole list of all these shortcuts
# clean_name comes from the package janitor, the purpose of clean_name is to make your variable names proper
civ2_zd_in <- read_excel(str_c(civ2_sample_dir,"CIPHIA2_2nd_stage_sample_09_07_2024_v4.xlsx")) %>%
  clean_names() %>% 
  distinct(ea_code, .keep_all=TRUE) 

1.3 Exploratory plots

psu_base_weights <- civ2_zd_in |> 
  mutate(psuwt0 = adj_psu_wt)


psu_base_weights |> 
  ggplot(aes(x = factor(strata),
             y = psuwt0)) +
  geom_boxplot() +
  labs(x = "Stratum/Region",
       y = "PSU base weight") +
  theme_bw()

psu_base_weights %>% 
  group_by(nom_reg) |> 
  mutate(psu_num_instrata = row_number()) |> 
  ggplot(aes(x = psu_num_instrata,
             y = psuwt0, 
             fill = factor(nom_milieu, levels = c(1,2), 
                           labels = c("Urban", "Rural")))) +
  geom_col() +
  facet_wrap(~nom_reg) +
  theme_bw() +
  labs(fill = "Urban/Rural",
       x = "PSU number within stratum",
       y = "PSU base weight")

2 Household base weights

There are several steps for the household weights:

2.1 Calculate the within-EA household sampling weight.

We need the counts of households listed and selected to compute the household selection probability and weight.

The sample design was to take a systematic random sample of an average of 35 households from each EA. The household selection probability, \(p_{hh,i}\) in EA \(i\) is then

\[ p_{hh,i} = \frac{35}{N_{\textrm{elig},i}}, \]

where \(N_{\textrm{elig},i}\) is the number of eligible households in EA \(i\).

civ2_zd_in1 <- read_excel(str_c(civ2_sample_dir,"CIPHIA2_2nd_stage_sample_09_07_2024_v4.xlsx")) %>%
  clean_names()
# create the flag of hhs that were sampled
hh_sample_file_in <- civ2_zd_in1 %>%
  rename(hh_smpflg = flag_ea) %>%
  mutate(hh_smpflg = ifelse(is.na(hh_smpflg), 0, hh_smpflg))
# hh_smpglg, create a household flag of the those EAs that were selected


# creer hhs probs
hh_selection_probs <- hh_sample_file_in |> 
  group_by(strata) |> 
  summarise(hhcount = n(),
            selected_hh_count = sum(hh_smpflg == "1")) |> 
  mutate(hh_selection_prob = selected_hh_count / hhcount)

# summary(hh_selection_probs$hh_selection_prob)
# hist(hh_selection_probs$hh_selection_prob, breaks = 40)



selected_hh <- hh_sample_file_in |> 
  filter(hh_smpflg == "1") |> 
  select(cod_distric, nom_distric, strata, grappe, eahhid, stobs_d, stvac_d, resyn_d, hh_smpflg) |> 
  left_join(
    psu_base_weights |> select(-cod_distric, -nom_distric, -strata, -eahhid, -stobs_d, -stvac_d, -resyn_d),
    by = "grappe"
  ) |> 
  select(-c(totalsegment, segmentselect, segmented, segmos, 
            adj_prob1, adj_prob2)) |> 
  left_join(hh_selection_probs) |> 
  mutate(hhbwt0 = 1/( (1/psuwt0) * hh_selection_prob)) %>% 
  ungroup() %>% 
  arrange(grappe)
## Joining with `by = join_by(strata)`
summary(selected_hh$hhbwt0)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.222  66.708 104.295 120.957 165.029 379.270
hist(selected_hh$hhbwt0, breaks = 40)

selected_hh |> 
  group_by(strata) |> 
  summarise(min = min(hhbwt0),
            low_Q = quantile(hhbwt0, 0.25),
            medianwt = median(hhbwt0),
            meanwt = mean(hhbwt0),
            upp_Q = quantile(hhbwt0, 0.75),
            max = max(hhbwt0))
## # A tibble: 14 × 7
##    strata   min low_Q medianwt meanwt upp_Q   max
##     <dbl> <dbl> <dbl>    <dbl>  <dbl> <dbl> <dbl>
##  1      1 32.1   75.0    103.   117.  134.  379. 
##  2      2 18.4   23.4     27.4   32.5  43.5  54.5
##  3      3 43.6   88.6    144.   142.  204.  254. 
##  4      4 46.0  101.     130.   145.  199.  228. 
##  5      5  7.22  10.7     22.1   20.8  28.4  35.2
##  6      6 56.0  114.     155.   148.  183.  218. 
##  7      7 83.0  155.     198.   193.  224.  305. 
##  8      8 37.2   57.4     73.7   80.4  97.2 141. 
##  9      9 68.1  100.     122.   145.  167.  321. 
## 10     10 56.9  108.     169.   163.  198.  277. 
## 11     11 33.6   88.0    117.   160.  223.  378. 
## 12     12 74.8  165.     210.   201.  250.  282. 
## 13     13 28.7   46.0     67.0   71.0  87.3 121. 
## 14     14 39.0   52.3     77.7   97.5 121.  298.
selected_hh |> 
  ggplot(aes(y = hhbwt0)) +
  facet_wrap(~strata)+
  geom_boxplot() +
  theme_bw() +
  labs(x = "Household Base Weight",
       y = "Region")

selected_hh1 <- selected_hh %>%
  distinct(grappe, .keep_all = TRUE) %>%
  rename(hhr_eacode1=unicite) %>% 
  mutate(
    hhr_eacode1 = as.character(hhr_eacode1), # Convert to character
    hhr_eacode1 = case_when(
      nchar(hhr_eacode1) == 9 ~ paste0("00", hhr_eacode1),  # Add 2 leading zeros if 1 digits
      nchar(hhr_eacode1) == 10 ~ paste0("0", hhr_eacode1),   # Add 1 leading zero if 2 digits
      TRUE ~ hhr_eacode1 # Keep as is if 10 digits
      
    ),
    hhr_eacode = str_c("CI",cod_distric, hhr_eacode1)
  ) %>%
  rename(HHI_EACODE=hhr_eacode) %>% 
  select(HHI_EACODE, hhbwt0)   # Keep only necessary columns



# View the transformed data
head(selected_hh1)
## # A tibble: 6 × 2
##   HHI_EACODE      hhbwt0
##   <chr>            <dbl>
## 1 CI0101002026030  125. 
## 2 CI0101002046014  149. 
## 3 CI0101002056042   41.0
## 4 CI0101002020069   78.5
## 5 CI0101002020182   62.3
## 6 CI0101002020256   32.1

2.2 Calculate the weight adjustment for households of unknown eligibility

The variable UPCODE_STAT_HH on the household interview file contains the final response status for each household. The codes are:

UPCODE_STAT_HH Definition
1 Respondent
2 Non-respondent
3 Ineligible
4 Unknown eligibility

In this step, the weights of the households with unknown eligibility are re-distributed to the other households in the EA. The adjustment factor in EA \(i\) is computed as

\[ A^{\textrm{HH elig}}_i = \Sigma_{j=1}^{n_i} W_{ij} / \Sigma_{j = 1}^{n_{i,1} + n_{i,2} + n_{i,3}} W_{ij}. \] The values \(n_{i,1-3}\) are the number of households in EA \(i\) with final status equal to 1, 2, or 3, as defined in the table above. The total number of selected households is \(n_i\), and \(W_{ij}\) is the household base weight of household \(j\) in EA \(i\). Note that because of the sample design, all households in each EA have the same selection probability/weight, so the value of this weight cancels out in computing the adjustment factor.

indir <- "C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Data/"

civ2_hh <- import(str_c(indir, "CI_FFcorr_hh_int_DRAFT_20250325.xlsx"),
                  guess_max = 10000) |> 
  filter(is.na(DMFLAG) | DMFLAG != "DISCARD")

### Household Status ###

# civ2_hh |> 
#   count(RESULTNDT,
#         UPCODE_RESLTNDT,
#         STARTINT, 
#         HHELIG, 
#         HHCONSTAT, 
#         HHQDTHSINS, 
#         ROSTER_START, 
#         HHQINSHH, 
#         HHQASSIGN_INST)


civ2_hhstatus <- civ2_hh |> 
  mutate(
    COMB_UPCODED_RESLTNDT = coalesce(as.numeric(UPCODE_RESLTNDT), as.numeric(RESULTNDT)),
    UPCODE_STAT_HH = case_when(
    # Responded
    is.na(RESULTNDT) & (STARTINT == 1 & HHELIG == 1 & HHCONSTAT == 1 &
                          !is.na(HHQDTHSINS) & !is.na(ROSTER_START) & !is.na(HHQINSHH) &
                          !is.na(HHQASSIGN_INST)) ~ 1,
    is.na(RESULTNDT) & (STARTINT == 4 & !is.na(ROSTER_START)) ~ 1,
    # Non-responding
    COMB_UPCODED_RESLTNDT %in% c(1, 2, 7, 8 , 9) ~ 2,
    is.na(RESULTNDT) & (HHELIG == 2 | HHCONSTAT %in% c(2, 3) | (HHELIG == 1 & is.na(HHCONSTAT)) |
                          STARTINT == 4 & is.na(ROSTER_START)) ~ 2,
    # Inelgible
    COMB_UPCODED_RESLTNDT %in% c(3, 4, 6) ~ 3,
    # Unknown
    is.na(COMB_UPCODED_RESLTNDT) | COMB_UPCODED_RESLTNDT %in% c(5, 96) ~ 4,
    .default = 4
    )
  )
    
# Checking HH status derivation
# 
# civ2_hhstatus |> 
#   count(UPCODE_STAT_HH,
#         RESULTNDT,
#         UPCODE_RESLTNDT,
#         STARTINT, 
#         HHELIG, 
#         HHCONSTAT, 
#         HHQDTHSINS, 
#         ROSTER_START, 
#         HHQINSHH, 
#         HHQASSIGN_INST)

civ2_hhstatus |> 
  tabyl(UPCODE_STAT_HH)
##  UPCODE_STAT_HH     n    percent
##               1 11879 0.85485032
##               2  1220 0.08779505
##               3   778 0.05598733
##               4    19 0.00136730
# Proportion in each household status category
civ2_hhstatus |> 
  rename(hh_status=UPCODE_STAT_HH) %>% 
  ggplot(aes(x = factor(HHI_REGION), 
             fill = factor(hh_status, levels = 1:4,
                           labels = c("Eligible Respondent",
                                      "Eligible Non-respondent",
                                      "Ineligible Household",
                                      "Unknown Eligibility")))) +
  geom_bar(position = position_fill(reverse = TRUE)) +
  scale_fill_viridis_d() +
  labs(x = "Region",
       y = "Proportion",
       fill = "hh_status") +
  theme_bw()

hh_uknwn_adj <- civ2_hh %>%
  filter(!is.na(HHI_EACODE)) %>%
  select(HHI_EACODE, HHI_REGION, UPCODE_STAT_HH) %>%
  left_join(selected_hh1) %>% 
  group_by(HHI_EACODE) %>% 
  mutate(
    hhstatus = UPCODE_STAT_HH,

    # Calculate adjustment
    hh_adj_one = sum(hhbwt0) / ( sum(hhbwt0 * (hhstatus == 1)) + 
                               sum(hhbwt0 * (hhstatus == 2)) + 
                               sum(hhbwt0 * (hhstatus == 3))
    ),
    # Calculate weights 
    hh_wt_one = case_when(hhstatus %in% c(1, 2) ~ hh_adj_one * hhbwt0,
                      hhstatus %in% c(3, 4) ~ 0)
  ) %>% 
  ungroup()
## Joining with `by = join_by(HHI_EACODE)`

2.3 Compute non-response adjusted household weight

Non-response adjustment for households is done by dividing the responding household weights by the response rate in each EA. The non response adjustment factor is

\[ A^{\textrm{HH NR}}_i = \Sigma_{j = 1}^{n_{i,1} + n_{i,2}} W^{\textit{hh2}}_{ij} / \Sigma_{j=1}^{n_i} W^{hh2}_{ij}, \] where \(W^{\textit{hh2}}_{ij}\) is the eligibility-adjusted weight of household \(j\) in EA \(i\), and the other quantities are as in the previous equation.

hh_nr_adj <- hh_uknwn_adj %>%
    group_by(HHI_EACODE) %>% 
  mutate(hh_adj_two = sum(hh_wt_one) / ( sum(hh_wt_one * (hhstatus == 1))),
         hh_wt_two = if_else(hhstatus == 1, 
                                  hh_adj_two * hh_wt_one,
                                  0)) %>% 
  ungroup()


final_hh_weights <- hh_nr_adj %>% 
  rename(EAID=HHI_EACODE, hhwt0=hh_wt_two )
hh_nr_adj |> 
  distinct(HHI_EACODE, .keep_all = TRUE) |> 
  ggplot(aes(x = hh_adj_two,
             y = HHI_REGION)) +
  geom_boxplot() +
  theme_bw() +
  labs(x = "HH adjustment factor two",
       y = "Region")

3 Calculating person-level interview weights

3.1 Load roster and individual data

civ2_csproroster <- import(str_c(indir, "CI_FFcorr_roster_DRAFT_20250325.xlsx"),
                  guess_max = 10000) |> 
  filter(is.na(DMFLAG) | DMFLAG != "DISCARD") 

civ2_csproindiv <- import(str_c(indir, "CI_FFcorr_ind_int_DRAFT_20250325.xlsx"),
                  guess_max = 10000) |> 
  filter(is.na(DMFLAG) | DMFLAG != "DISCARD") %>% 
  mutate(roster_elig=PRIMARY_ROSTER_STATUS) # we need to check this
  
#  mutate(roster_elig=PRIMARY_ROSTER_STATUS) # need to relook a this, could not find the same variable as RUPHIA
# just used PRIMARY_ROSTER_STATUS as a placeholder for roster_elig
roster_status_vars <- civ2_csproroster %>%
  mutate(indiv_status = ROSTERED_FINAL_STATUS) %>% 
  left_join(select(civ2_csproindiv, EA_HHID_LN_FIXED, INDSTATUS,roster_elig)) %>%
  select(roster_sex = SEX, 
         roster_age = AGEYEARS,
         RELATTOHH, SLEEPHERE,
         EA_HHID_FIXED, EA_HHID_LN_FIXED, 
         indiv_status, ELIGIBLE1, HHR_EACODE,INDSTATUS
  )
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
indiv_status_vars <- civ2_csproindiv %>%
  select(IND_REGION, IND_DISTRICT, IND_EACODE, 
         EA_HHID_FIXED, EA_HHID_LN_FIXED, INDISPSTATUS, CONFAGEY)

# AGEYEARS, UPCODE_IND0040,roster_elig cannot find
roster_ind_status_vars <- roster_status_vars %>%
  mutate(EAID = as.character(HHR_EACODE)) %>%
  left_join(select(final_hh_weights, EAID, hhwt0)) %>%
  left_join(indiv_status_vars) %>%
  mutate(
    roster_age_gt_15 = if_else(roster_age >= 15, 1, 0, missing = -9),
    int_age_gt_15 = if_else(CONFAGEY >= 15, 1, 0, missing = -9),
    resident_status_roster = case_when(SLEEPHERE == 1 & roster_age >= 15 ~ "De facto person 15 years or older",
                                       SLEEPHERE != 1 & roster_age >= 15 ~ "Non-de facto person 15 years or older",
                                       roster_age < 15 ~ "Person under 15 years",
                                       TRUE ~ "OTHER"),
    status_for_table = case_when(SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == 1 ~ 1,
                                 SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == 0 ~ 2,
                                 SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == -9 ~ 3,
                                 SLEEPHERE != 1 & roster_age >= 15 ~ 4,
                                 roster_age < 15 ~ 5,
                                 TRUE ~ 99999),
    # Give ineligible people indiv_status = 9 and INDSTATUS = 9
    INDSTATUS = if_else(ELIGIBLE1 == 2, 90, INDSTATUS),
    indiv_status = if_else(ELIGIBLE1 == 2, 9, indiv_status))
## Joining with `by = join_by(EAID)`
## Joining with `by = join_by(EA_HHID_FIXED, EA_HHID_LN_FIXED)`
table_counts <- roster_ind_status_vars %>% 
  mutate(status_for_table = factor(status_for_table, levels = 1:5),
         int_age_desc = case_when(
           status_for_table %in% c(4,5) ~ "NA",
           int_age_gt_15 == 1 ~ "15+",
           int_age_gt_15 == 0 ~ "Under 15",
           int_age_gt_15 == -9 ~ "Unknown")) %>%
  group_by(status_for_table, resident_status_roster, int_age_desc, .drop = FALSE) %>% 
  summarise(rostered_persons = n(), 
            wgt_number_rostered = sum(hhwt0)) %>%
  mutate(resident_status_roster = if_else(status_for_table != 2, resident_status_roster, "De facto person 15 years or older"),
         int_age_desc = if_else(status_for_table != 2, int_age_desc, "Under 15")) %>%
  adorn_totals(where = "row")
## `summarise()` has grouped output by 'status_for_table',
## 'resident_status_roster'. You can override using the `.groups` argument.
print(table_counts)
##  status_for_table                resident_status_roster int_age_desc
##                 1     De facto person 15 years or older          15+
##                 2     De facto person 15 years or older     Under 15
##                 3     De facto person 15 years or older      Unknown
##                 4 Non-de facto person 15 years or older           NA
##                 5                 Person under 15 years           NA
##             Total                                     -            -
##  rostered_persons wgt_number_rostered
##           1026414         124443888.3
##                 0                 0.0
##               648             63702.5
##            150685          18753120.5
##            665478          82891635.8
##           1843225         226152347.1
int_wgts_phase_one_input <- select(civ2_csproroster, EA_HHID_FIXED, EA_HHID_LN_FIXED, HHR_EACODE, SEX, AGEYEARS, SLEEPHERE) %>%
  left_join(select(civ2_csproindiv, EA_HHID_LN_FIXED, CONFAGEY)) %>%
  mutate(EAID = as.character(HHR_EACODE),
         roster_age_gt_15 = case_when(AGEYEARS >= 0 & AGEYEARS < 15 ~ 0,
                                      AGEYEARS >= 15 ~ 1,
                                      TRUE ~ -9),
         int_elig_status = case_when(roster_age_gt_15 == 1 & CONFAGEY >= 15 ~ 1,
                                     roster_age_gt_15 == 1 & CONFAGEY >= 0 & CONFAGEY < 15 ~ 2,
                                     roster_age_gt_15 == 1 ~ 3,
                                     roster_age_gt_15 == 0 ~ 4,
                                     TRUE ~ 99999),
         age_sex_cell = case_when(#SLEEPHERE != 1 ~ "Non-de facto",
           SEX == 1 & roster_age_gt_15 == 1 & AGEYEARS < 18 ~ "Male 15-17",
           SEX == 1 & roster_age_gt_15 == 1 & AGEYEARS >= 18 ~ "Male 18+",
           SEX == 2 & roster_age_gt_15 == 1 & AGEYEARS < 18 ~ "Female 15-17",
           SEX == 2 & roster_age_gt_15 == 1 & AGEYEARS >= 18 ~ "Female 18+",
           AGEYEARS < 15 ~ "Under 15",
           TRUE ~ "Missing/Unknown")) %>%
  left_join(select(final_hh_weights, EAID, hhwt0))
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
## Joining with `by = join_by(EAID)`
# Checks
# zz<-indiv_wgts_phase_one_input %>% group_by(SLEEPHERE, AGEYEARS, CONFAGEY, int_elig_status) %>% tally()
# indiv_wgts_phase_one_input %>% group_by(int_elig_status, age_sex_cell) %>% tally()

int_wgts_phase_one_adj <- int_wgts_phase_one_input %>%
  filter(int_elig_status %in% c(1,2,3)) %>%
  group_by(age_sex_cell, int_elig_status) %>%
  summarise(count = n(),
            weighted_count = sum(hhwt0)) %>%
  pivot_wider(values_from = c(count, weighted_count), 
              names_from = int_elig_status, 
              names_glue = "status_{int_elig_status}_{.value}",
              values_fill = 0) %>%
  mutate(phase_one_int_adj_factor = (status_1_weighted_count + status_3_weighted_count)/status_1_weighted_count)
## `summarise()` has grouped output by 'age_sex_cell'. You can override using the
## `.groups` argument.
int_wgts_phase_one_adjusted <- int_wgts_phase_one_input %>%
  left_join(select(int_wgts_phase_one_adj, age_sex_cell, phase_one_int_adj_factor)) %>%
  mutate(phase_one_adj_int_wgt = hhwt0 * phase_one_int_adj_factor)
## Joining with `by = join_by(age_sex_cell)`
int_wgts_phase_one_adjusted %>% group_by(age_sex_cell) %>%
  summarise(mean_weight = mean(phase_one_adj_int_wgt))
## # A tibble: 5 × 2
##   age_sex_cell mean_weight
##   <chr>              <dbl>
## 1 Female 15-17        148.
## 2 Female 18+          137.
## 3 Male 15-17          140.
## 4 Male 18+            143.
## 5 Under 15             NA

##Interview non-response adjustment

*remove all the process variables first and remain with the important variables

# Helper function to split out the multiple-select options which have 
# concatenated values like "ABD", "AE", "F" etc.
expand_mult <- function(x, i) {
  if_else(grepl(as.character(i), x, fixed = TRUE), 1, 0)
}

# Select and tidy up all useful household variables
hhvars <- civ2_hhstatus %>% 
  filter(UPCODE_STAT_HH == 1) %>%
  select(36:146, 
         SICK_HOUSEHOLD, contains("EA_HHID")) %>%
  # Correcting/filling count variables
  mutate(DEATHCOUNT = replace_na(DEATHCOUNT, 0),
         # Expand multiple selection
         across(c(HHQITEMS, HHQOWN), list(~expand_mult(.x, "A"),
                                          ~expand_mult(.x, "B"),
                                          ~expand_mult(.x, "C"),
                                          ~expand_mult(.x, "D"),
                                          ~expand_mult(.x, "E"),
                                          ~expand_mult(.x, "F"),
                                          ~expand_mult(.x, "G"),
                                          ~expand_mult(.x, "Y"),
                                          ~expand_mult(.x, "Z")),
                .names = "{.col}{.fn}"),
         # Fill NAs with no/missing where appropriate
         across(c(DEATHS), ~if_else(.x %in% c(-8, -9), 2, .x)),
         across(ends_with("DUR"), ~if_else(.x %in% c(-8,-9), 0, .x)),
         across(ends_with("NUM"), ~if_else(.x == -7, 1, .x)),
         #  across(c(HHEMANCMINOR, HHQFOODFRQ, HHQHUNG, HHQHUNGFRQ, HHQHUNGDAY,
         #           ends_with("DUR")), ~replace_na(.x, 0)),
         across(c(TOILETSHARE), ~replace_na(.x, 2))) %>%
  # Variables to drop
  select(-c(HHCNTSTATUS, HHQITEMS, HHQOWN, HHCAGREEDT, HHSTFSIG, HHRINS4,
            contains("HHCONST"),
            starts_with(c("DIED", "DTH", "INELIG")))) %>%
  # Merge don't know and refused
  mutate(across(where(is.numeric), ~if_else(.x == -8, -9, .x))) %>%
  # For duration/count variables, combine into a simple yes/no to avoid small cell sizes
  mutate(across(ends_with(c("DUR","NUM")), ~if_else(.x > 0, 1, 0))) %>%
  # Remove variables with only one value
  select(where(function(col) length(unique(col)) > 1))


# Select and tidy up all useful roster variables
roster_vars <- civ2_csproroster %>%
  # Variables to drop
  select(-c(HHR_SURVEYFORMSDT, HHR_HHI_UUID, HHR_DEVICEID, STAFFIDHH,
            HHRSTS, HHR_HHMEM_ID, HHRETS, HHDRSTS, INTRODR, AGEMONTHS,
            HHDRETS, MINORLINENUM, HHCRSTS, HHCRETS, DMFLAG, DMCOMMENT,
            HHRINS5, HHQOVCINS1, HHQSUPPSTART, HHQOVCEND, OVCLINENUM,
            LIVEAWAYM, LIVEAWAYY, LIVECOUNTRY, 
            LIVESUBPF, LIVESUBPFLIVECOUNTRYOTH, 
            CORRECT_LN, HHQASSIGN_INST, HHQASSIGN, WITHDRAWHH,
            GRAPPE, EA_HHID_FIXED, HHR_EACODE, HHR_SHH, HHR_DISTRICT, 
            HHR_REGION, HHR_DEPARTEMENT, HHR_SOUSPREFID, HHR_EACODE_ORIG, 
            HHR_SHH_ORIG, HHR_TEAM_ID, CODEMILIEU, NOM_MILIEU,
            NATMOMNM, MOMFEMNAME, DADMALENAME, MALEGUARDNAME,
            RELATTOHH, ROSTERED_FINAL_STATUS, ELIGIBLE1,
            HHCCONSEL, HHCEMANC, HHCCONT0))%>%
  # Fill NAs with no/missing where appropriate
  mutate(across(c(SICK3MO), ~replace_na(.x, -8)),
         across(c(EMANCIPATED, LIVEAWAY, starts_with("SUPPORT")), ~replace_na(.x, 2)),
         across(c(VULNERABLE_CHILD), ~replace_na(.x, 0))) %>%
  # Merge don't know and refused
  mutate(across(where(is.numeric), ~if_else(.x == -8, -9, .x))) %>%
  # Remove variables with only one value
  select(where(function(col) length(unique(col)) > 1)) %>%
  left_join(select(indiv_status_vars, EA_HHID_LN_FIXED)) #%>%
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
#filter(roster_elig == 1)

# Create binary interview response variable
interview_response <- civ2_csproindiv %>%
  filter(roster_elig == 1) %>%
  mutate(ind_responded = if_else(INDIVCNTSTATUS == 1, 1, 0)) %>%
  # not sure about the INDIVCNTSTATUS
  select(ind_responded, EA_HHID_FIXED, EA_HHID_LN_FIXED, roster_elig)

# Join together all the variables and filter to only 
# de facto eligible persons.
nr_vars <- hhvars %>%
  left_join(interview_response) %>%
  left_join(roster_vars)
## Joining with `by = join_by(EA_HHID_FIXED)`
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
#  filter(roster_elig == 1) # left this out for now

# Add recodes for the teen years variable which is used to split up the
# data for the non-response model.
nr_vars_recodes <- nr_vars %>%
  mutate(H_AGETEENYEARS = if_else(AGEYEARS >= 15 & AGEYEARS <= 17, 1, 2))

3.2 Model by sex & age years variables

# Make the appropriate variables numeric and the rest factors.
# This is required for the BART model fitting to handle the 
# data correctly.
int_nr_data_in <- nr_vars_recodes %>%
#  select(-INDSTATUS, -ROSTERED_FINAL_STATUS, -roster_elig, -ELIGIBLE1) %>%
  select(H_AGETEENYEARS, SEX, EA_HHID_FIXED, EA_HHID_LN_FIXED, ind_responded, everything()) %>%
  arrange(H_AGETEENYEARS, SEX, EA_HHID_FIXED, EA_HHID_LN_FIXED) %>%
  mutate(age_group = case_when(AGEYEARS < 18 ~ AGEYEARS,
                               between(AGEYEARS, 18, 24) ~ 1,
                               between(AGEYEARS, 25, 29) ~ 2,
                               between(AGEYEARS, 30, 39) ~ 3,
                               between(AGEYEARS, 40, 49) ~ 4,
                               between(AGEYEARS, 50, 59) ~ 5,
                               AGEYEARS >= 60  ~ 6,
                               TRUE ~ 99),
         hh_age_group = case_when(between(HHAGEYEARS, 15, 24) ~ 1,
                                  between(HHAGEYEARS, 25, 29) ~ 2,
                                  between(HHAGEYEARS, 30, 39) ~ 3,
                                  between(HHAGEYEARS, 40, 49) ~ 4,
                                  between(HHAGEYEARS, 50, 59) ~ 5,
                                  HHAGEYEARS >= 60  ~ 6,
                                  TRUE ~ 99),
         across(-c(starts_with("TOTAL"), ends_with("NUM"), ends_with("DUR"),
                   DEATHCOUNT, AGEYEARS, ROOMSLEEP, HHAGEYEARS, ind_responded), as_factor),
         across(c(starts_with("TOTAL"), ends_with("NUM"), ends_with("DUR"),
                   DEATHCOUNT, AGEYEARS, ROOMSLEEP, HHAGEYEARS, ind_responded))) %>% 
  select(-SLEEPHERE, -DEATHCOUNT, -HHEMANCMINOR, -LIVEAWAY,
         -AGEYEARS, -HHAGEYEARS)

## The following code prepares a model matrix for interview non response

# For variables which are already two-leveled
# int_binary_vars <- int_nr_data_in %>% 
#   select(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded) %>% 
#   select(EA_HHID_LN_FIXED, where(function(col) length(unique(col)) == 2), -ind_responded) %>%
#   mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.numeric)) %>%
#   mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), ~if_else(.x == 2, 0, .x))) %>%
#   mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.factor))

int_binary_vars <- int_nr_data_in %>% 
  select(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, where(function(col) length(unique(col)) == 2), -ind_responded) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.numeric)) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), ~if_else(.x == 2, 0, .x))) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.factor))

# Non-binary variables
int_nr_data_mod_matrix_in <- int_nr_data_in %>%
  select(where(function(col) length(unique(col)) != 2), ind_responded) %>%
  select(-contains("EA_HHID"))

dmy_int_nr_data <- dummyVars("ind_responded ~ .", data = int_nr_data_mod_matrix_in)
df_int_nr_data = int_nr_data_mod_matrix_in

model_matrix_int_nr <- predict(dmy_int_nr_data, newdata = df_int_nr_data) %>%
  as_tibble() %>%
  cbind(ind_responded = int_nr_data_mod_matrix_in$ind_responded) %>%
  cbind(EA_HHID_LN_FIXED = int_nr_data_in$EA_HHID_LN_FIXED) %>%
  left_join(int_binary_vars) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), as.factor))
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
## Identify/remove variables from non-response model where proportion
## or number in the smallest category very low (eg < 10%)

variable_response_props_int <- model_matrix_int_nr %>%
  pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
               names_to = "variable", values_to = "value") %>%
  group_by(H_AGETEENYEARS, SEX, variable, value) %>%
  summarise(respondents = sum(ind_responded),
            totalN = n(),
            rrate = respondents / totalN) %>%
  mutate(category_proportion = totalN / sum(totalN))
## `summarise()` has grouped output by 'H_AGETEENYEARS', 'SEX', 'variable'. You
## can override using the `.groups` argument.
int_nr_vars_dropped <- variable_response_props_int %>%
  mutate(drop = if_else(!grepl("age", variable) & 
                          (totalN < 25 | category_proportion < 10/100), 1, NA_real_)) %>%
  filter(drop == 1)
  

# final_int_nr_model_matrix <- model_matrix_int_nr %>%
#   pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
#                names_to = "variable", values_to = "value") %>%
#   left_join(select(int_nr_vars_dropped, H_AGETEENYEARS, SEX, variable, drop)) %>%
#   filter(is.na(drop)) %>%
#   pivot_wider(id_cols = c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
#               names_from = variable, values_from = value) %>%
#   mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
#                  ~replace_na(as.numeric(.x), 0)))


final_int_nr_model_matrix <- model_matrix_int_nr %>%
  mutate(ind_responded = if_else(is.na(ind_responded), 0, ind_responded)) %>% 
  pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
               names_to = "variable", values_to = "value") %>%
  left_join(select(int_nr_vars_dropped, H_AGETEENYEARS, SEX, variable, drop)) %>%
  filter(is.na(drop)) %>%
  group_by(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded, variable) %>%
  summarise(value = first(value), .groups = "drop") %>%  # pick first value if multiple
  pivot_wider(id_cols = c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
              names_from = variable, values_from = value) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
                ~replace_na(as.numeric(.x), 0)))
## Joining with `by = join_by(H_AGETEENYEARS, SEX, variable)`
# Function to fit a single model
# Input needs to contain the predictors and the response variable
fit_bart_model_int <- function(data, nskip = 50, ndpost = 100) {
  
  predictor_matrix <- data %>%
    select(where(function(col) length(unique(col)) > 1)) %>%
    remove_empty(which = "cols") %>%
    select(-c(EA_HHID_LN_FIXED, ind_responded)) %>%
    data.frame()
  
  response <- data$ind_responded
  
  # bartFit <- pbart(predictor_matrix, response, binaryOffset = mean(response),
  #                nskip = nskip, ndpost = ndpost)
  bartFit <- lbart(predictor_matrix, response, binaryOffset = mean(response),
                   nskip = nskip, ndpost = ndpost)
}

# Function to just output the fitted response probabilities
fit_response_probabilities_int <- function(data) {
  
  bartFit <- fit_bart_model_int(data)
  
  response <- data$ind_responded
  response_probability <- bartFit$prob.train.mean
  
  tibble(data$EA_HHID_LN_FIXED, response, response_probability)
}

# nested <- final_int_nr_model_matrix %>%
#   group_by(H_AGETEENYEARS, SEX) %>%
#   tidyr::nest() %>%
#   mutate(bart = lapply(data, fit_response_probabilities))
# 
# response_probs <- nested %>%
#   select(-data) %>%
#   unnest(cols = bart) %>%
#   rename(EA_HHID_LN_FIXED = `data$EA_HHID_LN_FIXED`)


#  parallel workers
plan(multisession, workers = parallel::detectCores() - 1)

# Set the global handler once — outside everything
handlers("progress")

# Clean and nest
nested_int <- final_int_nr_model_matrix %>%
  filter(!is.na(H_AGETEENYEARS), !is.na(SEX)) %>%
  group_by(H_AGETEENYEARS, SEX) %>%
  nest() %>%
  mutate(data = map(data, ~ filter(.x, !is.na(ind_responded)))) %>%
  filter(map_int(data, nrow) > 1)

# Safe BART wrapper
safe_fit_bart <- possibly(fit_bart_model_int, otherwise = NULL)

# Running in parallel with progress
with_progress({
  p <- progressor(along = nested_int$data)
  
  nested_int <- nested_int %>%
    mutate(bart = future_map(data, function(x) {
      p()  # progress update
      safe_fit_bart(x, nskip = 20, ndpost = 100)
    }))
})
## *****Into main of lbart
## *****Data:
## data:n,p,np: 1004, 798, 0
## y1,yn: 1, 1
## x1,x[n*p]: 1.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 20, 100
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,798,0
## *****nkeeptrain,nkeeptest: 100, 100
## *****printevery: 100
## 
## MCMC
## done 0 (out of 120)
## done 100 (out of 120)
## time: 4s
## check counts
## trcnt,tecnt: 100,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 1173, 766, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 20, 100
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,766,0
## *****nkeeptrain,nkeeptest: 100, 100
## *****printevery: 100
## 
## MCMC
## done 0 (out of 120)
## done 100 (out of 120)
## time: 4s
## check counts
## trcnt,tecnt: 100,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 11268, 166, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 20, 100
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,166,0
## *****nkeeptrain,nkeeptest: 100, 100
## *****printevery: 100
## 
## MCMC
## done 0 (out of 120)
## done 100 (out of 120)
## time: 36s
## check counts
## trcnt,tecnt: 100,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 12689, 267, 0
## y1,yn: 1, 1
## x1,x[n*p]: 1.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 20, 100
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,267,0
## *****nkeeptrain,nkeeptest: 100, 100
## *****printevery: 100
## 
## MCMC
## done 0 (out of 120)
## done 100 (out of 120)
## time: 41s
## check counts
## trcnt,tecnt: 100,0
response_probs_int <- nested_int %>%
  mutate(prob_df = map2(data, bart, function(d, model) {
    if (is.null(model)) {
      return(tibble(
        EA_HHID_LN_FIXED = d$EA_HHID_LN_FIXED,
        response_probability_int = NA_real_
      ))
    } else {
      tibble(
        EA_HHID_LN_FIXED = d$EA_HHID_LN_FIXED,
        response_probability_int = model$prob.train.mean
      )
    }
  })) %>%
  select(prob_df) %>%
  unnest(prob_df)
## Adding missing grouping variables: `H_AGETEENYEARS`, `SEX`
response_probs_int_full <- final_int_nr_model_matrix %>%
  select(-H_AGETEENYEARS, -SEX) %>% 
  left_join(response_probs_int, by = "EA_HHID_LN_FIXED") %>% 
  select(SEX, H_AGETEENYEARS, EA_HHID_LN_FIXED, ind_responded, response_probability_int) %>%
  ungroup()


nr_adj_int_weights <- response_probs_int_full %>%
  mutate(SEX = as.numeric(SEX)) %>%
  left_join(int_wgts_phase_one_adjusted) %>%
  mutate(nr_adj_int_wgt = if_else(ind_responded == 1, phase_one_adj_int_wgt / response_probability_int, NA_real_))
## Joining with `by = join_by(SEX, EA_HHID_LN_FIXED)`
nr_adj_int_weights %>% 
  mutate(sex_f = factor(SEX, levels = c(1,2), labels = c("Male", "Female")),
         teen_f = factor(H_AGETEENYEARS, levels = c(1,2), labels = c("15-17", "18+"))) %>%
  ggplot(aes(x = factor(EAID), y = response_probability_int)) +
  geom_boxplot() +
  facet_grid(cols = ggplot2::vars(teen_f), rows = ggplot2::vars(sex_f)) +
  theme_bw()

3.3 Trimming individual weights

civ2_svy <- svydesign(ids = ~EAID,
                      weights = ~nr_adj_int_wgt,
                      data = nr_adj_int_weights %>% filter(!is.na(nr_adj_int_wgt)))


summary(weights(civ2_svy))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   49.74  148.93  174.75  256.58  961.87
civ2_svy_trimmed <- trimWeights(civ2_svy, 
                                  lower = -Inf,
                                  upper = 3.5 * median(weights(civ2_svy)),
                                  strict = TRUE)

summary(weights(civ2_svy_trimmed))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.927  51.671 150.852 174.751 258.509 521.239
trimmed_int_weights <- tibble(civ2_svy_trimmed$variables, trmpnr1w0 = weights(civ2_svy_trimmed))