This notebook contains organised code for each step of the weighting process for the CIPHIA2 data.

The major steps and progress (at 11/08/2022) are as follows:

  1. [done] Load and organise/join the required datasets
  2. [done] Calculate PSU weights
  3. [done] Calculate household weights
  4. [done] Calculate person-level interview weights
  5. [done] Calculate person-level blood test weights
  6. [done] Create jackknife replicate weights

1 Calculate EA weights

1.1 What Are Base Weights?

Base weights compensate for unequal selection probabilities due to multi-stage sampling designs.
They are the inverse of the probability of selection at each stage. Make reference to the CIPHIA2 second stage sample, the base weight is psuwt0 = column(ES) = ADJ_PSU_WT, while the ADJ_PSU_WT = 1/column(ER) = 1/ADJ_PROB2

\[ \text{PSU weight} = \frac{1}{\text{Selection Probability}} \]

1.2 Loading data to calculate base weights

load second stage sample, remove duplicates in terms of EAs so that you are left with 396 EAs. The reason for using second stage sample instead of 1st stage sample is that the 2nd stage sample has all information from 1st stage and from the household listing

# This section is to import packages
# and prepare for importing data
rm(list = ls())



# clean/remove everything
#install.packages("pacman")
# we use pacman to import all packages
pacman::p_load(tidyverse,
               haven,
               rio, 
               ggplot2, 
               janitor, 
               survey, 
               lubridate, 
               readxl,
               BART,
               future,
               furrr,
               progressr,
               caret,
               readxl,
               srvyr
               
               
)
# create directory paths to the datasets

########
# Dir path to 2nd stage sample
########
civ2_sample_dir <- "C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Sampling/2nd_stage_sampling/"


########
# Dir path to datasets
#######
indir <- "C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Data/"

###################################################################

# read_excel comes from the package readxl, str_c means concatenate 2 strings as shown below
# if you press shift+ctr+M you will get a pipe = %>% 
# if you press ctr+s you will save your script, its a shortcut for saving 
# you can google shortcuts for Rstudio, it will give you a whole list of all these shortcuts
# clean_name comes from the package janitor, the purpose of clean_name is to make your variable names proper


#########
# Second stage sample
#########
civ2_zd_in <- read_excel(str_c(civ2_sample_dir,"CIPHIA2_2nd_stage_sample_09_07_2024_v4.xlsx")) %>%
  clean_names() %>% 
  distinct(ea_code, .keep_all=TRUE) 
#############


#########
# Household listing data
###########

listing <- read_dta(str_c(civ2_sample_dir,"masterlist_prox1.dta")) %>% 
  select(GEOVAR_EA, CodDistric,UNICITE) %>% 
  group_by(GEOVAR_EA) %>% 
  mutate(HH_count=n()) %>% 
  distinct(GEOVAR_EA, .keep_all = TRUE)

###############

#############
# Sampled households
###############
sampled <- read_excel(str_c(civ2_sample_dir, "CIPHIA2_2nd_stage_sample_09_07_2024_v4.xlsx")) %>%
  mutate(Sampled_HH=QC1) %>% 
  select(GEOVAR_EA,Sampled_HH) %>%
  distinct(GEOVAR_EA, .keep_all = TRUE) %>% 
  left_join(listing)
## Joining with `by = join_by(GEOVAR_EA)`
#############

########
# Roster dataset
#######
civ2_csproroster <- import(str_c(indir, "ci_ffstat_roster.sas7bdat")) |> 
  filter(is.na(DMFLAG) | DMFLAG != "DISCARD")
######

#########
# Household dataset
###########
civ2_hh <- import(str_c(indir, "ci_ffstat_hh_int.sas7bdat")) |> 
  filter(is.na(DMFLAG) | DMFLAG != "DISCARD")
######

######
# Individual dataset
######
civ2_csproindiv <- import(str_c(indir, "ci_ffstat_ind_int.sas7bdat")) |> 
  filter(is.na(DMFLAG) | DMFLAG != "DISCARD") 

#  mutate(roster_elig=PRIMARY_ROSTER_STATUS) # need to relook a this, could not find the same variable as RUPHIA
# just used PRIMARY_ROSTER_STATUS as a placeholder for roster_elig
  

###########
# Biomarker dataset
###########
biomarker <- import(str_c(indir,"CI2Biomarker20250606.sas7bdat")) %>% 
  rename(ea_hhid_ln_fixed=ea_hhID_lN_fixed, hiv1statusfinalsurvey=hiv1StatusFinalPatient)
########

1.3 Exploratory plots

psu_base_weights <- civ2_zd_in |> 
  mutate(psuwt0 = adj_psu_wt)


psu_base_weights |> 
  ggplot(aes(x = factor(strata),
             y = psuwt0)) +
  geom_boxplot() +
  labs(x = "Stratum/Region",
       y = "PSU base weight") +
  theme_bw()

psu_base_weights %>% 
  group_by(nom_reg) |> 
  mutate(psu_num_instrata = row_number()) |> 
  ggplot(aes(x = psu_num_instrata,
             y = psuwt0, 
             fill = factor(code_milieu, levels = c(1,2), 
                           labels = c("Urban", "Rural")))) +
  geom_col() +
  facet_wrap(~nom_reg) +
  theme_bw() +
  labs(fill = "Urban/Rural",
       x = "PSU number within stratum",
       y = "PSU base weight")

2 Household base weights

There are several steps for the household weights:

2.1 Calculate the within-EA household sampling weight.

We need the counts of households listed and selected to compute the household selection probability and weight.

The sample design was to take a systematic random sample of an average of 35 households from each EA. The household selection probability, \(p_{hh,i}\) in EA \(i\) is then

\[ p_{hh,i} = \frac{h_i}{N_{\textrm{elig},i}}, \]

where \({h_i}\) is the target number of household per each EA \({i}\) and \(N_{\textrm{elig},i}\) is the number of eligible households in EA \(i\).

psu_weights <- psu_base_weights %>% 
  select(geovar_ea,psuwt0) %>% 
  rename(GEOVAR_EA=geovar_ea)
  

h_selection_weights <- sampled %>%
  
  # Join with PSU weights to compute combined household weights
  # Joindre avec les poids des grappes (PSU) pour calculer les poids combinés
  left_join(psu_weights, by = "GEOVAR_EA") %>%
  
  # Calculate household selection weight: max(1, HH_NUM_CNT / Sampled_HH)
  # Calculer le poids de sélection du ménage : max(1, HH_NUM_CNT / Sampled_HH)
  mutate(
    hh_selection_weight = pmax(1.0, HH_count / Sampled_HH),
    
    # Final base weight: product of PSU weight and household selection weight
    # Poids de base final : produit du poids PSU et du poids de sélection du ménage
    hhbwt0 = psuwt0 * hh_selection_weight
  ) 

h_selection_weights
## # A tibble: 396 × 8
##    GEOVAR_EA   Sampled_HH CodDistric UNICITE HH_count psuwt0 hh_selection_weight
##    <chr>            <dbl> <chr>        <dbl>    <int>  <dbl>               <dbl>
##  1 0101002026…         51 01          1.00e9      198  125.                 3.88
##  2 0101002046…         55 01          1.00e9      181  149.                 3.29
##  3 0101002056…         25 01          1.00e9      300   41.0               12   
##  4 0101002020…         35 01          1.00e9      221   78.5                6.31
##  5 0101002020…         30 01          1.00e9      232   62.3                7.73
##  6 0101002020…         15 01          1.00e9      189   32.1               12.6 
##  7 0101002020…         66 01          1.00e9      150  215.                 2.27
##  8 0101002030…         28 01          1.00e9      177   77.3                6.32
##  9 0101002030…         70 01          1.00e9      241  379.                 3.44
## 10 0101002030…         42 01          1.00e9      144  142.                 3.43
## # ℹ 386 more rows
## # ℹ 1 more variable: hhbwt0 <dbl>
summary(h_selection_weights$hhbwt0)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.69  353.77  495.62  510.30  666.38 1642.13
hist(h_selection_weights$hhbwt0, breaks = 40)

h_selection_weights |> 
  group_by(CodDistric) |> 
  summarise(min = min(hhbwt0),
            low_Q = quantile(hhbwt0, 0.25),
            medianwt = median(hhbwt0),
            meanwt = mean(hhbwt0),
            upp_Q = quantile(hhbwt0, 0.75),
            max = max(hhbwt0))
## # A tibble: 14 × 7
##    CodDistric   min low_Q medianwt meanwt upp_Q   max
##    <chr>      <dbl> <dbl>    <dbl>  <dbl> <dbl> <dbl>
##  1 01         128.  484.     489.   492.  493.  1306.
##  2 02         145.  155.     157.   157.  158.   176.
##  3 03         547.  567.     571.   583.  575.  1040.
##  4 04         660.  666.     670.   691.  675.  1090.
##  5 05          21.7  69.8     70.2   69.4  71.0  111.
##  6 06         563.  620.     623.   621.  627.   635.
##  7 07         826.  831.     839.   840.  846.   868.
##  8 08         145.  356.     359.   348.  362.   381.
##  9 09         699.  706.     714.   721.  720.   823.
## 10 10         563.  745.     750.   744.  755.   768.
## 11 11         446.  531.     534.   551.  537.   835.
## 12 12         417.  835.     839.   864.  847.  1642.
## 13 13         255.  298.     300.   299.  303.   309.
## 14 14         336.  344.     346.   346.  349.   352.
h_selection_weights |> 
  ggplot(aes(y = hhbwt0)) +
  facet_wrap(~CodDistric)+
  geom_boxplot() +
  theme_bw() +
  labs(x = "Household Base Weight",
       y = "Region")

selected_hh <- h_selection_weights %>%
  rename(hhr_eacode1=UNICITE) %>% 
  mutate(
    hhr_eacode1 = as.character(hhr_eacode1), # Convert to character
    hhr_eacode1 = case_when(
      nchar(hhr_eacode1) == 9 ~ paste0("00", hhr_eacode1),  # Add 2 leading zeros if 1 digits
      nchar(hhr_eacode1) == 10 ~ paste0("0", hhr_eacode1),   # Add 1 leading zero if 2 digits
      TRUE ~ hhr_eacode1 # Keep as is if 10 digits
      
    ),
    HHI_EACODE = str_c("CI",CodDistric, hhr_eacode1)
  ) %>%
  select(HHI_EACODE, hhbwt0)   # Keep only necessary columns

2.2 Calculate the weight adjustment for households of unknown eligibility

The variable UPCODE_STAT_HH on the household interview file contains the final response status for each household. The codes are:

UPCODE_STAT_HH Definition
1 Respondent
2 Non-respondent
3 Ineligible
4 Unknown eligibility

In this step, the weights of the households with unknown eligibility are re-distributed to the other households in the EA. The adjustment factor in EA \(i\) is computed as

\[ A^{\textrm{HH elig}}_i = \Sigma_{j=1}^{n_i} W_{ij} / \Sigma_{j = 1}^{n_{i,1} + n_{i,2} + n_{i,3}} W_{ij}. \] The values \(n_{i,1-3}\) are the number of households in EA \(i\) with final status equal to 1, 2, or 3, as defined in the table above. The total number of selected households is \(n_i\), and \(W_{ij}\) is the household base weight of household \(j\) in EA \(i\). Note that because of the sample design, all households in each EA have the same selection probability/weight, so the value of this weight cancels out in computing the adjustment factor.

### Household Status ###

# civ2_hh |> 
#   count(RESULTNDT,
#         UPCODE_RESLTNDT,
#         STARTINT, 
#         HHELIG, 
#         HHCONSTAT, 
#         HHQDTHSINS, 
#         ROSTER_START, 
#         HHQINSHH, 
#         HHQASSIGN_INST)


civ2_hhstatus <- civ2_hh 
  # mutate(
  #   COMB_UPCODED_RESLTNDT = coalesce(as.numeric(UPCODE_RESLTNDT), as.numeric(RESULTNDT)),
  #   UPCODE_STAT_HH = case_when(
  #   # Responded
  #   is.na(RESULTNDT) & (STARTINT == 1 & HHELIG == 1 & HHCONSTAT == 1 &
  #                         !is.na(HHQDTHSINS) & !is.na(ROSTER_START) & !is.na(HHQINSHH) &
  #                         !is.na(HHQASSIGN_INST)) ~ 1,
  #   is.na(RESULTNDT) & (STARTINT == 4 & !is.na(ROSTER_START)) ~ 1,
  #   # Non-responding
  #   COMB_UPCODED_RESLTNDT %in% c(1, 2, 7, 8 , 9) ~ 2,
  #   is.na(RESULTNDT) & (HHELIG == 2 | HHCONSTAT %in% c(2, 3) | (HHELIG == 1 & is.na(HHCONSTAT)) |
  #                         STARTINT == 4 & is.na(ROSTER_START)) ~ 2,
  #   # Inelgible
  #   COMB_UPCODED_RESLTNDT %in% c(3, 4, 6) ~ 3,
  #   # Unknown
  #   is.na(COMB_UPCODED_RESLTNDT) | COMB_UPCODED_RESLTNDT %in% c(5, 96) ~ 4,
  #   .default = 4
  #   )
  # )
  #   
# Checking HH status derivation
# 
# civ2_hhstatus |> 
#   count(UPCODE_STAT_HH,
#         RESULTNDT,
#         UPCODE_RESLTNDT,
#         STARTINT, 
#         HHELIG, 
#         HHCONSTAT, 
#         HHQDTHSINS, 
#         ROSTER_START, 
#         HHQINSHH, 
#         HHQASSIGN_INST)

civ2_hhstatus |>
  tabyl(HH_STATUS)
##  HH_STATUS     n      percent
##          1 11871 0.8542746114
##          2  1237 0.0890184226
##          3   779 0.0560592976
##          4     9 0.0006476684
  # tabyl(UPCODE_STAT_HH)
# Proportion in each household status category
civ2_hhstatus |> 
  rename(hh_status=HH_STATUS) %>% 
  ggplot(aes(x = factor(HHI_DISTRICT), 
             fill = factor(hh_status, levels = 1:4,
                           labels = c("Eligible Respondent",
                                      "Eligible Non-respondent",
                                      "Ineligible Household",
                                      "Unknown Eligibility")))) +
  geom_bar(position = position_fill(reverse = TRUE)) +
  scale_fill_viridis_d() +
  labs(x = "Region",
       y = "Proportion",
       fill = "hh_status") +
  theme_bw()

hh_uknwn_adj <- civ2_hh %>%
  filter(!is.na(HHI_EACODE)) %>%
  select(HHI_EACODE, EA_HHID_FIXED, HHI_DISTRICT, HH_STATUS) %>%
  left_join(selected_hh) %>% 
  group_by(HHI_EACODE) %>% 
  mutate(
    hhstatus = HH_STATUS,

    # Calculate adjustment
    hh_adj_one = sum(hhbwt0) / ( sum(hhbwt0 * (hhstatus == 1)) + 
                               sum(hhbwt0 * (hhstatus == 2)) + 
                               sum(hhbwt0 * (hhstatus == 3))
    ),
    # Calculate weights 
    hh_wt_one = case_when(hhstatus %in% c(1, 2) ~ hh_adj_one * hhbwt0,
                      hhstatus %in% c(3, 4) ~ 0)
  ) %>% 
  ungroup()
## Joining with `by = join_by(HHI_EACODE)`
hh_uknwn_adj
## # A tibble: 13,896 × 8
##    HHI_EACODE    EA_HHID_FIXED HHI_DISTRICT HH_STATUS hhbwt0 hhstatus hh_adj_one
##    <chr>         <chr>         <chr>            <dbl>  <dbl>    <dbl>      <dbl>
##  1 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  2 CI0101002020… CI0101002020… 01                   2   496.        2          1
##  3 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  4 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  5 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  6 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  7 CI0101002020… CI0101002020… 01                   2   496.        2          1
##  8 CI0101002020… CI0101002020… 01                   2   496.        2          1
##  9 CI0101002020… CI0101002020… 01                   1   496.        1          1
## 10 CI0101002020… CI0101002020… 01                   3   496.        3          1
## # ℹ 13,886 more rows
## # ℹ 1 more variable: hh_wt_one <dbl>

2.3 Compute non-response adjusted household weight

Non-response adjustment for households is done by dividing the responding household weights by the response rate in each EA. The non response adjustment factor is

\[ A^{\textrm{HH NR}}_i = \Sigma_{j = 1}^{n_{i,1} + n_{i,2}} W^{\textit{hh2}}_{ij} / \Sigma_{j=1}^{n_i} W^{hh2}_{ij}, \] where \(W^{\textit{hh2}}_{ij}\) is the eligibility-adjusted weight of household \(j\) in EA \(i\), and the other quantities are as in the previous equation.

hh_nr_adj <- hh_uknwn_adj %>%
    group_by(HHI_EACODE) %>% 
  mutate(hh_adj_two = sum(hh_wt_one) / ( sum(hh_wt_one * (hhstatus == 1))),
         hh_wt_two = if_else(hhstatus == 1, 
                                  hh_adj_two * hh_wt_one,
                                  0)) %>% 
  ungroup()




final_hh_weights <- hh_nr_adj %>% 
  rename(EAID=HHI_EACODE, hhwt0=hh_wt_two )

final_hh_weights
## # A tibble: 13,896 × 10
##    EAID          EA_HHID_FIXED HHI_DISTRICT HH_STATUS hhbwt0 hhstatus hh_adj_one
##    <chr>         <chr>         <chr>            <dbl>  <dbl>    <dbl>      <dbl>
##  1 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  2 CI0101002020… CI0101002020… 01                   2   496.        2          1
##  3 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  4 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  5 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  6 CI0101002020… CI0101002020… 01                   1   496.        1          1
##  7 CI0101002020… CI0101002020… 01                   2   496.        2          1
##  8 CI0101002020… CI0101002020… 01                   2   496.        2          1
##  9 CI0101002020… CI0101002020… 01                   1   496.        1          1
## 10 CI0101002020… CI0101002020… 01                   3   496.        3          1
## # ℹ 13,886 more rows
## # ℹ 3 more variables: hh_wt_one <dbl>, hh_adj_two <dbl>, hhwt0 <dbl>
hh_nr_adj |> 
  distinct(HHI_EACODE, .keep_all = TRUE) |> 
  ggplot(aes(x = hh_adj_two,
             y = HHI_DISTRICT)) +
  geom_boxplot() +
  theme_bw() +
  labs(x = "HH adjustment factor two",
       y = "Region")

3 Calculating person-level interview weights

roster_status_vars <- civ2_csproroster %>%
  rename(indiv_status = ROSTER_ELIG) %>% 
  left_join(select(civ2_csproindiv, EA_HHID_LN_FIXED, INDSTATUS,ROSTER_ELIG)) %>%
  select(roster_sex = SEX, 
         roster_age = AGEYEARS,
         RELATTOHH, SLEEPHERE,
         EA_HHID_FIXED, EA_HHID_LN_FIXED, 
         indiv_status, ELIGIBLE1, HHR_EACODE,INDSTATUS
  )
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
indiv_status_vars <- civ2_csproindiv %>%
  select(IND_REGION, IND_DISTRICT, IND_EACODE, 
         EA_HHID_FIXED, EA_HHID_LN_FIXED, INDISPSTATUS, CONFAGEY)

# AGEYEARS, UPCODE_IND0040,roster_elig cannot find
roster_ind_status_vars <- roster_status_vars %>%
  mutate(EAID = as.character(HHR_EACODE)) %>%
  left_join(select(final_hh_weights, EAID, EA_HHID_FIXED, hhwt0)) %>%
  left_join(indiv_status_vars) %>%
  mutate(
    roster_age_gt_15 = if_else(roster_age >= 15, 1, 0, missing = -9),
    int_age_gt_15 = if_else(CONFAGEY >= 15, 1, 0, missing = -9),
    resident_status_roster = case_when(SLEEPHERE == 1 & roster_age >= 15 ~ "De facto person 15 years or older",
                                       SLEEPHERE != 1 & roster_age >= 15 ~ "Non-de facto person 15 years or older",
                                       roster_age < 15 ~ "Person under 15 years",
                                       TRUE ~ "OTHER"),
    status_for_table = case_when(SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == 1 ~ 1,
                                 SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == 0 ~ 2,
                                 SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == -9 ~ 3,
                                 SLEEPHERE != 1 & roster_age >= 15 ~ 4,
                                 roster_age < 15 ~ 5,
                                 TRUE ~ 99999),
    # Give ineligible people indiv_status = 9 and INDSTATUS = 9
    INDSTATUS = if_else(ELIGIBLE1 == 2, 90, INDSTATUS),
    indiv_status = if_else(ELIGIBLE1 == 2, 9, indiv_status))
## Joining with `by = join_by(EA_HHID_FIXED, EAID)`
## Joining with `by = join_by(EA_HHID_FIXED, EA_HHID_LN_FIXED)`
table_counts <- roster_ind_status_vars %>% 
  mutate(status_for_table = factor(status_for_table, levels = 1:5),
         int_age_desc = case_when(
           status_for_table %in% c(4,5) ~ "NA",
           int_age_gt_15 == 1 ~ "15+",
           int_age_gt_15 == 0 ~ "Under 15",
           int_age_gt_15 == -9 ~ "Unknown")) %>%
  group_by(status_for_table, resident_status_roster, int_age_desc, .drop = FALSE) %>% 
  summarise(rostered_persons = n(), 
            wgt_number_rostered = sum(hhwt0)) %>%
  mutate(resident_status_roster = if_else(status_for_table != 2, resident_status_roster, "De facto person 15 years or older"),
         int_age_desc = if_else(status_for_table != 2, int_age_desc, "Under 15")) %>%
  adorn_totals(where = "row")
## `summarise()` has grouped output by 'status_for_table',
## 'resident_status_roster'. You can override using the `.groups` argument.
print(table_counts)
##  status_for_table                resident_status_roster int_age_desc
##                 1     De facto person 15 years or older          15+
##                 2     De facto person 15 years or older     Under 15
##                 3     De facto person 15 years or older      Unknown
##                 4 Non-de facto person 15 years or older           NA
##                 5                 Person under 15 years           NA
##             Total                                     -            -
##  rostered_persons wgt_number_rostered
##             26138        14755650.250
##                 0               0.000
##                15            2541.992
##              3803         2252231.322
##             16890         9750081.191
##             46846        26760504.755

3.1 First-phase interview weight adjustment

int_wgts_phase_one_input <- select(civ2_csproroster, EA_HHID_FIXED, EA_HHID_LN_FIXED, HHR_EACODE, SEX, AGEYEARS, SLEEPHERE) %>%
  left_join(select(civ2_csproindiv, EA_HHID_LN_FIXED, CONFAGEY)) %>%
  mutate(EAID = as.character(HHR_EACODE),
         roster_age_gt_15 = case_when(AGEYEARS >= 0 & AGEYEARS < 15 ~ 0,
                                      AGEYEARS >= 15 ~ 1,
                                      TRUE ~ -9),
         int_elig_status = case_when(roster_age_gt_15 == 1 & CONFAGEY >= 15 ~ 1,
                                     roster_age_gt_15 == 1 & CONFAGEY >= 0 & CONFAGEY < 15 ~ 2,
                                     roster_age_gt_15 == 1 ~ 3,
                                     roster_age_gt_15 == 0 ~ 4,
                                     TRUE ~ 99999),
         age_sex_cell = case_when(#SLEEPHERE != 1 ~ "Non-de facto",
           SEX == 1 & roster_age_gt_15 == 1 & AGEYEARS < 18 ~ "Male 15-17",
           SEX == 1 & roster_age_gt_15 == 1 & AGEYEARS >= 18 ~ "Male 18+",
           SEX == 2 & roster_age_gt_15 == 1 & AGEYEARS < 18 ~ "Female 15-17",
           SEX == 2 & roster_age_gt_15 == 1 & AGEYEARS >= 18 ~ "Female 18+",
           AGEYEARS < 15 ~ "Under 15",
           TRUE ~ "Missing/Unknown")) %>%
  left_join(select(final_hh_weights, EA_HHID_FIXED, hhwt0))
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
## Joining with `by = join_by(EA_HHID_FIXED)`
# Checks
# zz<-indiv_wgts_phase_one_input %>% group_by(SLEEPHERE, AGEYEARS, CONFAGEY, int_elig_status) %>% tally()
# indiv_wgts_phase_one_input %>% group_by(int_elig_status, age_sex_cell) %>% tally()

int_wgts_phase_one_adj <- int_wgts_phase_one_input %>%
  filter(int_elig_status %in% c(1,2,3)) %>%
  group_by(age_sex_cell, int_elig_status) %>%
  summarise(count = n(),
            weighted_count = sum(hhwt0)) %>%
  pivot_wider(values_from = c(count, weighted_count), 
              names_from = int_elig_status, 
              names_glue = "status_{int_elig_status}_{.value}",
              values_fill = 0) %>%
  mutate(phase_one_int_adj_factor = (status_1_weighted_count + status_3_weighted_count)/status_1_weighted_count)
## `summarise()` has grouped output by 'age_sex_cell'. You can override using the
## `.groups` argument.
int_wgts_phase_one_adjusted <- int_wgts_phase_one_input %>%
  left_join(select(int_wgts_phase_one_adj, age_sex_cell, phase_one_int_adj_factor)) %>%
  mutate(phase_one_adj_int_wgt = hhwt0 * phase_one_int_adj_factor)
## Joining with `by = join_by(age_sex_cell)`
int_wgts_phase_one_adjusted %>% group_by(age_sex_cell) %>%
  summarise(mean_weight = mean(phase_one_adj_int_wgt))
## # A tibble: 5 × 2
##   age_sex_cell mean_weight
##   <chr>              <dbl>
## 1 Female 15-17        681.
## 2 Female 18+          638.
## 3 Male 15-17          663.
## 4 Male 18+            670.
## 5 Under 15             NA

##Interview non-response adjustment

*remove all the process variables first and remain with the important variables

# Helper function to split out the multiple-select options which have 
# concatenated values like "ABD", "AE", "F" etc.
expand_mult <- function(x, i) {
  if_else(grepl(as.character(i), x, fixed = TRUE), 1, 0)
}

# Select and tidy up all useful household variables
hhvars <- civ2_hhstatus %>% 
  filter(HH_STATUS == 1) %>%
  select(36:113, 
         SICK_HOUSEHOLD, contains("EA_HHID")) %>%
  # Correcting/filling count variables
  mutate(DEATHCOUNT = replace_na(DEATHCOUNT, 0),
         # Expand multiple selection
         across(c(HHQITEMS, HHQOWN), list(~expand_mult(.x, "A"),
                                          ~expand_mult(.x, "B"),
                                          ~expand_mult(.x, "C"),
                                          ~expand_mult(.x, "D"),
                                          ~expand_mult(.x, "E"),
                                          ~expand_mult(.x, "F"),
                                          ~expand_mult(.x, "G"),
                                          ~expand_mult(.x, "H"),
                                           ~expand_mult(.x, "I"),
                                           ~expand_mult(.x, "X"),
                                           ~expand_mult(.x, "Y"),
                                           ~expand_mult(.x, "Z")),
                .names = "{.col}{.fn}"),
         # Fill NAs with no/missing where appropriate
         across(c(DEATHS), ~if_else(.x %in% c(-8, -9), 2, .x)),
         across(ends_with("DUR"), ~if_else(.x %in% c(-8,-9), 0, .x)),
         across(ends_with("NUM"), ~if_else(.x == -7, 1, .x)),
         #across(c(HHEMANCMINOR, HHQFOODFRQ, HHQHUNG, HHQHUNGFRQ, HHQHUNGDAY,
         across(c(HHEMANCMINOR,
                   ends_with("DUR")), ~replace_na(.x, 0)),
         across(c(TOILETSHARE), ~replace_na(.x, 2))) %>%
  # Variables to drop
  select(-c(HHCNTSTATUS, HHQITEMS, HHQOWN, HHCAGREEDT, HHSTFSIG, HHRINS4,
            contains("HHCONST"),
            starts_with(c("DIED", "DTH", "INELIG")))) %>%
  # Merge don't know and refused
  mutate(across(where(is.numeric), ~if_else(.x == -8, -9, .x))) %>%
  # For duration/count variables, combine into a simple yes/no to avoid small cell sizes
  mutate(across(ends_with(c("DUR","NUM")), ~if_else(.x > 0, 1, 0))) %>%
  # Remove variables with only one value
  select(where(function(col) length(unique(col)) > 1))


# Select and tidy up all useful roster variables
roster_vars <- civ2_csproroster %>%
  # Variables to drop
  select(-c(HHR_SURVEYFORMSDT, HHR_HHI_UUID, HHR_DEVICEID, STAFFIDHH,
            HHRSTS, HHR_HHMEM_ID, HHRETS, HHDRSTS, INTRODR, AGEMONTHS,
            HHDRETS, MINORLINENUM, HHCRSTS, HHCRETS, DMFLAG, DMCOMMENT,
            HHRINS5, HHQOVCINS1, HHQSUPPSTART, HHQOVCEND, OVCLINENUM,
            LIVEAWAYM, LIVEAWAYY, LIVECOUNTRY, 
            LIVESUBPF, LIVESUBPFLIVECOUNTRYOTH, 
            CORRECT_LN, HHQASSIGN_INST, HHQASSIGN, WITHDRAWHH,
            GRAPPE, EA_HHID_FIXED, HHR_EACODE, HHR_SHH, HHR_DISTRICT, 
            HHR_REGION, HHR_DEPARTEMENT, HHR_SOUSPREFID, HHR_EACODE_ORIG, 
            HHR_SHH_ORIG, HHR_TEAM_ID, CODEMILIEU, NOM_MILIEU,
            NATMOMNM, MOMFEMNAME, DADMALENAME, MALEGUARDNAME,
            RELATTOHH, ROSTERED_FINAL_STATUS, ELIGIBLE1,
            HHCCONSEL, HHCEMANC, HHCCONT0))%>%
  # Fill NAs with no/missing where appropriate
  mutate(across(c(SICK3MO), ~replace_na(.x, -8)),
         across(c(EMANCIPATED, LIVEAWAY, starts_with("SUPPORT")), ~replace_na(.x, 2)),
         across(c(VULNERABLE_CHILD), ~replace_na(.x, 0))) %>%
  # Merge don't know and refused
  mutate(across(where(is.numeric), ~if_else(.x == -8, -9, .x))) %>%
  # Remove variables with only one value
  select(where(function(col) length(unique(col)) > 1)) %>%
  left_join(select(indiv_status_vars, EA_HHID_LN_FIXED)) #%>%
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
#filter(roster_elig == 1)

# Create binary interview response variable
interview_response <- civ2_csproindiv %>%
  filter(HH_STATUS == 1) %>%
  mutate(ind_responded = if_else(INDIVCNTSTATUS == 1, 1, 0)) %>%
  # not sure about the INDIVCNTSTATUS
  select(ind_responded, EA_HHID_FIXED, EA_HHID_LN_FIXED, HH_STATUS)

# Join together all the variables and filter to only 
# de facto eligible persons.
# nr_vars <- interview_response %>%
#   left_join(hhvars) %>%
#   left_join(roster_vars) %>% 
#   select(-c( HHELANGNAT, HHELANGNATOT, HHELANG, HHELANGOT, HHCAGRN, HHLNGNAT_LNG, HHLNGNAT_LNGOTH, HHLNGVINT_LNG, HHLNGVINT_LNGOTH, HHTRNSLUSE, SICK_HOUSEHOLD, DEATHS,HHELANGACCYN,HHELANGACC,HHEHEARACM, HHI_ENDSURVEY))
# #  filter(roster_elig == 1) # left this out for now

nr_vars <- interview_response %>%
  left_join(hhvars) %>%
  left_join(roster_vars) %>% 
  select(
    -c(
      HHELANGNAT, HHELANGNATOT, HHELANG, HHELANGOT, HHCAGRN, HHLNGNAT_LNG,
      HHLNGNAT_LNGOTH, HHLNGVINT_LNG, HHLNGVINT_LNGOTH, HHTRNSLUSE, SICK_HOUSEHOLD,
      DEATHS, HHELANGACCYN, HHELANGACC, HHEHEARACM, HHI_ENDSURVEY,
      # making sure that interview status var is NOT carried forward
      INDIV_STATUS
  ))
## Joining with `by = join_by(EA_HHID_FIXED)`
## Joining with `by = join_by(EA_HHID_LN_FIXED, HH_STATUS)`
# Add recodes for the teen years variable which is used to split up the
# data for the non-response model.
nr_vars_recodes <- nr_vars %>%
  mutate(H_AGETEENYEARS = if_else(AGEYEARS >= 15 & AGEYEARS <= 17, 1, 2))

names(nr_vars)
##  [1] "ind_responded"    "EA_HHID_FIXED"    "EA_HHID_LN_FIXED" "HH_STATUS"       
##  [5] "HHEHEARING"       "HHAGEYEARS"       "HHEMANCMINOR"     "DEATHCOUNT"      
##  [9] "WATERSOURCE"      "WATERSOURCEOT"    "TOILETTYPE"       "TOILETTYPEOT"    
## [13] "TOILETSHARE"      "COOKINGFUEL"      "COOKINGFUELOT"    "MATFLOOR"        
## [17] "MATFLOOROT"       "MATROOF"          "MATROOFOT"        "MATEXWALLS"      
## [21] "MATEXWALLSOT"     "ROOMSLEEP"        "OWNCOWNUM"        "OWNGOATNUM"      
## [25] "OWNCHIKNNUM"      "OWNDOGNUM"        "OWNHORSENUM"      "OWNPIGNUM"       
## [29] "ECONSUP12"        "ECONSUP12OT"      "HHQITEMS1"        "HHQITEMS2"       
## [33] "HHQITEMS3"        "HHQITEMS4"        "HHQITEMS5"        "HHQITEMS6"       
## [37] "HHQITEMS7"        "HHQITEMS8"        "HHQITEMS9"        "HHQOWN1"         
## [41] "HHQOWN2"          "HHQOWN3"          "HHQOWN4"          "HHQOWN5"         
## [45] "HHQOWN6"          "HHQOWN7"          "HHQOWN8"          "HHQOWN9"         
## [49] "HHQOWN11"         "SEX"              "LIVEHERE"         "SLEEPHERE"       
## [53] "AGEYEARS"         "EMANCIPATED"      "LIVEAWAY"         "SICK3MO"         
## [57] "CHENSCH"          "MOMALIVE"         "MOMHHM"           "FEMGUARDHHM"     
## [61] "DADALIVE"         "DADHHM"           "MALEGUARDHHM"     "MOMSICK"         
## [65] "MOMAIDS"          "DADSICK"          "DADAIDS"          "VULNERABLE_CHILD"
## [69] "SUPPORTMED12"     "SUPPORTEMOT12"    "SUPPORTEMOT3"     "SUPPORTMATER12"  
## [73] "SUPPORTMATER3"    "SUPPORTSOCIAL12"  "SUPPORTSOCIAL3"   "SUPPORTSCHOL12"  
## [77] "ROSTER_ELIG"

3.2 Model by sex & age years variables

# Make the appropriate variables numeric and the rest factors.
# This is required for the BART model fitting to handle the 
# data correctly.
int_nr_data_in <- nr_vars_recodes %>%
#  select(-INDSTATUS, -ROSTERED_FINAL_STATUS, -roster_elig, -ELIGIBLE1) %>%
  select(H_AGETEENYEARS, SEX, EA_HHID_FIXED, EA_HHID_LN_FIXED, ind_responded, everything()) %>%
  arrange(H_AGETEENYEARS, SEX, EA_HHID_FIXED, EA_HHID_LN_FIXED) %>%
  mutate(age_group = case_when(AGEYEARS < 18 ~ AGEYEARS,
                               between(AGEYEARS, 18, 24) ~ 1,
                               between(AGEYEARS, 25, 29) ~ 2,
                               between(AGEYEARS, 30, 39) ~ 3,
                               between(AGEYEARS, 40, 49) ~ 4,
                               between(AGEYEARS, 50, 59) ~ 5,
                               AGEYEARS >= 60  ~ 6,
                               TRUE ~ 99),
         hh_age_group = case_when(between(HHAGEYEARS, 15, 24) ~ 1,
                                  between(HHAGEYEARS, 25, 29) ~ 2,
                                  between(HHAGEYEARS, 30, 39) ~ 3,
                                  between(HHAGEYEARS, 40, 49) ~ 4,
                                  between(HHAGEYEARS, 50, 59) ~ 5,
                                  HHAGEYEARS >= 60  ~ 6,
                                  TRUE ~ 99),
         across(-c(starts_with("TOTAL"), ends_with("NUM"), ends_with("DUR"),
                   DEATHCOUNT, AGEYEARS, ROOMSLEEP, HHAGEYEARS, ind_responded), as_factor),
         across(c(starts_with("TOTAL"), ends_with("NUM"), ends_with("DUR"),
                   DEATHCOUNT, AGEYEARS, ROOMSLEEP, HHAGEYEARS, ind_responded))) %>% 
  # select(-SLEEPHERE, -DEATHCOUNT, -HHEMANCMINOR, -LIVEAWAY,
  #        -AGEYEARS, -HHAGEYEARS)
    select(-SLEEPHERE, -DEATHCOUNT, -HHEMANCMINOR, -LIVEAWAY,
         -AGEYEARS, -HHAGEYEARS, -HH_STATUS, -ROSTER_ELIG)


## The following code prepares a model matrix for interview non response

# For variables which are already two-leveled
# int_binary_vars <- int_nr_data_in %>% 
#   select(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded) %>% 
#   select(EA_HHID_LN_FIXED, where(function(col) length(unique(col)) == 2), -ind_responded) %>%
#   mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.numeric)) %>%
#   mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), ~if_else(.x == 2, 0, .x))) %>%
#   mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.factor))

int_binary_vars <- int_nr_data_in %>% 
  select(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, where(function(col) length(unique(col)) == 2), -ind_responded) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.numeric)) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), ~if_else(.x == 2, 0, .x))) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.factor))

# Non-binary variables
int_nr_data_mod_matrix_in <- int_nr_data_in %>%
  select(where(function(col) length(unique(col)) != 2), ind_responded) %>%
  select(-contains("EA_HHID"))

dmy_int_nr_data <- dummyVars("ind_responded ~ .", data = int_nr_data_mod_matrix_in)
df_int_nr_data = int_nr_data_mod_matrix_in

model_matrix_int_nr <- predict(dmy_int_nr_data, newdata = df_int_nr_data) %>%
  as_tibble() %>%
  cbind(ind_responded = int_nr_data_mod_matrix_in$ind_responded) %>%
  cbind(EA_HHID_LN_FIXED = int_nr_data_in$EA_HHID_LN_FIXED) %>%
  left_join(int_binary_vars) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), as.factor))
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
## Identify/remove variables from non-response model where proportion
## or number in the smallest category very low (eg < 10%)

variable_response_props_int <- model_matrix_int_nr %>%
  pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
               names_to = "variable", values_to = "value") %>%
  group_by(H_AGETEENYEARS, SEX, variable, value) %>%
  summarise(respondents = sum(ind_responded),
            totalN = n(),
            rrate = respondents / totalN) %>%
  mutate(category_proportion = totalN / sum(totalN))
## `summarise()` has grouped output by 'H_AGETEENYEARS', 'SEX', 'variable'. You
## can override using the `.groups` argument.
int_nr_vars_dropped <- variable_response_props_int %>%
  mutate(drop = if_else(!grepl("age", variable) & 
                          (totalN < 25 | category_proportion < 10/100), 1, NA_real_)) %>%
  filter(drop == 1)
  

# final_int_nr_model_matrix <- model_matrix_int_nr %>%
#   pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
#                names_to = "variable", values_to = "value") %>%
#   left_join(select(int_nr_vars_dropped, H_AGETEENYEARS, SEX, variable, drop)) %>%
#   filter(is.na(drop)) %>%
#   pivot_wider(id_cols = c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
#               names_from = variable, values_from = value) %>%
#   mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
#                  ~replace_na(as.numeric(.x), 0)))


final_int_nr_model_matrix <- model_matrix_int_nr %>%
  mutate(ind_responded = if_else(is.na(ind_responded), 0, ind_responded)) %>% 
  pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
               names_to = "variable", values_to = "value") %>%
  left_join(select(int_nr_vars_dropped, H_AGETEENYEARS, SEX, variable, drop)) %>%
  filter(is.na(drop)) %>%
  group_by(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded, variable) %>%
  summarise(value = first(value), .groups = "drop") %>%  # pick first value if multiple
  pivot_wider(id_cols = c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
              names_from = variable, values_from = value) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), 
                ~replace_na(as.numeric(.x), 0)))
## Joining with `by = join_by(H_AGETEENYEARS, SEX, variable)`
#Function to fit a single model
# Input needs to contain the predictors and the response variable
fit_bart_model_int <- function(data, nskip = 10, ndpost = 20) {

  predictor_matrix <- data %>%
    select(where(function(col) length(unique(col)) > 1)) %>%
    remove_empty(which = "cols") %>%
    select(-c(EA_HHID_LN_FIXED, ind_responded)) %>%
    data.frame()
  
  response <- data$ind_responded

  # bartFit <- pbart(predictor_matrix, response, binaryOffset = mean(response),
  #                nskip = nskip, ndpost = ndpost)
  bartFit <- lbart(predictor_matrix, response, binaryOffset = mean(response),
                 nskip = nskip, ndpost = ndpost)
}

# Function to just output the fitted response probabilities
fit_response_probabilities_int <- function(data) {

  bartFit <- fit_bart_model_int(data)

  response <- data$ind_responded
  response_probability <- bartFit$prob.train.mean

  tibble(data$EA_HHID_LN_FIXED, response, response_probability)
}

# nested <- final_int_nr_model_matrix %>%
#   group_by(H_AGETEENYEARS, SEX) %>%
#   tidyr::nest() %>%
#   mutate(bart = lapply(data, fit_response_probabilities))
# 
# response_probs <- nested %>%
#   select(-data) %>%
#   unnest(cols = bart) %>%
#   rename(EA_HHID_LN_FIXED = `data$EA_HHID_LN_FIXED`)


nested_int <- final_int_nr_model_matrix %>%
  filter(!is.na(H_AGETEENYEARS), !is.na(SEX)) %>%
  group_by(H_AGETEENYEARS, SEX) %>%
  nest() %>%
  mutate(bart = lapply(data, fit_bart_model_int))
## *****Into main of lbart
## *****Data:
## data:n,p,np: 1005, 48, 0
## y1,yn: 1, 1
## x1,x[n*p]: 1.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 10, 20
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,48,0
## *****nkeeptrain,nkeeptest: 20, 20
## *****printevery: 100
## 
## MCMC
## done 0 (out of 30)
## time: 1s
## check counts
## trcnt,tecnt: 20,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 1173, 46, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 2.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 2
## *****burn and ndpost: 10, 20
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,46,0
## *****nkeeptrain,nkeeptest: 20, 20
## *****printevery: 100
## 
## MCMC
## done 0 (out of 30)
## time: 2s
## check counts
## trcnt,tecnt: 20,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 11268, 40, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 10, 20
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,40,0
## *****nkeeptrain,nkeeptest: 20, 20
## *****printevery: 100
## 
## MCMC
## done 0 (out of 30)
## time: 16s
## check counts
## trcnt,tecnt: 20,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 12692, 41, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 10, 20
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,41,0
## *****nkeeptrain,nkeeptest: 20, 20
## *****printevery: 100
## 
## MCMC
## done 0 (out of 30)
## time: 18s
## check counts
## trcnt,tecnt: 20,0
#export(nested_int, "nested_int_model_updated.Rdata")

#nested_int <- import("nested_int_model_updated.Rdata")

# response_probs_int <- nested_int %>%
#   mutate(response_probability_int = lapply(bart, function(x) x$prob.train.mean)) %>%
#   select(response_probability_int, data) %>%
#   unnest(cols = c(data, response_probability_int)) %>%
#   select(H_AGETEENYEARS, SEX, EA_HHID_LN_FIXED, ind_responded, response_probability_int) %>%
#   ungroup()

response_probs_int <- nested_int %>%
  mutate(response_probability_int = lapply(bart, function(x) x$prob.train.mean)) %>%
  select(response_probability_int, data) %>%
  unnest(cols = c(data, response_probability_int)) %>%
  select(H_AGETEENYEARS, SEX, EA_HHID_LN_FIXED, ind_responded, response_probability_int) %>%
  ungroup() |> 
# Cap response probability to not be too small
  mutate(response_probability_int = pmax(0.1, response_probability_int))
## Adding missing grouping variables: `H_AGETEENYEARS`, `SEX`
nr_adj_int_weights <- response_probs_int %>%
  mutate(SEX = as.numeric(SEX)) %>%
  left_join(int_wgts_phase_one_adjusted) %>%
  mutate(nr_adj_int_wgt = if_else(ind_responded == 1, phase_one_adj_int_wgt / response_probability_int, NA_real_)) %>% 
      filter(!is.na(H_AGETEENYEARS), !is.na(SEX))
## Joining with `by = join_by(SEX, EA_HHID_LN_FIXED)`
nr_adj_dataset_int <- int_nr_data_in %>%
  select(-H_AGETEENYEARS, -SEX) %>%
  left_join(nr_adj_int_weights, by = c("EA_HHID_LN_FIXED" = "EA_HHID_LN_FIXED")) %>% 
    filter(!is.na(H_AGETEENYEARS), !is.na(SEX))


nr_adj_dataset_int %>% 
  mutate(sex_f = factor(SEX, levels = c(1,2), labels = c("Male", "Female")),
         teen_f = factor(H_AGETEENYEARS, levels = c(1,2), labels = c("15-17", "18+"))) %>%
  ggplot(aes(x = factor(EAID), y = response_probability_int)) +
  geom_boxplot() +
  facet_grid(cols = ggplot2::vars(teen_f), rows = ggplot2::vars(sex_f)) +
  theme_bw()

# Plot of distribution
response_probs_int %>% 
  ggplot(aes(x = response_probability_int, fill = factor(ind_responded))) +
  geom_density(alpha = 0.5) +
  theme_bw()

response_probs_int %>% 
  ggplot(aes(x = factor(ind_responded), y = response_probability_int, fill = factor(ind_responded))) +
  geom_violin() +
  #geom_boxplot() +
  theme_bw() +
  scale_y_continuous(limits = c(0.5,1))

response_probs_int %>% 
  ungroup() %>%
  filter(ind_responded == 1) %>% 
  select(response_probability_int) %>%
  summary()
##  response_probability_int
##  Min.   :0.4177          
##  1st Qu.:0.7379          
##  Median :0.8255          
##  Mean   :0.8026          
##  3rd Qu.:0.8766          
##  Max.   :0.9746

3.3 Trimming individual weights

#keeping this section so as to keep the structure of the code, just increased the cut off for trimming
civ2_svy <- svydesign(ids = ~EAID,
                      weights = ~nr_adj_int_wgt,
                      data = nr_adj_int_weights %>% filter(!is.na(nr_adj_int_wgt)))


summary(weights(civ2_svy))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27.05  465.90  858.38  804.51 1072.55 3735.24
civ2_svy_trimmed <- trimWeights(civ2_svy, 
                                  lower = -Inf,
                                  upper = 20.5 * median(weights(civ2_svy)), 
                                  #upper = 3.5 * median(weights(civ2_svy)),

                                  strict = TRUE)

summary(weights(civ2_svy_trimmed))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27.05  465.90  858.38  804.51 1072.55 3735.24
trimmed_int_weights <- tibble(civ2_svy_trimmed$variables, trmpnr1w0 = weights(civ2_svy_trimmed))

3.4 Postratification of individual weights

ref_totals <- read_excel("C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Data/DONNEES_population_2024.xlsx") %>% 
  pivot_longer(cols = c(male, female),
               names_to = "sex",
               values_to = "population") %>% 
  mutate(age_group = str_replace_all(age_group, "\\s*-\\s*", "-")) %>% 
  mutate(cod_distric = case_when(
    district == "AUTONOME D'ABIDJAN" ~ "01",
    district == "AUTONOME DE YAMOUSSOUKRO" ~ "02",
    district == "BAS-SASSANDRA" ~ "03",
    district == "COMOE" ~ "04",
    district == "DENGUELE" ~ "05",
    district == "GOH-DJIBOUA" ~ "06",
    district == "HAUT-SASSANDRA-MARAHOUE" ~ "07",
    district == "LACS" ~ "08",
    district == "LAGUNES" ~ "09",
    district == "MONTAGNES" ~ "10",
    district == "SAVANES" ~ "11",
    district == "VALLEE DU BANDAMA" ~ "12",
    district == "WOROBA" ~ "13",
    district == "ZANZAN" ~ "14",
    TRUE ~ NA_character_  # fallback if name doesn't match
  )) %>% 
    mutate(sex = case_when(
    sex == "male" ~ 1,
    sex == "female" ~ 2,
    TRUE ~ NA_real_
  )) %>% 
  group_by(cod_distric, sex, age_group) %>%
  summarise(pop = sum(population))
## New names:
## `summarise()` has grouped output by 'cod_distric', 'sex'. You can override
## using the `.groups` argument.
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
individual_reference_totals_adj <- ref_totals


  #summarise actual totals
intwt_totals <- trimmed_int_weights %>%
  mutate(cod_distric = str_sub(EAID, 3, 4)) %>% 
  filter(!is.na(trmpnr1w0)) %>%
  #left_join(geo_level_codes) %>% 
  mutate(age_group = case_when(between(CONFAGEY, 15, 19) ~ "15-19",
                               between(CONFAGEY, 20, 24) ~ "20-24",
                               between(CONFAGEY, 25, 29) ~ "25-29",
                               between(CONFAGEY, 30, 34) ~ "30-34",
                               between(CONFAGEY, 35, 39) ~ "35-39",
                               between(CONFAGEY, 40, 44) ~ "40-44",
                               between(CONFAGEY, 45, 49) ~ "45-49",
                               between(CONFAGEY, 50, 54) ~ "50-54",
                               between(CONFAGEY, 55, 59) ~ "55-59",
                               between(CONFAGEY, 60, 64) ~ "60-64",
                               CONFAGEY >= 65            ~ "65+",
                               TRUE ~ "ERROR"),
         sex = SEX) %>%
  #group_by(L1_Name, sex, age_group) %>% # By L1 area, age, and sex the cells are very small
  group_by(cod_distric, sex, age_group) %>%
  #group_by(sex, age_group) %>%
  summarise(totalN = n(),
            totalWgtN = sum(trmpnr1w0))
## `summarise()` has grouped output by 'cod_distric', 'sex'. You can override
## using the `.groups` argument.
#compute adjustment factor
int_poststrat_adj <- intwt_totals %>%
  left_join(individual_reference_totals_adj) %>%
  mutate(int_postrat_adj_factor = pop / totalWgtN)
## Joining with `by = join_by(cod_distric, sex, age_group)`
           #UN_Count_adj / totalWgtN)
           
 
# join to indiv level file and calculate weight
poststrat_int_wgts <- trimmed_int_weights %>% 
  mutate(cod_distric = str_sub(EAID, 3, 4)) %>% 
# left_join(geo_level_codes) %>%
 mutate(age_group = case_when(between(CONFAGEY, 15, 19) ~ "15-19",
                               between(CONFAGEY, 20, 24) ~ "20-24",
                               between(CONFAGEY, 25, 29) ~ "25-29",
                               between(CONFAGEY, 30, 34) ~ "30-34",
                               between(CONFAGEY, 35, 39) ~ "35-39",
                               between(CONFAGEY, 40, 44) ~ "40-44",
                               between(CONFAGEY, 45, 49) ~ "45-49",
                               between(CONFAGEY, 50, 54) ~ "50-54",
                               between(CONFAGEY, 55, 59) ~ "55-59",
                               between(CONFAGEY, 60, 64) ~ "60-64",
                               CONFAGEY >= 65            ~ "65+",
                               TRUE ~ "ERROR"),
         sex = SEX)           %>%
  left_join(int_poststrat_adj) %>%
  mutate(intwt0 = int_postrat_adj_factor * trmpnr1w0)
## Joining with `by = join_by(cod_distric, age_group, sex)`
int_pstrt_weight_plot <- poststrat_int_wgts %>% 
  ggplot(aes(x = as_factor(EAID), 
             y = trmpnr1w0,
             fill = cod_distric)) +
  geom_boxplot() +
  theme_bw() + 
  labs(x = "EA Number",
       y = "Trimmed Individual Weight",
       title = "Trimmed Interview Weight Distribution by EA",
       fill = "HHR_REGION")

int_pstrt_weight_plot

int_pstrt_factor_plot <- int_poststrat_adj %>% 
  ggplot(aes(x = as_factor(age_group), 
             y = int_postrat_adj_factor,
             fill = factor(sex, levels = c(1,2), labels = c("Male", "Female")))) +
  geom_col(position = position_dodge()) +
  facet_wrap(~cod_distric, nrow = 3) +
  theme_bw() + 
  labs(x = "Age Group",
       y = "Interview Poststratification Factor",
       title = "Interview Poststratification Adjustment Factors",
       fill = "Sex")+
   theme(axis.text.x = element_text(angle = 90, hjust = 1))


int_pstrt_factor_plot

# Max weights vs. median (to determine if trimming may be needed)
#max(poststrat_int_wgts$intwt0, na.rm = TRUE) / median(poststrat_int_wgts$intwt0, na.rm = TRUE)

#PracTools::deffK(filter(poststrat_int_wgts, !is.na(intwt0))$intwt0)

#Calculate person-level blood test weights

#DISCLOSE, CONDOMWHERE, CONDOMNOTEASYRSN, ADHIVPREV, TBSYMPASSESS, CHRONICCOND, CHRONICMED, - Multi select
# ARVAMTNUM - number

# Interview variables
blood_nr_variables <- civ2_csproindiv %>%
    left_join(select(civ2_csproroster, EA_HHID_LN_FIXED, SLEEPHERE, LIVEHERE, ELIGIBLE1, ROSTERED_FINAL_STATUS, AGEYEARS)) %>%
    mutate(roster_elig = ELIGIBLE1, #if_else(AGEYEARS >= 15 & SLEEPHERE == 1, 1, 0, missing = 0),
         indiv_status = case_when(
           #roster_elig == 1 & SLEEPHERE == 2 & LIVEHERE == 1 & AGEYEARS >= 15 ~ 9,
           roster_elig == 0 ~ 5,
           CONFAGEY >= 15 & ENDMSG1 == "A" ~ 1,
           CONFAGEY >= 15 & ENDMSG1 != "A" ~ 2,
           CONFAGEY < 15 ~ 3,
           is.na(CONFAGEY) ~ 4,
           TRUE ~ 999999)) %>%
 # select(EA_HHID_LN_FIXED, SLEEPHERE, LIVEHERE, AGEYEARS, CONFAGEY, ENDMSG1, INDSTATUS, roster_elig, indiv_status) %>% 
# it says joining by LN_FIXED and AGEYEARS is this correct? 
 # left_join(select(civ2_csproroster, EA_HHID_LN_FIXED, SLEEPHERE)) %>%
  filter(indiv_status == 1,
         SLEEPHERE == 1,
         CONFAGEY >= 15) %>%
 # select(30:448, 537:552) %>%
   # Expand multiple selection
  mutate(across(c(DISCLOSE, CONDOMWHERE, CONDOMNOTEASYRSN, 
                  ADHIVPREV, TBSYMPASSESS, CHRONICCOND, CHRONICMED,
                  HIVTSTNVRRSN, CMETHOD), 
                list(~expand_mult(.x, "A"),
                                   ~expand_mult(.x, "B"),
                                   ~expand_mult(.x, "C"),
                                   ~expand_mult(.x, "D"),
                                   ~expand_mult(.x, "E"),
                                   ~expand_mult(.x, "F"),
                                   ~expand_mult(.x, "G"),
                                   ~expand_mult(.x, "H"),
                                   ~expand_mult(.x, "I"),
                                   ~expand_mult(.x, "J"),
                                   ~expand_mult(.x, "K"),
                                   ~expand_mult(.x, "L"),
                                   ~expand_mult(.x, "M"),
                                   ~expand_mult(.x, "N"),
                                   ~expand_mult(.x, "W"),
                                   ~expand_mult(.x, "X"),
                                   ~expand_mult(.x, "Y"),
                                   ~expand_mult(.x, "Z")),
                .names = "{.col}{.fn}")) %>%
  select(-c(ends_with(c("TS", "DT", "_HOLD", "PARAG", "START", "INST", "INSTR")), 
            contains(c("GEND", "AGE", "STAFFID", "SIG", "INSTART", "INDCONSTFDT",
                       "CONSTAT", "INFUTCON", "ASYBIOGT", "ASYBLST", "ASYFUTR")),
            starts_with(c("INCON", "INSTR", "PPRM", "INAGREEINST", "WIFENM", "NEWNAME", "PARTHHLINE", "SXPREPS", "CHILDMORE"))), 
         -c(DISCLOSE, CONDOMWHERE, CONDOMNOTEASYRSN, HIVTSTNVRRSN, CMETHOD,
                  ADHIVPREV, TBSYMPASSESS, CHRONICCOND, CHRONICMED,
            ADELIGEDT, INECONS, PPELIGSDT,
            CONFAGEY, CONFGEND, INEEMAN, INECHILD, ELIGSTAT_PARENT, PPGUARD, PPGUARDREL, PPCHEEMAN, PPCHEEMANC, PPCHEHEARING, PPCHEHEARACM, PPCHELANGACC,PPCHELANGACCYN, PPCHECOGN, PPCHADULT, PPCHECONS, ELIGSTAT_CHILD, CHEEMAN, CHEEMANC, CHEHEARING, CHEHEARACM, CHELANGACC, CHELANGACCYN, CHECOGN, CHECONS, CONSTAT_ADULT, INAGREEIN, CONSTAT_PARENT,  INFUTCONST2,
            PPCHCONSTAT, CONSTAT_CHILD, ASOAGREEST1, ASOAGREEST2, ASYBIOGTST2, ASYFUTRST2,ASYASCONFIRM, ASYBLSTST2, ASOCONSTAT, CONSTCHID, ENROLL, PTIDSCAN, PTIDCOMF, INPAPER, ENELIGTHX, CONFEMAN_INELIG, CONSTAT_INELIG, LIVETIMEDK, MONTHWHENM, MONTHWHENY, NEWHSELECT,  BIRTHDAY, BIRTHMON, BIRTHYR, HIVTFPOSM, HIVTFPOSY, HIVLASTNEGM, HIVLASTNEGY, HIVLASTNEGDK, HIVPOSPROVM, HIVPOSPROVY, HIVCLINIC_GV1, HIVCLINIC, HIVCLINICOTH, HIVCLM, HIVCLY, ARVFTM, ARVFTY, VLTESTLSTM, VLTESTLSTY, CERVCNTSM, CERVCNTSY, ARVAMDK, CORRECT_LN, IND_HHI_UUID_ORIG, DMFLAG, DMCOMMENT, HIVTESTM, HIVTESTY)) %>%
    # Fill NAs with no/missing where appropriate
 mutate(
  across(c(HIVSELFTST), ~replace_na(.x, 1)),
  across(c(ACCHEAR, ACCOM1, ACCLANG, ACCLANGYN, ACCCOGNITIVE, ASOAGREE, WORK12MO, WORK7DAYS, CURMAR,  
           starts_with("SUPPORT")), ~replace_na(.x, 2)),

  # numeric replacement
  across(where(is.numeric) & (starts_with("ARV") | starts_with("VL") | starts_with("MEDIN") |
                              starts_with("TB") | starts_with("CERCN") | starts_with("PART") | 
                              starts_with("CHILD")), ~replace_na(.x, 0)),

  # character replacement
  across(where(is.character) & (starts_with("ARV") | starts_with("VL") | starts_with("MEDIN") |
                                starts_with("TB") | starts_with("CERCN") | starts_with("PART") | 
                                starts_with("CHILD")), ~replace_na(.x, "missing")),

  # the rest
  across(c(SCHLCUR, SCHCOM, OUTREGIONTYPE, OUTREGIONWHRSP, OUTREGIONWHRC, MONTHOUTEVER, MONTHTIMES,
           WHEREOUTSP, WHEREOUTC, REASONAWAY, WORKIND, NORMWORK, NUMWIF, WIFLIVEEW, WIFEWHERE,
           HUSLIVEW, HUSOTWIF, HUSNWIF, CHILDA2012, PRGTWIN, LIVEB, ALCNUMDAY, ALCSIXMORE, CONDOMGET,
           ADTPSX, ADDISHIV, ADHIVSCHMTG, HFHIVTSTOFFER, HIVTSTLOCATION, HIVTSTRSN, HIVTSTRSLT,
           HIVCARE, HIVCNOTRSN, HIVCRLTC, CLINCHANGE, TRAVELTIME, TRAVELDIFF),
         ~replace_na(.x, 0))
)%>%
  # Combine don't know and missing values
  mutate(across(where(is.numeric), ~if_else(.x == -8, -9, .x))) %>%
  remove_empty(which = "cols") %>%
  # Remove variables with only one value
  select(where(function(col) length(unique(col)) > 1))
## Joining with `by = join_by(EA_HHID_LN_FIXED, AGEYEARS)`
# Bring in variables used for interview adjustment
# int_nr_data_in
int_nr_data_vars <- int_nr_data_in %>%
  mutate(SEX = as.double(SEX))

blood_nr_variables_selected <- blood_nr_variables %>%
  select(-EMANCIPATED, -roster_elig,-LIVEHERE) %>% 
  left_join(int_nr_data_vars) %>%
  # Remove any variables that aren't helpful
  select(-INDSTATUS) %>%
  # Replace remaining missings with -9
  mutate(across(where(is.numeric), ~as.factor(replace_na(.x, -9)))) %>%
  # Remove variables with only one value
  select(where(function(col) length(unique(col)) > 1))
## Joining with `by = join_by(EA_HHID_LN_FIXED, EA_HHID_FIXED, SEX)`
# Blood response variable
blood_response <- blood_nr_variables_selected %>%
  left_join(biomarker, by = c("EA_HHID_LN_FIXED" = "ea_hhid_ln_fixed")) %>%
  mutate(bt_status = if_else(hiv1statusfinalsurvey %in% c("Positive", "Negative"), 1, 2, missing = 2)) %>%
  select(EA_HHID_LN_FIXED, bt_status)

blood_nr_data_in <- blood_nr_variables_selected %>%
  left_join(blood_response)
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
names(blood_nr_data_in)
##   [1] "GRAPPE"              "EA_HHID_LN_FIXED"    "EA_HHID_FIXED"      
##   [4] "IND_EACODE"          "IND_SHH"             "IND_DISTRICT"       
##   [7] "IND_REGION"          "IND_DEPARTEMENT"     "IND_SOUSPREFID"     
##  [10] "IND_EACODE_ORIG"     "IND_SHH_ORIG"        "CODEMILIEU"         
##  [13] "NOM_MILIEU"          "IND_TEAM_ID"         "IND_HHMEM_ID"       
##  [16] "IND_HHI_UUID"        "IND_DEVICEID"        "IND_HUBUUID"        
##  [19] "DEVICEID_SECONDARY"  "INDFORMSTATUS"       "INDIVHHRCOLL"       
##  [22] "INSTATUSPW"          "INFORMSTATUSR"       "INFORMSTATUSREASON" 
##  [25] "CONSENTCHANGE"       "INDIVCNTSTATUS"      "IND0040"            
##  [28] "ELIGSTAT_ADULT"      "ACCHEAR"             "ACCOM1"             
##  [31] "ACCLANG"             "ACCLANGYN"           "ACCCOGNITIVE"       
##  [34] "PPGUARDRELOTH"       "INLANGADNAT"         "INLANGADNATOT"      
##  [37] "INLANGAD"            "INLANGADOT"          "ASOLANGNAT"         
##  [40] "ASOLANGNATOT"        "ASOLANG"             "ASOLANGOT"          
##  [43] "ASOAGREE"            "CONFEMAN"            "CONSTCHRESN1"       
##  [46] "CONSTCHRESN2"        "CONSTCHRESN3"        "CONSTCHRESN4"       
##  [49] "LNGNAT_LNG"          "LNGNAT_LNGOTH"       "LNGVINT_LNG"        
##  [52] "LNGVINT_LNGOTH"      "TRNSLUSE"            "SCHLAT"             
##  [55] "SCHLCUR"             "SCHENR"              "SCHCOM"             
##  [58] "LIVETIMENUM"         "LIVETIME"            "OUTREGIONTYPE"      
##  [61] "OUTREGIONWHRSP"      "OUTREGIONWHRC"       "OUTREGIONWHROTH"    
##  [64] "MONTHOUTEVER"        "MONTHTIMES"          "WHEREOUTCOMM"       
##  [67] "WHEREOUTSP"          "WHEREOUTC"           "WHEREOUTOTH"        
##  [70] "REASONAWAY"          "REASONAWATOTH"       "WORK12MO"           
##  [73] "WORK7DAYS"           "WORKIND"             "WORKINDOTH"         
##  [76] "NORMWORK"            "EVERMAR"             "CURMAR"             
##  [79] "NUMWIF"              "WIFLIVEEW"           "WIFEWHERE"          
##  [82] "HUSLIVEW"            "HUSOTWIF"            "HUSNWIF"            
##  [85] "LIVEB"               "CHILDA2012"          "PRGTWIN"            
##  [88] "REPCHNUM"            "PRGCARE"             "HIVTSBP"            
##  [91] "HIVPSBP"             "ARVFVST"             "HIVTPRG"            
##  [94] "HIVRTPG"             "ARVTKPG"             "HIVTSAD"            
##  [97] "HIVRTAD"             "PREGNANT"            "AVOIDPREG"          
## [100] "CMETHODOTH"          "PRGTWINPLUS1"        "PRGTWINPLUS2"       
## [103] "PRGTWINPLUS3"        "PRGTWINPLUS4"        "PRGTWINPLUS5"       
## [106] "CHILDALIVE1"         "CHILDALIVE2"         "CHILDALIVE3"        
## [109] "CHILDALIVE4"         "CHILDALIVE5"         "CHILDBRSTFD1"       
## [112] "CHILDBRSTFD2"        "CHILDBRSTFD3"        "CHILDBRSTFD4"       
## [115] "CHILDBRSTFD5"        "CHILDBRSTFDNOW1"     "CHILDBRSTFDNOW2"    
## [118] "CHILDBRSTFDNOW3"     "CHILDBRSTFDNOW4"     "CHILDBRSTFDNOW5"    
## [121] "CHTSTHIVBIRTH1"      "CHTSTHIVBIRTH2"      "CHTSTHIVBIRTH3"     
## [124] "CHTSTHIVBIRTH4"      "CHTSTHIVBIRTH5"      "CHTSTHIVRESULT1"    
## [127] "CHTSTHIVRESULT2"     "CHTSTHIVBRSTFD1"     "CHTSTHIVBRSTFD2"    
## [130] "CHTSTHIVRESULTLAST1" "CHTSTHIVRESULTLAST2" "CHILDLIVEW"         
## [133] "CHILDLIVEWNUM"       "CHILDNOTLIVEW"       "CHILDNOTLIVEWNUM"   
## [136] "CHILDTOTALCONF"      "CHILD19YR"           "CHILD14YR"          
## [139] "MCSTATUS"            "MCPLANS"             "MCWHOTRAD"          
## [142] "MCWHOMED"            "LIFETIMESEX"         "PART12MONUM"        
## [145] "PARTLIVEW1"          "PARTLIVEW2"          "PARTLIVEW3"         
## [148] "PARTINIT1"           "PARTINIT2"           "PARTINIT3"          
## [151] "PARTRECENT1"         "PARTRELATION1"       "PARTRELATION2"      
## [154] "PARTRELATION3"       "PARTRELATIONOTH1"    "PARTRELATIONOTH2"   
## [157] "PARTRELATIONOTH3"    "PARTLASTCNDM1"       "PARTLASTCNDM2"      
## [160] "PARTLASTCNDM3"       "PARTLASTETOH1"       "PARTLASTETOH2"      
## [163] "PARTLASTETOH3"       "PARTKNOWHIV1"        "PARTKNOWHIV2"       
## [166] "PARTKNOWHIV3"        "PARTHIVSAT1"         "PARTHIVSAT2"        
## [169] "PARTHIVSAT3"         "HFLAST12MO"          "HFHIVTSTOFFER"      
## [172] "HIVTSTEVER"          "HIVTSTNVRRSNOTH"     "HIVTSTLOCATION"     
## [175] "HIVTSTLOCATIONOTH"   "HIVTSTRSN"           "HIVTSTRSNOTH"       
## [178] "HIVTSTRSLT"          "HIVPOSPROV"          "HIVSELFTST"         
## [181] "DISCLOSEOTH"         "PRPEVRHDR"           "PREPEVER"           
## [184] "PREPCURNT"           "PREPWDTK"            "HIVCARE"            
## [187] "HIVCNOTRSN"          "HIVCRLTC"            "HIVCLINIC_GV2"      
## [190] "CLINCHANGE"          "TRAVELTIME"          "TRAVELDIFF"         
## [193] "ARVSTAKENEV"         "ARVSNOTTAKE"         "ARVSCURRENT"        
## [196] "ARVSNOTCURRSN"       "ARVSNOTCURRSNOTH"    "ARVLOC"             
## [199] "ARVAMTNUM"           "ARVAMT"              "ARVSWITCH"          
## [202] "ARVSWITCHWHY"        "ARVSWITCHWHYOTH"     "ARVINTERR"          
## [205] "ARVSMISSDAYS"        "VLTEST"              "VLTESTRESULT"       
## [208] "TBTPT"               "MEDINHCURR"          "MEDINHMONTHS"       
## [211] "TBCLINVISIT"         "TBCLINHIVTST"        "TBDIAGN"            
## [214] "TBTREATED"           "TBTTREATCURR"        "TBTREAT6MOFULL"     
## [217] "CERVCNTST"           "CERNCNRSLT"          "CERNCNTRT"          
## [220] "CERNCNTRTWHEN"       "HPVVACC"             "HPVVACCDOSE"        
## [223] "LITTLEINTEREST"      "DEPRESSED"           "ANXIETY"            
## [226] "WORRY"               "CHRONICCOND_OTH"     "CHRONICMED_OTH"     
## [229] "ALCFREQ"             "ALCNUMDAY"           "ALCSIXMORE"         
## [232] "CONDOMWHEREOTH"      "CONDOMGET"           "CONDOMNOTEASYRSNOTH"
## [235] "ADTPSX"              "ADDISHIV"            "ADHIVPREVOTH"       
## [238] "ADHIVSCHMTG"         "LOCPTID"             "LOCGPSPREP"         
## [241] "LOCGPS_STARTTIME"    "LOCGPS_SATELITES"    "LOCGPS_ACCURACY"    
## [244] "LOCGPS_READTIME"     "LOCGPS_ELAPSEDTIME"  "LOCGPS"             
## [247] "LOCINSTR1"           "LOCINSTR2"           "HIVCLINICC"         
## [250] "HIVCLINIC2"          "LOCPROV"             "LOCPROV2"           
## [253] "LOCFAC"              "LOCCNTPH"            "LOCOWSAPNR"         
## [256] "LOCMOBONR"           "LOCLANDONR"          "LOCOTHONR"          
## [259] "LOCPHPRF"            "LOCPHTM"             "LOCPHMSG"           
## [262] "LOCVISIT"            "LOCINSTR3"           "LOCOTHC"            
## [265] "LOCEND"              "LOCINSTR015"         "LOCINSTR115"        
## [268] "LOCINSTR215"         "HIVCLINIC215"        "LOCCNTPH15"         
## [271] "LOCOWSAPNR15"        "LOCMOBONR15"         "LOCLANDONR15"       
## [274] "LOCPHPRF15"          "LOCINSTR415"         "LOCPHTM15"          
## [277] "LOCPHMSG15"          "LOCVISIT15"          "LOCINSTR315"        
## [280] "LOCOTHC15"           "LOCEND15"            "HIVSTATUS"          
## [283] "PTIDS"               "CONFBC"              "BIOPTIDMATCH"       
## [286] "BIOPTIDMATCH2"       "SPECDATE"            "BIOTEAMID"          
## [289] "COLTYPEAD"           "BIOCLCTFL"           "HIVSTATUS_2T"       
## [292] "BIO2HIVTST1"         "BIO2HIVTST1LT"       "BIO2HIVTST1EX"      
## [295] "BIO2HIVTST1RS"       "BIO2HIVTST1LT2"      "BIO2HIVTST1EX2"     
## [298] "BIO2HIVTST1RS2"      "BIO2HIVTST2LT"       "BIO2HIVTST2EX"      
## [301] "BIO2HIVTST2RS"       "BIO2HIVTST2LT2"      "BIO2HIVTST2EX2"     
## [304] "BIO2HIVTST2RS2"      "BIO2HIVPOS"          "BIO2INVALID"        
## [307] "BIO2HIVIND"          "BIO2HIVNEG"          "BIO2COMMENT"        
## [310] "LTCSTATUS"           "LTCCNTSTATUS"        "LTCPTID"            
## [313] "LTCSTRT"             "LTCVRBL"             "LTCVRBLST1"         
## [316] "LTCVRBLST2"          "LTCHOW"              "LTCNOCONFIRM"       
## [319] "LTCPHPRF"            "LTCCONCONFIRM"       "LTCCONSTCHRESN2"    
## [322] "LTCCONSTCHRESN3"     "LTCCONSTCHNM"        "LTCCONSTCHID"       
## [325] "ARVSCURRENTINST1"    "ARVSCURRENTINST2"    "WITHDRAWAL"         
## [328] "WITHDRAWRSN"         "ENDSURVEY"           "ENDSUMMARY"         
## [331] "INDSTATUS_CALC"      "ENDTHANKS"           "ENDCOMMENT"         
## [334] "SEX"                 "KNOWN_HIV_STATUS"    "ELIGIBLE1"          
## [337] "DISCLOSE1"           "DISCLOSE2"           "DISCLOSE3"          
## [340] "DISCLOSE4"           "DISCLOSE5"           "DISCLOSE16"         
## [343] "DISCLOSE18"          "CONDOMWHERE1"        "CONDOMWHERE2"       
## [346] "CONDOMWHERE3"        "CONDOMWHERE4"        "CONDOMWHERE5"       
## [349] "CONDOMWHERE6"        "CONDOMWHERE16"       "CONDOMWHERE17"      
## [352] "CONDOMWHERE18"       "CONDOMNOTEASYRSN1"   "CONDOMNOTEASYRSN2"  
## [355] "CONDOMNOTEASYRSN3"   "CONDOMNOTEASYRSN4"   "CONDOMNOTEASYRSN5"  
## [358] "CONDOMNOTEASYRSN6"   "CONDOMNOTEASYRSN16"  "CONDOMNOTEASYRSN17" 
## [361] "CONDOMNOTEASYRSN18"  "ADHIVPREV1"          "ADHIVPREV2"         
## [364] "ADHIVPREV3"          "ADHIVPREV15"         "ADHIVPREV16"        
## [367] "ADHIVPREV17"         "ADHIVPREV18"         "TBSYMPASSESS1"      
## [370] "TBSYMPASSESS2"       "TBSYMPASSESS3"       "TBSYMPASSESS4"      
## [373] "TBSYMPASSESS5"       "CHRONICCOND1"        "CHRONICCOND2"       
## [376] "CHRONICCOND3"        "CHRONICCOND4"        "CHRONICCOND5"       
## [379] "CHRONICCOND6"        "CHRONICCOND7"        "CHRONICCOND9"       
## [382] "CHRONICCOND16"       "CHRONICCOND17"       "CHRONICCOND18"      
## [385] "CHRONICMED1"         "CHRONICMED2"         "CHRONICMED3"        
## [388] "CHRONICMED4"         "CHRONICMED5"         "CHRONICMED6"        
## [391] "CHRONICMED7"         "CHRONICMED9"         "CHRONICMED16"       
## [394] "CHRONICMED17"        "CHRONICMED18"        "HIVTSTNVRRSN1"      
## [397] "HIVTSTNVRRSN2"       "HIVTSTNVRRSN3"       "HIVTSTNVRRSN4"      
## [400] "HIVTSTNVRRSN5"       "HIVTSTNVRRSN6"       "HIVTSTNVRRSN7"      
## [403] "HIVTSTNVRRSN8"       "HIVTSTNVRRSN9"       "HIVTSTNVRRSN10"     
## [406] "HIVTSTNVRRSN11"      "HIVTSTNVRRSN12"      "HIVTSTNVRRSN16"     
## [409] "HIVTSTNVRRSN17"      "HIVTSTNVRRSN18"      "CMETHOD1"           
## [412] "CMETHOD2"            "CMETHOD3"            "CMETHOD4"           
## [415] "CMETHOD5"            "CMETHOD6"            "CMETHOD7"           
## [418] "CMETHOD8"            "CMETHOD9"            "CMETHOD10"          
## [421] "CMETHOD11"           "CMETHOD16"           "CMETHOD17"          
## [424] "H_AGETEENYEARS"      "ind_responded"       "HHEHEARING"         
## [427] "WATERSOURCE"         "WATERSOURCEOT"       "TOILETTYPE"         
## [430] "TOILETTYPEOT"        "TOILETSHARE"         "COOKINGFUEL"        
## [433] "COOKINGFUELOT"       "MATFLOOR"            "MATFLOOROT"         
## [436] "MATROOF"             "MATROOFOT"           "MATEXWALLS"         
## [439] "MATEXWALLSOT"        "ROOMSLEEP"           "OWNCOWNUM"          
## [442] "OWNGOATNUM"          "OWNCHIKNNUM"         "OWNDOGNUM"          
## [445] "OWNHORSENUM"         "OWNPIGNUM"           "ECONSUP12"          
## [448] "ECONSUP12OT"         "HHQITEMS1"           "HHQITEMS2"          
## [451] "HHQITEMS3"           "HHQITEMS4"           "HHQITEMS5"          
## [454] "HHQITEMS6"           "HHQITEMS7"           "HHQITEMS8"          
## [457] "HHQITEMS9"           "HHQOWN1"             "HHQOWN2"            
## [460] "HHQOWN3"             "HHQOWN4"             "HHQOWN5"            
## [463] "HHQOWN6"             "HHQOWN7"             "HHQOWN8"            
## [466] "HHQOWN9"             "HHQOWN11"            "LIVEHERE"           
## [469] "EMANCIPATED"         "SICK3MO"             "CHENSCH"            
## [472] "MOMALIVE"            "MOMHHM"              "FEMGUARDHHM"        
## [475] "DADALIVE"            "DADHHM"              "MALEGUARDHHM"       
## [478] "MOMSICK"             "MOMAIDS"             "DADSICK"            
## [481] "DADAIDS"             "VULNERABLE_CHILD"    "SUPPORTMED12"       
## [484] "SUPPORTEMOT12"       "SUPPORTEMOT3"        "SUPPORTMATER12"     
## [487] "SUPPORTMATER3"       "SUPPORTSOCIAL12"     "SUPPORTSOCIAL3"     
## [490] "SUPPORTSCHOL12"      "age_group"           "hh_age_group"       
## [493] "bt_status"
target_vars <- c("H_AGETEENYEARS", "SEX", "EA_HHID_FIXED", "EA_HHID_LN_FIXED", "bt_status",
  "ACCHEAR", "ACCLITER", "ACCVISUAL", "ADDISHIV", "ADHIVSCHMTG", "ADTPSX", 
  "AGE", "AGE_GROUP", "AGEMAR_GRP", "ALCFREQ", "ALCLOC", "ALCNUMDAY", 
  "ALCSIXMORE", "ANXIETY", "ARVAMT", "ARVAMTCOVID", "ARVAMTNUM", "ARVFTM", 
  "ARVFTY", "ARVFVST", "ARVINTERR", "ARVINTERRCOVID", "ARVLOC", "ARVLOCCOVID", 
  "ARVNRPG", "ARVSCURRENT", "ARVSMSISDAYS", "ARVSNOTCURRSN", "ARVSNOTTAKE", 
  "ARVSTAKENEV", "ARVSWITCH", "ARVSWITCHWHY", "ARVTKPG", "ASOLANG", "AVOIDPREG", 
  "BIO_RESPONDED", "BIRTHDAY", "BIRTHMON", "BIRTHYR", "BLOODTRANS", "CERNCRSLT", 
  "CERNCNTRT", "CERVCNTSM", "CERVCNTST", "CERVCNTSY", "CHEAGE", "CHEAGEC", 
  "CHEGEND", "CHEGENDC", "CHELITER", "CHENSCH", "CHEVISUAL", "CHILDA2012", 
  "CHILDALIVE1", "CHILDALIVE2", "CHILDALIVE4", "CHILDBRSTFD1", "CHILDBRSTFD2", 
  "CHILDBRSTFDNOW1", "CHILDBRSTFDNOW2", "CHILDBRSTFDNOW3", "CHILDBRSTFDNOW4", 
  "CHILDBRSTFDNOW5", "CHTSTHIVAGE1", "CHTSTHIVAGE2", "CHTSTHIVAGELAST1", 
  "CHTSTHIVAGELAST2", "CHTSTHIVAGELASTDK1", "CHTSTHIVAGELASTNUM1", 
  "CHTSTHIVAGELASTNUM2", "CHTSTHIVAGENUM1", "CHTSTHIVAGENUM2", "CHTSTHIVBIRTH1", 
  "CHTSTHIVBIRTH2", "CHTSTHIVBIRTH3", "CHTSTHIVBIRTH4", "CHTSTHIVBIRTH5", 
  "CHTSTHIVBRSTFD1", "CHTSTHIVBRSTFD2", "CHTSTHIVRESULT1", "CHTSTHIVRESULTLAST1", 
  "CLINCHANGE", "CONDOMGET", "CONFEMAN", "CONFEMAN_INELIG", "CONSTAT_ADULT", 
  "CONSTAT_CHILD", "CONSTAT_PARENT", "COOKINGFUEL", "CURMAR", "DADAIDS", 
  "DADALIVE", "DADHHM", "DADSICK", "DEATHAGEMO1", "DEATHAGEMO2", "DEATHAGEYR1", 
  "DEATHAGEYR2", "DEATHCOUNT", "DEPRESSED", "DRUG", "DWELLBELONG", "ECONSUPCOVID", 
  "ELIGSTAT_CHILD", "ELIGSTAT_PARENT", "EVERMAR", "FEMGUARDHHM", "FIRSTSXAGE_GRP", 
  "FIRSTSXAGEC", "HEPATITIS", "HFHIVTSTOFFER", "HFLAST12MO", "HH_AGEGROUP", 
  "HHEHEARACM", "HHELANG", "HIVCARE", "HIVCLM", "HIVCLY", "HIVCNOTRSN", 
  "HIVCRTLC", "HIVLASTNEGM", "HIVLASTNEGY", "HIVPOSROV", "HIVPOSROVM", 
  "HIVPOSROVY", "HIVPSBP", "HIVRTAD", "HIVRTPG", "HIVSELFTST", "HIVTESTM", 
  "HIVTESTY", "HIVTFPOSM", "HIVTFPOSY", "HIVTPRG", "HIVTSAD", "HIVTSBP", 
  "HIVTSTEVER", "HIVTSTLOCATION", "HIVTSTRSLT", "HIVTSTRSN", "HPVVACC", 
  "HUSLIVEW", "HUSNWIF", "HUSOTWIF", "INEEMAN", "INLANGAD", "LIFETIMESEX", 
  "LIGHTINGFUEL", "LITTLEINTEREST", "LIVEB", "LIVETIME", "LIVETIMEDK", 
  "LIVETIMENUM", "LNGNAT_LNG", "LNGVINT_LNG", "MALEGUARDHHM", "MATEXWALLS", 
  "MATFLOOR", "MATROOF", "MCAGE_GRP", "MCILLTYPE", "MCLOC", "MCPLANS", 
  "MCSTATUS", "MCWHO", "MEDINHCURR", "MEDINHMONTHS", "MOMAIDS", "MOMALIVE", 
  "MOMHHM", "MOMSICK", "MONTHOUTEVER", "MONTHTIMES", "MONTHWHENM", "MONTHWHENY", 
  "NORMWORK", "NUMWIF", "OUTREGIONTYPE", "OUTREGIONWHR", "PART12MONUM", 
  "PARTAGE1_GRP", "PARTAGE2_GRP", "PARTAGE3_GRP", "PARTGEND1", "PARTGEND2", 
  "PARTGEND3", "PARTHIVSAT1", "PARTHIVSAT2", "PARTHIVSAT3", "PARTKNOWHIV1", 
  "PARTKNOWHIV2", "PARTKNOWHIV3", "PARTLASTCNDM1", "PARTLASTCNDM2", "PARTLASTCNDM3", 
  "PARTLASTCNDMRSN1", "PARTLASTCNDMRSN2", "PARTLASTCNDMRSN3", "PARTLASTETOH1", 
  "PARTLASTETOH2", "PARTLASTETOH3", "PARTLIVEW1", "PARTLIVEW2", "PARTLIVEW3", 
  "PARTRECENT1", "PARTRELATION1", "PARTRELATION2", "PARTRELATION3", "PPGUARDREL", 
  "PREGNANT", "PREPCURNT", "PREPEVER", "PREPWDTK", "PRGCARE", "PRGSYTHOFFER1", 
  "PRGSYTHOFFER2", "PRGSYTHOFFER3", "PRGSYTHPOS1", "PRGSYTHPOS2", "PRGSYPTHTST1", 
  "PRGSYPTHTST2", "PRGSYPTHTST3", "PRGSYPTHTREAT1", "PRGTWIN", "PRGTWINPLUS1", 
  "PRGTWINPLUS2", "PRGTWINPLUS3", "PRPEVRHDR", "REASONAWAY", "RELATTOHH", 
  "ROOMSLEEP", "SCHCOM", "SCHCOMC1", "SCHLAT", "SCHLCUR", "SUPPORTEMOT12", 
  "SUPPORTEMOT3", "SUPPORTMATER12", "SUPPORTMED12", "SUPPORTSCHOL12", 
  "SUPPORTSOCIAL12", "TBCLINHIVTST", "TBCLINVISIT", "TBDIAGN", "TBTPT", 
  "TBTREAT6MOFULL", "TBTREATED", "TBTTREATCURR", "TOILETSHARE", "TOILETTYPE", 
  "TRAVELDIFF", "TRAVELTIME", "VARSTRAT", "VLTEST", "VLTESTLSTM", "VLTESTLSTY", 
  "VLTESTRESULT", "VULNERABLE_CHILD", "WATERSOURCE", "WHEREOUT", "WIFWHERE", 
  "WIFLIVEEW", "WORK12MO", "WORK7DAYS", "WORKIND", "WORRY", "WIFEWHERE", "PRGTWINPLUS5", "CHILDALIVE5", "TOILETTYPEOT", "COOKINGFUELOT","OUTREGIONWHRC", "WATERSOURCEOT", "HIVCRLTC", "CHTSTHIVRESULT2", "WHEREOUTC", "CHILDALIVE3", "MATEXWALLSOT", "CHILDBRSTFD5", "AGE_GROUP", "HIVPOSPROV",
"CHILDBRSTFD4", "CERNCNRSLT", "OUTREGIONWHRSP", "CHILDBRSTFD3", "PRGTWINPLUS4",
"HH_AGE_GROUP", "ARVSMISSDAYS", "HIVTSTLOCATIONOTH", "CHTSTHIVRESULTLAST2"

)




blood_nr_data_in <- blood_nr_data_in %>%
  select(any_of(target_vars))

names(blood_nr_data_in)
##   [1] "H_AGETEENYEARS"      "SEX"                 "EA_HHID_FIXED"      
##   [4] "EA_HHID_LN_FIXED"    "bt_status"           "ACCHEAR"            
##   [7] "ADDISHIV"            "ADHIVSCHMTG"         "ADTPSX"             
##  [10] "ALCFREQ"             "ALCNUMDAY"           "ALCSIXMORE"         
##  [13] "ANXIETY"             "ARVAMT"              "ARVAMTNUM"          
##  [16] "ARVFVST"             "ARVINTERR"           "ARVLOC"             
##  [19] "ARVSCURRENT"         "ARVSNOTCURRSN"       "ARVSNOTTAKE"        
##  [22] "ARVSTAKENEV"         "ARVSWITCH"           "ARVSWITCHWHY"       
##  [25] "ARVTKPG"             "ASOLANG"             "AVOIDPREG"          
##  [28] "CERNCNTRT"           "CERVCNTST"           "CHENSCH"            
##  [31] "CHILDA2012"          "CHILDALIVE1"         "CHILDALIVE2"        
##  [34] "CHILDALIVE4"         "CHILDBRSTFD1"        "CHILDBRSTFD2"       
##  [37] "CHILDBRSTFDNOW1"     "CHILDBRSTFDNOW2"     "CHILDBRSTFDNOW3"    
##  [40] "CHILDBRSTFDNOW4"     "CHILDBRSTFDNOW5"     "CHTSTHIVBIRTH1"     
##  [43] "CHTSTHIVBIRTH2"      "CHTSTHIVBIRTH3"      "CHTSTHIVBIRTH4"     
##  [46] "CHTSTHIVBIRTH5"      "CHTSTHIVBRSTFD1"     "CHTSTHIVBRSTFD2"    
##  [49] "CHTSTHIVRESULT1"     "CHTSTHIVRESULTLAST1" "CLINCHANGE"         
##  [52] "CONDOMGET"           "CONFEMAN"            "COOKINGFUEL"        
##  [55] "CURMAR"              "DADAIDS"             "DADALIVE"           
##  [58] "DADHHM"              "DADSICK"             "DEPRESSED"          
##  [61] "EVERMAR"             "FEMGUARDHHM"         "HFHIVTSTOFFER"      
##  [64] "HFLAST12MO"          "HIVCARE"             "HIVCNOTRSN"         
##  [67] "HIVPSBP"             "HIVRTAD"             "HIVRTPG"            
##  [70] "HIVSELFTST"          "HIVTPRG"             "HIVTSAD"            
##  [73] "HIVTSBP"             "HIVTSTEVER"          "HIVTSTLOCATION"     
##  [76] "HIVTSTRSLT"          "HIVTSTRSN"           "HPVVACC"            
##  [79] "HUSLIVEW"            "HUSNWIF"             "HUSOTWIF"           
##  [82] "INLANGAD"            "LIFETIMESEX"         "LITTLEINTEREST"     
##  [85] "LIVEB"               "LIVETIME"            "LIVETIMENUM"        
##  [88] "LNGNAT_LNG"          "LNGVINT_LNG"         "MALEGUARDHHM"       
##  [91] "MATEXWALLS"          "MATFLOOR"            "MATROOF"            
##  [94] "MCPLANS"             "MCSTATUS"            "MEDINHCURR"         
##  [97] "MEDINHMONTHS"        "MOMAIDS"             "MOMALIVE"           
## [100] "MOMHHM"              "MOMSICK"             "MONTHOUTEVER"       
## [103] "MONTHTIMES"          "NORMWORK"            "NUMWIF"             
## [106] "OUTREGIONTYPE"       "PART12MONUM"         "PARTHIVSAT1"        
## [109] "PARTHIVSAT2"         "PARTHIVSAT3"         "PARTKNOWHIV1"       
## [112] "PARTKNOWHIV2"        "PARTKNOWHIV3"        "PARTLASTCNDM1"      
## [115] "PARTLASTCNDM2"       "PARTLASTCNDM3"       "PARTLASTETOH1"      
## [118] "PARTLASTETOH2"       "PARTLASTETOH3"       "PARTLIVEW1"         
## [121] "PARTLIVEW2"          "PARTLIVEW3"          "PARTRECENT1"        
## [124] "PARTRELATION1"       "PARTRELATION2"       "PARTRELATION3"      
## [127] "PREGNANT"            "PREPCURNT"           "PREPEVER"           
## [130] "PREPWDTK"            "PRGCARE"             "PRGTWIN"            
## [133] "PRGTWINPLUS1"        "PRGTWINPLUS2"        "PRGTWINPLUS3"       
## [136] "PRPEVRHDR"           "REASONAWAY"          "ROOMSLEEP"          
## [139] "SCHCOM"              "SCHLAT"              "SCHLCUR"            
## [142] "SUPPORTEMOT12"       "SUPPORTEMOT3"        "SUPPORTMATER12"     
## [145] "SUPPORTMED12"        "SUPPORTSCHOL12"      "SUPPORTSOCIAL12"    
## [148] "TBCLINHIVTST"        "TBCLINVISIT"         "TBDIAGN"            
## [151] "TBTPT"               "TBTREAT6MOFULL"      "TBTREATED"          
## [154] "TBTTREATCURR"        "TOILETSHARE"         "TOILETTYPE"         
## [157] "TRAVELDIFF"          "TRAVELTIME"          "VLTEST"             
## [160] "VLTESTRESULT"        "VULNERABLE_CHILD"    "WATERSOURCE"        
## [163] "WIFLIVEEW"           "WORK12MO"            "WORK7DAYS"          
## [166] "WORKIND"             "WORRY"               "WIFEWHERE"          
## [169] "PRGTWINPLUS5"        "CHILDALIVE5"         "TOILETTYPEOT"       
## [172] "COOKINGFUELOT"       "OUTREGIONWHRC"       "WATERSOURCEOT"      
## [175] "HIVCRLTC"            "CHTSTHIVRESULT2"     "WHEREOUTC"          
## [178] "CHILDALIVE3"         "MATEXWALLSOT"        "CHILDBRSTFD5"       
## [181] "HIVPOSPROV"          "CHILDBRSTFD4"        "CERNCNRSLT"         
## [184] "OUTREGIONWHRSP"      "CHILDBRSTFD3"        "PRGTWINPLUS4"       
## [187] "ARVSMISSDAYS"        "HIVTSTLOCATIONOTH"   "CHTSTHIVRESULTLAST2"
# Make the appropriate variables numeric and the rest factors.
# This is required for the BART model fitting to handle the 
# data correctly.
nr_data_in_bio <- blood_nr_data_in %>%
  select(H_AGETEENYEARS, SEX, EA_HHID_FIXED, EA_HHID_LN_FIXED, bt_status, everything()) %>%
  arrange(H_AGETEENYEARS, SEX, EA_HHID_FIXED, EA_HHID_LN_FIXED) %>%
  mutate(response_bio = if_else(bt_status == 1, 1, 0)) %>%
  select(-bt_status) %>%
  mutate(across(where(is.numeric), ~if_else(as.numeric(.x) > 0, 1, 0)),
         across(-response_bio, as_factor))

# For variables which are already two-leveled
bio_binary_vars <- nr_data_in_bio %>% 
  select(EA_HHID_LN_FIXED, where(function(col) length(unique(col)) == 2), -response_bio) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.numeric)) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), ~if_else(.x == 2, 0, .x))) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.factor))

# Non-binary variables
bio_nr_data_matrix_in <- nr_data_in_bio %>%
  select(where(function(col) length(unique(col)) != 2), response_bio) %>%
  select(-contains("EA_HHID"))

dmy_bio_nr_data <- dummyVars("response_bio ~ .", data = bio_nr_data_matrix_in)
df_bio_nr_data <- bio_nr_data_matrix_in

model_matrix_nr_bio <-predict(dmy_bio_nr_data, newdata = df_bio_nr_data) %>%
  as_tibble() %>%
  # mutate(H_AGETEENYEARS = if_else(H_AGETEENYEARS.2 == 1, 2, 1),
  #        SEX = if_else(SEX.2 == 1, 2, 1)) %>%
  # select(-H_AGETEENYEARS.1, -H_AGETEENYEARS.2, -SEX.1, -SEX.2) %>%
  cbind(response_bio = nr_data_in_bio$response_bio) %>%
  cbind(EA_HHID_LN_FIXED = nr_data_in_bio$EA_HHID_LN_FIXED) %>%
  left_join(bio_binary_vars) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, response_bio), as.factor))
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
variable_response_props_bio <- model_matrix_nr_bio %>%
  pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, response_bio), 
               names_to = "variable", values_to = "value") %>%
  group_by(H_AGETEENYEARS, SEX, variable, value) %>%
  summarise(respondents = sum(response_bio),
            totalN = n(),
            rrate = respondents / totalN) %>%
  mutate(category_proportion = totalN / sum(totalN))
## `summarise()` has grouped output by 'H_AGETEENYEARS', 'SEX', 'variable'. You
## can override using the `.groups` argument.
bio_nr_variables_dropped <- variable_response_props_bio %>%
  mutate(drop = if_else(!grepl("age", variable) & 
                          (totalN < 25 | category_proportion < 10/100), 1, NA_real_)) %>%
  filter(drop == 1)

final_bio_nr_model_matrix <- model_matrix_nr_bio %>%
  pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, response_bio), 
               names_to = "variable", values_to = "value") %>%
  left_join(select(bio_nr_variables_dropped, H_AGETEENYEARS, SEX, variable, drop)) %>%
  filter(is.na(drop)) %>%
  pivot_wider(id_cols = c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, response_bio), 
              names_from = variable, values_from = value) %>%
  rename_with(~gsub(" ", ".", .x, fixed = TRUE)) %>%
  rename_with(~gsub("-", ".", .x, fixed = TRUE)) %>%
  mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, response_bio), 
                 ~replace_na(as.numeric(.x), 0)))
## Joining with `by = join_by(H_AGETEENYEARS, SEX, variable)`
## Model by sex & teenage years variables


# Function to fit a single model
# Input needs to contain the predictors and the response variable
fit_bart_model_bio <- function(data, nskip = 10, ndpost = 20) {

  predictor_matrix <- data %>%
    select(where(function(col) length(unique(col)) > 1)) %>%
    remove_empty(which = "cols") %>%
    select(-c(EA_HHID_LN_FIXED, response_bio)) %>%
    data.frame()
  
  response_bio <- data$response_bio

  # bartFit <- pbart(predictor_matrix, response_bio, binaryOffset = mean(response_bio),
  #                nskip = nskip, ndpost = ndpost)
  
  bartFit <- lbart(predictor_matrix, response_bio, binaryOffset = mean(response_bio),
                 nskip = nskip, ndpost = ndpost)
}


# Function to just output the fitted response probabilities
get_response_probabilities_bio <- function(data) {

  bartFit <- fit_bart_model_bio(data)

  response_bio <- data$response_bio
  response_probability_bio <- bartFit$prob.train.mean

  tibble(data$EA_HHID_LN_FIXED, response_bio, response_probability_bio)
}

nested_bio <- final_bio_nr_model_matrix %>%
  group_by(H_AGETEENYEARS, SEX) %>%
  nest() %>%
  mutate(bart = lapply(data, fit_bart_model_bio)) 
## *****Into main of lbart
## *****Data:
## data:n,p,np: 886, 104, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 10, 20
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,104,0
## *****nkeeptrain,nkeeptest: 20, 20
## *****printevery: 100
## 
## MCMC
## done 0 (out of 30)
## time: 1s
## check counts
## trcnt,tecnt: 20,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 1093, 105, 0
## y1,yn: 1, 1
## x1,x[n*p]: 1.000000, 2.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 10, 20
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,105,0
## *****nkeeptrain,nkeeptest: 20, 20
## *****printevery: 100
## 
## MCMC
## done 0 (out of 30)
## time: 1s
## check counts
## trcnt,tecnt: 20,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 10531, 127, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 10, 20
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,127,0
## *****nkeeptrain,nkeeptest: 20, 20
## *****printevery: 100
## 
## MCMC
## done 0 (out of 30)
## time: 9s
## check counts
## trcnt,tecnt: 20,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 11948, 153, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 10, 20
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,153,0
## *****nkeeptrain,nkeeptest: 20, 20
## *****printevery: 100
## 
## MCMC
## done 0 (out of 30)
## time: 14s
## check counts
## trcnt,tecnt: 20,0
#export(nested_bio, "nested_bio_model_updated.Rdata")

#nested_bio <- import("nested_bio_model_updated.Rdata")

response_probs_bio <- nested_bio %>%
  mutate(response_probability_bio = lapply(bart, function(x) x$prob.train.mean)) %>%
  select(response_probability_bio, data) %>%
  unnest(cols = c(data, response_probability_bio)) %>%
  select(H_AGETEENYEARS, SEX, EA_HHID_LN_FIXED, response_bio, response_probability_bio) %>%
  ungroup()
## Adding missing grouping variables: `H_AGETEENYEARS`, `SEX`
# nested_bio <- final_bio_nr_model_matrix %>%
#   group_by(H_AGETEENYEARS, SEX) %>%
#   nest() %>%
#   mutate(bart = lapply(data, fit_response_probabilities_bio))
# 
# # Plots and analysis
# response_probs_bio <- nested_bio %>%
#   select(-data) %>%
#   unnest(cols = bart) %>%
#   rename(EA_HHID_LN_FIXED = `data$EA_HHID_LN_FIXED`)

nr_adj_bio_weights <- response_probs_bio %>%
  mutate(H_AGETEENYEARS = factor(H_AGETEENYEARS),
         SEX = as.numeric(SEX)) %>%
  left_join(trimmed_int_weights) %>%
  mutate(nr_adj_bio_wgt = if_else(response_bio == 1, trmpnr1w0 / response_probability_bio, NA_real_))
## Joining with `by = join_by(H_AGETEENYEARS, SEX, EA_HHID_LN_FIXED)`
nr_adj_dataset_bio <- nr_data_in_bio %>%
  select(-H_AGETEENYEARS, -SEX) %>%
  left_join(nr_adj_bio_weights, by = c("EA_HHID_LN_FIXED" = "EA_HHID_LN_FIXED"))

# Plot of distribution
response_probs_bio %>% 
  ggplot(aes(x = response_probability_bio, fill = factor(response_bio))) +
  geom_density(alpha = 0.5) +
  theme_bw()

response_probs_bio %>% 
  ggplot(aes(x = factor(response_bio), y = response_probability_bio, fill = factor(response_bio))) +
  geom_violin() +
  #geom_boxplot() +
  theme_bw() +
  scale_y_continuous(limits = c(0.5,1))

response_probs_bio %>% 
  ungroup() %>%
  filter(response_bio == 1) %>% 
  select(response_probability_bio) %>%
  summary()
##  response_probability_bio
##  Min.   :0.4585          
##  1st Qu.:0.9211          
##  Median :0.9532          
##  Mean   :0.9345          
##  3rd Qu.:0.9706          
##  Max.   :0.9981

3.5 Triming biomaker weights

civ2_svy_bio <- nr_adj_bio_weights %>%
  filter(!is.na(nr_adj_bio_wgt)) %>%
  as_survey_design(ids = EAID,
                   weights = nr_adj_bio_wgt)

summary(weights(civ2_svy_bio))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   28.09  479.33  896.90  854.96 1150.84 4610.25
max(weights(civ2_svy_bio))/median(weights(civ2_svy_bio))
## [1] 5.140232
civ2_svy_trimmed_bio <- trimWeights(civ2_svy_bio, 
                                      lower = -Inf,
                                      upper = 20.5 * median(weights(civ2_svy_bio)),
                                      #upper = 3.5 * median(weights(civ2_svy_bio)),

                                      strict = TRUE)

summary(weights(civ2_svy_trimmed_bio))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   28.09  479.33  896.90  854.96 1150.84 4610.25
trimmed_bio_weights <- tibble(civ2_svy_trimmed_bio$variables, trmbtnr1w0 = weights(civ2_svy_trimmed_bio))
max(weights(civ2_svy_trimmed_bio))/median(weights(civ2_svy_trimmed_bio))
## [1] 5.140232

3.6 Postratification of biomaker weights

# Individual reference totals already computed above 
# and stored in individual_reference_totals

# summarise actual totals
biowt_totals <- trimmed_bio_weights %>%
  filter(!is.na(trmbtnr1w0)) %>%
  mutate(cod_distric = str_sub(EA_HHID_FIXED, 3, 4)) %>% 
  mutate(age_group = case_when(between(CONFAGEY, 15, 19) ~ "15-19",
                               between(CONFAGEY, 20, 24) ~ "20-24",
                               between(CONFAGEY, 25, 29) ~ "25-29",
                               between(CONFAGEY, 30, 34) ~ "30-34",
                               between(CONFAGEY, 35, 39) ~ "35-39",
                               between(CONFAGEY, 40, 44) ~ "40-44",
                               between(CONFAGEY, 45, 49) ~ "45-49",
                               between(CONFAGEY, 50, 54) ~ "50-54",
                               between(CONFAGEY, 55, 59) ~ "55-59",
                               between(CONFAGEY, 60, 64) ~ "60-64",
                               CONFAGEY >= 65            ~ "65+",
                               TRUE ~ "ERROR"),
         sex = SEX) %>% 
  group_by(cod_distric, sex, age_group) %>%
  summarise(totalNBio = n(),
            totalWgtNBio = sum(trmbtnr1w0))
## `summarise()` has grouped output by 'cod_distric', 'sex'. You can override
## using the `.groups` argument.
#compute adjustment factor
bio_poststrat_adj <- biowt_totals %>%
  left_join(individual_reference_totals_adj) %>%
  mutate(bio_postrat_adj_factor = pop / totalWgtNBio)
## Joining with `by = join_by(cod_distric, sex, age_group)`
# join to indiv level file and calculate weight
poststrat_bio_wgts <- trimmed_bio_weights %>% 
  mutate(cod_distric = str_sub(EA_HHID_FIXED, 3, 4)) %>% 
  mutate(age_group = case_when(between(CONFAGEY, 15, 19) ~ "15-19",
                               between(CONFAGEY, 20, 24) ~ "20-24",
                               between(CONFAGEY, 25, 29) ~ "25-29",
                               between(CONFAGEY, 30, 34) ~ "30-34",
                               between(CONFAGEY, 35, 39) ~ "35-39",
                               between(CONFAGEY, 40, 44) ~ "40-44",
                               between(CONFAGEY, 45, 49) ~ "45-49",
                               between(CONFAGEY, 50, 54) ~ "50-54",
                               between(CONFAGEY, 55, 59) ~ "55-59",
                               between(CONFAGEY, 60, 64) ~ "60-64",
                               CONFAGEY >= 65            ~ "65+",
                               TRUE ~ "ERROR"),
         sex = SEX) %>% 
  select(-starts_with("PopUN")) %>%
  left_join(bio_poststrat_adj, by = c("cod_distric", "sex", "age_group")) %>%
  mutate(btwt0 = bio_postrat_adj_factor * trmbtnr1w0)

mycols <- c("steelblue", "salmon")  # or any other 2 colors you prefer

bio_pstrt_adj_plot <- bio_poststrat_adj %>% 
  ggplot(aes(x = as_factor(age_group), 
             y = bio_postrat_adj_factor,
             fill = factor(sex, levels = c(1,2), labels = c("Male", "Female")))) +
  #geom_boxplot() +
  geom_col(position = position_dodge()) +
  facet_wrap(~cod_distric) +
  #scale_color_manual(values = mycols[1:2]) +
  scale_fill_manual(values = mycols[1:2]) +
  theme_bw() + 
  labs(x = "Age Group",
       y = "Biomarker Poststratification Factor",
       title = "Biomarker Poststratification Adjustment by Sex and Age Group",
       fill = "Sex")+
   theme(axis.text.x = element_text(angle = 90, hjust = 1))

mycols <- c(
  "#1f78b4", "#33a02c", "#e31a1c", "#ff7f00", "#6a3d9a",
  "#a6cee3", "#b2df8a", "#fb9a99", "#fdbf6f", "#cab2d6",
  "#ffff99", "#b15928", "#8dd3c7", "#fccde5"
)


bio_pstrt_adj_plot

bio_pstrt_weight_plot <- poststrat_bio_wgts %>% 
  ggplot(aes(x = as_factor(EA_HHID_FIXED), 
             y = btwt0,
             fill = cod_distric)) +
  geom_boxplot() +
  #geom_col() +
  scale_fill_manual(values = mycols) +
  theme_bw() + 
  labs(x = "EA Number",
       y = "Final Biomarker Weight",
       title = "Final Biomarker Weight Distribution by EA",
       fill = "Region")

bio_pstrt_weight_plot

# poststrat_bio_wgts %>% 
#   ggplot(aes(x = intwt0, y = btwt0, color = factor(SEX), shape = factor(H_AGETEENYEARS))) + 
#   geom_point(alpha = 0.5) +
#   coord_equal()
# 
# # Max weights vs. median (to determine if trimming may be needed)
# # And design effects
# max(poststrat_bio_wgts$nr_adj_int_wgt, na.rm = TRUE) / median(poststrat_bio_wgts$nr_adj_int_wgt, na.rm = TRUE)
# max(poststrat_bio_wgts$intwt0, na.rm = TRUE) / median(poststrat_bio_wgts$intwt0, na.rm = TRUE)
# PracTools::deffK(filter(poststrat_bio_wgts, !is.na(intwt0))$intwt0)
# 
# max(poststrat_bio_wgts$btwt0, na.rm = TRUE) / median(poststrat_bio_wgts$btwt0, na.rm = TRUE)
# PracTools::deffK(filter(poststrat_bio_wgts, !is.na(btwt0))$btwt0)
# save.image("C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Weighting/my_workspace.RData")
#load("my_workspace.RData")

4 Variance estimation, jacknife replicate weights

4.1 Setup of Variance Strata and Units

civ2_zd_in <- read_excel(str_c(civ2_sample_dir,"CIPHIA2_2nd_stage_sample_09_07_2024_v4.xlsx")) %>%
  clean_names() %>% 
  distinct(ea_code, .keep_all=TRUE) 


##############################
# On this section we are creating our 
# Varstrat and varunits
################################

# vous souhaitez mettre les ZD par 2, ou par paires, 
# rappelez-vous lors de l'atelier de pondération, l'idée est de jumeler les ZD

# you want to put the ZDs into 2, or pairs, remember 
# during the workshop on weighting, idea is to pair the ZDs

# Tacking 3s just on the end
civ2_varunits <- civ2_zd_in %>% # shift control M = shortcut for pipe  %>%
  mutate(psuid = format(ea_code, scientific = FALSE)) %>%  # mutate est creer nouvelle variable
  select(cod_reg, nom_reg, grappe, psuid, geovar_ea) |> 
  arrange(cod_reg, grappe) |> 
 # filter(id_province!=1)|> # when using filter please double ==
  group_by(nom_reg) |> 
  mutate(varunit = 1 + 
           case_when(row_number() == n() & row_number() %% 2 != 0 ~ 2,
                     .default = (row_number() - 1) %% 2)) %>% 
  rename(strata=nom_reg) %>% 
  ungroup()

#the last part of the code consists of generating the pairs
#modular function, if I split something 
#by 2 and the answer is not zero so I put 2.

#la dernière partie du code consiste à générer les paires
#fonction modulaire, si je divise quelque chose 
#par 2 et que la réponse n'est pas zéro alors je mets 2.

civ2_varstrats <- civ2_varunits |> 
  filter(varunit == 1) |> 
  select(strata, psuid, varunit, geovar_ea) |> 
  mutate(varstrat = row_number())

civ2_varstrat_varunit <- civ2_varunits |> 
  left_join(civ2_varstrats) |> 
  fill(varstrat)
## Joining with `by = join_by(strata, psuid, geovar_ea, varunit)`
#the above two pieces of code is to join the varstrat and varunits together
#les deux morceaux de code ci-dessus doivent joindre le varstrat et les varunits ensemble

4.1.1 Random Deletion Method for Replicates

set.seed(877014)

# Random deletions
# here we randomly delete 1 EA from each pair of enumeration areas
deleted <- civ2_varstrat_varunit |> 
  group_by(varstrat) |> 
  sample_n(1)


# Base weight multiplier and deletion flag
civ2_base_weight_mult <- civ2_varstrat_varunit |> 
  group_by(varstrat) |> 
  mutate(psu_wt_mult0 = 1,
         deletion_factor = if_else(psuid %in% deleted$psuid, 0, n() / (n() - 1))) 

4.1.2 Construction of Replicate Multipliers

# Make one replicate
make_base_rep <- function(data, rep_number) {
  
  wtname <- str_pad(rep_number, 3, pad = 0)
  
  data |> 
    ungroup() |> 
    mutate("psu_wt_mult{wtname}" := 
             case_when(varstrat == rep_number ~ psu_wt_mult0 * deletion_factor,
                       .default = psu_wt_mult0)) |> 
    select(starts_with("psu_wt_mult"), -psu_wt_mult0)
  
}

# group_by(varstrat):
#   
#   Groups the dataset civ2_varstrat_varunit by the column varstrat. 
#Grouping allows operations within each unique varstrat value.
# mutate(psu_wt_mult0 = 1):
#   Creates a new column psu_wt_mult0 initialized with a constant value of 1. 
#This serves as a base multiplier for weights.
# mutate(deletion_factor = ...):
#   Creates another new column deletion_factor. This determines how weights should be adjusted based on whether the primary sampling unit (ZD) was "deleted."
# Condition (if_else(psuid %in% deleted$psuid, ...)):
#   If the current psuid exists in the psuid column of the deleted dataset, the deletion_factor is set to 0 (indicating deletion).
# Otherwise, deletion_factor adjusts for the reduced sample size when a PSU is excluded (ensuring proper weight redistribution).


# comments in french
# group_by(varstrat):
# Regroupe l'ensemble de données civ2_varstrat_varunit par la colonne varstrat. 
#Grouping permet des opérations au sein de chaque valeur varstrat unique.
# muter(psu_wt_mult0 = 1) :
#   
# Crée une nouvelle colonne psu_wt_mult0 initialisée avec une valeur constante de 1. 
#Cela sert de multiplicateur de base pour les poids.
# muter(deletion_factor = ...) :
#   
# Crée une autre nouvelle colonne delete_factor. Cela détermine la manière dont les poids doivent être ajustés selon que l'unité d'échantillonnage primaire (UPE) a été « supprimée » ou non.
# Condition (if_else(psuid %in% supprimé$psuid, ...)) :
# Si le psuid actuel existe dans la colonne psuid de l'ensemble de données supprimé, le deletion_factor est défini sur 0 (indiquant la suppression).
# Sinon, deletion_factor s'ajuste à la taille réduite de l'échantillon lorsqu'une PSU est exclue (garantissant une redistribution appropriée du poids).

test <- make_base_rep(civ2_base_weight_mult, 2)

civ2_mult_wts_out <- lapply(seq(1:max(civ2_base_weight_mult$varstrat)), 
                           \(x) make_base_rep(civ2_base_weight_mult, x)) |> 
  reduce(cbind)

# Apply the `make_base_rep` function to each value in the sequence from 1 to the maximum value of `varstrat` 
# in the `civ2_base_weight_mult` dataframe. This is achieved using `lapply`, where `seq(1:max(civ2_base_weight_mult$varstrat))`
# generates the sequence of numbers for each stratum (varstrat). 

# The function `make_base_rep` is executed for each stratum (`x`), and the output for all strata is collected 
# into a list. The lambda syntax (\(x)) is used for a concise inline function.

# The resulting list of outputs from `lapply` is then combined into a single dataframe or matrix by reducing
# the list with `cbind` (column binding). This ensures all outputs are aligned as columns.

# Finally, the entire pipeline is stored in the object `civ2_mult_wts_out`.


# Appliquer la fonction `make_base_rep` à chaque valeur de la séquence de 1 à la valeur maximale de `varstrat` 
# dans le dataframe `civ2_base_weight_mult`. Ceci est réalisé en utilisant `lapply`, où `seq(1:max(civ2_base_weight_mult$varstrat))`
# génère la séquence de nombres pour chaque strate (varstrat). 

# La fonction `make_base_rep` est exécutée pour chaque strate (`x`) et la sortie pour toutes les strates est collectée 
# dans une liste. La syntaxe lambda (\(x)) est utilisée pour une fonction en ligne concise.

# La liste résultante des sorties de `lapply` est ensuite combinée en une seule trame de données ou matrice en réduisant
# la liste avec `cbind` (reliure de colonnes). Cela garantit que toutes les sorties sont alignées sous forme de colonnes.

# Enfin, l'intégralité du pipeline est stockée dans l'objet `civ2_mult_wts_out`.

psu_rep_mults <- cbind(civ2_base_weight_mult, civ2_mult_wts_out) %>% 
  select(-c(cod_reg, strata))

psu_basewt_reps <- psu_base_weights |> 
  select(-grappe) %>% 
  left_join(psu_rep_mults) |> 
  mutate(across(c(starts_with("psu_wt_mult"), -psu_wt_mult0), \(x) psuwt0 * x, 
                .names = "newwt{.col}")) |> 
  rename_with(.cols = contains("newwt"), 
              \(x) gsub("newwtpsu_wt_mult", "psuwt", x)) 
## Joining with `by = join_by(geovar_ea)`

4.1.3 Replicate function example

# Use dataset file from the computation of the 
# original hhbwt0 weight variable as input
psuwt_reps <- psu_basewt_reps |> 
  select(geovar_ea,varunit,varstrat, starts_with("psuwt")) |> 
  mutate(EAID = paste0("CI", geovar_ea)) 

input <- final_hh_weights %>% 
  left_join(psuwt_reps)
## Joining with `by = join_by(EAID)`
# Function which outputs the specified number 
# replicate weight
compute_hhbwt_rep <- function(data, rep_number) {
  
  # Padded weight number suffix, eg. 3 becomes "003"
  wtnum <- str_pad(rep_number, 3, pad = 0)
  
  # Name of the replicate weight to be used
  # eg. "psuwt003"
  repwt <- str_c("psuwt", wtnum)
  
  input |> 
    # Create variable for the replicate weight to make this more readable
    mutate(psuwt_rep = .data[[repwt]],
           # Compute weight using replicate and name with replicate number
           "hhbwt{wtnum}" := 1/ ( (1 / psuwt_rep) * (psuwt0/hhbwt0))) |>
    # Select just the replicate weight variable
    select(str_c("hhbwt", wtnum))
  
}

# Generate all the weights and join together by looping through
# from 1 to the number of varstrats
civ_hhbwt_reps <- lapply(seq(1:max(psuwt_reps$varstrat)), 
                           \(x) compute_hhbwt_rep(input, x)) |> 
  reduce(cbind)


rep_weights <- lapply(seq(1:max(psuwt_reps$varstrat)), \(x) compute_hhbwt_rep(input, x))

rep_weights_df <- as.data.frame(rep_weights)
names(rep_weights_df) <- paste0("hhbwt", str_pad(seq_along(rep_weights_df), 3, pad = "0"))

hhbwts_reps <- bind_cols(input, rep_weights_df) %>% 
  select(-c(HHI_DISTRICT, hhstatus,hh_adj_one,hh_wt_one,hh_adj_two,geovar_ea, contains("psuwt")))

4.2 Household replicate weights

# Added by GR
# Phase one
hh_adj_rep_in <- hhbwts_reps |> 
  left_join(select(civ2_hh, EA_HHID_FIXED, HHI_EACODE, HHI_DISTRICT)) |> 
  arrange(varstrat, varunit) |> 
  group_by(HHI_EACODE) |> 
  ungroup()
## Joining with `by = join_by(EA_HHID_FIXED)`
hhwt_rep_adj <- function(weight_vec) {
  
  #weight_vec <- hh_adj_rep_in$hhbwt001
  
  # Phase 1
  hh_uknwn_adj <- hh_adj_rep_in %>%
    mutate(repwt = weight_vec) |> 
    select(HHI_EACODE, EA_HHID_FIXED, HHI_DISTRICT, HH_STATUS, varstrat, varunit, repwt) %>%
    group_by(HHI_EACODE) %>% 
    mutate(
      # Calculate adjustment
      hh_adj_one = sum(repwt) / ( sum(repwt * (HH_STATUS == 1)) + 
                                    sum(repwt * (HH_STATUS == 2)) + 
                                    sum(repwt * (HH_STATUS == 3))),
      # Calculate weights 
      hh_wt_one = case_when(repwt == 0 ~ 0,
                            HH_STATUS %in% c(1, 2) ~ hh_adj_one * repwt,
                            HH_STATUS %in% c(3, 4) ~ 0),
      hh_adj_two = sum(hh_wt_one) / ( sum(hh_wt_one * (HH_STATUS == 1))),
      hh_wt_two = if_else(HH_STATUS == 1 & repwt > 0, 
                          hh_adj_two * hh_wt_one,
                          0)) %>% 
    ungroup()
  
  # test <- hh_uknwn_adj |> 
  #   filter(varstrat == 1)
  
  # Return final 'hhwt'
  return(hh_uknwn_adj$hh_wt_two)

}

# Quick test
# test <- hh_adj_rep_in |> 
#   left_join(select(final_hh_weights, EA_HHID_FIXED, hhwt0)) |> 
#   mutate(hhwt001_new = hhwt_rep_adj(hhbwt001),
#          hhwt002_new = hhwt_rep_adj(hhbwt002)) |> 
#   arrange(varstrat, varunit) |> 
#   select(ends_with("new"), everything())
# 
# test2 <- test |> 
#   filter(varstrat == 1)

# All reps
final_hh_weights <- hh_adj_rep_in |> 
  mutate(across(matches("^hhbwt\\d{3}$"), hhwt_rep_adj, .names = "adj_{.col}"))

final_hh_weights_rename <- final_hh_weights |> 
  arrange(varstrat, varunit) |> 
  rename_with(\(x) str_replace(x, "adj_hhbwt", "hhwt"))



# Random sample of households to check
# hh_sample <- final_hh_weights_rename |> 
#   slice_sample(n = 10)
# 
# hh_sample_long <- hh_sample |> 
#   pivot_longer(cols = starts_with("hhwt"), names_to = "weight_num", values_to = "weight_val")
# 
# hh_sample_long |> 
#   ggplot(aes(x = EA_HHID_FIXED, y = weight_val)) +
#   geom_jitter(alpha = 0.5)

4.3 Individual replicate weights

# Attaching to each person
int_wgts_phase_one_input_reps <- int_wgts_phase_one_input %>%
  left_join(
    #final_hhwts_reps %>% 
    final_hh_weights_rename |>
      #select(EA_HHID_FIXED, varunit, varstrat, starts_with("hhbwt")),
      select(EA_HHID_FIXED, varunit, varstrat, starts_with("hhwt")),
    by = "EA_HHID_FIXED"
  )
# phase 1 replicates
int_firststage_wt_rep <- function(wt_vector) {
  adj_tbl <- int_wgts_phase_one_input_reps %>%
    mutate(repwt = wt_vector) %>% 
    filter(int_elig_status %in% c(1,2,3), repwt > 0) %>%
    group_by(age_sex_cell, int_elig_status) %>%
    summarise(weighted_count = sum(repwt), .groups = "drop_last") %>%
    tidyr::pivot_wider(values_from = weighted_count,
                       names_from = int_elig_status,
                       names_glue = "status_{int_elig_status}",
                       values_fill = 0) %>%
    mutate(phase_one_int_adj_factor = (status_1 + status_3) / status_1) %>%
    select(age_sex_cell, phase_one_int_adj_factor)

  int_wgts_phase_one_input_reps %>%
    mutate(repwt = wt_vector) %>%
    left_join(adj_tbl, by = "age_sex_cell") %>%
    mutate(phase_one_adj_int_wgt = repwt * phase_one_int_adj_factor) %>%
    pull(phase_one_adj_int_wgt)
}

# Compute phase-one adjusted interview replicate columns: phase_one_adj_wt001 … 
int_phase1_reps <- int_wgts_phase_one_input_reps %>%
  #mutate(across(matches("^hhbwt\\d{3}$"),
  mutate(across(matches("^hhwt\\d{3}$"),
                int_firststage_wt_rep,
                .names = "phase_one_adj_wt{str_sub(.col, -3)}"))
# NR adjustment for individual dataset


# Joining fixed response probabilities
int_phase1_reps <- int_phase1_reps %>%
  left_join(select(response_probs_int,
                   EA_HHID_LN_FIXED, ind_responded, response_probability_int),
            by = "EA_HHID_LN_FIXED")
#this cap is just for abitrary
cap_int <- 20.5 * median(nr_adj_int_weights$nr_adj_int_wgt, na.rm = TRUE)

int_nr_reps <- int_phase1_reps %>%
  mutate(
    across(matches("^phase_one_adj_wt\\d{3}$"),
           ~ if_else(ind_responded == 1, .x / response_probability_int, NA_real_),
           .names = "nr_adj_int_wgt{str_sub(.col, -3)}"),
    across(matches("^nr_adj_int_wgt\\d{3}$"),
           ~ pmin(.x, cap_int))
  )

int_nr_reps |> 
  group_by(ind_responded) |> 
  summarise(max = max(nr_adj_int_wgt001, na.rm = TRUE))
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `max = max(nr_adj_int_wgt001, na.rm = TRUE)`.
## ℹ In group 1: `ind_responded = 0`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## # A tibble: 3 × 2
##   ind_responded   max
##           <dbl> <dbl>
## 1             0 -Inf 
## 2             1 3735.
## 3            NA -Inf
# postrat replicate
poststrat_one_intrep <- function(df, repcol, targets_df) {
  tmp <- df %>%
    transmute(
      cod_distric = stringr::str_sub(EAID, 3, 4),
      sex = SEX,
      age_group = dplyr::case_when(
        dplyr::between(CONFAGEY, 15, 19) ~ "15-19",
        dplyr::between(CONFAGEY, 20, 24) ~ "20-24",
        dplyr::between(CONFAGEY, 25, 29) ~ "25-29",
        dplyr::between(CONFAGEY, 30, 34) ~ "30-34",
        dplyr::between(CONFAGEY, 35, 39) ~ "35-39",
        dplyr::between(CONFAGEY, 40, 44) ~ "40-44",
        dplyr::between(CONFAGEY, 45, 49) ~ "45-49",
        dplyr::between(CONFAGEY, 50, 54) ~ "50-54",
        dplyr::between(CONFAGEY, 55, 59) ~ "55-59",
        dplyr::between(CONFAGEY, 60, 64) ~ "60-64",
        CONFAGEY >= 65 ~ "65+",
        TRUE ~ "ERROR"
      ),
      w = .data[[repcol]]
    ) %>% 
    filter(!is.na(w))

  tots <- tmp %>% group_by(cod_distric, sex, age_group) %>%
    summarise(totalW = sum(w), .groups = "drop")

  adj <- tots %>% left_join(targets_df, by = c("cod_distric","sex","age_group")) %>%
    mutate(f = pop / totalW) %>% select(cod_distric, sex, age_group, f)

  out <- tmp %>% left_join(adj, by = c("cod_distric","sex","age_group")) %>%
    mutate(final = w * f) %>% pull(final)

  # put back into original length/order
  res <- rep(NA_real_, nrow(df))
  res[!is.na(df[[repcol]])] <- out
  res
}

# We already have EAID, SEX, CONFAGEY inside int_phase1_reps via earlier joins
int_rep_cols <- grep("^nr_adj_int_wgt\\d{3}$", names(int_nr_reps), value = TRUE)

for (rc in int_rep_cols) {
  newname <- sub("^nr_adj_int_wgt", "intwt", rc)
  int_nr_reps[[newname]] <- poststrat_one_intrep(int_nr_reps, rc, individual_reference_totals_adj)
}

final_indwts_reps <- int_nr_reps %>%
  left_join(
    poststrat_int_wgts %>%
      select(EA_HHID_LN_FIXED, EAID, intwt0),
    by = c("EA_HHID_LN_FIXED", "EAID")
  ) %>%
  select(EA_HHID_LN_FIXED, EAID, varunit, varstrat, intwt0, matches("^intwt\\d{3}$"))

4.4 Biomarker replicate weights

# creating replicates for biomaker
bio_base <- int_nr_reps %>%
  left_join(select(response_probs_bio,
                   EA_HHID_LN_FIXED, response_bio, response_probability_bio),
            by = "EA_HHID_LN_FIXED")

bio_base <- bio_base %>%
  mutate(across(matches("^nr_adj_int_wgt\\d{3}$"),
                ~ pmin(.x, cap_int),
                .names = "trmpnr1w{str_sub(.col, -3)}"))

# NR for biomarker per replicate
for (rc in grep("^trmpnr1w\\d{3}$", names(bio_base), value = TRUE)) {
  newname <- sub("^trmpnr1w", "nr_adj_bio_wgt", rc)
  bio_base[[newname]] <- ifelse(
    bio_base$response_bio == 1,
    bio_base[[rc]] / bio_base$response_probability_bio,
    NA_real_
  )
}

# Optional trimming for bio NR
cap_bio <- 20.5 * median(nr_adj_bio_weights$nr_adj_bio_wgt, na.rm = TRUE)

bio_base <- bio_base %>%
  mutate(across(matches("^nr_adj_bio_wgt\\d{3}$"),
                ~ pmin(.x, cap_bio),
                .names = "trmbtnr1w{str_sub(.col, -3)}"))

# Post-stratify each replicate to get btwtXXX
poststrat_one_biorep <- function(df, repcol, targets_df) {
  tmp <- df %>%
    transmute(
      cod_distric = stringr::str_sub(EA_HHID_FIXED, 3, 4),
      sex = SEX,
      age_group = dplyr::case_when(
        dplyr::between(CONFAGEY, 15, 19) ~ "15-19",
        dplyr::between(CONFAGEY, 20, 24) ~ "20-24",
        dplyr::between(CONFAGEY, 25, 29) ~ "25-29",
        dplyr::between(CONFAGEY, 30, 34) ~ "30-34",
        dplyr::between(CONFAGEY, 35, 39) ~ "35-39",
        dplyr::between(CONFAGEY, 40, 44) ~ "40-44",
        dplyr::between(CONFAGEY, 45, 49) ~ "45-49",
        dplyr::between(CONFAGEY, 50, 54) ~ "50-54",
        dplyr::between(CONFAGEY, 55, 59) ~ "55-59",
        dplyr::between(CONFAGEY, 60, 64) ~ "60-64",
        CONFAGEY >= 65 ~ "65+",
        TRUE ~ "ERROR"
      ),
      w = .data[[repcol]]
    ) %>% 
    filter(!is.na(w))

  tots <- tmp %>% group_by(cod_distric, sex, age_group) %>%
    summarise(totalW = sum(w), .groups = "drop")

  adj <- tots %>% left_join(targets_df, by = c("cod_distric","sex","age_group")) %>%
    mutate(f = pop / totalW) %>% select(cod_distric, sex, age_group, f)

  out <- tmp %>% left_join(adj, by = c("cod_distric","sex","age_group")) %>%
    mutate(final = w * f) %>% pull(final)

  res <- rep(NA_real_, nrow(df))
  res[!is.na(df[[repcol]])] <- out
  res
}

bio_rep_cols <- grep("^trmbtnr1w\\d{3}$", names(bio_base), value = TRUE)

for (rc in bio_rep_cols) {
  newname <- sub("^trmbtnr1w", "btwt", rc)
  bio_base[[newname]] <- poststrat_one_biorep(bio_base, rc, individual_reference_totals_adj)
}

final_bloodwts_reps <- bio_base %>%
  left_join(
    poststrat_bio_wgts %>% select(EA_HHID_LN_FIXED, btwt0),
    by = "EA_HHID_LN_FIXED" ) %>%
  select(EA_HHID_LN_FIXED, EA_HHID_FIXED, varunit, varstrat, btwt0, matches("^btwt\\d{3}$"))

4.5 Final datasets

hh_data_weights <- civ2_hh %>% 
  left_join(final_hh_weights_rename)
## Joining with `by = join_by(EA_HHID_FIXED, HHI_EACODE, HHI_DISTRICT, HH_STATUS)`
indiv_data_weights <- civ2_csproindiv %>% 
  left_join(final_indwts_reps)
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
bio_data_weights <- biomarker %>%
  rename(EA_HHID_LN_FIXED=ea_hhid_ln_fixed) %>% 
  left_join(final_bloodwts_reps)
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
output_dir <- "C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Weighting/Output_data/"

export(hh_data_weights, str_c(output_dir, "CIV2_household_weights_v6_20082025.csv"))

export(indiv_data_weights, str_c(output_dir, "CIV2_individual_weights_v6_20082025.csv"))

export(bio_data_weights, str_c(output_dir, "CIV2_biomaker_weights_v6_20082025.csv"))

4.6 Plots to check replicates

# Random sample of households
hh_sample <- hh_data_weights |> 
  slice_sample(n = 10)

hh_sample_long <- hh_sample |> 
  pivot_longer(cols = starts_with("hhwt"), names_to = "weight_num", values_to = "weight_val")

hh_sample_long |> 
  ggplot(aes(x = EA_HHID_FIXED, y = weight_val)) +
  geom_boxplot()

hh_data_weights |> 
  select(hhwt0, hhwt001, hhwt002) |> 
  pivot_longer(cols = starts_with("hhwt"), names_to = "weight_num", values_to = "weight_val") |> 
  ggplot(aes(x = weight_val, fill = weight_num)) +
  geom_density(alpha = 0.25)

# Random sample of individuals
ind_sample <- indiv_data_weights |> 
  slice_sample(n = 10)

ind_sample_long <- ind_sample |> 
  pivot_longer(cols = starts_with("intwt"), names_to = "weight_num", values_to = "weight_val")

ind_sample_long |> 
  ggplot(aes(x = EA_HHID_LN_FIXED, y = weight_val)) +
  geom_boxplot()
## Warning: Removed 567 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

indiv_data_weights |> 
  select(intwt0, intwt001, intwt002) |> 
  pivot_longer(cols = starts_with("intwt"), names_to = "weight_num", values_to = "weight_val") |> 
  ggplot(aes(x = weight_val, fill = weight_num)) +
  geom_density(alpha = 0.25)
## Warning: Removed 18009 rows containing non-finite outside the scale range
## (`stat_density()`).

max(indiv_data_weights$intwt001, na.rm = TRUE)
## [1] 5311.044
bigwt <- indiv_data_weights |> 
  filter(intwt0 > 20000)

# Random sample of bioweights
bio_sample <- bio_data_weights |> 
  slice_sample(n = 10)

bio_sample_long <- bio_sample |> 
  pivot_longer(cols = starts_with("btwt"), names_to = "weight_num", values_to = "weight_val")

bio_sample_long |> 
  ggplot(aes(x = EA_HHID_LN_FIXED, y = weight_val)) +
  geom_boxplot()

bio_data_weights |> 
  select(btwt0, btwt001, btwt002) |> 
  pivot_longer(cols = starts_with("btwt"), names_to = "weight_num", values_to = "weight_val") |> 
  ggplot(aes(x = weight_val, fill = weight_num)) +
  geom_density(alpha = 0.25)
## Warning: Removed 11124 rows containing non-finite outside the scale range
## (`stat_density()`).

check <- poststrat_int_wgts %>% 
  filter(EA_HHID_LN_FIXED=="CI010100214067300106")