This notebook contains organised code for each step of the weighting process for the CIPHIA2 data.
The major steps and progress (at 20/03/2022) are as follows:
Base weights compensate for unequal selection
probabilities due to multi-stage sampling designs.
They are the inverse of the probability of selection at
each stage. Make reference to the CIPHIA2 second stage sample, the base
weight is psuwt0 = column(ES) = ADJ_PSU_WT, while the ADJ_PSU_WT =
1/column(ER) = 1/ADJ_PROB2
\[ \text{PSU weight} = \frac{1}{\text{Selection Probability}} \]
load second stage sample, remove duplicates in terms of EAs so that you are left with 396 EAs. The reason for using second stage sample instead of 1st stage sample is that the 2nd stage sample has all information from 1st stage and from the household listing
# This section is to import packages
# and prepare for importing data
rm(list = ls())
# clean/remove everything
#install.packages("pacman")
# we use pacman to import all packages
pacman::p_load(tidyverse,
haven,
rio,
ggplot2,
janitor,
survey,
lubridate,
readxl,
BART,
future,
furrr,
progressr,
caret
)
# create path to the datasets
civ2_sample_dir <- "C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Sampling/2nd_stage_sampling/"
###################################################################
# read_excel comes from the package readxl, str_c means concatenate 2 strings as shown below
# if you press shift+ctr+M you will get a pipe = %>%
# if you press ctr+s you will save your script, its a shortcut for saving
# you can google shortcuts for Rstudio, it will give you a whole list of all these shortcuts
# clean_name comes from the package janitor, the purpose of clean_name is to make your variable names proper
civ2_zd_in <- read_excel(str_c(civ2_sample_dir,"CIPHIA2_2nd_stage_sample_09_07_2024_v4.xlsx")) %>%
clean_names() %>%
distinct(ea_code, .keep_all=TRUE)
psu_base_weights <- civ2_zd_in |>
mutate(psuwt0 = adj_psu_wt)
psu_base_weights |>
ggplot(aes(x = factor(strata),
y = psuwt0)) +
geom_boxplot() +
labs(x = "Stratum/Region",
y = "PSU base weight") +
theme_bw()
psu_base_weights %>%
group_by(nom_reg) |>
mutate(psu_num_instrata = row_number()) |>
ggplot(aes(x = psu_num_instrata,
y = psuwt0,
fill = factor(nom_milieu, levels = c(1,2),
labels = c("Urban", "Rural")))) +
geom_col() +
facet_wrap(~nom_reg) +
theme_bw() +
labs(fill = "Urban/Rural",
x = "PSU number within stratum",
y = "PSU base weight")
There are several steps for the household weights:
We need the counts of households listed and selected to compute the household selection probability and weight.
The sample design was to take a systematic random sample of an average of 35 households from each EA. The household selection probability, \(p_{hh,i}\) in EA \(i\) is then
\[ p_{hh,i} = \frac{35}{N_{\textrm{elig},i}}, \]
where \(N_{\textrm{elig},i}\) is the number of eligible households in EA \(i\).
civ2_zd_in1 <- read_excel(str_c(civ2_sample_dir,"CIPHIA2_2nd_stage_sample_09_07_2024_v4.xlsx")) %>%
clean_names()
# create the flag of hhs that were sampled
hh_sample_file_in <- civ2_zd_in1 %>%
rename(hh_smpflg = flag_ea) %>%
mutate(hh_smpflg = ifelse(is.na(hh_smpflg), 0, hh_smpflg))
# hh_smpglg, create a household flag of the those EAs that were selected
# creer hhs probs
hh_selection_probs <- hh_sample_file_in |>
group_by(strata) |>
summarise(hhcount = n(),
selected_hh_count = sum(hh_smpflg == "1")) |>
mutate(hh_selection_prob = selected_hh_count / hhcount)
# summary(hh_selection_probs$hh_selection_prob)
# hist(hh_selection_probs$hh_selection_prob, breaks = 40)
selected_hh <- hh_sample_file_in |>
filter(hh_smpflg == "1") |>
select(cod_distric, nom_distric, strata, grappe, eahhid, stobs_d, stvac_d, resyn_d, hh_smpflg) |>
left_join(
psu_base_weights |> select(-cod_distric, -nom_distric, -strata, -eahhid, -stobs_d, -stvac_d, -resyn_d),
by = "grappe"
) |>
select(-c(totalsegment, segmentselect, segmented, segmos,
adj_prob1, adj_prob2)) |>
left_join(hh_selection_probs) |>
mutate(hhbwt0 = 1/( (1/psuwt0) * hh_selection_prob)) %>%
ungroup() %>%
arrange(grappe)
## Joining with `by = join_by(strata)`
summary(selected_hh$hhbwt0)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.222 66.708 104.295 120.957 165.029 379.270
hist(selected_hh$hhbwt0, breaks = 40)
selected_hh |>
group_by(strata) |>
summarise(min = min(hhbwt0),
low_Q = quantile(hhbwt0, 0.25),
medianwt = median(hhbwt0),
meanwt = mean(hhbwt0),
upp_Q = quantile(hhbwt0, 0.75),
max = max(hhbwt0))
## # A tibble: 14 × 7
## strata min low_Q medianwt meanwt upp_Q max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 32.1 75.0 103. 117. 134. 379.
## 2 2 18.4 23.4 27.4 32.5 43.5 54.5
## 3 3 43.6 88.6 144. 142. 204. 254.
## 4 4 46.0 101. 130. 145. 199. 228.
## 5 5 7.22 10.7 22.1 20.8 28.4 35.2
## 6 6 56.0 114. 155. 148. 183. 218.
## 7 7 83.0 155. 198. 193. 224. 305.
## 8 8 37.2 57.4 73.7 80.4 97.2 141.
## 9 9 68.1 100. 122. 145. 167. 321.
## 10 10 56.9 108. 169. 163. 198. 277.
## 11 11 33.6 88.0 117. 160. 223. 378.
## 12 12 74.8 165. 210. 201. 250. 282.
## 13 13 28.7 46.0 67.0 71.0 87.3 121.
## 14 14 39.0 52.3 77.7 97.5 121. 298.
selected_hh |>
ggplot(aes(y = hhbwt0)) +
facet_wrap(~strata)+
geom_boxplot() +
theme_bw() +
labs(x = "Household Base Weight",
y = "Region")
selected_hh1 <- selected_hh %>%
distinct(grappe, .keep_all = TRUE) %>%
rename(hhr_eacode1=unicite) %>%
mutate(
hhr_eacode1 = as.character(hhr_eacode1), # Convert to character
hhr_eacode1 = case_when(
nchar(hhr_eacode1) == 9 ~ paste0("00", hhr_eacode1), # Add 2 leading zeros if 1 digits
nchar(hhr_eacode1) == 10 ~ paste0("0", hhr_eacode1), # Add 1 leading zero if 2 digits
TRUE ~ hhr_eacode1 # Keep as is if 10 digits
),
hhr_eacode = str_c("CI",cod_distric, hhr_eacode1)
) %>%
rename(HHI_EACODE=hhr_eacode) %>%
select(HHI_EACODE, hhbwt0) # Keep only necessary columns
# View the transformed data
head(selected_hh1)
## # A tibble: 6 × 2
## HHI_EACODE hhbwt0
## <chr> <dbl>
## 1 CI0101002026030 125.
## 2 CI0101002046014 149.
## 3 CI0101002056042 41.0
## 4 CI0101002020069 78.5
## 5 CI0101002020182 62.3
## 6 CI0101002020256 32.1
The variable UPCODE_STAT_HH
on the household interview
file contains the final response status for each household. The codes
are:
UPCODE_STAT_HH | Definition |
---|---|
1 | Respondent |
2 | Non-respondent |
3 | Ineligible |
4 | Unknown eligibility |
In this step, the weights of the households with unknown eligibility are re-distributed to the other households in the EA. The adjustment factor in EA \(i\) is computed as
\[ A^{\textrm{HH elig}}_i = \Sigma_{j=1}^{n_i} W_{ij} / \Sigma_{j = 1}^{n_{i,1} + n_{i,2} + n_{i,3}} W_{ij}. \] The values \(n_{i,1-3}\) are the number of households in EA \(i\) with final status equal to 1, 2, or 3, as defined in the table above. The total number of selected households is \(n_i\), and \(W_{ij}\) is the household base weight of household \(j\) in EA \(i\). Note that because of the sample design, all households in each EA have the same selection probability/weight, so the value of this weight cancels out in computing the adjustment factor.
indir <- "C:/Users/kupamupindit/OneDrive - Columbia University Irving Medical Center/Documents/ICAP work/Countries/Cote D'Ivoire/CIPHIA2/Data/"
civ2_hh <- import(str_c(indir, "CI_FFcorr_hh_int_DRAFT_20250325.xlsx"),
guess_max = 10000) |>
filter(is.na(DMFLAG) | DMFLAG != "DISCARD")
### Household Status ###
# civ2_hh |>
# count(RESULTNDT,
# UPCODE_RESLTNDT,
# STARTINT,
# HHELIG,
# HHCONSTAT,
# HHQDTHSINS,
# ROSTER_START,
# HHQINSHH,
# HHQASSIGN_INST)
civ2_hhstatus <- civ2_hh |>
mutate(
COMB_UPCODED_RESLTNDT = coalesce(as.numeric(UPCODE_RESLTNDT), as.numeric(RESULTNDT)),
UPCODE_STAT_HH = case_when(
# Responded
is.na(RESULTNDT) & (STARTINT == 1 & HHELIG == 1 & HHCONSTAT == 1 &
!is.na(HHQDTHSINS) & !is.na(ROSTER_START) & !is.na(HHQINSHH) &
!is.na(HHQASSIGN_INST)) ~ 1,
is.na(RESULTNDT) & (STARTINT == 4 & !is.na(ROSTER_START)) ~ 1,
# Non-responding
COMB_UPCODED_RESLTNDT %in% c(1, 2, 7, 8 , 9) ~ 2,
is.na(RESULTNDT) & (HHELIG == 2 | HHCONSTAT %in% c(2, 3) | (HHELIG == 1 & is.na(HHCONSTAT)) |
STARTINT == 4 & is.na(ROSTER_START)) ~ 2,
# Inelgible
COMB_UPCODED_RESLTNDT %in% c(3, 4, 6) ~ 3,
# Unknown
is.na(COMB_UPCODED_RESLTNDT) | COMB_UPCODED_RESLTNDT %in% c(5, 96) ~ 4,
.default = 4
)
)
# Checking HH status derivation
#
# civ2_hhstatus |>
# count(UPCODE_STAT_HH,
# RESULTNDT,
# UPCODE_RESLTNDT,
# STARTINT,
# HHELIG,
# HHCONSTAT,
# HHQDTHSINS,
# ROSTER_START,
# HHQINSHH,
# HHQASSIGN_INST)
civ2_hhstatus |>
tabyl(UPCODE_STAT_HH)
## UPCODE_STAT_HH n percent
## 1 11879 0.85485032
## 2 1220 0.08779505
## 3 778 0.05598733
## 4 19 0.00136730
# Proportion in each household status category
civ2_hhstatus |>
rename(hh_status=UPCODE_STAT_HH) %>%
ggplot(aes(x = factor(HHI_REGION),
fill = factor(hh_status, levels = 1:4,
labels = c("Eligible Respondent",
"Eligible Non-respondent",
"Ineligible Household",
"Unknown Eligibility")))) +
geom_bar(position = position_fill(reverse = TRUE)) +
scale_fill_viridis_d() +
labs(x = "Region",
y = "Proportion",
fill = "hh_status") +
theme_bw()
hh_uknwn_adj <- civ2_hh %>%
filter(!is.na(HHI_EACODE)) %>%
select(HHI_EACODE, HHI_REGION, UPCODE_STAT_HH) %>%
left_join(selected_hh1) %>%
group_by(HHI_EACODE) %>%
mutate(
hhstatus = UPCODE_STAT_HH,
# Calculate adjustment
hh_adj_one = sum(hhbwt0) / ( sum(hhbwt0 * (hhstatus == 1)) +
sum(hhbwt0 * (hhstatus == 2)) +
sum(hhbwt0 * (hhstatus == 3))
),
# Calculate weights
hh_wt_one = case_when(hhstatus %in% c(1, 2) ~ hh_adj_one * hhbwt0,
hhstatus %in% c(3, 4) ~ 0)
) %>%
ungroup()
## Joining with `by = join_by(HHI_EACODE)`
Non-response adjustment for households is done by dividing the responding household weights by the response rate in each EA. The non response adjustment factor is
\[ A^{\textrm{HH NR}}_i = \Sigma_{j = 1}^{n_{i,1} + n_{i,2}} W^{\textit{hh2}}_{ij} / \Sigma_{j=1}^{n_i} W^{hh2}_{ij}, \] where \(W^{\textit{hh2}}_{ij}\) is the eligibility-adjusted weight of household \(j\) in EA \(i\), and the other quantities are as in the previous equation.
hh_nr_adj <- hh_uknwn_adj %>%
group_by(HHI_EACODE) %>%
mutate(hh_adj_two = sum(hh_wt_one) / ( sum(hh_wt_one * (hhstatus == 1))),
hh_wt_two = if_else(hhstatus == 1,
hh_adj_two * hh_wt_one,
0)) %>%
ungroup()
final_hh_weights <- hh_nr_adj %>%
rename(EAID=HHI_EACODE, hhwt0=hh_wt_two )
hh_nr_adj |>
distinct(HHI_EACODE, .keep_all = TRUE) |>
ggplot(aes(x = hh_adj_two,
y = HHI_REGION)) +
geom_boxplot() +
theme_bw() +
labs(x = "HH adjustment factor two",
y = "Region")
civ2_csproroster <- import(str_c(indir, "CI_FFcorr_roster_DRAFT_20250325.xlsx"),
guess_max = 10000) |>
filter(is.na(DMFLAG) | DMFLAG != "DISCARD")
civ2_csproindiv <- import(str_c(indir, "CI_FFcorr_ind_int_DRAFT_20250325.xlsx"),
guess_max = 10000) |>
filter(is.na(DMFLAG) | DMFLAG != "DISCARD") %>%
mutate(roster_elig=PRIMARY_ROSTER_STATUS) # we need to check this
# mutate(roster_elig=PRIMARY_ROSTER_STATUS) # need to relook a this, could not find the same variable as RUPHIA
# just used PRIMARY_ROSTER_STATUS as a placeholder for roster_elig
roster_status_vars <- civ2_csproroster %>%
mutate(indiv_status = ROSTERED_FINAL_STATUS) %>%
left_join(select(civ2_csproindiv, EA_HHID_LN_FIXED, INDSTATUS,roster_elig)) %>%
select(roster_sex = SEX,
roster_age = AGEYEARS,
RELATTOHH, SLEEPHERE,
EA_HHID_FIXED, EA_HHID_LN_FIXED,
indiv_status, ELIGIBLE1, HHR_EACODE,INDSTATUS
)
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
indiv_status_vars <- civ2_csproindiv %>%
select(IND_REGION, IND_DISTRICT, IND_EACODE,
EA_HHID_FIXED, EA_HHID_LN_FIXED, INDISPSTATUS, CONFAGEY)
# AGEYEARS, UPCODE_IND0040,roster_elig cannot find
roster_ind_status_vars <- roster_status_vars %>%
mutate(EAID = as.character(HHR_EACODE)) %>%
left_join(select(final_hh_weights, EAID, hhwt0)) %>%
left_join(indiv_status_vars) %>%
mutate(
roster_age_gt_15 = if_else(roster_age >= 15, 1, 0, missing = -9),
int_age_gt_15 = if_else(CONFAGEY >= 15, 1, 0, missing = -9),
resident_status_roster = case_when(SLEEPHERE == 1 & roster_age >= 15 ~ "De facto person 15 years or older",
SLEEPHERE != 1 & roster_age >= 15 ~ "Non-de facto person 15 years or older",
roster_age < 15 ~ "Person under 15 years",
TRUE ~ "OTHER"),
status_for_table = case_when(SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == 1 ~ 1,
SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == 0 ~ 2,
SLEEPHERE == 1 & roster_age >= 15 & int_age_gt_15 == -9 ~ 3,
SLEEPHERE != 1 & roster_age >= 15 ~ 4,
roster_age < 15 ~ 5,
TRUE ~ 99999),
# Give ineligible people indiv_status = 9 and INDSTATUS = 9
INDSTATUS = if_else(ELIGIBLE1 == 2, 90, INDSTATUS),
indiv_status = if_else(ELIGIBLE1 == 2, 9, indiv_status))
## Joining with `by = join_by(EAID)`
## Joining with `by = join_by(EA_HHID_FIXED, EA_HHID_LN_FIXED)`
table_counts <- roster_ind_status_vars %>%
mutate(status_for_table = factor(status_for_table, levels = 1:5),
int_age_desc = case_when(
status_for_table %in% c(4,5) ~ "NA",
int_age_gt_15 == 1 ~ "15+",
int_age_gt_15 == 0 ~ "Under 15",
int_age_gt_15 == -9 ~ "Unknown")) %>%
group_by(status_for_table, resident_status_roster, int_age_desc, .drop = FALSE) %>%
summarise(rostered_persons = n(),
wgt_number_rostered = sum(hhwt0)) %>%
mutate(resident_status_roster = if_else(status_for_table != 2, resident_status_roster, "De facto person 15 years or older"),
int_age_desc = if_else(status_for_table != 2, int_age_desc, "Under 15")) %>%
adorn_totals(where = "row")
## `summarise()` has grouped output by 'status_for_table',
## 'resident_status_roster'. You can override using the `.groups` argument.
print(table_counts)
## status_for_table resident_status_roster int_age_desc
## 1 De facto person 15 years or older 15+
## 2 De facto person 15 years or older Under 15
## 3 De facto person 15 years or older Unknown
## 4 Non-de facto person 15 years or older NA
## 5 Person under 15 years NA
## Total - -
## rostered_persons wgt_number_rostered
## 1026414 124443888.3
## 0 0.0
## 648 63702.5
## 150685 18753120.5
## 665478 82891635.8
## 1843225 226152347.1
int_wgts_phase_one_input <- select(civ2_csproroster, EA_HHID_FIXED, EA_HHID_LN_FIXED, HHR_EACODE, SEX, AGEYEARS, SLEEPHERE) %>%
left_join(select(civ2_csproindiv, EA_HHID_LN_FIXED, CONFAGEY)) %>%
mutate(EAID = as.character(HHR_EACODE),
roster_age_gt_15 = case_when(AGEYEARS >= 0 & AGEYEARS < 15 ~ 0,
AGEYEARS >= 15 ~ 1,
TRUE ~ -9),
int_elig_status = case_when(roster_age_gt_15 == 1 & CONFAGEY >= 15 ~ 1,
roster_age_gt_15 == 1 & CONFAGEY >= 0 & CONFAGEY < 15 ~ 2,
roster_age_gt_15 == 1 ~ 3,
roster_age_gt_15 == 0 ~ 4,
TRUE ~ 99999),
age_sex_cell = case_when(#SLEEPHERE != 1 ~ "Non-de facto",
SEX == 1 & roster_age_gt_15 == 1 & AGEYEARS < 18 ~ "Male 15-17",
SEX == 1 & roster_age_gt_15 == 1 & AGEYEARS >= 18 ~ "Male 18+",
SEX == 2 & roster_age_gt_15 == 1 & AGEYEARS < 18 ~ "Female 15-17",
SEX == 2 & roster_age_gt_15 == 1 & AGEYEARS >= 18 ~ "Female 18+",
AGEYEARS < 15 ~ "Under 15",
TRUE ~ "Missing/Unknown")) %>%
left_join(select(final_hh_weights, EAID, hhwt0))
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
## Joining with `by = join_by(EAID)`
# Checks
# zz<-indiv_wgts_phase_one_input %>% group_by(SLEEPHERE, AGEYEARS, CONFAGEY, int_elig_status) %>% tally()
# indiv_wgts_phase_one_input %>% group_by(int_elig_status, age_sex_cell) %>% tally()
int_wgts_phase_one_adj <- int_wgts_phase_one_input %>%
filter(int_elig_status %in% c(1,2,3)) %>%
group_by(age_sex_cell, int_elig_status) %>%
summarise(count = n(),
weighted_count = sum(hhwt0)) %>%
pivot_wider(values_from = c(count, weighted_count),
names_from = int_elig_status,
names_glue = "status_{int_elig_status}_{.value}",
values_fill = 0) %>%
mutate(phase_one_int_adj_factor = (status_1_weighted_count + status_3_weighted_count)/status_1_weighted_count)
## `summarise()` has grouped output by 'age_sex_cell'. You can override using the
## `.groups` argument.
int_wgts_phase_one_adjusted <- int_wgts_phase_one_input %>%
left_join(select(int_wgts_phase_one_adj, age_sex_cell, phase_one_int_adj_factor)) %>%
mutate(phase_one_adj_int_wgt = hhwt0 * phase_one_int_adj_factor)
## Joining with `by = join_by(age_sex_cell)`
int_wgts_phase_one_adjusted %>% group_by(age_sex_cell) %>%
summarise(mean_weight = mean(phase_one_adj_int_wgt))
## # A tibble: 5 × 2
## age_sex_cell mean_weight
## <chr> <dbl>
## 1 Female 15-17 148.
## 2 Female 18+ 137.
## 3 Male 15-17 140.
## 4 Male 18+ 143.
## 5 Under 15 NA
##Interview non-response adjustment
*remove all the process variables first and remain with the important variables
# Helper function to split out the multiple-select options which have
# concatenated values like "ABD", "AE", "F" etc.
expand_mult <- function(x, i) {
if_else(grepl(as.character(i), x, fixed = TRUE), 1, 0)
}
# Select and tidy up all useful household variables
hhvars <- civ2_hhstatus %>%
filter(UPCODE_STAT_HH == 1) %>%
select(36:146,
SICK_HOUSEHOLD, contains("EA_HHID")) %>%
# Correcting/filling count variables
mutate(DEATHCOUNT = replace_na(DEATHCOUNT, 0),
# Expand multiple selection
across(c(HHQITEMS, HHQOWN), list(~expand_mult(.x, "A"),
~expand_mult(.x, "B"),
~expand_mult(.x, "C"),
~expand_mult(.x, "D"),
~expand_mult(.x, "E"),
~expand_mult(.x, "F"),
~expand_mult(.x, "G"),
~expand_mult(.x, "Y"),
~expand_mult(.x, "Z")),
.names = "{.col}{.fn}"),
# Fill NAs with no/missing where appropriate
across(c(DEATHS), ~if_else(.x %in% c(-8, -9), 2, .x)),
across(ends_with("DUR"), ~if_else(.x %in% c(-8,-9), 0, .x)),
across(ends_with("NUM"), ~if_else(.x == -7, 1, .x)),
# across(c(HHEMANCMINOR, HHQFOODFRQ, HHQHUNG, HHQHUNGFRQ, HHQHUNGDAY,
# ends_with("DUR")), ~replace_na(.x, 0)),
across(c(TOILETSHARE), ~replace_na(.x, 2))) %>%
# Variables to drop
select(-c(HHCNTSTATUS, HHQITEMS, HHQOWN, HHCAGREEDT, HHSTFSIG, HHRINS4,
contains("HHCONST"),
starts_with(c("DIED", "DTH", "INELIG")))) %>%
# Merge don't know and refused
mutate(across(where(is.numeric), ~if_else(.x == -8, -9, .x))) %>%
# For duration/count variables, combine into a simple yes/no to avoid small cell sizes
mutate(across(ends_with(c("DUR","NUM")), ~if_else(.x > 0, 1, 0))) %>%
# Remove variables with only one value
select(where(function(col) length(unique(col)) > 1))
# Select and tidy up all useful roster variables
roster_vars <- civ2_csproroster %>%
# Variables to drop
select(-c(HHR_SURVEYFORMSDT, HHR_HHI_UUID, HHR_DEVICEID, STAFFIDHH,
HHRSTS, HHR_HHMEM_ID, HHRETS, HHDRSTS, INTRODR, AGEMONTHS,
HHDRETS, MINORLINENUM, HHCRSTS, HHCRETS, DMFLAG, DMCOMMENT,
HHRINS5, HHQOVCINS1, HHQSUPPSTART, HHQOVCEND, OVCLINENUM,
LIVEAWAYM, LIVEAWAYY, LIVECOUNTRY,
LIVESUBPF, LIVESUBPFLIVECOUNTRYOTH,
CORRECT_LN, HHQASSIGN_INST, HHQASSIGN, WITHDRAWHH,
GRAPPE, EA_HHID_FIXED, HHR_EACODE, HHR_SHH, HHR_DISTRICT,
HHR_REGION, HHR_DEPARTEMENT, HHR_SOUSPREFID, HHR_EACODE_ORIG,
HHR_SHH_ORIG, HHR_TEAM_ID, CODEMILIEU, NOM_MILIEU,
NATMOMNM, MOMFEMNAME, DADMALENAME, MALEGUARDNAME,
RELATTOHH, ROSTERED_FINAL_STATUS, ELIGIBLE1,
HHCCONSEL, HHCEMANC, HHCCONT0))%>%
# Fill NAs with no/missing where appropriate
mutate(across(c(SICK3MO), ~replace_na(.x, -8)),
across(c(EMANCIPATED, LIVEAWAY, starts_with("SUPPORT")), ~replace_na(.x, 2)),
across(c(VULNERABLE_CHILD), ~replace_na(.x, 0))) %>%
# Merge don't know and refused
mutate(across(where(is.numeric), ~if_else(.x == -8, -9, .x))) %>%
# Remove variables with only one value
select(where(function(col) length(unique(col)) > 1)) %>%
left_join(select(indiv_status_vars, EA_HHID_LN_FIXED)) #%>%
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
#filter(roster_elig == 1)
# Create binary interview response variable
interview_response <- civ2_csproindiv %>%
filter(roster_elig == 1) %>%
mutate(ind_responded = if_else(INDIVCNTSTATUS == 1, 1, 0)) %>%
# not sure about the INDIVCNTSTATUS
select(ind_responded, EA_HHID_FIXED, EA_HHID_LN_FIXED, roster_elig)
# Join together all the variables and filter to only
# de facto eligible persons.
nr_vars <- hhvars %>%
left_join(interview_response) %>%
left_join(roster_vars)
## Joining with `by = join_by(EA_HHID_FIXED)`
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
# filter(roster_elig == 1) # left this out for now
# Add recodes for the teen years variable which is used to split up the
# data for the non-response model.
nr_vars_recodes <- nr_vars %>%
mutate(H_AGETEENYEARS = if_else(AGEYEARS >= 15 & AGEYEARS <= 17, 1, 2))
# Make the appropriate variables numeric and the rest factors.
# This is required for the BART model fitting to handle the
# data correctly.
int_nr_data_in <- nr_vars_recodes %>%
# select(-INDSTATUS, -ROSTERED_FINAL_STATUS, -roster_elig, -ELIGIBLE1) %>%
select(H_AGETEENYEARS, SEX, EA_HHID_FIXED, EA_HHID_LN_FIXED, ind_responded, everything()) %>%
arrange(H_AGETEENYEARS, SEX, EA_HHID_FIXED, EA_HHID_LN_FIXED) %>%
mutate(age_group = case_when(AGEYEARS < 18 ~ AGEYEARS,
between(AGEYEARS, 18, 24) ~ 1,
between(AGEYEARS, 25, 29) ~ 2,
between(AGEYEARS, 30, 39) ~ 3,
between(AGEYEARS, 40, 49) ~ 4,
between(AGEYEARS, 50, 59) ~ 5,
AGEYEARS >= 60 ~ 6,
TRUE ~ 99),
hh_age_group = case_when(between(HHAGEYEARS, 15, 24) ~ 1,
between(HHAGEYEARS, 25, 29) ~ 2,
between(HHAGEYEARS, 30, 39) ~ 3,
between(HHAGEYEARS, 40, 49) ~ 4,
between(HHAGEYEARS, 50, 59) ~ 5,
HHAGEYEARS >= 60 ~ 6,
TRUE ~ 99),
across(-c(starts_with("TOTAL"), ends_with("NUM"), ends_with("DUR"),
DEATHCOUNT, AGEYEARS, ROOMSLEEP, HHAGEYEARS, ind_responded), as_factor),
across(c(starts_with("TOTAL"), ends_with("NUM"), ends_with("DUR"),
DEATHCOUNT, AGEYEARS, ROOMSLEEP, HHAGEYEARS, ind_responded))) %>%
select(-SLEEPHERE, -DEATHCOUNT, -HHEMANCMINOR, -LIVEAWAY,
-AGEYEARS, -HHAGEYEARS)
## The following code prepares a model matrix for interview non response
# For variables which are already two-leveled
# int_binary_vars <- int_nr_data_in %>%
# select(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded) %>%
# select(EA_HHID_LN_FIXED, where(function(col) length(unique(col)) == 2), -ind_responded) %>%
# mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.numeric)) %>%
# mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), ~if_else(.x == 2, 0, .x))) %>%
# mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.factor))
int_binary_vars <- int_nr_data_in %>%
select(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, where(function(col) length(unique(col)) == 2), -ind_responded) %>%
mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.numeric)) %>%
mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), ~if_else(.x == 2, 0, .x))) %>%
mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX), as.factor))
# Non-binary variables
int_nr_data_mod_matrix_in <- int_nr_data_in %>%
select(where(function(col) length(unique(col)) != 2), ind_responded) %>%
select(-contains("EA_HHID"))
dmy_int_nr_data <- dummyVars("ind_responded ~ .", data = int_nr_data_mod_matrix_in)
df_int_nr_data = int_nr_data_mod_matrix_in
model_matrix_int_nr <- predict(dmy_int_nr_data, newdata = df_int_nr_data) %>%
as_tibble() %>%
cbind(ind_responded = int_nr_data_mod_matrix_in$ind_responded) %>%
cbind(EA_HHID_LN_FIXED = int_nr_data_in$EA_HHID_LN_FIXED) %>%
left_join(int_binary_vars) %>%
mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded), as.factor))
## Joining with `by = join_by(EA_HHID_LN_FIXED)`
## Identify/remove variables from non-response model where proportion
## or number in the smallest category very low (eg < 10%)
variable_response_props_int <- model_matrix_int_nr %>%
pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded),
names_to = "variable", values_to = "value") %>%
group_by(H_AGETEENYEARS, SEX, variable, value) %>%
summarise(respondents = sum(ind_responded),
totalN = n(),
rrate = respondents / totalN) %>%
mutate(category_proportion = totalN / sum(totalN))
## `summarise()` has grouped output by 'H_AGETEENYEARS', 'SEX', 'variable'. You
## can override using the `.groups` argument.
int_nr_vars_dropped <- variable_response_props_int %>%
mutate(drop = if_else(!grepl("age", variable) &
(totalN < 25 | category_proportion < 10/100), 1, NA_real_)) %>%
filter(drop == 1)
# final_int_nr_model_matrix <- model_matrix_int_nr %>%
# pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded),
# names_to = "variable", values_to = "value") %>%
# left_join(select(int_nr_vars_dropped, H_AGETEENYEARS, SEX, variable, drop)) %>%
# filter(is.na(drop)) %>%
# pivot_wider(id_cols = c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded),
# names_from = variable, values_from = value) %>%
# mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded),
# ~replace_na(as.numeric(.x), 0)))
final_int_nr_model_matrix <- model_matrix_int_nr %>%
mutate(ind_responded = if_else(is.na(ind_responded), 0, ind_responded)) %>%
pivot_longer(cols = -c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded),
names_to = "variable", values_to = "value") %>%
left_join(select(int_nr_vars_dropped, H_AGETEENYEARS, SEX, variable, drop)) %>%
filter(is.na(drop)) %>%
group_by(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded, variable) %>%
summarise(value = first(value), .groups = "drop") %>% # pick first value if multiple
pivot_wider(id_cols = c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded),
names_from = variable, values_from = value) %>%
mutate(across(-c(EA_HHID_LN_FIXED, H_AGETEENYEARS, SEX, ind_responded),
~replace_na(as.numeric(.x), 0)))
## Joining with `by = join_by(H_AGETEENYEARS, SEX, variable)`
# Function to fit a single model
# Input needs to contain the predictors and the response variable
fit_bart_model_int <- function(data, nskip = 50, ndpost = 100) {
predictor_matrix <- data %>%
select(where(function(col) length(unique(col)) > 1)) %>%
remove_empty(which = "cols") %>%
select(-c(EA_HHID_LN_FIXED, ind_responded)) %>%
data.frame()
response <- data$ind_responded
# bartFit <- pbart(predictor_matrix, response, binaryOffset = mean(response),
# nskip = nskip, ndpost = ndpost)
bartFit <- lbart(predictor_matrix, response, binaryOffset = mean(response),
nskip = nskip, ndpost = ndpost)
}
# Function to just output the fitted response probabilities
fit_response_probabilities_int <- function(data) {
bartFit <- fit_bart_model_int(data)
response <- data$ind_responded
response_probability <- bartFit$prob.train.mean
tibble(data$EA_HHID_LN_FIXED, response, response_probability)
}
# nested <- final_int_nr_model_matrix %>%
# group_by(H_AGETEENYEARS, SEX) %>%
# tidyr::nest() %>%
# mutate(bart = lapply(data, fit_response_probabilities))
#
# response_probs <- nested %>%
# select(-data) %>%
# unnest(cols = bart) %>%
# rename(EA_HHID_LN_FIXED = `data$EA_HHID_LN_FIXED`)
# parallel workers
plan(multisession, workers = parallel::detectCores() - 1)
# Set the global handler once — outside everything
handlers("progress")
# Clean and nest
nested_int <- final_int_nr_model_matrix %>%
filter(!is.na(H_AGETEENYEARS), !is.na(SEX)) %>%
group_by(H_AGETEENYEARS, SEX) %>%
nest() %>%
mutate(data = map(data, ~ filter(.x, !is.na(ind_responded)))) %>%
filter(map_int(data, nrow) > 1)
# Safe BART wrapper
safe_fit_bart <- possibly(fit_bart_model_int, otherwise = NULL)
# Running in parallel with progress
with_progress({
p <- progressor(along = nested_int$data)
nested_int <- nested_int %>%
mutate(bart = future_map(data, function(x) {
p() # progress update
safe_fit_bart(x, nskip = 20, ndpost = 100)
}))
})
## *****Into main of lbart
## *****Data:
## data:n,p,np: 1004, 798, 0
## y1,yn: 1, 1
## x1,x[n*p]: 1.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 20, 100
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,798,0
## *****nkeeptrain,nkeeptest: 100, 100
## *****printevery: 100
##
## MCMC
## done 0 (out of 120)
## done 100 (out of 120)
## time: 4s
## check counts
## trcnt,tecnt: 100,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 1173, 766, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 20, 100
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,766,0
## *****nkeeptrain,nkeeptest: 100, 100
## *****printevery: 100
##
## MCMC
## done 0 (out of 120)
## done 100 (out of 120)
## time: 4s
## check counts
## trcnt,tecnt: 100,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 11268, 166, 0
## y1,yn: 1, 1
## x1,x[n*p]: 2.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 20, 100
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,166,0
## *****nkeeptrain,nkeeptest: 100, 100
## *****printevery: 100
##
## MCMC
## done 0 (out of 120)
## done 100 (out of 120)
## time: 36s
## check counts
## trcnt,tecnt: 100,0
## *****Into main of lbart
## *****Data:
## data:n,p,np: 12689, 267, 0
## y1,yn: 1, 1
## x1,x[n*p]: 1.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 20, 100
## *****Prior:
## beta,alpha,tau: 2.000000,0.950000,0.129526
## *****Dirichlet:sparse,a,b,rho,augment: 0,0.5,1,267,0
## *****nkeeptrain,nkeeptest: 100, 100
## *****printevery: 100
##
## MCMC
## done 0 (out of 120)
## done 100 (out of 120)
## time: 41s
## check counts
## trcnt,tecnt: 100,0
response_probs_int <- nested_int %>%
mutate(prob_df = map2(data, bart, function(d, model) {
if (is.null(model)) {
return(tibble(
EA_HHID_LN_FIXED = d$EA_HHID_LN_FIXED,
response_probability_int = NA_real_
))
} else {
tibble(
EA_HHID_LN_FIXED = d$EA_HHID_LN_FIXED,
response_probability_int = model$prob.train.mean
)
}
})) %>%
select(prob_df) %>%
unnest(prob_df)
## Adding missing grouping variables: `H_AGETEENYEARS`, `SEX`
response_probs_int_full <- final_int_nr_model_matrix %>%
select(-H_AGETEENYEARS, -SEX) %>%
left_join(response_probs_int, by = "EA_HHID_LN_FIXED") %>%
select(SEX, H_AGETEENYEARS, EA_HHID_LN_FIXED, ind_responded, response_probability_int) %>%
ungroup()
nr_adj_int_weights <- response_probs_int_full %>%
mutate(SEX = as.numeric(SEX)) %>%
left_join(int_wgts_phase_one_adjusted) %>%
mutate(nr_adj_int_wgt = if_else(ind_responded == 1, phase_one_adj_int_wgt / response_probability_int, NA_real_))
## Joining with `by = join_by(SEX, EA_HHID_LN_FIXED)`
nr_adj_int_weights %>%
mutate(sex_f = factor(SEX, levels = c(1,2), labels = c("Male", "Female")),
teen_f = factor(H_AGETEENYEARS, levels = c(1,2), labels = c("15-17", "18+"))) %>%
ggplot(aes(x = factor(EAID), y = response_probability_int)) +
geom_boxplot() +
facet_grid(cols = ggplot2::vars(teen_f), rows = ggplot2::vars(sex_f)) +
theme_bw()
civ2_svy <- svydesign(ids = ~EAID,
weights = ~nr_adj_int_wgt,
data = nr_adj_int_weights %>% filter(!is.na(nr_adj_int_wgt)))
summary(weights(civ2_svy))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 49.74 148.93 174.75 256.58 961.87
civ2_svy_trimmed <- trimWeights(civ2_svy,
lower = -Inf,
upper = 3.5 * median(weights(civ2_svy)),
strict = TRUE)
summary(weights(civ2_svy_trimmed))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.927 51.671 150.852 174.751 258.509 521.239
trimmed_int_weights <- tibble(civ2_svy_trimmed$variables, trmpnr1w0 = weights(civ2_svy_trimmed))