pacman::p_load(haven, tibble, tidyverse, naniar, knitr, magrittr, kableExtra, scales)
midline_group <- read_dta("00-data/midline-data/Midline_IE_1_SC_G.dta")
midline_individual <- read_dta("00-data/midline-data/Midline_IE_2_L.dta")
# Function to create the data dictionary
svy_dict <- function(x, ...) {
dict <- tibble(
var_names = colnames(x),
var_labs = sjlabelled::get_label(x),
val_labs = sjlabelled::get_labels(x, values = "n"),
...
)
return(dict)
}
# Create the data dictionary for the single survey
midline_group_dict <- svy_dict(midline_group)
midline_ind_dict <- svy_dict(midline_individual)
head(midline_ind_dict)
## # A tibble: 6 × 3
## var_names var_labs val_labs
## <chr> <chr> <named list>
## 1 cim "cim" <chr [2,815]>
## 2 nis_d "" <chr [2,815]>
## 3 sexemembre "SEXE DU MEMBRE" <chr [2]>
## 4 hhsize "" <NULL>
## 5 adresse "" <chr [1,386]>
## 6 statut_wrong "STATUT DE RESIDENCE" <chr [4]>
The data dictionary allows us to look for keywords within questions more easily.
summary_age <- midline_individual %>%
summarise(
Variable = "Age",
Missing_Values = sum(is.na(q56)),
Non_Missing_Values = sum(!is.na(q56)),
Mean = mean(q56, na.rm = TRUE),
Median = median(q56, na.rm = TRUE),
SD = sd(q56, na.rm = TRUE),
Min = min(q56, na.rm = TRUE),
Max = max(q56, na.rm = TRUE)
)
# Create and style the table
summary_age %>%
kable(caption = "Summary Statistics for Age (q56)", align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
position = "center")
| Variable | Missing_Values | Non_Missing_Values | Mean | Median | SD | Min | Max |
|---|---|---|---|---|---|---|---|
| Age | 205 | 12416 | 42.90214 | 41 | 13.47602 | 12 | 121 |
summary_gender <- midline_individual %>%
summarise(
Man = sum(sexemembre == 1, na.rm = TRUE),
Woman = sum(sexemembre == 2, na.rm = TRUE),
Missing = sum(is.na(sexemembre))
) %>%
mutate(Total = Man + Woman)
# Create and style the table for gender
summary_gender %>%
kable(caption = "Gender Distribution", align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
position = "center")
| Man | Woman | Missing | Total |
|---|---|---|---|
| 7355 | 5061 | 205 | 12416 |
# Summary statistics for hhsize (household size)
summary_hhsize <- midline_individual %>%
summarise(
Variable = "Household Size",
Missing_Values = sum(is.na(hhsize)),
Mean = mean(hhsize, na.rm = TRUE),
Median = median(hhsize, na.rm = TRUE),
SD = sd(hhsize, na.rm = TRUE),
Min = min(hhsize, na.rm = TRUE),
Max = max(hhsize, na.rm = TRUE)
)
# Create and style the table
summary_hhsize %>%
kable(caption = "Summary Statistics for Household Size (hhsize)", align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
position = "center")
| Variable | Missing_Values | Mean | Median | SD | Min | Max |
|---|---|---|---|---|---|---|
| Household Size | 205 | 4.89441 | 5 | 2.584201 | 1 | 16 |
CD136 [NOM] est de quelle nationalité ? 1=Congolaise 2=RDC 3=Camerounaise 4=RCA 5=Rwandaise 6=Malienne 7=Sénégalaise 8=Béninoise 9=angolaise 10=chinoide 10=autrte afrique 12=Francaise 13=Autre Europe 14=Autre
cd136_labels <- c(
`1` = "Congolese",
`2` = "DR Congo",
`3` = "Cameroonian",
`4` = "Central African",
`5` = "Rwandan",
`6` = "Malian",
`7` = "Senegalese",
`8` = "Beninese",
`9` = "Angolan",
`10` = "Chinese",
`11` = "Other African",
`12` = "French",
`13` = "Other European",
`14` = "Other Nationality"
)
# Summary statistics for CD136 (What is [NAME]'s nationality?)
summary_cd136 <- midline_individual %>%
mutate(
CD136_label = factor(CD136, levels = names(cd136_labels), labels = cd136_labels)
) %>%
group_by(CD136_label) %>%
summarise(
Count = n(),
Percentage = round((n() / nrow(midline_individual)) * 100, 2)
)
summary_cd136 %>%
kable(caption = "What is [NAME]'s nationality? (CD136)", align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
position = "center")
| CD136_label | Count | Percentage |
|---|---|---|
| Congolese | 9526 | 75.48 |
| DR Congo | 1411 | 11.18 |
| Cameroonian | 11 | 0.09 |
| Central African | 1655 | 13.11 |
| Rwandan | 9 | 0.07 |
| Malian | 1 | 0.01 |
| Angolan | 1 | 0.01 |
| Chinese | 1 | 0.01 |
| Other African | 2 | 0.02 |
| Other Nationality | 4 | 0.03 |
CD137 Is [NAME] a refugee or asylum seeker in the ROC? 1 refugee 2 asylum seeker 3 Other
cd137_labels <- c(
`1` = "Refugee",
`2` = "Asylum Seeker",
`3` = "Other"
)
# Summary statistics for CD137 (Is [NAME] a refugee or asylum seeker in the ROC?)
summary_cd137 <- midline_individual %>%
mutate(
CD137_label = factor(CD137, levels = names(cd137_labels), labels = cd137_labels)
) %>%
group_by(CD137_label) %>%
summarise(
Count = n(),
Percentage = round((n() / nrow(midline_individual)) * 100, 2)
)
summary_cd137 %>%
kable(caption = "Is [NAME] a refugee or asylum seeker in the ROC? (CD137)",
align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
position = "center")
| CD137_label | Count | Percentage |
|---|---|---|
| Refugee | 2548 | 20.19 |
| Asylum Seeker | 48 | 0.38 |
| Other | 499 | 3.95 |
| NA | 9526 | 75.48 |
The number of NAs is consistent with the amount of Congolese nationals
CE101. Has [NAME] ever been to school during her life? “1=‘Yes’ 2=‘No’
summary_ce101 <- midline_individual %>%
mutate(
CE101_label = factor(CE101, levels = c(1, 2), labels = c("Yes", "No"))
) %>%
group_by(CE101_label) %>%
summarise(
Count = n(),
Percentage = (n() / nrow(midline_individual)) * 100
)
summary_ce101 %>%
kable(caption = "Has [NAME] ever been to school during her life? (CE101)",
align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
position = "center")
| CE101_label | Count | Percentage |
|---|---|---|
| Yes | 7416 | 58.75921 |
| No | 2790 | 22.10601 |
| NA | 2415 | 19.13478 |
Figure out why CE101 has so many NAs
# Define age groups
midline_individual <- midline_individual %>%
mutate(
Age_Group = cut(q56, breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40,
45, 50, 55, 60, Inf), right = FALSE,
labels = c("0-4", "5-9", "10-14", "15-19", "20-24",
"25-29", "30-34", "35-39", "40-44", "45-49",
"50-54", "55-59", "60+")),
CE101_label = factor(CE101, levels = c(1, 2), labels = c("Yes", "No")))
# Cross-tabulation of CE101 with age groups, including NAs
cross_table <- midline_individual %>%
group_by(Age_Group, CE101_label) %>%
summarise(Count = n(), .groups = 'drop') %>%
pivot_wider(names_from = CE101_label, values_from = Count,
values_fill = list(Count = 0)) %>%
mutate(Total = Yes + No + `NA`) %>%
select(Age_Group, Yes, No, `NA`, Total)
cross_table %>%
kable(caption = "Table of Ever been to School (CE101) by Age Group", align = 'c') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
position = "center")
| Age_Group | Yes | No | NA | Total |
|---|---|---|---|---|
| 10-14 | 15 | 3 | 3 | 21 |
| 15-19 | 47 | 23 | 24 | 94 |
| 20-24 | 327 | 154 | 219 | 700 |
| 25-29 | 527 | 228 | 312 | 1067 |
| 30-34 | 829 | 345 | 430 | 1604 |
| 35-39 | 1210 | 401 | 458 | 2069 |
| 40-44 | 1135 | 332 | 313 | 1780 |
| 45-49 | 1093 | 263 | 227 | 1583 |
| 50-54 | 805 | 279 | 153 | 1237 |
| 55-59 | 547 | 203 | 107 | 857 |
| 60+ | 766 | 510 | 128 | 1404 |
| NA | 115 | 49 | 41 | 205 |
Doesn’t seem like there’s necessarily a tendency of NAs for specific age groups…
Cereals:
Beans:
Milk and Dairy:
Meat, Poultry, Fish:
Muscle Meat:
Offal:
Fish:
Eggs:
Vegetables:
Detailed Vegetable Consumption:
Main Source of Detailed Vegetable Consumption:
Fruits:
Oil, Fat, Butter:
Sugar and Sugary Products:
Spices and Condiments:
long_data <- midline_individual %>%
pivot_longer(
cols = starts_with("bg5q"),
names_to = "variable",
values_to = "value"
)
long_data <- long_data %>%
mutate(
food_type = case_when(
str_detect(variable, "501a$") ~ "cereals",
str_detect(variable, "502a$") ~ "beans",
str_detect(variable, "503a$") ~ "milk_dairy",
str_detect(variable, "504aa$") ~ "muscle_meat",
str_detect(variable, "504ab$") ~ "offal",
str_detect(variable, "504ac$") ~ "fish",
str_detect(variable, "504ad$") ~ "eggs",
str_detect(variable, "505aa$") ~ "root_vegetables",
str_detect(variable, "505ab$") ~ "leafy_vegetables",
str_detect(variable, "506a$") ~ "fruits",
str_detect(variable, "507a$") ~ "oil_fat_butter",
str_detect(variable, "508a$") ~ "sugar_products",
str_detect(variable, "509a$") ~ "spices_condiments",
TRUE ~ NA_character_
),
question_type = case_when(
str_detect(variable, "a$") ~ "consumption_days",
str_detect(variable, "b$") ~ "source"
)
) %>%
filter(!is.na(food_type) & !is.na(question_type))
# Calculate the avg. number of consumption days for each food type at the individual level
consumption_summary <- long_data %>%
filter(question_type == "consumption_days") %>%
group_by(iid, food_type) %>%
summarize(
total_days = sum(as.numeric(value), na.rm = TRUE),
avg_days = mean(as.numeric(value), na.rm = TRUE)
) %>%
group_by(food_type) %>%
summarize(
avg_days = mean(avg_days, na.rm = TRUE),
median_days = median(total_days, na.rm = TRUE),
min_days = min(total_days, na.rm = TRUE),
max_days = max(total_days, na.rm = TRUE))
## `summarise()` has grouped output by 'iid'. You can override using the `.groups`
## argument.
consumption_distribution <- ggplot(consumption_summary,
aes(x = food_type, y = avg_days)) +
geom_col() +
labs(
title = "Average Number of Consumption Days for Each Food Type",
x = "Food Type",
y = "Average Number of Days"
) +
ylim(0, max(consumption_summary$avg_days, na.rm = TRUE) + 1) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
consumption_distribution
Individual level
# Calculate the total and average number of consumption days for each
# food type at the individual level
consumption_summary_ind <- long_data %>%
filter(question_type == "consumption_days") %>%
group_by(iid, food_type) %>%
summarize(
total_days = sum(as.numeric(value), na.rm = TRUE),
.groups = 'drop' # Ensure ungrouped result
) %>%
pivot_wider(names_from = food_type, values_from = total_days,
values_fill = list(total_days = 0))
# Convert the data to a long format for heatmap visualization
heatmap_data <- consumption_summary_ind %>%
pivot_longer(cols = -iid, names_to = "food_type", values_to = "total_days")
ggplot(heatmap_data, aes(x = food_type, y = as.factor(iid), fill = total_days)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "blue", limits = c(0, 7), breaks = 0:7) +
labs(
title = "Individual Consumption Heatmap",
x = "Food Type",
y = "Individuals",
fill = "Total Days"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
axis.text.y = element_blank(), # Hide y-axis text
axis.ticks.y = element_blank() # Hide y-axis ticks
)
Identifying individuals with zero consumption
By gender
# Define food consumption variables
food_consumption_vars <- c("bg5q501a", "bg5q502a", "bg5q503a", "bg5q504a", "bg5q504aa",
"bg5q504ab", "bg5q504ac", "bg5q504ad", "bg5q505a", "bg5q505aa",
"bg5q505ab", "bg5q506a", "bg5q507a", "bg5q508a", "bg5q509a")
# Transform the data to long format
long_data <- midline_individual %>%
select(iid, sexemembre, all_of(food_consumption_vars)) %>%
pivot_longer(cols = -c(iid, sexemembre), names_to = "food_type",
values_to = "consumption_days")
# Identify individuals with zero consumption for all food types
zero_consumption_individuals <- long_data %>%
group_by(iid, sexemembre) %>%
summarize(total_consumption = sum(as.numeric(consumption_days), na.rm = TRUE)) %>%
filter(total_consumption == 0)
## `summarise()` has grouped output by 'iid'. You can override using the `.groups`
## argument.
# Summarize the gender distribution
gender_distribution_zero_consumption_summary <- zero_consumption_individuals %>%
ungroup() %>%
summarize(
Men = sum(sexemembre == 1, na.rm = TRUE),
Women = sum(sexemembre == 2, na.rm = TRUE),
Missing = sum(is.na(sexemembre))
) %>%
mutate(Total = Men + Women + Missing)
gender_distribution_table <- gender_distribution_zero_consumption_summary %>%
kable("html", col.names = c("Men", "Women", "Missing", "Total"),
caption = "Gender Distribution of Individuals with Zero Food Consumption") %>%
kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed"))
gender_distribution_table
| Men | Women | Missing | Total |
|---|---|---|---|
| 389 | 375 | 205 | 969 |
Age
# Update age groups with more detailed breakdowns and assign to Age_Group2
midline_individual <- midline_individual %>%
mutate(
Age_Group2 = cut(q56, breaks = c(0, 5, 10, 15, 20, 25, 30, 35,
40, 45, 50, 55, 60, Inf), right = FALSE,
labels = c("0-4", "5-9", "10-14", "15-19", "20-24",
"25-29", "30-34", "35-39", "40-44", "45-49",
"50-54", "55-59", "60+")))
# Transform the data to long format
long_data_age <- midline_individual %>%
select(iid, Age_Group2, all_of(food_consumption_vars)) %>%
pivot_longer(cols = -c(iid, Age_Group2), names_to = "food_type",
values_to = "consumption_days")
# Identify individuals with zero consumption for all food types
zero_consumption_individuals_age <- long_data_age %>%
group_by(iid, Age_Group2) %>%
summarize(total_consumption = sum(as.numeric(consumption_days), na.rm = TRUE)) %>%
filter(total_consumption == 0)
## `summarise()` has grouped output by 'iid'. You can override using the `.groups`
## argument.
# Summarize the age distribution
age_distribution_zero_consumption_summary <- zero_consumption_individuals_age %>%
ungroup() %>%
count(Age_Group2) %>%
rename(Count = n)
age_distribution_table <- age_distribution_zero_consumption_summary %>%
kable("html", col.names = c("Age Group", "Count"),
caption = "Age Distribution of Individuals with Zero Food Consumption") %>%
kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed"))
age_distribution_table
| Age Group | Count |
|---|---|
| 15-19 | 17 |
| 20-24 | 40 |
| 25-29 | 50 |
| 30-34 | 87 |
| 35-39 | 109 |
| 40-44 | 110 |
| 45-49 | 90 |
| 50-54 | 88 |
| 55-59 | 31 |
| 60+ | 142 |
| NA | 205 |
Zero consumption based on nationality
# Transform data to long format
long_data_nationality <- midline_individual %>%
select(iid, CD136, all_of(food_consumption_vars)) %>%
pivot_longer(cols = -c(iid, CD136), names_to = "food_type",
values_to = "consumption_days")
# Identify individuals with zero consumption for all food types
zero_consumption_ind_nationality <- long_data_nationality %>%
group_by(iid, CD136) %>%
summarize(total_consumption = sum(as.numeric(consumption_days), na.rm = TRUE)) %>%
filter(total_consumption == 0)
## `summarise()` has grouped output by 'iid'. You can override using the `.groups`
## argument.
# Summarize nationality distribution
nationality_distribution_zero_consumption_summary <- zero_consumption_ind_nationality %>%
ungroup() %>%
count(CD136) %>%
mutate(Nationality = recode(CD136, !!!cd136_labels)) %>%
select(Nationality, n) %>%
rename(Count = n)
# Summary table
nationality_distribution_zero_consumption_summary %>%
kable("html", col.names = c("Nationality", "Count"),
caption = "Nationality Distribution of Individuals with Zero Food Consumption") %>%
kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed"))
| Nationality | Count |
|---|---|
| Congolese | 799 |
| DR Congo | 84 |
| Central African | 86 |
A701 “During the last 7 days, has [NAME] worked at least one hour, with or without pay, in a field or garden belonging to him or another member of the household? Or has [NAME] Did he raise animals, fish or hunt?
A702 “During the last 7 days, has [NAME] worked at least one hour, with or without pay, in a business, a processing activity, provided a service on his own account or on behalf of a other member of the household? For example as a craftsman, trader or lawyer, doctor or other self-employed person?
A703 “During the last 7 days, has [NAME] worked at least one hour, for a company, for the State, for a boss or any other person who is not a member of your household? (even part-time or occasionally)
A704 During the last 7 days, has [NAME] worked at least one hour as an apprentice with or without pay?
1 Yes 2 No -8 Don't know-9 Refused to answer”
# Summarize distribution of responses for each question
work_activity_distribution <- midline_individual %>%
select(iid, A701, A702, A703, A704) %>%
pivot_longer(cols = starts_with("A7"), names_to = "question",
values_to = "response") %>%
group_by(question, response) %>%
summarize(count = n(), .groups = 'drop')
# Rename questions
work_activity_distribution <- work_activity_distribution %>%
mutate(question = recode(question,
"A701" = "Agriculture",
"A702" = "Business_Service",
"A703" = "Other_Employee",
"A704" = "Apprentice"))
# Calculate percentages within each question group
work_activity_distribution <- work_activity_distribution %>%
group_by(question) %>%
mutate(percentage = (count / sum(count)) * 100)
# Pivot data to wide format
summary_work_activity <- work_activity_distribution %>%
pivot_wider(names_from = question, values_from = c(count, percentage),
names_sep = "_") %>%
arrange(response)
# Ensure columns are numeric numeric and round them to two decimal places
summary_work_activity <- summary_work_activity %>%
mutate(across(starts_with("percentage"), ~ round(as.numeric(.), 2)))
summary_work_activity %>%
kable("html", col.names = c("Response", "Agriculture N", "Business/Service N",
"Other Employee N", "Apprentice N", "Agriculture %",
"Business Service %", "Other Employee %", "Apprentice %"),
caption = "Summary of Work Activities in the Last 7 Days") %>%
kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed"))
| Response | Agriculture N | Business/Service N | Other Employee N | Apprentice N | Agriculture % | Business Service % | Other Employee % | Apprentice % |
|---|---|---|---|---|---|---|---|---|
| 0 | 4747 | 6764 | 7246 | 7199 | 37.61 | 53.59 | 57.41 | 57.04 |
| 1 | 2766 | 747 | 153 | 143 | 21.92 | 5.92 | 1.21 | 1.13 |
| NA | 5108 | 5110 | 5222 | 5279 | 40.47 | 40.49 | 41.38 | 41.83 |
A713 Average number of hours of work per day
A714 Average number of working days per week
A715 Average number of months of work per year
# Distribution of working time questions
working_time_summary <- midline_individual %>%
select(iid, A713, A714, A715) %>%
summarize(
avg_hours_per_day = round(mean(A713, na.rm = TRUE), 2),
sd_hours_per_day = round(sd(A713, na.rm = TRUE), 2),
min_hours_per_day = round(min(A713, na.rm = TRUE), 2),
max_hours_per_day = round(max(A713, na.rm = TRUE), 2),
median_hours_per_day = round(median(A713, na.rm = TRUE), 2),
na_hours_per_day = sum(is.na(A713)),
avg_days_per_week = round(mean(A714, na.rm = TRUE), 2),
sd_days_per_week = round(sd(A714, na.rm = TRUE), 2),
min_days_per_week = round(min(A714, na.rm = TRUE), 2),
max_days_per_week = round(max(A714, na.rm = TRUE), 2),
median_days_per_week = round(median(A714, na.rm = TRUE), 2),
na_days_per_week = sum(is.na(A714)),
avg_months_per_year = round(mean(A715, na.rm = TRUE), 2),
sd_months_per_year = round(sd(A715, na.rm = TRUE), 2),
min_months_per_year = round(min(A715, na.rm = TRUE), 2),
max_months_per_year = round(max(A715, na.rm = TRUE), 2),
median_months_per_year = round(median(A715, na.rm = TRUE), 2),
na_months_per_year = sum(is.na(A715))
)
# Summary table
working_time_distribution <- tibble(
Statistic = c("Mean", "Standard Deviation", "Minimum", "Maximum", "Median", "NA Count"),
`Average number of hours of work per day` = c(
working_time_summary$avg_hours_per_day,
working_time_summary$sd_hours_per_day,
working_time_summary$min_hours_per_day,
working_time_summary$max_hours_per_day,
working_time_summary$median_hours_per_day,
working_time_summary$na_hours_per_day
),
`Average number of working days per week` = c(
working_time_summary$avg_days_per_week,
working_time_summary$sd_days_per_week,
working_time_summary$min_days_per_week,
working_time_summary$max_days_per_week,
working_time_summary$median_days_per_week,
working_time_summary$na_days_per_week
),
`Average number of months of work per year` = c(
working_time_summary$avg_months_per_year,
working_time_summary$sd_months_per_year,
working_time_summary$min_months_per_year,
working_time_summary$max_months_per_year,
working_time_summary$median_months_per_year,
working_time_summary$na_months_per_year
)
)
working_time_distribution %>%
kable("html", col.names = c("Statistic", "Hours per Day", "Days per Week", "Months per Year"),
caption = "Summary of Working Time Distribution") %>%
kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed"))
| Statistic | Hours per Day | Days per Week | Months per Year |
|---|---|---|---|
| Mean | 5.85 | 4.98 | 9.37 |
| Standard Deviation | 2.36 | 1.46 | 3.06 |
| Minimum | 0.00 | 0.00 | 0.00 |
| Maximum | 16.00 | 7.00 | 12.00 |
| Median | 6.00 | 5.00 | 11.00 |
| NA Count | 9474.00 | 9474.00 | 9474.00 |
A716 Form of payment for main employment 1 Fixed salary (month,
fortnight, week) 2 On the working day or hour 3 On task 4 Committee 5
Benefits 6 In kind (products, food, etc.) 7 Is not paid
-8 Don't know-9 Refused to answer
A717 Last month’s income for this job
Social cohesion