Preparation

Load packages

pacman::p_load(haven, tibble, tidyverse, naniar, knitr, magrittr, kableExtra, scales)

Load data

# Read the two midline Stata (.dta) files with haven.
midline_group <- haven::read_dta(
  file = "00-data/midline-data/Midline_IE_1_SC_G.dta"
)
midline_individual <- haven::read_dta(
  file = "00-data/midline-data/Midline_IE_2_L.dta"
)

Data dictionary

#' Build a data dictionary for a labelled survey data frame
#'
#' @param x A data frame whose columns may carry variable and value labels.
#' @param ... Additional name-value pairs passed on to `tibble()`; they become
#'   extra columns of the dictionary.
#' @return A tibble with the columns `var_names` (column names of `x`),
#'   `var_labs` (result of `sjlabelled::get_label()`) and `val_labs`
#'   (result of `sjlabelled::get_labels()` with `values = "n"`), plus any
#'   columns supplied through `...`.
svy_dict <- function(x, ...) {
  variable_names <- colnames(x)
  variable_labels <- sjlabelled::get_label(x)
  value_labels <- sjlabelled::get_labels(x, values = "n")

  tibble(
    var_names = variable_names,
    var_labs = variable_labels,
    val_labs = value_labels,
    ...
  )
}


# Build one data dictionary per loaded midline file
midline_group_dict <- midline_group %>% svy_dict()
midline_ind_dict <- midline_individual %>% svy_dict()

# Preview the first rows of the individual-file dictionary
midline_ind_dict %>% head()

## # A tibble: 6 × 3
##   var_names    var_labs              val_labs     
##   <chr>        <chr>                 <named list> 
## 1 cim          "cim"                 <chr [2,815]>
## 2 nis_d        ""                    <chr [2,815]>
## 3 sexemembre   "SEXE DU MEMBRE"      <chr [2]>    
## 4 hhsize       ""                    <NULL>       
## 5 adresse      ""                    <chr [1,386]>
## 6 statut_wrong "STATUT DE RESIDENCE" <chr [4]>

The data dictionary allows us to look for keywords within questions more easily.

Summary Statistics

Age

# Descriptive statistics for age (q56). Missing values are counted on their
# own and excluded from every statistic (na.rm = TRUE).
summary_age <- midline_individual %>%
  summarise(
    Variable = "Age",
    Missing_Values = sum(is.na(q56)),
    Non_Missing_Values = sum(!is.na(q56)),
    Mean = mean(q56, na.rm = TRUE),
    Median = median(q56, na.rm = TRUE),
    SD = sd(q56, na.rm = TRUE),
    Min = min(q56, na.rm = TRUE),
    Max = max(q56, na.rm = TRUE)
  )

# Create and style the table. FALSE is spelled out because the shorthand `F`
# is an ordinary variable that can be reassigned.
summary_age %>%
  kable(caption = "Summary Statistics for Age (q56)", align = "c") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )

Summary Statistics for Age (q56)
Variable	Missing_Values	Non_Missing_Values	Mean	Median	SD	Min	Max
Age	205	12416	42.90214	41	13.47602	12	121

Gender

# Gender counts from sexemembre (1 = man, 2 = woman). Rows with a missing
# value are reported in `Missing`; `Total` is the number of rows with a
# recorded gender, i.e. it does NOT include `Missing`.
summary_gender <- midline_individual %>%
  summarise(
    Man = sum(sexemembre == 1, na.rm = TRUE),
    Woman = sum(sexemembre == 2, na.rm = TRUE),
    Missing = sum(is.na(sexemembre))
  ) %>%
  mutate(Total = Man + Woman)

# Create and style the table for gender. FALSE is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned.
summary_gender %>%
  kable(caption = "Gender Distribution", align = "c") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )

Gender Distribution
Man	Woman	Missing	Total
7355	5061	205	12416

Household size

# Summary statistics for hhsize (household size). Missing values are counted
# separately and excluded from every statistic (na.rm = TRUE).
summary_hhsize <- midline_individual %>%
  summarise(
    Variable = "Household Size",
    Missing_Values = sum(is.na(hhsize)),
    Mean = mean(hhsize, na.rm = TRUE),
    Median = median(hhsize, na.rm = TRUE),
    SD = sd(hhsize, na.rm = TRUE),
    Min = min(hhsize, na.rm = TRUE),
    Max = max(hhsize, na.rm = TRUE)
  )

# Create and style the table. FALSE is spelled out because the shorthand `F`
# is an ordinary variable that can be reassigned.
summary_hhsize %>%
  kable(
    caption = "Summary Statistics for Household Size (hhsize)",
    align = "c"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )

Summary Statistics for Household Size (hhsize)
Variable	Missing_Values	Mean	Median	SD	Min	Max
Household Size	205	4.89441	5	2.584201	1	16

Nationality

CD136 [NOM] est de quelle nationalité ? 1=Congolaise 2=RDC 3=Camerounaise 4=RCA 5=Rwandaise 6=Malienne 7=Sénégalaise 8=Béninoise 9=Angolaise 10=Chinoise 11=Autre Afrique 12=Française 13=Autre Europe 14=Autre

# English labels for the CD136 nationality codes; the names of the vector are
# the codes "1" to "14" (as character), the values are the display labels.
cd136_labels <- setNames(
  c(
    "Congolese",
    "DR Congo",
    "Cameroonian",
    "Central African",
    "Rwandan",
    "Malian",
    "Senegalese",
    "Beninese",
    "Angolan",
    "Chinese",
    "Other African",
    "French",
    "Other European",
    "Other Nationality"
  ),
  as.character(1:14)
)

# Summary statistics for CD136 (What is [NAME]'s nationality?).
# Codes are mapped to the English labels in cd136_labels; percentages use all
# rows of midline_individual as the denominator.
summary_cd136 <- midline_individual %>%
  mutate(
    CD136_label = factor(
      CD136,
      levels = names(cd136_labels),
      labels = cd136_labels
    )
  ) %>%
  group_by(CD136_label) %>%
  summarise(
    Count = n(),
    Percentage = round((n() / nrow(midline_individual)) * 100, 2)
  )

# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
summary_cd136 %>%
  kable(caption = "What is [NAME]'s nationality? (CD136)", align = "c") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )

What is [NAME]’s nationality? (CD136)
CD136_label	Count	Percentage
Congolese	9526	75.48
DR Congo	1411	11.18
Cameroonian	11	0.09
Central African	1655	13.11
Rwandan	9	0.07
Malian	1	0.01
Angolan	1	0.01
Chinese	1	0.01
Other African	2	0.02
Other Nationality	4	0.03

CD137 Is [NAME] a refugee or asylum seeker in the ROC? 1 refugee 2 asylum seeker 3 Other

# English labels for the CD137 status codes; names are the codes "1" to "3".
cd137_labels <- setNames(
  c("Refugee", "Asylum Seeker", "Other"),
  c("1", "2", "3")
)

# Summary statistics for CD137 (Is [NAME] a refugee or asylum seeker in the
# ROC?). Codes are mapped to the labels in cd137_labels; rows with no answer
# form their own NA group, and percentages use all rows of midline_individual
# as the denominator.
summary_cd137 <- midline_individual %>%
  mutate(
    CD137_label = factor(
      CD137,
      levels = names(cd137_labels),
      labels = cd137_labels
    )
  ) %>%
  group_by(CD137_label) %>%
  summarise(
    Count = n(),
    Percentage = round((n() / nrow(midline_individual)) * 100, 2)
  )

# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
summary_cd137 %>%
  kable(
    caption = "Is [NAME] a refugee or asylum seeker in the ROC? (CD137)",
    align = "c"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )

Is [NAME] a refugee or asylum seeker in the ROC? (CD137)
CD137_label	Count	Percentage
Refugee	2548	20.19
Asylum Seeker	48	0.38
Other	499	3.95
NA	9526	75.48

The number of NAs (9,526) equals the number of Congolese nationals, which suggests that CD137 was only asked of non-Congolese respondents.

Education

CE101. Has [NAME] ever been to school during her life? 1 = “Yes”, 2 = “No”

# Distribution of CE101 (ever attended school): 1 = Yes, 2 = No; any other
# value, including missing, falls into the NA group. Percentages use all rows
# of midline_individual as the denominator and are rounded to two decimals,
# like the other frequency tables in this report.
summary_ce101 <- midline_individual %>%
  mutate(
    CE101_label = factor(CE101, levels = c(1, 2), labels = c("Yes", "No"))
  ) %>%
  group_by(CE101_label) %>%
  summarise(
    Count = n(),
    Percentage = round((n() / nrow(midline_individual)) * 100, 2)
  )

# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
summary_ce101 %>%
  kable(
    caption = "Has [NAME] ever been to school during her life? (CE101)",
    align = "c"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )

Has [NAME] ever been to school during her life? (CE101)
CE101_label	Count	Percentage
Yes	7416	58.75921
No	2790	22.10601
NA	2415	19.13478

Figure out why CE101 has so many NAs

# Define age groups: q56 is binned into left-closed five-year intervals
# ([0, 5), [5, 10), ..., [55, 60), [60, Inf)), and CE101 (1 = Yes, 2 = No) is
# turned into a labelled factor.
midline_individual <- midline_individual %>%
  mutate(
    Age_Group = cut(
      q56,
      breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, Inf),
      right = FALSE,
      labels = c(
        "0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34",
        "35-39", "40-44", "45-49", "50-54", "55-59", "60+"
      )
    ),
    CE101_label = factor(CE101, levels = c(1, 2), labels = c("Yes", "No"))
  )

# Cross-tabulation of CE101 with age groups, including NAs
cross_table <- midline_individual %>%
  group_by(Age_Group, CE101_label) %>%
  summarise(Count = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = CE101_label,
    values_from = Count,
    values_fill = list(Count = 0)
  ) %>%
  # Sum over whichever answer columns exist instead of naming `NA` directly,
  # so the table still builds when an answer category is absent from the data.
  mutate(Total = rowSums(across(-Age_Group))) %>%
  select(Age_Group, any_of(c("Yes", "No", "NA")), Total)

# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
cross_table %>%
  kable(
    caption = "Table of Ever been to School (CE101) by Age Group",
    align = "c"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )

Table of Ever been to School (CE101) by Age Group
Age_Group	Yes	No	NA	Total
10-14	15	3	3	21
15-19	47	23	24	94
20-24	327	154	219	700
25-29	527	228	312	1067
30-34	829	345	430	1604
35-39	1210	401	458	2069
40-44	1135	332	313	1780
45-49	1093	263	227	1583
50-54	805	279	153	1237
55-59	547	203	107	857
60+	766	510	128	1404
NA	115	49	41	205

Doesn’t seem like there’s necessarily a tendency of NAs for specific age groups…

Relevant Outcomes

Consumption (quantity - volume/amount, and quality - type)

Cereals:

bg5q501a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES CÉRÉALES, DES
- English: In the last 7 days, how many days did you consume cereals?
bg5q501b
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Beans:

bg5q502a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES HARICOTS, DES
- English: In the last 7 days, how many days did you consume beans?
bg5q502b
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Milk and Dairy:

bg5q503a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DU LAIT OU UN AUTR
- English: In the last 7 days, how many days did you consume milk or other dairy products?
bg5q503b
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Meat, Poultry, Fish:

bg5q504a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DE LA VIANDE, DU P
- English: In the last 7 days, how many days did you consume meat, poultry, or fish?

Muscle Meat:

bg5q504aa
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DE LA VIANDE (MUSC
- English: In the last 7 days, how many days did you consume muscle meat?
bg5q504ba
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Offal:

bg5q504ab
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES ABATS: FOIE, R
- English: In the last 7 days, how many days did you consume offal: liver, kidneys, etc.?
bg5q504bb
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Fish:

bg5q504ac
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DU POISSON: AUTRE
- English: In the last 7 days, how many days did you consume fish?
bg5q504bc
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Eggs:

bg5q504ad
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES ŒUFS ?
- English: In the last 7 days, how many days did you consume eggs?
bg5q504bd
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Vegetables:

bg5q505a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES LÉGUMES ?
- English: In the last 7 days, how many days did you consume vegetables?
bg5q505b
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Detailed Vegetable Consumption:

bg5q505aa
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES LÉGUMES DE COU
- English: In the last 7 days, how many days did you consume root vegetables?
bg5q505ab
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES LÉGUMES-FEUILL
- English: In the last 7 days, how many days did you consume leafy vegetables?

Main Source of Detailed Vegetable Consumption:

bg5q505ba
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?
bg5q505bb
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Fruits:

bg5q506a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES FRUITS : BANAN
- English: In the last 7 days, how many days did you consume fruits?
bg5q506b
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Oil, Fat, Butter:

bg5q507a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DE L’HUILE/GRAS/BE
- English: In the last 7 days, how many days did you consume oil/fat/butter?
bg5q507b
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Sugar and Sugary Products:

bg5q508a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DU SUCRE OU PRODUI
- English: In the last 7 days, how many days did you consume sugar or sugary products?
bg5q508b
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

Spices and Condiments:

bg5q509a
- French: SUR LES 7 DERNIERS JOURS, COMBIEN DE JOURS AVEZ-VOUS CONSOMMÉ DES EPICES/CONDIM
- English: In the last 7 days, how many days did you consume spices/condiments?
bg5q509b
- French: QUELLE EST LA PRINCIPALE SOURCE DES ALIMENTS CONSOMMES ?
- English: What is the main source of the consumed food?

# Reshape every bg5q* food variable to long format (one row per individual
# and variable), then tag each row with the food it refers to and with the
# kind of question (days consumed vs. main source).
long_data <- midline_individual %>%
  pivot_longer(
    cols = starts_with("bg5q"),
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    food_type = case_when(
      str_detect(variable, "501a$") ~ "cereals",
      str_detect(variable, "502a$") ~ "beans",
      str_detect(variable, "503a$") ~ "milk_dairy",
      str_detect(variable, "504aa$") ~ "muscle_meat",
      str_detect(variable, "504ab$") ~ "offal",
      str_detect(variable, "504ac$") ~ "fish",
      str_detect(variable, "504ad$") ~ "eggs",
      str_detect(variable, "505aa$") ~ "root_vegetables",
      str_detect(variable, "505ab$") ~ "leafy_vegetables",
      str_detect(variable, "506a$") ~ "fruits",
      str_detect(variable, "507a$") ~ "oil_fat_butter",
      str_detect(variable, "508a$") ~ "sugar_products",
      str_detect(variable, "509a$") ~ "spices_condiments",
      TRUE ~ NA_character_
    ),
    # The question kind is encoded by the letter that directly follows the
    # three-digit item number ("a" = days consumed, "b" = main source); an
    # optional second letter identifies the sub-item. Testing only the LAST
    # letter mislabels sub-items: 504ab/505ab (offal, leafy vegetables) would
    # be tagged "source" and 504ac/504ad (fish, eggs) would get NA, so all
    # four would vanish from the consumption summaries.
    question_type = case_when(
      str_detect(variable, "\\d{3}a[a-z]?$") ~ "consumption_days",
      str_detect(variable, "\\d{3}b[a-z]?$") ~ "source"
    )
  ) %>%
  # Only variables with a food_type mapping survive; the mapping above lists
  # consumption-day items only, so "source" rows are dropped here as well.
  filter(!is.na(food_type) & !is.na(question_type))

# Calculate the avg. number of consumption days for each food type.
# Step 1 aggregates to one row per individual (iid) and food type; step 2
# summarises across individuals within each food type.
consumption_summary <- long_data %>%
  filter(question_type == "consumption_days") %>%
  group_by(iid, food_type) %>%
  summarize(
    # sum(..., na.rm = TRUE) returns 0 when every value is missing, which
    # would turn non-response into "0 days" in the median/min below while the
    # mean ignores it. Keep such cases as NA so all statistics use the same
    # set of respondents.
    total_days = if (all(is.na(value))) {
      NA_real_
    } else {
      sum(as.numeric(value), na.rm = TRUE)
    },
    avg_days = mean(as.numeric(value), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(food_type) %>%
  summarize(
    avg_days = mean(avg_days, na.rm = TRUE),
    median_days = median(total_days, na.rm = TRUE),
    min_days = min(total_days, na.rm = TRUE),
    max_days = max(total_days, na.rm = TRUE)
  )

## `summarise()` has grouped output by 'iid'. You can override using the `.groups`
## argument.

# Bar chart of the average number of consumption days per food type. The
# y-axis runs from 0 to the largest average plus one day of headroom.
consumption_distribution <- consumption_summary %>%
  ggplot(mapping = aes(x = food_type, y = avg_days)) +
  geom_col() +
  labs(
    title = "Average Number of Consumption Days for Each Food Type",
    x = "Food Type",
    y = "Average Number of Days"
  ) +
  scale_y_continuous(
    limits = c(0, max(consumption_summary$avg_days, na.rm = TRUE) + 1)
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

consumption_distribution

Individual level

# Total consumption days per individual (iid) and food type, spread to one
# column per food type; individual/food combinations without a row get 0.
consumption_summary_ind <- long_data %>%
  filter(question_type == "consumption_days") %>%
  group_by(iid, food_type) %>%
  summarize(
    total_days = sum(as.numeric(value), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = food_type,
    values_from = total_days,
    values_fill = list(total_days = 0)
  )

# Back to long format (one row per individual and food type) for plotting
heatmap_data <- consumption_summary_ind %>%
  pivot_longer(
    cols = -iid,
    names_to = "food_type",
    values_to = "total_days"
  )

# Heatmap: one tile per individual and food type, shaded by total days.
# Individual identifiers are hidden on the y-axis (text and ticks blanked).
heatmap_data %>%
  ggplot(aes(x = food_type, y = as.factor(iid), fill = total_days)) +
  geom_tile() +
  scale_fill_gradient(
    low = "white",
    high = "blue",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  labs(
    title = "Individual Consumption Heatmap",
    x = "Food Type",
    y = "Individuals",
    fill = "Total Days"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

Identifying individuals with zero consumption

By gender

# Define food consumption variables ("days consumed in the last 7 days")
food_consumption_vars <- c(
  "bg5q501a", "bg5q502a", "bg5q503a", "bg5q504a", "bg5q504aa",
  "bg5q504ab", "bg5q504ac", "bg5q504ad", "bg5q505a", "bg5q505aa",
  "bg5q505ab", "bg5q506a", "bg5q507a", "bg5q508a", "bg5q509a"
)

# Transform the data to long format: one row per individual and food variable
long_data <- midline_individual %>%
  select(iid, sexemembre, all_of(food_consumption_vars)) %>%
  pivot_longer(
    cols = -c(iid, sexemembre),
    names_to = "food_type",
    values_to = "consumption_days"
  )

# Identify individuals with zero consumption for all food types.
# sum(..., na.rm = TRUE) is 0 when every answer is missing, so individuals
# with no consumption data at all would be counted as "zero consumption".
# Require at least one non-missing answer to separate reported zeros from
# non-response.
zero_consumption_individuals <- long_data %>%
  group_by(iid, sexemembre) %>%
  summarize(
    n_reported = sum(!is.na(consumption_days)),
    total_consumption = sum(as.numeric(consumption_days), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(n_reported > 0, total_consumption == 0) %>%
  select(-n_reported)

## `summarise()` has grouped output by 'iid'. You can override using the `.groups`
## argument.

# Summarize the gender distribution of zero-consumption individuals
# (sexemembre: 1 = man, 2 = woman). Here Total includes the Missing count.
gender_distribution_zero_consumption_summary <- zero_consumption_individuals %>%
  ungroup() %>% # the input may still be grouped from the previous summarise
  summarize(
    Men = sum(sexemembre == 1, na.rm = TRUE),
    Women = sum(sexemembre == 2, na.rm = TRUE),
    Missing = sum(is.na(sexemembre))
  ) %>%
  mutate(Total = Men + Women + Missing)

# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
gender_distribution_table <- gender_distribution_zero_consumption_summary %>%
  kable(
    "html",
    col.names = c("Men", "Women", "Missing", "Total"),
    caption = "Gender Distribution of Individuals with Zero Food Consumption"
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )

gender_distribution_table

Gender Distribution of Individuals with Zero Food Consumption
Men	Women	Missing	Total
389	375	205	969

Age

# Bin age (q56) into left-closed five-year groups ([0, 5), ..., [55, 60),
# [60, Inf)) and store the result in Age_Group2.
midline_individual <- midline_individual %>%
  mutate(
    Age_Group2 = cut(
      q56,
      breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, Inf),
      right = FALSE,
      labels = c(
        "0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34",
        "35-39", "40-44", "45-49", "50-54", "55-59", "60+"
      )
    )
  )

# Transform the data to long format: one row per individual and food variable
long_data_age <- midline_individual %>%
  select(iid, Age_Group2, all_of(food_consumption_vars)) %>%
  pivot_longer(
    cols = -c(iid, Age_Group2),
    names_to = "food_type",
    values_to = "consumption_days"
  )

# Identify individuals with zero consumption for all food types.
# sum(..., na.rm = TRUE) is 0 when every answer is missing, so individuals
# with no consumption data at all would be counted as "zero consumption".
# Require at least one non-missing answer to separate reported zeros from
# non-response.
zero_consumption_individuals_age <- long_data_age %>%
  group_by(iid, Age_Group2) %>%
  summarize(
    n_reported = sum(!is.na(consumption_days)),
    total_consumption = sum(as.numeric(consumption_days), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(n_reported > 0, total_consumption == 0) %>%
  select(-n_reported)

## `summarise()` has grouped output by 'iid'. You can override using the `.groups`
## argument.

# Summarize the age distribution: number of zero-consumption individuals per
# age group (individuals with a missing age group form their own row).
age_distribution_zero_consumption_summary <- zero_consumption_individuals_age %>%
  ungroup() %>% # the input may still be grouped from the previous summarise
  count(Age_Group2, name = "Count")

# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
age_distribution_table <- age_distribution_zero_consumption_summary %>%
  kable(
    "html",
    col.names = c("Age Group", "Count"),
    caption = "Age Distribution of Individuals with Zero Food Consumption"
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )

age_distribution_table

Age Distribution of Individuals with Zero Food Consumption
Age Group	Count
15-19	17
20-24	40
25-29	50
30-34	87
35-39	109
40-44	110
45-49	90
50-54	88
55-59	31
60+	142
NA	205

Zero consumption based on nationality

# Transform data to long format: one row per individual and food variable,
# keeping the nationality code (CD136)
long_data_nationality <- midline_individual %>%
  select(iid, CD136, all_of(food_consumption_vars)) %>%
  pivot_longer(
    cols = -c(iid, CD136),
    names_to = "food_type",
    values_to = "consumption_days"
  )

# Identify individuals with zero consumption for all food types.
# sum(..., na.rm = TRUE) is 0 when every answer is missing, so individuals
# with no consumption data at all would be counted as "zero consumption".
# Require at least one non-missing answer to separate reported zeros from
# non-response.
zero_consumption_ind_nationality <- long_data_nationality %>%
  group_by(iid, CD136) %>%
  summarize(
    n_reported = sum(!is.na(consumption_days)),
    total_consumption = sum(as.numeric(consumption_days), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(n_reported > 0, total_consumption == 0) %>%
  select(-n_reported)

## `summarise()` has grouped output by 'iid'. You can override using the `.groups`
## argument.

# Summarize nationality distribution: count zero-consumption individuals per
# CD136 code and translate the codes with cd136_labels.
nationality_distribution_zero_consumption_summary <-
  zero_consumption_ind_nationality %>%
  ungroup() %>% # the input may still be grouped from the previous summarise
  count(CD136) %>%
  mutate(Nationality = recode(CD136, !!!cd136_labels)) %>%
  select(Nationality, Count = n)

# Summary table. FALSE is spelled out because the shorthand `F` is an ordinary
# variable that can be reassigned.
nationality_distribution_zero_consumption_summary %>%
  kable(
    "html",
    col.names = c("Nationality", "Count"),
    caption = "Nationality Distribution of Individuals with Zero Food Consumption"
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )

Nationality Distribution of Individuals with Zero Food Consumption
Nationality	Count
Congolese	799
DR Congo	84
Central African	86

Income and (self) employment status

A701 During the last 7 days, has [NAME] worked at least one hour, with or without pay, in a field or garden belonging to him/her or to another member of the household? Or has [NAME] raised animals, fished or hunted?

A702 During the last 7 days, has [NAME] worked at least one hour, with or without pay, in a business or a processing activity, or provided a service on his/her own account or on behalf of another member of the household? For example as a craftsman, trader, lawyer, doctor or other self-employed person?

A703 During the last 7 days, has [NAME] worked at least one hour for a company, for the State, for a boss or for any other person who is not a member of your household (even part-time or occasionally)?

A704 During the last 7 days, has [NAME] worked at least one hour as an apprentice with or without pay?

Response codes: 1 Yes, 2 No, -8 Don't know, -9 Refused to answer

# Summarize distribution of responses for each question (A701-A704): one row
# per question and response value, with the number of individuals.
# NOTE(review): the rendered table shows response values 0/1/NA, not the
# questionnaire codes 1/2/-8/-9 -- confirm how these variables were recoded.
work_activity_distribution <- midline_individual %>%
  select(iid, A701, A702, A703, A704) %>%
  pivot_longer(
    cols = starts_with("A7"),
    names_to = "question",
    values_to = "response"
  ) %>%
  group_by(question, response) %>%
  summarize(count = n(), .groups = "drop")

# Rename questions
work_activity_distribution <- work_activity_distribution %>%
  mutate(
    question = recode(
      question,
      "A701" = "Agriculture",
      "A702" = "Business_Service",
      "A703" = "Other_Employee",
      "A704" = "Apprentice"
    )
  )

# Calculate percentages within each question group, then drop the grouping so
# it does not leak into later steps
work_activity_distribution <- work_activity_distribution %>%
  group_by(question) %>%
  mutate(percentage = (count / sum(count)) * 100) %>%
  ungroup()

# Pivot data to wide format. The columns are selected explicitly so that their
# order is guaranteed to match the hard-coded table headers below.
summary_work_activity <- work_activity_distribution %>%
  pivot_wider(
    names_from = question,
    values_from = c(count, percentage),
    names_sep = "_"
  ) %>%
  select(
    response,
    count_Agriculture, count_Business_Service,
    count_Other_Employee, count_Apprentice,
    percentage_Agriculture, percentage_Business_Service,
    percentage_Other_Employee, percentage_Apprentice
  ) %>%
  arrange(response)

# Round the percentage columns to two decimal places
summary_work_activity <- summary_work_activity %>%
  mutate(across(starts_with("percentage"), ~ round(as.numeric(.x), 2)))

# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
summary_work_activity %>%
  kable(
    "html",
    col.names = c(
      "Response", "Agriculture N", "Business/Service N",
      "Other Employee N", "Apprentice N", "Agriculture %",
      "Business Service %", "Other Employee %", "Apprentice %"
    ),
    caption = "Summary of Work Activities in the Last 7 Days"
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )

Summary of Work Activities in the Last 7 Days
Response	Agriculture N	Business/Service N	Other Employee N	Apprentice N	Agriculture %	Business Service %	Other Employee %	Apprentice %
0	4747	6764	7246	7199	37.61	53.59	57.41	57.04
1	2766	747	153	143	21.92	5.92	1.21	1.13
NA	5108	5110	5222	5279	40.47	40.49	41.38	41.83

A713 Average number of hours of work per day

A714 Average number of working days per week

A715 Average number of months of work per year

# Distribution of working time questions: for each of A713 (hours per day),
# A714 (days per week) and A715 (months per year) compute mean, SD, min, max
# and median (each rounded to two decimals, missing values ignored) plus the
# number of missing values. The result is a one-row tibble whose columns are
# named <statistic>_<measure>, e.g. avg_hours_per_day, na_months_per_year.
working_time_summary <- midline_individual %>%
  summarize(
    across(
      c(A713, A714, A715),
      list(
        avg = ~ round(mean(.x, na.rm = TRUE), 2),
        sd = ~ round(sd(.x, na.rm = TRUE), 2),
        min = ~ round(min(.x, na.rm = TRUE), 2),
        max = ~ round(max(.x, na.rm = TRUE), 2),
        median = ~ round(median(.x, na.rm = TRUE), 2),
        na = ~ sum(is.na(.x))
      ),
      .names = "{.fn}_{.col}"
    )
  ) %>%
  rename_with(
    ~ str_replace_all(
      .x,
      c(
        "A713" = "hours_per_day",
        "A714" = "days_per_week",
        "A715" = "months_per_year"
      )
    )
  )

# Summary table: one row per statistic, one column per working-time question.
# The values are taken from the one-row tibble working_time_summary.
working_time_distribution <- tibble(
  Statistic = c(
    "Mean", "Standard Deviation", "Minimum", "Maximum", "Median", "NA Count"
  ),
  `Average number of hours of work per day` = with(
    working_time_summary,
    c(
      avg_hours_per_day, sd_hours_per_day, min_hours_per_day,
      max_hours_per_day, median_hours_per_day, na_hours_per_day
    )
  ),
  `Average number of working days per week` = with(
    working_time_summary,
    c(
      avg_days_per_week, sd_days_per_week, min_days_per_week,
      max_days_per_week, median_days_per_week, na_days_per_week
    )
  ),
  `Average number of months of work per year` = with(
    working_time_summary,
    c(
      avg_months_per_year, sd_months_per_year, min_months_per_year,
      max_months_per_year, median_months_per_year, na_months_per_year
    )
  )
)

# Each column mixes statistics with a count, so a single numeric format would
# print the count with decimals ("9474.00"). Format for display only: two
# decimals for the statistics, none for the NA count. The stored tibble stays
# numeric. FALSE is spelled out because `F` can be reassigned.
working_time_distribution %>%
  mutate(
    across(
      -Statistic,
      ~ if_else(
        Statistic == "NA Count",
        sprintf("%.0f", as.numeric(.x)),
        sprintf("%.2f", as.numeric(.x))
      )
    )
  ) %>%
  kable(
    "html",
    col.names = c(
      "Statistic", "Hours per Day", "Days per Week", "Months per Year"
    ),
    align = c("l", "r", "r", "r"),
    caption = "Summary of Working Time Distribution"
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )

Summary of Working Time Distribution
Statistic	Hours per Day	Days per Week	Months per Year
Mean	5.85	4.98	9.37
Standard Deviation	2.36	1.46	3.06
Minimum	0.00	0.00	0.00
Maximum	16.00	7.00	12.00
Median	6.00	5.00	11.00
NA Count	9474.00	9474.00	9474.00

A716 Form of payment for main employment: 1 Fixed salary (month, fortnight, week), 2 On the working day or hour, 3 On task, 4 Committee, 5 Benefits, 6 In kind (products, food, etc.), 7 Is not paid, -8 Don't know, -9 Refused to answer

A717 Last month’s income for this job

Midline Survey Exploration

2024-07-15

Preparation

Load packages

Load data

Data dictionary

Summary Statistics

Age

Gender

Household size

Nationality

Education

Relevant Outcomes

Consumption (quantity - volume/amount, and quality - type)

Income and (self) employment status

Schooling of Children, school enrolment

Health

Female empowerment (HH decision making)

Midline Survey Exploration

2024-07-15

Preparation

Load packages

Load data

Data dictionary

Summary Statistics

Age

Gender

Household size

Nationality

Education

Relevant Outcomes

Consumption (quantity - volume/amount, and quality - type)

Income and (self) employment status

Schooling of Children, school enrolment

Health

Social cohesion

Female empowerment (HH decision making)