Library and settings

library(tidyverse)
library(readxl)
library(nutriR)
library(ggplot2)
#library(tabulizer)
library(rnaturalearth)
library(sf)
library(DT)
library(lhs)
library(stringr)

Read in dataset

Country code and group data

country=read_excel("data/CLASS.xlsx", sheet=1) %>% janitor::clean_names()
code=read_excel("data/CLASS.xlsx", sheet=2)

datatable(
data = country,
caption = "Table 1: country code",
filter = "top",
style = "bootstrap4"
)

Calcuim nutrition data (\(a_i\))

calcium_preview=read.csv(file="data/v36_cnty.csv") %>% 
  filter(age>=17.5 & age<=47.5 & female=="1")

Calculate the shape for each region

calcium_distr=get_dists(nutrients="Calcium" ,  sexes="F", ages=15:50 ) %>% 
  drop_na() %>% 
  left_join(country, by=c("iso3"="code"))  %>%
  group_by(region, age_group) %>% 
  summarize(mean_shape=mean(g_shape))

calcium_world=calcium_distr %>% 
  group_by(age_group) %>% 
  summarize(means=mean(mean_shape))

calculate the proportion of people take insufficient calcuim

calciumNew= calcium_preview %>% 
  filter(urban=="999" & edu=="999" & year=="2018") %>% 
  rowwise() %>%
  left_join(country, by=c("iso3"="code")) %>% 
  mutate(
    age_group = case_when(
    age == 17.5 ~ "15-19",
    age == 22.5 ~ "20-24",
    age == 27.5 ~ "25-29",
    age == 32.5 ~ "30-34",
    age == 37.5 ~ "35-39",
    age == 42.5 ~ "40-44",
    age == 47.5 ~ "45-49",
    TRUE ~ as.character(age) 
  )
  ) %>% 
  left_join(calcium_distr, by=c("region"="region", "age_group"="age_group") ) %>% 
  mutate(
    
    mean_shape = case_when(
      is.na(mean_shape) & age_group == "15-19" ~ calcium_world$means[calcium_world$age_group == "15-19"],
      is.na(mean_shape) & age_group == "20-24" ~ calcium_world$means[calcium_world$age_group == "20-24"],
      is.na(mean_shape) & age_group == "25-29" ~ calcium_world$means[calcium_world$age_group == "25-29"],
      is.na(mean_shape) & age_group == "30-34" ~ calcium_world$means[calcium_world$age_group == "30-34"],
      is.na(mean_shape) & age_group == "35-39" ~ calcium_world$means[calcium_world$age_group == "35-39"],
      is.na(mean_shape) & age_group == "40-44" ~ calcium_world$means[calcium_world$age_group == "40-44"],
      is.na(mean_shape) & age_group == "45-49" ~ calcium_world$means[calcium_world$age_group == "45-49"],
      TRUE ~ mean_shape
    ),
    grate=mean_shape/median
    
  ) %>%  
  rowwise() %>% 
  mutate(pro_low=pgamma(800, shape = mean_shape, rate = grate)
           )
  
calciumNew$sev_result = NA # Create a new column to store the SEV results


for (i in seq_len(nrow(calciumNew))) {
  calciumNew$sev_result[i] = sev(
    ear=800,
    cv=0.1,
    shape=calciumNew$mean_shape[i],
    rate=calciumNew$grate[i]
  )
}

datatable(
data = calciumNew,
caption = "Table 2-1: Calcium data",
filter = "top",
style = "bootstrap4"
)

The age population distribution

file_path = "data/WPP2022_POP_F02_3_POPULATION_5-YEAR_AGE_GROUPS_FEMALE.xlsx"
pop = read_excel(file_path, skip=15) 
colnames(pop) = pop[1,]

pop = pop[-1, ] %>% 
  janitor::clean_names() %>% 
  filter(year=="2021") %>% 
  rename_with(~ gsub("x(\\d+)_", "\\1-", .), starts_with("x"))

Calculate the weight for each age group

pop_data = pop %>%
  select(iso3_alpha_code, "0-4":x100) %>%
  mutate(across("0-4":x100, as.numeric)) %>%
  drop_na() %>%
  rowwise() %>% 
  mutate(total = sum(c_across("15-19":"45-49")),
         across("15-19":"45-49", ~ . / total, .names = "proportion_{.col}")) %>% 
  select(iso3_alpha_code, starts_with("proportion_"))

datatable(
data = pop_data,
caption = "Table 2-2: Population distribution",
filter = "top",
style = "bootstrap4"
)

Turn the table from wide to long for joint

pop_long = pop_data %>%
  pivot_longer(cols = starts_with("proportion_"), names_to = "age_group", values_to = "proportion", names_prefix = "proportion_")

Joint the calcium with the population data to calculate

calciummerge=calciumNew %>% 
  left_join(pop_long, by=c("iso3"="iso3_alpha_code", "age_group"="age_group")) %>% 
  mutate(frac_sev=sev_result*proportion*0.01,
         frac_own=pro_low*proportion,) %>% 
  group_by(iso3, economy) %>% 
  summarize(tfrac_sev=sum(frac_sev),
            tfrac_own=sum(frac_own)) %>% 
  mutate(
         low_intake_sev= case_when(
   tfrac_sev> 0.25  ~ 1,
   TRUE ~ 0
  ),
  low_intake_own= case_when(
   tfrac_own> 0.25  ~ 1,
   TRUE ~ 0
  ))

datatable(
data = calciummerge,
caption = "Table 2-3: Insufficient calcium intake",
filter = "top",
style = "bootstrap4"
)  

EDA for region

calciummerge = calciummerge %>%
  left_join(country, by=c("iso3"="code"))

regions  = unique(calciummerge$region)

For the sev function calculation

for (region in regions) {
  region_data = calciummerge %>%
    filter(region == !!region)
  
  p <- ggplot(region_data, aes(x = reorder(economy.x, -tfrac_sev), y = tfrac_sev, fill = income_group)) +
    geom_bar(stat = "identity", alpha = 0.8) +
    geom_hline(yintercept = 0.25, linetype = "dashed", color = "red") +
    coord_flip() +
    xlab('Country') +
    ylab('Fraction') +
    ggtitle(paste('Insufficient Fraction in', region)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  print(p)
}

For my own calculation

for (region in regions) {
  region_data = calciummerge %>%
    filter(region == !!region)
  
  p <- ggplot(region_data, aes(x = reorder(economy.x, -tfrac_own), y = tfrac_own, fill = income_group)) +
    geom_bar(stat = "identity", alpha = 0.8) +
    geom_hline(yintercept = 0.25, linetype = "dashed", color = "red") +
    coord_flip() +
    xlab('Country') +
    ylab('Fraction') +
    ggtitle(paste('Insufficient Fraction in', region)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  print(p)
}

Country with missing data

country_left=anti_join(country, calciumNew, by=c("code"="iso3")) 

datatable(
data = country_left,
caption = "Table 3: Birth rate in 2021",
filter = "top",
style = "bootstrap4"
)

Antenatal care(ANC) (\(b_i\))

Compare to choose the suitable ANC data

Lastest UNICEF data –up to 2021, 24 NA, 192 total

path="data/fusion_GLOBAL_DATAFLOW_UNICEF_1.0_.MNCH_ANC4...csv"
ancf=read.csv(file=path) %>% 
  janitor::clean_names() 
anc <- ancf %>%
  group_by(ref_area_geographic_area) %>% 
  arrange(desc(time_period_time_period)) %>% 
  slice_head(n = 5) %>% 
  summarise(
    mean_anc = mean(obs_value_observation_value, na.rm = TRUE),
    sd_anc = sd(obs_value_observation_value, na.rm = TRUE)
  ) %>%
  ungroup()  %>% 
  separate(ref_area_geographic_area, into = c("code", "country"), sep = ": ") %>% 
  mutate(code = str_trim(code),
         country_name = str_trim(country)) %>% 
  select(code,country, mean_anc, sd_anc)

datatable(
data = anc,
caption = "Table 3: ANC",
filter = "top",
style = "bootstrap4"
)
anclogit=ancf %>% 
   separate(ref_area_geographic_area, into = c("code", "country"), sep = ": ") %>% 
  mutate(code = str_trim(code),
         country_name = str_trim(country)) %>%  
  left_join(country) %>%
  mutate( logit_obse=log((obs_value_observation_value*0.01)/(1-obs_value_observation_value*0.01))) %>%
  filter(!is.infinite(logit_obse)) %>% 
  group_by(region) %>% 
  summarise(logit_mean=mean(logit_obse),
            logit_var=sd(logit_obse)) %>% 
  mutate(mean_anc_region=100/(1+exp(-logit_mean)),
          derivative_inverse_logit = exp(-logit_mean) / (1 + exp(-logit_mean))^2,
         inverse_logit_sd= (derivative_inverse_logit^2) * logit_var,
         original_sd = inverse_logit_sd / 0.01
         )
anc=calciummerge %>% 
  left_join(anc, by=c("iso3"="code")) %>% 
  left_join(anclogit, by = "region") %>%
  mutate(sd_anc = ifelse(is.na(sd_anc), original_sd, sd_anc),
         mean_anc=ifelse(is.na(mean_anc), mean_anc_region,mean_anc),
    lower_anc = mean_anc - sd_anc, 
    upper_anc = mean_anc + sd_anc ) 

datatable(
data = anc,
caption = "Table 3: ANC",
filter = "top",
style = "bootstrap4"
)

Fertility (\(c_i\))

birthlower=read_excel("data/UN_PPP2022_Output_Births.xlsx",sheet=1, skip = 14)
birthmean=read_excel("data/UN_PPP2022_Output_Births.xlsx",sheet=3, skip = 14)
birthupper=read_excel("data/UN_PPP2022_Output_Births.xlsx",sheet=5, skip = 14)

colnames(birthlower) <- birthlower[1,]
colnames(birthmean) <- birthmean[1,]
colnames(birthupper) <- birthupper[1,]

birthlower <- birthlower[-1, ] %>% 
  janitor::clean_names()
birthmean <- birthmean[-1, ] %>% 
  janitor::clean_names()
birthupper <- birthupper[-1, ] %>% 
  janitor::clean_names()

birthnum= birthlower %>% 
  rbind(birthmean) %>% 
  rbind(birthupper)
birthnum2024= birthnum %>% 
  select(variant, iso3_alpha_code,region_subregion_country_or_area, x2024) %>% 
  drop_na() %>% 
  pivot_wider(names_from = variant, values_from = x2024) %>% 
  janitor::clean_names() %>% 
  rename(iso3=iso3_alpha_code, country=region_subregion_country_or_area, birthmean=median_pi, birthupper=upper_95_pi, birthlower=lower_95_pi)  

datatable(
data = birthnum2024,
caption = "Table 4-1: Birth number in 2024",
filter = "top",
style = "bootstrap4"
)
birthrate=read_excel("data/UN_PPP2022_Input_TFR.xlsx", sheet=3, skip = 14)
colnames(birthrate) <- birthrate[1,]


birthrate <- birthrate[-1, ] %>% 
  janitor::clean_names()
birthrate2024= birthrate %>% 
  select(iso3_alpha_code,region_subregion_country_or_area, x2024) %>% 
  drop_na() %>% 
  rename(iso3=iso3_alpha_code, country=region_subregion_country_or_area, rate_2024=x2024) %>% 
  left_join(pop, by=c("iso3"="iso3_alpha_code")) %>% 
  mutate(across("15-19":"45-49", as.numeric)) %>%
  rowwise() %>% 
  mutate(total = sum(c_across("15-19":"45-49")),
         rate_2024=as.numeric(rate_2024)) 

datatable(
data = birthrate2024,
caption = "Table 4-2: Birth rate in 2024",
filter = "top",
style = "bootstrap4"
)

Preterm birth rate(\(d_i\))

#pret <- extract_tables("/Users/cuihening/Desktop/harvard/nutrition/data/mmc1.pdf", pages = c(39, 40))
load(file="data/preterm.RData")
pret_bir=as.data.frame(pret[[1]]) %>% 
  rbind(., as.data.frame(pret[[2]]))

colnames(pret_bir) <- c("country", "PTB rate", "Lower UR", "Upper UR", "Source of PTB rate", "Estimated live births", "Estimated proportion of global live births", "Estimated number of preterm birth", "Lower UR (2%)", "Upper UR (98%)", "proportion of global preterm birth")

pret_bir= pret_bir[-c(1, 2), ] %>% 
  janitor::clean_names() %>% 
  rename(enpb=estimated_number_of_preterm_birth) %>% 
  select(c("country", "ptb_rate", lower_ur, upper_ur)) %>% 
  mutate(year=2014,
         country = if_else(country == "United States of America***", "United States", country)) %>% 
  left_join(birthnum2024) %>% 
  mutate(across(ptb_rate:upper_ur, as.numeric),
         birthmean=as.numeric(birthmean),
    across(ptb_rate:upper_ur, ~ . * birthmean, .names = "pretnum_{.col}"))


datatable(
data = pret_bir,
caption = "Table 5: Preterm birth rate",
filter = "top",
style = "bootstrap4"
) 

Number of women experiencing preeclampsia in 2020(\(e_i\))

preecdf=read.csv("data/IHME-GBD_2019_DATA-ed1f166b-1/IHME-GBD_2019_DATA-ed1f166b-1.csv") %>% 
  janitor::clean_names()
birth_preec=read.csv("data/IHME_GBD_2019_FERTILITY_1950_2019_LB_Y2020M08D05.CSV") %>% 
  janitor::clean_names() %>% 
  filter(year_id==2019)%>% 
  rename(birth_num=val) %>% 
  select(location_id, location_name, age_group_id, birth_num)
preec=preecdf %>% 
  mutate(country = if_else(location_name == "United States of America", "United States", location_name,)) %>% 
  select("country", location_id, age_id, age_name, val, upper, lower) %>%
  left_join(birth_preec, by=c("location_id"="location_id", "age_id"="age_group_id")) %>% 
  filter(age_id!=22) %>% 
  mutate(
    birth_num=as.numeric(birth_num),
    across(val:lower, ~ . / birth_num, .names = "precrate_{.col}")) %>% 
  group_by(country) %>% 
  summarize(preec_rate_mean=mean(precrate_val),
            preec_rate_upp=mean(precrate_upper),
            preec_rate_low=mean(precrate_lower)) %>% 
  select(country, preec_rate_mean,preec_rate_upp, preec_rate_low)



datatable(
data = preec,
caption = "Table 6: Prevalence of preeclampsia in 2020",
filter = "top",
style = "bootstrap4"
)   

Some EDA

world <- ne_countries(scale = "medium", returnclass = "sf")
preec_merged <- merge(world, preec, by.x = "name", by.y = "country")
ggplot(preec_merged) +
  geom_sf(aes(fill = preec_rate_mean)) +
  scale_fill_gradient(low = "white", high = "red") +
  theme_minimal() +
  labs(fill = "Total preeclampsia",
       title = "Total preeclampsia by Country",
       caption = "Global Burden of Disease Study 2019 (GBD 2019) Results")

RR preterm & PREEC

Create a function to create the gamma distribution:

gammapar <- function(tgt) {
  tgt <- as.numeric(tgt)
  mn <- tgt[1]; cir <- (tgt[3]-tgt[2])
  xopt <- function(b,mn,cir) {
    cir2 <- qgamma(c(1,39)/40,mn*b,b); 
    cir2 <- cir2[2]-cir2[1]
    (cir2-cir)^2 }
  zz <- optimize(xopt,c(0.1,100000),mn=mn,cir=cir)$minimum
  c(zz*mn,zz) }
rrpret= gammapar(c(0.76,0.6,0.97))
qgamma(c(20,1,39)/40,rrpret[1],rrpret[2])
## [1] 0.7560854 0.5861210 0.9561210
rrpreec= gammapar(c(0.45,0.31, 0.65))
qgamma(c(20,1,39)/40, rrpreec[1],rrpreec[2])
## [1] 0.4444034 0.2958948 0.6358949

Still birth(just import)

#stillb = extract_tables("/Users/cuihening/Desktop/harvard/nutrition/data/stillbirth.pdf", pages = c(38, 39,40,41))

Joint the table together

For region alignment in replacing the missing data in the anc: South AsiaSoutheast Asia Latin America & Caribbean, North AmericaAmerica Sub-Saharan Africa, Middle East & North AfricaAfrica Europe & Central AsiaEastern Mediterranean East Asia & PacifcAsia

calcium=anc %>% 
  left_join(pret_bir, by=c("iso3"="iso3")) %>% 
  select(-year, -economy.x, -country.x) %>% 
  rename(country=country.y) %>% 
  mutate(country = case_when(
    country == "Laos" ~ "Lao People's Democratic Republic",
    country == "Bosnia & Herzegovina" ~ "Bosnia and Herzegovina",
    TRUE ~ country
  )) %>% 
  left_join(preec)

draws <- randomLHS(n=nrow(calcium),k=1)
calcium$rrpret=rgamma(draws,rrpret[1],rrpret[2])
calcium$rrpreec=rgamma(draws,rrpreec[1],rrpreec[2])


datatable(
data = calcium,
caption = "Table 7: Insufficient calcium intake worldwide",
filter = "top",
style = "bootstrap4"
)

Supplementary

The new ANC4 data–also updated until the 2021, 15 NA, 162 total