Library and settings
library(tidyverse)
library(readxl)
library(nutriR)
library(ggplot2)
#library(tabulizer)
library(rnaturalearth)
library(sf)
library(DT)
library(lhs)
library(stringr)
country=read_excel("data/CLASS.xlsx", sheet=1) %>% janitor::clean_names()
code=read_excel("data/CLASS.xlsx", sheet=2)
datatable(
data = country,
caption = "Table 1: country code",
filter = "top",
style = "bootstrap4"
)
calcium_preview=read.csv(file="data/v36_cnty.csv") %>%
filter(age>=17.5 & age<=47.5 & female=="1")
Calculate the shape for each region
calcium_distr=get_dists(nutrients="Calcium" , sexes="F", ages=15:50 ) %>%
drop_na() %>%
left_join(country, by=c("iso3"="code")) %>%
group_by(region, age_group) %>%
summarize(mean_shape=mean(g_shape))
calcium_world=calcium_distr %>%
group_by(age_group) %>%
summarize(means=mean(mean_shape))
calculate the proportion of people take insufficient calcuim
calciumNew= calcium_preview %>%
filter(urban=="999" & edu=="999" & year=="2018") %>%
rowwise() %>%
left_join(country, by=c("iso3"="code")) %>%
mutate(
age_group = case_when(
age == 17.5 ~ "15-19",
age == 22.5 ~ "20-24",
age == 27.5 ~ "25-29",
age == 32.5 ~ "30-34",
age == 37.5 ~ "35-39",
age == 42.5 ~ "40-44",
age == 47.5 ~ "45-49",
TRUE ~ as.character(age)
)
) %>%
left_join(calcium_distr, by=c("region"="region", "age_group"="age_group") ) %>%
mutate(
mean_shape = case_when(
is.na(mean_shape) & age_group == "15-19" ~ calcium_world$means[calcium_world$age_group == "15-19"],
is.na(mean_shape) & age_group == "20-24" ~ calcium_world$means[calcium_world$age_group == "20-24"],
is.na(mean_shape) & age_group == "25-29" ~ calcium_world$means[calcium_world$age_group == "25-29"],
is.na(mean_shape) & age_group == "30-34" ~ calcium_world$means[calcium_world$age_group == "30-34"],
is.na(mean_shape) & age_group == "35-39" ~ calcium_world$means[calcium_world$age_group == "35-39"],
is.na(mean_shape) & age_group == "40-44" ~ calcium_world$means[calcium_world$age_group == "40-44"],
is.na(mean_shape) & age_group == "45-49" ~ calcium_world$means[calcium_world$age_group == "45-49"],
TRUE ~ mean_shape
),
grate=mean_shape/median
) %>%
rowwise() %>%
mutate(pro_low=pgamma(800, shape = mean_shape, rate = grate)
)
calciumNew$sev_result = NA # Create a new column to store the SEV results
for (i in seq_len(nrow(calciumNew))) {
calciumNew$sev_result[i] = sev(
ear=800,
cv=0.1,
shape=calciumNew$mean_shape[i],
rate=calciumNew$grate[i]
)
}
datatable(
data = calciumNew,
caption = "Table 2-1: Calcium data",
filter = "top",
style = "bootstrap4"
)
The age population distribution
file_path = "data/WPP2022_POP_F02_3_POPULATION_5-YEAR_AGE_GROUPS_FEMALE.xlsx"
pop = read_excel(file_path, skip=15)
colnames(pop) = pop[1,]
pop = pop[-1, ] %>%
janitor::clean_names() %>%
filter(year=="2021") %>%
rename_with(~ gsub("x(\\d+)_", "\\1-", .), starts_with("x"))
Calculate the weight for each age group
pop_data = pop %>%
select(iso3_alpha_code, "0-4":x100) %>%
mutate(across("0-4":x100, as.numeric)) %>%
drop_na() %>%
rowwise() %>%
mutate(total = sum(c_across("15-19":"45-49")),
across("15-19":"45-49", ~ . / total, .names = "proportion_{.col}")) %>%
select(iso3_alpha_code, starts_with("proportion_"))
datatable(
data = pop_data,
caption = "Table 2-2: Population distribution",
filter = "top",
style = "bootstrap4"
)
Turn the table from wide to long for joint
pop_long = pop_data %>%
pivot_longer(cols = starts_with("proportion_"), names_to = "age_group", values_to = "proportion", names_prefix = "proportion_")
Joint the calcium with the population data to calculate
calciummerge=calciumNew %>%
left_join(pop_long, by=c("iso3"="iso3_alpha_code", "age_group"="age_group")) %>%
mutate(frac_sev=sev_result*proportion*0.01,
frac_own=pro_low*proportion,) %>%
group_by(iso3, economy) %>%
summarize(tfrac_sev=sum(frac_sev),
tfrac_own=sum(frac_own)) %>%
mutate(
low_intake_sev= case_when(
tfrac_sev> 0.25 ~ 1,
TRUE ~ 0
),
low_intake_own= case_when(
tfrac_own> 0.25 ~ 1,
TRUE ~ 0
))
datatable(
data = calciummerge,
caption = "Table 2-3: Insufficient calcium intake",
filter = "top",
style = "bootstrap4"
)
EDA for region
calciummerge = calciummerge %>%
left_join(country, by=c("iso3"="code"))
regions = unique(calciummerge$region)
For the sev function calculation
for (region in regions) {
region_data = calciummerge %>%
filter(region == !!region)
p <- ggplot(region_data, aes(x = reorder(economy.x, -tfrac_sev), y = tfrac_sev, fill = income_group)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_hline(yintercept = 0.25, linetype = "dashed", color = "red") +
coord_flip() +
xlab('Country') +
ylab('Fraction') +
ggtitle(paste('Insufficient Fraction in', region)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}
For my own calculation
for (region in regions) {
region_data = calciummerge %>%
filter(region == !!region)
p <- ggplot(region_data, aes(x = reorder(economy.x, -tfrac_own), y = tfrac_own, fill = income_group)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_hline(yintercept = 0.25, linetype = "dashed", color = "red") +
coord_flip() +
xlab('Country') +
ylab('Fraction') +
ggtitle(paste('Insufficient Fraction in', region)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}
country_left=anti_join(country, calciumNew, by=c("code"="iso3"))
datatable(
data = country_left,
caption = "Table 3: Birth rate in 2021",
filter = "top",
style = "bootstrap4"
)
Compare to choose the suitable ANC data
Lastest UNICEF data –up to 2021, 24 NA, 192 total
path="data/fusion_GLOBAL_DATAFLOW_UNICEF_1.0_.MNCH_ANC4...csv"
ancf=read.csv(file=path) %>%
janitor::clean_names()
anc <- ancf %>%
group_by(ref_area_geographic_area) %>%
arrange(desc(time_period_time_period)) %>%
slice_head(n = 5) %>%
summarise(
mean_anc = mean(obs_value_observation_value, na.rm = TRUE),
sd_anc = sd(obs_value_observation_value, na.rm = TRUE)
) %>%
ungroup() %>%
separate(ref_area_geographic_area, into = c("code", "country"), sep = ": ") %>%
mutate(code = str_trim(code),
country_name = str_trim(country)) %>%
select(code,country, mean_anc, sd_anc)
datatable(
data = anc,
caption = "Table 3: ANC",
filter = "top",
style = "bootstrap4"
)
anclogit=ancf %>%
separate(ref_area_geographic_area, into = c("code", "country"), sep = ": ") %>%
mutate(code = str_trim(code),
country_name = str_trim(country)) %>%
left_join(country) %>%
mutate( logit_obse=log((obs_value_observation_value*0.01)/(1-obs_value_observation_value*0.01))) %>%
filter(!is.infinite(logit_obse)) %>%
group_by(region) %>%
summarise(logit_mean=mean(logit_obse),
logit_var=sd(logit_obse)) %>%
mutate(mean_anc_region=100/(1+exp(-logit_mean)),
derivative_inverse_logit = exp(-logit_mean) / (1 + exp(-logit_mean))^2,
inverse_logit_sd= (derivative_inverse_logit^2) * logit_var,
original_sd = inverse_logit_sd / 0.01
)
anc=calciummerge %>%
left_join(anc, by=c("iso3"="code")) %>%
left_join(anclogit, by = "region") %>%
mutate(sd_anc = ifelse(is.na(sd_anc), original_sd, sd_anc),
mean_anc=ifelse(is.na(mean_anc), mean_anc_region,mean_anc),
lower_anc = mean_anc - sd_anc,
upper_anc = mean_anc + sd_anc )
datatable(
data = anc,
caption = "Table 3: ANC",
filter = "top",
style = "bootstrap4"
)
birthlower=read_excel("data/UN_PPP2022_Output_Births.xlsx",sheet=1, skip = 14)
birthmean=read_excel("data/UN_PPP2022_Output_Births.xlsx",sheet=3, skip = 14)
birthupper=read_excel("data/UN_PPP2022_Output_Births.xlsx",sheet=5, skip = 14)
colnames(birthlower) <- birthlower[1,]
colnames(birthmean) <- birthmean[1,]
colnames(birthupper) <- birthupper[1,]
birthlower <- birthlower[-1, ] %>%
janitor::clean_names()
birthmean <- birthmean[-1, ] %>%
janitor::clean_names()
birthupper <- birthupper[-1, ] %>%
janitor::clean_names()
birthnum= birthlower %>%
rbind(birthmean) %>%
rbind(birthupper)
birthnum2024= birthnum %>%
select(variant, iso3_alpha_code,region_subregion_country_or_area, x2024) %>%
drop_na() %>%
pivot_wider(names_from = variant, values_from = x2024) %>%
janitor::clean_names() %>%
rename(iso3=iso3_alpha_code, country=region_subregion_country_or_area, birthmean=median_pi, birthupper=upper_95_pi, birthlower=lower_95_pi)
datatable(
data = birthnum2024,
caption = "Table 4-1: Birth number in 2024",
filter = "top",
style = "bootstrap4"
)
birthrate=read_excel("data/UN_PPP2022_Input_TFR.xlsx", sheet=3, skip = 14)
colnames(birthrate) <- birthrate[1,]
birthrate <- birthrate[-1, ] %>%
janitor::clean_names()
birthrate2024= birthrate %>%
select(iso3_alpha_code,region_subregion_country_or_area, x2024) %>%
drop_na() %>%
rename(iso3=iso3_alpha_code, country=region_subregion_country_or_area, rate_2024=x2024) %>%
left_join(pop, by=c("iso3"="iso3_alpha_code")) %>%
mutate(across("15-19":"45-49", as.numeric)) %>%
rowwise() %>%
mutate(total = sum(c_across("15-19":"45-49")),
rate_2024=as.numeric(rate_2024))
datatable(
data = birthrate2024,
caption = "Table 4-2: Birth rate in 2024",
filter = "top",
style = "bootstrap4"
)
#pret <- extract_tables("/Users/cuihening/Desktop/harvard/nutrition/data/mmc1.pdf", pages = c(39, 40))
load(file="data/preterm.RData")
pret_bir=as.data.frame(pret[[1]]) %>%
rbind(., as.data.frame(pret[[2]]))
colnames(pret_bir) <- c("country", "PTB rate", "Lower UR", "Upper UR", "Source of PTB rate", "Estimated live births", "Estimated proportion of global live births", "Estimated number of preterm birth", "Lower UR (2%)", "Upper UR (98%)", "proportion of global preterm birth")
pret_bir= pret_bir[-c(1, 2), ] %>%
janitor::clean_names() %>%
rename(enpb=estimated_number_of_preterm_birth) %>%
select(c("country", "ptb_rate", lower_ur, upper_ur)) %>%
mutate(year=2014,
country = if_else(country == "United States of America***", "United States", country)) %>%
left_join(birthnum2024) %>%
mutate(across(ptb_rate:upper_ur, as.numeric),
birthmean=as.numeric(birthmean),
across(ptb_rate:upper_ur, ~ . * birthmean, .names = "pretnum_{.col}"))
datatable(
data = pret_bir,
caption = "Table 5: Preterm birth rate",
filter = "top",
style = "bootstrap4"
)
preecdf=read.csv("data/IHME-GBD_2019_DATA-ed1f166b-1/IHME-GBD_2019_DATA-ed1f166b-1.csv") %>%
janitor::clean_names()
birth_preec=read.csv("data/IHME_GBD_2019_FERTILITY_1950_2019_LB_Y2020M08D05.CSV") %>%
janitor::clean_names() %>%
filter(year_id==2019)%>%
rename(birth_num=val) %>%
select(location_id, location_name, age_group_id, birth_num)
preec=preecdf %>%
mutate(country = if_else(location_name == "United States of America", "United States", location_name,)) %>%
select("country", location_id, age_id, age_name, val, upper, lower) %>%
left_join(birth_preec, by=c("location_id"="location_id", "age_id"="age_group_id")) %>%
filter(age_id!=22) %>%
mutate(
birth_num=as.numeric(birth_num),
across(val:lower, ~ . / birth_num, .names = "precrate_{.col}")) %>%
group_by(country) %>%
summarize(preec_rate_mean=mean(precrate_val),
preec_rate_upp=mean(precrate_upper),
preec_rate_low=mean(precrate_lower)) %>%
select(country, preec_rate_mean,preec_rate_upp, preec_rate_low)
datatable(
data = preec,
caption = "Table 6: Prevalence of preeclampsia in 2020",
filter = "top",
style = "bootstrap4"
)
Some EDA
world <- ne_countries(scale = "medium", returnclass = "sf")
preec_merged <- merge(world, preec, by.x = "name", by.y = "country")
ggplot(preec_merged) +
geom_sf(aes(fill = preec_rate_mean)) +
scale_fill_gradient(low = "white", high = "red") +
theme_minimal() +
labs(fill = "Total preeclampsia",
title = "Total preeclampsia by Country",
caption = "Global Burden of Disease Study 2019 (GBD 2019) Results")
Create a function to create the gamma distribution:
gammapar <- function(tgt) {
tgt <- as.numeric(tgt)
mn <- tgt[1]; cir <- (tgt[3]-tgt[2])
xopt <- function(b,mn,cir) {
cir2 <- qgamma(c(1,39)/40,mn*b,b);
cir2 <- cir2[2]-cir2[1]
(cir2-cir)^2 }
zz <- optimize(xopt,c(0.1,100000),mn=mn,cir=cir)$minimum
c(zz*mn,zz) }
rrpret= gammapar(c(0.76,0.6,0.97))
qgamma(c(20,1,39)/40,rrpret[1],rrpret[2])
## [1] 0.7560854 0.5861210 0.9561210
rrpreec= gammapar(c(0.45,0.31, 0.65))
qgamma(c(20,1,39)/40, rrpreec[1],rrpreec[2])
## [1] 0.4444034 0.2958948 0.6358949
#stillb = extract_tables("/Users/cuihening/Desktop/harvard/nutrition/data/stillbirth.pdf", pages = c(38, 39,40,41))
For region alignment in replacing the missing data in the anc:
South Asia–Southeast Asia
Latin America & Caribbean, North America
–America Sub-Saharan Africa,
Middle East & North Africa –Africa
Europe & Central Asia
–Eastern Mediterranean East Asia & Pacifc
– Asia
calcium=anc %>%
left_join(pret_bir, by=c("iso3"="iso3")) %>%
select(-year, -economy.x, -country.x) %>%
rename(country=country.y) %>%
mutate(country = case_when(
country == "Laos" ~ "Lao People's Democratic Republic",
country == "Bosnia & Herzegovina" ~ "Bosnia and Herzegovina",
TRUE ~ country
)) %>%
left_join(preec)
draws <- randomLHS(n=nrow(calcium),k=1)
calcium$rrpret=rgamma(draws,rrpret[1],rrpret[2])
calcium$rrpreec=rgamma(draws,rrpreec[1],rrpreec[2])
datatable(
data = calcium,
caption = "Table 7: Insufficient calcium intake worldwide",
filter = "top",
style = "bootstrap4"
)
The new ANC4 data–also updated until the 2021, 15 NA, 162 total