To explore descriptive statistics on exposure, outcome and sociodemographics including data availability.
A data base compiling fetal ultrasound and birth anthropometric outcomes, corresponding z-scores, sociodemographic information on pregnant and their children, and corresponding exposure data on 4 air pollutants by gestational week is used.
Data flow chart
Raw datasets
Birth dataset English Language: chamnha_healthdata Cases: n=620,866; Individual ids: 620,866; 2014-2019; Variables: ID, date of birth, gestational age, demographics
Birth dataset Swedish language: db Cases: n=639,236; Individual ids: 629,989; 2014-2019; Variables: ID, date of birth, gestational age, demographics
Ultrasound dataset: ultraljud_foster; Individual ids: 569038; 2014-2019; Variables: ID, date of ultrasound (several per case), number of fetuses, ultrasound measures (crl, bpd etc.)
KUB ultrasound dataset: kub_undersokningar; Cases: n=213,250; Individual ids: 205,390; 2014-2019; Variables: ID, date of birth, ultrasound measures
Exposure dataset: Individual ids: n=500,958; exposure data provided by spatiotemporal model
Code
# Review availability of cases in datasets used in this project#access datasets: Birth dataset English languagesetwd("P:/EnvEpi-Projekt/CHAMNHA/6_ultrasound_nikolaus")load("1_originaldata/chamnha_healthdata.Rda")#names(ch_ht)#num_unique_ids <- length(unique(ch_ht$Graviditet))#num_unique_ids#access datasets: Birth dataset Swedish languagesetwd("P:/EnvEpi-Projekt/CHAMNHA")load("1_paper_temp_ptb/02_datasets/health_data.Rda")#names(db)#num_unique_ids <- length(unique(db$GraviditetID))#num_unique_ids#access datasets: Ultrasound datasetsetwd("P:/EnvEpi-Projekt/CHAMNHA/6_ultrasound_nikolaus")data_us <-as.data.table(read.csv(file ="1_originaldata/ultraljud_foster.csv", sep =";")) #names(data_us)#num_unique_ids <- length(unique(data_us$graviditetid))#num_unique_ids #access datasets: KUB datasetsetwd("P:/EnvEpi-Projekt/CHAMNHA/6_ultrasound_nikolaus")load("2_processeddata/conceptiondate_all/conceptiondate_kub.Rda")#names(data1)#num_unique_ids <- length(unique(data1$id))#num_unique_ids
Identical cases matrix across datasets
Overlap matrix of identical cases between the four datasets Birth dataset English Language, Birth dataset Swedish language, Ultrasound dataset, KUB ultrasound dataset
Code
#Rename id variable, remove other cases, deduplicate cases bir_en <- ch_ht %>%rename(id = Graviditet) %>%select(id) %>%distinct(id, .keep_all =TRUE)bir_sw <- db %>%rename(id = GraviditetID) %>%select(id) %>%distinct(id, .keep_all =TRUE)ultra <- data_us %>%rename(id = graviditetid) %>%select(id) %>%distinct(id, .keep_all =TRUE)kub <- data1 %>%select(id) %>%distinct(id, .keep_all =TRUE)rm(data1, db, ch_ht, data_us)#Make a matrix of identical cases in the four datasets# Count the total number of cases in each datasetdataset_counts <-sapply(list(bir_en, bir_sw, ultra, kub), nrow)names(dataset_counts) <-c("bir_en", "bir_sw", "ultra", "kub")# Calculate the overlap matrixdatasets <-list(bir_en$id, bir_sw$id, ultra$id, kub$id)overlap_matrix <-matrix(0, nrow =4, ncol =4, dimnames =list(names(dataset_counts), names(dataset_counts)))for (i in1:4) {for (j in1:4) { overlap_matrix[i, j] <-length(intersect(datasets[[i]], datasets[[j]])) }}# Add dataset counts as row and column labelsrownames(overlap_matrix) <-paste0(names(dataset_counts), " (", dataset_counts, ")")colnames(overlap_matrix) <-paste0(names(dataset_counts), " (", dataset_counts, ")")print(overlap_matrix)
registered in the Swedish Pregnancy Register 2014-2019: birth_swe/db (n=629989) and birth_eng/ch_ht (n=620866)
??any ultrasound outcome during pregnancy available: ultrasound dataset (n=569038) and kub (n=205390) ??
date of conception via: 1) kub exam; 2) ultrasound exam; 3) birth dataset
residential address available for matching exposure with individual pregnant women (n=500,XXX)
?birth weight z-score?: birth weight z-score calculated by gestational age-specific birth weight provided by birth_eng/birth_swe datasets
Exclusion criteria
??no outcome data??
??no co-variates??
Flow chart
Data transformation
The dataset is modified for presentation purposes and variables. For example, variables on month, year and season of conception, birth weight z-score tertiles are defined.
To Do: Add data from Swedish birth dataset (sociodemographics need to be translated and merged)
Code
rm(list =ls())# set wd for manual work (even though already specified in YAML code)setwd("P:/EnvEpi-Projekt/CHAMNHA/6_ultrasound_nikolaus")load("2_processeddata/conceptiondate_all/conceptiondate_merged_ustype_gestage_maxultrasoundmeasurement_birthoutcomes_USzscores_EFWzscores_BWzscores_whozscores_sorted_outcavail_socdem_wide_expos.Rda")#rename databasedata <- data_exposurerm(data_exposure)# remove data without exposure and without zBWdata <- data %>%filter(!is.na(pm25_trim_first) &!is.na(pm10_trim_first) &!is.na(no2_trim_first) &!is.na(o3_trim_first)) %>%filter(!is.na(zBW))#rename variablesdata <- data %>%mutate(mat_age_cat = dplyr::recode(mat_age_cat, "<=24"="≤24", ">=35"="≥35"),fstart = dplyr::recode(fstart, "Elektiv sectio"="Cesarean section", "Induktion"="Induction", "Spontan"="Spontaneous") )#define season of conception#month of conceptiondata <-as.data.table(data)data[, month_conception :=as.numeric(format(conceptiondate, "%m"))]#year of conceptiondata[, year_conception :=as.numeric(format(conceptiondate, "%y"))]#year groupdata <- data %>%mutate(year_group =case_when( year_conception %in%c(13, 14) ~"2013/14", year_conception %in%c(15, 16) ~"2015/16", year_conception %in%c(17, 18, 19) ~"2017/18/19",TRUE~as.character(year_conception) ))#define season of conceptiondata <- data %>%mutate(season_conception =factor(case_when( month_conception %in%c(12, 1, 2) ~"Winter", month_conception %in%c(3, 4, 5) ~"Spring", month_conception %in%c(6, 7, 8) ~"Summer", month_conception %in%c(9, 10, 11) ~"Fall",TRUE~NA_character_ ), levels =c("Winter", "Spring", "Summer", "Fall")))# Create a new variable birth_weight_tertiles and birth weightdata <- data %>%mutate(zBW_tertiles =cut( zBW,breaks =quantile(zBW, probs =c(0, 1/3, 2/3, 1), na.rm =TRUE),labels =c("Low tertile zBW", "Mid tertile zBW", "High tertile zBW"),include.lowest =TRUE)) %>%mutate(BW_tertiles =cut( birth_weight,breaks =quantile(birth_weight, probs =c(0, 1/3, 2/3, 1), na.rm =TRUE),labels =c("Low tertile BW", "Mid tertile BW", "High tertile BW"),include.lowest =TRUE))# Create tertiles for each pollutantdata <- data %>%mutate(pm25_tertiles =cut( pm25_trim_first,breaks =quantile(pm25_trim_first, probs =c(0, 1/3, 2/3, 1), na.rm =TRUE),labels =c("Low tertile", "Mid tertile", "High tertile"),include.lowest =TRUE) ) %>%mutate(pm10_tertiles =cut( pm10_trim_first,breaks =quantile(pm10_trim_first, probs =c(0, 1/3, 2/3, 1), na.rm =TRUE),labels =c("Low tertile", "Mid tertile", "High tertile"),include.lowest =TRUE) ) %>%mutate(no2_tertiles =cut( no2_trim_first,breaks =quantile(no2_trim_first, probs =c(0, 1/3, 2/3, 1), na.rm =TRUE),labels =c("Low tertile", "Mid tertile", "High tertile"),include.lowest =TRUE) ) %>%mutate(o3_tertiles =cut( o3_trim_first,breaks =quantile(o3_trim_first, probs =c(0, 1/3, 2/3, 1), na.rm =TRUE),labels =c("Low tertile", "Mid tertile", "High tertile"),include.lowest =TRUE) )# Clean specific variablesdata <- data %>%mutate(meduc =if_else(meduc =="Don´t know", NA_character_, meduc)) %>%mutate(smoke_preg =factor(smoke_preg,levels =c("Yes", "No"),labels =c("Smoker", "Non-smoker")))#clean variable zBHC (n=61)data$zBHC[is.infinite(data$zBHC)] <-NA
Exposure descriptives
Mean and tertile exposure levels
Mean and tertile air pollution levels during first trimester in the study population.
Code
# Mean pollution levelstrialdata <- data |>select(pm25_trim_first, pm10_trim_first, no2_trim_first, o3_trim_first)mean_exposure_table <-tbl_summary( trialdata,label =list(pm25_trim_first ~" PM2.5 (µg/m3)", pm10_trim_first ~"PM10 (µg/m3)", no2_trim_first ~"NO2 (µg/m3)", o3_trim_first ~"O3 (µg/m3)"),statistic =list(all_continuous() ~"{mean} ({sd})"), digits =all_continuous() ~1,missing ="no") |>modify_header(label ="**Characteristics**") |>bold_labels() |>modify_caption("Table 2: Ambient air pollution levels during first trimester in the study population")# Tertile pollution levels# Summary table for PM10trialdata <- data |>select(pm25_trim_first, pm10_trim_first, no2_trim_first, o3_trim_first, pm25_tertiles, pm10_tertiles, no2_tertiles, o3_tertiles)summary_table_pm10 <-tbl_summary( trialdata,by = pm10_tertiles, include =c("pm10_trim_first"),label =list(pm10_trim_first ~"PM10 (µg/m3)"), statistic =list(all_continuous() ~"{mean} ({sd})" ),digits =all_continuous() ~1,missing ="no")# Summary table for NO2summary_table_no2 <-tbl_summary( trialdata,by = no2_tertiles, include =c("no2_trim_first"),label =list(no2_trim_first ~"NO2 (µg/m3)"), statistic =list(all_continuous() ~"{mean} ({sd})" ),digits =all_continuous() ~1,missing ="no")# Summary table for O3summary_table_o3 <-tbl_summary( trialdata,by = o3_tertiles, include =c("o3_trim_first"),label =list(o3_trim_first ~"O3 (µg/m3)"), statistic =list(all_continuous() ~"{mean} ({sd})" ),digits =all_continuous() ~1,missing ="no")# Summary table for PM25summary_table_pm25 <-tbl_summary( trialdata,by = pm25_tertiles, include =c("pm25_trim_first"),label =list(pm25_trim_first ~"PM2.5 (µg/m3)"), statistic =list(all_continuous() ~"{mean} ({sd})" ),digits =all_continuous() ~1,missing ="no")
Table 2: Ambient air pollution levels during first trimester in the study population
Characteristics
N = 497,4021
PM2.5 (µg/m3)
7.6 (2.0)
PM10 (µg/m3)
15.2 (4.1)
NO2 (µg/m3)
14.3 (8.0)
O3 (µg/m3)
53.3 (9.5)
1Mean (SD)
Characteristic
Low tertile
N = 165,8011
Mid tertile
N = 165,8001
High tertile
N = 165,8011
PM2.5 (µg/m3)
5.4 (1.0)
7.6 (0.5)
9.7 (1.1)
1Mean (SD)
Characteristic
Low tertile
N = 165,8011
Mid tertile
N = 165,8001
High tertile
N = 165,8011
PM10 (µg/m3)
11.2 (1.8)
14.7 (0.8)
19.8 (2.9)
1Mean (SD)
Characteristic
Low tertile
N = 165,8011
Mid tertile
N = 165,8001
High tertile
N = 165,8011
NO2 (µg/m3)
6.2 (2.7)
13.3 (1.6)
23.3 (5.9)
1Mean (SD)
Characteristic
Low tertile
N = 165,8011
Mid tertile
N = 165,8001
High tertile
N = 165,8011
O3 (µg/m3)
43.3 (3.2)
52.2 (3.1)
64.5 (4.7)
1Mean (SD)
Exposure in relation to WHO/EU guidelines
WHO/EU guidelines for individual air pollutants and proportion of population exposed to levels exceeding guuidelines during first trimester.
Code
thresholds <-tibble(Pollutant =c("PM2.5", "PM10", "NO2", "O3"),WHO_Threshold =c(5, 15, 10, 60),EU_Threshold =c(10, 20, 20, 100))# Calculate exceedance proportionsexceedance_data <- data %>%summarise(PM2.5_WHO =mean(pm25_trim_first > thresholds$WHO_Threshold[1], na.rm =TRUE),PM10_WHO =mean(pm10_trim_first > thresholds$WHO_Threshold[2], na.rm =TRUE),NO2_WHO =mean(no2_trim_first > thresholds$WHO_Threshold[3], na.rm =TRUE),O3_WHO =mean(o3_trim_first > thresholds$WHO_Threshold[4], na.rm =TRUE),PM2.5_EU =mean(pm25_trim_first > thresholds$EU_Threshold[1], na.rm =TRUE),PM10_EU =mean(pm10_trim_first > thresholds$EU_Threshold[2], na.rm =TRUE),NO2_EU =mean(no2_trim_first > thresholds$EU_Threshold[3], na.rm =TRUE),O3_EU =mean(o3_trim_first > thresholds$EU_Threshold[4], na.rm =TRUE) ) %>%pivot_longer(cols =everything(),names_to =c("Pollutant", "Standard"),names_sep ="_",values_to ="Proportion_Exceeding" ) %>%mutate(Proportion_Exceeding =round(Proportion_Exceeding *100, 1)) # Convert to percentage# Combine WHO and EU data into a single tablefinal_table <- thresholds %>%left_join( exceedance_data %>%filter(Standard =="WHO") %>%select(Pollutant, WHO_Exceedance = Proportion_Exceeding),by ="Pollutant" ) %>%left_join( exceedance_data %>%filter(Standard =="EU") %>%select(Pollutant, EU_Exceedance = Proportion_Exceeding),by ="Pollutant" )# Create the gt tablegt_table <- final_table %>%gt() %>%tab_header(title ="International Guidelines and Proportion of Cases Exceeding",subtitle ="WHO (2021) and EU (2024) Standards for Air Pollutants (Average in First Trimester)" ) %>%cols_label(Pollutant ="Pollutant",WHO_Threshold ="WHO Guideline (µg/m³)",WHO_Exceedance ="Proportion Exceeding WHO (%)",EU_Threshold ="EU Guideline (µg/m³)",EU_Exceedance ="Proportion Exceeding EU (%)" )
International Guidelines and Proportion of Cases Exceeding
WHO (2021) and EU (2024) Standards for Air Pollutants (Average in First Trimester)
Pollutant
WHO Guideline (µg/m³)
EU Guideline (µg/m³)
Proportion Exceeding WHO (%)
Proportion Exceeding EU (%)
PM2.5
5
10
88.9
10.1
PM10
15
20
45.7
13.4
NO2
10
20
69.7
20.2
O3
60
100
27.2
0.0
Exposure by season and year
Distribution of air pollutants by season and by year.
Code
#boxplots: air quality by season of conception#select data: air pollutants and seasondata1 <- data %>%select(pm25_wk_1, pm10_wk_1, o3_wk_1, no2_wk_1, season_conception)#reshape data into long formatdata1_long <- data1 %>%gather(variable, value, pm25_wk_1:no2_wk_1)#replace values of air pollutantsdata1_long <- data1_long %>%mutate(variable =case_when( variable =="pm25_wk_1"~"PM 2.5", variable =="pm10_wk_1"~"PM 10", variable =="no2_wk_1"~"NO2", variable =="o3_wk_1"~"O3",TRUE~ variable # Keep the original value if it doesn't match any condition )) %>%rename(Pollutant = variable)#plot combined boxplots: seasonairpoll_season <-ggplot(data1_long, aes(x = season_conception, y = value, fill = Pollutant)) +geom_boxplot() +labs(title ="Air Quality by season of conception",x ="Season of conception",y ="Concentration of air pollution (µm/m3)",fill ="Pollutant") +scale_fill_manual(values =c("blue", "red", "green", "purple")) +theme_classic() +theme(axis.text.x =element_text(size =16), axis.text.y =element_text(size =16), legend.text =element_text(size =16), legend.title =element_text(size =16), plot.title =element_text(size =20)) #boxplots: air quality by year#select data: air pollutants and seasondata2 <- data %>%select(pm25_wk_1, pm10_wk_1, o3_wk_1, no2_wk_1, year_group)#reshape data into long formatdata2_long <- data2 %>%gather(variable, value, pm25_wk_1:no2_wk_1)#replace values of air pollutantsdata2_long <- data2_long %>%mutate(variable =case_when( variable =="pm25_wk_1"~"PM 2.5", variable =="pm10_wk_1"~"PM 10", variable =="no2_wk_1"~"NO2", variable =="o3_wk_1"~"O3",TRUE~ variable # Keep the original value if it doesn't match any condition )) %>%rename(Pollutant = variable)#plot combined boxplots: seasonairpoll_year <-ggplot(data2_long, aes(x = year_group, y = value, fill = Pollutant)) +geom_boxplot() +labs(title ="Air Quality by year of conception",x ="Year of conception",y ="Concentration of air pollution (µm/m3)",fill ="Pollutant") +scale_fill_manual(values =c("blue", "red", "green", "purple")) +theme_classic() +theme(axis.text.x =element_text(size =16), axis.text.y =element_text(size =16), legend.text =element_text(size =16), legend.title =element_text(size =16), plot.title =element_text(size =20))
Correlation between exposure to multiple air pollutants during first trimester (gestational week 0-13) in the study population. Note: Blue: coefficient >0; red: coefficient <0.
Code
#load librariespacman::p_load(AMR, corrplot, gridGraphics)#set data to data.tablesetDT(data)# Correlation between first trimester exposuresdata1 <- data %>%filter(!is.na(pm25_trim_first) &!is.na(pm10_trim_first) &!is.na(no2_trim_first) &!is.na(o3_trim_first) )exp_matrix <- data1 %>%select( pm25_trim_first, pm10_trim_first, no2_trim_first, o3_trim_first )colnames(exp_matrix) <-c("Trim1 PM2.5", "Trim1 PM10", "Trim1 NO2", "Trim1 O3")#apply function cor to matrix "exp_matrixpolday <-cor(as.matrix(exp_matrix))corrplot(method ="ellipse", polday, type ="lower", order ="original", title =element_blank(), addCoef.col ='black', tl.col ="black", cl.pos ='n', col =COL2('RdBu'),tl.srt =0, diag =FALSE, mar =c(0, 0, 1, 0),tl.offset =0.71, col.lim =c(-1, 1),number.cex =1.5, tl.cex =1.2,cl.cex =1.3)
Correlation between exposure to multiple air pollutants during first trimester (gestational week 0-13). and for individual weeks in first, second and third trimester in the study population. Note: Blue: coefficient >0; red: coefficient <0.
Code
#Correlation between weekly exposures Week 1, Week 13, Week 28# Exclude cases with all exposures missing# Exclude cases with all exposures missingdata1 <- data %>%filter(!is.na(pm25_wk_1) &!is.na(pm10_wk_1) &!is.na(no2_wk_1) &!is.na(o3_wk_1) &!is.na(pm25_wk_13) &!is.na(pm10_wk_13) &!is.na(no2_wk_13) &!is.na(o3_wk_13) &!is.na(pm25_wk_28) &!is.na(pm10_wk_28) &!is.na(no2_wk_28) &!is.na(o3_wk_28) )# Create the exposure matrixexp_matrix <- data1 %>%select( pm25_wk_1, pm25_wk_13, pm25_wk_28, pm10_wk_1, pm10_wk_13, pm10_wk_28, no2_wk_1, no2_wk_13, no2_wk_28, o3_wk_1, o3_wk_13, o3_wk_28 )# Set the names for the figurecolnames(exp_matrix) <-c("W1 PM2.5", "W13 PM2.5", "W28 PM2.5","W1 PM10", "W13 PM10", "W28 PM10","W1 NO2", "W13 NO2", "W28 NO2","W1 O3", "W13 O3", "W28 O3")# Apply the cor function to the exposure matrixpolday <-cor(as.matrix(exp_matrix))# Save the correlation plotcorrplot_wk_W1W13W28 <-corrplot(method ="ellipse", polday, type ="lower", order ="original", title =element_blank(), addCoef.col ='black', tl.col ="black", cl.pos ='n', col =COL2('RdBu'),tl.srt =90, diag =FALSE, number.cex =0.6, mar =c(0, 0, 1, 0), tl.offset =0.5, tl.cex =0.8,col.lim =c(-1, 1),cl.cex =0.7)
Outcomes descriptives
Distribution of birth weight and birth weight ultrasound z-scores.
Code
#Box plot of zBW by tertiles#plot birth_weight_zscore_tertiles by actual birth weightboxplot_BWtertiles <-ggplot(data %>%filter(!is.na(birth_weight)), aes(x = BW_tertiles, y = birth_weight)) +geom_boxplot() +labs(x ="Birth Weight Tertiles", y ="Birth Weight (kg)") +ggtitle("Boxplot of birth weight by Birth Weight Tertiles")#plot birth_weight_zscore_tertiles by actual birth weightboxplot_zBWtertiles <-ggplot(data %>%filter(!is.na(zBW)), aes(x = zBW_tertiles, y = zBW)) +geom_boxplot() +labs(x ="Z-score Birth Weight Tertiles", y ="z-score Birth Weight") +ggtitle("Boxplot of z-score birth weight by z-score Birth Weight Tertiles")#Make ggplots for z-score distribution# Select relevant columnsselected_zscore_columns <- data %>%select( zAC_W16W23, zBPD_W16W23, zFL_W16W23, zAC_W24W31, zBPD_W24W31, zFL_W24W31, zAC_W32W40, zBPD_W32W40, zFL_W32W40 )# Reshape data to long formatlong_zscore_data <- selected_zscore_columns %>%pivot_longer(cols =everything(),names_to =c("Variable", "TimeWindow"),names_pattern ="z(.*)_(W\\d{2}W\\d{2})",values_to ="zScore") %>%filter(zScore >-10& zScore <10)zscore_color_mapping <-c("AC"="green", "BPD"="red", "FL"="blue")# Create a combined plot for z-scoreszscore_combined_plot <- long_zscore_data %>%ggplot(aes(x = zScore, fill = Variable, color = Variable)) +geom_histogram(aes(y = ..density..),position ="identity",alpha =0.05,bins =30 ) +geom_density(alpha =0.3, size =1) +scale_fill_manual(values = zscore_color_mapping) +scale_color_manual(values = zscore_color_mapping) +facet_wrap(~ TimeWindow, scales ="free", ncol =1) +labs(title ="Z-Scores for ultrasound measures in three gestational perods",x ="Z-Score",y ="Density" ) +theme_minimal() +theme(legend.title =element_blank(),strip.text =element_text(face ="bold") )#EFW z-scores# Select relevant columns for EFW z-scoresselected_efw_zscore_columns <- data %>%select( zEFW_W16W23, zEFW_W24W31, zEFW_W32W40 )# Reshape data to long format for EFW z-scoreslong_efw_zscore_data <- selected_efw_zscore_columns %>%pivot_longer(cols =everything(),names_to ="TimeWindow",names_pattern ="zEFW_(W\\d{2}W\\d{2})",values_to ="zScore" ) %>%filter(zScore >-10& zScore <10)# Define color mapping for EFW (if needed, can be single color)efw_color_mapping <-c("EFW"="purple")# Create the combined plot for EFW z-scoresefw_zscore_combined_plot <- long_efw_zscore_data %>%ggplot(aes(x = zScore, fill ="EFW", color ="EFW")) +geom_histogram(aes(y = ..density..),position ="identity",alpha =0.05,bins =30 ) +geom_density(alpha =0.3, size =1) +scale_fill_manual(values = efw_color_mapping) +scale_color_manual(values = efw_color_mapping) +facet_wrap(~ TimeWindow, scales ="free", ncol =1) +labs(title ="Z-Scores for EFW in three gestational periods",x ="Z-Score",y ="Density" ) +theme_minimal() +theme(legend.position ="none", # Hide legend since only one variablestrip.text =element_text(face ="bold") )