Init

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  patchwork,
  googlesheets4,
  rms,
  ggeffects,
  rnaturalearth,
  sf
)

## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

Functions

Data

d_vac = read_csv("data/share-of-people-who-completed-the-initial-covid-19-vaccination-protocol.csv") %>% 
  df_legalize_names()

## Rows: 77011 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Entity
## dbl  (1): People fully vaccinated (cumulative, per hundred)
## date (1): Day
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d_lockdown = read_csv("data/covid-19-stringency-index.csv") %>% 
  df_legalize_names()

## Rows: 202760 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Entity
## dbl  (3): Stringency index (non-vaccinated), Stringency index (vaccinated), ...
## date (1): Day
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d_fert_owid = read_csv("data/children-born-per-woman/children-born-per-woman.csv") %>% 
  df_legalize_names()

## Rows: 18958 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Fertility rate (period), historical
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d_fert_bg = read_csv("data/tfr birth gauge.csv") %>% 
  df_legalize_names()

## Rows: 94 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country
## dbl (6): Births_2024, Births_2025, Change, TFR_2023, TFR_2024, TFR_2025
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d_hdi = read_csv("data/human-development-index/human-development-index.csv") %>% 
  df_legalize_names() %>% 
  filter(
    Year == 2019
  )

## Rows: 6603 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Human Development Index
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d_world = ne_countries(scale = "medium", returnclass = "sf")

d_world %>% 
  ggplot() +
  geom_sf() +
  theme_minimal() +
  ggtitle("World Countries Map")

Data recode

#for vac, we need the largest value for countries
d_vac_final = d_vac %>% 
  group_by(Entity) %>% 
  summarise(
    vaccine_coverage = max(People_fully_vaccinated_cumulative_per_hundred, na.rm = TRUE)
  ) %>% 
  ungroup() %>% 
  mutate(
    ISO = pu_translate(Entity, fuzzy = F)
  ) %>% 
  filter(
    #filter out non-countries, which are those without ISO codes
    !is.na(ISO)
  )

## No exact match: Africa

## No exact match: Asia

## No exact match: Bonaire Sint Eustatius and Saba

## No exact match: Europe

## No exact match: European Union (27)

## No exact match: High-income countries

## No exact match: Low-income countries

## No exact match: Lower-middle-income countries

## No exact match: North America

## No exact match: Oceania

## No exact match: South America

## No exact match: Upper-middle-income countries

## No exact match: World

#for lockdowns, we need a single value for a country, so we take the mean of the variables
d_lockdown_final = d_lockdown %>% 
  group_by(Entity) %>% 
  summarise(
    lockdown_index_nonvac = mean(Stringency_index_non_vaccinated, na.rm = TRUE),
    lockdown_index_vac = mean(Stringency_index_vaccinated, na.rm = TRUE),
    lockdown_index = mean(Stringency_index_weighted_average, na.rm = TRUE)
  ) %>% 
  ungroup() %>% 
  mutate(
    ISO = pu_translate(Entity, fuzzy = F)
  ) %>% 
  filter(
    #filter out non-countries, which are those without ISO codes
    !is.na(ISO)
  )

#fert data, we are interested in the 2019-final period, the decline in fertility in %
d_fert_final = d_fert_owid %>% 
  filter(
    Year %in% c(2019, 2023)
  ) %>% 
  dplyr::rename(
    ISO = Code
  ) %>% 
  plyr::ddply("ISO", function(x) {
    #skip if it doesnt have 2 rows
    if (nrow(x) != 2) return(NULL)
    
    tibble(
      fertility_2019 = x$Fertility_rate_period_historical[x$Year == 2019],
      fertility_2023 = x$Fertility_rate_period_historical[x$Year == 2023],
      fertility_decline_19_23 = (fertility_2023 - fertility_2019) / fertility_2019
    )
  }) %>%
  #join with 2025 numbers from bg
  left_join(
    d_fert_bg %>% mutate(
      ISO = pu_translate(Country)
    ) %>% select(ISO, TFR_2025)
  ) %>% 
  mutate(
    fertility_decline_19_25 = (TFR_2025 - fertility_2019) / fertility_2019
  )

## Joining with `by = join_by(ISO)`

#country groupings
gs4_deauth()
d_classifications = read_sheet("https://docs.google.com/spreadsheets/d/1ToJWNbwYY--w0_nh05slEv4ZCko4a-XZc1rAkymTxAA/edit?gid=0#gid=0") %>% 
  df_legalize_names()

## ✔ Reading from "Countries regional codings".

## ✔ Range 'Sheet1'.

#check for duplicated rows first
assert_that(!any(duplicated(d_fert_final$ISO)))

## [1] TRUE

assert_that(!any(duplicated(d_vac_final$ISO)))

## [1] TRUE

assert_that(!any(duplicated(d_lockdown_final$ISO)))

## [1] TRUE

#finally, join all these data
d = full_join(
    d_vac_final %>% select(-Entity),
    d_lockdown_final %>% select(-Entity), 
    by = "ISO"
  ) %>% 
  full_join(
    d_fert_final, 
    by = "ISO"
  ) %>% 
  full_join(
    d_classifications %>% select(ISO3, Region1, Region2, Region3) %>% rename(ISO = ISO3), 
    by = "ISO"
  ) %>% 
  full_join(
    d_hdi %>% rename(ISO = Code, HDI2019 = Human_Development_Index) %>%  select(ISO, HDI2019) %>% filter(!is.na(ISO)), 
    by = "ISO"
  ) %>%
  mutate(
    country = pu_translate(ISO, reverse = T)
  )

## No match: OWID_KOS

## No match: OWID_WRL

#check for duplicated rows
assert_that(!any(duplicated(d$ISO)))

## [1] TRUE

#regions as factors
d %<>% mutate(
  Region1 = Region1 %>% fct_relevel("Europe"),
  Region2 = Region2 %>% fct_relevel("Northern Europe"),
  Region3 = Region3 %>% fct_relevel("Northern Europe")
)

Analysis

Simple

#data for decline
d$fertility_decline_19_23 %>% miss_count(reverse = T)

## [1] 238

d$fertility_decline_19_25 %>% miss_count(reverse = T)

## [1] 92

#check correlations of key variables
d %>% select(
  vaccine_coverage, 
  lockdown_index_nonvac, 
  lockdown_index_vac, 
  lockdown_index, 
  fertility_decline_19_23, 
  fertility_decline_19_25,
  HDI2019
) %>% 
  GG_heatmap()

GG_save("figs/correlation_heatmap.png")

#scatterplots
p1 = d %>% 
  GG_scatter("lockdown_index", "fertility_decline_19_23", case_names = "ISO") +
  labs(
    x = "Lockdown index",
    y = "Fertility decline 2019-2023 (%)"
  )

p2 = d %>% 
  GG_scatter("lockdown_index", "fertility_decline_19_25", case_names = "ISO") +
  labs(
    x = "Lockdown index",
    y = "Fertility decline 2019-2025 (%)"
  )

p3 = d %>%
  GG_scatter("vaccine_coverage", "fertility_decline_19_23", case_names = "ISO") +
  labs(
    x = "Vaccine coverage (%)",
    y = "Fertility decline 2019-2023 (%)"
  )

p4 = d %>%
  GG_scatter("vaccine_coverage", "fertility_decline_19_25", case_names = "ISO") +
  labs(
    x = "Vaccine coverage (%)",
    y = "Fertility decline 2019-2025 (%)"
  )

p1 + p2 + p3 + p4 + plot_layout(ncol = 2)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter_lockdown_vaccine_fertility_decline.png")

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

#by region
p_region1 = d %>% 
  GG_group_means("fertility_decline_19_25", groupvar = "Region1") +
  scale_y_continuous(
    labels = scales::percent_format()
  ) +
  labs(
    x = "Region (large)",
    y = ""
  )

## Missing values were removed.

p_region2 = d %>%
  GG_group_means("fertility_decline_19_25", groupvar = "Region2") +
  scale_y_continuous(
    labels = scales::percent_format()
  ) +
  labs(
    x = "Region (medium)",
    y = "Fertility decline 2019-2025 (%)"
  ) +
  theme(
    #smaller text on x axis
    axis.text.x = element_text(size = 7, angle = 15, hjust = 1)
  )

## Missing values were removed.

p_region3 = d %>%
  GG_group_means("fertility_decline_19_25", groupvar = "Region3") +
  scale_y_continuous(
    labels = scales::percent_format()
  ) +
  labs(
    x = "Region (small)",
    y = ""
  ) +
  theme(
    #smaller text on x axis
    axis.text.x = element_text(size = 7, angle = 20, hjust = 1)
  )

## Missing values were removed.

p_region1 + p_region2 + p_region3 + plot_layout(ncol = 1)

GG_save("figs/scatter_fertility_decline_by_region.png")

d %>% 
  GG_scatter("fertility_decline_19_23", "fertility_decline_19_25", case_names = "ISO")

## `geom_smooth()` using formula = 'y ~ x'

p_fert23 = d %>% 
  GG_scatter("fertility_2019", "fertility_2023", case_names = "ISO") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    x = "Fertility 2019",
    y = "Fertility 2023"
  )

p_fert25 = d %>% 
  GG_scatter("fertility_2019", "TFR_2025", case_names = "ISO") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    x = "Fertility 2019",
    y = "Fertility 2025"
  )

p_fert23 + p_fert25 + plot_layout(ncol = 2)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter_fertility_2019_2023_2025.png")

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

#map
d_world2 = d_world %>% 
  left_join(
    d %>% select(ISO, fertility_decline_19_23, fertility_decline_19_25), 
    by = c("adm0_a3" = "ISO")
  )

d_world2 %>%
  ggplot() +
  geom_sf(aes(fill = fertility_decline_19_23)) +
  scale_fill_gradient2(
    high = "blue", 
    mid = "white", 
    low = "red", 
    midpoint = 0, 
    na.value = "grey50"
  ) +
  theme_minimal() +
  guides(
    fill = guide_colorbar(
      position = "bottom",
      title.position = "top"
    )
  ) +
  labs(
    fill = "Fertility decline 2019-2023 (%)",
    title = "Fertility decline in 2019-2023"
  )

GG_save("figs/fertility_decline_2019_2023_map.png")

d_world2 %>%
  ggplot() +
  geom_sf(aes(fill = fertility_decline_19_25)) +
  scale_fill_gradient2(
    high = "blue", 
    mid = "white", 
    low = "red", 
    midpoint = 0, 
    na.value = "grey50"
  ) +
  theme_minimal() +
  guides(
    fill = guide_colorbar(
      position = "bottom",
      title.position = "top"
    )
  ) +
  labs(
    fill = "Fertility decline 2019-2025 (%)",
    title = "Fertility decline in 2019-2025"
  )

GG_save("figs/fertility_decline_2019_2025_map.png")

#faster decline post-COVID?
d_fert_owid %<>% mutate(
  year2019 = Year >= 2019
) %>% 
  filter(!is.na(Code)) %>% 
  group_by(Code) %>%
  mutate(
  #calculate the difference in % between years
    fert_change = (Fertility_rate_period_historical - lag(Fertility_rate_period_historical, order_by = Year)) / lag(Fertility_rate_period_historical, order_by = Year),  
    fert_change_log = fert_change %>% log()
  ) %>% ungroup()

## Warning: There were 237 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `fert_change_log = fert_change %>% log()`.
## ℹ In group 1: `Code = "ABW"`.
## Caused by warning in `log()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 236 remaining warnings.

lm(Fertility_rate_period_historical ~ Year * year2019, data = d_fert_owid) %>% 
  summary()

## 
## Call:
## lm(formula = Fertility_rate_period_historical ~ Year * year2019, 
##     data = d_fert_owid)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.602 -1.399 -0.246  1.448  4.927 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        9.16e+01   1.31e+00    69.9   <2e-16 ***
## Year              -4.41e-02   6.61e-04   -66.8   <2e-16 ***
## year2019TRUE      -1.44e+01   7.31e+01    -0.2     0.84    
## Year:year2019TRUE  7.13e-03   3.61e-02     0.2     0.84    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.76 on 17827 degrees of freedom
## Multiple R-squared:  0.234,  Adjusted R-squared:  0.234 
## F-statistic: 1.81e+03 on 3 and 17827 DF,  p-value: <2e-16

lm(fert_change ~ Year * year2019, data = d_fert_owid) %>% 
  summary()

## 
## Call:
## lm(formula = fert_change ~ Year * year2019, data = d_fert_owid)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4733 -0.0120  0.0026  0.0120  0.9234 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.52e-01   2.47e-02   10.18   <2e-16 ***
## Year              -1.32e-04   1.25e-05  -10.63   <2e-16 ***
## year2019TRUE      -1.20e+00   1.35e+00   -0.89     0.37    
## Year:year2019TRUE  5.96e-04   6.68e-04    0.89     0.37    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0326 on 17589 degrees of freedom
##   (238 observations deleted due to missingness)
## Multiple R-squared:  0.00743,    Adjusted R-squared:  0.00726 
## F-statistic: 43.9 on 3 and 17589 DF,  p-value: <2e-16

#plot decline
d_fert_owid %>% 
  filter(
    Year >= 1990
  ) %>% 
  ggplot(aes(x = Year, y = fert_change)) +
  geom_line(aes(group = Code)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    x = "Year",
    y = "Fertility change (%)",
    title = "Fertility change over time"
  ) +
  theme_minimal()

GG_save("figs/fertility_change_over_time.png")

#fertility decline by year
d_fert_owid %>% 
  filter(
    Year >= 1990
  ) %>% 
  ggplot(aes(x = Year, y = fert_change_log)) +
  #histogram of declines that year across countries
  geom_smooth() +
  # geom_histogram() +
  # geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    x = "Year",
    y = "Fertility change (log)",
    title = "Fertility change over time (log)"
  ) +
  theme_minimal()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 6258 rows containing non-finite outside the scale range
## (`stat_smooth()`).

d_fert_owid %>% 
  filter(Year >= 1990) %>% 
  ggplot(aes(x = factor(Year), y = fert_change)) +
  geom_histogram(stat = "identity") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    x = "Year",
    y = "Fertility change (log)",
    title = "Fertility change over time (log)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

summary_df <- d_fert_owid %>%
  filter(Year >= 1990) %>%
  group_by(Year) %>%
  summarise(
    mean = mean(fert_change, na.rm = TRUE),
    median = median(fert_change, na.rm = TRUE),
    max = max(fert_change, na.rm = TRUE),
    min = min(fert_change, na.rm = TRUE),
    p10 = quantile(fert_change, 0.10, na.rm = TRUE),
    p90 = quantile(fert_change, 0.90, na.rm = TRUE),
    .groups = "drop"
  )

# Step 2: Pivot to long format
long_summary <- summary_df %>%
  pivot_longer(cols = -Year, names_to = "statistic", values_to = "value")

#mean mean decline
long_summary %>% 
  filter(statistic == "mean") %>% 
  describe2()

# Step 3: Plot
ggplot(long_summary, aes(x = Year, y = value, color = statistic)) +
  geom_line(linewidth = 1) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Summary of Fertility Change % by Year",
    x = "Year",
    y = "Value",
    color = "Statistic"
  ) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent)

GG_save("figs/fertility_change_summary_by_year.png")

Regressions

#data subset for fertility modeling
d2 = d %>% select(
  ISO, 
  Region1, 
  Region2, 
  Region3, 
  fertility_decline_19_23, 
  fertility_decline_19_25, 
  lockdown_index, 
  vaccine_coverage, 
  HDI2019
) %>% 
  filter(
    !is.na(fertility_decline_19_23),
    !is.na(lockdown_index),
    !is.na(vaccine_coverage),
    !is.na(HDI2019)
  )

lm_mods = list(
  "19-23" = ols(fertility_decline_19_23 ~ lockdown_index + vaccine_coverage + HDI2019, data = d),
  "19-23b" = ols(fertility_decline_19_23 ~ lockdown_index + vaccine_coverage + HDI2019 + Region2, data = d),
  "19-25" = ols(fertility_decline_19_25 ~ lockdown_index + vaccine_coverage + HDI2019, data = d),
  "19-25b" = ols(fertility_decline_19_25 ~ lockdown_index + vaccine_coverage + HDI2019 + Region1, data = d2),
  "23" = ols(fertility_2023 ~ fertility_2019 + lockdown_index + vaccine_coverage + HDI2019 + Region2, data = d),
  "25" = ols(TFR_2025 ~ fertility_2019 + lockdown_index + vaccine_coverage + HDI2019 + Region1, data = d)
)

lm_mods %>% summarize_models(collapse_factors = T)

lm_mods$`19-23b` %>% 
  ggpredict(terms = c("Region2")) %>% 
  plot(show_data = T) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    x = "Region",
    y = "Fertility decline 2019-2025 (%)"
  )

## Data points may overlap. Use the `jitter` argument to add some amount of
##   random variation to the location of data points and avoid overplotting.

GG_save("figs/ggpredict_region2_fertility_decline_19_23.png")

lm_mods$`25` %>%
  ggpredict(terms = c("lockdown_index")) %>% 
  plot(show_data = T) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    y = "Fertility in 2025"
  )

## Data points may overlap. Use the `jitter` argument to add some amount of
##   random variation to the location of data points and avoid overplotting.

COVID fertility decline

13 July, 2025

Init

Functions

Data

Data recode

Analysis

Simple

Regressions

Meta