Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  readxl,
  patchwork
)

theme_set(theme_bw())

options(
    digits = 3
)

Data

FOLK1C

#complete 1C
d_1c = read_excel("data/2025373516527613147FOLK1C.xlsx", range = "B3:BT69")
## New names:
## • `` -> `...1`
## • `` -> `...2`
#better names
names(d_1c)[1:2] = c("origin", "age_group")

#fill missing
d_1c$origin %<>% miss_locf()

#to long form
d_1c_long = d_1c %>% 
  pivot_longer(cols = -c(origin, age_group), names_to = "year", values_to = "count") %>% 
  mutate(
    #quarters as fractions
    #c(2, 5, 8, 11) / 12
    #[1] 0.167 0.417 0.667 0.917
    year2 = year %>% 
      str_replace("Q1", ".167") %>%
      str_replace("Q2", ".417") %>%
      str_replace("Q3", ".667") %>%
      str_replace("Q4", ".917") %>% 
      as.numeric()
  )

#compute fractions
d_1c_long %<>% plyr::ddply(c("age_group", "year"), function(dd) {
  dd$fraction = dd$count / sum(dd$count)
  dd
})

By sex

d_1c_sex = read_excel("data/20253752934527625048FOLK1C by sex.xlsx", range = "A3:D4557", guess_max = 10000)
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
#better names
names(d_1c_sex) = c("origin", "age_group", "year", "count")

#fill missing
d_1c_sex$origin %<>% miss_locf()
d_1c_sex$age_group %<>% miss_locf()

#recode year
d_1c_sex_long = d_1c_sex %>% 
  mutate(
    #quarters as fractions
    #c(2, 5, 8, 11) / 12
    #[1] 0.167 0.417 0.667 0.917
    year2 = year %>% 
      str_replace("Q1", ".167") %>%
      str_replace("Q2", ".417") %>%
      str_replace("Q3", ".667") %>%
      str_replace("Q4", ".917") %>% 
      as.numeric()
  )

FODIE

d_fodie = read_excel("data/20253744022527619226FODIE.xlsx", range = "A3:T36")
## New names:
## • `` -> `...1`
## • `` -> `...2`
#better names
names(d_fodie)[1:2] = c("origin", "age_group")

#fill missing
d_fodie$origin %<>% miss_locf()

#to long form
d_fodie_long = d_fodie %>% 
  pivot_longer(cols = -c(origin, age_group), names_to = "year", values_to = "births")

Combined table

#do the age categories match?
intersect(
  d_1c_sex$age_group %>% unique(),
  d_fodie$age_group %>% unique()
)
##  [1] "15-19 years" "20-24 years" "25-29 years" "30-34 years" "35-39 years"
##  [6] "40-44 years" "45-49 years" "50-54 years" "55-59 years" "60-64 years"
#join
d_folk1c_fodie = inner_join(
  #only keep Q2 values
  d_1c_sex_long %>% filter(str_detect(year, "Q2")) %>% mutate(year = str_replace(year, "Q2", "")),
  d_fodie_long,
  by = c("origin", "age_group", "year")
) %>% 
  #birth rate
  mutate(
    birth_rate = births / count,
    year = as.numeric(year)
  )

Analysis

Population size

#age groups
d_1c$age_group %>% unique()
##  [1] "Age, total"         "0-4 years"          "5-9 years"         
##  [4] "10-14 years"        "15-19 years"        "20-24 years"       
##  [7] "25-29 years"        "30-34 years"        "35-39 years"       
## [10] "40-44 years"        "45-49 years"        "50-54 years"       
## [13] "55-59 years"        "60-64 years"        "65-69 years"       
## [16] "70-74 years"        "75-79 years"        "80-84 years"       
## [19] "85-89 years"        "90-94 years"        "95-99 years"       
## [22] "100 years and over"
#combined plot with counts and percentages
#counts 0-4
p_folk1c_04 = d_1c_long %>% 
  filter(age_group %in% c("0-4 years")) %>% 
  ggplot(aes(x = year2, y = count, color = origin)) +
  geom_line() +
  labs(title = "Counts of persons by origin and for 0-4 year olds",
       x = "Year",
       y = "Count") +
  #remove color guide
  #smaller title font
  theme(
    legend.position = "none",
    plot.title = element_text(size = 10)
    )

#counts all
p_folk1c_all = d_1c_long %>% 
  filter(age_group %in% c("Age, total")) %>% 
  ggplot(aes(x = year2, y = count, color = origin)) +
  geom_line() +
  labs(title = "Counts of persons by origin and for the total population",
       x = "Year",
       y = "Count") +
  #remove color guide
  #smaller title font
  theme(
    legend.position = "none",
    plot.title = element_text(size = 10)
    )

#fractions 0-4
p_folk1c_04_frac = d_1c_long %>% 
  filter(age_group %in% c("0-4 years")) %>% 
  ggplot(aes(x = year2, y = fraction, color = origin)) +
  geom_line() +
  labs(title = "% of persons by origin and for 0-4 year olds",
       x = "Year",
       y = "Fraction") +
  scale_y_continuous(labels = scales::percent) +
  #remove color guide
  #smaller title font
  theme(
    legend.position = "none",
    plot.title = element_text(size = 10)
    )

#fraction all
p_folk1c_all_frac = d_1c_long %>% 
  filter(age_group %in% c("Age, total")) %>% 
  ggplot(aes(x = year2, y = fraction, color = origin)) +
  geom_line() +
  labs(title = "% of persons by origin and for the total population",
       x = "Year",
       y = "Fraction") +
  scale_y_continuous(labels = scales::percent) +
  #remove color guide
  #smaller title font
  theme(
    legend.position = "bottom",
    plot.title = element_text(size = 10),
    #smaller font size on color guide
    legend.text = element_text(size = 8)
    )

#combine
patchwork::wrap_plots(p_folk1c_04, p_folk1c_all, p_folk1c_04_frac, p_folk1c_all_frac, nrow = 2)

GG_save("figs/folk1c.png")

#danish fraction only
d_1c_long %>% 
  filter(origin == "Persons of Danish origin", age_group %in% c("0-4 years", "Age, total")) %>% 
  ggplot(aes(x = year2, y = fraction, color = age_group)) +
  geom_line() +
  labs(title = "% of persons Danish",
       x = "Year",
       y = "Fraction") +
  scale_y_continuous(labels = scales::percent)

GG_save("figs/folk1c_danish.png")

d_1c_long %>% 
  filter(origin == "Persons of Danish origin", age_group %in% c("0-4 years", "Age, total"))

Birth rates

#birth rates
d_folk1c_fodie %>% 
  ggplot(aes(x = year, y = birth_rate, color = origin, group = origin)) +
  geom_line() +
  labs(title = "Birth rates by age group and origin",
       x = "Year",
       y = "Birth rate") +
  facet_wrap(~age_group, scales = "free_y") +
  theme(
    legend.position = "bottom"
    )

GG_save("figs/birth_rates.png")

#weighted mean birth rate
d_folk1c_fodie_wmean = d_folk1c_fodie %>% plyr::ddply(c("origin", "year"), function(dd) {
  tibble(
    birth_rate = weighted.mean(dd$birth_rate, dd$count)
  )
})

#plot
d_folk1c_fodie_wmean %>% 
  ggplot(aes(x = year, y = birth_rate, color = origin, group = origin)) +
  geom_line() +
  labs(title = "Weighted mean birth rates by origin and year",
       x = "Year",
       y = "Birth rate") +
  theme(
    legend.position = "bottom"
    )

GG_save("figs/birth_rates_wmean.png")

#weighted mean, but using the danish age composition, so as to control for the age distribution differences
d_folk1c_fodie_wmean_danish = d_folk1c_fodie %>% 
  plyr::ddply(c("year"), function(dd) {
    #danish proportions
    dd_dk = dd %>% filter(origin == "Persons of Danish origin")
    dd_dk$fraction = dd_dk$count / sum(dd_dk$count)
    
    #by origin
    dd2 = dd %>% 
      inner_join(dd_dk %>% select(age_group, fraction), by = "age_group") %>% 
      group_by(origin) %>% 
      reframe(
        birth_rate = weighted.mean(birth_rate, fraction),
        TFR = birth_rate * 50
      )
    
    dd2
  })

#plot
d_folk1c_fodie_wmean_danish %>% 
  ggplot(aes(x = year, y = birth_rate, color = origin, group = origin)) +
  geom_line() +
  labs(title = "Weighted mean birth rates by origin and year, using Danish age composition",
       x = "Year",
       y = "Birth rate") +
  theme(
    legend.position = "bottom"
    )

GG_save("figs/birth_rates_wmean_danish.png")

#TFR plot
d_folk1c_fodie_wmean_danish %>% 
  ggplot(aes(x = year, y = TFR, color = origin, group = origin)) +
  geom_line() +
  labs(title = "Total Fertility Rate estimate by origin and year, using Danish age composition",
       x = "Year",
       y = "TFR") +
  theme(
    legend.position = "bottom"
    )

#origin of newborns as percentage of total
d_folk1c_fodie_frac = d_folk1c_fodie %>% 
  plyr::ddply(c("year", "origin"), function(dd) {
    tibble(
      newborns = dd$births %>% sum()
    )
  }) %>% 
  plyr::ddply(c("year"), function(dd) {
    dd$newborns_fraction = dd$newborns / sum(dd$newborns)
    dd
  })

#plot
d_folk1c_fodie_frac %>% 
  ggplot(aes(x = year, y = newborns_fraction, color = origin, group = origin)) +
  geom_line() +
  labs(title = "Newborns by origin of mother as fraction of total",
       x = "Year",
       y = "Fraction",
       color = "Origin of mother") +
  scale_y_continuous(labels = scales::percent) +
  theme(
    legend.position = "bottom"
    )

GG_save("figs/newborns_fraction.png")

Meta

#versions
write_sessioninfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.2.0       readxl_1.4.3          kirkegaard_2025-03-07
##  [4] psych_2.4.6.26        assertthat_0.2.1      weights_1.0.4        
##  [7] Hmisc_5.1-3           magrittr_2.0.3        lubridate_1.9.3      
## [10] forcats_1.0.0         stringr_1.5.1         dplyr_1.1.4          
## [13] purrr_1.0.2           readr_2.1.5           tidyr_1.3.1          
## [16] tibble_3.2.1          ggplot2_3.5.1         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1  farver_2.1.2      fastmap_1.2.0     digest_0.6.37    
##  [5] rpart_4.1.24      timechange_0.3.0  lifecycle_1.0.4   cluster_2.1.8    
##  [9] survival_3.8-3    gdata_3.0.0       compiler_4.4.2    rlang_1.1.4      
## [13] sass_0.4.9        tools_4.4.2       utf8_1.2.4        yaml_2.3.10      
## [17] data.table_1.16.0 knitr_1.48        labeling_0.4.3    htmlwidgets_1.6.4
## [21] mnormt_2.1.1      plyr_1.8.9        withr_3.0.1       foreign_0.8-88   
## [25] nnet_7.3-20       grid_4.4.2        fansi_1.0.6       jomo_2.7-6       
## [29] colorspace_2.1-1  mice_3.16.0       scales_1.3.0      gtools_3.9.5     
## [33] iterators_1.0.14  MASS_7.3-64       cli_3.6.3         rmarkdown_2.28   
## [37] ragg_1.3.2        generics_0.1.3    rstudioapi_0.16.0 tzdb_0.4.0       
## [41] minqa_1.2.8       cachem_1.1.0      splines_4.4.2     parallel_4.4.2   
## [45] cellranger_1.1.0  base64enc_0.1-3   vctrs_0.6.5       boot_1.3-31      
## [49] glmnet_4.1-8      Matrix_1.7-2      jsonlite_1.8.8    hms_1.1.3        
## [53] mitml_0.4-5       Formula_1.2-5     htmlTable_2.4.3   systemfonts_1.1.0
## [57] foreach_1.5.2     jquerylib_0.1.4   rematch_2.0.0     glue_1.7.0       
## [61] nloptr_2.1.1      pan_1.9           codetools_0.2-19  stringi_1.8.4    
## [65] gtable_0.3.5      shape_1.4.6.1     lme4_1.1-35.5     munsell_0.5.1    
## [69] pillar_1.9.0      htmltools_0.5.8.1 R6_2.5.1          textshaping_0.4.0
## [73] evaluate_0.24.0   lattice_0.22-5    highr_0.11        backports_1.5.0  
## [77] broom_1.0.6       bslib_0.8.0       Rcpp_1.0.13       gridExtra_2.3    
## [81] nlme_3.1-167      checkmate_2.3.2   xfun_0.47         pkgconfig_2.0.3
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}