library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: Hmisc
##
##
## Attaching package: 'Hmisc'
##
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
##
## The following objects are masked from 'package:base':
##
## format.pval, units
##
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
readxl,
patchwork
)
theme_set(theme_bw())
options(
digits = 3
)
#complete 1C
d_1c = read_excel("data/2025373516527613147FOLK1C.xlsx", range = "B3:BT69")
## New names:
## • `` -> `...1`
## • `` -> `...2`
#better names
names(d_1c)[1:2] = c("origin", "age_group")
#fill missing
d_1c$origin %<>% miss_locf()
#to long form
d_1c_long = d_1c %>%
pivot_longer(cols = -c(origin, age_group), names_to = "year", values_to = "count") %>%
mutate(
#quarters as fractions
#c(2, 5, 8, 11) / 12
#[1] 0.167 0.417 0.667 0.917
year2 = year %>%
str_replace("Q1", ".167") %>%
str_replace("Q2", ".417") %>%
str_replace("Q3", ".667") %>%
str_replace("Q4", ".917") %>%
as.numeric()
)
#compute fractions
d_1c_long %<>% plyr::ddply(c("age_group", "year"), function(dd) {
dd$fraction = dd$count / sum(dd$count)
dd
})
d_1c_sex = read_excel("data/20253752934527625048FOLK1C by sex.xlsx", range = "A3:D4557", guess_max = 10000)
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
#better names
names(d_1c_sex) = c("origin", "age_group", "year", "count")
#fill missing
d_1c_sex$origin %<>% miss_locf()
d_1c_sex$age_group %<>% miss_locf()
#recode year
d_1c_sex_long = d_1c_sex %>%
mutate(
#quarters as fractions
#c(2, 5, 8, 11) / 12
#[1] 0.167 0.417 0.667 0.917
year2 = year %>%
str_replace("Q1", ".167") %>%
str_replace("Q2", ".417") %>%
str_replace("Q3", ".667") %>%
str_replace("Q4", ".917") %>%
as.numeric()
)
d_fodie = read_excel("data/20253744022527619226FODIE.xlsx", range = "A3:T36")
## New names:
## • `` -> `...1`
## • `` -> `...2`
#better names
names(d_fodie)[1:2] = c("origin", "age_group")
#fill missing
d_fodie$origin %<>% miss_locf()
#to long form
d_fodie_long = d_fodie %>%
pivot_longer(cols = -c(origin, age_group), names_to = "year", values_to = "births")
#do the age categories match?
intersect(
d_1c_sex$age_group %>% unique(),
d_fodie$age_group %>% unique()
)
## [1] "15-19 years" "20-24 years" "25-29 years" "30-34 years" "35-39 years"
## [6] "40-44 years" "45-49 years" "50-54 years" "55-59 years" "60-64 years"
#join
d_folk1c_fodie = inner_join(
#only keep Q2 values
d_1c_sex_long %>% filter(str_detect(year, "Q2")) %>% mutate(year = str_replace(year, "Q2", "")),
d_fodie_long,
by = c("origin", "age_group", "year")
) %>%
#birth rate
mutate(
birth_rate = births / count,
year = as.numeric(year)
)
#age groups
d_1c$age_group %>% unique()
## [1] "Age, total" "0-4 years" "5-9 years"
## [4] "10-14 years" "15-19 years" "20-24 years"
## [7] "25-29 years" "30-34 years" "35-39 years"
## [10] "40-44 years" "45-49 years" "50-54 years"
## [13] "55-59 years" "60-64 years" "65-69 years"
## [16] "70-74 years" "75-79 years" "80-84 years"
## [19] "85-89 years" "90-94 years" "95-99 years"
## [22] "100 years and over"
#combined plot with counts and percentages
#counts 0-4
p_folk1c_04 = d_1c_long %>%
filter(age_group %in% c("0-4 years")) %>%
ggplot(aes(x = year2, y = count, color = origin)) +
geom_line() +
labs(title = "Counts of persons by origin and for 0-4 year olds",
x = "Year",
y = "Count") +
#remove color guide
#smaller title font
theme(
legend.position = "none",
plot.title = element_text(size = 10)
)
#counts all
p_folk1c_all = d_1c_long %>%
filter(age_group %in% c("Age, total")) %>%
ggplot(aes(x = year2, y = count, color = origin)) +
geom_line() +
labs(title = "Counts of persons by origin and for the total population",
x = "Year",
y = "Count") +
#remove color guide
#smaller title font
theme(
legend.position = "none",
plot.title = element_text(size = 10)
)
#fractions 0-4
p_folk1c_04_frac = d_1c_long %>%
filter(age_group %in% c("0-4 years")) %>%
ggplot(aes(x = year2, y = fraction, color = origin)) +
geom_line() +
labs(title = "% of persons by origin and for 0-4 year olds",
x = "Year",
y = "Fraction") +
scale_y_continuous(labels = scales::percent) +
#remove color guide
#smaller title font
theme(
legend.position = "none",
plot.title = element_text(size = 10)
)
#fraction all
p_folk1c_all_frac = d_1c_long %>%
filter(age_group %in% c("Age, total")) %>%
ggplot(aes(x = year2, y = fraction, color = origin)) +
geom_line() +
labs(title = "% of persons by origin and for the total population",
x = "Year",
y = "Fraction") +
scale_y_continuous(labels = scales::percent) +
#remove color guide
#smaller title font
theme(
legend.position = "bottom",
plot.title = element_text(size = 10),
#smaller font size on color guide
legend.text = element_text(size = 8)
)
#combine
patchwork::wrap_plots(p_folk1c_04, p_folk1c_all, p_folk1c_04_frac, p_folk1c_all_frac, nrow = 2)
GG_save("figs/folk1c.png")
#danish fraction only
d_1c_long %>%
filter(origin == "Persons of Danish origin", age_group %in% c("0-4 years", "Age, total")) %>%
ggplot(aes(x = year2, y = fraction, color = age_group)) +
geom_line() +
labs(title = "% of persons Danish",
x = "Year",
y = "Fraction") +
scale_y_continuous(labels = scales::percent)
GG_save("figs/folk1c_danish.png")
d_1c_long %>%
filter(origin == "Persons of Danish origin", age_group %in% c("0-4 years", "Age, total"))
#birth rates
d_folk1c_fodie %>%
ggplot(aes(x = year, y = birth_rate, color = origin, group = origin)) +
geom_line() +
labs(title = "Birth rates by age group and origin",
x = "Year",
y = "Birth rate") +
facet_wrap(~age_group, scales = "free_y") +
theme(
legend.position = "bottom"
)
GG_save("figs/birth_rates.png")
#weighted mean birth rate
d_folk1c_fodie_wmean = d_folk1c_fodie %>% plyr::ddply(c("origin", "year"), function(dd) {
tibble(
birth_rate = weighted.mean(dd$birth_rate, dd$count)
)
})
#plot
d_folk1c_fodie_wmean %>%
ggplot(aes(x = year, y = birth_rate, color = origin, group = origin)) +
geom_line() +
labs(title = "Weighted mean birth rates by origin and year",
x = "Year",
y = "Birth rate") +
theme(
legend.position = "bottom"
)
GG_save("figs/birth_rates_wmean.png")
#weighted mean, but using the danish age composition, so as to control for the age distribution differences
d_folk1c_fodie_wmean_danish = d_folk1c_fodie %>%
plyr::ddply(c("year"), function(dd) {
#danish proportions
dd_dk = dd %>% filter(origin == "Persons of Danish origin")
dd_dk$fraction = dd_dk$count / sum(dd_dk$count)
#by origin
dd2 = dd %>%
inner_join(dd_dk %>% select(age_group, fraction), by = "age_group") %>%
group_by(origin) %>%
reframe(
birth_rate = weighted.mean(birth_rate, fraction),
TFR = birth_rate * 50
)
dd2
})
#plot
d_folk1c_fodie_wmean_danish %>%
ggplot(aes(x = year, y = birth_rate, color = origin, group = origin)) +
geom_line() +
labs(title = "Weighted mean birth rates by origin and year, using Danish age composition",
x = "Year",
y = "Birth rate") +
theme(
legend.position = "bottom"
)
GG_save("figs/birth_rates_wmean_danish.png")
#TFR plot
d_folk1c_fodie_wmean_danish %>%
ggplot(aes(x = year, y = TFR, color = origin, group = origin)) +
geom_line() +
labs(title = "Total Fertility Rate estimate by origin and year, using Danish age composition",
x = "Year",
y = "TFR") +
theme(
legend.position = "bottom"
)
#origin of newborns as percentage of total
d_folk1c_fodie_frac = d_folk1c_fodie %>%
plyr::ddply(c("year", "origin"), function(dd) {
tibble(
newborns = dd$births %>% sum()
)
}) %>%
plyr::ddply(c("year"), function(dd) {
dd$newborns_fraction = dd$newborns / sum(dd$newborns)
dd
})
#plot
d_folk1c_fodie_frac %>%
ggplot(aes(x = year, y = newborns_fraction, color = origin, group = origin)) +
geom_line() +
labs(title = "Newborns by origin of mother as fraction of total",
x = "Year",
y = "Fraction",
color = "Origin of mother") +
scale_y_continuous(labels = scales::percent) +
theme(
legend.position = "bottom"
)
GG_save("figs/newborns_fraction.png")
#versions
write_sessioninfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] patchwork_1.2.0 readxl_1.4.3 kirkegaard_2025-03-07
## [4] psych_2.4.6.26 assertthat_0.2.1 weights_1.0.4
## [7] Hmisc_5.1-3 magrittr_2.0.3 lubridate_1.9.3
## [10] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [13] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [16] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 farver_2.1.2 fastmap_1.2.0 digest_0.6.37
## [5] rpart_4.1.24 timechange_0.3.0 lifecycle_1.0.4 cluster_2.1.8
## [9] survival_3.8-3 gdata_3.0.0 compiler_4.4.2 rlang_1.1.4
## [13] sass_0.4.9 tools_4.4.2 utf8_1.2.4 yaml_2.3.10
## [17] data.table_1.16.0 knitr_1.48 labeling_0.4.3 htmlwidgets_1.6.4
## [21] mnormt_2.1.1 plyr_1.8.9 withr_3.0.1 foreign_0.8-88
## [25] nnet_7.3-20 grid_4.4.2 fansi_1.0.6 jomo_2.7-6
## [29] colorspace_2.1-1 mice_3.16.0 scales_1.3.0 gtools_3.9.5
## [33] iterators_1.0.14 MASS_7.3-64 cli_3.6.3 rmarkdown_2.28
## [37] ragg_1.3.2 generics_0.1.3 rstudioapi_0.16.0 tzdb_0.4.0
## [41] minqa_1.2.8 cachem_1.1.0 splines_4.4.2 parallel_4.4.2
## [45] cellranger_1.1.0 base64enc_0.1-3 vctrs_0.6.5 boot_1.3-31
## [49] glmnet_4.1-8 Matrix_1.7-2 jsonlite_1.8.8 hms_1.1.3
## [53] mitml_0.4-5 Formula_1.2-5 htmlTable_2.4.3 systemfonts_1.1.0
## [57] foreach_1.5.2 jquerylib_0.1.4 rematch_2.0.0 glue_1.7.0
## [61] nloptr_2.1.1 pan_1.9 codetools_0.2-19 stringi_1.8.4
## [65] gtable_0.3.5 shape_1.4.6.1 lme4_1.1-35.5 munsell_0.5.1
## [69] pillar_1.9.0 htmltools_0.5.8.1 R6_2.5.1 textshaping_0.4.0
## [73] evaluate_0.24.0 lattice_0.22-5 highr_0.11 backports_1.5.0
## [77] broom_1.0.6 bslib_0.8.0 Rcpp_1.0.13 gridExtra_2.3
## [81] nlme_3.1-167 checkmate_2.3.2 xfun_0.47 pkgconfig_2.0.3
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")
#OSF
if (F) {
library(osfr)
#login
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/XXX/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(
osf_proj,
path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"),
conflicts = "overwrite"
)
}