pacman::p_load(janitor,
tidyverse,
WDI, # World bank data
scales,
httr, # to read online xls,
ggrepel,
countrycode)
theme_set(theme_minimal())
Sources:
Publications: https://www.nature.com/nature-index/country-outputs/generate/all/global
Women in science: https://www.oecd.org/sti/scoreboard.htm?i=TH_WRXRS&v=1&t=9999&s=LVA&r=3
df <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQx0EAzcq52X0nGCMkBjG7xWj8mvMIFaoVFIJynfeVuH4bYMW4Hua2zQmefpAb1mLT7S5ZP07ILmOx4/pub?gid=1662469666&single=true&output=csv")
Add the continent
df$Region <- countrycode(
sourcevar = df$Country,
origin = "country.name",
destination = "region"
)
df$iso3c <- countrycode(
sourcevar = df$Country,
origin = "country.name",
destination = "iso3c"
)
wbd <-
WDI(
indicator = c("NY.GDP.PCAP.CD", "SP.POP.TOTL") ,
country = "all",
start = 2010,
end = 2022
)
wbd$iso3c <- countrycode(
sourcevar = wbd$iso2c,
origin = "iso2c",
destination = "iso3c"
)
wbd <- wbd %>%
filter(!is.na(iso3c)) %>%
rename("GDP per capita (current US$)" = NY.GDP.PCAP.CD,
"Population" = SP.POP.TOTL)
head(wbd)
## iso2c country year GDP per capita (current US$) Population iso3c
## 1 AD Andorra 2010 40849.76 84454 AND
## 2 AD Andorra 2011 43333.97 83748 AND
## 3 AD Andorra 2012 38684.57 82427 AND
## 4 AD Andorra 2013 39538.36 80770 AND
## 5 AD Andorra 2014 41302.38 79213 AND
## 6 AD Andorra 2015 35770.92 77993 AND
Keep only the last datapoint
wbd <- wbd %>%
group_by(iso3c) %>%
slice(which.max((year)))
df <- left_join(df, wbd, by = "iso3c")
rm(wbd)
head(df)
## # A tibble: 6 × 11
## Country Women…¹ Publi…² Share Region iso3c iso2c country year GDP p…³
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <int> <dbl>
## 1 Austria 30.4 1177 356. Europ… AUT AT Austria 2021 53268.
## 2 Belgium 32.6 1131 386. Europ… BEL BE Belgium 2021 51768.
## 3 Chile 34.8 439 114. Latin… CHL CL Chile 2021 16503.
## 4 Costa Rica 45.2 26 2.94 Latin… CRI CR Costa … 2021 12509.
## 5 Czech Republic 27.6 721 194 Europ… CZE CZ Czech … 2021 26378.
## 6 Denmark 35.3 1197 406. Europ… DNK DK Denmark 2021 67803.
## # … with 1 more variable: Population <dbl>, and abbreviated variable names
## # ¹​`Women in science`, ²​Publications, ³​`GDP per capita (current US$)`
===================
Dataset ready
df %>%
ggplot(aes(x = fct_reorder(Country, `Women in science`),
y = `Women in science`,
fill = Region)) +
geom_col() +
coord_flip() +
geom_hline(yintercept = 50,
linetype = "dashed",
color = "grey 30") +
scale_y_continuous(labels = label_percent(scale = 1)) +
labs(title = "Percentage women in science, OECD countries",
x = "",
y = "") +
theme(legend.position = "top")
df %>%
ggplot(aes(
x = `GDP per capita (current US$)`,
y = `Women in science`,
label = iso3c,
color = Region
)) +
geom_point() +
ylim(0, 60) +
scale_y_continuous(labels = label_percent(scale = 1)) +
# scale_x_log10() +
coord_trans(x = "log10") +
scale_x_continuous(labels = label_dollar()) +
geom_text_repel(size = 3) +
labs(title = "Women in Science and GDP",
subtitle = "OECD Countries",
x = "GDP (Log scale)")
# Source https://www.scimagojr.com/countryrank.php
sci_url <-
'https://www.scimagojr.com/countryrank.php?out=xls'
scif <- tempfile()
download.file(sci_url, scif, mode = "wb")
sci <- readxl::read_excel(path = scif, sheet = 1)
rm(sci_url, scif)
Add the iso country to join afterward
sci$iso3c <- countrycode(
sourcevar = sci$Country,
origin = "country.name",
destination = "iso3c"
)
df <- left_join(df, sci, by = "iso3c") %>%
rename("Country" = "Country.x") %>%
select(-Country.y,
"Region" = "Region.x",
-Region.y,
-country)
rm(sci)
Dataset ready
Let’s check the productivity of the countries according to their % of WiS
df %>%
# mutate(Documents = Documents / Population * 1000) %>%
ggplot(aes(x = `Women in science`,
y = Documents,
color = Region,
# size = `GDP per capita (current US$)`,
label = iso3c)) +
geom_jitter(alpha = .6) +
labs(title = "Percentage of WiS and Scientific Documents",
y = "Documents (log scale)") +
# scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
scale_x_continuous(labels = label_percent(scale = 1)) +
scale_size_continuous(labels = scales::label_number()) +
# scale_y_log10() +
scale_y_continuous(labels = scales::label_number()) +
coord_trans( y = "log10") +
geom_text_repel(size = 3) +
theme(legend.position = "top")
Visually, it seems that a negative correlation is present. But let’s adjust by the population of the countries
df %>%
mutate(Documents = Documents / Population * 1000) %>%
ggplot(aes(x = `Women in science`,
y = Documents,
color = Region,
# size = `GDP per capita (current US$)`,
label = iso3c
)) +
geom_jitter(alpha = .6) +
labs(title = "Percentage of WiS and Scientific Documents",
subtitle = "Documents adjusted per 1000 inhabitants",
y = "Documents") +
# scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
scale_x_continuous(labels = label_percent(scale = 1)) +
scale_size_continuous(labels = scales::label_number()) +
# scale_y_log10() +
scale_y_continuous(labels = scales::label_number()) +
# coord_trans( y = "log10") +
geom_text_repel(size = 3) +
theme(legend.position = "top")
df %>%
# mutate(Documents = Documents / Population * 1000) %>%
ggplot(aes(x = `Women in science`,
y = `H index`,
color = Region,
# size = `GDP per capita (current US$)`,
label = iso3c )) +
geom_jitter(alpha = .6) +
labs(title = "Percentage of WiS and Scientific Impact (H index)",
y = "H index") +
# scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
scale_x_continuous(labels = label_percent(scale = 1)) +
scale_size_continuous(labels = scales::label_number()) +
# scale_y_log10() +
scale_y_continuous(labels = scales::label_number()) +
# coord_trans( y = "log10") +
geom_text_repel(size = 3) +
theme(legend.position = "top")
df %>%
# mutate(Documents = Documents / Population * 1000) %>%
ggplot(aes(x = `Women in science`,
y = `Citations per document`,
color = Region,
label = iso3c,
size = `GDP per capita (current US$)`)) +
geom_jitter(alpha = .6) +
labs(title = "Percentage of WiS and Citations per document",
y = "Citations per document") +
# scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
scale_x_continuous(labels = label_percent(scale = 1)) +
# scale_y_log10() +
scale_y_continuous(labels = scales::label_number()) +
scale_size_continuous(labels = scales::label_number()) +
# coord_trans( y = "log10") +
geom_text_repel(size = 3) +
theme(
legend.position = "top",
legend.box = "vertical",
legend.margin = margin()
)
df %>%
mutate("Self citations ratio" = `Self-citations` / Citations) %>%
ggplot(aes(x = `Women in science`,
y = `Self citations ratio`,
color = Region,
label = iso3c)) +
geom_jitter() +
labs(title = "Percentage of WiS and self citations per document",
y = "Self citations ratio (Self citations / Total citations)") +
scale_x_continuous(labels = label_percent(scale = 1)) +
geom_text_repel(size = 3) +
theme(legend.position = "top")
df %>%
ggplot(aes(x = `Women in science`,
y = `Citations per document`,
color = Region,
label = iso3c)) +
geom_jitter() +
labs(title = "Percentage of WiS and Citations per document",
y = "Citations per document") +
scale_x_continuous(labels = label_percent(scale = 1)) +
geom_text_repel(size = 3) +
theme(legend.position = "top")
df %>%
ggplot(
aes(
x = `GDP per capita (current US$)`,
y = `Citations per document`,
color = `Women in science`,
# color = Region,
label = iso3c
)
) +
geom_jitter(alpha = .6) +
labs(title = "GDP per capita and Citations per document",
y = "Citations per document",
x = "GDP per capita (current US$), log 10") +
# scale_x_continuous(labels = label_percent(scale = 1)) +
scale_x_continuous(labels = scales::label_number()) +
geom_text_repel(size = 3) +
scale_x_log10(labels = label_number(scale_cut = cut_long_scale())) +
theme(
legend.position = "top",
legend.box = "vertical",
legend.margin = margin()
)
df %>%
ggplot(
aes(
y = `GDP per capita (current US$)`,
x = `Women in science`,
color = `H index`,
# color = Region,
label = iso3c
)
) +
geom_jitter(alpha = .6) +
labs(title = "GDP per capita and WiS",
y = "GDP per capita (current US$)",
x = "Women in science") +
# scale_x_continuous(labels = label_percent(scale = 1)) +
scale_x_continuous(labels = scales::label_number()) +
geom_text_repel(size = 3) +
scale_y_log10(labels = label_number(scale_cut = cut_long_scale())) +
theme(
legend.position = "top",
legend.box = "vertical",
legend.margin = margin()
)
df %>%
mutate(Citations = Citations - `Self-citations`) %>%
ggplot(
aes(
y = Citations,
x = `H index`,
color = `GDP per capita (current US$)`,
size = log10(`GDP per capita (current US$)`),
# color = Region,
label = iso3c
)
) +
geom_jitter(alpha = .6) +
labs(title = "Quantity and impact of publications",
y = "Citations (excluiding self citations)",
x = "Citable documents") +
# scale_x_continuous(labels = label_percent(scale = 1)) +
scale_x_continuous(labels = scales::label_number()) +
geom_text_repel(size = 3) +
scale_y_log10(labels = label_number(scale_cut = cut_long_scale())) +
theme(
legend.position = "top",
legend.box = "vertical",
legend.margin = margin()
) +
scale_size_continuous(guide="none") +
scale_color_viridis_c(direction = -1, option = "inferno") +
# scale_colour_viridis() +
scale_color_continuous(labels = scales::label_number() )
df %>%
ggplot(
aes(
x = Population,
y = `Citable documents`,
label = iso3c,
color = Region,
size = `GDP per capita (current US$)`
)
) +
geom_jitter(alpha = .7) +
# scale_x_log10() +
# scale_y_log10() +
coord_trans(y = "log10", x = "log10") +
scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
geom_text_repel(size = 3) +
theme(
legend.position = "top",
legend.box = "vertical",
legend.margin = margin()
) +
scale_size_continuous(labels = unit_format(unit = "K", scale = 1e-5)) +
labs(title = "Scientific productity, population and GDP")