pacman::p_load(janitor, 
               tidyverse, 
               WDI, # World bank data
               scales, 
               httr, # to read online xls, 
               ggrepel, 
               countrycode)
theme_set(theme_minimal())

Data

Sources:

Publications: https://www.nature.com/nature-index/country-outputs/generate/all/global

Women in science: https://www.oecd.org/sti/scoreboard.htm?i=TH_WRXRS&v=1&t=9999&s=LVA&r=3

df <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQx0EAzcq52X0nGCMkBjG7xWj8mvMIFaoVFIJynfeVuH4bYMW4Hua2zQmefpAb1mLT7S5ZP07ILmOx4/pub?gid=1662469666&single=true&output=csv")

Countrycodes

Add the continent

df$Region <- countrycode(
  sourcevar = df$Country,
  origin = "country.name",
  destination = "region"
)
df$iso3c <- countrycode(
  sourcevar = df$Country,
  origin = "country.name",
  destination = "iso3c"
)

World bank data

wbd <-
  WDI(
    indicator = c("NY.GDP.PCAP.CD", "SP.POP.TOTL") ,
    country = "all",
    start = 2010,
    end = 2022
  )


wbd$iso3c <- countrycode(
  sourcevar = wbd$iso2c,
  origin = "iso2c",
  destination = "iso3c"
)

wbd <- wbd %>%
  filter(!is.na(iso3c)) %>% 
  rename("GDP per capita (current US$)" = NY.GDP.PCAP.CD, 
         "Population" = SP.POP.TOTL)

head(wbd)
##   iso2c country year GDP per capita (current US$) Population iso3c
## 1    AD Andorra 2010                     40849.76      84454   AND
## 2    AD Andorra 2011                     43333.97      83748   AND
## 3    AD Andorra 2012                     38684.57      82427   AND
## 4    AD Andorra 2013                     39538.36      80770   AND
## 5    AD Andorra 2014                     41302.38      79213   AND
## 6    AD Andorra 2015                     35770.92      77993   AND

Keep only the last datapoint

wbd <- wbd %>%
    group_by(iso3c) %>%
    slice(which.max((year)))

Join

df <- left_join(df, wbd, by = "iso3c")
rm(wbd)
head(df)
## # A tibble: 6 × 11
##   Country        Women…¹ Publi…²  Share Region iso3c iso2c country  year GDP p…³
##   <chr>            <dbl>   <dbl>  <dbl> <chr>  <chr> <chr> <chr>   <int>   <dbl>
## 1 Austria           30.4    1177 356.   Europ… AUT   AT    Austria  2021  53268.
## 2 Belgium           32.6    1131 386.   Europ… BEL   BE    Belgium  2021  51768.
## 3 Chile             34.8     439 114.   Latin… CHL   CL    Chile    2021  16503.
## 4 Costa Rica        45.2      26   2.94 Latin… CRI   CR    Costa …  2021  12509.
## 5 Czech Republic    27.6     721 194    Europ… CZE   CZ    Czech …  2021  26378.
## 6 Denmark           35.3    1197 406.   Europ… DNK   DK    Denmark  2021  67803.
## # … with 1 more variable: Population <dbl>, and abbreviated variable names
## #   ¹​`Women in science`, ²​Publications, ³​`GDP per capita (current US$)`

===================

Dataset ready

df %>%
  ggplot(aes(x = fct_reorder(Country, `Women in science`),
             y = `Women in science`, 
             fill = Region)) +
  geom_col() +
  coord_flip() +
  geom_hline(yintercept = 50,
             linetype = "dashed",
             color = "grey 30") + 
  scale_y_continuous(labels = label_percent(scale = 1)) +
  labs(title = "Percentage women in science, OECD countries", 
       x = "", 
       y = "") + 
  theme(legend.position = "top")

df %>%
  ggplot(aes(
    x = `GDP per capita (current US$)`,
    y = `Women in science`,
    label = iso3c,
    color = Region
  )) +
  geom_point() +
  ylim(0, 60) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  
  # scale_x_log10() +
  coord_trans(x = "log10") +
  scale_x_continuous(labels = label_dollar()) +
  geom_text_repel(size = 3)  +
  labs(title = "Women in Science and GDP",
       subtitle = "OECD Countries",
       x = "GDP (Log scale)")

Impact and publications data from Scimago

 # Source https://www.scimagojr.com/countryrank.php

sci_url <-
  'https://www.scimagojr.com/countryrank.php?out=xls'
scif <- tempfile()
download.file(sci_url, scif, mode = "wb")

sci <- readxl::read_excel(path = scif, sheet = 1)
rm(sci_url, scif)

Add the iso country to join afterward

sci$iso3c <- countrycode(
  sourcevar = sci$Country,
  origin = "country.name",
  destination = "iso3c"
)

Join

df <- left_join(df, sci, by = "iso3c") %>% 
  rename("Country" = "Country.x") %>% 
  select(-Country.y, 
         "Region" = "Region.x", 
         -Region.y, 
         -country)
rm(sci)

Dataset ready

Women and impact

Women and publications

Let’s check the productivity of the countries according to their % of WiS

df %>%
  # mutate(Documents = Documents / Population * 1000) %>% 
  ggplot(aes(x = `Women in science`,
             y = Documents, 
             color = Region, 
             # size = `GDP per capita (current US$)`, 
             label = iso3c)) +
  geom_jitter(alpha = .6) +
  labs(title = "Percentage of WiS and Scientific Documents",
       y = "Documents (log scale)") +
  # scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_size_continuous(labels = scales::label_number()) + 
  # scale_y_log10() +
  scale_y_continuous(labels = scales::label_number()) + 
  coord_trans( y = "log10") + 
  geom_text_repel(size = 3) +
  theme(legend.position = "top")

Visually, it seems that a negative correlation is present. But let’s adjust by the population of the countries

df %>%
  mutate(Documents = Documents / Population * 1000) %>% 
  ggplot(aes(x = `Women in science`,
             y = Documents, 
             color = Region, 
             # size = `GDP per capita (current US$)`, 
             label = iso3c 
             )) +
  geom_jitter(alpha = .6) +
  labs(title = "Percentage of WiS and Scientific Documents",
       subtitle = "Documents adjusted per 1000 inhabitants", 
       y = "Documents") +
  # scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
  scale_x_continuous(labels = label_percent(scale = 1)) + 
  scale_size_continuous(labels = scales::label_number()) + 
  # scale_y_log10() +
  scale_y_continuous(labels = scales::label_number()) + 
  # coord_trans( y = "log10") + 
  geom_text_repel(size = 3) +
  theme(legend.position = "top")

Women and impact

df %>%
  # mutate(Documents = Documents / Population * 1000) %>% 
  ggplot(aes(x = `Women in science`,
             y = `H index`, 
             color = Region, 
             # size = `GDP per capita (current US$)`, 
             label = iso3c )) +
  geom_jitter(alpha = .6) +
  labs(title = "Percentage of WiS and Scientific Impact (H index)",
       y = "H index") +
  # scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_size_continuous(labels = scales::label_number()) + 
  # scale_y_log10() +
  scale_y_continuous(labels = scales::label_number()) + 
  # coord_trans( y = "log10") + 
  geom_text_repel(size = 3) +
  theme(legend.position = "top")

df %>%
  # mutate(Documents = Documents / Population * 1000) %>% 
  ggplot(aes(x = `Women in science`,
             y = `Citations per document`, 
             color = Region, 
             label = iso3c, 
             size = `GDP per capita (current US$)`)) +
  geom_jitter(alpha = .6) +
  labs(title = "Percentage of WiS and Citations per document",
       y = "Citations per document") +
  # scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
  scale_x_continuous(labels = label_percent(scale = 1)) + 
  # scale_y_log10() +
  scale_y_continuous(labels = scales::label_number()) + 
  scale_size_continuous(labels = scales::label_number()) + 
  # coord_trans( y = "log10") + 
  geom_text_repel(size = 3) +
    theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  )

Self citations

df %>% 
  mutate("Self citations ratio" = `Self-citations` / Citations) %>% 
  ggplot(aes(x = `Women in science`, 
             y = `Self citations ratio`, 
              color = Region, 
             label = iso3c)) + 
  geom_jitter() + 
   labs(title = "Percentage of WiS and self citations per document",
       y = "Self citations ratio (Self citations / Total citations)") +
  scale_x_continuous(labels = label_percent(scale = 1)) + 
  geom_text_repel(size = 3) + 
  theme(legend.position = "top") 

Citations per document

df %>% 
  ggplot(aes(x = `Women in science`, 
             y = `Citations per document`, 
              color = Region, 
             label = iso3c)) + 
  geom_jitter() + 
   labs(title = "Percentage of WiS and Citations per document",
       y = "Citations per document") +
  scale_x_continuous(labels = label_percent(scale = 1)) + 
  geom_text_repel(size = 3) + 
  theme(legend.position = "top") 

GDP and citations per document

df %>%
  ggplot(
    aes(
      x = `GDP per capita (current US$)`,
      y = `Citations per document`,
      color = `Women in science`,
      # color = Region,
      label = iso3c
    )
  ) +
  geom_jitter(alpha = .6) +
  labs(title = "GDP per capita and Citations per document",
       y = "Citations per document",
       x = "GDP per capita (current US$), log 10") +
  # scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = scales::label_number()) +
  
  geom_text_repel(size = 3) +
  scale_x_log10(labels = label_number(scale_cut = cut_long_scale())) +
  theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  ) 

GDP and WiS

df %>%
  ggplot(
    aes(
      y = `GDP per capita (current US$)`,
      x = `Women in science`,
      color = `H index`,
      # color = Region,
      label = iso3c
    )
  ) +
  geom_jitter(alpha = .6) +
  labs(title = "GDP per capita and WiS",
       y = "GDP per capita (current US$)",
       x = "Women in science") +
  # scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = scales::label_number()) +
  
  geom_text_repel(size = 3) +
  scale_y_log10(labels = label_number(scale_cut = cut_long_scale())) +
  theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  )

Other analysis

Quantity and citations

df %>% 
  mutate(Citations = Citations -  `Self-citations`) %>% 
    ggplot(
    aes(
      y = Citations, 
      x = `H index`,
      color = `GDP per capita (current US$)`,
      size = log10(`GDP per capita (current US$)`), 
      # color = Region,
      label = iso3c
    )
  ) +
  geom_jitter(alpha = .6) +
  labs(title = "Quantity and impact of publications",
       y = "Citations (excluiding self citations)",
       x = "Citable documents") +
  # scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = scales::label_number()) +
  
  geom_text_repel(size = 3) +
  scale_y_log10(labels = label_number(scale_cut = cut_long_scale())) +
  theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  ) + 
  scale_size_continuous(guide="none") + 
  scale_color_viridis_c(direction = -1, option = "inferno") +
  # scale_colour_viridis() + 
  scale_color_continuous(labels = scales::label_number() ) 

Productivty adjusted by population

df %>%
  ggplot(
    aes(
      x = Population,
      y = `Citable documents`,
      label = iso3c,
      color = Region, 
      size = `GDP per capita (current US$)`
      
      
    )
  ) +
  geom_jitter(alpha = .7) +
  # scale_x_log10() +
  # scale_y_log10() +
  coord_trans(y = "log10", x = "log10") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  geom_text_repel(size = 3)  +
   theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  ) +
  scale_size_continuous(labels = unit_format(unit = "K", scale = 1e-5)) +
  labs(title = "Scientific productity, population and GDP")