pacman::p_load(janitor, 
               tidyverse, 
               WDI, # World bank data
               scales, 
               httr, # to read online xls, 
               ggrepel, 
               countrycode)

theme_set(theme_minimal())

Data

Sources:

Publications: https://www.nature.com/nature-index/country-outputs/generate/all/global

Women in science: https://www.oecd.org/sti/scoreboard.htm?i=TH_WRXRS&v=1&t=9999&s=LVA&r=3

df <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQx0EAzcq52X0nGCMkBjG7xWj8mvMIFaoVFIJynfeVuH4bYMW4Hua2zQmefpAb1mLT7S5ZP07ILmOx4/pub?gid=1662469666&single=true&output=csv")

Countrycodes

Add the continent

df$Region <- countrycode(
  sourcevar = df$Country,
  origin = "country.name",
  destination = "region"
)

df$iso3c <- countrycode(
  sourcevar = df$Country,
  origin = "country.name",
  destination = "iso3c"
)

World bank data

wbd <-
  WDI(
    indicator = c("NY.GDP.PCAP.CD", "SP.POP.TOTL") ,
    country = "all",
    start = 2010,
    end = 2022
  )


wbd$iso3c <- countrycode(
  sourcevar = wbd$iso2c,
  origin = "iso2c",
  destination = "iso3c"
)

wbd <- wbd %>%
  filter(!is.na(iso3c)) %>% 
  rename("GDP per capita (current US$)" = NY.GDP.PCAP.CD, 
         "Population" = SP.POP.TOTL)

head(wbd)

##   iso2c country year GDP per capita (current US$) Population iso3c
## 1    AD Andorra 2010                     40849.76      84454   AND
## 2    AD Andorra 2011                     43333.97      83748   AND
## 3    AD Andorra 2012                     38684.57      82427   AND
## 4    AD Andorra 2013                     39538.36      80770   AND
## 5    AD Andorra 2014                     41302.38      79213   AND
## 6    AD Andorra 2015                     35770.92      77993   AND

Keep only the last datapoint

wbd <- wbd %>%
    group_by(iso3c) %>%
    slice(which.max((year)))

Join

df <- left_join(df, wbd, by = "iso3c")
rm(wbd)

head(df)

## # A tibble: 6 × 11
##   Country        Women…¹ Publi…²  Share Region iso3c iso2c country  year GDP p…³
##   <chr>            <dbl>   <dbl>  <dbl> <chr>  <chr> <chr> <chr>   <int>   <dbl>
## 1 Austria           30.4    1177 356.   Europ… AUT   AT    Austria  2021  53268.
## 2 Belgium           32.6    1131 386.   Europ… BEL   BE    Belgium  2021  51768.
## 3 Chile             34.8     439 114.   Latin… CHL   CL    Chile    2021  16503.
## 4 Costa Rica        45.2      26   2.94 Latin… CRI   CR    Costa …  2021  12509.
## 5 Czech Republic    27.6     721 194    Europ… CZE   CZ    Czech …  2021  26378.
## 6 Denmark           35.3    1197 406.   Europ… DNK   DK    Denmark  2021  67803.
## # … with 1 more variable: Population <dbl>, and abbreviated variable names
## #   ¹`Women in science`, ²Publications, ³`GDP per capita (current US$)`

===================

Dataset ready

df %>%
  ggplot(aes(x = fct_reorder(Country, `Women in science`),
             y = `Women in science`, 
             fill = Region)) +
  geom_col() +
  coord_flip() +
  geom_hline(yintercept = 50,
             linetype = "dashed",
             color = "grey 30") + 
  scale_y_continuous(labels = label_percent(scale = 1)) +
  labs(title = "Percentage women in science, OECD countries", 
       x = "", 
       y = "") + 
  theme(legend.position = "top")

df %>%
  ggplot(aes(
    x = `GDP per capita (current US$)`,
    y = `Women in science`,
    label = iso3c,
    color = Region
  )) +
  geom_point() +
  ylim(0, 60) +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  
  # scale_x_log10() +
  coord_trans(x = "log10") +
  scale_x_continuous(labels = label_dollar()) +
  geom_text_repel(size = 3)  +
  labs(title = "Women in Science and GDP",
       subtitle = "OECD Countries",
       x = "GDP (Log scale)")

Impact and publications data from Scimago

 # Source https://www.scimagojr.com/countryrank.php

sci_url <-
  'https://www.scimagojr.com/countryrank.php?out=xls'
scif <- tempfile()
download.file(sci_url, scif, mode = "wb")

sci <- readxl::read_excel(path = scif, sheet = 1)

rm(sci_url, scif)

Add the iso country to join afterward

sci$iso3c <- countrycode(
  sourcevar = sci$Country,
  origin = "country.name",
  destination = "iso3c"
)

Join

df <- left_join(df, sci, by = "iso3c") %>% 
  rename("Country" = "Country.x") %>% 
  select(-Country.y, 
         "Region" = "Region.x", 
         -Region.y, 
         -country)

rm(sci)

Dataset ready

Women and impact

Women and publications

Let’s check the productivity of the countries according to their % of WiS

df %>%
  # mutate(Documents = Documents / Population * 1000) %>% 
  ggplot(aes(x = `Women in science`,
             y = Documents, 
             color = Region, 
             # size = `GDP per capita (current US$)`, 
             label = iso3c)) +
  geom_jitter(alpha = .6) +
  labs(title = "Percentage of WiS and Scientific Documents",
       y = "Documents (log scale)") +
  # scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_size_continuous(labels = scales::label_number()) + 
  # scale_y_log10() +
  scale_y_continuous(labels = scales::label_number()) + 
  coord_trans( y = "log10") + 
  geom_text_repel(size = 3) +
  theme(legend.position = "top")

Visually, it seems that a negative correlation is present. But let’s adjust by the population of the countries

df %>%
  mutate(Documents = Documents / Population * 1000) %>% 
  ggplot(aes(x = `Women in science`,
             y = Documents, 
             color = Region, 
             # size = `GDP per capita (current US$)`, 
             label = iso3c 
             )) +
  geom_jitter(alpha = .6) +
  labs(title = "Percentage of WiS and Scientific Documents",
       subtitle = "Documents adjusted per 1000 inhabitants", 
       y = "Documents") +
  # scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
  scale_x_continuous(labels = label_percent(scale = 1)) + 
  scale_size_continuous(labels = scales::label_number()) + 
  # scale_y_log10() +
  scale_y_continuous(labels = scales::label_number()) + 
  # coord_trans( y = "log10") + 
  geom_text_repel(size = 3) +
  theme(legend.position = "top")

Women and impact

df %>%
  # mutate(Documents = Documents / Population * 1000) %>% 
  ggplot(aes(x = `Women in science`,
             y = `H index`, 
             color = Region, 
             # size = `GDP per capita (current US$)`, 
             label = iso3c )) +
  geom_jitter(alpha = .6) +
  labs(title = "Percentage of WiS and Scientific Impact (H index)",
       y = "H index") +
  # scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
  scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_size_continuous(labels = scales::label_number()) + 
  # scale_y_log10() +
  scale_y_continuous(labels = scales::label_number()) + 
  # coord_trans( y = "log10") + 
  geom_text_repel(size = 3) +
  theme(legend.position = "top")

df %>%
  # mutate(Documents = Documents / Population * 1000) %>% 
  ggplot(aes(x = `Women in science`,
             y = `Citations per document`, 
             color = Region, 
             label = iso3c, 
             size = `GDP per capita (current US$)`)) +
  geom_jitter(alpha = .6) +
  labs(title = "Percentage of WiS and Citations per document",
       y = "Citations per document") +
  # scale_y_continuous(labels = scales::label_number(scale = 1 / 1000000)) +
  scale_x_continuous(labels = label_percent(scale = 1)) + 
  # scale_y_log10() +
  scale_y_continuous(labels = scales::label_number()) + 
  scale_size_continuous(labels = scales::label_number()) + 
  # coord_trans( y = "log10") + 
  geom_text_repel(size = 3) +
    theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  )

Self citations

df %>% 
  mutate("Self citations ratio" = `Self-citations` / Citations) %>% 
  ggplot(aes(x = `Women in science`, 
             y = `Self citations ratio`, 
              color = Region, 
             label = iso3c)) + 
  geom_jitter() + 
   labs(title = "Percentage of WiS and self citations per document",
       y = "Self citations ratio (Self citations / Total citations)") +
  scale_x_continuous(labels = label_percent(scale = 1)) + 
  geom_text_repel(size = 3) + 
  theme(legend.position = "top")

Citations per document

df %>% 
  ggplot(aes(x = `Women in science`, 
             y = `Citations per document`, 
              color = Region, 
             label = iso3c)) + 
  geom_jitter() + 
   labs(title = "Percentage of WiS and Citations per document",
       y = "Citations per document") +
  scale_x_continuous(labels = label_percent(scale = 1)) + 
  geom_text_repel(size = 3) + 
  theme(legend.position = "top")

GDP and citations per document

df %>%
  ggplot(
    aes(
      x = `GDP per capita (current US$)`,
      y = `Citations per document`,
      color = `Women in science`,
      # color = Region,
      label = iso3c
    )
  ) +
  geom_jitter(alpha = .6) +
  labs(title = "GDP per capita and Citations per document",
       y = "Citations per document",
       x = "GDP per capita (current US$), log 10") +
  # scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = scales::label_number()) +
  
  geom_text_repel(size = 3) +
  scale_x_log10(labels = label_number(scale_cut = cut_long_scale())) +
  theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  )

GDP and WiS

df %>%
  ggplot(
    aes(
      y = `GDP per capita (current US$)`,
      x = `Women in science`,
      color = `H index`,
      # color = Region,
      label = iso3c
    )
  ) +
  geom_jitter(alpha = .6) +
  labs(title = "GDP per capita and WiS",
       y = "GDP per capita (current US$)",
       x = "Women in science") +
  # scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = scales::label_number()) +
  
  geom_text_repel(size = 3) +
  scale_y_log10(labels = label_number(scale_cut = cut_long_scale())) +
  theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  )

Other analysis

Quantity and citations

df %>% 
  mutate(Citations = Citations -  `Self-citations`) %>% 
    ggplot(
    aes(
      y = Citations, 
      x = `H index`,
      color = `GDP per capita (current US$)`,
      size = log10(`GDP per capita (current US$)`), 
      # color = Region,
      label = iso3c
    )
  ) +
  geom_jitter(alpha = .6) +
  labs(title = "Quantity and impact of publications",
       y = "Citations (excluiding self citations)",
       x = "Citable documents") +
  # scale_x_continuous(labels = label_percent(scale = 1)) +
  scale_x_continuous(labels = scales::label_number()) +
  
  geom_text_repel(size = 3) +
  scale_y_log10(labels = label_number(scale_cut = cut_long_scale())) +
  theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  ) + 
  scale_size_continuous(guide="none") + 
  scale_color_viridis_c(direction = -1, option = "inferno") +
  # scale_colour_viridis() + 
  scale_color_continuous(labels = scales::label_number() )

Productivty adjusted by population

df %>%
  ggplot(
    aes(
      x = Population,
      y = `Citable documents`,
      label = iso3c,
      color = Region, 
      size = `GDP per capita (current US$)`
      
      
    )
  ) +
  geom_jitter(alpha = .7) +
  # scale_x_log10() +
  # scale_y_log10() +
  coord_trans(y = "log10", x = "log10") +
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  geom_text_repel(size = 3)  +
   theme(
    legend.position = "top",
    legend.box = "vertical",
    legend.margin = margin()
  ) +
  scale_size_continuous(labels = unit_format(unit = "K", scale = 1e-5)) +
  labs(title = "Scientific productity, population and GDP")

Women in Science

Sergio Uribe

16 September, 2022

Data

Countrycodes

World bank data

Join

Impact and publications data from Scimago

Join

Women and impact

Women and publications

Women and impact

Self citations

Citations per document

GDP and citations per document

GDP and WiS

Other analysis

Quantity and citations

Productivty adjusted by population