Introduction

This analysis explores the coverage of Digital Object Identifiers (DOIs) in scientific journals across different countries, income groups, and continents. We’ll visualize various aspects of DOI coverage and investigate potential patterns or disparities.

Load Required Libraries and Data

First, we’ll load the necessary R libraries and import our dataset.

library(ggplot2)
library(dplyr)

# Import the journal coverage table: a tab-separated text file whose first
# row holds the column names.
data <- read.table(
  file = "C:\\Users\\dgenk\\Documentos Locales\\ScholCommLab\\OpenAlex Coverage\\data\\globalCoverageData.txt",
  header = TRUE,
  sep = "\t"
)

DOI Presence Analysis

Let’s start by analyzing the presence of DOIs in journals.

# Classify every journal by whether it has at least one DOI, then count the
# journals in each class.
# A missing DOI count (NA) is explicitly treated as "no DOIs": `DOIs > 0`
# evaluates to NA for such rows, which would otherwise put them in a separate
# NA group instead of the "DOIs = 0 or null" class.
doi_counts <- data %>%
  mutate(
    DOI_Status = if_else(
      !is.na(DOIs) & DOIs > 0,
      "DOIs > 0",
      "DOIs = 0 or null"
    )
  ) %>%
  group_by(DOI_Status) %>%
  summarise(Journal_Count = n())

# Bar chart of the two journal counts; geom_col() takes the bar heights
# directly from Journal_Count.
ggplot(
  doi_counts,
  aes(x = DOI_Status, y = Journal_Count, fill = DOI_Status)
) +
  geom_col() +
  labs(
    title = "Number of Journals with DOIs > 0 vs DOIs = 0 or Null",
    x = "DOI Status",
    y = "Number of Journals"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c("DOIs > 0" = "blue", "DOIs = 0 or null" = "red")
  )

This chart shows the distribution of journals with and without DOIs.

DOI Match Percentage Analysis

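Now, let’s analyze, for journals that have at least one DOI, the percentage of their DOIs that are matched. Journals without DOIs are dropped from the dataset at this step, so all later sections cover only journals with DOIs.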

# Keep only journals with at least one DOI and compute the share of their
# DOIs that were matched, as a percentage. Filtering first avoids dividing by
# zero for rows that are discarded anyway.
# NOTE: this overwrites `data`, so every later step sees only these journals.
data <- data %>%
  filter(DOIs > 0) %>%
  mutate(percentage_matched = (matchedDOIs / DOIs) * 100)

# Histogram of number of journals by increasing percentage of DOI match.
# `boundary = 0` aligns the bin edges with 0, 5, 10, ...; without it ggplot2
# centres the bins on multiples of the bin width, so the outermost bins would
# span -2.5..2.5 and 97.5..102.5 and misrepresent the 0 and 100 ends.
ggplot(data, aes(x = percentage_matched)) +
  geom_histogram(
    binwidth = 5,
    boundary = 0,
    fill = "green",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Journals by Percentage of DOI Match",
    x = "Percentage of DOI Match",
    y = "Number of Journals"
  ) +
  theme_minimal()

This histogram shows the distribution of journals based on the percentage of their DOIs that are matched.

DOI Coverage by Country

Let’s examine DOI coverage across different countries.

# Bin each journal's match percentage into a coverage category.
# Rows whose percentage cannot be compared (NA/NaN) fall through to "Unknown".
# The result is stored as a factor with the levels in low-to-high order:
# a plain character column is ordered alphabetically when plotted, which puts
# ">90%" out of sequence in the legends and bar stacks (where exactly depends
# on the locale's collation).
data <- data %>%
  mutate(
    DOI_Category = case_when(
      percentage_matched <= 10 ~ "0-10%",
      percentage_matched > 10 & percentage_matched <= 50 ~ "11-50%",
      percentage_matched > 50 & percentage_matched <= 90 ~ "51-90%",
      percentage_matched > 90 ~ ">90%",
      TRUE ~ "Unknown"
    ),
    DOI_Category = factor(
      DOI_Category,
      levels = c("0-10%", "11-50%", "51-90%", ">90%", "Unknown")
    )
  )

# Journals per (country, DOI match category) pair.
country_distribution <- group_by(data, country_consolidated, DOI_Category)
country_distribution <- summarise(
  country_distribution,
  Journal_Count = n(),
  .groups = "drop"
)

# Journals per country, summed over all categories.
total_journals <- group_by(country_distribution, country_consolidated)
total_journals <- summarise(
  total_journals,
  Total_Count = sum(Journal_Count),
  .groups = "drop"
)

# Attach each country's total, then express every category count as a
# percentage of that total.
country_distribution <- left_join(
  country_distribution,
  total_journals,
  by = "country_consolidated"
)
country_distribution <- mutate(
  country_distribution,
  Percentage = Journal_Count / Total_Count * 100
)

# For each DOI coverage category, keep the three countries with the largest
# share of their journals in that category.
# slice_max() replaces the superseded top_n(). Like top_n() it keeps ties, so
# a category can contribute more than three rows when several countries share
# the same percentage. Rows come back sorted by descending Percentage within
# each category.
top_countries <- country_distribution %>%
  group_by(DOI_Category) %>%
  slice_max(Percentage, n = 3, with_ties = TRUE) %>%
  ungroup()

# Stacked bars for the selected countries, one bar per country with segments
# filled by DOI coverage category. Coordinates are flipped so the country
# names run along the vertical axis.
ggplot(
  top_countries,
  aes(
    x = reorder(country_consolidated, -Percentage),
    y = Percentage,
    fill = DOI_Category
  )
) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top Countries by Percentage of Journals in DOI Coverage Categories",
    x = "Country",
    y = "Percentage of Journals",
    fill = "DOI Coverage Category"
  )

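This chart shows, for each DOI coverage category, the three countries with the largest share of their journals in that category (ties are included, so a category may show more than three countries).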

DOI Coverage by Income Group

Now, let’s analyze DOI coverage across different income groups.

# Journals per (income group, DOI match category) pair.
income_distribution <- group_by(data, INCOME_GROUP, DOI_Category)
income_distribution <- summarise(
  income_distribution,
  Journal_Count = n(),
  .groups = "drop"
)

# Journals per income group, summed over all categories.
total_journals_income <- group_by(income_distribution, INCOME_GROUP)
total_journals_income <- summarise(
  total_journals_income,
  Total_Count = sum(Journal_Count),
  .groups = "drop"
)

# Attach each income group's total, then express every category count as a
# percentage of that total.
income_distribution <- left_join(
  income_distribution,
  total_journals_income,
  by = "INCOME_GROUP"
)
income_distribution <- mutate(
  income_distribution,
  Percentage = Journal_Count / Total_Count * 100
)

# Stacked bars, one per income group, with segments filled by DOI coverage
# category; x-axis labels are tilted 45 degrees.
ggplot(
  income_distribution,
  aes(x = INCOME_GROUP, y = Percentage, fill = DOI_Category)
) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Percentage of Journals by DOI Coverage Category by Income Level",
    x = "Income Level",
    y = "Percentage of Journals",
    fill = "DOI Coverage Category"
  )

This chart illustrates the distribution of DOI coverage categories across different income groups.

DOI Coverage by Continent

Let’s examine DOI coverage across different continents.

# Journals per (REGION, DOI match category) pair. The REGION column is used
# as the continent grouping throughout this section.
CONTINENT_distribution <- group_by(data, REGION, DOI_Category)
CONTINENT_distribution <- summarise(
  CONTINENT_distribution,
  Journal_Count = n(),
  .groups = "drop"
)

# Journals per REGION, summed over all categories.
total_journals_CONTINENT <- group_by(CONTINENT_distribution, REGION)
total_journals_CONTINENT <- summarise(
  total_journals_CONTINENT,
  Total_Count = sum(Journal_Count),
  .groups = "drop"
)

# Attach each REGION's total, then express every category count as a
# percentage of that total.
CONTINENT_distribution <- left_join(
  CONTINENT_distribution,
  total_journals_CONTINENT,
  by = "REGION"
)
CONTINENT_distribution <- mutate(
  CONTINENT_distribution,
  Percentage = Journal_Count / Total_Count * 100
)

# Stacked bars, one per REGION value, with segments filled by DOI coverage
# category; x-axis labels are tilted 45 degrees.
ggplot(
  CONTINENT_distribution,
  aes(x = REGION, y = Percentage, fill = DOI_Category)
) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Percentage of Journals by DOI Coverage Category by CONTINENT",
    x = "CONTINENT",
    y = "Percentage of Journals",
    fill = "DOI Coverage Category"
  )

This chart shows the distribution of DOI coverage categories across different continents.

Relationship Between Number of DOIs and Matched DOIs

Let’s visualize the relationship between the number of DOIs and the number of matched DOIs.

# One point per journal: total DOIs on the x-axis, matched DOIs on the
# y-axis, overlaid with a straight-line (lm) fit drawn without its
# confidence band.
ggplot(data = data, mapping = aes(x = DOIs, y = matchedDOIs)) +
  geom_point() +
  geom_smooth(color = "green", se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Number of DOIs vs Number of DOIs Matched",
    x = "Number of DOIs",
    y = "Number of DOIs Matched"
  )
## `geom_smooth()` using formula = 'y ~ x'

This scatter plot shows the relationship between the total number of DOIs and the number of matched DOIs for each journal.

Summary Statistics

Finally, let’s calculate some summary statistics to get an overall picture of DOI coverage.

# Headline DOI coverage figures over the journals currently in `data`:
# journal count, DOI totals, and the mean per-journal match percentage
# (missing values are ignored in the sums and the mean).
summary_stats <- summarise(
  data,
  total_journals = n(),
  total_dois = sum(DOIs, na.rm = TRUE),
  total_matched_dois = sum(matchedDOIs, na.rm = TRUE),
  avg_percentage_matched = mean(percentage_matched, na.rm = TRUE)
)

print(summary_stats)
##   total_journals total_dois total_matched_dois avg_percentage_matched
## 1          25293    2156125            1778607               80.52436