Introduction

This analysis examines the distribution and registration patterns of DOIs (Digital Object Identifiers) across different registration agencies for journals using OJS (JUOJS) - ONLY ACTIVE JOURNALS. We report on: 1. The overlap between DOI.org and Crossref registrations 2. The presence of DataCite DOIs in the Crossref system 3. The significant number of unresolved DOIs from JUOJS journals

Data Loading and Preprocessing

We load all DOIs searched in DOI.org and Crossref through their API. The data was retrieved in batches of 100 DOIs on the same day.

# Load the datasets
DOIMatch <- read.table(
  "C:/Users/dgenk/Documentos Locales/ScholCommLab/OpenAlex Coverage 2/data/DOIMatch.txt",
  header = TRUE,
  sep = "\t"
)
ActiveJournals <- read_delim(
  "C:/Users/dgenk/Documentos Locales/ScholCommLab/OpenAlex Coverage 2/data/globalCoverageData.txt"
)

# Collect every ISSN listed for an active journal (both ISSN columns).
# Missing and blank values are removed on purpose: `%in%` treats NA as
# matching NA, so if either reference column contains an NA, every row with a
# missing issn1/issn2 would otherwise be flagged as active.
active_issns <- unique(c(ActiveJournals$ISSN_OJS, ActiveJournals$ISSN_OJS2.x))
active_issns <- active_issns[!is.na(active_issns) & active_issns != ""]

# Filter for active journals only: a row is kept when either of its ISSNs
# appears in the active set. The 0/1 `active` column is retained because
# later steps filter on `active == 1`.
DOIMatch <- DOIMatch %>%
  mutate(
    active = as.numeric(issn1 %in% active_issns | issn2 %in% active_issns)
  ) %>%
  filter(active == 1)

# Get unique active rows (DOIMatch already holds active journals only, so no
# further filtering on `active` is needed here)
activeDOIs <- unique(DOIMatch)

Overall DOI Statistics

First, we report the total number of DOIs issued by JUOJS. We also report the DOIs that were found in neither DOI.org nor Crossref.

Total DOIs searched (DOIs issued by JUOJS)

# Number of distinct DOIs present in DOIMatch
totalDOIs <- DOIMatch %>%
  distinct(DOI) %>%
  summarise(n = n())

# Number of distinct DOIs flagged (== 1) for at least one registration agency
totalDOIs_DOIOrg <- DOIMatch %>%
  filter(
    DataCite == 1 | Crossref == 1 | mEDRA == 1 |
      JaLC == 1 | ISTIC == 1 | Airiti == 1
  ) %>%
  distinct(DOI) %>%
  summarise(n = n())

# Two-row summary used by both the table and the bar chart
doi_summary <- data.frame(
  Category = c("Total DOIs issued by JUOJS", "Total DOIs in DOI.org"),
  Count = c(totalDOIs[["n"]], totalDOIs_DOIOrg[["n"]])
)

# Render the summary as a styled HTML table
doi_summary %>%
  kable(
    format = "html",
    caption = "Overall DOI Statistics",
    align = c("l", "r")
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "bordered")
  ) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "100px")
Overall DOI Statistics
Category Count
Total DOIs issued by JUOJS 2196697
Total DOIs in DOI.org 1839607
# Bar chart of the summary counts; each bar is labelled with its
# comma-formatted value just above the bar top
ggplot(doi_summary, aes(x = Category, y = Count)) +
  geom_col(fill = "steelblue", width = 0.5) +
  geom_text(aes(label = comma(Count)), vjust = -0.5) +
  labs(
    title = "Overall DOI Statistics",
    x = "",
    y = "Number of DOIs"
  ) +
  theme_minimal() +
  # Tilt the category labels so the long names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Distribution by Registration Agency

Since DOI.org centralises the DOIs issued by different registration agencies, we report here the distribution of DOIs by registration agency according to DOI.org.

# Agency flag columns in DOIMatch (compared against 1 throughout the report)
agency_cols <- c("DataCite", "Crossref", "mEDRA", "JaLC", "ISTIC", "Airiti")

# Count DISTINCT DOIs per registration agency.
# Summing the flags over unique rows would count a DOI once per row it
# appears on, which is inconsistent with the other totals in this report
# (all of which count unique DOIs). `which()` drops rows whose flag is NA, so
# a missing flag can neither match nor turn the total into NA.
totalDOIs_DOIOrgRA <- DOIMatch %>%
  summarise(
    across(all_of(agency_cols), ~ n_distinct(DOI[which(.x == 1)]))
  )

# Reshape to one row per agency for plotting
ra_long <- tidyr::pivot_longer(
  totalDOIs_DOIOrgRA,
  everything(),
  names_to = "Agency",
  values_to = "Count"
)

# Bars ordered from the largest to the smallest agency, labelled with the
# comma-formatted count
ggplot(ra_long, aes(x = reorder(Agency, -Count), y = Count)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = comma(Count)), vjust = -0.5) +
  theme_minimal() +
  labs(
    title = "Distribution of DOIs across Registration Agencies",
    x = "Registration Agency",
    y = "Number of DOIs"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Comparison: DOI.org vs Crossref

We cross-checked the information produced by DOI.org directly against Crossref to understand the extent to which these services are aligned. The results of matching the DOIs identified by DOI.org as registered in Crossref against the actual search in Crossref are provided below.

# Counts feeding the Venn diagram; each is a number of distinct DOIs.

# DOIs with the Crossref flag set
vennDOIORG <- DOIMatch %>%
  filter(Crossref == 1) %>%
  distinct(DOI) %>%
  summarise(n = n())

# DOIs with the CrossrefDirect flag set
vennCrossRef <- DOIMatch %>%
  filter(CrossrefDirect == 1) %>%
  distinct(DOI) %>%
  summarise(n = n())

# DOIs with both flags set
vennIntersection <- DOIMatch %>%
  filter(Crossref == 1 & CrossrefDirect == 1) %>%
  distinct(DOI) %>%
  summarise(n = n())

# Collect the three counts for display
venn_summary <- data.frame(
  Category = c("DOI.org", "Crossref", "Intersection"),
  Count = c(vennDOIORG[["n"]], vennCrossRef[["n"]], vennIntersection[["n"]])
)

# Render as a styled HTML table
venn_summary %>%
  kable(format = "html", caption = "DOI.org vs Crossref Coverage") %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "bordered")
  ) %>%
  column_spec(1, bold = TRUE)
DOI.org vs Crossref Coverage
Category Count
DOI.org 1731463
Crossref 1730164
Intersection 1729984
# Create Venn diagram
grid.newpage()
# draw.pairwise.venn() returns the list of grobs it produced. At top level
# that list is auto-printed as a long "(polygon[GRID...], ...)" line in the
# rendered report; invisible() suppresses only that echo.
invisible(
  draw.pairwise.venn(
    area1 = vennDOIORG$n,
    area2 = vennCrossRef$n,
    cross.area = vennIntersection$n,
    category = c("DOI.org", "Crossref"),
    fill = c("lightblue", "lightgreen"),
    euler.d = TRUE,
    scaled = TRUE
  )
)

## (polygon[GRID.polygon.70], polygon[GRID.polygon.71], polygon[GRID.polygon.72], polygon[GRID.polygon.73], text[GRID.text.74], lines[GRID.lines.75], text[GRID.text.76], lines[GRID.lines.77], text[GRID.text.78], text[GRID.text.79], text[GRID.text.80])

DataCite DOIs in Crossref

We found that part of the difference between DOI.org and Crossref can be attributed to DOIs that DOI.org identifies as DataCite registrations but that were nevertheless found in Crossref.

# Rows flagged as DataCite that also have the CrossrefDirect flag set
datacite_in_crossref <- DOIMatch %>%
  filter(DataCite == 1 & CrossrefDirect == 1)

# Number of DISTINCT DOIs in that overlap. Counting unique rows instead would
# count a DOI once per row it appears on, unlike the other totals in this
# report, which all count unique DOIs.
DataciteCrossref <- datacite_in_crossref %>%
  distinct(DOI) %>%
  count()

# Sample of DataCite DOIs found in Crossref.
# De-duplicate on the displayed columns so the table cannot show the same
# line twice. slice_sample() already returns at most 10 rows.
# NOTE(review): the sample is random and no seed is set in this block, so the
# table changes between runs.
DataciteCrossrefSample <- datacite_in_crossref %>%
  distinct(DOI, issn1, issn2) %>%
  slice_sample(n = 10)

# Display sample table
kable(DataciteCrossrefSample,
      caption = "Sample of DOIs registered as DataCite but found in Crossref",
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "bordered"),
                full_width = FALSE) %>%
  column_spec(1, width = "300px") %>%
  column_spec(2:3, width = "100px")
Sample of DOIs registered as DataCite but found in Crossref
DOI issn1 issn2
10.14464/zsem.v38i3-4.639 0170-6241 2625-4328
10.14297/jpaap.v8i1.422 2051-9788 NA
10.14297/jpaap.v8i2.439 2051-9788 NA
10.21935/tls.v3i1.122 2512-4587 NA
10.14464/innotrac.v1i0.450 2701-3693 NA
10.14297/jpaap.v9i1.504 2051-9788 NA
10.14464/ess.v7i1.472 1869-5213 NA
10.21935/tls.v3i1.125 2512-4587 NA
10.14297/jpaap.v9i2.488 2051-9788 NA
10.14464/ess.v10i7.630 1869-5213 NA
# Single-bar chart showing how many DataCite DOIs were found in Crossref
overlap_data <- data.frame(
  Category = "DataCite DOIs in Crossref",
  Count = DataciteCrossref[["n"]]
)

ggplot(overlap_data, aes(x = Category, y = Count)) +
  geom_col(fill = "lightgreen", width = 0.5) +
  # Comma-formatted count just above the bar
  geom_text(aes(label = comma(Count)), vjust = -0.5) +
  labs(
    title = "Number of DataCite DOIs found in Crossref",
    x = "",
    y = "Number of DOIs"
  ) +
  theme_minimal()

Unresolved DOIs

Regarding the resolution of DOIs for JUOJS journals, we found a significant number of DOIs that were found in neither DOI.org nor Crossref. More work is needed to understand the reasons.

# DOIs from active journals for which every agency flag and the
# CrossrefDirect flag equal 0 (filter() combines its conditions with AND)
DOIsInvalid <- DOIMatch %>%
  filter(
    DataCite == 0, Crossref == 0, mEDRA == 0,
    JaLC == 0, ISTIC == 0, Airiti == 0,
    CrossrefDirect == 0,
    active == 1
  ) %>%
  select(DOI) %>%
  unique()

# Random sample of up to 10 unresolved DOIs for display
DOIsInvalidSample <- slice_sample(DOIsInvalid, n = 10)

# Render the sample as a styled HTML table
DOIsInvalidSample %>%
  kable(format = "html", caption = "Sample of Unresolved DOIs") %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "bordered")
  ) %>%
  column_spec(1, width = "300px")
Sample of Unresolved DOIs
DOI
10.37476/akmen.v17i1.853
10.51852/-.v4i1.269
10.18686/hdjy.v2i25.4441
10.2478/maslo.v73i4.6945
10.47750/pnr.2022.13.s10.187
10.37134/jvt.vol4.1.7.2023
10.62504/c21y0t03
10.29300/lughah.v12i1.8017
10.21067/smartics.v7i1.5046
10.12345/jxffcxysj.v4i21.9397
# Create visualization for unresolved DOIs
unresolved_data <- data.frame(
  Category = c("Total DOIs", "Unresolved DOIs"),
  Count = c(totalDOIs$n, nrow(DOIsInvalid))
)

# Colours are keyed by category name. Passing a bare colour vector to `fill`
# assigns colours by row position, which silently mismatches if the rows or
# the axis order ever change.
bar_colours <- c("Total DOIs" = "steelblue", "Unresolved DOIs" = "darkred")

ggplot(unresolved_data, aes(x = Category, y = Count)) +
  # Legend is hidden: the x axis already names each bar
  geom_col(aes(fill = Category), width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = comma(Count)), vjust = -0.5) +
  scale_fill_manual(values = bar_colours) +
  theme_minimal() +
  labs(title = "Comparison of Total vs Unresolved DOIs",
       x = "",
       y = "Number of DOIs")