This analysis examines the distribution and registration patterns of DOIs (Digital Object Identifiers) across different registration agencies for journals using OJS (JUOJS), restricted to active journals only. We report: (1) the overlap between DOI.org and Crossref registrations; (2) the presence of DataCite DOIs in the Crossref system; and (3) a significant number of unresolved DOIs from JUOJS journals.
We load all DOIs that were looked up in DOI.org and Crossref through their APIs. The data were retrieved in batches of 100 DOIs on the same day.
# Load the datasets
# NOTE(review): both paths are absolute and machine-specific; they are kept
# unchanged here, but a project-relative path would let the report render
# on other machines.
DOIMatch <- read.table("C:/Users/dgenk/Documentos Locales/ScholCommLab/OpenAlex Coverage 2/data/DOIMatch.txt", header = TRUE, sep = "\t")
ActiveJournals <- read_delim("C:/Users/dgenk/Documentos Locales/ScholCommLab/OpenAlex Coverage 2/data/globalCoverageData.txt")

# Pool both ISSN columns of the active-journal list and drop missing/empty
# entries. `%in%` treats NA as a matchable value (`NA %in% c("x", NA)` is
# TRUE), so without this step a single NA in either ActiveJournals column
# would flag every DOI with a missing issn1/issn2 as belonging to an active
# journal.
active_issns <- c(
  as.character(ActiveJournals$ISSN_OJS),
  as.character(ActiveJournals$ISSN_OJS2.x)
)
active_issns <- active_issns[!is.na(active_issns) & active_issns != ""]

# Filter for active journals only: a row is kept when either of its ISSNs
# appears in either ISSN column of the active-journal list.
DOIMatch <- DOIMatch %>%
  mutate(
    active = ifelse(issn1 %in% active_issns | issn2 %in% active_issns, 1, 0)
  ) %>%
  filter(active == 1)

# Get unique active DOIs (full-row de-duplication of the filtered table)
activeDOIs <- DOIMatch %>%
  filter(active == 1) %>%
  unique()
First, we provide the total number of DOIs issued by JUOJS. We also report the DOIs that were found in neither DOI.org nor Crossref.
# Number of distinct DOIs in the (already filtered) DOIMatch table.
# The result is a one-row data frame whose count is in column `n`.
totalDOIs <- DOIMatch %>%
  distinct(DOI) %>%
  count()

# Number of distinct DOIs flagged for at least one registration agency
# (DataCite, Crossref, mEDRA, JaLC, ISTIC or Airiti).
totalDOIs_DOIOrg <- DOIMatch %>%
  filter(
    DataCite == 1 | Crossref == 1 | mEDRA == 1 |
      JaLC == 1 | ISTIC == 1 | Airiti == 1
  ) %>%
  distinct(DOI) %>%
  count()

# Create data frame for visualization
doi_summary <- data.frame(
  Category = c("Total DOIs issued by JUOJS", "Total DOIs in DOI.org"),
  Count = c(totalDOIs[["n"]], totalDOIs_DOIOrg[["n"]])
)
# Render the two headline counts as a styled HTML table:
# left-aligned bold category labels, right-aligned fixed-width counts.
doi_summary %>%
  kable(
    caption = "Overall DOI Statistics",
    format = "html",
    align = c("l", "r")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = FALSE
  ) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "100px")
Category | Count |
---|---|
Total DOIs issued by JUOJS | 2196697 |
Total DOIs in DOI.org | 1839607 |
# Bar chart of the headline counts; each bar is labelled with its
# comma-formatted value just above the bar.
ggplot(doi_summary, aes(x = Category, y = Count)) +
  geom_col(fill = "steelblue", width = 0.5) +
  geom_text(aes(label = comma(Count)), vjust = -0.5) +
  labs(
    title = "Overall DOI Statistics",
    x = "",
    y = "Number of DOIs"
  ) +
  theme_minimal() +
  # Tilt the long category labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Since DOI.org centralises the DOIs issued by different registration agencies, we report here the distribution of DOIs by registration agency according to DOI.org.
# Number of DOIs per registration agency, among DOIs flagged for at least
# one agency.
#
# De-duplication is done on the DOI together with its six agency flags, not
# on whole rows: a DOI repeated in rows that differ only in other columns
# (e.g. ISSNs) would otherwise be summed once per row, which is inconsistent
# with the per-DOI counts computed elsewhere in this report.
totalDOIs_DOIOrgRA <- DOIMatch %>%
  filter(
    DataCite == 1 | Crossref == 1 | mEDRA == 1 |
      JaLC == 1 | ISTIC == 1 | Airiti == 1
  ) %>%
  distinct(DOI, DataCite, Crossref, mEDRA, JaLC, ISTIC, Airiti) %>%
  summarise(
    # na.rm guards against a missing flag turning a whole agency total into NA
    DataCite = sum(DataCite, na.rm = TRUE),
    Crossref = sum(Crossref, na.rm = TRUE),
    mEDRA = sum(mEDRA, na.rm = TRUE),
    JaLC = sum(JaLC, na.rm = TRUE),
    ISTIC = sum(ISTIC, na.rm = TRUE),
    Airiti = sum(Airiti, na.rm = TRUE)
  )
# Reshape the one-row agency summary to long form (one row per agency)
# so it can be plotted.
ra_long <- totalDOIs_DOIOrgRA %>%
  tidyr::pivot_longer(
    cols = everything(),
    names_to = "Agency",
    values_to = "Count"
  )

# Bar chart of agencies, ordered from the largest to the smallest count
ggplot(ra_long, aes(x = reorder(Agency, -Count), y = Count)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = comma(Count)), vjust = -0.5) +
  labs(
    title = "Distribution of DOIs across Registration Agencies",
    x = "Registration Agency",
    y = "Number of DOIs"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
We cross-checked the information provided by DOI.org directly against Crossref to understand the extent to which these services are aligned. The results of the match between the DOIs that DOI.org identifies as registered with Crossref and the direct search in Crossref are provided below.
# Calculate counts for Venn diagram.
# Each object is a one-row data frame with the distinct-DOI count in `n`.

# DOIs flagged as Crossref (`Crossref == 1`)
vennDOIORG <- DOIMatch %>%
  filter(Crossref == 1) %>%
  distinct(DOI) %>%
  count()

# DOIs flagged by the direct Crossref lookup (`CrossrefDirect == 1`)
vennCrossRef <- DOIMatch %>%
  filter(CrossrefDirect == 1) %>%
  distinct(DOI) %>%
  count()

# DOIs flagged by both
vennIntersection <- DOIMatch %>%
  filter(CrossrefDirect == 1 & Crossref == 1) %>%
  distinct(DOI) %>%
  count()

venn_summary <- data.frame(
  Category = c("DOI.org", "Crossref", "Intersection"),
  Count = c(
    vennDOIORG[["n"]],
    vennCrossRef[["n"]],
    vennIntersection[["n"]]
  )
)
# Styled HTML table of the DOI.org / Crossref / intersection counts
venn_summary %>%
  kable(
    caption = "DOI.org vs Crossref Coverage",
    format = "html"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = FALSE
  ) %>%
  column_spec(1, bold = TRUE)
Category | Count |
---|---|
DOI.org | 1731463 |
Crossref | 1730164 |
Intersection | 1729984 |
# Create Venn diagram of DOI.org vs. direct Crossref coverage.
# Start on a fresh grid page so the diagram is not drawn over earlier output.
grid.newpage()

# draw.pairwise.venn() draws the diagram and also returns the list of grid
# objects it created. At top level that return value is auto-printed as
# "(polygon[GRID.polygon...], ...)" noise in the rendered report, so it is
# wrapped in invisible(); the drawing itself is unaffected.
invisible(
  draw.pairwise.venn(
    area1 = vennDOIORG$n,
    area2 = vennCrossRef$n,
    cross.area = vennIntersection$n,
    category = c("DOI.org", "Crossref"),
    fill = c("lightblue", "lightgreen"),
    euler.d = TRUE,
    scaled = TRUE
  )
)
## (polygon[GRID.polygon.70], polygon[GRID.polygon.71], polygon[GRID.polygon.72], polygon[GRID.polygon.73], text[GRID.text.74], lines[GRID.lines.75], text[GRID.text.76], lines[GRID.lines.77], text[GRID.text.78], text[GRID.text.79], text[GRID.text.80])
We found that part of the difference between DOI.org and Crossref can be attributed to some DOIs being identified by DOI.org as DataCite. However, these DOIs were found in Crossref.
# Number of distinct DOIs flagged as DataCite that were also found by the
# direct Crossref lookup. Counting distinct DOIs (rather than distinct full
# rows) keeps this figure consistent with the other per-DOI counts in the
# report. The result is a one-row data frame with the count in `n`.
DataciteCrossref <- DOIMatch %>%
  filter(DataCite == 1 & CrossrefDirect == 1) %>%
  distinct(DOI) %>%
  count()

# Sample of DataCite DOIs found in Crossref.
# De-duplicate on the displayed columns so the sample cannot show the same
# row twice; slice_sample(n = 10) already caps the result at 10 rows (and
# returns fewer when fewer exist).
DataciteCrossrefSample <- DOIMatch %>%
  filter(DataCite == 1 & CrossrefDirect == 1) %>%
  distinct(DOI, issn1, issn2) %>%
  slice_sample(n = 10)
# Display sample table: wide DOI column followed by two narrow ISSN columns
DataciteCrossrefSample %>%
  kable(
    caption = "Sample of DOIs registered as DataCite but found in Crossref",
    format = "html"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = FALSE
  ) %>%
  column_spec(1, width = "300px") %>%
  column_spec(2:3, width = "100px")
DOI | issn1 | issn2 |
---|---|---|
10.14464/zsem.v38i3-4.639 | 0170-6241 | 2625-4328 |
10.14297/jpaap.v8i1.422 | 2051-9788 | NA |
10.14297/jpaap.v8i2.439 | 2051-9788 | NA |
10.21935/tls.v3i1.122 | 2512-4587 | NA |
10.14464/innotrac.v1i0.450 | 2701-3693 | NA |
10.14297/jpaap.v9i1.504 | 2051-9788 | NA |
10.14464/ess.v7i1.472 | 1869-5213 | NA |
10.21935/tls.v3i1.125 | 2512-4587 | NA |
10.14297/jpaap.v9i2.488 | 2051-9788 | NA |
10.14464/ess.v10i7.630 | 1869-5213 | NA |
# Create visualization for DataCite-Crossref overlap.
# Single-row data frame holding the overlap count for plotting.
overlap_data <- data.frame(
  Category = c("DataCite DOIs in Crossref"),
  Count = DataciteCrossref[["n"]]
)

# One-bar chart labelled with the comma-formatted count
ggplot(overlap_data, aes(x = Category, y = Count)) +
  geom_col(fill = "lightgreen", width = 0.5) +
  geom_text(aes(label = comma(Count)), vjust = -0.5) +
  labs(
    title = "Number of DataCite DOIs found in Crossref",
    x = "",
    y = "Number of DOIs"
  ) +
  theme_minimal()
On the resolution of DOIs for JUOJS journals, we found a significant number of DOIs that were found in neither DOI.org nor Crossref. More work is needed to understand the reasons.
# Distinct DOIs with every agency flag and the direct Crossref flag equal
# to 0, restricted to rows marked active. Comma-separated conditions in
# filter() are combined with AND.
DOIsInvalid <- DOIMatch %>%
  filter(
    DataCite == 0,
    Crossref == 0,
    mEDRA == 0,
    JaLC == 0,
    ISTIC == 0,
    Airiti == 0,
    CrossrefDirect == 0,
    active == 1
  ) %>%
  distinct(DOI)

# Sample of unresolved DOIs (up to 10 rows, drawn at random)
DOIsInvalidSample <- slice_sample(DOIsInvalid, n = 10)
# Display sample table of unresolved DOIs in a single wide column
DOIsInvalidSample %>%
  kable(
    caption = "Sample of Unresolved DOIs",
    format = "html"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "bordered"),
    full_width = FALSE
  ) %>%
  column_spec(1, width = "300px")
DOI |
---|
10.37476/akmen.v17i1.853 |
10.51852/-.v4i1.269 |
10.18686/hdjy.v2i25.4441 |
10.2478/maslo.v73i4.6945 |
10.47750/pnr.2022.13.s10.187 |
10.37134/jvt.vol4.1.7.2023 |
10.62504/c21y0t03 |
10.29300/lughah.v12i1.8017 |
10.21067/smartics.v7i1.5046 |
10.12345/jxffcxysj.v4i21.9397 |
# Create visualization for unresolved DOIs.
# Two-row data frame: the overall distinct-DOI count next to the number of
# unresolved DOIs.
unresolved_data <- data.frame(
  Category = c("Total DOIs", "Unresolved DOIs"),
  Count = c(totalDOIs[["n"]], nrow(DOIsInvalid))
)

# Two bars with fixed colours (total in blue, unresolved in dark red),
# each labelled with its comma-formatted count
ggplot(unresolved_data, aes(x = Category, y = Count)) +
  geom_col(fill = c("steelblue", "darkred"), width = 0.5) +
  geom_text(aes(label = comma(Count)), vjust = -0.5) +
  labs(
    title = "Comparison of Total vs Unresolved DOIs",
    x = "",
    y = "Number of DOIs"
  ) +
  theme_minimal()