library(tidyverse)
library(janitor)
library(googlesheets4)
all <- read_csv("all-aera-articles-feb-21-combined-reprocessed-proc.csv")
all %>% 
  count(name_of_journal) %>% 
  filter(name_of_journal != "AERA Open") %>% 
  arrange(name_of_journal)
## # A tibble: 6 × 2
##   name_of_journal                                      n
##   <chr>                                            <int>
## 1 American Educational Research Journal              592
## 2 Educational Evaluation and Policy Analysis         361
## 3 Educational Researcher                             689
## 4 Journal of Educational and Behavioral Statistics   399
## 5 Review of Educational Research                     288
## 6 Review of Research in Education                    187
all %>% 
  count(name_of_journal) %>% 
  filter(name_of_journal != "AERA Open") %>% 
  arrange(name_of_journal) %>% 
  summarize(sum_n = sum(n))
## # A tibble: 1 × 1
##   sum_n
##   <int>
## 1  2516
d_orig <- read_csv("all-aera-articles-feb-21-combined-reprocessed-proc.csv")

d_orig <- rename(d_orig,
                 article_name = name_of_article,
                 journal_name = name_of_journal)

d_orig_to_join <- d_orig %>% 
  select(-link) %>% 
  distinct(journal_name, article_name, year, .keep_all = TRUE)

d_proc <- read_csv("feb-21-2023-all-processed.csv")

d_proc_to_join <- select(d_proc, 
                         article_name,
                         journal_name,
                         year,
                         link)

nrow(d_proc)
## [1] 5704
d_proc <- d_proc_to_join %>% 
  filter(!str_detect(link, "scholar.google.com"))

nrow(d_proc) # 302 removed
## [1] 5402
dd <- d_proc_to_join %>% 
  mutate(row_id = row_number()) %>% 
  left_join(d_orig_to_join, by = c("article_name", "journal_name", "year"))

dd %>% 
  filter(journal_name != "AERA Open") %>% 
  sample_n(250) %>% 
  write_csv("open-science-publishing-sample-for-qual-analysis.csv")

orig <- read_csv("open-science-publishing-sample-for-qual-analysis.csv")

dd %>% 
  filter(journal_name != "AERA Open") %>% 
  filter(!(row_id %in% orig$row_id)) %>% 
  filter(!str_detect(link, "instlink")) %>% 
  sample_n(160) %>% 
  write_csv("open-science-publishing-sample-for-qual-analysis-1.csv")
# d_orig$data_set <- c(rep(1, 1345), rep(2, 1714))
# 
# d_orig <- d_orig %>% 
#   group_by(data_set) %>% 
#   mutate(unique_row_id = row_number() - 1) %>% 
#   mutate(unique_row_id = str_c(data_set, " - ", unique_row_id))
# 
# d_proc$data_set <- c(rep(1, 2017), rep(2, 2513))
#   
# d_proc <- d_proc %>% 
#   group_by(data_set) %>% 
#   mutate(unique_row_id = str_c(data_set, " - ", unique_row_id))
# 
# d_proc$domain <- domain(d_proc$link)
# 
# write_csv(d_orig, "all-articles-to-access.csv")
# write_csv(d_proc, "all-articles-accessed.csv")

What proportion of articles are available in any form?

Overall

nrow_articles <- d_orig %>% 
  filter(journal_name != "AERA Open") %>% 
  nrow()

nrow_accessed <- d_proc_to_join %>% 
  filter(journal_name != "AERA Open") %>% 
  distinct(article_name, journal_name) %>% 
  nrow()

nrow_articles
## [1] 2516
nrow_accessed
## [1] 1642
round(nrow_accessed / nrow_articles, 4)
## [1] 0.6526

N accessible articles: 1642
N total articles: 2516
N articles not accessible: 874
Proportion accessible: 0.653
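
These figures follow directly from the two counts above. As a quick check, the difference and the rounded ratio can be reproduced from the objects already defined (the values in the comments are the ones reported above):

# check the reported figures using nrow_articles and nrow_accessed from above
nrow_articles - nrow_accessed            # 874 articles with no accessible copy
round(nrow_accessed / nrow_articles, 3)  # 0.653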

N files available per article

d_proc_to_join %>% 
  count(article_name, journal_name) %>% 
  summarize(mean_n = mean(n), 
            sd_n = sd(n))
## # A tibble: 1 × 2
##   mean_n  sd_n
##    <dbl> <dbl>
## 1   2.66  1.83
d_proc_to_join %>% 
  count(article_name, journal_name) %>% 
  count(n) %>% 
  mutate(more_than_one = ifelse(n > 1, 1, 0)) %>% 
  group_by(more_than_one) %>% 
  summarize(sum_nn = sum(nn)) %>% 
  mutate(prop = sum_nn / sum(sum_nn))
## # A tibble: 2 × 3
##   more_than_one sum_nn  prop
##           <dbl>  <int> <dbl>
## 1             0    720 0.336
## 2             1   1426 0.664
d_proc_to_join %>% 
  count(article_name, journal_name) %>% 
  count(n) %>% 
  bind_rows(tibble(n = 0, nn = 687)) %>% # hard-coded row for articles with no accessible version
  ggplot(aes(x = n, y = nn)) +
  geom_col() +
  ylab("Count") +
  xlab("Versions of Each Article") +
  theme_light() +
  theme(text = element_text(family = "Times", size = 16))

ggsave("count-by-year.png", width = 7, height = 7, dpi = 300)

By journal

nrow_articles_by_journal <- d_orig %>% 
  ungroup() %>% 
  group_by(journal_name) %>% 
  summarize(n = n())

nrow_articles_accessed_by_journal <- d_proc_to_join %>% 
  distinct(journal_name, article_name) %>% 
  count(article_name, journal_name) %>% 
  count(journal_name) %>% 
  rename(n_accessed = n)

nrow_articles_by_journal %>% 
  left_join(nrow_articles_accessed_by_journal) %>% 
  mutate(prop = n_accessed / n) %>% 
  arrange(desc(prop)) %>% 
  slice(-1) %>% # drop the top row (AERA Open), which is excluded throughout
  select(journal_name, n, n_accessed, prop) %>% 
  mutate(prop = round(prop * 100, 2))
## # A tibble: 6 × 4
##   journal_name                                         n n_accessed  prop
##   <chr>                                            <int>      <int> <dbl>
## 1 Review of Educational Research                     288        216  75  
## 2 Educational Evaluation and Policy Analysis         361        264  73.1
## 3 Review of Research in Education                    187        127  67.9
## 4 American Educational Research Journal              592        383  64.7
## 5 Journal of Educational and Behavioral Statistics   399        243  60.9
## 6 Educational Researcher                             689        409  59.4

By year

nrow_articles_by_year <- d_orig %>% 
  ungroup() %>% 
  group_by(year) %>% 
  summarize(n = n())

nrow_articles_accessed_by_year <- d_proc_to_join %>% 
  ungroup() %>% 
  distinct(year, article_name) %>% 
  count(year) %>% 
  rename(n_accessed = n)

library(scales)

nrow_articles_by_year %>% 
  left_join(nrow_articles_accessed_by_year) %>% 
  mutate(prop = n_accessed / n) %>% 
  select(year, Accessible = n_accessed, n, prop) %>% 
  ggplot(aes(x = year, y = prop)) +
  geom_line() +
  geom_point() +
  ylab('Articles Accessible') +
  theme_light() +
  scale_x_continuous(breaks = c(2010:2022)) +
  scale_y_continuous(labels = percent, limits = c(0, 1)) +
  theme(text = element_text(family = "Times", size = 16))

ggsave("percentage-by-year.png", width = 8, height = 8, dpi = 300)

What proportion of accessible articles are available from specific domains?

Overall

d_proc_to_join$domain <- urltools::domain(d_proc_to_join$link)
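
urltools::domain() keeps only the host portion of each link, which is what the counts below are grouped on. A quick illustration (the URL here is a made-up example, not one drawn from the data):

# illustrative only: a hypothetical link, not from the dataset
urltools::domain("https://www.researchgate.net/publication/12345_example")
# returns "www.researchgate.net"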

d_proc_to_join %>% 
  ungroup() %>% 
  count(domain, sort = TRUE) %>% 
  mutate(prop = n / sum(n)) %>% 
  slice(1:10) %>% 
  mutate(prop = round(prop * 100, 2)) %>% 
  transmute(domain = domain,
            n = str_c(n, " (", prop, ")"))
## # A tibble: 10 × 2
##    domain                n          
##    <chr>                 <chr>      
##  1 www.researchgate.net  891 (15.62)
##  2 journals.sagepub.com  845 (14.81)
##  3 www.academia.edu      563 (9.87) 
##  4 citeseerx.ist.psu.edu 492 (8.63) 
##  5 scholar.google.com    302 (5.29) 
##  6 scholar.archive.org   292 (5.12) 
##  7 files.eric.ed.gov     151 (2.65) 
##  8 core.ac.uk            110 (1.93) 
##  9 www.ncbi.nlm.nih.gov  84 (1.47)  
## 10 edworkingpapers.org   66 (1.16)

By journal

d_proc_to_join %>% 
  ungroup() %>% 
  count(domain, journal_name, sort = TRUE) %>% 
  group_by(journal_name) %>% 
  slice(1:5) %>% 
  mutate(prop = n / sum(n)) %>% 
  arrange(journal_name, desc(prop)) %>% 
  knitr::kable()
|domain                |journal_name                                     |   n|      prop|
|:---------------------|:------------------------------------------------|---:|---------:|
|journals.sagepub.com  |AERA Open                                         | 500| 0.4739336|
|scholar.archive.org   |AERA Open                                         | 199| 0.1886256|
|www.researchgate.net  |AERA Open                                         | 159| 0.1507109|
|scholar.google.com    |AERA Open                                         | 103| 0.0976303|
|www.academia.edu      |AERA Open                                         |  94| 0.0890995|
|www.researchgate.net  |American Educational Research Journal             | 159| 0.3123772|
|citeseerx.ist.psu.edu |American Educational Research Journal             | 142| 0.2789784|
|www.academia.edu      |American Educational Research Journal             | 103| 0.2023576|
|journals.sagepub.com  |American Educational Research Journal             |  56| 0.1100196|
|files.eric.ed.gov     |American Educational Research Journal             |  49| 0.0962672|
|citeseerx.ist.psu.edu |Educational Evaluation and Policy Analysis        | 112| 0.3137255|
|www.researchgate.net  |Educational Evaluation and Policy Analysis        |  87| 0.2436975|
|www.academia.edu      |Educational Evaluation and Policy Analysis        |  62| 0.1736695|
|journals.sagepub.com  |Educational Evaluation and Policy Analysis        |  60| 0.1680672|
|files.eric.ed.gov     |Educational Evaluation and Policy Analysis        |  36| 0.1008403|
|www.researchgate.net  |Educational Researcher                            | 198| 0.3800384|
|www.academia.edu      |Educational Researcher                            | 131| 0.2514395|
|journals.sagepub.com  |Educational Researcher                            |  81| 0.1554702|
|citeseerx.ist.psu.edu |Educational Researcher                            |  56| 0.1074856|
|scholar.google.com    |Educational Researcher                            |  55| 0.1055662|
|citeseerx.ist.psu.edu |Journal of Educational and Behavioral Statistics  | 103| 0.3456376|
|www.researchgate.net  |Journal of Educational and Behavioral Statistics  |  83| 0.2785235|
|journals.sagepub.com  |Journal of Educational and Behavioral Statistics  |  47| 0.1577181|
|scholar.google.com    |Journal of Educational and Behavioral Statistics  |  35| 0.1174497|
|files.eric.ed.gov     |Journal of Educational and Behavioral Statistics  |  30| 0.1006711|
|www.researchgate.net  |Review of Educational Research                    | 129| 0.3514986|
|www.academia.edu      |Review of Educational Research                    |  83| 0.2261580|
|citeseerx.ist.psu.edu |Review of Educational Research                    |  64| 0.1743869|
|journals.sagepub.com  |Review of Educational Research                    |  52| 0.1416894|
|scholar.google.com    |Review of Educational Research                    |  39| 0.1062670|
|www.researchgate.net  |Review of Research in Education                   |  76| 0.3534884|
|www.academia.edu      |Review of Research in Education                   |  72| 0.3348837|
|journals.sagepub.com  |Review of Research in Education                   |  49| 0.2279070|
|citeseerx.ist.psu.edu |Review of Research in Education                   |   9| 0.0418605|
|scholar.google.com    |Review of Research in Education                   |   9| 0.0418605|

Table with results

DT::datatable(select(d_proc_to_join, domain, everything()))
coded_data <- read_sheet("https://docs.google.com/spreadsheets/d/1R08x5jhfpkZ8ZzvtvqwfMLtlwvj36yEXuBRW4dioJVc/edit#gid=1786696285")
# coded_data <- read_csv("open-science-publishing-sample-for-qual-analysis-3.csv")

coded_data <- coded_data %>% 
  filter(!str_detect(link, "scholar.google.com"))

coded_data <- coded_data %>% janitor::clean_names()

coded_data %>% nrow()
## [1] 250
x1 <- dd %>% 
  filter(journal_name != "AERA Open") %>% nrow()

x2 <- dd %>% 
  filter(journal_name != "AERA Open") %>% 
  anti_join(coded_data, by = "row_id") %>% 
  nrow()

x1 - x2
## [1] 250
# dd %>% 
#   filter(journal_name != "AERA Open") %>% 
#   anti_join(coded_data, by = "row_id") %>% 
#   sample_n(40) %>% write_csv("additional-sample-5-10-2024.csv")
coded_data <- coded_data %>% 
  mutate(code = tolower(code),
         code = str_c("code", code))

nrow(coded_data) # rows remaining after dropping Google Scholar citation-only links
## [1] 250
coded_data <- coded_data %>%
  mutate(restricted = ifelse(code == "code1a", 1, 0),
         open = ifelse(code == "code1b", 1, 0),
         free = ifelse(code == "code1c", 1, 0),
         preprint = ifelse(code == "code2", 1, 0),
         other = ifelse(code == "code3", 1, 0))
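
Because each coded article carries exactly one code, these indicator columns are mutually exclusive. As a sketch of a sanity check, their column sums should match the per-code counts in the tabyl() output below:

# sketch: column sums of the indicators should equal the tabyl counts below
coded_data %>% 
  summarize(across(c(restricted, open, free, preprint, other), sum))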

coded_data %>% 
  janitor::tabyl(code)
##    code   n percent
##  code1a 145   0.580
##  code1b  15   0.060
##  code1c  25   0.100
##   code2  52   0.208
##   code3   6   0.024
##  codena   7   0.028
# inverse logit: convert a log-odds value to a probability
log_odds_to_p <- function(x) {
  exp(x) / (1 + exp(x))
}
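
As a quick check of this helper (values taken from the model output and tabyl shown below):

# a log-odds of 0 is a probability of 0.5; m1's intercept (0.3228, below)
# recovers the 0.58 sample proportion coded "code1a" (restricted)
log_odds_to_p(0)       # 0.5
log_odds_to_p(0.3228)  # ~0.58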
m1 <- glm(restricted ~ 1, data = coded_data, family = "binomial")
summary(m1)
## 
## Call:
## glm(formula = restricted ~ 1, family = "binomial", data = coded_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.3228     0.1281   2.519   0.0118 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 340.15  on 249  degrees of freedom
## Residual deviance: 340.15  on 249  degrees of freedom
## AIC: 342.15
## 
## Number of Fisher Scoring iterations: 4
coef(m1) %>% log_odds_to_p()
## (Intercept) 
##        0.58
confint(m1) %>% log_odds_to_p() %>% round(4)
##  2.5 % 97.5 % 
## 0.5182 0.6401
coef(m1) %>% log_odds_to_p() * (1642/2516)
## (Intercept) 
##   0.3785215
confint(m1) %>% log_odds_to_p() %>% round(4) * (1642/2516)
##     2.5 %    97.5 % 
## 0.3381893 0.4177441
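
The 1642/2516 multiplier is the overall accessibility rate computed earlier, so these adjusted values rescale the within-sample proportion to the full set of articles. The arithmetic behind the point estimate reported just above:

# rescale the sample proportion for code1a (restricted) by the accessibility rate
0.58 * (1642 / 2516)  # ~0.3785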
m2 <- glm(open ~ 1, data = coded_data, family = "binomial")
summary(m2)
## 
## Call:
## glm(formula = open ~ 1, family = "binomial", data = coded_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.7515     0.2663  -10.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 113.48  on 249  degrees of freedom
## Residual deviance: 113.48  on 249  degrees of freedom
## AIC: 115.48
## 
## Number of Fisher Scoring iterations: 5
coef(m2) %>% log_odds_to_p()
## (Intercept) 
##        0.06
confint(m2) %>% log_odds_to_p()
##      2.5 %     97.5 % 
## 0.03497575 0.09396983
coef(m2) %>% log_odds_to_p()* (1642/2516)
## (Intercept) 
##  0.03915739
confint(m2) %>% log_odds_to_p()* (1642/2516)
##      2.5 %     97.5 % 
## 0.02282598 0.06132689
m3 <- glm(free ~ 1, data = coded_data, family = "binomial")
summary(m3)
## 
## Call:
## glm(formula = free ~ 1, family = "binomial", data = coded_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1972     0.2108  -10.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 162.54  on 249  degrees of freedom
## Residual deviance: 162.54  on 249  degrees of freedom
## AIC: 164.54
## 
## Number of Fisher Scoring iterations: 4
coef(m3) %>% log_odds_to_p()* (1642/2516)
## (Intercept) 
##  0.06526232
confint(m3) %>% log_odds_to_p()* (1642/2516)
##      2.5 %     97.5 % 
## 0.04367007 0.09217050
m4 <- glm(preprint ~ 1, data = coded_data, family = "binomial")
summary(m4)
coef(m4) %>% log_odds_to_p()* (1642/2516)
## (Intercept) 
##   0.1357456
confint(m4) %>% log_odds_to_p()* (1642/2516)
##     2.5 %    97.5 % 
## 0.1049491 0.1704250
m5 <- glm(other ~ 1, data = coded_data, family = "binomial")
summary(m5)
## 
## Call:
## glm(formula = other ~ 1, family = "binomial", data = coded_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.7054     0.4132  -8.967   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 56.611  on 249  degrees of freedom
## Residual deviance: 56.611  on 249  degrees of freedom
## AIC: 58.611
## 
## Number of Fisher Scoring iterations: 6
coef(m5) %>% log_odds_to_p()* (1642/2516)
## (Intercept) 
##  0.01566296
confint(m5) %>% log_odds_to_p()* (1642/2516)
##       2.5 %      97.5 % 
## 0.006270243 0.031345825