library(tidyverse)
library(janitor)
library(googlesheets4)
all <- read_csv("all-aera-articles-feb-21-combined-reprocessed-proc.csv")
all %>% 
  count(name_of_journal) %>% 
  filter(name_of_journal != "AERA Open") %>% 
  arrange(name_of_journal)
## # A tibble: 6 × 2
##   name_of_journal                                      n
##   <chr>                                            <int>
## 1 American Educational Research Journal              592
## 2 Educational Evaluation and Policy Analysis         361
## 3 Educational Researcher                             689
## 4 Journal of Educational and Behavioral Statistics   399
## 5 Review of Educational Research                     288
## 6 Review of Research in Education                    187
all %>% 
  count(name_of_journal) %>% 
  filter(name_of_journal != "AERA Open") %>% 
  arrange(name_of_journal) %>% 
  summarize(sum_n = sum(n))
## # A tibble: 1 × 1
##   sum_n
##   <int>
## 1  2516
d_orig <- read_csv("all-aera-articles-feb-21-combined-reprocessed-proc.csv")

d_orig <- rename(d_orig,
                 article_name = name_of_article,
                 journal_name = name_of_journal)

d_orig_to_join <- d_orig %>% 
  select(-link) %>% 
  distinct(journal_name, article_name, year, .keep_all = TRUE)

d_proc <- read_csv("feb-21-2023-all-processed.csv")

d_proc_to_join <- select(d_proc, 
                         article_name,
                         journal_name,
                         year,
                         link)

nrow(d_proc)
## [1] 5704
d_proc <- d_proc_to_join %>% 
  filter(!str_detect(link, "scholar.google.com"))

nrow(d_proc) # 302 removed
## [1] 5402
dd <- d_proc_to_join %>% 
  mutate(row_id = row_number()) %>% 
  left_join(d_orig_to_join, by = c("article_name", "journal_name", "year"))

dd %>% 
  filter(journal_name != "AERA Open") %>% 
  sample_n(250) %>% 
  write_csv("open-science-publishing-sample-for-qual-analysis.csv")

orig <- read_csv("open-science-publishing-sample-for-qual-analysis.csv")

dd %>% 
  filter(journal_name != "AERA Open") %>% 
  filter(!(row_id %in% orig$row_id)) %>% 
  filter(!str_detect(link, "instlink")) %>% 
  sample_n(160) %>% 
  write_csv("open-science-publishing-sample-for-qual-analysis-1.csv")
# d_orig$data_set <- c(rep(1, 1345), rep(2, 1714))
# 
# d_orig <- d_orig %>% 
#   group_by(data_set) %>% 
#   mutate(unique_row_id = row_number() - 1) %>% 
#   mutate(unique_row_id = str_c(data_set, " - ", unique_row_id))
# 
# d_proc$data_set <- c(rep(1, 2017), rep(2, 2513))
#   
# d_proc <- d_proc %>% 
#   group_by(data_set) %>% 
#   mutate(unique_row_id = str_c(data_set, " - ", unique_row_id))
# 
# d_proc$domain <- domain(d_proc$link)
# 
# write_csv(d_orig, "all-articles-to-access.csv")
# write_csv(d_proc, "all-articles-accessed.csv")

What proportion of articles are available in any form?

Overall

nrow_articles <- d_orig %>% 
  filter(journal_name != "AERA Open") %>% 
  nrow()

nrow_accessed <- d_proc_to_join %>% 
  filter(journal_name != "AERA Open") %>% 
  distinct(article_name, journal_name) %>% 
  nrow()

nrow_articles
## [1] 2516
nrow_accessed
## [1] 1642
round(nrow_accessed / nrow_articles, 4)
## [1] 0.6526

N accessible articles: 1642
N total articles: 2516
N articles not accessible: 874
Proportion accessible: 0.653
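
These figures follow directly from the two counts above. As a quick check, the difference and the rounded ratio can be reproduced from the objects already defined (the values in the comments are the ones reported above):

# check the reported figures using nrow_articles and nrow_accessed from above
nrow_articles - nrow_accessed            # 874 articles with no accessible copy
round(nrow_accessed / nrow_articles, 3)  # 0.653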

N files available per article

d_proc_to_join %>% 
  count(article_name, journal_name) %>% 
  summarize(mean_n = mean(n), 
            sd_n = sd(n))
## # A tibble: 1 × 2
##   mean_n  sd_n
##    <dbl> <dbl>
## 1   2.66  1.83
d_proc_to_join %>% 
  count(article_name, journal_name) %>% 
  count(n) %>% 
  mutate(more_than_one = ifelse(n > 1, 1, 0)) %>% 
  group_by(more_than_one) %>% 
  summarize(sum_nn = sum(nn)) %>% 
  mutate(prop = sum_nn / sum(sum_nn))
## # A tibble: 2 × 3
##   more_than_one sum_nn  prop
##           <dbl>  <int> <dbl>
## 1             0    720 0.336
## 2             1   1426 0.664
d_proc_to_join %>% 
  count(article_name, journal_name) %>% 
  count(n) %>% 
  bind_rows(tibble(n = 0, nn = 687)) %>% # hard-coded row for articles with no accessible version
  ggplot(aes(x = n, y = nn)) +
  geom_col() +
  ylab("Count") +
  xlab("Versions of Each Article") +
  theme_light() +
  theme(text = element_text(family = "Times", size = 16))

ggsave("count-by-year.png", width = 7, height = 7, dpi = 300)

By journal

nrow_articles_by_journal <- d_orig %>% 
  ungroup() %>% 
  group_by(journal_name) %>% 
  summarize(n = n())

nrow_articles_accessed_by_journal <- d_proc_to_join %>% 
  distinct(journal_name, article_name) %>% 
  count(article_name, journal_name) %>% 
  count(journal_name) %>% 
  rename(n_accessed = n)

nrow_articles_by_journal %>% 
  left_join(nrow_articles_accessed_by_journal) %>% 
  mutate(prop = n_accessed / n) %>% 
  arrange(desc(prop)) %>% 
  slice(-1) %>% # drop the top row (AERA Open), which is excluded throughout
  select(journal_name, n, n_accessed, prop) %>% 
  mutate(prop = round(prop * 100, 2))
## # A tibble: 6 × 4
##   journal_name                                         n n_accessed  prop
##   <chr>                                            <int>      <int> <dbl>
## 1 Review of Educational Research                     288        216  75  
## 2 Educational Evaluation and Policy Analysis         361        264  73.1
## 3 Review of Research in Education                    187        127  67.9
## 4 American Educational Research Journal              592        383  64.7
## 5 Journal of Educational and Behavioral Statistics   399        243  60.9
## 6 Educational Researcher                             689        409  59.4

By year

nrow_articles_by_year <- d_orig %>% 
  ungroup() %>% 
  group_by(year) %>% 
  summarize(n = n())

nrow_articles_accessed_by_year <- d_proc_to_join %>% 
  ungroup() %>% 
  distinct(year, article_name) %>% 
  count(year) %>% 
  rename(n_accessed = n)

library(scales)

nrow_articles_by_year %>% 
  left_join(nrow_articles_accessed_by_year) %>% 
  mutate(prop = n_accessed / n) %>% 
  select(year, Accessible = n_accessed, n, prop) %>% 
  ggplot(aes(x = year, y = prop)) +
  geom_line() +
  geom_point() +
  ylab('Articles Accessible') +
  theme_light() +
  scale_x_continuous(breaks = c(2010:2022)) +
  scale_y_continuous(labels = percent, limits = c(0, 1)) +
  theme(text = element_text(family = "Times", size = 16))

ggsave("percentage-by-year.png", width = 8, height = 8, dpi = 300)

What proportion of accessible articles are available from specific domains?

Overall

d_proc_to_join$domain <- urltools::domain(d_proc_to_join$link)
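
urltools::domain() keeps only the host portion of each link, which is what the counts below are grouped on. A quick illustration (the URL here is a made-up example, not one drawn from the data):

# illustrative only: a hypothetical link, not from the dataset
urltools::domain("https://www.researchgate.net/publication/12345_example")
# returns "www.researchgate.net"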

d_proc_to_join %>% 
  ungroup() %>% 
  count(domain, sort = TRUE) %>% 
  mutate(prop = n / sum(n)) %>% 
  slice(1:10) %>% 
  mutate(prop = round(prop * 100, 2)) %>% 
  transmute(domain = domain,
            n = str_c(n, " (", prop, ")"))
## # A tibble: 10 × 2
##    domain                n          
##    <chr>                 <chr>      
##  1 www.researchgate.net  891 (15.62)
##  2 journals.sagepub.com  845 (14.81)
##  3 www.academia.edu      563 (9.87) 
##  4 citeseerx.ist.psu.edu 492 (8.63) 
##  5 scholar.google.com    302 (5.29) 
##  6 scholar.archive.org   292 (5.12) 
##  7 files.eric.ed.gov     151 (2.65) 
##  8 core.ac.uk            110 (1.93) 
##  9 www.ncbi.nlm.nih.gov  84 (1.47)  
## 10 edworkingpapers.org   66 (1.16)

By journal

d_proc_to_join %>% 
  ungroup() %>% 
  count(domain, journal_name, sort = TRUE) %>% 
  group_by(journal_name) %>% 
  slice(1:5) %>% 
  mutate(prop = n / sum(n)) %>% 
  arrange(journal_name, desc(prop)) %>% 
  knitr::kable()
|domain                |journal_name                                     |   n|      prop|
|:---------------------|:------------------------------------------------|---:|---------:|
|journals.sagepub.com  |AERA Open                                         | 500| 0.4739336|
|scholar.archive.org   |AERA Open                                         | 199| 0.1886256|
|www.researchgate.net  |AERA Open                                         | 159| 0.1507109|
|scholar.google.com    |AERA Open                                         | 103| 0.0976303|
|www.academia.edu      |AERA Open                                         |  94| 0.0890995|
|www.researchgate.net  |American Educational Research Journal             | 159| 0.3123772|
|citeseerx.ist.psu.edu |American Educational Research Journal             | 142| 0.2789784|
|www.academia.edu      |American Educational Research Journal             | 103| 0.2023576|
|journals.sagepub.com  |American Educational Research Journal             |  56| 0.1100196|
|files.eric.ed.gov     |American Educational Research Journal             |  49| 0.0962672|
|citeseerx.ist.psu.edu |Educational Evaluation and Policy Analysis        | 112| 0.3137255|
|www.researchgate.net  |Educational Evaluation and Policy Analysis        |  87| 0.2436975|
|www.academia.edu      |Educational Evaluation and Policy Analysis        |  62| 0.1736695|
|journals.sagepub.com  |Educational Evaluation and Policy Analysis        |  60| 0.1680672|
|files.eric.ed.gov     |Educational Evaluation and Policy Analysis        |  36| 0.1008403|
|www.researchgate.net  |Educational Researcher                            | 198| 0.3800384|
|www.academia.edu      |Educational Researcher                            | 131| 0.2514395|
|journals.sagepub.com  |Educational Researcher                            |  81| 0.1554702|
|citeseerx.ist.psu.edu |Educational Researcher                            |  56| 0.1074856|
|scholar.google.com    |Educational Researcher                            |  55| 0.1055662|
|citeseerx.ist.psu.edu |Journal of Educational and Behavioral Statistics  | 103| 0.3456376|
|www.researchgate.net  |Journal of Educational and Behavioral Statistics  |  83| 0.2785235|
|journals.sagepub.com  |Journal of Educational and Behavioral Statistics  |  47| 0.1577181|
|scholar.google.com    |Journal of Educational and Behavioral Statistics  |  35| 0.1174497|
|files.eric.ed.gov     |Journal of Educational and Behavioral Statistics  |  30| 0.1006711|
|www.researchgate.net  |Review of Educational Research                    | 129| 0.3514986|
|www.academia.edu      |Review of Educational Research                    |  83| 0.2261580|
|citeseerx.ist.psu.edu |Review of Educational Research                    |  64| 0.1743869|
|journals.sagepub.com  |Review of Educational Research                    |  52| 0.1416894|
|scholar.google.com    |Review of Educational Research                    |  39| 0.1062670|
|www.researchgate.net  |Review of Research in Education                   |  76| 0.3534884|
|www.academia.edu      |Review of Research in Education                   |  72| 0.3348837|
|journals.sagepub.com  |Review of Research in Education                   |  49| 0.2279070|
|citeseerx.ist.psu.edu |Review of Research in Education                   |   9| 0.0418605|
|scholar.google.com    |Review of Research in Education                   |   9| 0.0418605|

Table with results

DT::datatable(select(d_proc_to_join, domain, everything()))
coded_data <- read_sheet("https://docs.google.com/spreadsheets/d/1R08x5jhfpkZ8ZzvtvqwfMLtlwvj36yEXuBRW4dioJVc/edit#gid=1786696285")
# coded_data <- read_csv("open-science-publishing-sample-for-qual-analysis-3.csv")

coded_data <- coded_data %>% 
  filter(!str_detect(link, "scholar.google.com"))

coded_data <- coded_data %>% janitor::clean_names()

coded_data %>% nrow()
## [1] 250
x1 <- dd %>% 
  filter(journal_name != "AERA Open") %>% nrow()

x2 <- dd %>% 
  filter(journal_name != "AERA Open") %>% 
  anti_join(coded_data, by = "row_id") %>% 
  nrow()

x1 - x2
## [1] 250
# dd %>% 
#   filter(journal_name != "AERA Open") %>% 
#   anti_join(coded_data, by = "row_id") %>% 
#   sample_n(40) %>% write_csv("additional-sample-5-10-2024.csv")
coded_data <- coded_data %>% 
  mutate(code = tolower(code),
         code = str_c("code", code))

nrow(coded_data) # rows remaining after dropping Google Scholar citation-only links
## [1] 250
coded_data <- coded_data %>%
  mutate(restricted = ifelse(code == "code1a", 1, 0),
         open = ifelse(code == "code1b", 1, 0),
         free = ifelse(code == "code1c", 1, 0),
         preprint = ifelse(code == "code2", 1, 0),
         other = ifelse(code == "code3", 1, 0))
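
Because each coded article carries exactly one code, these indicator columns are mutually exclusive. As a sketch of a sanity check, their column sums should match the per-code counts in the tabyl() output below:

# sketch: column sums of the indicators should equal the tabyl counts below
coded_data %>% 
  summarize(across(c(restricted, open, free, preprint, other), sum))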

coded_data %>% 
  janitor::tabyl(code)
##    code   n percent
##  code1a 145   0.580
##  code1b  15   0.060
##  code1c  25   0.100
##   code2  52   0.208
##   code3   6   0.024
##  codena   7   0.028
# inverse logit: convert a log-odds value to a probability
log_odds_to_p <- function(x) {
  exp(x) / (1 + exp(x))
}
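
As a quick check of this helper (values taken from the model output and tabyl shown below):

# a log-odds of 0 is a probability of 0.5; m1's intercept (0.3228, below)
# recovers the 0.58 sample proportion coded "code1a" (restricted)
log_odds_to_p(0)       # 0.5
log_odds_to_p(0.3228)  # ~0.58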
m1 <- glm(restricted ~ 1, data = coded_data, family = "binomial")
summary(m1)
## 
## Call:
## glm(formula = restricted ~ 1, family = "binomial", data = coded_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.3228     0.1281   2.519   0.0118 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 340.15  on 249  degrees of freedom
## Residual deviance: 340.15  on 249  degrees of freedom
## AIC: 342.15
## 
## Number of Fisher Scoring iterations: 4
coef(m1) %>% log_odds_to_p()
## (Intercept) 
##        0.58
confint(m1) %>% log_odds_to_p() %>% round(4)
##  2.5 % 97.5 % 
## 0.5182 0.6401
coef(m1) %>% log_odds_to_p() * (1642/2516)
## (Intercept) 
##   0.3785215
confint(m1) %>% log_odds_to_p() %>% round(4) * (1642/2516)
##     2.5 %    97.5 % 
## 0.3381893 0.4177441
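
The 1642/2516 multiplier is the overall accessibility rate computed earlier, so these adjusted values rescale the within-sample proportion to the full set of articles. The arithmetic behind the point estimate reported just above:

# rescale the sample proportion for code1a (restricted) by the accessibility rate
0.58 * (1642 / 2516)  # ~0.3785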
m2 <- glm(open ~ 1, data = coded_data, family = "binomial")
summary(m2)
## 
## Call:
## glm(formula = open ~ 1, family = "binomial", data = coded_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.7515     0.2663  -10.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 113.48  on 249  degrees of freedom
## Residual deviance: 113.48  on 249  degrees of freedom
## AIC: 115.48
## 
## Number of Fisher Scoring iterations: 5
coef(m2) %>% log_odds_to_p()
## (Intercept) 
##        0.06
confint(m2) %>% log_odds_to_p()
##      2.5 %     97.5 % 
## 0.03497575 0.09396983
coef(m2) %>% log_odds_to_p()* (1642/2516)
## (Intercept) 
##  0.03915739
confint(m2) %>% log_odds_to_p()* (1642/2516)
##      2.5 %     97.5 % 
## 0.02282598 0.06132689
m3 <- glm(free ~ 1, data = coded_data, family = "binomial")
summary(m3)
## 
## Call:
## glm(formula = free ~ 1, family = "binomial", data = coded_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1972     0.2108  -10.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 162.54  on 249  degrees of freedom
## Residual deviance: 162.54  on 249  degrees of freedom
## AIC: 164.54
## 
## Number of Fisher Scoring iterations: 4
coef(m3) %>% log_odds_to_p()* (1642/2516)
## (Intercept) 
##  0.06526232
confint(m3) %>% log_odds_to_p()* (1642/2516)
##      2.5 %     97.5 % 
## 0.04367007 0.09217050
m4 <- glm(preprint ~ 1, data = coded_data, family = "binomial")
summary(m4)
coef(m4) %>% log_odds_to_p()* (1642/2516)
## (Intercept) 
##   0.1357456
confint(m4) %>% log_odds_to_p()* (1642/2516)
##     2.5 %    97.5 % 
## 0.1049491 0.1704250
m5 <- glm(other ~ 1, data = coded_data, family = "binomial")
summary(m5)
## 
## Call:
## glm(formula = other ~ 1, family = "binomial", data = coded_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.7054     0.4132  -8.967   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 56.611  on 249  degrees of freedom
## Residual deviance: 56.611  on 249  degrees of freedom
## AIC: 58.611
## 
## Number of Fisher Scoring iterations: 6
coef(m5) %>% log_odds_to_p()* (1642/2516)
## (Intercept) 
##  0.01566296
confint(m5) %>% log_odds_to_p()* (1642/2516)
##       2.5 %      97.5 % 
## 0.006270243 0.031345825