library(tidyverse)
library(janitor)
library(googlesheets4)
# Load the article list; the path is resolved against the working directory.
all <- read_csv("all-aera-articles-feb-21-combined-reprocessed-proc.csv")

# Number of rows per journal, leaving out AERA Open, in alphabetical order.
all %>%
  filter(name_of_journal != "AERA Open") %>%
  count(name_of_journal) %>%
  arrange(name_of_journal)
## # A tibble: 6 × 2
## name_of_journal n
## <chr> <int>
## 1 American Educational Research Journal 592
## 2 Educational Evaluation and Policy Analysis 361
## 3 Educational Researcher 689
## 4 Journal of Educational and Behavioral Statistics 399
## 5 Review of Educational Research 288
## 6 Review of Research in Education 187
# Total number of rows across every journal other than AERA Open.
all %>%
  filter(name_of_journal != "AERA Open") %>%
  summarize(sum_n = n())
## # A tibble: 1 × 1
## sum_n
## <int>
## 1 2516
# Original article list, with the two name columns renamed so they match the
# column names used in the processed-links file.
d_orig <- read_csv("all-aera-articles-feb-21-combined-reprocessed-proc.csv") %>%
  rename(
    article_name = name_of_article,
    journal_name = name_of_journal
  )

# Join table: drop the original `link` column and keep the first row for each
# journal / article / year combination, so every join key appears only once.
d_orig_to_join <- distinct(
  select(d_orig, -link),
  journal_name, article_name, year,
  .keep_all = TRUE
)
# Processed search results, reduced to the join keys plus the retrieved link.
d_proc <- read_csv("feb-21-2023-all-processed.csv")
d_proc_to_join <- d_proc %>%
  select(article_name, journal_name, year, link)

# Row count before any links are filtered out.
nrow(d_proc)
## [1] 5704
# Drop rows whose link points at Google Scholar. The host is matched as a
# literal string via fixed(); as a bare regex every "." would match any
# character. Only d_proc is filtered here: d_proc_to_join keeps these rows, and
# the steps that follow in this script work from d_proc_to_join.
d_proc <- d_proc_to_join %>%
  filter(!str_detect(link, fixed("scholar.google.com")))
nrow(d_proc) # 302 removed
## [1] 5402
# Attach a row_id to every retrieved link, then add the article-level columns
# from the original list. row_id is the row position in the unfiltered
# d_proc_to_join; the coded sample is matched back to dd on row_id further
# down, so neither the input nor its row order should change here.
dd <- d_proc_to_join %>%
  mutate(row_id = row_number()) %>%
  left_join(d_orig_to_join, by = c("article_name", "journal_name", "year"))

# The random draws below are unseeded, so re-running them would replace the
# sample files with different rows and `orig` would no longer describe the
# sample that was actually coded. Each file is therefore written only when it
# does not exist yet; delete the file to force a fresh draw.
first_sample_path <- "open-science-publishing-sample-for-qual-analysis.csv"
if (!file.exists(first_sample_path)) {
  dd %>%
    filter(journal_name != "AERA Open") %>%
    sample_n(250) %>%
    write_csv(first_sample_path)
}
orig <- read_csv(first_sample_path)

# Second draw: excludes rows already in the first sample and links containing
# "instlink".
second_sample_path <- "open-science-publishing-sample-for-qual-analysis-1.csv"
if (!file.exists(second_sample_path)) {
  dd %>%
    filter(journal_name != "AERA Open") %>%
    filter(!(row_id %in% orig$row_id)) %>%
    filter(!str_detect(link, "instlink")) %>%
    sample_n(160) %>%
    write_csv(second_sample_path)
}
# d_orig$data_set <- c(rep(1, 1345), rep(2, 1714))
#
# d_orig <- d_orig %>%
# group_by(data_set) %>%
# mutate(unique_row_id = row_number() - 1) %>%
# mutate(unique_row_id = str_c(data_set, " - ", unique_row_id))
#
# d_proc$data_set <- c(rep(1, 2017), rep(2, 2513))
#
# d_proc <- d_proc %>%
# group_by(data_set) %>%
# mutate(unique_row_id = str_c(data_set, " - ", unique_row_id))
#
# d_proc$domain <- domain(d_proc$link)
#
# write_csv(d_orig, "all-articles-to-access.csv")
# write_csv(d_proc, "all-articles-accessed.csv")
# Denominator: rows of the original article list outside AERA Open.
nrow_articles <- nrow(filter(d_orig, journal_name != "AERA Open"))

# Numerator: distinct article / journal pairs outside AERA Open that have at
# least one retrieved link.
nrow_accessed <- nrow(
  distinct(
    filter(d_proc_to_join, journal_name != "AERA Open"),
    article_name, journal_name
  )
)
nrow_articles
## [1] 2516
nrow_accessed
## [1] 1642
# Share of articles with at least one retrieved link.
round(nrow_accessed / nrow_articles, 4)
## [1] 0.6526
N accessible articles: 1642
N total articles: 2516
N articles not accessible: 874
Proportion accessible: 0.653
# Mean and standard deviation of the number of retrieved links per
# article / journal pair.
d_proc_to_join %>%
  group_by(article_name, journal_name) %>%
  summarize(n = n(), .groups = "drop") %>%
  summarize(
    mean_n = mean(n),
    sd_n = sd(n)
  )
## # A tibble: 1 × 2
## mean_n sd_n
## <dbl> <dbl>
## 1 2.66 1.83
# Share of article / journal pairs with exactly one retrieved link versus more
# than one.
d_proc_to_join %>%
  count(article_name, journal_name) %>%
  # Counting by `n` stores the frequencies in `nn`, since `n` is already taken.
  count(n) %>%
  mutate(more_than_one = as.numeric(n > 1)) %>%
  count(more_than_one, wt = nn, name = "sum_nn") %>%
  mutate(prop = sum_nn / sum(sum_nn))
## # A tibble: 2 × 3
## more_than_one sum_nn prop
## <dbl> <int> <dbl>
## 1 0 720 0.336
## 2 1 1426 0.664
# Bar chart of how many article / journal pairs have each number of retrieved
# links.
# NOTE(review): the zero-links bar is a hard-coded 687 rather than a value
# derived from the data - confirm it is still current.
d_proc_to_join %>%
  count(article_name, journal_name) %>%
  count(n) %>%
  add_row(n = 0, nn = 687) %>%
  ggplot(aes(x = n, y = nn)) +
  geom_col() +
  labs(x = "Versions of Each Article", y = "Count") +
  theme_light() +
  theme(text = element_text(family = "Times", size = 16))

# NOTE(review): the file name says "by-year", but the figure shows versions per
# article - confirm before renaming, since other documents may use this path.
ggsave("count-by-year.png", width = 7, height = 7, dpi = 300)
# Rows per journal in the original article list.
nrow_articles_by_journal <- d_orig %>%
  count(journal_name)

# Distinct articles per journal that have at least one retrieved link.
nrow_articles_accessed_by_journal <- d_proc_to_join %>%
  distinct(journal_name, article_name) %>%
  count(journal_name, name = "n_accessed")

# Percentage of each journal's articles with a retrieved link, highest first.
# AERA Open is excluded by name; it was previously dropped by position
# (slice(-1) after sorting), which only works while it happens to sort first.
# The percentage is rounded once, on the ratio itself: the earlier
# `n_accessed / n %>% round(2)` rounded `n` because %>% binds tighter than `/`.
nrow_articles_by_journal %>%
  left_join(nrow_articles_accessed_by_journal, by = "journal_name") %>%
  filter(journal_name != "AERA Open") %>%
  mutate(prop = n_accessed / n) %>%
  arrange(desc(prop)) %>%
  mutate(prop = round(prop * 100, 2)) %>%
  select(journal_name, n, n_accessed, prop)
## # A tibble: 6 × 4
## journal_name n n_accessed prop
## <chr> <int> <int> <dbl>
## 1 Review of Educational Research 288 216 75
## 2 Educational Evaluation and Policy Analysis 361 264 73.1
## 3 Review of Research in Education 187 127 67.9
## 4 American Educational Research Journal 592 383 64.7
## 5 Journal of Educational and Behavioral Statistics 399 243 60.9
## 6 Educational Researcher 689 409 59.4
# Rows per publication year in the original article list.
# NOTE(review): unlike the by-journal table, nothing here filters out
# AERA Open - confirm that is intended.
nrow_articles_by_year <- d_orig %>%
  count(year)

# Distinct article names per year that have at least one retrieved link.
nrow_articles_accessed_by_year <- d_proc_to_join %>%
  distinct(year, article_name) %>%
  count(year, name = "n_accessed")
library(scales)

# Line chart of the share of articles with a retrieved link, by year. The
# share is plotted unrounded: the earlier `n_accessed / n %>% round(2)` rounded
# `n` (because %>% binds tighter than `/`), so it never rounded the ratio.
nrow_articles_by_year %>%
  left_join(nrow_articles_accessed_by_year, by = "year") %>%
  mutate(prop = n_accessed / n) %>%
  ggplot(aes(x = year, y = prop)) +
  geom_line() +
  geom_point() +
  ylab("Articles Accessible") +
  theme_light() +
  scale_x_continuous(breaks = 2010:2022) +
  scale_y_continuous(labels = percent, limits = c(0, 1)) +
  theme(text = element_text(family = "Times", size = 16))
ggsave("percentage-by-year.png", width = 8, height = 8, dpi = 300)
# Host name of every retrieved link.
d_proc_to_join$domain <- urltools::domain(d_proc_to_join$link)

# Ten most frequent hosts, shown as "count (percent of all links)". The
# percentage is computed over every link before the table is cut to ten rows,
# and it is rounded on the ratio itself; the earlier `n / sum(n) %>% round(2)`
# rounded `sum(n)` because %>% binds tighter than `/`.
d_proc_to_join %>%
  count(domain, sort = TRUE) %>%
  mutate(prop = round(n / sum(n) * 100, 2)) %>%
  slice_head(n = 10) %>%
  transmute(
    domain,
    n = str_c(n, " (", prop, ")")
  )
## # A tibble: 10 × 2
## domain n
## <chr> <chr>
## 1 www.researchgate.net 891 (15.62)
## 2 journals.sagepub.com 845 (14.81)
## 3 www.academia.edu 563 (9.87)
## 4 citeseerx.ist.psu.edu 492 (8.63)
## 5 scholar.google.com 302 (5.29)
## 6 scholar.archive.org 292 (5.12)
## 7 files.eric.ed.gov 151 (2.65)
## 8 core.ac.uk 110 (1.93)
## 9 www.ncbi.nlm.nih.gov 84 (1.47)
## 10 edworkingpapers.org 66 (1.16)
# Five most frequent hosts within each journal, with each host's share of that
# journal's links. The share is computed over all of the journal's links
# BEFORE cutting to the top five, the same way the overall host table computes
# its percentage; computing it after the cut made the five shares sum to 1.
# The ratio itself is rounded (`n / sum(n) %>% round(2)` rounded `sum(n)`).
# NOTE(review): the table rendered below this chunk was produced with the old
# top-five denominator and without rounding - re-knit to refresh it.
d_proc_to_join %>%
  count(domain, journal_name, sort = TRUE) %>%
  group_by(journal_name) %>%
  mutate(prop = round(n / sum(n), 2)) %>%
  slice_head(n = 5) %>%
  ungroup() %>%
  # Sort on the raw count so ties created by rounding cannot reorder rows.
  arrange(journal_name, desc(n)) %>%
  knitr::kable()
domain | journal_name | n | prop |
---|---|---|---|
journals.sagepub.com | AERA Open | 500 | 0.4739336 |
scholar.archive.org | AERA Open | 199 | 0.1886256 |
www.researchgate.net | AERA Open | 159 | 0.1507109 |
scholar.google.com | AERA Open | 103 | 0.0976303 |
www.academia.edu | AERA Open | 94 | 0.0890995 |
www.researchgate.net | American Educational Research Journal | 159 | 0.3123772 |
citeseerx.ist.psu.edu | American Educational Research Journal | 142 | 0.2789784 |
www.academia.edu | American Educational Research Journal | 103 | 0.2023576 |
journals.sagepub.com | American Educational Research Journal | 56 | 0.1100196 |
files.eric.ed.gov | American Educational Research Journal | 49 | 0.0962672 |
citeseerx.ist.psu.edu | Educational Evaluation and Policy Analysis | 112 | 0.3137255 |
www.researchgate.net | Educational Evaluation and Policy Analysis | 87 | 0.2436975 |
www.academia.edu | Educational Evaluation and Policy Analysis | 62 | 0.1736695 |
journals.sagepub.com | Educational Evaluation and Policy Analysis | 60 | 0.1680672 |
files.eric.ed.gov | Educational Evaluation and Policy Analysis | 36 | 0.1008403 |
www.researchgate.net | Educational Researcher | 198 | 0.3800384 |
www.academia.edu | Educational Researcher | 131 | 0.2514395 |
journals.sagepub.com | Educational Researcher | 81 | 0.1554702 |
citeseerx.ist.psu.edu | Educational Researcher | 56 | 0.1074856 |
scholar.google.com | Educational Researcher | 55 | 0.1055662 |
citeseerx.ist.psu.edu | Journal of Educational and Behavioral Statistics | 103 | 0.3456376 |
www.researchgate.net | Journal of Educational and Behavioral Statistics | 83 | 0.2785235 |
journals.sagepub.com | Journal of Educational and Behavioral Statistics | 47 | 0.1577181 |
scholar.google.com | Journal of Educational and Behavioral Statistics | 35 | 0.1174497 |
files.eric.ed.gov | Journal of Educational and Behavioral Statistics | 30 | 0.1006711 |
www.researchgate.net | Review of Educational Research | 129 | 0.3514986 |
www.academia.edu | Review of Educational Research | 83 | 0.2261580 |
citeseerx.ist.psu.edu | Review of Educational Research | 64 | 0.1743869 |
journals.sagepub.com | Review of Educational Research | 52 | 0.1416894 |
scholar.google.com | Review of Educational Research | 39 | 0.1062670 |
www.researchgate.net | Review of Research in Education | 76 | 0.3534884 |
www.academia.edu | Review of Research in Education | 72 | 0.3348837 |
journals.sagepub.com | Review of Research in Education | 49 | 0.2279070 |
citeseerx.ist.psu.edu | Review of Research in Education | 9 | 0.0418605 |
scholar.google.com | Review of Research in Education | 9 | 0.0418605 |
# Interactive view of every retrieved link, with the host in the first column.
DT::datatable(select(d_proc_to_join, domain, everything()))

# Hand-coded sample, read from the Google Sheet.
coded_data <- read_sheet("https://docs.google.com/spreadsheets/d/1R08x5jhfpkZ8ZzvtvqwfMLtlwvj36yEXuBRW4dioJVc/edit#gid=1786696285")
# coded_data <- read_csv("open-science-publishing-sample-for-qual-analysis-3.csv")

# Drop Google Scholar links, then normalise the column names. The host is
# matched literally via fixed(); as a bare regex every "." would match any
# character.
coded_data <- coded_data %>%
  filter(!str_detect(link, fixed("scholar.google.com"))) %>%
  janitor::clean_names()
coded_data %>% nrow()
## [1] 250
# Sanity check: the difference below is the number of non-AERA-Open rows of dd
# whose row_id appears in the coded data.
x1 <- nrow(filter(dd, journal_name != "AERA Open"))
x2 <- nrow(
  anti_join(
    filter(dd, journal_name != "AERA Open"),
    coded_data,
    by = "row_id"
  )
)
x1 - x2
## [1] 250
# dd %>%
# filter(journal_name != "AERA Open") %>%
# anti_join(coded_data, by = "row_id") %>%
# sample_n(40) %>% write_csv("additional-sample-5-10-2024.csv")
# Lower-case each code and prefix it with "code" (for example "1A" becomes
# "code1a").
coded_data <- coded_data %>%
  mutate(code = str_c("code", tolower(code)))
# NOTE(review): an earlier note here said 264 (290 before dropping 26 Google
# Scholar citations); the rendered output shows 250 - confirm which is current.
nrow(coded_data)
## [1] 250
# One 0/1 indicator per code, used as the outcomes of the intercept-only
# models that follow.
# NOTE(review): rows coded "codena" get 0 on every indicator, so they stay in
# each model's denominator - confirm that is intended.
coded_data <- coded_data %>%
  mutate(
    restricted = as.numeric(code == "code1a"),
    open = as.numeric(code == "code1b"),
    free = as.numeric(code == "code1c"),
    preprint = as.numeric(code == "code2"),
    other = as.numeric(code == "code3")
  )

# Frequency table of the codes.
janitor::tabyl(coded_data, code)
## code n percent
## code1a 145 0.580
## code1b 15 0.060
## code1c 25 0.100
## code2 52 0.208
## code3 6 0.024
## codena 7 0.028
# Convert log-odds (logit scale) to probabilities.
#
# x: numeric vector or matrix of log-odds.
# Returns the probabilities, with the names / dimensions of x kept.
#
# Uses the logistic distribution function instead of exp(x) / (1 + exp(x)):
# that form evaluates to Inf / Inf = NaN once exp(x) overflows, whereas
# plogis() returns 1 for large positive x and 0 for large negative x.
log_odds_to_p <- function(x) {
  stats::plogis(x)
}
# Intercept-only logistic regression of the `restricted` indicator (code1a).
# The intercept is the log-odds of that outcome across the rows of coded_data;
# log_odds_to_p() turns it back into a proportion.
m1 <- glm(restricted ~ 1, data = coded_data, family = "binomial")
summary(m1)
##
## Call:
## glm(formula = restricted ~ 1, family = "binomial", data = coded_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3228 0.1281 2.519 0.0118 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 340.15 on 249 degrees of freedom
## Residual deviance: 340.15 on 249 degrees of freedom
## AIC: 342.15
##
## Number of Fisher Scoring iterations: 4
# Proportion of coded links that are `restricted`, and its confint() interval,
# both converted back from the log-odds scale.
coef(m1) %>% log_odds_to_p()
## (Intercept)
## 0.58
confint(m1) %>% log_odds_to_p() %>% round(4)
## 2.5 % 97.5 %
## 0.5182 0.6401
# Rescale to all articles by multiplying by the share of articles with any
# retrieved link. The share comes from nrow_accessed / nrow_articles (1642 and
# 2516 in the rendered output) instead of repeating those numbers by hand.
coef(m1) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## (Intercept)
## 0.3785215
# Scale first, then round: `... %>% round(4) * share` rounded the bounds
# before multiplying because %>% binds tighter than `*`. The values rendered
# below this line came from that older order.
(confint(m1) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)) %>% round(4)
## 2.5 % 97.5 %
## 0.3381893 0.4177441
# Intercept-only logistic regression of the `open` indicator (code1b); the
# intercept is the log-odds of that outcome across the rows of coded_data.
m2 <- glm(open ~ 1, data = coded_data, family = "binomial")
summary(m2)
##
## Call:
## glm(formula = open ~ 1, family = "binomial", data = coded_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7515 0.2663 -10.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 113.48 on 249 degrees of freedom
## Residual deviance: 113.48 on 249 degrees of freedom
## AIC: 115.48
##
## Number of Fisher Scoring iterations: 5
# Proportion of coded links that are `open`, and its confint() interval, both
# converted back from the log-odds scale.
coef(m2) %>% log_odds_to_p()
## (Intercept)
## 0.06
confint(m2) %>% log_odds_to_p()
## 2.5 % 97.5 %
## 0.03497575 0.09396983
# Rescale to all articles using the share of articles with any retrieved link,
# taken from nrow_accessed / nrow_articles (1642 and 2516 in the rendered
# output) instead of repeating those numbers by hand.
coef(m2) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## (Intercept)
## 0.03915739
confint(m2) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## 2.5 % 97.5 %
## 0.02282598 0.06132689
# Intercept-only logistic regression of the `free` indicator (code1c); the
# intercept is the log-odds of that outcome across the rows of coded_data.
m3 <- glm(free ~ 1, data = coded_data, family = "binomial")
summary(m3)
##
## Call:
## glm(formula = free ~ 1, family = "binomial", data = coded_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1972 0.2108 -10.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 162.54 on 249 degrees of freedom
## Residual deviance: 162.54 on 249 degrees of freedom
## AIC: 164.54
##
## Number of Fisher Scoring iterations: 4
# Proportion of coded links that are `free`, rescaled to all articles by the
# share of articles with any retrieved link. The share comes from
# nrow_accessed / nrow_articles (1642 and 2516 in the rendered output) instead
# of repeating those numbers by hand.
coef(m3) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## (Intercept)
## 0.06526232
confint(m3) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## 2.5 % 97.5 %
## 0.04367007 0.09217050
# Intercept-only logistic regression of the `preprint` indicator (code2); the
# intercept is the log-odds of that outcome across the rows of coded_data.
m4 <- glm(preprint ~ 1, data = coded_data, family = "binomial")
# Summarise m4 itself; this previously printed summary(m3) again.
# NOTE(review): the summary rendered below is still the `free ~ 1` model from
# that call - re-knit to refresh it.
summary(m4)
##
## Call:
## glm(formula = free ~ 1, family = "binomial", data = coded_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1972 0.2108 -10.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 162.54 on 249 degrees of freedom
## Residual deviance: 162.54 on 249 degrees of freedom
## AIC: 164.54
##
## Number of Fisher Scoring iterations: 4
# Proportion of coded links that are `preprint`, rescaled to all articles by
# the share of articles with any retrieved link. The share comes from
# nrow_accessed / nrow_articles (1642 and 2516 in the rendered output) instead
# of repeating those numbers by hand.
coef(m4) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## (Intercept)
## 0.1357456
confint(m4) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## 2.5 % 97.5 %
## 0.1049491 0.1704250
# Intercept-only logistic regression of the `other` indicator (code3); the
# intercept is the log-odds of that outcome across the rows of coded_data.
m5 <- glm(other ~ 1, data = coded_data, family = "binomial")
summary(m5)
##
## Call:
## glm(formula = other ~ 1, family = "binomial", data = coded_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.7054 0.4132 -8.967 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 56.611 on 249 degrees of freedom
## Residual deviance: 56.611 on 249 degrees of freedom
## AIC: 58.611
##
## Number of Fisher Scoring iterations: 6
# Proportion of coded links that are `other`, rescaled to all articles by the
# share of articles with any retrieved link. The share comes from
# nrow_accessed / nrow_articles (1642 and 2516 in the rendered output) instead
# of repeating those numbers by hand.
coef(m5) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## (Intercept)
## 0.01566296
confint(m5) %>% log_odds_to_p() * (nrow_accessed / nrow_articles)
## 2.5 % 97.5 %
## 0.006270243 0.031345825