Analysis for AERA

Plotting

library(tidyverse)
library(googlesheets4)
library(igraph)
library(tidygraph)
library(ggraph)
library(ggiraph)

# Pull the citation data from the shared Google Sheet and normalise the
# column names in the same step.
s <- read_sheet("https://docs.google.com/spreadsheets/d/1UGuey59KCe3RIFQ_n9ZB17UH1quMueAMtI6jRBJIVhg/edit#gid=0") %>%
    janitor::clean_names()

# Quick structural check of the imported columns.
glimpse(s)

## Rows: 1,337
## Columns: 6
## $ journal_special_issue  <chr> "British Journal of Educational Technology", "B…
## $ referencing_paper      <chr> "Dvir, M., & Ben-Zvi, D. (2022). Students' actu…
## $ referencing_paper_code <dbl> 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001,…
## $ reference              <chr> "Ahrweiler, P., & Wörmann, S. (1998). Computer …
## $ reference_code         <dbl> 2015, 2017, 2038, 2039, 2055, 2113, 2157, 2223,…
## $ original_order         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …

# Build the edge list: one row per (citing paper, cited work) pair. Each
# endpoint is labelled with the special issue implied by the first two
# characters of its code ("10" = BJET, "11" = JLS, "12" = SERJ, anything
# else = "Other"); a missing code stays missing.
s_edge <- s %>%
    select(from = referencing_paper_code, to = reference_code) %>%
    mutate(across(c(from, to),
                  function(code) {
                      prefix <- str_sub(code, start = 1, end = 2)
                      case_when(is.na(prefix) ~ NA_character_,
                                prefix == "10" ~ "BJET",
                                prefix == "11" ~ "JLS",
                                prefix == "12" ~ "SERJ",
                                TRUE ~ "Other")
                  },
                  .names = "{.col}_special_issue"))

# Inspect the edges whose citing-paper code starts with "1".
filter(s_edge, str_sub(from, start = 1, end = 1) == "1")

## # A tibble: 1,337 × 4
##     from    to from_special_issue to_special_issue
##    <dbl> <dbl> <chr>              <chr>           
##  1  1001  2015 BJET               Other           
##  2  1001  2017 BJET               Other           
##  3  1001  2038 BJET               Other           
##  4  1001  2039 BJET               Other           
##  5  1001  2055 BJET               Other           
##  6  1001  2113 BJET               Other           
##  7  1001  2157 BJET               Other           
##  8  1001  2223 BJET               Other           
##  9  1001  2296 BJET               Other           
## 10  1001  2297 BJET               Other           
## # … with 1,327 more rows

# Drop edges whose cited code starts with "1", then prefix each endpoint with
# its role ("from_" for the citing paper, "to_" for the cited work).
s_edge <- s_edge %>%
    filter(str_sub(to, start = 1, end = 1) != "1") %>%
    mutate(from = str_c("from_", from),
           to = str_c("to_", to))

Upset counts

# Wide table: one row per cited work (`to`), one column per citing special
# issue holding the number of citations; absent combinations are filled with 0.
for_upset <- s_edge %>%
    count(to, from_special_issue) %>%
    spread(key = from_special_issue, value = n, fill = 0) %>%
    arrange(to) %>%
    as.data.frame()

# Collapse counts to a 0/1 indicator: 1 where `x` is at least 1, otherwise 0.
# Missing values stay missing.
replace_int <- function(x) {
    is_present <- x >= 1
    ifelse(is_present, 1, 0)
}

# for_upset <- for_upset %>% 
#  mutate_if(is.numeric, replace_int)

# for_upset is how many times SI articles cited other papers, total

# Classify every cited work by the set of special issues that cite it.
# Each `*_only` / combination column is a 0/1 indicator (at most one of the
# seven is 1 per row), so summing a column counts the cited works in that
# category. `BJET_only` is a flag like the others; it previously carried the
# raw BJET citation count, which inflated its column sum.
# `times_cited` is the total number of citations across the three issues.
for_upset_counts <- for_upset %>%
    mutate(BJET_only = ifelse(BJET >= 1 & (JLS == 0 & SERJ == 0), 1, 0),
           JLS_only = ifelse(JLS >= 1 & (BJET == 0 & SERJ == 0), 1, 0),
           SERJ_only = ifelse(SERJ >= 1 & (JLS == 0 & BJET == 0), 1, 0),
           BJET_JLS = ifelse(BJET >= 1 & JLS >= 1 & SERJ == 0, 1, 0),
           SERJ_JLS = ifelse(SERJ >= 1 & JLS >= 1 & BJET == 0, 1, 0),
           BJET_SERJ = ifelse(BJET >= 1 & SERJ >= 1 & JLS == 0, 1, 0),
           BJET_SERJ_JLS = ifelse(BJET >= 1 & SERJ >= 1 & JLS >= 1, 1, 0)) %>%
    as_tibble() %>%
    mutate(times_cited = BJET + JLS + SERJ)

BJET SERJ

# Cited works flagged BJET_SERJ (cited by BJET and SERJ but not JLS),
# most-cited first, rendered as a table.
for_upset_counts %>% 
    filter(BJET_SERJ == 1) %>% 
    arrange(desc(times_cited)) %>% 
    knitr::kable()

to	BJET	SERJ	BJET_SERJ	times_cited
to_2070	1	4	1	5
to_2255	3	2	1	5
to_3070	4	1	1	5
to_2400	1	3	1	4
to_2852	3	1	1	4
to_2401	1	2	1	3
to_2502	2	1	1	3
to_2674	1	2	1	3
to_2091	1	1	1	2
to_2104	1	1	1	2
to_2125	1	1	1	2
to_2245	1	1	1	2
to_2367	1	1	1	2
to_2402	1	1	1	2
to_2434	1	1	1	2
to_2479	1	1	1	2
to_2536	1	1	1	2
to_2542	1	1	1	2
to_2569	1	1	1	2
to_2612	1	1	1	2
to_2717	1	1	1	2
to_2753	1	1	1	2
to_2755	1	1	1	2
to_2855	1	1	1	2
to_2903	1	1	1	2
to_3023	1	1	1	2
to_3105	1	1	1	2

BJET JLS

# Cited works flagged BJET_JLS (cited by BJET and JLS but not SERJ),
# most-cited first, rendered as a table.
for_upset_counts %>% 
    filter(BJET_JLS == 1) %>% 
    arrange(desc(times_cited)) %>% 
    knitr::kable()

to	BJET	JLS	BJET_JLS	times_cited
to_2100	3	1	1	4
to_2316	2	2	1	4
to_2981	1	3	1	4
to_3057	2	2	1	4
to_2131	2	1	1	3
to_2383	2	1	1	3
to_2435	1	2	1	3
to_2475	1	2	1	3
to_2622	2	1	1	3
to_2752	1	2	1	3
to_2850	1	2	1	3
to_2851	1	2	1	3
to_2062	1	1	1	2
to_2136	1	1	1	2
to_2214	1	1	1	2
to_2495	1	1	1	2
to_2980	1	1	1	2
to_3049	1	1	1	2
to_3069	1	1	1	2
to_3083	1	1	1	2

SERJ JLS

# Cited works flagged SERJ_JLS (cited by SERJ and JLS but not BJET),
# most-cited first, rendered as a table.
for_upset_counts %>% 
    filter(SERJ_JLS == 1) %>% 
    arrange(desc(times_cited)) %>% 
    knitr::kable()

to	JLS	SERJ	SERJ_JLS	times_cited
to_2425	3	1	1	4
to_2895	2	1	1	3
to_2093	1	1	1	2
to_2178	1	1	1	2
to_2225	1	1	1	2
to_2426	1	1	1	2
to_2427	1	1	1	2
to_2430	1	1	1	2
to_2869	1	1	1	2

All

# Cited works flagged BJET_SERJ_JLS (cited by all three special issues),
# most-cited first, rendered as a table.
for_upset_counts %>% 
    filter(BJET_SERJ_JLS == 1) %>% 
    arrange(desc(times_cited)) %>% 
    knitr::kable()

to	BJET	JLS	SERJ	BJET_SERJ_JLS	times_cited
to_2734	3	3	4	1	10
to_2577	6	2	1	1	9
to_2349	3	1	2	1	6
to_2775	4	1	1	1	6
to_2776	2	3	1	1	6
to_2641	3	1	1	1	5
to_2708	1	1	3	1	5
to_2684	1	2	1	1	4
to_2089	1	1	1	1	3
to_2126	1	1	1	1	3

# for_upset_counts records, for each cited work, which combination of special
# issues cited it (one 0/1 indicator column per combination).
# Summing an indicator column therefore counts cited works in that category,
# not the number of pairwise co-citations.
# E.g., a sum of 9 means 9 cited works were cited by that combination of issues.
# We could instead count how many times each work was co-cited
# and sort by that.

# Number of cited works (one row each).
for_upset_counts %>% 
    nrow()

## [1] 1103

# Total citations made by each special issue, summed over all cited works.
summarize(for_upset,
          sum_bjet = sum(BJET),
          sum_jls = sum(JLS),
          sum_serj = sum(SERJ))

##   sum_bjet sum_jls sum_serj
## 1      526     232      553

# Column totals of the seven category indicators, named `sum_<category>`.
for_upset_plot <- for_upset_counts %>%
    summarize(across(c(BJET_only, JLS_only, SERJ_only,
                       BJET_JLS, SERJ_JLS, BJET_SERJ,
                       BJET_SERJ_JLS),
                     sum,
                     .names = "sum_{.col}"))

for_upset_plot

## # A tibble: 1 × 7
##   sum_BJET_only sum_JLS_only sum_SERJ_only sum_BJET_JLS sum_SE…¹ sum_B…² sum_B…³
##           <dbl>        <dbl>         <dbl>        <dbl>    <dbl>   <dbl>   <dbl>
## 1           439          168           453           20        9      27      10
## # … with abbreviated variable names ¹sum_SERJ_JLS, ²sum_BJET_SERJ,
## #   ³sum_BJET_SERJ_JLS

library(UpSetR)
# NOTE(review): contents of upset-simple.R are not visible here; it may define
# or override plotting helpers used below -- confirm.
source("upset-simple.R")
# 
# Earlier hand-entered version of the counts, kept for reference (values differ
# from the ones used below):
# t <- tibble(BJET = c(1, 0, 0, 1, 1, 1, 0),
#        JLS = c(0, 1, 0, 1, 1, 0, 1),
#        SERJ = c(0, 0, 1, 1, 0, 1, 1),
#        count = c(423,168, 453, 10, 25, 9, 9)) 

# Intersection sizes are entered by hand rather than read from for_upset_plot.
# JLS, SERJ and the combination counts equal the sums printed above; BJET = 416
# equals the 1103 cited works minus the other six categories, not the 439
# printed for sum_BJET_only.
expressionInput <- c(BJET = 416, JLS = 168, SERJ = 453, `BJET&JLS` = 20, `BJET&SERJ` = 27, 
                     `JLS&SERJ` = 9, `BJET&JLS&SERJ` = 10)

# Draw the UpSet plot from the named intersection counts.
upset(fromExpression(expressionInput))

# Cited works that appear in exactly one edge (cited once overall).
cited_once <- s_edge %>%
    count(to) %>%
    filter(n <= 1)

# Keep only edges to works cited more than once. The join key is stated
# explicitly so the filter cannot silently change if either table gains
# another shared column.
s_edge <- s_edge %>%
    anti_join(cited_once, by = "to")

# Build the citation graph from the (from, to) columns. Citing-paper ids lose
# their "from_" prefix and become integers; cited ids keep the "to_" prefix.
g <- s_edge %>%
    select(from, to) %>%
    mutate(from = as.integer(str_sub(from, start = 6))) %>%
    igraph::graph_from_data_frame()

# Unique citing papers with their special issue.
from_distinct <- s_edge %>%
    select(from, from_special_issue) %>%
    distinct()

# Unique cited works with their special issue, renamed to the same column
# names as `from_distinct` so the two tables can be stacked.
to_distinct <- s_edge %>%
    select(from = to, from_special_issue = to_special_issue) %>%
    distinct()

# One row per vertex: citing papers first, then cited works.
bound_distinct <- from_distinct %>%
    bind_rows(to_distinct) %>%
    select(id = from, special_issue = from_special_issue)

# Attach the special-issue label to each vertex. The assignment is positional:
# it assumes the vertices of g are in the same order as the rows of
# bound_distinct (citing papers first, then cited works) -- TODO confirm.
V(g)$issue <- bound_distinct$special_issue

# Strip the "to_" prefix from cited-work vertex names; other names are kept.
V(g)$name <- ifelse(str_detect(names(V(g)), "to"), str_sub(names(V(g)), start = 4), names(V(g)))

# Colour vertices by special issue; anything else is gray.
V(g)$color <- ifelse(V(g)$issue == "SERJ", "darkred", 
                     ifelse(V(g)$issue == "BJET", "lightblue", 
                            ifelse(V(g)$issue == "JLS", "darkgreen", "gray")))

# Let igraph derive the two vertex classes of the bipartite graph.
V(g)$type <- bipartite.mapping(g)$type

# Vertices with type TRUE are drawn as squares, the rest as circles.
V(g)$shape <- ifelse(V(g)$type, "square", "circle")

# Size vertices by in-degree; vertices with in-degree 0 get a fixed size of 8.
V(g)$size <- igraph::centralization.degree(g, mode = "in")$res
V(g)$size <- ifelse(V(g)$size == 0, 8, V(g)$size)

plot(g, vertex.label.cex = .7, vertex.label.color = "black")

# Give every edge unit weight.
E(g)$weight <- 1

RQ1

Processing for RQ1

# First one-mode projection of the bipartite graph. With multiplicity = TRUE
# igraph stores an edge `weight` on the projection.
# NOTE(review): proj1 appears to be the citing-paper side (the summaries below
# contain only BJET/JLS/SERJ codes) -- confirm against V(g)$type.
p <- igraph::bipartite.projection(g, multiplicity = TRUE)$proj1

# Edge weights of the projection.
E(p)$weight

##   [1] 2 2 1 3 2 1 1 1 2 1 1 1 2 5 7 6 2 1 3 1 3 2 1 1 2 2 2 4 2 1 4 1 1 3 2 1 1
##  [38] 1 1 6 3 2 1 1 3 5 2 1 1 1 8 1 4 5 6 4 8 9 4 2 3 6 1 2 2 2 2 2 3 4 2 2 4 2
##  [75] 1 1 2 1 1 1 1 2 2 3 2 1 1 1 1 1 1 1 1 4 1 2 1 1 1 4 4 3 1 1 2 1 1 1 2 1 1
## [112] 1 1 7 1 4 1 1 1 2 1 1 1 1 1 1 1 3 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 3 1 2 1
## [149] 3 1 1 1 1 1 1 1 2 2 2 1 1 2 1 2 2 1 1 3 1 3 5 1 8 1 4 3 2 1 7 4 1 1 1 1 1
## [186] 3 3 2 2 1 1 1 2

# p <- igraph::simplify(p, edge.attr.comb = "sum")

# Colour projected vertices by special issue; anything else is gray.
V(p)$color <- ifelse(
    V(p)$issue == "SERJ", "darkred",
    ifelse(V(p)$issue == "BJET", "lightblue",
           ifelse(V(p)$issue == "JLS", "darkgreen", "gray"))
)

# V(p)$type <- bipartite.mapping(p)$type

# V(p)$shape <- ifelse(V(p)$type, "square", "circle")

# Degree of every vertex in the projection, computed once and reused both as
# the vertex size and as `deg` for the RQ1b table.
deg <- centralization.degree(p, mode = "in")$res
V(p)$size <- deg
# V(p)$size <- ifelse(V(p)$size == 0, 8, V(p)$size)

# plot(g, vertex.label=NA, vertex.size=3, edge.width=E(p)$weight)

# Edge list of the projection (from, to, weight) as a tibble.
pd <- as_tibble(igraph::as_data_frame(p))

# pd <- pd %>%
#     spread(to, weight, fill = 0) %>% 
#     gather(to, weight, -from)

# Label both endpoints of every projected edge with their special issue (from
# the first two characters of the code) and total the edge weights for each
# pair of issues. The result stays grouped by `from_special_issue`.
pdr <- pd %>%
    mutate(across(c(from, to),
                  function(code) {
                      prefix <- str_sub(code, start = 1, end = 2)
                      case_when(is.na(prefix) ~ NA_character_,
                                prefix == "10" ~ "BJET",
                                prefix == "11" ~ "JLS",
                                prefix == "12" ~ "SERJ",
                                TRUE ~ "Other")
                  },
                  .names = "{.col}_special_issue")) %>%
    group_by(from_special_issue, to_special_issue) %>%
    summarize(sum_weight = sum(weight))

# Spread the per-issue-pair weights into one column per target issue and flag
# which combination of issues each row connects to. `pdr` carries its value in
# `sum_weight` (there is no `weight` column after the summarise above), so that
# is the column to spread.
# Note: this overwrites the `for_upset_counts` built in the "Upset counts"
# section.
for_upset_counts <- pdr %>%
    spread(to_special_issue, sum_weight, fill = 0) %>%
    mutate(BJET_only = ifelse(BJET >= 1 & (JLS == 0 & SERJ == 0), 1, 0),
           JLS_only = ifelse(JLS >= 1 & (BJET == 0 & SERJ == 0), 1, 0),
           SERJ_only = ifelse(SERJ >= 1 & (JLS == 0 & BJET == 0), 1, 0),
           BJET_JLS = ifelse(BJET >= 1 & JLS >= 1 & SERJ == 0, 1, 0),
           SERJ_JLS = ifelse(SERJ >= 1 & JLS >= 1 & BJET == 0, 1, 0),
           BJET_SERJ = ifelse(BJET >= 1 & SERJ >= 1 & JLS == 0, 1, 0),
           BJET_SERJ_JLS = ifelse(BJET >= 1 & SERJ >= 1 & JLS >= 1, 1, 0))

## Error in `spread()`:
## Caused by error:
## ! object 'weight' not found

# Column totals of the seven category indicators. Each `sum_<category>` sums
# the indicator of the same name (`sum_BJET_SERJ` previously summed `SERJ_JLS`
# by mistake).
for_upset_plot <- for_upset_counts %>%
    summarize(sum_BJET_only = sum(BJET_only),
              sum_JLS_only = sum(JLS_only),
              sum_SERJ_only = sum(SERJ_only),
              sum_BJET_JLS = sum(BJET_JLS),
              sum_SERJ_JLS = sum(SERJ_JLS),
              sum_BJET_SERJ = sum(BJET_SERJ),
              sum_BJET_SERJ_JLS = sum(BJET_SERJ_JLS))

RQ1a: How many connections are there between citing papers between special issues?

Total number of connections, where a connection is two papers citing the same paper

# Summed projection edge weight for each (from, to) pair of special issues.
pdr

## # A tibble: 6 × 3
## # Groups:   from_special_issue [3]
##   from_special_issue to_special_issue sum_weight
##   <chr>              <chr>                 <dbl>
## 1 BJET               BJET                     87
## 2 BJET               JLS                      80
## 3 BJET               SERJ                     84
## 4 JLS                JLS                      32
## 5 JLS                SERJ                     40
## 6 SERJ               SERJ                     74

# Same totals as a from-issue x to-issue matrix; pairs that do not occur are NA.
spread(pdr, key = to_special_issue, value = sum_weight)

## # A tibble: 3 × 4
## # Groups:   from_special_issue [3]
##   from_special_issue  BJET   JLS  SERJ
##   <chr>              <dbl> <dbl> <dbl>
## 1 BJET                  87    80    84
## 2 JLS                   NA    32    40
## 3 SERJ                  NA    NA    74

Average number of connections (citing a paper that another paper also cites) per paper

# Scale each column of the from-issue x to-issue matrix by a fixed divisor.
# NOTE(review): 9, 8 and 11 are presumably the number of papers in BJET, JLS
# and SERJ (they add up to the 28 citing papers listed under RQ1b) -- confirm.
# Each cell is divided by the size of its column's issue only, including the
# cross-issue cells.
pdr %>% 
    spread(to_special_issue, sum_weight) %>% 
    mutate(BJET = BJET/9,
           JLS = JLS / 8,
           SERJ = SERJ/11)

## # A tibble: 3 × 4
## # Groups:   from_special_issue [3]
##   from_special_issue  BJET   JLS  SERJ
##   <chr>              <dbl> <dbl> <dbl>
## 1 BJET                9.67    10  7.64
## 2 JLS                NA        4  3.64
## 3 SERJ               NA       NA  6.73

RQ1b: Which specific citing papers are the most connected to other papers (where connections are defined as citing the same paper)?

# Lookup table: one row per citing paper code (`from`) with the first citation
# string recorded for it (`reference`).
stj <- s %>%
    select(from = referencing_paper_code, reference = referencing_paper) %>%
    distinct() %>%
    # filter(from != 2734) %>%
    group_by(from) %>%
    summarize(reference = first(reference))

# Citing papers ranked by their degree in the RQ1 projection, with the full
# citation string attached.
# `deg` is the global vector computed in "Processing for RQ1"; it is attached
# positionally, so this must run before `deg` is recomputed for RQ2 and relies
# on the projection's vertices being in first-appearance order of `from`.
s_edge %>%
    distinct(from, from_special_issue) %>%
    mutate(from = as.integer(str_sub(from, start = 6)),
           deg = deg) %>%
    arrange(desc(deg)) %>%
    # Explicit key: `from` is the only column shared with `stj`.
    left_join(stj, by = "from")

## # A tibble: 28 × 4
##     from from_special_issue   deg reference                                     
##    <dbl> <chr>              <dbl> <chr>                                         
##  1  1102 JLS                   22 Kahn, J. (2020). Learning at the intersection…
##  2  1005 BJET                  21 Lee, V. R., Pimentel, D. R., Bhargava, R., & …
##  3  1106 JLS                   21 Stornaiuolo, A. (2020). Authoring data storie…
##  4  1204 SERJ                  21 Fergusson, A., & Pfannkuch, M. (2022). Introd…
##  5  1210 SERJ                  20 Vance, E. A., Glimp, D. R., Pieplow, N. D., G…
##  6  1006 BJET                  19 Matuk, C., DesPortes, K., Amato, A., Vacca, R…
##  7  1207 SERJ                  19 LEE, H., MOJICA, G., THRASHER, E., & Baumgart…
##  8  1009 BJET                  17 Shreiner, T. L., & Guzdial, M. (2022). The in…
##  9  1007 BJET                  16 Radinsky, J., & Tabak, I. (2022). Data practi…
## 10  1008 BJET                  15 Rosenberg, J. M., Schultheis, E. H., Kjelvik,…
## # … with 18 more rows

RQ2

Processing for RQ2

# Second one-mode projection of the bipartite graph, with multiplicity stored
# as edge weight.
p <- igraph::bipartite.projection(g, multiplicity = TRUE)$proj2

# E(p)$weight

# p <- igraph::simplify(p, edge.attr.comb = "sum")

# V(p)$type <- bipartite.mapping(p)$type

# V(p)$shape <- ifelse(V(p)$type, "square", "circle")

# Degree of every vertex in the projection, computed once and reused both as
# the vertex size and as `deg` for the RQ2b table.
deg <- centralization.degree(p, mode = "in")$res
V(p)$size <- deg
# V(p)$size <- ifelse(V(p)$size == 0, 8, V(p)$size)

# plot(g, vertex.label=NA, vertex.size=3, edge.width=E(p)$weight)

# Edge list of the projection, expanded to every (from, to) combination by a
# wide/long round trip; combinations with no edge get weight 0.
pd <- as_tibble(igraph::as_data_frame(p)) %>%
    spread(key = to, value = weight, fill = 0) %>%
    gather(key = to, value = weight, -from)

RQ2a: What connections exist between cited papers (where connections are defined as being cited by the same paper), on average?

# Total edge weight over the whole RQ2 projection.
pdr <- summarize(pd, sum_weight = sum(weight))

pdr

## # A tibble: 1 × 1
##   sum_weight
##        <dbl>
## 1       2356

RQ2b: Which specific cited papers are most connected?

# Lookup table: one row per cited work code (`id`) with the first citation
# string recorded for it (`reference`).
stj <- s %>%
    select(id = reference_code, reference) %>%
    distinct() %>%
    # filter(id != 2734) %>%
    group_by(id) %>%
    summarize(reference = first(reference))

# Cited works ranked by their degree in the RQ2 projection, with the full
# citation string attached. `deg` is the vector computed in "processing for
# RQ2" and is in the same order as the vertices of `p`.
tibble(id = as.integer(names(V(p))),
       deg = deg) %>%
    arrange(desc(deg)) %>%
    # Explicit key: `id` is the only column shared with `stj`.
    left_join(stj, by = "id")

## # A tibble: 117 × 3
##       id   deg reference                                                        
##    <dbl> <dbl> <chr>                                                            
##  1  2734    80 "Noble, S. U. (2018). Algorithms of oppression. New York Univers…
##  2  2577    76 "Lee, V. R., & Wilkerson, M. (2018). Data use by middle and seco…
##  3  2349    67 "Finzer, W. (2013). The data science education dilemma. Technolo…
##  4  2776    65 "Philip, T. M., Schuler-Brown, S., & Way, W. (2013). A framework…
##  5  2255    63 "D'Ignazio, C., & Klein, L. F. (2020). Data feminism. MIT Press."
##  6  2775    62 "Philip, T. M., Olivares-Pasillas, M. C., & Rocha, J. (2016). Be…
##  7  2580    58 "Lee, V. R., Wilkerson, M. H., & Lanouette, K. (2021). A call fo…
##  8  2070    57 "Bargagliotti, A., Franklin, C., Arnold, P., & Gould, R. (2020).…
##  9  2852    55 "Rubin, A. (2020). Learning to reason with data: How did we get …
## 10  2316    53 "Enyedy, N., & Mukhopadhyay, S. (2007). They don't show nothing …
## # … with 107 more rows

Analysis for AERA

2023-03-22

Plotting

Upset counts

BJET SERJ

BJET JLS

SERJ JLS

All

RQ1

Processing for RQ1

RQ1a: How many connections are there between citing papers between special issues?

RQ1b: Which specific citing papers are the most connected to other papers (where connections are defined as citing the same paper)?

RQ2

Processing for RQ2

RQ2a: What connections exist between cited papers (where connections are defined as being cited by the same paper), on average?

RQ2b: Which specific cited papers are most connected?