Overview
Out of a total of 132,846 EEBO records, 60,227 (45.34%) are in EEBO-TCP (but 66 EEBO records have multiple TCP ids).
Out of the 132,846 EEBO records, 6,802 (5.12%) could not be matched to an ESTC record and will be left out of the analysis. On the other hand, 7,373 EEBO records (5.55%) were matched to more than one ESTC record, possibly causing bias.
Out of the 60,327 EEBO-TCP records, 1,143 (1.89%) could not be matched to an ESTC record and will be left out of the analysis. On the other hand, 3,269 EEBO-TCP records (5.42%) were matched to more than one ESTC record, possibly causing bias.
In the analysis, only ESTC records with publication years in the half-open range [1474, 1700) (i.e. 1474–1699 inclusive) have been included. This results in the exclusion of 4,862 (4.17%) ESTC records that have representation in EEBO, possibly causing bias. 2,119 (3.41%) of the ESTC records with representation in EEBO-TCP are removed due to this filtering condition.
In the end, our working dataset consists of 132,412 ESTC records, of which 111,816 (84.45%) we estimate to have representation in EEBO, and 60,095 (45.38%) to have representation in EEBO-TCP.
Publication type analysis
library(ggbeeswarm)

# Yearly EEBO coverage by publication type: one point per
# (year, edition type, Book/Pamphlet) cell; red crossbars mark the medians.
bind_rows(
  # Edition level: every record in df, keeping its precomputed edition_type.
  df %>% mutate(group = "Editions"),
  # Work level: one row per work/type/year, covered if any of its records is.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      .groups = "drop"
    ) %>%
    mutate(
      group = "Works",
      edition_type = if_else(
        publication_year == first_publication_year,
        "First year work",
        "Later work"
      )
    )
) %>%
  mutate(
    edition_type = fct_relevel(
      edition_type,
      "Singular", "First year work", "Later work",
      "First year edition", "Later edition"
    )
  ) %>%
  filter(type %in% c("Book", "Pamphlet")) %>%
  # Count covered / uncovered rows per cell. Dropping only the last grouping
  # level (in_eebo) makes prop the covered share within each cell and tn the
  # cell total.
  group_by(publication_year, edition_type, group, type, in_eebo) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo) %>%
  ggplot(aes(x = type, y = prop, group = edition_type, color = edition_type)) +
  geom_quasirandom(aes(size = tn), dodge = 1.0) +
  stat_summary(
    aes(group = edition_type),
    geom = "crossbar",
    fun = median,
    fun.min = median,
    fun.max = median,
    position = position_dodge(width = 1.0),
    width = 0.5,
    color = "red"
  ) +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(0, 0),
    legend.position = c(0.02, 0.02),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(250, 500, 1500), range = c(0.1, 8.0)) +
  labs(
    x = NULL,
    y = "EEBO coverage",
    color = "Representation type",
    size = "Count"
  ) +
  guides(shape = "none")

library(ggbeeswarm)

# Yearly EEBO-TCP coverage by publication type: one point per
# (year, edition type, Book/Pamphlet) cell; red crossbars mark the medians.
bind_rows(
  # Edition level: every record in df, keeping its precomputed edition_type.
  df %>% mutate(group = "Editions"),
  # Work level: one row per work/type/year, covered if any of its records is.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      .groups = "drop"
    ) %>%
    mutate(
      group = "Works",
      edition_type = if_else(
        publication_year == first_publication_year,
        "First year work",
        "Later work"
      )
    )
) %>%
  mutate(
    edition_type = fct_relevel(
      edition_type,
      "Singular", "First year work", "Later work",
      "First year edition", "Later edition"
    )
  ) %>%
  filter(type %in% c("Book", "Pamphlet")) %>%
  # Count covered / uncovered rows per cell. Dropping only the last grouping
  # level (in_eebo_tcp) makes prop the covered share within each cell and tn
  # the cell total.
  group_by(publication_year, edition_type, group, type, in_eebo_tcp) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo_tcp) %>%
  ggplot(aes(x = type, y = prop, group = edition_type, color = edition_type)) +
  geom_quasirandom(aes(size = tn), dodge = 1.0) +
  stat_summary(
    aes(group = edition_type),
    geom = "crossbar",
    fun = median,
    fun.min = median,
    fun.max = median,
    position = position_dodge(width = 1.0),
    width = 0.5,
    color = "red"
  ) +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(0, 0),
    legend.position = c(0.02, 0.02),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(250, 500, 1500), range = c(0.1, 8.0)) +
  labs(
    x = NULL,
    y = "EEBO-TCP coverage",
    color = "Representation type",
    size = "Count"
  ) +
  guides(shape = "none")

Edition-level temporal overview
# Stacked yearly counts of ESTC records. Each record gets the first matching
# label: uncertain dating, in EEBO-TCP, in EEBO (but not EEBO-TCP), or neither.
# The "neither" slice is labelled "ESTC total" because, with stacked bars, the
# top of the stack is the total number of ESTC records for the year.
df %>%
  mutate(
    g = case_when(
      !certain ~ "Uncertain dating",
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      TRUE ~ "ESTC total"
    )
  ) %>%
  ggplot(aes(
    x = publication_year,
    fill = fct_relevel(g, "Uncertain dating", "ESTC total", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1) +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(breaks = seq(0, 10000, by = 1000)) +
  xlab("Year") +
  ylab("ESTC entries") +
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.05, 0.95),
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  # Reverse the legend so its order matches the visual stacking order.
  guides(fill = guide_legend(reverse = TRUE))

# Yearly share of certainly-dated ESTC records that are in EEBO-TCP, in EEBO
# (but not EEBO-TCP), or in neither; position = "fill" normalises each bar.
df %>%
  filter(certain) %>%
  mutate(
    g = case_when(
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      TRUE ~ "Not in EEBO"
    )
  ) %>%
  ggplot(aes(
    x = publication_year,
    fill = fct_relevel(g, "Not in EEBO", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1, position = "fill") +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  xlab("Year") +
  ylab("Proportion of ESTC entries") +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(0.94, 0.08),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  guides(fill = guide_legend(reverse = TRUE))

Work-level temporal overview
# Stacked counts of works by year of first publication. A work is flagged
# in_eebo / in_eebo_tcp if any of its records in df is; its dating counts as
# certain only if one of its first-year records is certainly dated.
# NOTE(review): df keeps publication_year >= 1474, but this filter drops works
# first published in 1474 itself — confirm whether `>=` was intended.
df %>%
  filter(first_publication_year > 1474) %>%
  group_by(work_id, first_publication_year) %>%
  summarize(
    in_eebo = any(in_eebo),
    in_eebo_tcp = any(in_eebo_tcp),
    certain = any(first_year_publication & certain),
    .groups = "drop"
  ) %>%
  mutate(
    g = case_when(
      !certain ~ "Uncertain dating",
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      TRUE ~ "ESTC total"
    )
  ) %>%
  ggplot(aes(
    x = first_publication_year,
    fill = fct_relevel(g, "Uncertain dating", "ESTC total", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1) +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(breaks = seq(0, 10000, by = 1000)) +
  # The bars count works (one row per work_id) by first publication year, so
  # the axes are labelled accordingly rather than as edition-level entries.
  xlab("Year of first publication") +
  ylab("ESTC works") +
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.05, 0.95),
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  guides(fill = guide_legend(reverse = TRUE))

# Share of works, by year of first publication, that are in EEBO-TCP, in EEBO
# (but not EEBO-TCP), or in neither. A work counts as covered if any of its
# records in df is covered.
# NOTE(review): df keeps publication_year >= 1474, but this filter drops works
# first published in 1474 itself — confirm whether `>=` was intended.
df %>%
  filter(first_publication_year > 1474) %>%
  group_by(work_id, first_publication_year) %>%
  summarize(
    in_eebo = any(in_eebo),
    in_eebo_tcp = any(in_eebo_tcp),
    .groups = "drop"
  ) %>%
  mutate(
    g = case_when(
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      TRUE ~ "Not in EEBO"
    )
  ) %>%
  ggplot(aes(
    x = first_publication_year,
    fill = fct_relevel(g, "Not in EEBO", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1, position = "fill") +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  xlab("Year of first publication") +
  ylab("Proportion of ESTC works") +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(0.94, 0.08),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  guides(fill = guide_legend(reverse = TRUE))

Document type coverage through time
# EEBO coverage per year for books and pamphlets, at edition and work level.
bind_rows(
  # Edition level: relabel the two types so they form their own series.
  df %>%
    mutate(
      group = "Editions",
      type = recode(
        type,
        "Book" = "Book (edition-level)",
        "Pamphlet" = "Pamphlet (edition-level)"
      )
    ),
  # Work level: one row per work/type/year. `certain` is TRUE only when a
  # certainly-dated record falls in the work's first publication year.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      certain = any(first_year_publication & certain),
      .groups = "drop"
    ) %>%
    mutate(group = "Works")
) %>%
  mutate(
    type = fct_relevel(
      type,
      "Pamphlet (edition-level)", "Book (edition-level)", "Pamphlet", "Book"
    )
  ) %>%
  filter(certain) %>%
  filter(!is.na(type), type != "In-between") %>%
  # Dropping only the in_eebo grouping level makes prop the covered share
  # within each year/type cell and tn the cell total.
  group_by(publication_year, type, in_eebo) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo) %>%
  ggplot(aes(x = publication_year, y = prop, color = type)) +
  geom_smooth(aes(weight = n, fill = type), span = 0.3) +
  # Gray shape-21 circles are sized by the cell total (tn); the coloured
  # points on top are sized by the covered count (n).
  geom_point(aes(size = tn), color = "gray", shape = 21) +
  geom_point(aes(size = n)) +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(0, 0),
    legend.box.just = "bottom",
    legend.position = c(0.05, 0.02),
    legend.background = element_blank(),
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(500, 2000, 3500), range = c(0.1, 8.0)) +
  labs(
    x = "Year",
    y = "EEBO coverage",
    color = NULL,
    size = NULL,
    shape = NULL,
    fill = NULL
  )
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

# EEBO-TCP coverage per year for books and pamphlets, at edition and work level.
bind_rows(
  # Edition level: relabel the two types so they form their own series.
  df %>%
    mutate(
      group = "Editions",
      type = recode(
        type,
        "Book" = "Book (edition-level)",
        "Pamphlet" = "Pamphlet (edition-level)"
      )
    ),
  # Work level: one row per work/type/year. `certain` is TRUE only when a
  # certainly-dated record falls in the work's first publication year.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      certain = any(first_year_publication & certain),
      .groups = "drop"
    ) %>%
    mutate(group = "Works")
) %>%
  mutate(
    type = fct_relevel(
      type,
      "Pamphlet (edition-level)", "Book (edition-level)", "Pamphlet", "Book"
    )
  ) %>%
  filter(certain) %>%
  filter(!is.na(type), type != "In-between") %>%
  # Dropping only the in_eebo_tcp grouping level makes prop the covered share
  # within each year/type cell and tn the cell total.
  group_by(publication_year, type, in_eebo_tcp) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo_tcp) %>%
  ggplot(aes(x = publication_year, y = prop, color = type)) +
  geom_smooth(aes(weight = n, fill = type), span = 0.3) +
  # Gray shape-21 circles are sized by the cell total (tn); the coloured
  # points on top are sized by the covered count (n).
  geom_point(aes(size = tn), color = "gray", shape = 21) +
  geom_point(aes(size = n)) +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(0, 0),
    legend.box.just = "bottom",
    legend.position = c(0.05, 0.02),
    legend.background = element_blank(),
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(500, 2000, 3500), range = c(0.1, 8.0)) +
  labs(
    x = "Year",
    y = "EEBO-TCP coverage",
    color = NULL,
    size = NULL,
    shape = NULL,
    fill = NULL
  )
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

Topical coverage EEBO-TCP vs EEBO
# Per genre, the share of works that have at least one EEBO row carrying an
# EEBO-TCP id. Works are resolved by joining genres -> EEBO -> ESTC.
eebo_genres %>%
  inner_join(eebo, by = "eebo_id") %>%
  inner_join(estc_core, by = "estc_id") %>%
  group_by(work_id, genre) %>%
  summarize(in_eebo_tcp = !all(is.na(eebo_tcp_id)), .groups = "drop") %>%
  count(genre, in_eebo_tcp) %>%
  group_by(genre) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  # Keep only the covered share, then order genres by it for plotting.
  filter(in_eebo_tcp) %>%
  mutate(genre = fct_reorder(genre, prop)) %>%
  ggplot(aes(x = genre, y = prop)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(x = "Genre", y = "Coverage in EEBO-TCP by work") +
  coord_flip()

# Per genre, the share of eebo_genres rows whose eebo_id has an EEBO-TCP id.
eebo_genres %>%
  left_join(
    # One row per EEBO id that has at least one EEBO-TCP id.
    eebo %>%
      filter(!is.na(eebo_tcp_id)) %>%
      distinct(eebo_id) %>%
      mutate(in_eebo_tcp = TRUE),
    by = c("eebo_id")
  ) %>%
  # Unmatched rows have in_eebo_tcp = NA; they still count towards the genre
  # total in sum(n) and are dropped by the filter below.
  count(genre, in_eebo_tcp) %>%
  group_by(genre) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  filter(in_eebo_tcp) %>%
  mutate(genre = fct_reorder(genre, prop)) %>%
  ggplot(aes(x = genre, y = prop)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  xlab("Genre") +
  ylab("Coverage in EEBO-TCP by edition") +
  coord_flip()

---
title: "EEBO/ESTC analysis"
output: 
  html_notebook: 
    code_folding: hide
    toc: yes
---

```{r setup,echo=F}
# Use the project root located by here::here() as knitr's root directory.
knitr::opts_knit$set(root.dir = here::here())
```


```{r,include=F}
library(tidyverse)
library(here)
pak::pkg_install("hsci-r/gghsci")
library(gghsci)
```

```{r,include=F}
# Format a number for inline reporting: fixed (non-scientific) notation with
# "," as the thousands separator. Returns a character vector.
p <- function(number) {
  format(number, big.mark = ",", scientific = FALSE)
}

# Format a proportion as a percentage string via scales::percent().
# `accuracy` is the rounding step, 0.01 by default.
pp <- function(percentage, accuracy = 0.01) {
  scales::percent(percentage, accuracy = accuracy)
}
```

```{r,include=F}
# Install the estcr package from the COMHIS/estcr repository, then attach it.
pak::pkg_install("COMHIS/estcr")
library(estcr)
# NOTE(review): load_estc() presumably loads the ESTC tables into the session
# (estc_core is used below without being defined in this notebook) — confirm
# against the estcr documentation.
load_estc()
# Remove estc_ecco_eebo_links from the workspace (presumably created by
# load_estc(); it is not referenced anywhere else in this notebook).
rm(estc_ecco_eebo_links)
```

```{r,include=F}
# Read the EEBO link table and its key-mapped companion. eebo_id is forced to
# character so that it is never guessed as a number.
eebo <- read_tsv(
  here("data/input/eebo/eebo.tsv"),
  col_types = cols(eebo_id = "c")
)
eebo_keymapped <- read_tsv(
  here("data/input/eebo/eebo-keymapped.tsv"),
  col_types = cols(eebo_id = "c")
)
# Rows of eebo that share (eebo_id, eebo_tcp_id) with a key-mapped row are
# dropped and the key-mapped rows are appended in their place.
eebo <- eebo %>%
  anti_join(eebo_keymapped, by = c("eebo_id", "eebo_tcp_id")) %>%
  bind_rows(eebo_keymapped)
rm(eebo_keymapped)
# EEBO rows that carry an EEBO-TCP id.
eebo_tcp <- eebo %>% filter(!is.na(eebo_tcp_id))
# Flag each ESTC record by whether its estc_id occurs in EEBO / EEBO-TCP.
estc_core <- estc_core %>%
  mutate(
    in_eebo = estc_id %in% eebo$estc_id,
    in_eebo_tcp = estc_id %in% eebo_tcp$estc_id
  )
# eebo_id is read as character here as well: the genre table is later joined
# to `eebo` by "eebo_id", and a guessed numeric key would not be join-compatible
# with the character key used above.
eebo_genres <- read_tsv(
  here("data/input/eebo/eebo-genres.tsv"),
  col_types = cols(eebo_id = "c")
)
```

```{r,include=F}
# Derive work-level and record-level attributes for every ESTC record.
estc_core <- estc_core %>%
  group_by(work_id) %>%
  mutate(
    # Earliest publication year among the records sharing this work_id.
    # NOTE(review): min() has no na.rm, so one NA year makes the whole work's
    # first_publication_year NA — confirm this is intended.
    first_publication_year = min(publication_year),
    # Number of records sharing this work_id.
    editions = n(),
    first_year_publication = first_publication_year == publication_year
  ) %>%
  ungroup() %>%
  mutate(
    # A missing first_year_publication matches no branch and stays NA.
    edition_type = case_when(
      editions == 1 ~ "Singular",
      first_year_publication ~ "First year edition",
      !first_year_publication ~ "Later edition"
    ),
    # Certain dating means none of the three uncertainty flags is set.
    certain = !(uncertain | circa | range),
    # Size classes by page count; branches are tried in order, so the second
    # one only sees pagecount > 32. A missing pagecount gives NA.
    type = case_when(
      pagecount <= 32 ~ "Pamphlet",
      pagecount < 128 ~ "In-between",
      pagecount >= 128 ~ "Book"
    )
  )

# Working dataset: records published from 1474 up to, but not including, 1700.
df <- estc_core %>%
  filter(publication_year >= 1474 & publication_year < 1700)
  
```


```{r,include=F}
library(assertthat)

# Counts reported in the Overview section. eebo_tcp is eebo restricted to rows
# with an EEBO-TCP id, so it stands in for that filter below.

# Distinct identifiers.
n_eebo_ids <- eebo %>%
  distinct(eebo_id) %>%
  nrow()
n_eebo_tcp_ids <- eebo_tcp %>%
  distinct(eebo_tcp_id) %>%
  nrow()
n_eebo_ids_in_eebo_tcp <- eebo_tcp %>%
  distinct(eebo_id) %>%
  nrow()

# Sanity check: no EEBO-TCP id is attached to more than one EEBO id.
assert_that(
  eebo_tcp %>%
    distinct(eebo_id, eebo_tcp_id) %>%
    count(eebo_tcp_id) %>%
    filter(n > 1) %>%
    nrow() == 0
)

# EEBO ids attached to more than one EEBO-TCP id.
n_eebo_ids_multimapped_to_eebo_tcp <- eebo_tcp %>%
  distinct(eebo_id, eebo_tcp_id) %>%
  count(eebo_id) %>%
  filter(n > 1) %>%
  nrow()

# Identifiers that appear on a row without an ESTC id.
n_eebo_ids_not_in_estc <- eebo %>%
  filter(is.na(estc_id)) %>%
  distinct(eebo_id) %>%
  nrow()
n_eebo_tcp_ids_not_in_estc <- eebo_tcp %>%
  filter(is.na(estc_id)) %>%
  distinct(eebo_tcp_id) %>%
  nrow()

# Identifiers linked to more than one distinct ESTC id.
n_eebo_ids_multimapped_to_estc <- eebo %>%
  filter(!is.na(estc_id)) %>%
  distinct(eebo_id, estc_id) %>%
  count(eebo_id) %>%
  filter(n > 1) %>%
  nrow()
n_eebo_tcp_ids_multimapped_to_estc <- eebo_tcp %>%
  filter(!is.na(estc_id)) %>%
  distinct(eebo_tcp_id, estc_id) %>%
  count(eebo_tcp_id) %>%
  filter(n > 1) %>%
  nrow()

# ESTC records with EEBO / EEBO-TCP representation, before (estc_core) and
# after (df) the publication-year filter.
n_estc_ids_with_eebo_ids <- nrow(filter(estc_core, in_eebo))
n_estc_ids_in_df_with_eebo_ids <- nrow(filter(df, in_eebo))
n_estc_ids_with_eebo_tcp_ids <- nrow(filter(estc_core, in_eebo_tcp))
n_estc_ids_in_df_with_eebo_tcp_ids <- nrow(filter(df, in_eebo_tcp))

# Size of the working dataset.
n_estc_ids_in_df <- nrow(df)
```

# Overview

Out of a total of `r p(n_eebo_ids)` EEBO records, `r p(n_eebo_ids_in_eebo_tcp)` (`r pp(n_eebo_ids_in_eebo_tcp/n_eebo_ids)`) are in EEBO-TCP (but `r p(n_eebo_ids_multimapped_to_eebo_tcp)` EEBO records have multiple TCP ids).

Out of the `r p(n_eebo_ids)` EEBO records, `r p(n_eebo_ids_not_in_estc)`  (`r pp(n_eebo_ids_not_in_estc/n_eebo_ids)`) could not be matched to an ESTC record and will be left out of the analysis. On the other hand, `r p(n_eebo_ids_multimapped_to_estc)` EEBO records (`r pp(n_eebo_ids_multimapped_to_estc/n_eebo_ids)`) were matched to more than one ESTC record, possibly causing bias.

Out of the `r p(n_eebo_tcp_ids)` EEBO-TCP records, `r p(n_eebo_tcp_ids_not_in_estc)` (`r pp(n_eebo_tcp_ids_not_in_estc/n_eebo_tcp_ids)`) could not be matched to an ESTC record and will be left out of the analysis. On the other hand, `r p(n_eebo_tcp_ids_multimapped_to_estc)` EEBO-TCP records (`r pp(n_eebo_tcp_ids_multimapped_to_estc/n_eebo_tcp_ids)`) were matched to more than one ESTC record, possibly causing bias.

In the analysis, only ESTC records with publication years in the half-open range [1474, 1700) (i.e. 1474–1699 inclusive) have been included. This results in the exclusion of `r p(n_estc_ids_with_eebo_ids-n_estc_ids_in_df_with_eebo_ids)` (`r pp((n_estc_ids_with_eebo_ids-n_estc_ids_in_df_with_eebo_ids)/n_estc_ids_with_eebo_ids)`) ESTC records that have representation in EEBO, possibly causing bias. `r p(n_estc_ids_with_eebo_tcp_ids-n_estc_ids_in_df_with_eebo_tcp_ids)` (`r pp((n_estc_ids_with_eebo_tcp_ids-n_estc_ids_in_df_with_eebo_tcp_ids)/n_estc_ids_with_eebo_tcp_ids)`) of the ESTC records with representation in EEBO-TCP are removed due to this filtering condition.

In the end, our working dataset consists of `r p(n_estc_ids_in_df)` ESTC records, of which `r p(n_estc_ids_in_df_with_eebo_ids)` (`r pp(n_estc_ids_in_df_with_eebo_ids/n_estc_ids_in_df)`) we estimate to have representation in EEBO, and `r p(n_estc_ids_in_df_with_eebo_tcp_ids)` (`r pp(n_estc_ids_in_df_with_eebo_tcp_ids/n_estc_ids_in_df)`) to have representation in EEBO-TCP.

# Publication type analysis

```{r}
library(ggbeeswarm)

# Yearly EEBO coverage by publication type: one point per
# (year, edition type, Book/Pamphlet) cell; red crossbars mark the medians.
bind_rows(
  # Edition level: every record in df, keeping its precomputed edition_type.
  df %>% mutate(group = "Editions"),
  # Work level: one row per work/type/year, covered if any of its records is.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      .groups = "drop"
    ) %>%
    mutate(
      group = "Works",
      edition_type = if_else(
        publication_year == first_publication_year,
        "First year work",
        "Later work"
      )
    )
) %>%
  mutate(
    edition_type = fct_relevel(
      edition_type,
      "Singular", "First year work", "Later work",
      "First year edition", "Later edition"
    )
  ) %>%
  filter(type %in% c("Book", "Pamphlet")) %>%
  # Count covered / uncovered rows per cell. Dropping only the last grouping
  # level (in_eebo) makes prop the covered share within each cell and tn the
  # cell total.
  group_by(publication_year, edition_type, group, type, in_eebo) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo) %>%
  ggplot(aes(x = type, y = prop, group = edition_type, color = edition_type)) +
  geom_quasirandom(aes(size = tn), dodge = 1.0) +
  stat_summary(
    aes(group = edition_type),
    geom = "crossbar",
    fun = median,
    fun.min = median,
    fun.max = median,
    position = position_dodge(width = 1.0),
    width = 0.5,
    color = "red"
  ) +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(0, 0),
    legend.position = c(0.02, 0.02),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(250, 500, 1500), range = c(0.1, 8.0)) +
  labs(
    x = NULL,
    y = "EEBO coverage",
    color = "Representation type",
    size = "Count"
  ) +
  guides(shape = "none")
```

```{r}
library(ggbeeswarm)

# Yearly EEBO-TCP coverage by publication type: one point per
# (year, edition type, Book/Pamphlet) cell; red crossbars mark the medians.
bind_rows(
  # Edition level: every record in df, keeping its precomputed edition_type.
  df %>% mutate(group = "Editions"),
  # Work level: one row per work/type/year, covered if any of its records is.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      .groups = "drop"
    ) %>%
    mutate(
      group = "Works",
      edition_type = if_else(
        publication_year == first_publication_year,
        "First year work",
        "Later work"
      )
    )
) %>%
  mutate(
    edition_type = fct_relevel(
      edition_type,
      "Singular", "First year work", "Later work",
      "First year edition", "Later edition"
    )
  ) %>%
  filter(type %in% c("Book", "Pamphlet")) %>%
  # Count covered / uncovered rows per cell. Dropping only the last grouping
  # level (in_eebo_tcp) makes prop the covered share within each cell and tn
  # the cell total.
  group_by(publication_year, edition_type, group, type, in_eebo_tcp) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo_tcp) %>%
  ggplot(aes(x = type, y = prop, group = edition_type, color = edition_type)) +
  geom_quasirandom(aes(size = tn), dodge = 1.0) +
  stat_summary(
    aes(group = edition_type),
    geom = "crossbar",
    fun = median,
    fun.min = median,
    fun.max = median,
    position = position_dodge(width = 1.0),
    width = 0.5,
    color = "red"
  ) +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(0, 0),
    legend.position = c(0.02, 0.02),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(250, 500, 1500), range = c(0.1, 8.0)) +
  labs(
    x = NULL,
    y = "EEBO-TCP coverage",
    color = "Representation type",
    size = "Count"
  ) +
  guides(shape = "none")
```

# Edition-level temporal overview

```{r,fig.width = 6, fig.height = 3}
# Stacked yearly counts of ESTC records. Each record gets the first matching
# label: uncertain dating, in EEBO-TCP, in EEBO (but not EEBO-TCP), or neither.
# The "neither" slice is labelled "ESTC total" because, with stacked bars, the
# top of the stack is the total number of ESTC records for the year.
df %>%
  mutate(
    g = case_when(
      !certain ~ "Uncertain dating",
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      TRUE ~ "ESTC total"
    )
  ) %>%
  ggplot(aes(
    x = publication_year,
    fill = fct_relevel(g, "Uncertain dating", "ESTC total", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1) +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(breaks = seq(0, 10000, by = 1000)) +
  xlab("Year") +
  ylab("ESTC entries") +
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.05, 0.95),
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  # Reverse the legend so its order matches the visual stacking order.
  guides(fill = guide_legend(reverse = TRUE))
```

```{r,fig.width = 6, fig.height = 3}
# Yearly share of certainly-dated ESTC records that are in EEBO-TCP, in EEBO
# (but not EEBO-TCP), or in neither; position = "fill" normalises each bar.
df %>%
  filter(certain) %>%
  mutate(
    g = case_when(
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      TRUE ~ "Not in EEBO"
    )
  ) %>%
  ggplot(aes(
    x = publication_year,
    fill = fct_relevel(g, "Not in EEBO", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1, position = "fill") +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  xlab("Year") +
  ylab("Proportion of ESTC entries") +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(0.94, 0.08),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  guides(fill = guide_legend(reverse = TRUE))
```
# Work-level temporal overview

```{r}
# Stacked counts of works by year of first publication. A work is flagged
# in_eebo / in_eebo_tcp if any of its records in df is; its dating counts as
# certain only if one of its first-year records is certainly dated.
# NOTE(review): df keeps publication_year >= 1474, but this filter drops works
# first published in 1474 itself — confirm whether `>=` was intended.
df %>%
  filter(first_publication_year > 1474) %>%
  group_by(work_id, first_publication_year) %>%
  summarize(
    in_eebo = any(in_eebo),
    in_eebo_tcp = any(in_eebo_tcp),
    certain = any(first_year_publication & certain),
    .groups = "drop"
  ) %>%
  mutate(
    g = case_when(
      !certain ~ "Uncertain dating",
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      TRUE ~ "ESTC total"
    )
  ) %>%
  ggplot(aes(
    x = first_publication_year,
    fill = fct_relevel(g, "Uncertain dating", "ESTC total", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1) +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(breaks = seq(0, 10000, by = 1000)) +
  # The bars count works (one row per work_id) by first publication year, so
  # the axes are labelled accordingly rather than as edition-level entries.
  xlab("Year of first publication") +
  ylab("ESTC works") +
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.05, 0.95),
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  guides(fill = guide_legend(reverse = TRUE))
```


```{r}
# Share of works, by year of first publication, that are in EEBO-TCP, in EEBO
# (but not EEBO-TCP), or in neither. A work counts as covered if any of its
# records in df is covered.
# NOTE(review): df keeps publication_year >= 1474, but this filter drops works
# first published in 1474 itself — confirm whether `>=` was intended.
df %>%
  filter(first_publication_year > 1474) %>%
  group_by(work_id, first_publication_year) %>%
  summarize(
    in_eebo = any(in_eebo),
    in_eebo_tcp = any(in_eebo_tcp),
    .groups = "drop"
  ) %>%
  mutate(
    g = case_when(
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      TRUE ~ "Not in EEBO"
    )
  ) %>%
  ggplot(aes(
    x = first_publication_year,
    fill = fct_relevel(g, "Not in EEBO", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1, position = "fill") +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  xlab("Year of first publication") +
  ylab("Proportion of ESTC works") +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(0.94, 0.08),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  guides(fill = guide_legend(reverse = TRUE))
```

# Document type coverage through time

```{r}
# EEBO coverage per year for books and pamphlets, at edition and work level.
bind_rows(
  # Edition level: relabel the two types so they form their own series.
  df %>%
    mutate(
      group = "Editions",
      type = recode(
        type,
        "Book" = "Book (edition-level)",
        "Pamphlet" = "Pamphlet (edition-level)"
      )
    ),
  # Work level: one row per work/type/year. `certain` is TRUE only when a
  # certainly-dated record falls in the work's first publication year.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      certain = any(first_year_publication & certain),
      .groups = "drop"
    ) %>%
    mutate(group = "Works")
) %>%
  mutate(
    type = fct_relevel(
      type,
      "Pamphlet (edition-level)", "Book (edition-level)", "Pamphlet", "Book"
    )
  ) %>%
  filter(certain) %>%
  filter(!is.na(type), type != "In-between") %>%
  # Dropping only the in_eebo grouping level makes prop the covered share
  # within each year/type cell and tn the cell total.
  group_by(publication_year, type, in_eebo) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo) %>%
  ggplot(aes(x = publication_year, y = prop, color = type)) +
  geom_smooth(aes(weight = n, fill = type), span = 0.3) +
  # Gray shape-21 circles are sized by the cell total (tn); the coloured
  # points on top are sized by the covered count (n).
  geom_point(aes(size = tn), color = "gray", shape = 21) +
  geom_point(aes(size = n)) +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(0, 0),
    legend.box.just = "bottom",
    legend.position = c(0.05, 0.02),
    legend.background = element_blank(),
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(500, 2000, 3500), range = c(0.1, 8.0)) +
  labs(
    x = "Year",
    y = "EEBO coverage",
    color = NULL,
    size = NULL,
    shape = NULL,
    fill = NULL
  )
```


```{r}
# EEBO-TCP coverage per year for books and pamphlets, at edition and work level.
bind_rows(
  # Edition level: relabel the two types so they form their own series.
  df %>%
    mutate(
      group = "Editions",
      type = recode(
        type,
        "Book" = "Book (edition-level)",
        "Pamphlet" = "Pamphlet (edition-level)"
      )
    ),
  # Work level: one row per work/type/year. `certain` is TRUE only when a
  # certainly-dated record falls in the work's first publication year.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      certain = any(first_year_publication & certain),
      .groups = "drop"
    ) %>%
    mutate(group = "Works")
) %>%
  mutate(
    type = fct_relevel(
      type,
      "Pamphlet (edition-level)", "Book (edition-level)", "Pamphlet", "Book"
    )
  ) %>%
  filter(certain) %>%
  filter(!is.na(type), type != "In-between") %>%
  # Dropping only the in_eebo_tcp grouping level makes prop the covered share
  # within each year/type cell and tn the cell total.
  group_by(publication_year, type, in_eebo_tcp) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo_tcp) %>%
  ggplot(aes(x = publication_year, y = prop, color = type)) +
  geom_smooth(aes(weight = n, fill = type), span = 0.3) +
  # Gray shape-21 circles are sized by the cell total (tn); the coloured
  # points on top are sized by the covered count (n).
  geom_point(aes(size = tn), color = "gray", shape = 21) +
  geom_point(aes(size = n)) +
  theme_hsci_discrete() +
  theme(
    legend.justification = c(0, 0),
    legend.box.just = "bottom",
    legend.position = c(0.05, 0.02),
    legend.background = element_blank(),
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(500, 2000, 3500), range = c(0.1, 8.0)) +
  labs(
    x = "Year",
    y = "EEBO-TCP coverage",
    color = NULL,
    size = NULL,
    shape = NULL,
    fill = NULL
  )
```

# Topical coverage EEBO-TCP vs EEBO

```{r}
# Per genre, the share of works that have at least one EEBO row carrying an
# EEBO-TCP id. Works are resolved by joining genres -> EEBO -> ESTC.
eebo_genres %>%
  inner_join(eebo, by = "eebo_id") %>%
  inner_join(estc_core, by = "estc_id") %>%
  group_by(work_id, genre) %>%
  summarize(in_eebo_tcp = !all(is.na(eebo_tcp_id)), .groups = "drop") %>%
  count(genre, in_eebo_tcp) %>%
  group_by(genre) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  # Keep only the covered share, then order genres by it for plotting.
  filter(in_eebo_tcp) %>%
  mutate(genre = fct_reorder(genre, prop)) %>%
  ggplot(aes(x = genre, y = prop)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(x = "Genre", y = "Coverage in EEBO-TCP by work") +
  coord_flip()
```

```{r}
# Per genre, the share of eebo_genres rows whose eebo_id has an EEBO-TCP id.
eebo_genres %>%
  left_join(
    # One row per EEBO id that has at least one EEBO-TCP id.
    eebo %>%
      filter(!is.na(eebo_tcp_id)) %>%
      distinct(eebo_id) %>%
      mutate(in_eebo_tcp = TRUE),
    by = c("eebo_id")
  ) %>%
  # Unmatched rows have in_eebo_tcp = NA; they still count towards the genre
  # total in sum(n) and are dropped by the filter below.
  count(genre, in_eebo_tcp) %>%
  group_by(genre) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  filter(in_eebo_tcp) %>%
  mutate(genre = fct_reorder(genre, prop)) %>%
  ggplot(aes(x = genre, y = prop)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  xlab("Genre") +
  ylab("Coverage in EEBO-TCP by edition") +
  coord_flip()
```

