Overview
Out of a total of 132,846 EEBO records, 60,227 (45.34%) are in EEBO-TCP (note that 66 EEBO records have multiple TCP ids).
Out of the 132,846 EEBO records, 6,802 (5.12%) could not be matched to an ESTC record and will be left out of the analysis. On the other hand, 7,373 EEBO records (5.55%) were matched to more than one ESTC record, possibly causing bias.
Out of the 60,327 EEBO-TCP records, 1,143 (1.89%) could not be matched to an ESTC record and will be left out of the analysis. On the other hand, 3,269 EEBO-TCP records (5.42%) were matched to more than one ESTC record, possibly causing bias.
In the analysis, only ESTC records with publication years in the range [1474,1700] have been included. This results in the exclusion of 4,862 (4.17%) ESTC records that have representation in EEBO, possibly causing bias. 2,119 (3.41%) of the ESTC records with representation in EEBO-TCP are removed due to this filtering condition.
In the end, our working dataset consists of 132,412 ESTC records, of which 111,816 (84.45%) we estimate to have representation in EEBO, and 60,095 (45.38%) to have representation in EEBO-TCP.
Publication type analysis
library(ggbeeswarm)

# EEBO coverage by publication type (Book / Pamphlet) and representation type.
# One point per publication year within each type / representation-type
# combination; point size is the total count behind that proportion and the
# red crossbar marks the median of the yearly proportions.
bind_rows(
  # Rows of df as they are, tagged as the "Editions" group.
  df %>% mutate(group = "Editions"),
  # Work-level rows: one row per work, type and publication year. A work is
  # counted as covered when any of its rows in that year is covered.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      .groups = "drop"
    ) %>%
    mutate(
      group = "Works",
      edition_type = if_else(
        publication_year == first_publication_year,
        "First year work",
        "Later work"
      )
    )
) %>%
  mutate(
    edition_type = fct_relevel(
      edition_type,
      "Singular", "First year work", "Later work",
      "First year edition", "Later edition"
    )
  ) %>%
  filter(type %in% c("Book", "Pamphlet")) %>%
  group_by(publication_year, edition_type, group, type, in_eebo) %>%
  # tally() drops the last grouping variable (in_eebo), so the sums below run
  # over both in_eebo values within each year / edition type / group / type.
  tally() %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  # Keep only the covered share; prop is now the EEBO coverage.
  filter(in_eebo) %>%
  ggplot(aes(x = type, y = prop, group = edition_type, color = edition_type)) +
  # `dodge.width` is spelled out in full; the previous `dodge = 1.0` only
  # worked through partial argument matching.
  geom_quasirandom(aes(size = tn), dodge.width = 1.0) +
  stat_summary(
    aes(group = edition_type),
    position = position_dodge(width = 1.0),
    fun = median, fun.min = median, fun.max = median,
    geom = "crossbar", width = 0.5, color = "red"
  ) +
  theme_hsci_discrete() +
  xlab(NULL) +
  ylab("EEBO coverage") +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(250, 500, 1500), range = c(0.1, 8.0)) +
  theme(
    legend.justification = c(0, 0),
    legend.position = c(0.02, 0.02),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  labs(color = "Representation type", size = "Count") +
  guides(shape = "none")

library(ggbeeswarm)

# EEBO-TCP coverage by publication type (Book / Pamphlet) and representation
# type. One point per publication year within each type / representation-type
# combination; point size is the total count behind that proportion and the
# red crossbar marks the median of the yearly proportions.
bind_rows(
  # Rows of df as they are, tagged as the "Editions" group.
  df %>% mutate(group = "Editions"),
  # Work-level rows: one row per work, type and publication year. A work is
  # counted as covered when any of its rows in that year is covered.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      .groups = "drop"
    ) %>%
    mutate(
      group = "Works",
      edition_type = if_else(
        publication_year == first_publication_year,
        "First year work",
        "Later work"
      )
    )
) %>%
  mutate(
    edition_type = fct_relevel(
      edition_type,
      "Singular", "First year work", "Later work",
      "First year edition", "Later edition"
    )
  ) %>%
  filter(type %in% c("Book", "Pamphlet")) %>%
  group_by(publication_year, edition_type, group, type, in_eebo_tcp) %>%
  # tally() drops the last grouping variable (in_eebo_tcp), so the sums below
  # run over both in_eebo_tcp values within each remaining group.
  tally() %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  # Keep only the covered share; prop is now the EEBO-TCP coverage.
  filter(in_eebo_tcp) %>%
  ggplot(aes(x = type, y = prop, group = edition_type, color = edition_type)) +
  # `dodge.width` is spelled out in full; the previous `dodge = 1.0` only
  # worked through partial argument matching.
  geom_quasirandom(aes(size = tn), dodge.width = 1.0) +
  stat_summary(
    aes(group = edition_type),
    position = position_dodge(width = 1.0),
    fun = median, fun.min = median, fun.max = median,
    geom = "crossbar", width = 0.5, color = "red"
  ) +
  theme_hsci_discrete() +
  xlab(NULL) +
  ylab("EEBO-TCP coverage") +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    breaks = seq(0, 1, by = 0.05)
  ) +
  scale_size(breaks = c(250, 500, 1500), range = c(0.1, 8.0)) +
  theme(
    legend.justification = c(0, 0),
    legend.position = c(0.02, 0.02),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  ) +
  labs(color = "Representation type", size = "Count") +
  guides(shape = "none")

Edition-level temporal overview
# Stacked yearly counts of df rows, split by dating certainty and coverage.
df %>%
  mutate(
    # case_when() uses the first matching branch, so uncertain dating takes
    # precedence over coverage, and EEBO-TCP takes precedence over EEBO.
    g = case_when(
      !certain ~ "Uncertain dating",
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      # TRUE rather than T: T is an ordinary variable that can be reassigned.
      TRUE ~ "ESTC total"
    )
  ) %>%
  ggplot(aes(
    x = publication_year,
    fill = fct_relevel(g, "Uncertain dating", "ESTC total", "In EEBO", "In EEBO-TCP")
  )) +
  # width = 1 gives one bar per publication year.
  geom_bar(width = 1) +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(breaks = seq(0, 10000, by = 1000)) +
  xlab("Year") +
  ylab("ESTC entries") +
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.05, 0.95),
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  # Reverse the legend order relative to the factor level order.
  guides(fill = guide_legend(reverse = TRUE))

# Yearly share of certainly-dated df rows that are in EEBO-TCP, in EEBO only,
# or not in EEBO at all.
df %>%
  filter(certain) %>%
  mutate(
    # First matching branch wins, so EEBO-TCP takes precedence over EEBO.
    g = case_when(
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      # TRUE rather than T: T is an ordinary variable that can be reassigned.
      TRUE ~ "Not in EEBO"
    )
  ) %>%
  ggplot(aes(
    x = publication_year,
    fill = fct_relevel(g, "Not in EEBO", "In EEBO", "In EEBO-TCP")
  )) +
  # position = "fill" rescales every yearly bar to height 1 (proportions).
  geom_bar(width = 1, position = "fill") +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  xlab("Year") +
  ylab("Proportion of ESTC entries") +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(0.94, 0.08),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  # Reverse the legend order relative to the factor level order.
  guides(fill = guide_legend(reverse = TRUE))

Work-level temporal overview
# Stacked counts of works per year of first publication, split by dating
# certainty and coverage.
df %>%
  # NOTE(review): the strict `>` drops first_publication_year == 1474, while
  # the overview text gives the analysed range as [1474,1700] - confirm this
  # is intended.
  filter(first_publication_year > 1474) %>%
  group_by(work_id, first_publication_year) %>%
  # Collapse to one row per work: covered if any of its rows is covered, and
  # certain only if some first-year publication of the work is certain.
  summarize(
    in_eebo = any(in_eebo),
    in_eebo_tcp = any(in_eebo_tcp),
    certain = any(first_year_publication & certain),
    .groups = "drop"
  ) %>%
  mutate(
    # First matching branch wins, so uncertain dating takes precedence over
    # coverage, and EEBO-TCP takes precedence over EEBO.
    g = case_when(
      !certain ~ "Uncertain dating",
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      # TRUE rather than T: T is an ordinary variable that can be reassigned.
      TRUE ~ "ESTC total"
    )
  ) %>%
  ggplot(aes(
    x = first_publication_year,
    fill = fct_relevel(g, "Uncertain dating", "ESTC total", "In EEBO", "In EEBO-TCP")
  )) +
  geom_bar(width = 1) +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(breaks = seq(0, 10000, by = 1000)) +
  # The bars count works by first publication year, so the axes are labelled
  # accordingly (matching the work-level proportion plot).
  xlab("Year of first publication") +
  ylab("ESTC works") +
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.05, 0.95),
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  # Reverse the legend order relative to the factor level order.
  guides(fill = guide_legend(reverse = TRUE))

# Share of works per year of first publication that are in EEBO-TCP, in EEBO
# only, or not in EEBO at all.
df %>%
  # NOTE(review): the strict `>` drops first_publication_year == 1474, while
  # the overview text gives the analysed range as [1474,1700] - confirm this
  # is intended.
  filter(first_publication_year > 1474) %>%
  group_by(work_id, first_publication_year) %>%
  # Collapse to one row per work: covered if any of its rows is covered.
  summarize(
    in_eebo = any(in_eebo),
    in_eebo_tcp = any(in_eebo_tcp),
    .groups = "drop"
  ) %>%
  mutate(
    # First matching branch wins, so EEBO-TCP takes precedence over EEBO.
    g = case_when(
      in_eebo_tcp ~ "In EEBO-TCP",
      in_eebo ~ "In EEBO",
      # TRUE rather than T: T is an ordinary variable that can be reassigned.
      TRUE ~ "Not in EEBO"
    )
  ) %>%
  ggplot(aes(
    x = first_publication_year,
    fill = fct_relevel(g, "Not in EEBO", "In EEBO", "In EEBO-TCP")
  )) +
  # position = "fill" rescales every yearly bar to height 1 (proportions).
  geom_bar(width = 1, position = "fill") +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  xlab("Year of first publication") +
  ylab("Proportion of ESTC works") +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(0.94, 0.08),
    legend.key = element_blank()
  ) +
  labs(fill = NULL) +
  # Reverse the legend order relative to the factor level order.
  guides(fill = guide_legend(reverse = TRUE))

Document type coverage through time
# Yearly EEBO coverage for books and pamphlets, shown both per df row
# (the "(edition-level)" series) and per work.
bind_rows(
  # Rows of df as they are; the type labels get an "(edition-level)" suffix so
  # they stay distinct from the work-level series.
  mutate(
    df,
    group = "Editions",
    type = recode(
      type,
      Book = "Book (edition-level)",
      Pamphlet = "Pamphlet (edition-level)"
    )
  ),
  # Work-level series: one row per work, type and publication year. `certain`
  # is TRUE only when a certain first-year publication is among the rows.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      certain = any(first_year_publication & certain),
      .groups = "drop"
    ) %>%
    mutate(group = "Works")
) %>%
  mutate(
    type = fct_relevel(
      type,
      "Pamphlet (edition-level)", "Book (edition-level)", "Pamphlet", "Book"
    )
  ) %>%
  filter(certain, !is.na(type), type != "In-between") %>%
  group_by(publication_year, type, in_eebo) %>%
  # Count rows per cell and keep the grouping by year and type, so the sums
  # below span both in_eebo values.
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo) %>%
  ggplot(aes(x = publication_year, y = prop, color = type)) +
  geom_smooth(mapping = aes(fill = type, weight = n), span = 0.3) +
  # Gray ring sized by the yearly total, filled point sized by the covered count.
  geom_point(aes(size = tn), shape = 21, color = "gray") +
  geom_point(aes(size = n)) +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.05),
    labels = scales::percent_format(accuracy = 1)
  ) +
  theme(
    legend.position = c(0.05, 0.02),
    legend.justification = c(0, 0),
    legend.box = "horizontal",
    legend.box.just = "bottom",
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(
    x = "Year",
    y = "EEBO coverage",
    color = NULL, size = NULL, shape = NULL, fill = NULL
  ) +
  scale_size(breaks = c(500, 2000, 3500), range = c(0.1, 8.0))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Yearly EEBO-TCP coverage for books and pamphlets, shown both per df row
# (the "(edition-level)" series) and per work.
bind_rows(
  # Rows of df as they are; the type labels get an "(edition-level)" suffix so
  # they stay distinct from the work-level series.
  mutate(
    df,
    group = "Editions",
    type = recode(
      type,
      Book = "Book (edition-level)",
      Pamphlet = "Pamphlet (edition-level)"
    )
  ),
  # Work-level series: one row per work, type and publication year. `certain`
  # is TRUE only when a certain first-year publication is among the rows.
  df %>%
    group_by(work_id, type, first_publication_year, publication_year) %>%
    summarize(
      in_eebo = any(in_eebo),
      in_eebo_tcp = any(in_eebo_tcp),
      certain = any(first_year_publication & certain),
      .groups = "drop"
    ) %>%
    mutate(group = "Works")
) %>%
  mutate(
    type = fct_relevel(
      type,
      "Pamphlet (edition-level)", "Book (edition-level)", "Pamphlet", "Book"
    )
  ) %>%
  filter(certain, !is.na(type), type != "In-between") %>%
  group_by(publication_year, type, in_eebo_tcp) %>%
  # Count rows per cell and keep the grouping by year and type, so the sums
  # below span both in_eebo_tcp values.
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n), tn = sum(n)) %>%
  filter(in_eebo_tcp) %>%
  ggplot(aes(x = publication_year, y = prop, color = type)) +
  geom_smooth(mapping = aes(fill = type, weight = n), span = 0.3) +
  # Gray ring sized by the yearly total, filled point sized by the covered count.
  geom_point(aes(size = tn), shape = 21, color = "gray") +
  geom_point(aes(size = n)) +
  theme_hsci_discrete() +
  scale_x_continuous(breaks = seq(1000, 2000, by = 20)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.05),
    labels = scales::percent_format(accuracy = 1)
  ) +
  theme(
    legend.position = c(0.05, 0.02),
    legend.justification = c(0, 0),
    legend.box = "horizontal",
    legend.box.just = "bottom",
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  labs(
    x = "Year",
    y = "EEBO-TCP coverage",
    color = NULL, size = NULL, shape = NULL, fill = NULL
  ) +
  scale_size(breaks = c(500, 2000, 3500), range = c(0.1, 8.0))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

Topical coverage EEBO-TCP vs EEBO
# Per genre, the share of works that have at least one non-missing
# eebo_tcp_id among their joined rows.
# NOTE(review): work_id presumably comes from estc_core and eebo_tcp_id from
# eebo - confirm against the table definitions.
eebo_genres %>%
  inner_join(eebo, by = "eebo_id") %>%
  inner_join(estc_core, by = "estc_id") %>%
  group_by(work_id, genre) %>%
  # A work/genre pair is covered unless every eebo_tcp_id is missing.
  summarize(in_eebo_tcp = !all(is.na(eebo_tcp_id)), .groups = "drop") %>%
  count(genre, in_eebo_tcp) %>%
  group_by(genre) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  # Keep the covered share only and order genres by it for the bar chart.
  filter(in_eebo_tcp) %>%
  mutate(genre = fct_reorder(genre, prop)) %>%
  ggplot(aes(x = genre, y = prop)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(x = "Genre", y = "Coverage in EEBO-TCP by work") +
  coord_flip()

# Per genre, the share of eebo_genres rows whose eebo_id has an EEBO-TCP id.
eebo_genres %>%
  left_join(
    # One row per eebo_id that has at least one non-missing eebo_tcp_id;
    # distinct() keeps the join from duplicating eebo_genres rows.
    eebo %>%
      filter(!is.na(eebo_tcp_id)) %>%
      distinct(eebo_id) %>%
      mutate(in_eebo_tcp = TRUE),
    by = "eebo_id"
  ) %>%
  # Rows without a match come back as NA from the left join; make them an
  # explicit FALSE rather than relying on filter() dropping NA later.
  mutate(in_eebo_tcp = coalesce(in_eebo_tcp, FALSE)) %>%
  count(genre, in_eebo_tcp) %>%
  group_by(genre) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup() %>%
  # Keep the covered share only and order genres by it for the bar chart.
  filter(in_eebo_tcp) %>%
  mutate(genre = fct_reorder(genre, prop)) %>%
  ggplot(aes(x = genre, y = prop)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  xlab("Genre") +
  ylab("Coverage in EEBO-TCP by edition") +
  coord_flip()

