library(assertthat)
library(here)
here() starts at /Users/jiemakel/tyo/estc_bnf_analysis
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5     ✓ purrr   0.3.4
✓ tibble  3.1.6     ✓ dplyr   1.0.8
✓ tidyr   1.2.0     ✓ stringr 1.4.0
✓ readr   2.1.2     ✓ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter()    masks stats::filter()
x tibble::has_name() masks assertthat::has_name()
x dplyr::lag()       masks stats::lag()
library(glue)
# Install with devtools::install_github("hsci-r/gghsci") / pak::pkg_install("hsci-r/gghsci")
library(gghsci)
# Requires estcr and bnfr. See https://github.com/COMHIS/estcr/blob/main/README.md#install and https://github.com/COMHIS/bnfr/blob/main/README.md#installation for instructions on installing them.

source(here("code/analysis/load_data_all_bnf_matches.R"))
Warning: One or more parsing issues, see `problems()` for details
Joining, by = "bnf_author_id"
Rows: 5186 Columns: 9
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): actor_id, actor_name_primary, estc_id, publoc, work_id
dbl (4): pubyear, ncbel2_5_iv_b_French, ncbel2_4_iv_5_children, ncbel2_4_iii_minor

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining, by = "actor_id"
Joining, by = "actor_id"
Joining, by = "actor_id"
Joining, by = "actor_id"
Joining, by = "actor_id"
Joining, by = "actor_id"
Joining, by = "actor_id"
Joining, by = "bnf_author_id"
Joining, by = "bnf_author_id"
Joining, by = "bnf_record_id"
Joining, by = "bnf_author_id"
Joining, by = "bnf_record_id"
Joining, by = "bnf_record_id"

Proportion of ESTC that are French translations

# Yearly share of ESTC records whose `french` flag is set (the section heading
# calls these French translations), drawn as a line. Only records that are not
# flagged `uncertain` and were published after 1550 and up to and including
# 1800 are considered.
estc_core %>%
  filter(
    !uncertain,
    publication_year > 1550,
    publication_year <= 1800
  ) %>%
  group_by(publication_year) %>%
  # One row per year: records with `french` set divided by all records.
  summarise(prop = sum(french) / n()) %>%
  ggplot(mapping = aes(x = publication_year, y = prop)) +
  geom_line() +
  scale_y_continuous(labels = scales::percent) +
  theme_hsci()

Reprint count comparison

# Distribution of the number of editions per work, separately for each value
# of `french`. The y axis (log-scaled, labelled as percentages) is the share of
# works in that `french` group having exactly `editions` editions.
estc_core %>%
  # Keep only records attached to a real work identifier.
  filter(
    !is.na(work_id),
    work_id != "no_work_id"
  ) %>%
  # Records per (french, work) pair = number of editions of that work.
  count(french, work_id, name = "editions") %>%
  # Works with 100 or more editions are left out of the comparison.
  filter(editions < 100) %>%
  # Number of works (`n`) for each edition count within each group.
  count(french, editions) %>%
  group_by(french) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(mapping = aes(x = editions, y = prop, color = french)) +
  scale_y_log10(labels = scales::percent) +
  geom_line() +
  theme_hsci_discrete()

Document type comparison

# Share of each document type within the records of each `french` group,
# shown as side-by-side bars (one bar per `french` value and document type).
estc_core %>%
  count(french, document_type) %>%
  group_by(french) %>%
  # Proportions are relative to the total of the record's own `french` group.
  mutate(prop = n / sum(n)) %>%
  ggplot(mapping = aes(x = document_type, y = prop, fill = french)) +
  scale_y_continuous(labels = scales::percent) +
  # geom_col() draws bar heights straight from `prop`.
  geom_col(position = "dodge") +
  theme_hsci_discrete()

Subject topic comparison

# Share of each projected ECCO module within the records of each `french`
# group, as horizontal side-by-side bars.
estc_core %>%
  # Attach a module only where its `max_prop` exceeds 0.5; records without
  # such a match keep an NA module and are still counted below.
  left_join(
    filter(estc_projected_ecco_modules, max_prop > 0.5),
    by = "work_id"
  ) %>%
  count(french, projected_ecco_module) %>%
  group_by(french) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(
    mapping = aes(x = projected_ecco_module, y = prop, fill = french)
  ) +
  scale_y_continuous(labels = scales::percent) +
  # geom_col() draws bar heights straight from `prop`.
  geom_col(position = "dodge") +
  coord_flip() +
  theme_hsci_discrete()

Subject topic and document type comparison

# Share of each projected ECCO module among the records of every
# (document type, `french`) combination, as horizontal side-by-side bars with
# one facet per document type.
estc_core %>%
  # Attach a module only where its `max_prop` exceeds 0.5; records without
  # such a match keep an NA module and are still counted below.
  left_join(
    filter(estc_projected_ecco_modules, max_prop > 0.5),
    by = "work_id"
  ) %>%
  count(french, document_type, projected_ecco_module) %>%
  # Proportions sum to one within each document type / `french` pair.
  group_by(document_type, french) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(
    mapping = aes(x = projected_ecco_module, y = prop, fill = french)
  ) +
  scale_y_continuous(labels = scales::percent) +
  # geom_col() draws bar heights straight from `prop`.
  geom_col(position = "dodge") +
  coord_flip() +
  theme_hsci_discrete() +
  facet_wrap(~ document_type)

Subject topics through time

# Share of records with `french` set, per projected ECCO module and
# publication decade, where modules are attached to records through their
# linked actors instead of through the record's own work:
#   1. collect the (module, actor) pairs seen on records where the actor is
#      linked with `actor_role_author`;
#   2. spread each pair to every record linked to that actor (any role);
#   3. compute the `french` share per module and decade.
estc_core %>%
  # Attach a module only where its `max_prop` exceeds 0.5.
  left_join(
    estc_projected_ecco_modules %>% filter(max_prop > 0.5),
    by = "work_id"
  ) %>%
  # Attach author-role actor links; records without one get an NA actor_id.
  left_join(
    estc_actor_links %>% filter(actor_role_author),
    by = "estc_id"
  ) %>%
  filter(!is.na(actor_id)) %>%
  group_by(projected_ecco_module, actor_id) %>%
  summarize(max_prop = max(max_prop), .groups = "drop") %>%
  # Rows whose max_prop is NA are dropped here, because NA > 0.5 is not TRUE.
  # NOTE(review): presumably these are the pairs with no module match in the
  # first join -- confirm against estc_projected_ecco_modules.
  # summarize() already returns one row per (module, actor) pair, so the
  # former distinct() over these same three columns was a no-op and is gone.
  filter(max_prop > 0.5) %>%
  # Fan out to every record linked to the actor, whatever the role.
  left_join(estc_actor_links, by = "actor_id") %>%
  # full_join also keeps records that received no module via any actor; they
  # end up with an NA projected_ecco_module.
  full_join(estc_core, by = "estc_id") %>%
  filter(
    !uncertain,
    publication_year >= 1600,
    publication_year <= 1800
  ) %>%
  group_by(projected_ecco_module, publication_decade) %>%
  summarize(frenchprop = sum(french) / n(), .groups = "drop") %>%
  ggplot(
    aes(
      x = publication_decade,
      y = frenchprop,
      color = projected_ecco_module
    )
  ) +
  scale_y_continuous(labels = scales::percent) +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  # The legend is hidden; each module already has its own facet.
  geom_line(show.legend = FALSE) +
  theme_hsci_discrete() +
  facet_wrap(~ projected_ecco_module, scales = "free_y")
Warning: Removed 21 row(s) containing missing values (geom_path).

# For every publication decade and `french` value, the share of that group's
# records falling into each projected ECCO module. One facet per module (each
# with its own y range), one line per `french` value.
estc_core %>%
  # Records not flagged `uncertain`, published 1600-1800 inclusive.
  filter(
    !uncertain,
    publication_year >= 1600,
    publication_year <= 1800
  ) %>%
  # Attach a module only where its `max_prop` exceeds 0.5; records without
  # such a match keep an NA module and are still counted below.
  left_join(
    filter(estc_projected_ecco_modules, max_prop > 0.5),
    by = "work_id"
  ) %>%
  count(french, publication_decade, projected_ecco_module) %>%
  # Proportions sum to one within each decade / `french` pair.
  group_by(publication_decade, french) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(mapping = aes(x = publication_decade, y = prop, color = french)) +
  scale_y_continuous(labels = scales::percent) +
  geom_line() +
  theme_hsci_discrete() +
  facet_wrap(~ projected_ecco_module, scales = "free_y")

LS0tCnRpdGxlOiAiRVNUQy9CTkYgYW5hbHlzaXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyIHNldHVwLGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2tuaXQkc2V0KHJvb3QuZGlyID0gaGVyZTo6aGVyZSgpKSAKYGBgCgpgYGB7ciBsb2FkX2xpYnJhcmllc30KbGlicmFyeShhc3NlcnR0aGF0KQpsaWJyYXJ5KGhlcmUpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGdsdWUpCiMgSW5zdGFsbCB3aXRoIGRldnRvb2xzOjppbnN0YWxsX2dpdGh1YigiaHNjaS1yL2dnaHNjaSIpIC8gcGFrOjpwa2dfaW5zdGFsbCgiaHNjaS1yL2dnaHNjaSIpCmxpYnJhcnkoZ2doc2NpKQpgYGAKCmBgYHtyIGxvYWRfZGF0YX0KIyBSZXF1aXJlcyBlc3RjciBhbmQgYm5mci4gU2VlIGh0dHBzOi8vZ2l0aHViLmNvbS9DT01ISVMvZXN0Y3IvYmxvYi9tYWluL1JFQURNRS5tZCNpbnN0YWxsYW5kIGh0dHBzOi8vZ2l0aHViLmNvbS9DT01ISVMvYm5mci9ibG9iL21haW4vUkVBRE1FLm1kI2luc3RhbGxhdGlvbiBmb3IgaW5zdGFsbGluZyB0aGVtCgpzb3VyY2UoaGVyZSgiY29kZS9hbmFseXNpcy9sb2FkX2RhdGFfYWxsX2JuZl9tYXRjaGVzLlIiKSkKYGBgCgojIFByb3BvcnRpb24gb2YgRVNUQyB0aGF0IGFyZSBGcmVuY2ggdHJhbnNsYXRpb25zCgpgYGB7ciBmcmVuY2hfcHJvcG9ydGlvbl90aHJvdWdoX3RpbWV9CmVzdGNfY29yZSAlPiUgCiAgZmlsdGVyKCF1bmNlcnRhaW4scHVibGljYXRpb25feWVhcj4xNTUwLHB1YmxpY2F0aW9uX3llYXI8PTE4MDApICU+JSAKICBncm91cF9ieShwdWJsaWNhdGlvbl95ZWFyKSAlPiUgCiAgc3VtbWFyaXplKHByb3A9c3VtKGZyZW5jaCkvbigpKSAlPiUKICBnZ3Bsb3QoYWVzKHg9cHVibGljYXRpb25feWVhcix5PXByb3ApKSArIAogIHNjYWxlX3lfY29udGludW91cyhsYWJlbHMgPSBzY2FsZXM6OnBlcmNlbnQpICsKICBnZW9tX2xpbmUoKSArIAogIHRoZW1lX2hzY2koKQpgYGAKCiMgUmVwcmludCBjb3VudCBjb21wYXJpc29uCgpgYGB7ciByZXByaW50X2NvdW50X2NvbXBhcmlzb259CmVzdGNfY29yZSAlPiUKICBmaWx0ZXIoIWlzLm5hKHdvcmtfaWQpLCB3b3JrX2lkICE9ICJub193b3JrX2lkIikgJT4lCiAgY291bnQoZnJlbmNoLCB3b3JrX2lkLCBuYW1lID0gImVkaXRpb25zIikgJT4lCiAgZmlsdGVyKGVkaXRpb25zIDwgMTAwKSAlPiUKICBjb3VudChmcmVuY2gsIGVkaXRpb25zKSAlPiUKICBncm91cF9ieShmcmVuY2gpICU+JQogIG11dGF0ZShwcm9wID0gbiAvIHN1bShuKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gZWRpdGlvbnMsIHkgPSBwcm9wLCBjb2xvciA9IGZyZW5jaCkpICsKICBnZW9tX2xpbmUoKSArCiAgc2NhbGVfeV9sb2cxMChsYWJlbHMgPSBzY2FsZXM6OnBlcmNlbnQpICsKICB0aGVtZV9oc2NpX2Rpc2NyZXRlKCkKYGBgCgojIERvY3VtZW50IHR5cGUgY29tcGFyaXNvbgoKYGBge3IgZG9jdW1lbnRfdHlwZV9jb21wYXJpc29ufQplc3RjX2NvcmUgJT4lCiAgY291bnQoZnJlbmNo
LCBkb2N1bWVudF90eXBlKSAlPiUKICBncm91cF9ieShmcmVuY2gpICU+JQogIG11dGF0ZShwcm9wID0gbiAvIHN1bShuKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gZG9jdW1lbnRfdHlwZSwgeSA9IHByb3AsIGZpbGwgPSBmcmVuY2gpKSArCiAgZ2VvbV9iYXIoc3RhdCA9ICdpZGVudGl0eScsIHBvc2l0aW9uID0gJ2RvZGdlJykgKwogIHNjYWxlX3lfY29udGludW91cyhsYWJlbHMgPSBzY2FsZXM6OnBlcmNlbnQpICsKICB0aGVtZV9oc2NpX2Rpc2NyZXRlKCkKYGBgCgojIFN1YmplY3QgdG9waWMgY29tcGFyaXNvbgoKYGBge3Igc3ViamVjdF90b3BpY19jb21wYXJpc29ufQplc3RjX2NvcmUgJT4lCiAgbGVmdF9qb2luKGVzdGNfcHJvamVjdGVkX2VjY29fbW9kdWxlcyAlPiUgZmlsdGVyKG1heF9wcm9wID4gMC41KSwKICAgICAgICAgICAgYnkgPSBjKCJ3b3JrX2lkIikpICU+JQogIGNvdW50KGZyZW5jaCwgcHJvamVjdGVkX2VjY29fbW9kdWxlKSAlPiUKICBncm91cF9ieShmcmVuY2gpICU+JQogIG11dGF0ZShwcm9wID0gbiAvIHN1bShuKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gcHJvamVjdGVkX2VjY29fbW9kdWxlLCB5ID0gcHJvcCwgZmlsbCA9IGZyZW5jaCkpICsKICBnZW9tX2JhcihzdGF0ID0gJ2lkZW50aXR5JywgcG9zaXRpb24gPSAnZG9kZ2UnKSArCiAgc2NhbGVfeV9jb250aW51b3VzKGxhYmVscyA9IHNjYWxlczo6cGVyY2VudCkgKwogIGNvb3JkX2ZsaXAoKSArCiAgdGhlbWVfaHNjaV9kaXNjcmV0ZSgpCmBgYAoKIyBTdWJqZWN0IHRvcGljIGFuZCBkb2N1bWVudCB0eXBlIGNvbXBhcmlzb24KCmBgYHtyIHN1YmplY3RfdG9waWNfZG9jdW1lbnRfdHlwZV9jb21wYXJpc29ufQplc3RjX2NvcmUgJT4lCiAgbGVmdF9qb2luKGVzdGNfcHJvamVjdGVkX2VjY29fbW9kdWxlcyAlPiUgZmlsdGVyKG1heF9wcm9wID4gMC41KSwKICAgICAgICAgICAgYnkgPSBjKCJ3b3JrX2lkIikpICU+JQogIGNvdW50KGZyZW5jaCwgZG9jdW1lbnRfdHlwZSxwcm9qZWN0ZWRfZWNjb19tb2R1bGUpICU+JQogIGdyb3VwX2J5KGRvY3VtZW50X3R5cGUsIGZyZW5jaCkgJT4lCiAgbXV0YXRlKHByb3AgPSBuIC8gc3VtKG4pKSAlPiUKICBnZ3Bsb3QoYWVzKHggPSBwcm9qZWN0ZWRfZWNjb19tb2R1bGUsIHkgPSBwcm9wLCBmaWxsID0gZnJlbmNoKSkgKwogIGdlb21fYmFyKHN0YXQgPSAnaWRlbnRpdHknLCBwb3NpdGlvbiA9ICdkb2RnZScpICsKICBzY2FsZV95X2NvbnRpbnVvdXMobGFiZWxzID0gc2NhbGVzOjpwZXJjZW50KSArCiAgY29vcmRfZmxpcCgpICsKICB0aGVtZV9oc2NpX2Rpc2NyZXRlKCkgKwogIGZhY2V0X3dyYXAofiBkb2N1bWVudF90eXBlKQpgYGAKCiMgU3ViamVjdCB0b3BpY3MgdGhyb3VnaCB0aW1lCgpgYGB7cn0KZXN0Y19jb3JlICU+JSAKICBsZWZ0X2pvaW4oZXN0Y19wcm9qZWN0ZWRfZWNjb19tb2R1bGVzICU+JSBmaWx0ZXIobWF4X3Byb3AgPiAwLjUpLGJ5PWMoIndvcmtfaWQiKSkgJT4lCiAgbGVmdF9qb2luKGVzdGNfYWN0
b3JfbGlua3MgJT4lIGZpbHRlcihhY3Rvcl9yb2xlX2F1dGhvciksYnk9YygiZXN0Y19pZCIpKSAlPiUKICBmaWx0ZXIoIWlzLm5hKGFjdG9yX2lkKSkgJT4lCiAgZ3JvdXBfYnkocHJvamVjdGVkX2VjY29fbW9kdWxlLGFjdG9yX2lkKSAlPiUgCiAgc3VtbWFyaXplKG1heF9wcm9wPW1heChtYXhfcHJvcCksLmdyb3Vwcz0iZHJvcCIpICU+JQogIGZpbHRlcihtYXhfcHJvcCA+IDAuNSkgJT4lCiAgZGlzdGluY3QocHJvamVjdGVkX2VjY29fbW9kdWxlLGFjdG9yX2lkLG1heF9wcm9wKSAlPiUKICBsZWZ0X2pvaW4oZXN0Y19hY3Rvcl9saW5rcyxieT1jKCJhY3Rvcl9pZCIpKSAlPiUKICBmdWxsX2pvaW4oZXN0Y19jb3JlLGJ5PWMoImVzdGNfaWQiKSkgJT4lCiAgZmlsdGVyKCF1bmNlcnRhaW4sIHB1YmxpY2F0aW9uX3llYXIgPj0gMTYwMCxwdWJsaWNhdGlvbl95ZWFyIDw9IDE4MDApICU+JQogIGdyb3VwX2J5KHByb2plY3RlZF9lY2NvX21vZHVsZSxwdWJsaWNhdGlvbl9kZWNhZGUpICU+JQogIHN1bW1hcml6ZShmcmVuY2hwcm9wPXN1bShmcmVuY2gpL24oKSwuZ3JvdXBzPSJkcm9wIikgJT4lCiAgZ2dwbG90KGFlcyh4PXB1YmxpY2F0aW9uX2RlY2FkZSx5PWZyZW5jaHByb3AsY29sb3I9cHJvamVjdGVkX2VjY29fbW9kdWxlKSkgKwogIHNjYWxlX3lfY29udGludW91cyhsYWJlbHMgPSBzY2FsZXM6OnBlcmNlbnQpICsKICBnZW9tX2xpbmUoc2hvdy5sZWdlbmQgPSBGKSArCiAgdGhlbWVfaHNjaV9kaXNjcmV0ZSgpICsgCiAgZmFjZXRfd3JhcCh+cHJvamVjdGVkX2VjY29fbW9kdWxlLHNjYWxlcz0iZnJlZV95IikKYGBgCgpgYGB7ciBzdWJqZWN0X3RvcGljc190aHJvdWdoX3RpbWV9CmVzdGNfY29yZSAlPiUKICBmaWx0ZXIoIXVuY2VydGFpbiwgcHVibGljYXRpb25feWVhciA+PSAxNjAwLCBwdWJsaWNhdGlvbl95ZWFyIDw9IDE4MDApICU+JQogIGxlZnRfam9pbihlc3RjX3Byb2plY3RlZF9lY2NvX21vZHVsZXMgJT4lIGZpbHRlcihtYXhfcHJvcCA+IDAuNSksCiAgICAgICAgICAgIGJ5ID0gYygid29ya19pZCIpKSAlPiUKICBjb3VudChmcmVuY2gsIHB1YmxpY2F0aW9uX2RlY2FkZSwgcHJvamVjdGVkX2VjY29fbW9kdWxlKSAlPiUKICBncm91cF9ieShwdWJsaWNhdGlvbl9kZWNhZGUsIGZyZW5jaCkgJT4lCiAgbXV0YXRlKHByb3AgPSBuIC8gc3VtKG4pKSAlPiUKICBnZ3Bsb3QoYWVzKHggPSBwdWJsaWNhdGlvbl9kZWNhZGUsIHkgPSBwcm9wLCBjb2xvciA9IGZyZW5jaCkpICsKICBnZW9tX2xpbmUoKSArCiAgc2NhbGVfeV9jb250aW51b3VzKGxhYmVscyA9IHNjYWxlczo6cGVyY2VudCkgKwogIHRoZW1lX2hzY2lfZGlzY3JldGUoKSArCiAgZmFjZXRfd3JhcCggfiBwcm9qZWN0ZWRfZWNjb19tb2R1bGUsc2NhbGVzPSJmcmVlX3kiKQpgYGAKCgo=