# Distinct clust_id counts available per (year, col_name, pl_name).
# Each left-hand row is paired with every works_of_interest row whose year is
# not earlier than its own; counts are taken per work and then de-duplicated
# across works. compute_a() is a project helper (presumably it materialises
# the lazy query -- confirm in common_basis.R).
total_verse_clusts_by_col_by_year <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(works_of_interest, join_by(year <= year)) %>%
  group_by(work, year = year.y, col_name, pl_name) %>%
  summarise(
    available_verse_clusts = n_distinct(clust_id),
    .groups = "drop"
  ) %>%
  distinct(year, col_name, pl_name, available_verse_clusts) %>%
  compute_a()
# Per (col_name, pl_name, book, work, year): distinct clust_id values left
# after joining poems with work_verse_clusts, set against the clusters
# available for that col_name/pl_name/year. `prop` is shared / available.
d <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(col_name, pl_name, book, work, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(total_verse_clusts_by_col_by_year) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no work at all.
  complete(nesting(year, work), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(work)) %>%
  mutate(prop = shared_verse_clusts / available_verse_clusts) %>%
  arrange(year, work, col_name, pl_name)

# Same measure as `d`, but aggregated per book instead of per work; the book
# column is renamed to `work` at the end so downstream code can treat both
# tables alike.
d2 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(col_name, pl_name, book, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(total_verse_clusts_by_col_by_year) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no book at all.
  complete(nesting(year, book), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(book)) %>%
  mutate(prop = shared_verse_clusts / available_verse_clusts) %>%
  arrange(year, book, col_name, pl_name) %>%
  rename(work = book)

Proportion of parts of the Kalevala for which we identify possible oral sources

# Per (book, work, year): distinct clust_id values matched in the poem data,
# divided by the number of work_verse_clusts rows of that work.
d3 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(book, work, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(
    work_verse_clusts %>%
      count(work, name = "work_verse_clusts")
  ) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no work at all.
  complete(nesting(year, work), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(work)) %>%
  mutate(prop = shared_verse_clusts / work_verse_clusts) %>%
  arrange(year, work)
# Horizontal bar chart of `prop` per work for the book "Kalevala (1849)".
# No fill aesthetic is mapped in this chain, so the legend settings at the end
# appear to be inert.
d3 %>%
  filter(book == "Kalevala (1849)") %>%
  ggplot(aes(x = work, y = prop)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(ncol = 1))

Composition of the Kalevala in terms of regions

As a whole

# Per (pl_name, book, year): distinct clust_id values matched in the poem
# data, divided by the number of distinct clust_id values of the whole book.
d4 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(pl_name, book, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(
    # Denominator: distinct clusters per book.
    work_verse_clusts %>%
      inner_join(works_of_interest) %>%
      group_by(book) %>%
      summarise(work_verse_clusts = n_distinct(clust_id))
  ) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no book at all.
  complete(nesting(year, book), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(book)) %>%
  mutate(prop = shared_verse_clusts / work_verse_clusts) %>%
  arrange(year, book, pl_name)
# Horizontal bar chart of `prop` per pl_name for the book "Kalevala (1849)".
# No fill aesthetic is mapped in this chain, so the legend settings at the end
# appear to be inert.
d4 %>%
  filter(book == "Kalevala (1849)") %>%
  ggplot(aes(x = pl_name, y = prop)) +
  geom_col(position = "dodge") +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(ncol = 1))

By part

# Per (pl_name, book, work, year): distinct clust_id values matched in the
# poem data, divided by the number of work_verse_clusts rows of that work.
d5 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(pl_name, book, work, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(
    work_verse_clusts %>%
      count(work, name = "work_verse_clusts")
  ) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no work at all.
  complete(nesting(year, work), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(work)) %>%
  mutate(prop = shared_verse_clusts / work_verse_clusts) %>%
  arrange(year, work, pl_name)
# Per-part regional composition of "Kalevala (1849)": dodged horizontal bars
# of `prop` per work, one bar per pl_name.
# Reads d5, which is grouped by pl_name as well as work. d3 is grouped by
# book/work/year only and carries no pl_name column, so it cannot supply the
# fill aesthetic used here.
d5 %>%
  filter(book == "Kalevala (1849)") %>%
  ggplot(aes(x = work, y = prop, fill = pl_name)) +
  geom_col(position = "dodge") +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  # Legend below the plot, stacked in a single column.
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(ncol = 1))

Oral sources as a graph

As a proportion of the available material

# Dodged horizontal bars of `prop` for each "col_name, pl_name" label, one bar
# per work. Only (col_name, work) groups with at least 100 available clusters
# in total are kept.
d2 %>%
  filter(sum(available_verse_clusts) >= 100, .by = c(col_name, work)) %>%
  # Label order follows col_name, then pl_name (reversed for the flipped axis).
  arrange(col_name, pl_name) %>%
  mutate(
    col_name = fct_rev(fct_inorder(str_c(col_name, ", ", pl_name)))
  ) %>%
  # Work levels follow year, then work.
  arrange(year, work) %>%
  mutate(work = fct_inorder(work)) %>%
  complete(col_name, work, fill = list(prop = 0)) %>%
  ggplot(aes(x = col_name, y = prop, fill = work)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme_hsci_discrete() +
  theme(legend.position = "top")

In absolute numbers

# Dodged horizontal bars of shared_verse_clusts for each "col_name, pl_name"
# label, one bar per work. Only (col_name, work) groups with at least 100
# available clusters in total are kept.
d2 %>%
  filter(sum(available_verse_clusts) >= 100, .by = c(col_name, work)) %>%
  # Label order follows col_name, then pl_name (reversed for the flipped axis).
  arrange(col_name, pl_name) %>%
  mutate(
    col_name = fct_rev(fct_inorder(str_c(col_name, ", ", pl_name)))
  ) %>%
  # Work levels follow year, then work.
  arrange(year, work) %>%
  mutate(work = fct_inorder(work)) %>%
  complete(
    col_name, work,
    fill = list(prop = 0, shared_verse_clusts = 0)
  ) %>%
  ggplot(aes(x = col_name, y = shared_verse_clusts, fill = work)) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_hsci_discrete() +
  theme(legend.position = "top")

Regional balance compared to a neutral baseline expectation

# Per (pl_name, book, year): distinct shared clust_id values next to
#   available_verse_clusts - clusters available for that pl_name and year,
#   total_verse_clusts     - clusters available for that year overall.
# expected_prop = available / total; prop = shared / available.
# The book column is renamed to `work` at the end.
d6 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(pl_name, book, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(total_verse_clusts_by_pl_by_year) %>%
  right_join(
    total_verse_clusts_by_year %>%
      rename(total_verse_clusts = available_verse_clusts)
  ) %>%
  mutate(expected_prop = available_verse_clusts / total_verse_clusts) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no book at all.
  complete(nesting(year, book), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(book)) %>%
  mutate(prop = shared_verse_clusts / available_verse_clusts) %>%
  arrange(year, book, pl_name) %>%
  rename(work = book)
# Point plot of prop / expected_prop per pl_name for "Kalevala (1849)".
d6 %>%
  filter(work == "Kalevala (1849)") %>%
  arrange(year, work) %>%
  mutate(
    work = fct_inorder(work),
    pl_name = fct_rev(fct_inorder(pl_name))
  ) %>%
  complete(
    pl_name, work,
    fill = list(prop = 0, expected_prop = 0, shared_verse_clusts = 0)
  ) %>%
  mutate(ratio = prop / expected_prop) %>%
  ggplot(aes(x = pl_name, y = ratio)) +
  geom_point() +
#  scale_y_continuous(labels=function(x) { x %>% map_chr(function(lab) { if (lab < 0) { str_c(-1/lab,":1") } else {str_c("1:",lab)}})}) +
  coord_flip() +
  theme_hsci_discrete() +
  theme(legend.position = "top")

Collector + Regional balance compared to a neutral baseline expectation

# Point plot of a symmetric over/under-use score per "work, col_name" label,
# coloured by pl_name.
# Keeps only (col_name, work) groups with >= 100 available clusters, >= 20
# shared clusters and more than one row.
d2 %>%
  filter(sum(available_verse_clusts) >= 100, .by = c(col_name, work)) %>%
  filter(sum(shared_verse_clusts) >= 20, .by = c(col_name, work)) %>%
  filter(n() > 1, .by = c(col_name, work)) %>%
  arrange(col_name, pl_name) %>%
  # Share of each row within its (col_name, pl_name) total of available clusters.
  mutate(
    expected_prop = available_verse_clusts / sum(available_verse_clusts),
    .by = c(col_name, pl_name)
  ) %>%
  arrange(year, work) %>%
  mutate(
    work = fct_inorder(work),
    col_name = fct_rev(fct_inorder(str_c(work, ", ", col_name)))
  ) %>%
  complete(
    col_name, work,
    fill = list(prop = 0, expected_prop = 0, shared_verse_clusts = 0)
  ) %>%
  mutate(ratio = prop / expected_prop) %>%
  # Re-centre on 0: ratios >= 1 map to ratio - 1, ratios < 1 to -(1 / ratio) + 1.
  mutate(ratio = if_else(ratio >= 1, ratio - 1, -1 / ratio + 1)) %>%
  ggplot(aes(x = col_name, y = ratio, color = pl_name)) +
  geom_point() +
#  scale_y_continuous(labels=function(x) { x %>% map_chr(function(lab) { if (lab < 0) { str_c(-1/lab,":1") } else {str_c("1:",lab)}})}) +
  coord_flip() +
  theme_hsci_discrete() +
  theme(legend.position = "top")

Oral sources as table

# gt table of d2, grouped by work with one row per "col_name, pl_name" label.
# Only (col_name, work) groups with at least 100 available clusters are shown.
d2 %>%
  filter(sum(available_verse_clusts) >= 100, .by = c(col_name, work)) %>%
  mutate(
    col_name = fct_rev(fct_inorder(str_c(col_name, ", ", pl_name)))
  ) %>%
  select(-c(year, pl_name)) %>%
  gt(groupname_col = "work", rowname_col = "col_name") %>%
  fmt_integer(shared_verse_clusts) %>%
  fmt_percent(prop) %>%
  cols_label(
    shared_verse_clusts = "Shared verse clusters",
    available_verse_clusts = "Available verse clusters",
    prop = "Proportion of available verse clusters used"
  )
  
  
---
title: "Oral Sources of the Kalevala"
date: "`r Sys.Date()`"
output: 
  html_notebook:
    code_folding: hide
    toc: yes
  html_document:
    code_folding: hide
    toc: yes
---

```{r setup, include=FALSE}
# Chunk defaults: suppress messages; figures at 300 dpi, retina factor 2,
# width 8.
knitr::opts_chunk$set(
  message = FALSE,
  dpi = 300,
  fig.retina = 2,
  fig.width = 8
)
# Load the project's shared setup script.
source(here::here("src/common_basis.R"))
# Put tmap into its "plot" mode.
tmap_mode("plot")
# Save `plot` as <dir>/<basename>.png, .svg and .pdf (the PDF through the
# cairo_pdf device) and return knitr::include_graphics() for the SVG so the
# figure is embedded in the rendered document.
#
# dir, basename  output directory and file stem
# plot           plot object handed to ggsave()
# width, height, units, dpi  forwarded to every ggsave() call; dpi is also
#                            passed to include_graphics()
save_plots <- function(dir,basename,plot,width=6,height=5,units="in",dpi=300) {
  stem <- glue("{dir}/{basename}")
  svg_file <- glue("{stem}.svg")
  ggsave(
    glue("{stem}.png"), plot,
    width = width, height = height, dpi = dpi, units = units
  )
  ggsave(
    svg_file, plot,
    width = width, height = height, dpi = dpi, units = units
  )
  ggsave(
    glue("{stem}.pdf"), plot,
    width = width, height = height, dpi = dpi, units = units,
    device = cairo_pdf
  )
  knitr::include_graphics(svg_file, dpi = dpi)
}
```

```{r}
# Distinct clust_id counts available per (year, col_name, pl_name).
# Each left-hand row is paired with every works_of_interest row whose year is
# not earlier than its own; counts are taken per work and then de-duplicated
# across works. compute_a() is a project helper (presumably it materialises
# the lazy query -- confirm in common_basis.R).
total_verse_clusts_by_col_by_year <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(works_of_interest, join_by(year <= year)) %>%
  group_by(work, year = year.y, col_name, pl_name) %>%
  summarise(
    available_verse_clusts = n_distinct(clust_id),
    .groups = "drop"
  ) %>%
  distinct(year, col_name, pl_name, available_verse_clusts) %>%
  compute_a()

# Distinct clust_id counts available per (year, pl_name): same construction as
# the per-col_name table, without the col_name dimension.
total_verse_clusts_by_pl_by_year <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(works_of_interest, join_by(year <= year)) %>%
  group_by(work, year = year.y, pl_name) %>%
  summarise(
    available_verse_clusts = n_distinct(clust_id),
    .groups = "drop"
  ) %>%
  distinct(year, pl_name, available_verse_clusts) %>%
  compute_a()

# Distinct clust_id counts available per year, over all col_name and pl_name
# values.
total_verse_clusts_by_year <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(works_of_interest, join_by(year <= year)) %>%
  group_by(work, year = year.y) %>%
  summarise(
    available_verse_clusts = n_distinct(clust_id),
    .groups = "drop"
  ) %>%
  distinct(year, available_verse_clusts) %>%
  compute_a()
```


```{r}
# Per (col_name, pl_name, book, work, year): distinct clust_id values left
# after joining poems with work_verse_clusts, set against the clusters
# available for that col_name/pl_name/year. `prop` is shared / available.
d <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(col_name, pl_name, book, work, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(total_verse_clusts_by_col_by_year) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no work at all.
  complete(nesting(year, work), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(work)) %>%
  mutate(prop = shared_verse_clusts / available_verse_clusts) %>%
  arrange(year, work, col_name, pl_name)

# Same measure as `d`, but aggregated per book instead of per work; the book
# column is renamed to `work` at the end so downstream code can treat both
# tables alike.
d2 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(col_name, pl_name, book, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(total_verse_clusts_by_col_by_year) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no book at all.
  complete(nesting(year, book), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(book)) %>%
  mutate(prop = shared_verse_clusts / available_verse_clusts) %>%
  arrange(year, book, col_name, pl_name) %>%
  rename(work = book)
```

## Proportion of parts of the Kalevala for which we identify possible oral sources

```{r}
# Per (book, work, year): distinct clust_id values matched in the poem data,
# divided by the number of work_verse_clusts rows of that work.
d3 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(book, work, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(
    work_verse_clusts %>%
      count(work, name = "work_verse_clusts")
  ) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no work at all.
  complete(nesting(year, work), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(work)) %>%
  mutate(prop = shared_verse_clusts / work_verse_clusts) %>%
  arrange(year, work)
```

```{r,fig.height=10,fig.width=8}
# Horizontal bar chart of `prop` per work for the book "Kalevala (1849)".
# No fill aesthetic is mapped in this chain, so the legend settings at the end
# appear to be inert.
d3 %>%
  filter(book == "Kalevala (1849)") %>%
  ggplot(aes(x = work, y = prop)) +
  geom_col() +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(ncol = 1))
```
## Composition of the Kalevala in terms of regions

### As a whole

```{r}
# Per (pl_name, book, year): distinct clust_id values matched in the poem
# data, divided by the number of distinct clust_id values of the whole book.
d4 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(pl_name, book, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(
    # Denominator: distinct clusters per book.
    work_verse_clusts %>%
      inner_join(works_of_interest) %>%
      group_by(book) %>%
      summarise(work_verse_clusts = n_distinct(clust_id))
  ) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no book at all.
  complete(nesting(year, book), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(book)) %>%
  mutate(prop = shared_verse_clusts / work_verse_clusts) %>%
  arrange(year, book, pl_name)
```

```{r,fig.height=10,fig.width=8}
# Horizontal bar chart of `prop` per pl_name for the book "Kalevala (1849)".
# No fill aesthetic is mapped in this chain, so the legend settings at the end
# appear to be inert.
d4 %>%
  filter(book == "Kalevala (1849)") %>%
  ggplot(aes(x = pl_name, y = prop)) +
  geom_col(position = "dodge") +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(ncol = 1))
```

### By part

```{r}
# Per (pl_name, book, work, year): distinct clust_id values matched in the
# poem data, divided by the number of work_verse_clusts rows of that work.
d5 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(pl_name, book, work, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(
    work_verse_clusts %>%
      count(work, name = "work_verse_clusts")
  ) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no work at all.
  complete(nesting(year, work), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(work)) %>%
  mutate(prop = shared_verse_clusts / work_verse_clusts) %>%
  arrange(year, work, pl_name)
```

```{r,fig.height=10,fig.width=8}
# Per-part regional composition of "Kalevala (1849)": dodged horizontal bars
# of `prop` per work, one bar per pl_name.
d5 %>%
  filter(book == "Kalevala (1849)") %>%
  ggplot(aes(x = work, y = prop, fill = pl_name)) +
  geom_col(position = "dodge") +
  theme_hsci_discrete() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  # Legend below the plot, stacked in a single column.
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(ncol = 1))
```

## Oral sources as a graph

### As a proportion of the available material

```{r,fig.height=100,fig.width=10}
# Dodged horizontal bars of `prop` for each "col_name, pl_name" label, one bar
# per work. Only (col_name, work) groups with at least 100 available clusters
# in total are kept.
d2 %>%
  filter(sum(available_verse_clusts) >= 100, .by = c(col_name, work)) %>%
  # Label order follows col_name, then pl_name (reversed for the flipped axis).
  arrange(col_name, pl_name) %>%
  mutate(
    col_name = fct_rev(fct_inorder(str_c(col_name, ", ", pl_name)))
  ) %>%
  # Work levels follow year, then work.
  arrange(year, work) %>%
  mutate(work = fct_inorder(work)) %>%
  complete(col_name, work, fill = list(prop = 0)) %>%
  ggplot(aes(x = col_name, y = prop, fill = work)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme_hsci_discrete() +
  theme(legend.position = "top")
```

### In absolute numbers

```{r,fig.height=100,fig.width=10}
# Dodged horizontal bars of shared_verse_clusts for each "col_name, pl_name"
# label, one bar per work. Only (col_name, work) groups with at least 100
# available clusters in total are kept.
d2 %>%
  filter(sum(available_verse_clusts) >= 100, .by = c(col_name, work)) %>%
  # Label order follows col_name, then pl_name (reversed for the flipped axis).
  arrange(col_name, pl_name) %>%
  mutate(
    col_name = fct_rev(fct_inorder(str_c(col_name, ", ", pl_name)))
  ) %>%
  # Work levels follow year, then work.
  arrange(year, work) %>%
  mutate(work = fct_inorder(work)) %>%
  complete(
    col_name, work,
    fill = list(prop = 0, shared_verse_clusts = 0)
  ) %>%
  ggplot(aes(x = col_name, y = shared_verse_clusts, fill = work)) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_hsci_discrete() +
  theme(legend.position = "top")
```

### Regional balance compared to a neutral baseline expectation

```{r}
# Per (pl_name, book, year): distinct shared clust_id values next to
#   available_verse_clusts - clusters available for that pl_name and year,
#   total_verse_clusts     - clusters available for that year overall.
# expected_prop = available / total; prop = shared / available.
# The book column is renamed to `work` at the end.
d6 <- poems_to_cols %>%
  inner_join(poem_verse_clusts) %>%
  inner_join(work_verse_clusts) %>%
  inner_join(works_of_interest, join_by(work, year <= year)) %>%
  group_by(pl_name, book, year = year.y) %>%
  summarise(shared_verse_clusts = n_distinct(clust_id), .groups = "drop") %>%
  right_join(total_verse_clusts_by_pl_by_year) %>%
  right_join(
    total_verse_clusts_by_year %>%
      rename(total_verse_clusts = available_verse_clusts)
  ) %>%
  mutate(expected_prop = available_verse_clusts / total_verse_clusts) %>%
  collect() %>%
  # Zero-fill missing counts, then drop rows that have no book at all.
  complete(nesting(year, book), fill = list(shared_verse_clusts = 0)) %>%
  filter(!is.na(book)) %>%
  mutate(prop = shared_verse_clusts / available_verse_clusts) %>%
  arrange(year, book, pl_name) %>%
  rename(work = book)
```


```{r}
# Point plot of prop / expected_prop per pl_name for "Kalevala (1849)".
d6 %>%
  filter(work == "Kalevala (1849)") %>%
  arrange(year, work) %>%
  mutate(
    work = fct_inorder(work),
    pl_name = fct_rev(fct_inorder(pl_name))
  ) %>%
  complete(
    pl_name, work,
    fill = list(prop = 0, expected_prop = 0, shared_verse_clusts = 0)
  ) %>%
  mutate(ratio = prop / expected_prop) %>%
  ggplot(aes(x = pl_name, y = ratio)) +
  geom_point() +
#  scale_y_continuous(labels=function(x) { x %>% map_chr(function(lab) { if (lab < 0) { str_c(-1/lab,":1") } else {str_c("1:",lab)}})}) +
  coord_flip() +
  theme_hsci_discrete() +
  theme(legend.position = "top")
```


### Collector + Regional balance compared to a neutral baseline expectation

```{r,fig.height=100,fig.width=10}
# Point plot of a symmetric over/under-use score per "work, col_name" label,
# coloured by pl_name.
# Keeps only (col_name, work) groups with >= 100 available clusters, >= 20
# shared clusters and more than one row.
d2 %>%
  filter(sum(available_verse_clusts) >= 100, .by = c(col_name, work)) %>%
  filter(sum(shared_verse_clusts) >= 20, .by = c(col_name, work)) %>%
  filter(n() > 1, .by = c(col_name, work)) %>%
  arrange(col_name, pl_name) %>%
  # Share of each row within its (col_name, pl_name) total of available clusters.
  mutate(
    expected_prop = available_verse_clusts / sum(available_verse_clusts),
    .by = c(col_name, pl_name)
  ) %>%
  arrange(year, work) %>%
  mutate(
    work = fct_inorder(work),
    col_name = fct_rev(fct_inorder(str_c(work, ", ", col_name)))
  ) %>%
  complete(
    col_name, work,
    fill = list(prop = 0, expected_prop = 0, shared_verse_clusts = 0)
  ) %>%
  mutate(ratio = prop / expected_prop) %>%
  # Re-centre on 0: ratios >= 1 map to ratio - 1, ratios < 1 to -(1 / ratio) + 1.
  mutate(ratio = if_else(ratio >= 1, ratio - 1, -1 / ratio + 1)) %>%
  ggplot(aes(x = col_name, y = ratio, color = pl_name)) +
  geom_point() +
#  scale_y_continuous(labels=function(x) { x %>% map_chr(function(lab) { if (lab < 0) { str_c(-1/lab,":1") } else {str_c("1:",lab)}})}) +
  coord_flip() +
  theme_hsci_discrete() +
  theme(legend.position = "top")
```

## Oral sources as table

```{r}
# gt table of d2, grouped by work with one row per "col_name, pl_name" label.
# Only (col_name, work) groups with at least 100 available clusters are shown.
d2 %>%
  filter(sum(available_verse_clusts) >= 100, .by = c(col_name, work)) %>%
  mutate(
    col_name = fct_rev(fct_inorder(str_c(col_name, ", ", pl_name)))
  ) %>%
  select(-c(year, pl_name)) %>%
  gt(groupname_col = "work", rowname_col = "col_name") %>%
  fmt_integer(shared_verse_clusts) %>%
  fmt_percent(prop) %>%
  cols_label(
    shared_verse_clusts = "Shared verse clusters",
    available_verse_clusts = "Available verse clusters",
    prop = "Proportion of available verse clusters used"
  )
  
  
```
