library(tidyverse)
library(broom)
library(skimr)
library(scales)
library(ggrepel)
library(kableExtra)
library(stringr)
df_lr1 <- read_rds("data/df_lr1.rds")


# Import the literary form dataset (fiction vs non-fiction)

df_litform <- read_csv("data/litform.csv") %>%
  mutate(instance_hrid = as.character(instance_hrid),
         litform = as.factor(litform))

df_withlitform <- df_lr1 %>%
  left_join(df_litform) %>%
  mutate(litform_cat = ifelse(litform == "0", "nonfiction", "fiction")) %>%
  mutate(litform_cat = as.factor(litform_cat)) %>%
  filter(!is.na(litform))
# model function

get_lm_model <- function(df) {
  model <- lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = df)
  return(model)
}


get_lm_model <- function(df) {
  model <- lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = df)
  return(model)
}


get_predictions <- function(df, model) {
  df_augment <- augment(model) %>%
    select(sum_total_of_charge_count:.resid) %>%
    mutate(across(.fitted:.resid, round, 3))
  
  id_col <- df %>%
    select(instance_hrid)
  
  predictions <- bind_cols(id_col, df_augment)
  
  return(predictions)
}

get_tidymodel <- function(model) {
  tidymodel <- model %>%
    tidy() %>%
    mutate(across(estimate:p.value, round, 3)) %>%
    select(-statistic)
  
  return(tidymodel)
}
data_eng <- df_lr1 %>%
  select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
  filter(language == "English")

model_eng <- data_eng %>%
  get_lm_model()

predict_eng <- get_predictions(data_eng, model_eng)
# data overview for cdexec
df_lr1 %>%
  select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
  glimpse()
## Rows: 1,105,330
## Columns: 5
## $ instance_hrid             <chr> "10000001", "10000002", "10000003", "1000000~
## $ language                  <chr> "English", "English", "English", "English", ~
## $ age                       <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ total_5xx_6xx             <dbl> 9, 6, 7, 9, 13, 18, 10, 11, 12, 9, 11, 17, 6~
## $ sum_total_of_charge_count <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0,~
df_lr1 %>%
  count(language) %>%
  arrange(language)
# eng lang histogram of charges

data_eng %>%
  filter(sum_total_of_charge_count > 0) %>%
  sample_n(100) %>%
  ggplot(aes(sample = sum_total_of_charge_count  )) +
  geom_qq(alpha = 0.2) +
  geom_qq_line()

# example for cdexec slide

get_tidymodel(model_eng)
predict_eng %>%
  filter(instance_hrid == "4213747")
0.892 + (17 * 0.114) + (32 * 0.032) 
## [1] 3.854
data_tha <- df_lr1 %>%
  select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
  filter(language == "Thai")

model_tha <- data_tha %>%
  get_lm_model()

predict_that <- get_predictions(data_tha, model_tha)

model_tha %>%
  tidy() %>%
    mutate(across(estimate:p.value, round, 3))
data_tha %>%
  skim()
Data summary
Name Piped data
Number of rows 24042
Number of columns 5
_______________________
Column type frequency:
character 2
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
instance_hrid 0 1 7 8 0 24040 0
language 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 10.16 4.79 1 7 11 14 18 <U+2585><U+2583><U+2587><U+2587><U+2586>
total_5xx_6xx 0 1 5.91 3.10 0 4 6 8 26 <U+2587><U+2587><U+2581><U+2581><U+2581>
sum_total_of_charge_count 0 1 0.15 0.59 0 0 0 0 36 <U+2587><U+2581><U+2581><U+2581><U+2581>
# old and rich records

s1 <- predict_eng %>%
  filter(age > 15, total_5xx_6xx > 20, sum_total_of_charge_count > 0) %>%
  sample_n(100)

s1 %>%
  filter(between(.resid, -0.4, 0.4))
# new and poor record

s2 <- predict_eng %>%
  filter(age < 3, total_5xx_6xx < 3, sum_total_of_charge_count == 0) %>%
  sample_n(100)

# tidy

model_eng %>%
  tidy() %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  mutate(language = "English") %>%
  select(language, everything())
# model all

model3_all <- lm(sum_total_of_charge_count ~ language +
                   age + total_5xx_6xx, data = df_lr1)
summary(model3_all)
## 
## Call:
## lm(formula = sum_total_of_charge_count ~ language + age + total_5xx_6xx, 
##     data = df_lr1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -4.1   -2.0   -0.6    0.2 5920.5 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.35409    0.05144  26.324  < 2e-16 ***
## languageFrench     -2.02724    0.06358 -31.883  < 2e-16 ***
## languageGerman     -2.19077    0.05056 -43.331  < 2e-16 ***
## languageIndonesian -2.11340    0.07931 -26.646  < 2e-16 ***
## languageRussian    -2.32434    0.07020 -33.110  < 2e-16 ***
## languageSpanish    -2.04511    0.05830 -35.077  < 2e-16 ***
## languageThai       -2.14351    0.10349 -20.711  < 2e-16 ***
## languageVietnamese -2.16272    0.08598 -25.154  < 2e-16 ***
## age                 0.07839    0.00293  26.751  < 2e-16 ***
## total_5xx_6xx       0.02385    0.00327   7.292 3.05e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.69 on 1105320 degrees of freedom
## Multiple R-squared:  0.005117,   Adjusted R-squared:  0.005109 
## F-statistic: 631.7 on 9 and 1105320 DF,  p-value: < 2.2e-16
model3_all %>% tidy() %>%
  filter(term != "(Intercept)") %>%
  select(term, estimate) %>%
  arrange(desc(abs(estimate)))
# nested models 

# TO DO: Need to keep instance_hrid


models1 <- df_lr1 %>%
    select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
  nest(-language) %>%
  mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
           select(-data)

models2 <- df_lr1 %>%
    select(instance_hrid, language, age, lcclass, total_5xx_6xx, has_circulated, sum_total_of_charge_count) 
models2 %>%
  sample_n(10000) %>%
  group_by(language,lcclass, has_circulated) %>%
  summarize(n = n(),
            sum_5_6 = sum(total_5xx_6xx)) %>%
  filter(language == "English") %>%
  pivot_wider(names_from = "has_circulated", values_from = "n") %>%
  rename(circ = `1`, nocirc = `0`) %>%
  mutate(across(nocirc:circ, ~ ifelse(is.na(.x), 0, .x))) %>%
  mutate(total_titles = circ + nocirc) %>%
  arrange(desc(total_titles)) %>%
  select(c(language:sum_5_6, total_titles, nocirc:circ)) 
models2 %>%
  sample_n(10000) %>%
  group_by(age, language, total_5xx_6xx, has_circulated, ) %>%
  summarize(n = n()) %>%
  arrange(age) %>%
  filter(language == "Spanish") %>%
  ggplot(aes(x = total_5xx_6xx)) +
  geom_density() +
  facet_wrap(~age)

    # nest(-language) %>%
  # mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
  #          select(-data)
models1  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate))
models1  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate)) %>%
  pivot_wider(names_from = "term", values_from = "estimate") %>%
  ggplot(aes(x = age, y = total_5xx_6xx )) +
  geom_point(alpha = 0.3, color = "red") +
  geom_text(aes(label = language), size = 3) +
  scale_x_continuous(breaks = seq(0,15, by = .01), limits = c(0, 0.13)) +
  scale_y_continuous(breaks = seq(-0.01,0.04, by = .01), limits = c(-0.01, 0.04)) +
  theme(panel.grid.minor = element_blank()) +
  labs(title = "Model coefficients for CUL print monograph circulation, by language, age and level of description ",
       y = "effect of total number of 5xx and 6xx fields",
       x = "effect of age",
       subtitle = "Combined visualization of separate multiple regression model for each language")

ggsave("output/coefficients_by_age_and_total5xx-6xx_in_record.png", height = 6, width = 10, units = "in")
models1  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  pivot_wider(names_from = "term", values_from = "estimate")
models1  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  pivot_wider(names_from = "term", values_from = "estimate")

augment models1

df_models1_augment <- models1  %>%
  mutate(augment = map(model, augment)) %>%
  unnest(augment) %>%
  select(-model) %>%
  sample_n(100) %>%
  mutate(across(.fitted:.std.resid, round, 3))


df_models1_augment %>%
  arrange(.fitted)

litform

df_withlitform %>%
  count(litform_cat, litform)
df_withlitform %>%
  filter(language != "English") %>%
  ggplot(aes(litform_cat, sum_total_of_charge_count)) +
  geom_point(alpha = 0.1) +
  facet_wrap(~language)

df_withlitform %>%
  group_by(language, litform_cat) %>%
  summarise(mean_subjects = mean(total_5xx_6xx)) %>%
  ggplot(aes(x = language, y = mean_subjects)) +
  geom_col(aes(fill = litform_cat), position = "dodge") +
  labs(y = "average number of 5xx-6xx / record") +
  theme(legend.position = "top",
        legend.title = element_blank())

ggsave("output/total5xx-6xx_by_litform.png", height = 6, width = 10, units = "in")
df_withlitform %>%
  group_by(age, language, litform_cat) %>%
  summarise(n = n(),
            mean_subjects = mean(total_5xx_6xx)) %>%
  pivot_wider(names_from = "litform_cat", values_from = "n") %>%
  mutate(total = fiction + nonfiction) %>%
  mutate(percent_fiction = fiction / total)
df_withlitform %>%
  group_by(age, language, litform_cat) %>%
  summarise(n = n(),
            mean_subjects = mean(total_5xx_6xx)) %>%
  ggplot(aes(x = age, y = mean_subjects)) +
  geom_line(aes(color = litform_cat), size = 2) +
  facet_wrap(~language) +
    labs(y = "average count of 5xx-6xx per record") +
  theme(legend.position = "top",
        legend.title = element_blank())

models_litform <- df_withlitform %>%
    select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count, litform_cat) %>%
  nest(-language) %>%
  mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx + litform_cat, data = .))) %>%
           select(-data)
models_litform  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  arrange(term, desc(estimate)) %>% 
  pivot_wider(names_from = "term", values_from = "estimate")

now split into fiction and nonfiction models

df_withlitform %>%
  group_by(language, litform_cat) %>%
  summarise(n = n()) %>%
  pivot_wider(names_from = "litform_cat", values_from = "n") %>%
  mutate(total = fiction + nonfiction,
         perc_fiction = fiction / total)
df_withlitform %>%
  group_by(language, age, litform_cat) %>%
  summarise(n = n(),
            perc_circ = mean(has_circulated) ) %>%
  ggplot(aes(x = age, y = perc_circ, color = litform_cat)) +
  geom_line() +
  facet_wrap(~language) +
  theme(legend.position = "top",
        legend.title = element_blank()) +
  labs(title = "Percent circulating by litform, age, and language") +
  scale_y_continuous(labels = percent_format())

df_withlitform %>%
  #filter(language != "English") %>%
  group_by(language,litform_cat) %>%
  summarise(n = n(),
            perc_circ = mean(has_circulated) ) %>%
  mutate(perc_circ = round(perc_circ, 3) * 100) %>%
  ggplot(aes(x = litform_cat, y = n, fill = litform_cat)) +
  geom_col() +
  geom_text(aes(label = paste0(perc_circ, "%")  ), size = 5) +
  facet_wrap(~language) +
  theme(legend.position = "top",
        legend.title = element_blank()) +
  labs(x = NULL,
       y = "number of titles") +
  scale_y_continuous(labels = comma_format()) +
  labs(title = "Number of titles by litform and percent circulating, by language")

df_withlitform_fiction <- df_withlitform %>%
  filter(litform_cat == "fiction")

df_withlitform_nonfiction <- df_withlitform %>%
  filter(litform_cat == "nonfiction")


get_models <- function(df) {
  models <- df %>%
    select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
  nest(-language) %>%
  mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
           select(-data)
}

models_fiction <- get_models(df_withlitform_fiction)
models_nonfiction <- get_models(df_withlitform_nonfiction)
df_withlitform %>%
  count(litform, sort = TRUE)
models_fiction  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate)) %>%
  pivot_wider(names_from = "term", values_from = "estimate") %>%
  ggplot(aes(x = age, y = total_5xx_6xx)) +
  geom_point(alpha = 0.3, color = "red") +
  geom_label(aes(label = language), size = 3) +
  scale_x_continuous(breaks = seq(0,0.4, by = .05), limits = c(0, 0.4)) +
  scale_y_continuous(breaks = seq(-0.05,0.3, by = 0.05), limits = c(-0.05, 0.3)) +
  theme(panel.grid.minor = element_blank()) +
  labs(title = "Fiction CUL print monograph circulation model coefficients, by total_5xx_6xx and age",
       y = "effect of total number of 5xx and 6xx fields",
       x = "effect of age",
       subtitle = "Combined visualization of separate multiple regression model for each language",
       caption = "Linear model: lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .)")

models_nonfiction  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate)) %>%
  pivot_wider(names_from = "term", values_from = "estimate") %>%
  ggplot(aes(x = age, y = total_5xx_6xx )) +
  geom_label(aes(label = language), size = 4) +
  scale_x_continuous(breaks = seq(0,15, by = .01), limits = c(0, 0.12)) +
  scale_y_continuous(breaks = seq(0,0.04, by = .01), limits = c(0, 0.03)) +
  theme(panel.grid.minor = element_blank()) +
  labs(title = "Nonfiction CUL print monograph circulation model coefficients, by total_5xx_6xx and age",
       y = "effect of total number of 5xx and 6xx fields",
       x = "effect of age",
       subtitle = "Combined visualization of separate multiple regression model for each language",
       caption = "Linear model: lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .)")

Look at the outliers

df_withlitform_fiction %>%
  group_by(language) %>%
  summarize(max_charges = max(sum_total_of_charge_count))
df_withlitform_fiction %>%
  group_by(language) %>%
  summarise(quants = list(quantile(sum_total_of_charge_count, probs = c(.01, .1, .25, .5, .75, .90,.99, .995, .999)))) %>%
  unnest_wider(quants)
df_withlitform_nonfiction %>%
  group_by(language) %>%
  summarise(quants = list(quantile(sum_total_of_charge_count, probs = c(.01, .1, .25, .5, .75, .90,.99, .995, .999)))) %>%
  unnest_wider(quants)
df_withlitform_fiction %>%
  group_by(language, total_5xx_6xx) %>%
  summarise(n = n(),
            perc_circ = mean(has_circulated) )
df_withlitform_nonfiction %>%
  group_by(language, total_5xx_6xx) %>%
  summarise(n = n(),
            perc_circ = mean(has_circulated) )
df_withlitform_nonfiction %>%
  filter(language == "Thai",
         total_5xx_6xx == 0)
df_withlitform %>%
  group_by(language, lcclass, litform) %>%
  mutate(litform = str_to_lower(litform)) %>%
  summarise(n = n()) %>%
  arrange(litform) %>%
  pivot_wider(names_from = "litform", values_from = "n") %>%
  mutate(across(`|`:`z`, ~ ifelse(is.na(.x), 0, .x))) %>%
  write_csv("output/lang_lcclass_litform.csv")
models_fiction  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate)) %>%
  mutate(stat_significant = ifelse(p.value < 0.05, "yes", "no")) %>%
  ggplot(aes(x = language, y = estimate, fill = stat_significant)) +
    geom_col() +
  facet_wrap(~term, ncol = 1)

Amount of model variance that is explained.

Fiction model explains more of the variance than the nonfiction model

Fiction

models_fiction  %>%
  mutate(gl = map(model, glance)) %>%
  unnest(gl) %>%
  mutate(across(r.squared:p.value, round, 3))

Nonfiction

models_nonfiction  %>%
  mutate(gl = map(model, glance)) %>%
  unnest(gl) %>%
  mutate(across(r.squared:p.value, round, 3))
aug_fiction <- models_fiction  %>%
  mutate(aug = map(model, augment)) %>%
  unnest(aug) %>%
  select(.fitted:.resid ) %>%
  mutate(across(.fitted:.resid, round, 2)) %>%
  bind_cols(df_withlitform_fiction)

aug_fiction
aug_nonfiction <- models_nonfiction  %>%
  mutate(aug = map(model, augment)) %>%
  unnest(aug) %>%
  select(.fitted:.resid ) %>%
  mutate(across(.fitted:.resid, round, 2)) %>%
  bind_cols(df_withlitform_nonfiction)

aug_nonfiction %>%
  head()

next step: check with hand calculations

models_nonfiction  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value)

example nonfiction

# Intercept + (0.102 * age) + (0.011 * total_5xx_6xx)
# 10000001
1.170 + (0.102 * 2) + (0.011 * 9)
## [1] 1.473
aug_nonfiction %>%
  head(1) %>%
  glimpse()
## Rows: 1
## Columns: 13
## $ .fitted                   <dbl> 1.47
## $ .resid                    <dbl> -1.47
## $ instance_hrid             <chr> "10000001"
## $ language                  <chr> "English"
## $ age                       <dbl> 2
## $ encoding_level            <chr> "full"
## $ enc_cat                   <chr> "full"
## $ lcclass                   <chr> "HF"
## $ has_circulated            <dbl> 0
## $ sum_total_of_charge_count <dbl> 0
## $ total_5xx_6xx             <dbl> 9
## $ litform                   <fct> 0
## $ litform_cat               <fct> nonfiction

example fiction

models_fiction  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  arrange(term, estimate)
# Intercept + (0.102 * age) + (0.011 * total_5xx_6xx)
# 10000002
-1.986 + (0.310 * 2) + (0.271 * 6)
## [1] 0.26
aug_fiction %>%
  head(1) %>%
  glimpse()
## Rows: 1
## Columns: 13
## $ .fitted                   <dbl> 0.26
## $ .resid                    <dbl> -0.26
## $ instance_hrid             <chr> "10000002"
## $ language                  <chr> "English"
## $ age                       <dbl> 2
## $ encoding_level            <chr> "full"
## $ enc_cat                   <chr> "full"
## $ lcclass                   <chr> "PS"
## $ has_circulated            <dbl> 0
## $ sum_total_of_charge_count <dbl> 0
## $ total_5xx_6xx             <dbl> 6
## $ litform                   <fct> p
## $ litform_cat               <fct> fiction
df_withlitform %>%
  filter(sum_total_of_charge_count > 0) %>%
  group_by(language, litform_cat) %>%
  summarise(n = n(),
            mean_circ = mean(sum_total_of_charge_count) ) %>%
  select(-n) %>%
  pivot_wider(names_from = "litform_cat", values_from = "mean_circ") %>%
  mutate(diff = fiction - nonfiction) %>%
  ggplot(aes(x = fiction, y = nonfiction)) +
  geom_smooth(se = FALSE, method = "lm") +
  geom_point() +
  geom_text(aes(label = language), size = 3)

df_withlitform %>%
  filter(sum_total_of_charge_count > 0) %>%
  count(sum_total_of_charge_count) %>%
  ggplot(aes(x = sum_total_of_charge_count, y = n )) +
  geom_line()

df_withlitform %>%
  filter(sum_total_of_charge_count > 0) %>%
  select(sum_total_of_charge_count) %>%
  mutate(logcharges = log(sum_total_of_charge_count)) %>%
  arrange(sum_total_of_charge_count) %>%
  ggplot(aes(x = logcharges )) +
  geom_density(adjust = 3)

Different approach: Onlky analyze circulating items

df_circulated <- df_withlitform %>%
  filter(sum_total_of_charge_count > 0) 

df_circulated %>%
  group_by(language, litform_cat) %>%
  summarize(n = n(),
            mean_circ = mean(sum_total_of_charge_count))
get_models2 <- function(df) {
  models <- df %>%
    select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count, litform_cat) %>%
  nest(-language) %>%
  mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx + litform_cat, data = .))) %>%
           select(-data)
}

models2 <- get_models2(df_circulated)

tidy_models2 <- models2  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error)
tidy_models2 %>%
  arrange(term, estimate)
df_circulated_fiction <- df_circulated %>%
  filter(litform_cat == "fiction")

df_circulated_nonfiction <- df_circulated %>%
  filter(litform_cat == "nonfiction")

models_circulated_fiction <- get_models(df_circulated_fiction)
models_circulated_nonfiction <- get_models(df_circulated_nonfiction)

tidy_models2_fiction <- models_circulated_fiction  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error)

tidy_models2_nonfiction <- models_circulated_nonfiction  %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, round, 3)) %>%
  select(-model, -statistic, -std.error)

fiction results

tidy_models2_fiction %>%
  arrange(term, estimate)

nonfiction results

tidy_models2_nonfiction %>%
  arrange(term, estimate)
df_circulated_nonfiction %>%
  #filter(language == "English") %>%
  filter(sum_total_of_charge_count < 100) %>%
  ggplot(aes(x = total_5xx_6xx, y = sum_total_of_charge_count)) +
  geom_point(alpha = 0.1) +
  labs(title = "Circulating nonfiction, by total circulations and completeness",
        y = "total circulations",
       x = "total 5xx and 6xx in record") +
  facet_wrap(~language)

df_circulated_fiction %>%
  #filter(language == "English") %>%
  filter(sum_total_of_charge_count < 100) %>%
  ggplot(aes(x = total_5xx_6xx, y = sum_total_of_charge_count)) +
  geom_point(alpha = 0.1) +
  labs(title = "Circulating fiction, by total circulations and completeness",
        y = "total circulations",
       x = "total 5xx and 6xx in record") +
  facet_wrap(~language)