library(tidyverse)
library(broom)
library(skimr)
library(scales)
library(ggrepel)
library(kableExtra)
library(stringr)

# Load the prepared analysis dataset.
df_lr1 <- read_rds("data/df_lr1.rds")


# Import the literary form dataset (fiction vs non-fiction)

df_litform <- read_csv("data/litform.csv") %>%
  mutate(instance_hrid = as.character(instance_hrid),
         litform = as.factor(litform))

# Attach the literary form code to every record and derive a two-level
# category: code "0" is labelled nonfiction, every other code fiction.
# The join key is named explicitly so the join cannot silently pick up any
# other column the two tables happen to share. Records without a literary
# form code are dropped before the category is derived.
df_withlitform <- df_lr1 %>%
  left_join(df_litform, by = "instance_hrid") %>%
  filter(!is.na(litform)) %>%
  mutate(litform_cat = as.factor(ifelse(litform == "0", "nonfiction", "fiction")))

# model function

# Fit the baseline linear model: total charge count explained by record age
# and the number of 5xx/6xx fields.
#
# df: data frame with columns sum_total_of_charge_count, age, total_5xx_6xx.
# Returns the fitted `lm` object.
# (This function was previously defined twice with identical bodies; a single
# definition is kept.)
get_lm_model <- function(df) {
  lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = df)
}


# Combine each record's identifier with the model's observed value,
# predictors, fitted value and residual (both rounded to 3 decimals).
#
# df:    the data frame the model was fitted on; must contain instance_hrid.
# model: a fitted model accepted by broom::augment().
# Returns a tibble: instance_hrid followed by the augment() columns from
# sum_total_of_charge_count through .resid.
get_predictions <- function(df, model) {
  fitted_cols <- augment(model) %>%
    select(sum_total_of_charge_count:.resid) %>%
    mutate(across(.fitted:.resid, ~ round(.x, 3)))

  # The identifiers are attached purely by row position, so the two tables
  # must have the same number of rows (e.g. no rows dropped during fitting).
  if (nrow(fitted_cols) != nrow(df)) {
    stop(
      "`df` has ", nrow(df), " rows but the model was fitted on ",
      nrow(fitted_cols), "; identifiers cannot be aligned by position.",
      call. = FALSE
    )
  }

  bind_cols(select(df, instance_hrid), fitted_cols)
}

# Coefficient table for a fitted model: one row per term, with the numeric
# columns from estimate through p.value rounded to 3 decimals and the test
# statistic column removed.
#
# model: a fitted model accepted by broom::tidy().
# Returns a tibble.
get_tidymodel <- function(model) {
  model %>%
    tidy() %>%
    # Lambda form: passing extra arguments through across() is deprecated.
    mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
    select(-statistic)
}

# English-language subset restricted to the modelling columns.
data_eng <- df_lr1 %>%
  filter(language == "English") %>%
  select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count)

# Baseline model and per-record predictions for the English subset.
model_eng <- get_lm_model(data_eng)

predict_eng <- get_predictions(data_eng, model_eng)

# data overview for cdexec
glimpse(
  select(df_lr1, instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count)
)

## Rows: 1,105,330
## Columns: 5
## $ instance_hrid             <chr> "10000001", "10000002", "10000003", "1000000~
## $ language                  <chr> "English", "English", "English", "English", ~
## $ age                       <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ total_5xx_6xx             <dbl> 9, 6, 7, 9, 13, 18, 10, 11, 12, 9, 11, 17, 6~
## $ sum_total_of_charge_count <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0,~

# Number of records per language, listed alphabetically by language.
df_lr1 %>%
  count(language) %>%
  arrange(language)

# English: normal QQ plot of charge counts for a random sample of 100
# records that circulated at least once (the sample is not seeded, so the
# plot differs between runs).

data_eng %>%
  filter(sum_total_of_charge_count > 0) %>%
  sample_n(100) %>%
  ggplot(aes(sample = sum_total_of_charge_count  )) +
  geom_qq(alpha = 0.2) +
  geom_qq_line()

# example for cdexec slide

# Coefficient table of the English model.
get_tidymodel(model_eng)

# Stored prediction for a single record.
predict_eng %>%
  filter(instance_hrid == "4213747")

# Hand calculation in the form intercept + age * b_age + total_5xx_6xx * b_5xx_6xx.
# NOTE(review): the constants presumably come from the coefficient table and
# from record 4213747 printed above; confirm against that output.
0.892 + (17 * 0.114) + (32 * 0.032)

## [1] 3.854

# Thai-language subset restricted to the modelling columns.
data_tha <- df_lr1 %>%
  select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
  filter(language == "Thai")

# Same baseline model as for the English subset.
model_tha <- data_tha %>%
  get_lm_model()

# NOTE(review): `predict_that` looks like a typo for `predict_tha`; the name
# is left unchanged in case it is referenced elsewhere.
predict_that <- get_predictions(data_tha, model_tha)

# Coefficient table with numeric columns rounded to 3 decimals.
# Lambda form: passing extra arguments through across() is deprecated.
model_tha %>%
  tidy() %>%
  mutate(across(estimate:p.value, ~ round(.x, 3)))

# Summary statistics for the Thai subset.
data_tha %>%
  skim()

Data summary

Name	Piped data
Number of rows	24042
Number of columns	5
_______________________
Column type frequency:
character	2
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
instance_hrid	0	1	7	8	0	24040	0
language	0	1	4	4	0	1	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	1	10.16	4.79	1	7	11	14	18	▅▃▇▇▆
total_5xx_6xx	1	5.91	3.10	0	4	6	8	26	▇▇▁▁▁
sum_total_of_charge_count	1	0.15	0.59	0	0	0	0	36	▇▁▁▁▁

# old and rich records: age over 15, more than 20 5xx/6xx fields, and at
# least one charge; random sample of 100 English predictions

s1 <- predict_eng %>%
  filter(age > 15 & total_5xx_6xx > 20 & sum_total_of_charge_count > 0) %>%
  sample_n(size = 100)

# Sampled records whose residual lies within +/- 0.4.
filter(s1, between(.resid, -0.4, 0.4))

# new and poor records: age under 3, fewer than 3 5xx/6xx fields, and no
# charges; random sample of 100 English predictions

s2 <- predict_eng %>%
  filter(age < 3 & total_5xx_6xx < 3 & sum_total_of_charge_count == 0) %>%
  sample_n(size = 100)

# tidy: coefficient table of the English model, rounded to 3 decimals, with
# a leading language column

model_eng %>%
  tidy() %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  mutate(language = "English") %>%
  select(language, everything())

# model all
# Single pooled model over every language: language enters as a predictor
# alongside age and total_5xx_6xx.

model3_all <- lm(sum_total_of_charge_count ~ language +
                   age + total_5xx_6xx, data = df_lr1)

# Full regression summary (coefficients, residual spread, R-squared).
summary(model3_all)

## 
## Call:
## lm(formula = sum_total_of_charge_count ~ language + age + total_5xx_6xx, 
##     data = df_lr1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -4.1   -2.0   -0.6    0.2 5920.5 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.35409    0.05144  26.324  < 2e-16 ***
## languageFrench     -2.02724    0.06358 -31.883  < 2e-16 ***
## languageGerman     -2.19077    0.05056 -43.331  < 2e-16 ***
## languageIndonesian -2.11340    0.07931 -26.646  < 2e-16 ***
## languageRussian    -2.32434    0.07020 -33.110  < 2e-16 ***
## languageSpanish    -2.04511    0.05830 -35.077  < 2e-16 ***
## languageThai       -2.14351    0.10349 -20.711  < 2e-16 ***
## languageVietnamese -2.16272    0.08598 -25.154  < 2e-16 ***
## age                 0.07839    0.00293  26.751  < 2e-16 ***
## total_5xx_6xx       0.02385    0.00327   7.292 3.05e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.69 on 1105320 degrees of freedom
## Multiple R-squared:  0.005117,   Adjusted R-squared:  0.005109 
## F-statistic: 631.7 on 9 and 1105320 DF,  p-value: < 2.2e-16

# Non-intercept terms of the pooled model, largest absolute estimate first.
tidy(model3_all) %>%
  filter(term != "(Intercept)") %>%
  arrange(desc(abs(estimate))) %>%
  select(term, estimate)

# nested models 

# TO DO: Need to keep instance_hrid


# One baseline linear model per language. The per-language data is nested,
# a model is fitted on each nested frame, and only language + model are kept.
# `nest(data = -language)` replaces the deprecated unnamed form
# `nest(-language)`; the nested column is still called `data`.
models1 <- df_lr1 %>%
  select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
  nest(data = -language) %>%
  mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
  select(-data)

# Working columns for the class-level exploration below.
models2 <- df_lr1 %>%
  select(instance_hrid, language, age, lcclass, total_5xx_6xx, has_circulated, sum_total_of_charge_count)

# For a random sample of 10,000 records: English titles per LC class, split
# into circulated (has_circulated == 1) and not circulated (== 0), with the
# total number of 5xx/6xx fields per class.
#
# The counts are computed directly per (language, lcclass). Previously the
# table was grouped by has_circulated as well and then pivoted while
# sum_5_6 was still present, so the circulated and non-circulated counts of
# one class ended up on separate rows and were never added together.
models2 %>%
  sample_n(10000) %>%
  filter(language == "English") %>%
  group_by(language, lcclass) %>%
  summarize(sum_5_6 = sum(total_5xx_6xx),
            nocirc = sum(has_circulated == 0, na.rm = TRUE),
            circ = sum(has_circulated == 1, na.rm = TRUE),
            .groups = "drop") %>%
  mutate(total_titles = circ + nocirc) %>%
  arrange(desc(total_titles)) %>%
  select(language, lcclass, sum_5_6, total_titles, nocirc, circ)

# Spanish records in a random sample of 10,000: density of the distinct
# total_5xx_6xx values observed per age (one row per
# age / total_5xx_6xx / has_circulated combination), one panel per age.
models2 %>%
  sample_n(10000) %>%
  filter(language == "Spanish") %>%
  group_by(age, language, total_5xx_6xx, has_circulated) %>%
  summarize(n = n()) %>%
  arrange(age) %>%
  ggplot(mapping = aes(x = total_5xx_6xx)) +
  geom_density() +
  facet_wrap(vars(age))

    # nest(-language) %>%
  # mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
  #          select(-data)

# Age and total_5xx_6xx coefficients of every per-language model, rounded to
# 3 decimals and sorted within each term from largest to smallest estimate.
models1 %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate))

# Scatter of the per-language coefficients: age effect on the x axis,
# total_5xx_6xx effect on the y axis, one labelled point per language.
models1 %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate)) %>%
  pivot_wider(names_from = "term", values_from = "estimate") %>%
  ggplot(aes(x = age, y = total_5xx_6xx )) +
  geom_point(alpha = 0.3, color = "red") +
  geom_text(aes(label = language), size = 3) +
  # Breaks only need to span the axis limits (previously generated up to 15).
  scale_x_continuous(breaks = seq(0, 0.13, by = .01), limits = c(0, 0.13)) +
  scale_y_continuous(breaks = seq(-0.01,0.04, by = .01), limits = c(-0.01, 0.04)) +
  theme(panel.grid.minor = element_blank()) +
  labs(title = "Model coefficients for CUL print monograph circulation, by language, age and level of description ",
       y = "effect of total number of 5xx and 6xx fields",
       x = "effect of age",
       subtitle = "Combined visualization of separate multiple regression model for each language")

# Saves the most recently displayed plot.
ggsave("output/coefficients_by_age_and_total5xx-6xx_in_record.png", height = 6, width = 10, units = "in")

# Wide coefficient table: one row per language, one column per model term
# (intercept included), estimates rounded to 3 decimals.
# (This pipeline previously appeared twice in a row, verbatim; one copy is kept.)
models1 %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  pivot_wider(names_from = "term", values_from = "estimate")

augment models1

# Random sample of 100 augmented rows (observed values, fitted values and
# diagnostics) drawn across all per-language models; the columns from
# .fitted through .std.resid are rounded to 3 decimals.
df_models1_augment <- models1 %>%
  mutate(augment = map(model, augment)) %>%
  unnest(augment) %>%
  select(-model) %>%
  sample_n(100) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(.fitted:.std.resid, ~ round(.x, 3)))


# Sampled rows ordered from lowest to highest fitted value.
df_models1_augment %>%
  arrange(.fitted)

litform

# How the raw literary form codes map onto the fiction / nonfiction category.
count(df_withlitform, litform_cat, litform)

# Charge counts by category for every non-English language, one panel each.
df_withlitform %>%
  filter(language != "English") %>%
  ggplot(mapping = aes(x = litform_cat, y = sum_total_of_charge_count)) +
  geom_point(alpha = 0.1) +
  facet_wrap(vars(language))

# Mean number of 5xx/6xx fields per record, by language and category,
# drawn as side-by-side bars.
df_withlitform %>%
  group_by(language, litform_cat) %>%
  summarise(mean_subjects = mean(total_5xx_6xx)) %>%
  ggplot(aes(x = language, y = mean_subjects, fill = litform_cat)) +
  geom_col(position = "dodge") +
  ylab("average number of 5xx-6xx / record") +
  theme(legend.title = element_blank(),
        legend.position = "top")

# Saves the most recently displayed plot.
ggsave("output/total5xx-6xx_by_litform.png", width = 10, height = 6, units = "in")

# Share of fiction among titles for every age / language combination, with
# the mean number of 5xx/6xx fields per category alongside.
#
# Both summary columns are pivoted together (giving n_fiction,
# n_nonfiction, mean_subjects_fiction, mean_subjects_nonfiction).
# Previously only `n` was pivoted while `mean_subjects` stayed behind as an
# identifying column, which kept fiction and nonfiction on separate rows and
# made `fiction + nonfiction` NA everywhere. Missing counts are filled with 0
# so a combination lacking one category still gets a total.
df_withlitform %>%
  group_by(age, language, litform_cat) %>%
  summarise(n = n(),
            mean_subjects = mean(total_5xx_6xx),
            .groups = "drop") %>%
  pivot_wider(names_from = "litform_cat",
              values_from = c(n, mean_subjects),
              values_fill = list(n = 0L)) %>%
  mutate(total = n_fiction + n_nonfiction) %>%
  mutate(percent_fiction = n_fiction / total)

# Mean number of 5xx/6xx fields per record over age, one line per category
# and one panel per language.
df_withlitform %>%
  group_by(age, language, litform_cat) %>%
  summarise(n = n(),
            mean_subjects = mean(total_5xx_6xx)) %>%
  ggplot(aes(x = age, y = mean_subjects, color = litform_cat)) +
  geom_line(size = 2) +
  facet_wrap(vars(language)) +
  ylab("average count of 5xx-6xx per record") +
  theme(legend.title = element_blank(),
        legend.position = "top")

# One linear model per language with literary form category added as a
# predictor next to age and total_5xx_6xx.
# `nest(data = -language)` replaces the deprecated unnamed form.
models_litform <- df_withlitform %>%
  select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count, litform_cat) %>%
  nest(data = -language) %>%
  mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx + litform_cat, data = .))) %>%
  select(-data)

# Wide coefficient table: one row per language, one column per term.
models_litform %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  arrange(term, desc(estimate)) %>%
  pivot_wider(names_from = "term", values_from = "estimate")

now split into fiction and nonfiction models

# Titles per language split by category, with the fiction share.
df_withlitform %>%
  group_by(language, litform_cat) %>%
  summarise(n = n()) %>%
  pivot_wider(names_from = "litform_cat", values_from = "n") %>%
  mutate(total = fiction + nonfiction) %>%
  mutate(perc_fiction = fiction / total)

# Share of titles that have circulated, over age, one line per category and
# one panel per language.
df_withlitform %>%
  group_by(language, age, litform_cat) %>%
  summarise(n = n(),
            perc_circ = mean(has_circulated)) %>%
  ggplot(aes(x = age, y = perc_circ)) +
  geom_line(aes(color = litform_cat)) +
  facet_wrap(vars(language)) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "Percent circulating by litform, age, and language") +
  theme(legend.title = element_blank(),
        legend.position = "top")

# Number of titles per category (bar height) labelled with the percentage
# that has circulated, one panel per language.
df_withlitform %>%
  #filter(language != "English") %>%
  group_by(language, litform_cat) %>%
  summarise(n = n(),
            perc_circ = mean(has_circulated)) %>%
  mutate(perc_circ = round(perc_circ, 3) * 100) %>%
  ggplot(aes(x = litform_cat, y = n)) +
  geom_col(aes(fill = litform_cat)) +
  geom_text(aes(label = paste0(perc_circ, "%")), size = 5) +
  facet_wrap(vars(language)) +
  scale_y_continuous(labels = comma_format()) +
  labs(title = "Number of titles by litform and percent circulating, by language",
       x = NULL,
       y = "number of titles") +
  theme(legend.title = element_blank(),
        legend.position = "top")

# Separate data sets for the two literary form categories.
df_withlitform_fiction <- df_withlitform %>%
  filter(litform_cat == "fiction")

df_withlitform_nonfiction <- df_withlitform %>%
  filter(litform_cat == "nonfiction")


# Fit the baseline model (charge count ~ age + total_5xx_6xx) separately for
# every language in `df`.
#
# df: data frame with instance_hrid, language, age, total_5xx_6xx and
#     sum_total_of_charge_count.
# Returns a tibble with one row per language and a list column `model`
# holding the fitted `lm` objects.
#
# The pipeline is now the function's value directly (the former trailing
# assignment made the result invisible), and `nest(data = -language)`
# replaces the deprecated unnamed form.
get_models <- function(df) {
  df %>%
    select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
    nest(data = -language) %>%
    mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
    select(-data)
}

models_fiction <- get_models(df_withlitform_fiction)
models_nonfiction <- get_models(df_withlitform_nonfiction)

# Frequency of each raw literary form code, most common first.
df_withlitform %>%
  count(litform, sort = TRUE)

# Fiction: per-language coefficients, age effect on x and total_5xx_6xx
# effect on y, one labelled point per language.
models_fiction %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate)) %>%
  pivot_wider(names_from = "term", values_from = "estimate") %>%
  ggplot(aes(x = age, y = total_5xx_6xx)) +
  geom_point(alpha = 0.3, color = "red") +
  geom_label(aes(label = language), size = 3) +
  scale_x_continuous(breaks = seq(0,0.4, by = .05), limits = c(0, 0.4)) +
  scale_y_continuous(breaks = seq(-0.05,0.3, by = 0.05), limits = c(-0.05, 0.3)) +
  theme(panel.grid.minor = element_blank()) +
  labs(title = "Fiction CUL print monograph circulation model coefficients, by total_5xx_6xx and age",
       y = "effect of total number of 5xx and 6xx fields",
       x = "effect of age",
       subtitle = "Combined visualization of separate multiple regression model for each language",
       caption = "Linear model: lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .)")

# Nonfiction: same view for the nonfiction models.
models_nonfiction %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate)) %>%
  pivot_wider(names_from = "term", values_from = "estimate") %>%
  ggplot(aes(x = age, y = total_5xx_6xx )) +
  geom_label(aes(label = language), size = 4) +
  # Breaks only need to span the axis limits (previously generated up to 15).
  scale_x_continuous(breaks = seq(0, 0.12, by = .01), limits = c(0, 0.12)) +
  scale_y_continuous(breaks = seq(0,0.04, by = .01), limits = c(0, 0.03)) +
  theme(panel.grid.minor = element_blank()) +
  labs(title = "Nonfiction CUL print monograph circulation model coefficients, by total_5xx_6xx and age",
       y = "effect of total number of 5xx and 6xx fields",
       x = "effect of age",
       subtitle = "Combined visualization of separate multiple regression model for each language",
       caption = "Linear model: lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .)")

Look at the outliers

# Largest charge count per language among fiction titles.
df_withlitform_fiction %>%
  group_by(language) %>%
  summarise(max_charges = max(sum_total_of_charge_count))

# Charge-count quantiles per language, fiction.
df_withlitform_fiction %>%
  group_by(language) %>%
  summarise(
    quants = list(
      quantile(sum_total_of_charge_count,
               probs = c(.01, .1, .25, .5, .75, .90, .99, .995, .999))
    )
  ) %>%
  unnest_wider(quants)

# Charge-count quantiles per language, nonfiction.
df_withlitform_nonfiction %>%
  group_by(language) %>%
  summarise(
    quants = list(
      quantile(sum_total_of_charge_count,
               probs = c(.01, .1, .25, .5, .75, .90, .99, .995, .999))
    )
  ) %>%
  unnest_wider(quants)

# Titles and share circulated for every 5xx/6xx field count, fiction.
df_withlitform_fiction %>%
  group_by(language, total_5xx_6xx) %>%
  summarise(n = n(), perc_circ = mean(has_circulated))

# Titles and share circulated for every 5xx/6xx field count, nonfiction.
df_withlitform_nonfiction %>%
  group_by(language, total_5xx_6xx) %>%
  summarise(n = n(), perc_circ = mean(has_circulated))

# Thai nonfiction records that have no 5xx/6xx fields at all.
df_withlitform_nonfiction %>%
  filter(language == "Thai" & total_5xx_6xx == 0)

# Export the number of titles per language and LC class, with one column per
# (lower-cased) literary form code.
#
# The code is lower-cased BEFORE counting so that upper/lower-case variants
# fall into the same group (modifying a grouping column after group_by() is
# not allowed). Empty cells are zero-filled by pivot_wider() itself instead
# of through a `|`:`z` column range, which depended on which codes exist and
# on how they sort.
df_withlitform %>%
  mutate(litform = str_to_lower(litform)) %>%
  count(language, lcclass, litform) %>%
  arrange(litform) %>%
  pivot_wider(names_from = "litform", values_from = "n", values_fill = 0L) %>%
  write_csv("output/lang_lcclass_litform.csv")

# Fiction models: age and total_5xx_6xx estimates per language as bars,
# coloured by whether the (rounded) p-value is below 0.05; one panel per term.
models_fiction %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error) %>%
  filter(term %in% c("age", "total_5xx_6xx")) %>%
  arrange(term, desc(estimate)) %>%
  mutate(stat_significant = ifelse(p.value < 0.05, "yes", "no")) %>%
  ggplot(aes(x = language, y = estimate, fill = stat_significant)) +
  geom_col() +
  facet_wrap(~term, ncol = 1)

Amount of model variance that is explained.

Fiction model explains more of the variance than the nonfiction model

Fiction

# Model-level fit statistics per language (fiction); the columns from
# r.squared through p.value are rounded to 3 decimals.
models_fiction %>%
  mutate(gl = map(model, glance)) %>%
  unnest(gl) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(r.squared:p.value, ~ round(.x, 3)))

Nonfiction

# Model-level fit statistics per language (nonfiction); the columns from
# r.squared through p.value are rounded to 3 decimals.
models_nonfiction %>%
  mutate(gl = map(model, glance)) %>%
  unnest(gl) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(r.squared:p.value, ~ round(.x, 3)))

# Attach fitted values and residuals from per-language models to the records
# they belong to.
#
# models: tibble with columns `language` and `model` (one fitted lm per
#         language), as returned by get_models().
# df:     the records the models were fitted on; must contain `language`.
# Returns `df` in its original row order and with its original columns,
# preceded by .fitted and .resid (rounded to 2 decimals).
#
# Each model is augmented with the rows of its own language, so every fitted
# value stays on its record. Previously the stacked per-language fitted
# values were bound to `df` purely by position, which only lines up when
# `df` happens to be sorted in the same language order as the models.
augment_by_language <- function(models, df) {
  df %>%
    mutate(.row_id = row_number()) %>%
    nest(data = -language) %>%
    inner_join(select(models, language, model), by = "language") %>%
    mutate(aug = map2(model, data, ~ augment(.x, data = .y))) %>%
    select(language, aug) %>%
    unnest(aug) %>%
    # Restore the row order of `df`.
    arrange(.row_id) %>%
    mutate(across(c(.fitted, .resid), ~ round(.x, 2))) %>%
    select(.fitted, .resid, all_of(names(df)))
}

aug_fiction <- augment_by_language(models_fiction, df_withlitform_fiction)

aug_fiction

aug_nonfiction <- augment_by_language(models_nonfiction, df_withlitform_nonfiction)

aug_nonfiction %>%
  head()

next step: check with hand calculations

# Coefficients of every per-language nonfiction model, rounded to 3 decimals
# (used for the hand calculation that follows).
models_nonfiction %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error, -p.value)

example nonfiction

# Hand check of one fitted value:
# Intercept + (0.102 * age) + (0.011 * total_5xx_6xx)
# Record 10000001 (age 2, total_5xx_6xx 9).
# NOTE(review): the constants are presumably the English nonfiction
# coefficients from the table printed just before; confirm against it.
1.170 + (0.102 * 2) + (0.011 * 9)

## [1] 1.473

# First augmented nonfiction row, to compare its .fitted with the value above.
aug_nonfiction %>%
  head(1) %>%
  glimpse()

## Rows: 1
## Columns: 13
## $ .fitted                   <dbl> 1.47
## $ .resid                    <dbl> -1.47
## $ instance_hrid             <chr> "10000001"
## $ language                  <chr> "English"
## $ age                       <dbl> 2
## $ encoding_level            <chr> "full"
## $ enc_cat                   <chr> "full"
## $ lcclass                   <chr> "HF"
## $ has_circulated            <dbl> 0
## $ sum_total_of_charge_count <dbl> 0
## $ total_5xx_6xx             <dbl> 9
## $ litform                   <fct> 0
## $ litform_cat               <fct> nonfiction

example fiction

# Coefficients of every per-language fiction model, rounded to 3 decimals and
# sorted by term and estimate.
models_fiction %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error, -p.value) %>%
  arrange(term, estimate)

# Hand check of one fitted value:
# Intercept + (0.310 * age) + (0.271 * total_5xx_6xx)
# Record 10000002 (age 2, total_5xx_6xx 6).
# (The formula comment previously repeated the nonfiction coefficients.)
-1.986 + (0.310 * 2) + (0.271 * 6)

## [1] 0.26

# First augmented fiction row, to compare its .fitted with the value above.
aug_fiction %>%
  head(1) %>%
  glimpse()

## Rows: 1
## Columns: 13
## $ .fitted                   <dbl> 0.26
## $ .resid                    <dbl> -0.26
## $ instance_hrid             <chr> "10000002"
## $ language                  <chr> "English"
## $ age                       <dbl> 2
## $ encoding_level            <chr> "full"
## $ enc_cat                   <chr> "full"
## $ lcclass                   <chr> "PS"
## $ has_circulated            <dbl> 0
## $ sum_total_of_charge_count <dbl> 0
## $ total_5xx_6xx             <dbl> 6
## $ litform                   <fct> p
## $ litform_cat               <fct> fiction

# Mean charge count of circulating titles: fiction (x) against nonfiction (y),
# one labelled point per language, with a fitted straight line.
df_withlitform %>%
  filter(sum_total_of_charge_count > 0) %>%
  group_by(language, litform_cat) %>%
  summarise(mean_circ = mean(sum_total_of_charge_count)) %>%
  pivot_wider(names_from = "litform_cat", values_from = "mean_circ") %>%
  mutate(diff = fiction - nonfiction) %>%
  ggplot(aes(x = fiction, y = nonfiction)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point() +
  geom_text(aes(label = language), size = 3)

# Number of titles at every charge count (circulating titles only).
df_withlitform %>%
  filter(sum_total_of_charge_count > 0) %>%
  count(sum_total_of_charge_count) %>%
  ggplot(aes(x = sum_total_of_charge_count, y = n)) +
  geom_line()

# Density of the log charge count (circulating titles only).
df_withlitform %>%
  filter(sum_total_of_charge_count > 0) %>%
  transmute(sum_total_of_charge_count,
            logcharges = log(sum_total_of_charge_count)) %>%
  arrange(sum_total_of_charge_count) %>%
  ggplot(aes(x = logcharges)) +
  geom_density(adjust = 3)

Different approach: Only analyze circulating items

# Records with at least one charge.
df_circulated <- df_withlitform %>%
  filter(sum_total_of_charge_count > 0)

# Titles and mean charge count per language and category.
df_circulated %>%
  group_by(language, litform_cat) %>%
  summarize(n = n(),
            mean_circ = mean(sum_total_of_charge_count))

# Fit charge count ~ age + total_5xx_6xx + litform_cat separately for every
# language in `df`.
#
# df: data frame with instance_hrid, language, age, total_5xx_6xx,
#     sum_total_of_charge_count and litform_cat.
# Returns a tibble with one row per language and a list column `model`
# holding the fitted `lm` objects.
#
# The pipeline is now the function's value directly (the former trailing
# assignment made the result invisible), and `nest(data = -language)`
# replaces the deprecated unnamed form.
get_models2 <- function(df) {
  df %>%
    select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count, litform_cat) %>%
    nest(data = -language) %>%
    mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx + litform_cat, data = .))) %>%
    select(-data)
}

# NOTE: from here on `models2` holds these fitted models and no longer the
# column subset it was assigned earlier in the script.
models2 <- get_models2(df_circulated)

# Coefficients and p-values per language, rounded to 3 decimals.
tidy_models2 <- models2 %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  # Lambda form: passing extra arguments through across() is deprecated.
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error)

tidy_models2 %>%
  arrange(term, estimate)

# Circulating titles split by literary form category.
df_circulated_fiction <- df_circulated %>%
  filter(litform_cat == "fiction")

df_circulated_nonfiction <- df_circulated %>%
  filter(litform_cat == "nonfiction")

# Per-language baseline models for each category.
models_circulated_fiction <- get_models(df_circulated_fiction)
models_circulated_nonfiction <- get_models(df_circulated_nonfiction)

# Coefficients and p-values per language, rounded to 3 decimals.
# Lambda form: passing extra arguments through across() is deprecated.
tidy_models2_fiction <- models_circulated_fiction %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error)

tidy_models2_nonfiction <- models_circulated_nonfiction %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(across(estimate:p.value, ~ round(.x, 3))) %>%
  select(-model, -statistic, -std.error)

fiction results

# Fiction coefficients sorted by term, then by ascending estimate.
arrange(tidy_models2_fiction, term, estimate)

nonfiction results

# Nonfiction coefficients sorted by term, then by ascending estimate.
arrange(tidy_models2_nonfiction, term, estimate)

# Circulating nonfiction with fewer than 100 charges: charge count against
# number of 5xx/6xx fields, one panel per language.
df_circulated_nonfiction %>%
  #filter(language == "English") %>%
  filter(sum_total_of_charge_count < 100) %>%
  ggplot(aes(x = total_5xx_6xx, y = sum_total_of_charge_count)) +
  geom_point(alpha = 0.1) +
  facet_wrap(vars(language)) +
  labs(title = "Circulating nonfiction, by total circulations and completeness",
       x = "total 5xx and 6xx in record",
       y = "total circulations")

# Same view for circulating fiction.
df_circulated_fiction %>%
  #filter(language == "English") %>%
  filter(sum_total_of_charge_count < 100) %>%
  ggplot(aes(x = total_5xx_6xx, y = sum_total_of_charge_count)) +
  geom_point(alpha = 0.1) +
  facet_wrap(vars(language)) +
  labs(title = "Circulating fiction, by total circulations and completeness",
       x = "total 5xx and 6xx in record",
       y = "total circulations")

Do titles with full MARC cataloging circulate more than those with lesser level cataloging? Data Explorations

Adam Chandler

11 May 2022 12:07

augment models1

litform

now split into fiction and nonfiction models

Look at the outliers

Amount of model variance that is explained.

Fiction model explains more of the variance than the nonfiction model

Fiction

Nonfiction

next step: check with hand calculations

example nonfiction

example fiction

Different approach: Only analyze circulating items

fiction results

nonfiction results