library(tidyverse)
library(broom)
library(skimr)
library(scales)
library(ggrepel)
library(kableExtra)
library(stringr)
df_lr1 <- read_rds("data/df_lr1.rds")
# Import the literary form dataset (fiction vs non-fiction)
df_litform <- read_csv("data/litform.csv") %>%
mutate(instance_hrid = as.character(instance_hrid),
litform = as.factor(litform))
df_withlitform <- df_lr1 %>%
left_join(df_litform) %>%
mutate(litform_cat = ifelse(litform == "0", "nonfiction", "fiction")) %>%
mutate(litform_cat = as.factor(litform_cat)) %>%
filter(!is.na(litform))
# model function
get_lm_model <- function(df) {
model <- lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = df)
return(model)
}
get_lm_model <- function(df) {
model <- lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = df)
return(model)
}
get_predictions <- function(df, model) {
df_augment <- augment(model) %>%
select(sum_total_of_charge_count:.resid) %>%
mutate(across(.fitted:.resid, round, 3))
id_col <- df %>%
select(instance_hrid)
predictions <- bind_cols(id_col, df_augment)
return(predictions)
}
get_tidymodel <- function(model) {
tidymodel <- model %>%
tidy() %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-statistic)
return(tidymodel)
}
data_eng <- df_lr1 %>%
select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
filter(language == "English")
model_eng <- data_eng %>%
get_lm_model()
predict_eng <- get_predictions(data_eng, model_eng)
# data overview for cdexec
df_lr1 %>%
select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
glimpse()
## Rows: 1,105,330
## Columns: 5
## $ instance_hrid <chr> "10000001", "10000002", "10000003", "1000000~
## $ language <chr> "English", "English", "English", "English", ~
## $ age <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ total_5xx_6xx <dbl> 9, 6, 7, 9, 13, 18, 10, 11, 12, 9, 11, 17, 6~
## $ sum_total_of_charge_count <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0,~
df_lr1 %>%
count(language) %>%
arrange(language)
# eng lang histogram of charges
data_eng %>%
filter(sum_total_of_charge_count > 0) %>%
sample_n(100) %>%
ggplot(aes(sample = sum_total_of_charge_count )) +
geom_qq(alpha = 0.2) +
geom_qq_line()

# example for cdexec slide
get_tidymodel(model_eng)
predict_eng %>%
filter(instance_hrid == "4213747")
0.892 + (17 * 0.114) + (32 * 0.032)
## [1] 3.854
data_tha <- df_lr1 %>%
select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
filter(language == "Thai")
model_tha <- data_tha %>%
get_lm_model()
predict_that <- get_predictions(data_tha, model_tha)
model_tha %>%
tidy() %>%
mutate(across(estimate:p.value, round, 3))
data_tha %>%
skim()
Data summary
|
|
|
|
Name
|
Piped data
|
|
Number of rows
|
24042
|
|
Number of columns
|
5
|
|
_______________________
|
|
|
Column type frequency:
|
|
|
character
|
2
|
|
numeric
|
3
|
|
________________________
|
|
|
Group variables
|
None
|
Variable type: character
|
skim_variable
|
n_missing
|
complete_rate
|
min
|
max
|
empty
|
n_unique
|
whitespace
|
|
instance_hrid
|
0
|
1
|
7
|
8
|
0
|
24040
|
0
|
|
language
|
0
|
1
|
4
|
4
|
0
|
1
|
0
|
Variable type: numeric
|
skim_variable
|
n_missing
|
complete_rate
|
mean
|
sd
|
p0
|
p25
|
p50
|
p75
|
p100
|
hist
|
|
age
|
0
|
1
|
10.16
|
4.79
|
1
|
7
|
11
|
14
|
18
|
<U+2585><U+2583><U+2587><U+2587><U+2586>
|
|
total_5xx_6xx
|
0
|
1
|
5.91
|
3.10
|
0
|
4
|
6
|
8
|
26
|
<U+2587><U+2587><U+2581><U+2581><U+2581>
|
|
sum_total_of_charge_count
|
0
|
1
|
0.15
|
0.59
|
0
|
0
|
0
|
0
|
36
|
<U+2587><U+2581><U+2581><U+2581><U+2581>
|
# old and rich records
s1 <- predict_eng %>%
filter(age > 15, total_5xx_6xx > 20, sum_total_of_charge_count > 0) %>%
sample_n(100)
s1 %>%
filter(between(.resid, -0.4, 0.4))
# new and poor record
s2 <- predict_eng %>%
filter(age < 3, total_5xx_6xx < 3, sum_total_of_charge_count == 0) %>%
sample_n(100)
# tidy
model_eng %>%
tidy() %>%
mutate(across(estimate:p.value, round, 3)) %>%
mutate(language = "English") %>%
select(language, everything())
# model all
model3_all <- lm(sum_total_of_charge_count ~ language +
age + total_5xx_6xx, data = df_lr1)
summary(model3_all)
##
## Call:
## lm(formula = sum_total_of_charge_count ~ language + age + total_5xx_6xx,
## data = df_lr1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1 -2.0 -0.6 0.2 5920.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.35409 0.05144 26.324 < 2e-16 ***
## languageFrench -2.02724 0.06358 -31.883 < 2e-16 ***
## languageGerman -2.19077 0.05056 -43.331 < 2e-16 ***
## languageIndonesian -2.11340 0.07931 -26.646 < 2e-16 ***
## languageRussian -2.32434 0.07020 -33.110 < 2e-16 ***
## languageSpanish -2.04511 0.05830 -35.077 < 2e-16 ***
## languageThai -2.14351 0.10349 -20.711 < 2e-16 ***
## languageVietnamese -2.16272 0.08598 -25.154 < 2e-16 ***
## age 0.07839 0.00293 26.751 < 2e-16 ***
## total_5xx_6xx 0.02385 0.00327 7.292 3.05e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.69 on 1105320 degrees of freedom
## Multiple R-squared: 0.005117, Adjusted R-squared: 0.005109
## F-statistic: 631.7 on 9 and 1105320 DF, p-value: < 2.2e-16
model3_all %>% tidy() %>%
filter(term != "(Intercept)") %>%
select(term, estimate) %>%
arrange(desc(abs(estimate)))
# nested models
# TO DO: Need to keep instance_hrid
models1 <- df_lr1 %>%
select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
nest(-language) %>%
mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
select(-data)
models2 <- df_lr1 %>%
select(instance_hrid, language, age, lcclass, total_5xx_6xx, has_circulated, sum_total_of_charge_count)
models2 %>%
sample_n(10000) %>%
group_by(language,lcclass, has_circulated) %>%
summarize(n = n(),
sum_5_6 = sum(total_5xx_6xx)) %>%
filter(language == "English") %>%
pivot_wider(names_from = "has_circulated", values_from = "n") %>%
rename(circ = `1`, nocirc = `0`) %>%
mutate(across(nocirc:circ, ~ ifelse(is.na(.x), 0, .x))) %>%
mutate(total_titles = circ + nocirc) %>%
arrange(desc(total_titles)) %>%
select(c(language:sum_5_6, total_titles, nocirc:circ))
models2 %>%
sample_n(10000) %>%
group_by(age, language, total_5xx_6xx, has_circulated, ) %>%
summarize(n = n()) %>%
arrange(age) %>%
filter(language == "Spanish") %>%
ggplot(aes(x = total_5xx_6xx)) +
geom_density() +
facet_wrap(~age)

# nest(-language) %>%
# mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
# select(-data)
models1 %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value) %>%
filter(term %in% c("age", "total_5xx_6xx")) %>%
arrange(term, desc(estimate))
models1 %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value) %>%
filter(term %in% c("age", "total_5xx_6xx")) %>%
arrange(term, desc(estimate)) %>%
pivot_wider(names_from = "term", values_from = "estimate") %>%
ggplot(aes(x = age, y = total_5xx_6xx )) +
geom_point(alpha = 0.3, color = "red") +
geom_text(aes(label = language), size = 3) +
scale_x_continuous(breaks = seq(0,15, by = .01), limits = c(0, 0.13)) +
scale_y_continuous(breaks = seq(-0.01,0.04, by = .01), limits = c(-0.01, 0.04)) +
theme(panel.grid.minor = element_blank()) +
labs(title = "Model coefficients for CUL print monograph circulation, by language, age and level of description ",
y = "effect of total number of 5xx and 6xx fields",
x = "effect of age",
subtitle = "Combined visualization of separate multiple regression model for each language")

ggsave("output/coefficients_by_age_and_total5xx-6xx_in_record.png", height = 6, width = 10, units = "in")
models1 %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value) %>%
pivot_wider(names_from = "term", values_from = "estimate")
models1 %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value) %>%
pivot_wider(names_from = "term", values_from = "estimate")
augment models1
df_models1_augment <- models1 %>%
mutate(augment = map(model, augment)) %>%
unnest(augment) %>%
select(-model) %>%
sample_n(100) %>%
mutate(across(.fitted:.std.resid, round, 3))
df_models1_augment %>%
arrange(.fitted)
litform
df_withlitform %>%
count(litform_cat, litform)
df_withlitform %>%
filter(language != "English") %>%
ggplot(aes(litform_cat, sum_total_of_charge_count)) +
geom_point(alpha = 0.1) +
facet_wrap(~language)

df_withlitform %>%
group_by(language, litform_cat) %>%
summarise(mean_subjects = mean(total_5xx_6xx)) %>%
ggplot(aes(x = language, y = mean_subjects)) +
geom_col(aes(fill = litform_cat), position = "dodge") +
labs(y = "average number of 5xx-6xx / record") +
theme(legend.position = "top",
legend.title = element_blank())

ggsave("output/total5xx-6xx_by_litform.png", height = 6, width = 10, units = "in")
df_withlitform %>%
group_by(age, language, litform_cat) %>%
summarise(n = n(),
mean_subjects = mean(total_5xx_6xx)) %>%
pivot_wider(names_from = "litform_cat", values_from = "n") %>%
mutate(total = fiction + nonfiction) %>%
mutate(percent_fiction = fiction / total)
df_withlitform %>%
group_by(age, language, litform_cat) %>%
summarise(n = n(),
mean_subjects = mean(total_5xx_6xx)) %>%
ggplot(aes(x = age, y = mean_subjects)) +
geom_line(aes(color = litform_cat), size = 2) +
facet_wrap(~language) +
labs(y = "average count of 5xx-6xx per record") +
theme(legend.position = "top",
legend.title = element_blank())

models_litform <- df_withlitform %>%
select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count, litform_cat) %>%
nest(-language) %>%
mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx + litform_cat, data = .))) %>%
select(-data)
models_litform %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value) %>%
arrange(term, desc(estimate)) %>%
pivot_wider(names_from = "term", values_from = "estimate")
now split into fiction and nonfiction models
df_withlitform %>%
group_by(language, litform_cat) %>%
summarise(n = n()) %>%
pivot_wider(names_from = "litform_cat", values_from = "n") %>%
mutate(total = fiction + nonfiction,
perc_fiction = fiction / total)
df_withlitform %>%
group_by(language, age, litform_cat) %>%
summarise(n = n(),
perc_circ = mean(has_circulated) ) %>%
ggplot(aes(x = age, y = perc_circ, color = litform_cat)) +
geom_line() +
facet_wrap(~language) +
theme(legend.position = "top",
legend.title = element_blank()) +
labs(title = "Percent circulating by litform, age, and language") +
scale_y_continuous(labels = percent_format())

df_withlitform %>%
#filter(language != "English") %>%
group_by(language,litform_cat) %>%
summarise(n = n(),
perc_circ = mean(has_circulated) ) %>%
mutate(perc_circ = round(perc_circ, 3) * 100) %>%
ggplot(aes(x = litform_cat, y = n, fill = litform_cat)) +
geom_col() +
geom_text(aes(label = paste0(perc_circ, "%") ), size = 5) +
facet_wrap(~language) +
theme(legend.position = "top",
legend.title = element_blank()) +
labs(x = NULL,
y = "number of titles") +
scale_y_continuous(labels = comma_format()) +
labs(title = "Number of titles by litform and percent circulating, by language")

df_withlitform_fiction <- df_withlitform %>%
filter(litform_cat == "fiction")
df_withlitform_nonfiction <- df_withlitform %>%
filter(litform_cat == "nonfiction")
get_models <- function(df) {
models <- df %>%
select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count) %>%
nest(-language) %>%
mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .))) %>%
select(-data)
}
models_fiction <- get_models(df_withlitform_fiction)
models_nonfiction <- get_models(df_withlitform_nonfiction)
df_withlitform %>%
count(litform, sort = TRUE)
models_fiction %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value) %>%
filter(term %in% c("age", "total_5xx_6xx")) %>%
arrange(term, desc(estimate)) %>%
pivot_wider(names_from = "term", values_from = "estimate") %>%
ggplot(aes(x = age, y = total_5xx_6xx)) +
geom_point(alpha = 0.3, color = "red") +
geom_label(aes(label = language), size = 3) +
scale_x_continuous(breaks = seq(0,0.4, by = .05), limits = c(0, 0.4)) +
scale_y_continuous(breaks = seq(-0.05,0.3, by = 0.05), limits = c(-0.05, 0.3)) +
theme(panel.grid.minor = element_blank()) +
labs(title = "Fiction CUL print monograph circulation model coefficients, by total_5xx_6xx and age",
y = "effect of total number of 5xx and 6xx fields",
x = "effect of age",
subtitle = "Combined visualization of separate multiple regression model for each language",
caption = "Linear model: lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .)")

models_nonfiction %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value) %>%
filter(term %in% c("age", "total_5xx_6xx")) %>%
arrange(term, desc(estimate)) %>%
pivot_wider(names_from = "term", values_from = "estimate") %>%
ggplot(aes(x = age, y = total_5xx_6xx )) +
geom_label(aes(label = language), size = 4) +
scale_x_continuous(breaks = seq(0,15, by = .01), limits = c(0, 0.12)) +
scale_y_continuous(breaks = seq(0,0.04, by = .01), limits = c(0, 0.03)) +
theme(panel.grid.minor = element_blank()) +
labs(title = "Nonfiction CUL print monograph circulation model coefficients, by total_5xx_6xx and age",
y = "effect of total number of 5xx and 6xx fields",
x = "effect of age",
subtitle = "Combined visualization of separate multiple regression model for each language",
caption = "Linear model: lm(sum_total_of_charge_count ~ age + total_5xx_6xx, data = .)")

Look at the outliers
df_withlitform_fiction %>%
group_by(language) %>%
summarize(max_charges = max(sum_total_of_charge_count))
df_withlitform_fiction %>%
group_by(language) %>%
summarise(quants = list(quantile(sum_total_of_charge_count, probs = c(.01, .1, .25, .5, .75, .90,.99, .995, .999)))) %>%
unnest_wider(quants)
df_withlitform_nonfiction %>%
group_by(language) %>%
summarise(quants = list(quantile(sum_total_of_charge_count, probs = c(.01, .1, .25, .5, .75, .90,.99, .995, .999)))) %>%
unnest_wider(quants)
df_withlitform_fiction %>%
group_by(language, total_5xx_6xx) %>%
summarise(n = n(),
perc_circ = mean(has_circulated) )
df_withlitform_nonfiction %>%
group_by(language, total_5xx_6xx) %>%
summarise(n = n(),
perc_circ = mean(has_circulated) )
df_withlitform_nonfiction %>%
filter(language == "Thai",
total_5xx_6xx == 0)
df_withlitform %>%
group_by(language, lcclass, litform) %>%
mutate(litform = str_to_lower(litform)) %>%
summarise(n = n()) %>%
arrange(litform) %>%
pivot_wider(names_from = "litform", values_from = "n") %>%
mutate(across(`|`:`z`, ~ ifelse(is.na(.x), 0, .x))) %>%
write_csv("output/lang_lcclass_litform.csv")
models_fiction %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error) %>%
filter(term %in% c("age", "total_5xx_6xx")) %>%
arrange(term, desc(estimate)) %>%
mutate(stat_significant = ifelse(p.value < 0.05, "yes", "no")) %>%
ggplot(aes(x = language, y = estimate, fill = stat_significant)) +
geom_col() +
facet_wrap(~term, ncol = 1)

Amount of model variance that is explained.
Fiction model explains more of the variance than the nonfiction model
Fiction
models_fiction %>%
mutate(gl = map(model, glance)) %>%
unnest(gl) %>%
mutate(across(r.squared:p.value, round, 3))
Nonfiction
models_nonfiction %>%
mutate(gl = map(model, glance)) %>%
unnest(gl) %>%
mutate(across(r.squared:p.value, round, 3))
aug_fiction <- models_fiction %>%
mutate(aug = map(model, augment)) %>%
unnest(aug) %>%
select(.fitted:.resid ) %>%
mutate(across(.fitted:.resid, round, 2)) %>%
bind_cols(df_withlitform_fiction)
aug_fiction
aug_nonfiction <- models_nonfiction %>%
mutate(aug = map(model, augment)) %>%
unnest(aug) %>%
select(.fitted:.resid ) %>%
mutate(across(.fitted:.resid, round, 2)) %>%
bind_cols(df_withlitform_nonfiction)
aug_nonfiction %>%
head()
next step: check with hand calculations
models_nonfiction %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value)
example nonfiction
# Intercept + (0.102 * age) + (0.011 * total_5xx_6xx)
# 10000001
1.170 + (0.102 * 2) + (0.011 * 9)
## [1] 1.473
aug_nonfiction %>%
head(1) %>%
glimpse()
## Rows: 1
## Columns: 13
## $ .fitted <dbl> 1.47
## $ .resid <dbl> -1.47
## $ instance_hrid <chr> "10000001"
## $ language <chr> "English"
## $ age <dbl> 2
## $ encoding_level <chr> "full"
## $ enc_cat <chr> "full"
## $ lcclass <chr> "HF"
## $ has_circulated <dbl> 0
## $ sum_total_of_charge_count <dbl> 0
## $ total_5xx_6xx <dbl> 9
## $ litform <fct> 0
## $ litform_cat <fct> nonfiction
example fiction
models_fiction %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error, -p.value) %>%
arrange(term, estimate)
# Intercept + (0.102 * age) + (0.011 * total_5xx_6xx)
# 10000002
-1.986 + (0.310 * 2) + (0.271 * 6)
## [1] 0.26
aug_fiction %>%
head(1) %>%
glimpse()
## Rows: 1
## Columns: 13
## $ .fitted <dbl> 0.26
## $ .resid <dbl> -0.26
## $ instance_hrid <chr> "10000002"
## $ language <chr> "English"
## $ age <dbl> 2
## $ encoding_level <chr> "full"
## $ enc_cat <chr> "full"
## $ lcclass <chr> "PS"
## $ has_circulated <dbl> 0
## $ sum_total_of_charge_count <dbl> 0
## $ total_5xx_6xx <dbl> 6
## $ litform <fct> p
## $ litform_cat <fct> fiction
df_withlitform %>%
filter(sum_total_of_charge_count > 0) %>%
group_by(language, litform_cat) %>%
summarise(n = n(),
mean_circ = mean(sum_total_of_charge_count) ) %>%
select(-n) %>%
pivot_wider(names_from = "litform_cat", values_from = "mean_circ") %>%
mutate(diff = fiction - nonfiction) %>%
ggplot(aes(x = fiction, y = nonfiction)) +
geom_smooth(se = FALSE, method = "lm") +
geom_point() +
geom_text(aes(label = language), size = 3)

df_withlitform %>%
filter(sum_total_of_charge_count > 0) %>%
count(sum_total_of_charge_count) %>%
ggplot(aes(x = sum_total_of_charge_count, y = n )) +
geom_line()

df_withlitform %>%
filter(sum_total_of_charge_count > 0) %>%
select(sum_total_of_charge_count) %>%
mutate(logcharges = log(sum_total_of_charge_count)) %>%
arrange(sum_total_of_charge_count) %>%
ggplot(aes(x = logcharges )) +
geom_density(adjust = 3)

Different approach: Onlky analyze circulating items
df_circulated <- df_withlitform %>%
filter(sum_total_of_charge_count > 0)
df_circulated %>%
group_by(language, litform_cat) %>%
summarize(n = n(),
mean_circ = mean(sum_total_of_charge_count))
get_models2 <- function(df) {
models <- df %>%
select(instance_hrid, language, age, total_5xx_6xx, sum_total_of_charge_count, litform_cat) %>%
nest(-language) %>%
mutate(model = map(data, ~lm(sum_total_of_charge_count ~ age + total_5xx_6xx + litform_cat, data = .))) %>%
select(-data)
}
models2 <- get_models2(df_circulated)
tidy_models2 <- models2 %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error)
tidy_models2 %>%
arrange(term, estimate)
df_circulated_fiction <- df_circulated %>%
filter(litform_cat == "fiction")
df_circulated_nonfiction <- df_circulated %>%
filter(litform_cat == "nonfiction")
models_circulated_fiction <- get_models(df_circulated_fiction)
models_circulated_nonfiction <- get_models(df_circulated_nonfiction)
tidy_models2_fiction <- models_circulated_fiction %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error)
tidy_models2_nonfiction <- models_circulated_nonfiction %>%
mutate(tidied = map(model, tidy)) %>%
unnest(tidied) %>%
mutate(across(estimate:p.value, round, 3)) %>%
select(-model, -statistic, -std.error)
fiction results
tidy_models2_fiction %>%
arrange(term, estimate)
nonfiction results
tidy_models2_nonfiction %>%
arrange(term, estimate)
df_circulated_nonfiction %>%
#filter(language == "English") %>%
filter(sum_total_of_charge_count < 100) %>%
ggplot(aes(x = total_5xx_6xx, y = sum_total_of_charge_count)) +
geom_point(alpha = 0.1) +
labs(title = "Circulating nonfiction, by total circulations and completeness",
y = "total circulations",
x = "total 5xx and 6xx in record") +
facet_wrap(~language)

df_circulated_fiction %>%
#filter(language == "English") %>%
filter(sum_total_of_charge_count < 100) %>%
ggplot(aes(x = total_5xx_6xx, y = sum_total_of_charge_count)) +
geom_point(alpha = 0.1) +
labs(title = "Circulating fiction, by total circulations and completeness",
y = "total circulations",
x = "total 5xx and 6xx in record") +
facet_wrap(~language)
