library(tidyverse)
library(lubridate)
library(openxlsx)
library(readxl)
library(janitor)
library(scales)
library(skimr)
library(moderndive)
library(infer)
library(tidylo)
library(RColorBrewer)
library(DT)
theme_set(theme_minimal())
languagelookup <- read_csv("documentation/languagelookup.csv") %>%
rename(lang_code = code)
lang_code_regions_coded <- read_excel("documentation/lang_code_regions_coded.xlsx",
sheet = "500+") %>%
select(-n, -language)
df <- readRDS("data/cul_acq_circ_oclc_2001-2018.rds") %>%
filter(!is.na(encoding_level)) %>%
left_join(languagelookup, by = "lang_code") %>%
left_join(lang_code_regions_coded, by = "lang_code") %>%
mutate(encoding_level = str_to_lower(encoding_level)) %>%
mutate(enc_cat = ifelse(encoding_level %in% c("4", "8", "full", "fulloclc"), "full", "notfull" )) %>%
mutate(publisher = str_to_lower(publisher)) %>%
select(bib_id, oclc_id_norm, language, region, age, publisher, call_no_char1, enc_cat, encoding_level, has_circulated, sum_total_of_charge_count)
both_df <- df %>%
group_by(language, age) %>%
summarize(total = n(),
num_circ = sum(has_circulated),
prop_circ = num_circ / total) %>%
arrange(desc(total))
full_df <- df %>%
filter(enc_cat == "full") %>%
group_by(language, age) %>%
summarize(full_total = n(),
full_num_circ = sum(has_circulated),
full_prop_circ = full_num_circ / full_total) %>%
arrange(desc(full_total))
notfull_df <- df %>%
filter(enc_cat == "notfull") %>%
group_by(language, age) %>%
summarize(notfull_total = n(),
notfull_num_circ = sum(has_circulated),
notfull_prop_circ = notfull_num_circ / notfull_total) %>%
arrange(desc(notfull_total))
cutoff <- 100
df_all <- full_df %>%
full_join(notfull_df, by = c("language", "age")) %>%
filter(full_total > cutoff,
notfull_total > cutoff) %>%
arrange(language, age) %>%
mutate(prop_diff = full_prop_circ - notfull_prop_circ) %>%
mutate(rpd = (full_prop_circ - notfull_prop_circ) / ((full_prop_circ + notfull_prop_circ) / 2) * 100) %>%
left_join(both_df, by = c("language", "age"))
language_filter <- df_all %>%
count(language, name = 'total_years', sort = TRUE) %>%
filter(total_years > 15) %>%
select(language)
df_all_filtered <- df_all %>%
filter(language %in% language_filter$language)
Do titles with full MARC cataloging circulate more than those with lesser level cataloging?
Reasoning behind why we put encoding levels into the full or nonfull categories for this study.
df %>%
select(-c(call_no_char1, publisher, region, sum_total_of_charge_count) ) %>%
glimpse()
## Rows: 1,487,782
## Columns: 7
## $ bib_id <chr> "10000001", "10000002", "10000003", "10000004", "100000~
## $ oclc_id_norm <chr> "978802516", "967764776", "983797318", "970609985", "97~
## $ language <chr> "English", "English", "English", "English", "English", ~
## $ age <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
## $ enc_cat <chr> "full", "full", "full", "notfull", "full", "full", "ful~
## $ encoding_level <chr> "full", "full", "full", "m", "full", "full", "full", "f~
## $ has_circulated <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1~
df %>%
group_by(enc_cat, encoding_level, has_circulated) %>%
mutate(has_circulated = ifelse(has_circulated == 0, "nocirc", "circ")) %>%
summarize(n = n()) %>%
pivot_wider(names_from = has_circulated, values_from = n) %>%
mutate(total = circ + nocirc) %>%
arrange(enc_cat, desc(total)) %>%
ggplot(aes(x = fct_reorder(encoding_level,desc(total)), y = total, fill = enc_cat)) +
geom_col(show.legend = FALSE) +
scale_y_continuous(labels = comma_format(), breaks = seq(0, 700000, by = 50000)) +
labs(title = "all records",
x = "encoding level")
# clean up
remove(df)
df_all %>%
count(language, name = 'total_years', sort = TRUE) %>%
filter(total_years > 15)
df_all_filtered %>%
filter(language == "English") %>%
select(age, full_total, notfull_total) %>%
pivot_longer(cols = c(full_total, notfull_total), names_to = "enc_cat", values_to = "n") %>%
ggplot(aes(x = age, y = n, fill = enc_cat)) +
geom_col(show.legend = TRUE) +
scale_y_continuous(breaks = seq(0, 9e4, by = 5000)) +
scale_x_discrete(limits = c(1,18, seq(1,18, by = 1))) +
labs(title = "English")
df_all_filtered %>%
filter(language != "English") %>%
group_by(language) %>%
select(age, full_total, notfull_total) %>%
pivot_longer(cols = c(full_total, notfull_total), names_to = "enc_cat", values_to = "n") %>%
ggplot(aes(x = age, y = n, fill = enc_cat)) +
geom_col(show.legend = FALSE) +
scale_y_continuous(breaks = seq(0, 3e4, by = 2000)) +
scale_x_discrete(limits = c(1,18, seq(1,18, by = 1))) +
facet_wrap(~language) +
labs(title = "Non-English")
discrete_groups <- (18 * 8 * 4) - (3 * 4)
caption_text = paste0(discrete_groups, " discrete groups across 18 years and 8 languages")
df_all_filtered %>%
select(language, age, full_total, full_num_circ, notfull_total, notfull_num_circ) %>%
mutate(full_num_nocirc = full_total - full_num_circ,
notfull_num_nocirc = notfull_total - notfull_num_circ) %>%
select(-ends_with("_total")) %>%
pivot_longer(cols = c(full_num_circ:notfull_num_nocirc), names_to = "category", values_to = "n") %>%
mutate(category = str_remove(category, "num_")) %>%
ggplot(aes(x=age, y = n, fill = category)) +
geom_col(position = "fill") +
scale_x_continuous(breaks = seq(1,18, by = 1)) +
#facet_wrap(~language, ncol = 1, strip.position="top") +
facet_wrap(~language, strip.position="left", ncol = 1) +
theme_minimal() +
theme(axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(y = NULL,
caption = caption_text) +
theme(legend.position = "top",
panel.grid = element_blank(),
plot.caption = element_text(face = "italic", hjust = 0, size = rel(1.2))) +
scale_fill_brewer(palette = "Paired")
`
caption_text2 = paste0(nrow(df_all_filtered), " discrete groups across 18 years and 8 languages")
df_all_filtered %>%
select(language, age, prop_diff) %>%
mutate(posneg = ifelse(prop_diff < 0, "neg", "pos")) %>%
ggplot(aes(x=age, y = prop_diff)) +
geom_line(alpha = 0.1) +
geom_text(aes(label = round(prop_diff,3) * 100 , color = posneg), vjust = 0.2, show.legend = FALSE, size = 5) +
scale_x_continuous(breaks = seq(1,18, by = 1)) +
facet_wrap(~language, strip.position="left", ncol = 1) +
theme_minimal() +
theme(axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
) +
labs(y = NULL,
caption = caption_text2) +
theme(legend.position = "top",
legend.title = element_blank(),
axis.text = element_text(rel(1.3)),
panel.grid = element_blank(),
plot.caption = element_text(face = "italic", hjust = 0, size = rel(1.1))) +
scale_fill_brewer(palette = "Paired") +
scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
# functions and function calls for get_bs
get_bs <- function(df, repetitions) {
set.seed(1234)
df %>%
specify(response = prop_diff ) %>%
generate(reps = repetitions) %>%
calculate(stat = "mean")
}
get_viz <- function(df_bs, language_name) {
percentile_ci <- df_bs %>%
get_confidence_interval(level = 0.95, type = "percentile")
lower_ci = round(percentile_ci$lower_ci,3) * 100
upper_ci = round(percentile_ci$upper_ci,3) * 100
ci <- paste0("[", lower_ci, ",", upper_ci, "]")
mean_estimate <- round(mean(df_bs$stat), 3) * 100
visualize(df_bs) +
shade_confidence_interval(endpoints = percentile_ci, alpha = 0.1) +
scale_x_continuous(labels = scales::percent) +
geom_vline(xintercept = mean(df_bs$stat), linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Difference in circ percentages between full and nonfull",
title = paste0("Sampling distribution for ", language_name),
caption = paste0(nrow(df_bs), " resamples"),
subtitle = paste0("Mean estimate: ", mean_estimate, "%, 95% confidence interval ", ci ))
}
df_all_filtered %>%
filter(language == "English") %>%
select(language, age, full_prop_circ, notfull_prop_circ) %>%
pivot_longer(full_prop_circ:notfull_prop_circ) %>%
ggplot(aes(x = as.factor(age), y = value, group = name)) +
#stat_smooth(geom='line', alpha=0.5, se=FALSE, method = lm) +
geom_line(aes(color = name), size = 1.5) +
geom_point(size = 1) +
scale_y_continuous(labels = scales::percent, limits = c(0,0.7), breaks = seq(0,0.7, by = 0.05)) +
facet_wrap(~language) +
theme(panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.position = "top") +
labs(title = NULL,
y = "percent of titles that circulated",
x = "years in collection")
df_bs <- df_all_filtered %>%
filter(language == "English") %>%
get_bs(., 1000)
get_viz(df_bs, "English")
remove(df_bs)
df_all_filtered %>%
filter(language != "English") %>%
select(language, age, full_prop_circ, notfull_prop_circ) %>%
pivot_longer(full_prop_circ:notfull_prop_circ) %>%
ggplot(aes(x = as.factor(age), y = value, group = name)) +
#stat_smooth(geom='line', alpha=0.5, se=FALSE, method = lm) +
geom_line(aes(color = name), size = 1.0) +
geom_point(size = 0.5) +
scale_y_continuous(labels = scales::percent, limits = c(0,0.4), breaks = seq(0,0.4, by = 0.05)) +
facet_wrap(~language) +
theme(panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.position = "top") +
labs(title = NULL,
y = "percent of titles that circulated",
x = "years in collection") +
facet_wrap(~language, ncol = 2)
df_bs <- df_all_filtered %>%
filter(language == "French") %>%
get_bs(., 1000)
get_viz(df_bs, "French")
remove(df_bs)
df_bs <- df_all_filtered %>%
filter(language == "German") %>%
get_bs(., 1000)
get_viz(df_bs, "German")
remove(df_bs)
df_bs <- df_all_filtered %>%
filter(language == "Indonesian") %>%
get_bs(., 1000)
get_viz(df_bs, "Indonesian")
remove(df_bs)
df_bs <- df_all_filtered %>%
filter(language == "Russian") %>%
get_bs(., 1000)
get_viz(df_bs, "Russian")
remove(df_bs)
df_bs <- df_all_filtered %>%
filter(language == "Spanish") %>%
get_bs(., 1000)
get_viz(df_bs, "Spanish")
remove(df_bs)
df_bs <- df_all_filtered %>%
filter(language == "Thai") %>%
get_bs(., 1000)
get_viz(df_bs, "Thai")
remove(df_bs)
df_bs <- df_all_filtered %>%
filter(language == "Vietnamese") %>%
get_bs(., 1000)
get_viz(df_bs, "Vietnamese")
remove(df_bs)