library(tidyverse)
library(lubridate)
library(openxlsx)
library(readxl)
library(janitor)
library(scales)
library(skimr)
library(moderndive)
library(infer)
library(tidylo)
library(RColorBrewer)
library(DT)

theme_set(theme_minimal())

languagelookup <- read_csv("documentation/languagelookup.csv") %>%
  rename(lang_code = code)

lang_code_regions_coded <- read_excel("documentation/lang_code_regions_coded.xlsx", 
    sheet = "500+") %>%
  select(-n, -language)

df <- readRDS("data/cul_acq_circ_oclc_2001-2018.rds") %>%
  filter(!is.na(encoding_level)) %>%
  left_join(languagelookup, by = "lang_code") %>%
  left_join(lang_code_regions_coded, by = "lang_code") %>%
  mutate(encoding_level = str_to_lower(encoding_level)) %>%
  mutate(enc_cat = ifelse(encoding_level %in% c("4", "8", "full", "fulloclc"), "full", "notfull" )) %>%
  mutate(publisher = str_to_lower(publisher)) %>%
  select(bib_id, oclc_id_norm, language, region, age, publisher, call_no_char1, enc_cat, encoding_level, has_circulated, sum_total_of_charge_count)

both_df <- df %>%
  group_by(language, age) %>%
  summarize(total = n(),
            num_circ = sum(has_circulated),
            prop_circ = num_circ / total) %>%
  arrange(desc(total))


full_df <- df %>%
  filter(enc_cat == "full") %>%
  group_by(language, age) %>%
  summarize(full_total = n(),
            full_num_circ = sum(has_circulated),
            full_prop_circ = full_num_circ / full_total) %>%
  arrange(desc(full_total))

notfull_df <- df %>%
  filter(enc_cat == "notfull") %>%
  group_by(language, age) %>%
  summarize(notfull_total = n(),
            notfull_num_circ = sum(has_circulated),
            notfull_prop_circ = notfull_num_circ / notfull_total) %>%
  arrange(desc(notfull_total))

cutoff <- 100

df_all <- full_df %>%
  full_join(notfull_df, by = c("language", "age")) %>%
  filter(full_total > cutoff,
         notfull_total > cutoff) %>%
  arrange(language, age) %>%
  mutate(prop_diff = full_prop_circ - notfull_prop_circ) %>%
  mutate(rpd = (full_prop_circ - notfull_prop_circ) / ((full_prop_circ + notfull_prop_circ) / 2) * 100) %>%
  left_join(both_df, by = c("language", "age"))

language_filter <- df_all %>%
  count(language, name = 'total_years', sort = TRUE) %>%
  filter(total_years > 15) %>%
  select(language)

df_all_filtered <- df_all %>%
  filter(language %in% language_filter$language)

Literature review and research question(s)

Do titles with full MARC cataloging circulate more than those with lesser level cataloging?

Reasoning behind why we put encoding levels into the full or nonfull categories for this study.

Dataset

df %>%
  select(-c(call_no_char1, publisher, region, sum_total_of_charge_count)  ) %>%
  glimpse()

## Rows: 1,487,782
## Columns: 7
## $ bib_id         <chr> "10000001", "10000002", "10000003", "10000004", "100000~
## $ oclc_id_norm   <chr> "978802516", "967764776", "983797318", "970609985", "97~
## $ language       <chr> "English", "English", "English", "English", "English", ~
## $ age            <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
## $ enc_cat        <chr> "full", "full", "full", "notfull", "full", "full", "ful~
## $ encoding_level <chr> "full", "full", "full", "m", "full", "full", "full", "f~
## $ has_circulated <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1~

df %>%
  group_by(enc_cat, encoding_level, has_circulated) %>%
  mutate(has_circulated = ifelse(has_circulated == 0, "nocirc", "circ")) %>%
  summarize(n = n()) %>%
  pivot_wider(names_from = has_circulated, values_from = n) %>%
  mutate(total = circ + nocirc) %>%
  arrange(enc_cat, desc(total)) %>%
  ggplot(aes(x = fct_reorder(encoding_level,desc(total)), y = total, fill = enc_cat)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = comma_format(), breaks = seq(0, 700000, by = 50000)) +
  labs(title = "all records",
       x = "encoding level")

# clean up
remove(df)

df_all %>%
  count(language, name = 'total_years', sort = TRUE) %>%
  filter(total_years > 15)

df_all_filtered %>%
  filter(language == "English") %>%
  select(age, full_total, notfull_total) %>%
  pivot_longer(cols = c(full_total, notfull_total), names_to = "enc_cat", values_to = "n") %>%
  ggplot(aes(x = age, y = n, fill = enc_cat)) +
  geom_col(show.legend = TRUE) +
  scale_y_continuous(breaks = seq(0, 9e4, by = 5000)) +
  scale_x_discrete(limits = c(1,18, seq(1,18, by = 1))) +
  labs(title = "English")

df_all_filtered %>%
  filter(language != "English") %>%
  group_by(language) %>%
  select(age, full_total, notfull_total) %>%
  pivot_longer(cols = c(full_total, notfull_total), names_to = "enc_cat", values_to = "n") %>%
  ggplot(aes(x = age, y = n, fill = enc_cat)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(breaks = seq(0, 3e4, by = 2000)) +
  scale_x_discrete(limits = c(1,18, seq(1,18, by = 1))) +
  facet_wrap(~language) +
  labs(title = "Non-English")

discrete_groups <- (18 * 8 * 4) - (3 * 4)
caption_text = paste0(discrete_groups, " discrete groups across 18 years and 8 languages")


df_all_filtered %>%
  select(language, age, full_total, full_num_circ, notfull_total, notfull_num_circ) %>%
  mutate(full_num_nocirc = full_total - full_num_circ,
         notfull_num_nocirc = notfull_total - notfull_num_circ) %>%
  select(-ends_with("_total")) %>%
  pivot_longer(cols = c(full_num_circ:notfull_num_nocirc), names_to = "category", values_to = "n") %>%
  mutate(category = str_remove(category, "num_")) %>%
  ggplot(aes(x=age, y = n, fill = category)) +
  geom_col(position = "fill") +
  scale_x_continuous(breaks = seq(1,18, by = 1)) +
  #facet_wrap(~language, ncol = 1, strip.position="top") +
  facet_wrap(~language, strip.position="left", ncol = 1) +
  theme_minimal() +
  theme(axis.text.y=element_blank(),  #remove y axis labels
        axis.ticks.y=element_blank()  #remove y axis ticks
        ) +
  labs(y = NULL,
       caption = caption_text) +
  theme(legend.position = "top",
        panel.grid = element_blank(),
        plot.caption = element_text(face = "italic", hjust = 0, size = rel(1.2))) +
  scale_fill_brewer(palette = "Paired")

caption_text2 = paste0(nrow(df_all_filtered), " discrete groups across 18 years and 8 languages")
df_all_filtered %>%
  select(language, age, prop_diff) %>%
  mutate(posneg = ifelse(prop_diff < 0, "neg", "pos")) %>%
  ggplot(aes(x=age, y = prop_diff)) +
  geom_line(alpha = 0.1) +
  geom_text(aes(label = round(prop_diff,3) * 100 , color = posneg), vjust = 0.2, show.legend = FALSE, size = 5) +
  scale_x_continuous(breaks = seq(1,18, by = 1)) +
  facet_wrap(~language, strip.position="left", ncol = 1) +
  theme_minimal() +
  theme(axis.text.y=element_blank(),  #remove y axis labels
        axis.ticks.y=element_blank()  #remove y axis ticks
        ) +
  labs(y = NULL,
       caption = caption_text2) +
  theme(legend.position = "top",
        legend.title = element_blank(),
        axis.text = element_text(rel(1.3)),
        panel.grid = element_blank(),
        plot.caption = element_text(face = "italic", hjust = 0, size = rel(1.1))) +
  scale_fill_brewer(palette = "Paired") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))

# functions and function calls for get_bs

get_bs <- function(df, repetitions) {
  set.seed(1234)
  df %>%
  specify(response = prop_diff  ) %>%
  generate(reps = repetitions) %>%
  calculate(stat = "mean")
}

get_viz <- function(df_bs, language_name) {
  
  percentile_ci <- df_bs %>% 
    get_confidence_interval(level = 0.95, type = "percentile")
  
  lower_ci = round(percentile_ci$lower_ci,3) * 100
  upper_ci = round(percentile_ci$upper_ci,3) * 100
  ci <- paste0("[", lower_ci, ",", upper_ci, "]")
  mean_estimate <- round(mean(df_bs$stat), 3) * 100

  
  visualize(df_bs) +
    shade_confidence_interval(endpoints = percentile_ci, alpha = 0.1) +
  scale_x_continuous(labels = scales::percent) +
  geom_vline(xintercept = mean(df_bs$stat), linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Difference in circ percentages between full and nonfull", 
       title = paste0("Sampling distribution for ", language_name),
       caption = paste0(nrow(df_bs), " resamples"),
       subtitle = paste0("Mean estimate: ", mean_estimate, "%, 95% confidence interval ", ci )) 
}

Findings

English

df_all_filtered %>%
  filter(language == "English") %>%
  select(language, age, full_prop_circ, notfull_prop_circ) %>%
  pivot_longer(full_prop_circ:notfull_prop_circ) %>%
  ggplot(aes(x = as.factor(age), y = value, group = name)) +
  #stat_smooth(geom='line', alpha=0.5, se=FALSE, method = lm) +
  geom_line(aes(color = name), size = 1.5) +
  geom_point(size = 1) +
    scale_y_continuous(labels = scales::percent, limits = c(0,0.7), breaks = seq(0,0.7, by = 0.05)) +
  facet_wrap(~language) +
  theme(panel.grid.minor = element_blank(),
        legend.title = element_blank(),
        legend.position = "top") +
  labs(title = NULL,
y = "percent of titles that circulated",
x = "years in collection")

df_bs <- df_all_filtered %>%
  filter(language == "English") %>%
  get_bs(., 1000)

get_viz(df_bs, "English")

remove(df_bs)

Non-English

df_all_filtered %>%
  filter(language != "English") %>%
  select(language, age, full_prop_circ, notfull_prop_circ) %>%
  pivot_longer(full_prop_circ:notfull_prop_circ) %>%
  ggplot(aes(x = as.factor(age), y = value, group = name)) +
  #stat_smooth(geom='line', alpha=0.5, se=FALSE, method = lm) +
  geom_line(aes(color = name), size = 1.0) +
  geom_point(size = 0.5) +
    scale_y_continuous(labels = scales::percent, limits = c(0,0.4), breaks = seq(0,0.4, by = 0.05)) +
  facet_wrap(~language) +
  theme(panel.grid.minor = element_blank(),
        legend.title = element_blank(),
        legend.position = "top") +
  labs(title = NULL,
y = "percent of titles that circulated",
x = "years in collection") +
  facet_wrap(~language, ncol = 2)

French

df_bs <- df_all_filtered %>%
  filter(language == "French") %>%
  get_bs(., 1000)

get_viz(df_bs, "French")

remove(df_bs)

German

df_bs <- df_all_filtered %>%
  filter(language == "German") %>%
  get_bs(., 1000)

get_viz(df_bs, "German")

remove(df_bs)

Indonesian

df_bs <- df_all_filtered %>%
  filter(language == "Indonesian") %>%
  get_bs(., 1000)

get_viz(df_bs, "Indonesian")

remove(df_bs)

Russian

df_bs <- df_all_filtered %>%
  filter(language == "Russian") %>%
  get_bs(., 1000)

get_viz(df_bs, "Russian")

remove(df_bs)

Spanish

df_bs <- df_all_filtered %>%
  filter(language == "Spanish") %>%
  get_bs(., 1000)

get_viz(df_bs, "Spanish")

remove(df_bs)

Thai

df_bs <- df_all_filtered %>%
  filter(language == "Thai") %>%
  get_bs(., 1000)

get_viz(df_bs, "Thai")

remove(df_bs)

Vietnamese

df_bs <- df_all_filtered %>%
  filter(language == "Vietnamese") %>%
  get_bs(., 1000)

get_viz(df_bs, "Vietnamese")

remove(df_bs)

Do titles with full MARC cataloging circulate more than those with lesser level cataloging?

Adam Chandler, Laura Daniels, and Jason Kovari

12 January 2022 15:01

Literature review and research question(s)

Dataset

Findings

English

Non-English

French

German

Indonesian

Russian

Spanish

Thai

Vietnamese

Discussion