library(tidyverse)
library(lubridate)
library(openxlsx)
library(readxl)
library(janitor)
library(scales)
library(skimr)
library(moderndive)
library(infer)
library(tidylo)
library(RColorBrewer)
library(DT)

theme_set(theme_minimal())
languagelookup <- read_csv("documentation/languagelookup.csv") %>%
  rename(lang_code = code)

lang_code_regions_coded <- read_excel("documentation/lang_code_regions_coded.xlsx", 
    sheet = "500+") %>%
  select(-n, -language)

df <- readRDS("data/cul_acq_circ_oclc_2001-2018.rds") %>%
  filter(!is.na(encoding_level)) %>%
  left_join(languagelookup, by = "lang_code") %>%
  left_join(lang_code_regions_coded, by = "lang_code") %>%
  mutate(encoding_level = str_to_lower(encoding_level)) %>%
  mutate(enc_cat = ifelse(encoding_level %in% c("4", "8", "full", "fulloclc"), "full", "notfull" )) %>%
  mutate(publisher = str_to_lower(publisher)) %>%
  select(bib_id, oclc_id_norm, language, region, age, publisher, call_no_char1, enc_cat, encoding_level, has_circulated, sum_total_of_charge_count)
both_df <- df %>%
  group_by(language, age) %>%
  summarize(total = n(),
            num_circ = sum(has_circulated),
            prop_circ = num_circ / total) %>%
  arrange(desc(total))


full_df <- df %>%
  filter(enc_cat == "full") %>%
  group_by(language, age) %>%
  summarize(full_total = n(),
            full_num_circ = sum(has_circulated),
            full_prop_circ = full_num_circ / full_total) %>%
  arrange(desc(full_total))

notfull_df <- df %>%
  filter(enc_cat == "notfull") %>%
  group_by(language, age) %>%
  summarize(notfull_total = n(),
            notfull_num_circ = sum(has_circulated),
            notfull_prop_circ = notfull_num_circ / notfull_total) %>%
  arrange(desc(notfull_total))

cutoff <- 100

df_all <- full_df %>%
  full_join(notfull_df, by = c("language", "age")) %>%
  filter(full_total > cutoff,
         notfull_total > cutoff) %>%
  arrange(language, age) %>%
  mutate(prop_diff = full_prop_circ - notfull_prop_circ) %>%
  mutate(rpd = (full_prop_circ - notfull_prop_circ) / ((full_prop_circ + notfull_prop_circ) / 2) * 100) %>%
  left_join(both_df, by = c("language", "age"))

language_filter <- df_all %>%
  count(language, name = 'total_years', sort = TRUE) %>%
  filter(total_years > 15) %>%
  select(language)

df_all_filtered <- df_all %>%
  filter(language %in% language_filter$language)

Literature review and research question(s)

Do titles with full MARC cataloging circulate more than those with lesser level cataloging?

Reasoning behind why we put encoding levels into the full or nonfull categories for this study.

Dataset

df %>%
  select(-c(call_no_char1, publisher, region, sum_total_of_charge_count)  ) %>%
  glimpse()
## Rows: 1,487,782
## Columns: 7
## $ bib_id         <chr> "10000001", "10000002", "10000003", "10000004", "100000~
## $ oclc_id_norm   <chr> "978802516", "967764776", "983797318", "970609985", "97~
## $ language       <chr> "English", "English", "English", "English", "English", ~
## $ age            <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
## $ enc_cat        <chr> "full", "full", "full", "notfull", "full", "full", "ful~
## $ encoding_level <chr> "full", "full", "full", "m", "full", "full", "full", "f~
## $ has_circulated <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1~
df %>%
  group_by(enc_cat, encoding_level, has_circulated) %>%
  mutate(has_circulated = ifelse(has_circulated == 0, "nocirc", "circ")) %>%
  summarize(n = n()) %>%
  pivot_wider(names_from = has_circulated, values_from = n) %>%
  mutate(total = circ + nocirc) %>%
  arrange(enc_cat, desc(total)) %>%
  ggplot(aes(x = fct_reorder(encoding_level,desc(total)), y = total, fill = enc_cat)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = comma_format(), breaks = seq(0, 700000, by = 50000)) +
  labs(title = "all records",
       x = "encoding level")

# clean up
remove(df)
df_all %>%
  count(language, name = 'total_years', sort = TRUE) %>%
  filter(total_years > 15) 
df_all_filtered %>%
  filter(language == "English") %>%
  select(age, full_total, notfull_total) %>%
  pivot_longer(cols = c(full_total, notfull_total), names_to = "enc_cat", values_to = "n") %>%
  ggplot(aes(x = age, y = n, fill = enc_cat)) +
  geom_col(show.legend = TRUE) +
  scale_y_continuous(breaks = seq(0, 9e4, by = 5000)) +
  scale_x_discrete(limits = c(1,18, seq(1,18, by = 1))) +
  labs(title = "English")

df_all_filtered %>%
  filter(language != "English") %>%
  group_by(language) %>%
  select(age, full_total, notfull_total) %>%
  pivot_longer(cols = c(full_total, notfull_total), names_to = "enc_cat", values_to = "n") %>%
  ggplot(aes(x = age, y = n, fill = enc_cat)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(breaks = seq(0, 3e4, by = 2000)) +
  scale_x_discrete(limits = c(1,18, seq(1,18, by = 1))) +
  facet_wrap(~language) +
  labs(title = "Non-English")

discrete_groups <- (18 * 8 * 4) - (3 * 4)
caption_text = paste0(discrete_groups, " discrete groups across 18 years and 8 languages")


df_all_filtered %>%
  select(language, age, full_total, full_num_circ, notfull_total, notfull_num_circ) %>%
  mutate(full_num_nocirc = full_total - full_num_circ,
         notfull_num_nocirc = notfull_total - notfull_num_circ) %>%
  select(-ends_with("_total")) %>%
  pivot_longer(cols = c(full_num_circ:notfull_num_nocirc), names_to = "category", values_to = "n") %>%
  mutate(category = str_remove(category, "num_")) %>%
  ggplot(aes(x=age, y = n, fill = category)) +
  geom_col(position = "fill") +
  scale_x_continuous(breaks = seq(1,18, by = 1)) +
  #facet_wrap(~language, ncol = 1, strip.position="top") +
  facet_wrap(~language, strip.position="left", ncol = 1) +
  theme_minimal() +
  theme(axis.text.y=element_blank(),  #remove y axis labels
        axis.ticks.y=element_blank()  #remove y axis ticks
        ) +
  labs(y = NULL,
       caption = caption_text) +
  theme(legend.position = "top",
        panel.grid = element_blank(),
        plot.caption = element_text(face = "italic", hjust = 0, size = rel(1.2))) +
  scale_fill_brewer(palette = "Paired") 

`

caption_text2 = paste0(nrow(df_all_filtered), " discrete groups across 18 years and 8 languages")
df_all_filtered %>%
  select(language, age, prop_diff) %>%
  mutate(posneg = ifelse(prop_diff < 0, "neg", "pos")) %>%
  ggplot(aes(x=age, y = prop_diff)) +
  geom_line(alpha = 0.1) +
  geom_text(aes(label = round(prop_diff,3) * 100 , color = posneg), vjust = 0.2, show.legend = FALSE, size = 5) +
  scale_x_continuous(breaks = seq(1,18, by = 1)) +
  facet_wrap(~language, strip.position="left", ncol = 1) +
  theme_minimal() +
  theme(axis.text.y=element_blank(),  #remove y axis labels
        axis.ticks.y=element_blank()  #remove y axis ticks
        ) +
  labs(y = NULL,
       caption = caption_text2) +
  theme(legend.position = "top",
        legend.title = element_blank(),
        axis.text = element_text(rel(1.3)),
        panel.grid = element_blank(),
        plot.caption = element_text(face = "italic", hjust = 0, size = rel(1.1))) +
  scale_fill_brewer(palette = "Paired") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))

# functions and function calls for get_bs

get_bs <- function(df, repetitions) {
  set.seed(1234)
  df %>%
  specify(response = prop_diff  ) %>%
  generate(reps = repetitions) %>%
  calculate(stat = "mean")
}

get_viz <- function(df_bs, language_name) {
  
  percentile_ci <- df_bs %>% 
    get_confidence_interval(level = 0.95, type = "percentile")
  
  lower_ci = round(percentile_ci$lower_ci,3) * 100
  upper_ci = round(percentile_ci$upper_ci,3) * 100
  ci <- paste0("[", lower_ci, ",", upper_ci, "]")
  mean_estimate <- round(mean(df_bs$stat), 3) * 100

  
  visualize(df_bs) +
    shade_confidence_interval(endpoints = percentile_ci, alpha = 0.1) +
  scale_x_continuous(labels = scales::percent) +
  geom_vline(xintercept = mean(df_bs$stat), linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Difference in circ percentages between full and nonfull", 
       title = paste0("Sampling distribution for ", language_name),
       caption = paste0(nrow(df_bs), " resamples"),
       subtitle = paste0("Mean estimate: ", mean_estimate, "%, 95% confidence interval ", ci )) 
}

Findings

English

df_bs <- df_all_filtered %>%
  filter(language == "English") %>%
  get_bs(., 1000)

get_viz(df_bs, "English")

remove(df_bs)
# get_bs <- function(df) {
#   set.seed(1234)
#   df %>%
#     specify(response = prop_diff  ) %>%
#     generate(reps = 10) %>%
#     calculate(stat = "mean") %>%
#     summarize(mean = mean(stat))
# }
# 
# 
# df_all_filtered %>%
#   filter(language == "English") %>%
#     get_bs()
#   
# 
# 
# for (l in unique(df_all_filtered$language)) {
#   print(l)
#   df_all_filtered %>%
#     filter(language == l) %>%
#     get_bs
#   
# }

# df_bs_english <- df_all_filtered %>%
#   filter(language == "English") %>%
#   get_bs(., 1000, "prop_diff")
# 
# df_bs_french <- df_all_filtered %>%
#   filter(language == "French") %>%
#   get_bs(., 1000, "prop_diff")
# 
# df_bs_german <- df_all_filtered %>%
#   filter(language == "German") %>%
#   get_bs(., 1000, "prop_diff")
# 
# df_bs_indonesian <- df_all_filtered %>%
#   filter(language == "Indonesian") %>%
#   get_bs(., 1000, "prop_diff")
# 
# df_bs_russian <- df_all_filtered %>%
#   filter(language == "Russian") %>%
#   get_bs(., 1000, "prop_diff")
# 
# df_bs_spanish <- df_all_filtered %>%
#   filter(language == "Spanish") %>%
#   get_bs(., 1000, "prop_diff")
# 
# df_bs_thai <- df_all_filtered %>%
#   filter(language == "Thai") %>%
#   get_bs(., 1000, "prop_diff")
# 
# df_bs_vietnamese <- df_all_filtered %>%
#   filter(language == "Vietnamese") %>%
#   get_bs(., 1000, "prop_diff")

Discussion