Overview

# prepare data

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(stringr)
library(forcats)
library(readxl)
library(broom)


# load data
df_raw <- read_excel("data/after_samplecollection/complete_collected_dataset.xlsx")

prepare_df <- function(df_raw) {
  df <- df_raw %>%
    rename(present_or_not = `Present or not`)
  # collapse the four shelf-reading outcomes into a binary factor:
  # "1" = accounted for (present or checked out), "0" = missing
  df$present_or_not <- as.factor(df$present_or_not)
  df <- df %>%
    mutate(present_or_not = fct_recode(present_or_not,
                                       "1" = "Present",
                                       "1" = "CheckedOut",
                                       "0" = "NotOnShelf-Verified",
                                       "0" = "LMBO")
           )
  # order location levels by frequency (puts the largest unit, olin,
  # first, making it the baseline level in the models later on)
  df$location_code <- as.factor(df$location_code)
  df$location_code <- fct_infreq(df$location_code)
  # 0-1 indicator: has the item ever circulated?
  df$has_circulated <- ifelse(df$recorded_uses_item > 0, 1, 0)
  # drop columns not needed for the analysis
  df <- df %>%
    select(-Initials, -ITEM_STATUS_DESC)
  return(df)
}

df <- prepare_df(df_raw)

Exploratory Data Analysis

What is in the sample population dataset?

glimpse(df)
## Observations: 6,006
## Variables: 19
## $ present_or_not       <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ barcode              <chr> "31924062968908", "31924072130184", "3192...
## $ location_code        <fct> afr, afr, afr, afr, afr, afr, afr, afr, a...
## $ call_nbr_display     <chr> "DT32 .R61", "DT328.M53 .H3613x 1993", "D...
## $ call_nbr_norm_item   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ enumeration          <chr> NA, NA, NA, NA, NA, NA, NA, "c.2", NA, NA...
## $ title                <chr> "Death of Africa. By Peter Ritner.", "Vic...
## $ item_control_nbr     <chr> "3723592", "4103508", "7620171", "9494855...
## $ recorded_uses_item   <dbl> 1, 0, 2, 0, 0, 10, 2, 0, 56, 1, 2, 4, 3, ...
## $ bib_rec_nbr          <chr> "1968678", "2249095", "5689943", "8618953...
## $ worldcat_oclc_nbr    <chr> "412793", "59941146", "148569", "86982417...
## $ Catalog_URL          <chr> "https://newcatalog.library.cornell.edu/c...
## $ us_holdings          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ row_number           <dbl> 1585081, 735421, 2715241, 850681, 84661, ...
## $ Condition            <chr> "Acceptable", "Excellent", "Acceptable", ...
## $ `Barcode validation` <chr> "yes", "yes", "yes", "yes", "yes", "no", ...
## $ Timestamp            <dttm> 2018-07-06 13:56:22, 2018-07-06 13:56:22...
## $ Status               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ has_circulated       <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,...

Bootstrap simulation for the accounted-for proportion and its standard error

library(infer)

# number of bootstrap replicates
replimit <- 1000

p_hat <- df %>%
  summarise(stat = mean(present_or_not == "1")) %>%
  pull()
p_hat
## [1] 0.9642025
boot <- df %>%
  specify(response = present_or_not, success = "1") %>%
  generate(reps = replimit, type = "bootstrap") %>%
  calculate(stat = "prop")
boot
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.968
##  2         2 0.965
##  3         3 0.963
##  4         4 0.965
##  5         5 0.964
##  6         6 0.966
##  7         7 0.962
##  8         8 0.965
##  9         9 0.967
## 10        10 0.965
## # ... with 990 more rows
se <- boot %>%
  summarize(sd(stat)) %>%
  pull()
se
## [1] 0.002359031
ggplot(boot, aes(x = stat)) +
#  geom_density() +
  geom_histogram() +
  geom_vline(xintercept = p_hat, col = "blue") +
  scale_x_continuous(breaks = seq(0.955,0.972,by=.001))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  # geom_vline(xintercept = p_hat + se, col = "green") +
  # geom_vline(xintercept = p_hat - se, col = "green")

We are roughly 95% confident that the true proportion of accounted-for monographs across CUL is between 0.959 and 0.969 (about p_hat ± 2 standard errors).
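As a cross-check, the interval can also be read directly off the bootstrap distribution. A minimal sketch using base quantile(); the 95% percentile interval should land close to p_hat ± 2 SE:

boot %>%
  summarize(lower = quantile(stat, 0.025),
            upper = quantile(stat, 0.975))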

Are there differences across CUL units? Bootstrap each unit, then drop the units whose total sample size was less than 30.

df_groups <- df %>%
  group_by(location_code) %>%
  summarise(total = n())


df %>%
  filter(location_code == "mann") %>%
  specify(response = present_or_not, success = "1") %>%
  generate(reps = replimit, type = "bootstrap") %>%
  calculate(stat = "prop")
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.932
##  2         2 0.922
##  3         3 0.940
##  4         4 0.925
##  5         5 0.907
##  6         6 0.911
##  7         7 0.929
##  8         8 0.936
##  9         9 0.940
## 10        10 0.932
## # ... with 990 more rows
# bootstrap the accounted-for proportion for one subset of the data
get_boot <- function(df, repnum) {
  df %>%
    specify(response = present_or_not, success = "1") %>%
    generate(reps = repnum, type = "bootstrap") %>%
    calculate(stat = "prop")
}

df_boot <- data.frame()

# bootstrap each location separately and stack the results
for(j in 1:nrow(df_groups)) {
  df_subset <- df %>%
    filter(location_code == df_groups$location_code[j])
  df_ret <- get_boot(df_subset, replimit)
  df_ret$location_code <- df_groups$location_code[j]
  df_boot <- rbind(df_boot, df_ret)
}
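The explicit loop can also be written with purrr, which is already loaded via the tidyverse. A sketch assuming dplyr >= 0.8 for group_split(); it gives the same stacked result, up to row order:

df_boot <- df %>%
  group_split(location_code) %>%
  map_dfr(~ get_boot(.x, replimit) %>%
            mutate(location_code = .x$location_code[1]))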


df_group_se <- df_boot %>%
  group_by(location_code) %>%
  summarize(mean = mean(stat),
            se = sd(stat)) %>%
  inner_join(df_groups) %>%
  filter(!total < 30) %>%
  mutate(num_missing = total - round(total * mean)) %>%
  arrange(-mean)
## Joining, by = "location_code"
df_group_se
## # A tibble: 12 x 5
##    location_code  mean      se total num_missing
##    <fct>         <dbl>   <dbl> <int>       <dbl>
##  1 mus           0.990 0.00938   102           1
##  2 afr           0.980 0.0203     49           1
##  3 olin          0.974 0.00285  3221          84
##  4 was           0.966 0.00723   598          20
##  5 ech           0.960 0.00880   461          18
##  6 law           0.951 0.0104    452          22
##  7 math          0.948 0.0215    116           6
##  8 uris          0.948 0.0140    270          14
##  9 sasa          0.946 0.0151    223          12
## 10 mann          0.929 0.0154    281          20
## 11 jgsm          0.927 0.0415     41           3
## 12 ilr           0.911 0.0245    145          13
ggplot(df_group_se, aes(y = mean, x = fct_reorder(location_code, -mean), label = total)) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se, width = 0.1)) +
  geom_point(color = "red") +
  scale_y_continuous(limits = c(.88, 1.005), breaks = seq(.88, 1.0, by = .01)) +
  labs(title = "\nCUL monograph accounted results, by location") +
  xlab("location") +
  ylab("proportion accounted for") +
  geom_hline(yintercept = p_hat, col = "yellow")

#  coord_flip()

ggsave("output/cul_monograph_results_by_loc.png", dpi = 600, width = 8, height = 5)  

How does Cornell compare to the EAST consortium libraries?

# load EAST dataset

df_east <- read_excel("data/east_consortium/TattleTapeSurveyResultsToShare.xlsx", skip = 1)

# clean EAST dataset: keep library and validation_score

df_east <- df_east[,1:11]
df_east <- df_east %>%
  filter(!is.na(library)) %>%
  filter(!is.na(validation_score)) %>%
  select(library, validation_score) %>%
  arrange(desc(validation_score))

df_east
## # A tibble: 37 x 2
##    library  validation_score
##    <chr>               <dbl>
##  1 jackal              0.997
##  2 squirrel            0.995
##  3 giraffe             0.994
##  4 nyan cat            0.994
##  5 leopard             0.992
##  6 grizzly             0.99 
##  7 otter               0.99 
##  8 cheetah             0.99 
##  9 dolphin             0.989
## 10 liger               0.988
## # ... with 27 more rows
mean(df_east$validation_score)
## [1] 0.9726216
# run simulation on EAST dataset

p_hat_east <- df_east %>%
  summarise(stat = mean(validation_score)) %>%
  pull()
p_hat_east
## [1] 0.9726216
boot_east <- df_east %>%
  specify(response = validation_score) %>%
  generate(reps = replimit, type = "bootstrap") %>%
  calculate(stat = "mean")
boot_east
## # A tibble: 1,000 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.972
##  2         2 0.977
##  3         3 0.975
##  4         4 0.976
##  5         5 0.972
##  6         6 0.973
##  7         7 0.973
##  8         8 0.975
##  9         9 0.968
## 10        10 0.972
## # ... with 990 more rows
se_east <- boot_east %>%
  summarize(sd(stat)) %>%
  pull()
se_east
## [1] 0.002893712
ggplot(boot_east, aes(x = stat)) +
  geom_density() +
  geom_vline(xintercept = p_hat_east, col = "red") +
  geom_vline(xintercept = p_hat, col = "blue") +
  annotate("text", x = p_hat, y = 25, label = "Cornell", size = 3) +
  annotate("text", x = p_hat_east, y = 25, label = "EAST", size = 3)

Models

Logistic regression

# Intercept-only probability (model 1 below). Because "0" (missing) is
# the second level of present_or_not, glm() models the probability that
# an item is missing:
exp(-3.29342)                              # odds of being missing
## [1] 0.03712666
exp(-3.29342) / (1 + exp(-3.29342))        # probability of being missing
## [1] 0.03579761
1 - (exp(-3.29342) / (1 + exp(-3.29342)))  # proportion accounted for
## [1] 0.9642024
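The same back-transformation is available as base R's logistic CDF:

plogis(-3.29342)        # probability of being missing
1 - plogis(-3.29342)    # proportion accounted for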

print("model 1")
## [1] "model 1"
mod1 <- glm(present_or_not ~ 1, data=df, family=binomial)
summary(mod1)
## 
## Call:
## glm(formula = present_or_not ~ 1, family = binomial, data = df)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.270  -0.270  -0.270  -0.270   2.581  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.29342    0.06945  -47.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1854.1  on 6005  degrees of freedom
## Residual deviance: 1854.1  on 6005  degrees of freedom
## AIC: 1856.1
## 
## Number of Fisher Scoring iterations: 6
# model 2
print("model 2")
## [1] "model 2"
mod2 <- glm(present_or_not ~ has_circulated, data=df, family=binomial)
summary(mod2)
## 
## Call:
## glm(formula = present_or_not ~ has_circulated, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2777  -0.2777  -0.2625  -0.2625   2.6021  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.35092    0.09976 -33.589   <2e-16 ***
## has_circulated  0.11454    0.13898   0.824     0.41    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1854.1  on 6005  degrees of freedom
## Residual deviance: 1853.4  on 6004  degrees of freedom
## AIC: 1857.4
## 
## Number of Fisher Scoring iterations: 6
# model 3
print("model 3")
## [1] "model 3"
mod3 <- glm(present_or_not ~ location_code, data=df, family=binomial)
summary(mod3)
## 
## Call:
## glm(formula = present_or_not ~ location_code, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4334  -0.2822  -0.2299  -0.2299   3.0414  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.6202     0.1106 -32.744  < 2e-16 ***
## location_codewas     0.2564     0.2529   1.014  0.31071    
## location_codeech     0.4170     0.2646   1.576  0.11509    
## location_codelaw     0.6475     0.2450   2.643  0.00821 ** 
## location_codemann    1.0514     0.2570   4.091 4.30e-05 ***
## location_codeuris    0.7141     0.2959   2.413  0.01581 *  
## location_codesasa    0.7533     0.3167   2.378  0.01738 *  
## location_codeilr     1.3024     0.3110   4.188 2.82e-05 ***
## location_codemath    0.7115     0.4336   1.641  0.10080    
## location_codemus    -0.9949     1.0110  -0.984  0.32507    
## location_codeafr    -0.2510     1.0164  -0.247  0.80495    
## location_codejgsm    1.0812     0.6098   1.773  0.07622 .  
## location_codehote  -11.9459   297.0818  -0.040  0.96793    
## location_codevet     0.5757     1.0295   0.559  0.57603    
## location_codeasia  -11.9459  1455.3975  -0.008  0.99345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1854.1  on 6005  degrees of freedom
## Residual deviance: 1814.8  on 5991  degrees of freedom
## AIC: 1844.8
## 
## Number of Fisher Scoring iterations: 14
aug_mod3 <- augment(mod3, type.predict = "response") %>%
  # with type.predict = "response", .fitted is already the predicted
  # probability that an item is missing
  filter(location_code == "ilr" | location_code == "mus" | location_code == "mann") %>%
  sample_n(50) %>%
  select(location_code, present_or_not, .fitted) %>%
  arrange(.fitted)

aug_mod3
## # A tibble: 50 x 3
##    location_code present_or_not .fitted
##    <fct>         <fct>            <dbl>
##  1 mus           1              0.00980
##  2 mus           1              0.00980
##  3 mus           1              0.00980
##  4 mus           1              0.00980
##  5 mus           1              0.00980
##  6 mus           1              0.00980
##  7 mann          0              0.0712 
##  8 mann          0              0.0712 
##  9 mann          1              0.0712 
## 10 mann          0              0.0712 
## # ... with 40 more rows

More explanatory variables

What can we learn by adding an indicator for is_oversize?

glimpse(df)
## Observations: 6,006
## Variables: 19
## $ present_or_not       <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ barcode              <chr> "31924062968908", "31924072130184", "3192...
## $ location_code        <fct> afr, afr, afr, afr, afr, afr, afr, afr, a...
## $ call_nbr_display     <chr> "DT32 .R61", "DT328.M53 .H3613x 1993", "D...
## $ call_nbr_norm_item   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ enumeration          <chr> NA, NA, NA, NA, NA, NA, NA, "c.2", NA, NA...
## $ title                <chr> "Death of Africa. By Peter Ritner.", "Vic...
## $ item_control_nbr     <chr> "3723592", "4103508", "7620171", "9494855...
## $ recorded_uses_item   <dbl> 1, 0, 2, 0, 0, 10, 2, 0, 56, 1, 2, 4, 3, ...
## $ bib_rec_nbr          <chr> "1968678", "2249095", "5689943", "8618953...
## $ worldcat_oclc_nbr    <chr> "412793", "59941146", "148569", "86982417...
## $ Catalog_URL          <chr> "https://newcatalog.library.cornell.edu/c...
## $ us_holdings          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ row_number           <dbl> 1585081, 735421, 2715241, 850681, 84661, ...
## $ Condition            <chr> "Acceptable", "Excellent", "Acceptable", ...
## $ `Barcode validation` <chr> "yes", "yes", "yes", "yes", "yes", "no", ...
## $ Timestamp            <dttm> 2018-07-06 13:56:22, 2018-07-06 13:56:22...
## $ Status               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ has_circulated       <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,...
df %>%
  select(call_nbr_display) %>%
  top_n(5)
## Selecting by call_nbr_display
## # A tibble: 5 x 1
##   call_nbr_display                  
##   <chr>                             
## 1 maps G6690s 1000 .P6 Map          
## 2 oversize HA4587.W47 W49x 1999 +   
## 3 oversize HD8653.5 .M37 1997 +     
## 4 oversize DS793.A5 A644 1993 v.44 +
## 5 Z989.F36 Y83 2012
# add is oversize 0-1 variable

df %>%
  filter(str_detect(call_nbr_display, "\\+")  ) %>%
  select(call_nbr_display)
## # A tibble: 689 x 1
##    call_nbr_display                
##    <chr>                           
##  1 + HD5715.5.N5 R57               
##  2 + HD9017.S82 J78 1969           
##  3 Oversize BL2085 .G84 1995z +    
##  4 Oversize BQ972.A33 K94 2008 +   
##  5 Oversize BQ4190 .B957 2000 +    
##  6 Oversize BQ4570.A4 M83 2007 +   
##  7 Oversize BR1260 .L364 2006 +    
##  8 Oversize BX4705.S639 S55 2003 ++
##  9 Oversize CN1230.C3 C67 +        
## 10 Oversize DS525 .P739 2009 +     
## # ... with 679 more rows
df$is_oversize <- ifelse(str_detect(df$call_nbr_display, "\\+"), 1, 0)

df %>%
  filter(is_oversize == "1") %>%
  select(is_oversize, call_nbr_display)
## # A tibble: 689 x 2
##    is_oversize call_nbr_display                
##          <dbl> <chr>                           
##  1           1 + HD5715.5.N5 R57               
##  2           1 + HD9017.S82 J78 1969           
##  3           1 Oversize BL2085 .G84 1995z +    
##  4           1 Oversize BQ972.A33 K94 2008 +   
##  5           1 Oversize BQ4190 .B957 2000 +    
##  6           1 Oversize BQ4570.A4 M83 2007 +   
##  7           1 Oversize BR1260 .L364 2006 +    
##  8           1 Oversize BX4705.S639 S55 2003 ++
##  9           1 Oversize CN1230.C3 C67 +        
## 10           1 Oversize DS525 .P739 2009 +     
## # ... with 679 more rows
df %>%
  count(is_oversize)
## # A tibble: 2 x 2
##   is_oversize     n
##         <dbl> <int>
## 1           0  5317
## 2           1   689
df %>%
  filter(location_code == "olin") %>%
  group_by(is_oversize, present_or_not) %>%
  count()
## # A tibble: 4 x 3
## # Groups:   is_oversize, present_or_not [4]
##   is_oversize present_or_not     n
##         <dbl> <fct>          <int>
## 1           0 1               2812
## 2           0 0                 69
## 3           1 1                325
## 4           1 0                 15
df %>%
  filter(location_code == "olin") %>%
  group_by(is_oversize) %>%
  summarise(stat = mean(present_or_not == "1"))
## # A tibble: 2 x 2
##   is_oversize  stat
##         <dbl> <dbl>
## 1           0 0.976
## 2           1 0.956
df %>%
  group_by(is_oversize) %>%
  summarise(stat = mean(present_or_not == "1"))
## # A tibble: 2 x 2
##   is_oversize  stat
##         <dbl> <dbl>
## 1           0 0.966
## 2           1 0.952
df %>%
  filter(location_code == "uris") %>%
  group_by(is_oversize) %>%
  summarise(stat = mean(present_or_not == "1"))
## # A tibble: 2 x 2
##   is_oversize  stat
##         <dbl> <dbl>
## 1           0 0.957
## 2           1 0.918
df_o <- df %>%
  group_by(location_code, is_oversize) %>%
  summarise(count = n(),
            stat = mean(present_or_not == "1")) %>%
  arrange(stat)

ggplot(df_o, aes(x = as.factor(is_oversize), y = stat)) +
  geom_boxplot()

ggplot(df_o, aes(x = location_code, y = stat, color = as.factor(is_oversize), size = count)) +
  geom_point() +
  coord_flip()

#  geom_text(aes(label=paste("",count), hjust=1, vjust=-0.5, size = 0.05))

df_o %>%
  select(-count) %>%
  spread(is_oversize, stat) %>%
  filter(!is.na(`1`)) %>%
  summarize(difference = abs(`0` -`1`)) %>%
  arrange(desc(difference))
## # A tibble: 9 x 2
##   location_code difference
##   <fct>              <dbl>
## 1 mann              0.0976
## 2 sasa              0.0804
## 3 jgsm              0.0750
## 4 uris              0.0389
## 5 afr               0.0213
## 6 was               0.0212
## 7 olin              0.0202
## 8 ech               0.0122
## 9 mus               0.0112
# model 4
print("model 4")
## [1] "model 4"
mod4 <- glm(present_or_not ~ is_oversize, data=df, family=binomial)
summary(mod4)
## 
## Call:
## glm(formula = present_or_not ~ is_oversize, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3133  -0.2639  -0.2639  -0.2639   2.5979  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.33983    0.07543 -44.279   <2e-16 ***
## is_oversize  0.35018    0.19369   1.808   0.0706 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1854.1  on 6005  degrees of freedom
## Residual deviance: 1851.0  on 6004  degrees of freedom
## AIC: 1855
## 
## Number of Fisher Scoring iterations: 6
# model 5
print("model 5")
## [1] "model 5"
mod5 <- glm(present_or_not ~ location_code + is_oversize, data=df, family=binomial)
summary(mod5)
## 
## Call:
## glm(formula = present_or_not ~ location_code + is_oversize, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4842  -0.3072  -0.2231  -0.2231   3.0651  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.6806     0.1150 -32.004  < 2e-16 ***
## location_codewas     0.2042     0.2544   0.803  0.42219    
## location_codeech     0.3540     0.2666   1.328  0.18434    
## location_codelaw     0.7078     0.2470   2.866  0.00416 ** 
## location_codemann    1.0996     0.2583   4.257 2.07e-05 ***
## location_codeuris    0.6504     0.2978   2.184  0.02898 *  
## location_codesasa    0.7094     0.3177   2.233  0.02558 *  
## location_codeilr     1.3627     0.3126   4.359 1.31e-05 ***
## location_codemath    0.7719     0.4347   1.776  0.07581 .  
## location_codemus    -1.0076     1.0112  -0.996  0.31904    
## location_codeafr    -0.2144     1.0167  -0.211  0.83295    
## location_codejgsm    1.1277     0.6105   1.847  0.06470 .  
## location_codehote  -11.8855   297.0818  -0.040  0.96809    
## location_codevet     0.6361     1.0300   0.618  0.53687    
## location_codeasia  -11.8855  1455.3975  -0.008  0.99348    
## is_oversize          0.4684     0.2036   2.300  0.02145 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1854.1  on 6005  degrees of freedom
## Residual deviance: 1809.9  on 5990  degrees of freedom
## AIC: 1841.9
## 
## Number of Fisher Scoring iterations: 14
glance(mod5)
## # A tibble: 1 x 7
##   null.deviance df.null logLik   AIC   BIC deviance df.residual
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1         1854.    6005  -905. 1842. 1949.    1810.        5990
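For a side-by-side comparison of the candidate models, glance() can be mapped over all five fitted objects (a quick sketch; every model is fit above):

list(mod1 = mod1, mod2 = mod2, mod3 = mod3, mod4 = mod4, mod5 = mod5) %>%
  map_dfr(glance, .id = "model") %>%
  select(model, AIC, deviance, df.residual) %>%
  arrange(AIC)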
aug_mod5 <- augment(mod5, type.predict = "response") %>%
  # with type.predict = "response", .fitted is already the predicted
  # probability that an item is missing
  mutate(y_hat = .fitted) %>%
  filter(location_code == "sasa" |
           location_code == "jgsm" |
           location_code == "uris" |
           location_code == "mus") %>%
  sample_n(100) %>%
  select(location_code, present_or_not, is_oversize, y_hat) %>%
  arrange(y_hat)

aug_mod5
## # A tibble: 100 x 4
##    location_code present_or_not is_oversize   y_hat
##    <fct>         <fct>                <dbl>   <dbl>
##  1 mus           1                        0 0.00912
##  2 mus           1                        0 0.00912
##  3 mus           1                        0 0.00912
##  4 mus           1                        0 0.00912
##  5 mus           1                        0 0.00912
##  6 mus           1                        0 0.00912
##  7 mus           1                        0 0.00912
##  8 mus           1                        0 0.00912
##  9 mus           1                        0 0.00912
## 10 mus           1                        0 0.00912
## # ... with 90 more rows
nd1 <- data.frame(location_code = c("mus","mus", "jgsm", "jgsm"),
                  is_oversize = c(0, 1, 0, 1))
augment(mod5, newdata = nd1, type.predict = "response")
## # A tibble: 4 x 4
##   location_code is_oversize .fitted .se.fit
## * <fct>               <dbl>   <dbl>   <dbl>
## 1 mus                     0 0.00912 0.00909
## 2 mus                     1 0.0145  0.0145 
## 3 jgsm                    0 0.0722  0.0402 
## 4 jgsm                    1 0.111   0.0621
#nd2 <- df[c(10,200,1000,3000),]
nd2 <- df[seq(1,nrow(df), by = 100),]
augment(mod5, newdata = nd2, type.predict = "response") %>%
  select(location_code, is_oversize, .fitted, .se.fit) %>%
  arrange(.fitted)
## # A tibble: 61 x 4
##    location_code is_oversize .fitted .se.fit
##    <fct>               <dbl>   <dbl>   <dbl>
##  1 mus                     0 0.00912 0.00909
##  2 afr                     0 0.0199  0.0197 
##  3 olin                    0 0.0246  0.00276
##  4 olin                    0 0.0246  0.00276
##  5 olin                    0 0.0246  0.00276
##  6 olin                    0 0.0246  0.00276
##  7 olin                    0 0.0246  0.00276
##  8 olin                    0 0.0246  0.00276
##  9 olin                    0 0.0246  0.00276
## 10 olin                    0 0.0246  0.00276
## # ... with 51 more rows

What else can we learn from the call numbers?

df <- df %>%
  mutate(call_first_letter = call_nbr_display,
         call_first_letter = str_replace_all(call_first_letter, regex("oversize", ignore_case = TRUE), ""),
         call_first_letter = str_trim(call_first_letter),
         call_first_letter = str_sub(call_first_letter, 1,1),
         call_first_letter = as.factor(call_first_letter))

# bootstrap accounted-for rates by location and first letter of call number

df_loc_call <- df %>%
  group_by(call_first_letter, location_code) %>%
  summarise(count = n(),
            stat = mean(present_or_not == "1")) %>%
  mutate(num_available = round(count * stat)) %>%
  mutate(num_missing = count - num_available) %>%
  mutate(bootmean = NA,
         bootse = NA) %>%
  filter(!count < 5) %>%
  arrange(stat, location_code)

# bootstrap each location x call-letter cell, filling the bootmean and
# bootse placeholders created above
for(j in 1:nrow(df_loc_call)) {
  df_loc_call_subset <- df %>%
    filter(location_code == df_loc_call$location_code[j] & call_first_letter == df_loc_call$call_first_letter[j])
  df_ret <- get_boot(df_loc_call_subset,replimit)
  df_loc_call$bootmean[j] <- mean(df_ret$stat)
  df_loc_call$bootse[j] <- sd(df_ret$stat)
}

df_loc_call
## # A tibble: 81 x 8
## # Groups:   call_first_letter [23]
##    call_first_lett~ location_code count  stat num_available num_missing
##    <fct>            <fct>         <int> <dbl>         <dbl>       <dbl>
##  1 T                law               6 0.667             4           2
##  2 L                mann             23 0.696            16           7
##  3 J                sasa             11 0.727             8           3
##  4 S                ech               5 0.8               4           1
##  5 R                ilr               5 0.8               4           1
##  6 L                was               6 0.833             5           1
##  7 M                ech               6 0.833             5           1
##  8 Q                ech               7 0.857             6           1
##  9 H                law              22 0.864            19           3
## 10 T                mann             25 0.88             22           3
## # ... with 71 more rows, and 2 more variables: bootmean <dbl>,
## #   bootse <dbl>
df_loc_call %>%
  filter(bootmean < p_hat) %>%
  mutate(loc_firstl = paste(location_code, call_first_letter, sep = "_")) %>%
  arrange(loc_firstl) %>%
  ggplot(aes(y=bootmean, x=fct_reorder(loc_firstl, -bootmean) )) +
  geom_errorbar(aes(ymin=bootmean-bootse, ymax=bootmean+bootse, width = 0.1)) +
  geom_point(color="red") +
  coord_flip() +
  scale_y_continuous(limits = c(.470,1.005), breaks = seq(.60,1.0, by = .02)) + labs(title = paste0("\nCUL monograph accounted for results (< grand mean of ", round(p_hat,3), ")" ) ) +
  xlab("location and first letter of callnum") +
  ylab("proportion accounted for") +
  theme_minimal()

  # geom_hline(yintercept = p_hat, col = "yellow")

p_hat
## [1] 0.9642025

Next steps?

To be systematic about addressing the uneven accounted-for rates, and to bring CUL up to the level of EAST, we need a process for regularly sampling each location_code + call_first_letter group across the system (a stratified sample). Statisticians often use 30 as a sample-size cutoff, a reasonable compromise between accuracy and data-collection effort.
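A minimal sketch of that sampling step, assuming a holdings frame df_holdings with one row per volume and the location_code and call_first_letter columns already derived (neither the frame nor the derivation is built above), plus dplyr >= 1.0 for slice_sample():

stratified_sample <- df_holdings %>%
  group_by(location_code, call_first_letter) %>%
  slice_sample(n = 30) %>%  # returns the whole stratum if it has fewer than 30 rows
  ungroup()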

df %>%
  group_by(location_code, call_first_letter) %>%
  count() %>%
  arrange(n)
## # A tibble: 160 x 3
## # Groups:   location_code, call_first_letter [160]
##    location_code call_first_letter     n
##    <fct>         <fct>             <int>
##  1 olin          m                     1
##  2 was           E                     1
##  3 was           K                     1
##  4 was           S                     1
##  5 was           V                     1
##  6 ech           A                     1
##  7 law           C                     1
##  8 law           F                     1
##  9 law           P                     1
## 10 mann          A                     1
## # ... with 150 more rows
160 * 30
## [1] 4800

That’s a big number. What if we excluded the location_code + call_first_letter groups that already contained more than 30 items in the original sample?

df %>%
  group_by(location_code, call_first_letter) %>%
  count() %>%
  filter(!n > 30) %>%
  arrange(n)
## # A tibble: 125 x 3
## # Groups:   location_code, call_first_letter [125]
##    location_code call_first_letter     n
##    <fct>         <fct>             <int>
##  1 olin          m                     1
##  2 was           E                     1
##  3 was           K                     1
##  4 was           S                     1
##  5 was           V                     1
##  6 ech           A                     1
##  7 law           C                     1
##  8 law           F                     1
##  9 law           P                     1
## 10 mann          A                     1
## # ... with 115 more rows
125 * 30
## [1] 3750

Additional detail about the data sample

There were 6,006 volumes on the sample list, all 6,006 of which were checked.

Accounted for: 5,791 of 6,006, an overall rate of 96.4%. By location: afr 98.0%, asia 100%, ech 96.1%, hote 100%, ilr 91.0%, jgsm 92.7%, law 95.1%, mann 92.9%, math 94.8%, mus 99.0%, olin 97.4%, sasa 94.6%, uris 94.8%, vet 95.5%, was 96.7%.
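These per-location rates can be recomputed directly from df (a quick sketch, same pattern as the group summaries above):

df %>%
  group_by(location_code) %>%
  summarise(accounted_for = mean(present_or_not == "1")) %>%
  arrange(desc(accounted_for))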

Of the accounted-for items, 96.8% were physically present on the shelf (5,606 of 5,791; the remainder were checked out).

# counts of the raw outcome labels (run before the 0/1 recode)
df %>% count(present_or_not)
## # A tibble: 4 x 2
##   present_or_not          n
## 1 CheckedOut            185
## 2 LMBO                   18
## 3 NotOnShelf-Verified   197
## 4 Present              5606

df %>% count(present_or_not) %>% summarise(sum = sum(n))
## # A tibble: 1 x 1
##     sum
## 1  6006

5606 + 185
## [1] 5791
5791 / 6006
## [1] 0.9642025
5606 + 185 + 18
## [1] 5809
5809 / 6006
## [1] 0.9671995
5606 / 5791
## [1] 0.9680539

# accounted_for = CheckedOut + Present
missing <- 18 + 197
1 - (missing / 6006)
## [1] 0.9642025