Overview

# prepare data

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(stringr)
library(forcats)
library(readxl)
library(broom)
library(janitor)


# load data
df_raw <- read_excel("data/after_samplecollection/Inventory list with Pub Date 20180817.xlsx")

df <- df_raw %>% 
  clean_names()

prepare_df <- function(df_raw) {
  df$present_or_not <- as.factor(df$present_or_not)
  df <- df %>%
    mutate(present_or_not = fct_recode(present_or_not,
                                  "1" = "Present",
                                  "1" = "CheckedOut",
                                  "0" = "NotOnShelf-Verified",
                                  "0" = "LMBO")) %>%
    mutate(location_code = as.factor(location_code),
         location_code = fct_infreq(location_code))
 
  df$has_circulated <- ifelse(df$recorded_uses_item > 0, 1, 0)
  df$is_oversize <- ifelse(str_detect(df$display_call_no, "\\+"), 1, 0)
  df$begin_pub_date <- as.numeric(df$begin_pub_date)
  df <- df %>%
    mutate(age = 2018 - begin_pub_date)
  df$age_group <- NA
  df$age_group[df$age <= 10] <- "0-10"
  df$age_group[df$age > 10] <- "11plus"
  df$age_group <- as.factor(df$age_group)
  df$firstletter <- as.factor(df$firstletter)

  return(df)
}

df <- prepare_df(df_raw)

df_location_code_gt30 <- df %>%
  group_by(location_code) %>%
  summarise(total = n()) %>%
  filter(total >= 29) %>%
  select(location_code)

df <- df %>%
  inner_join(df_location_code_gt30)
## Joining, by = "location_code"
glimpse(df)
## Observations: 5,950
## Variables: 31
## $ present_or_not     <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ bib_rec_nbr        <chr> "1968678", "2249095", "5689943", "8618953",...
## $ mfhd_id            <chr> "2389846", "2702959", "6199187", "8994997",...
## $ item_control_nbr   <chr> "3723592", "4103508", "7620171", "9494855",...
## $ barcode            <chr> "31924062968908", "31924072130184", "319241...
## $ begin_pub_date     <dbl> 1960, 1993, 1971, 2013, 2010, 1971, 1994, 1...
## $ location_code      <fct> afr, afr, afr, afr, afr, afr, afr, afr, afr...
## $ firstletter        <fct> D, D, D, D, D, D, D, E, E, E, E, E, E, E, E...
## $ class              <chr> "DT", "DT", "DT", "DT", "DT", "DT", "DT", "...
## $ classnumber        <dbl> 32.000, 328.000, 356.000, 433.285, 433.545,...
## $ normalized_call_no <chr> "DT   32            R 61", "DT  328        ...
## $ display_call_no    <chr> "DT32 .R61", "DT328.M53 .H3613x 1993", "DT3...
## $ call_nbr_norm_item <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ enumeration        <chr> NA, NA, NA, NA, NA, NA, NA, "c.2", NA, NA, ...
## $ length_cn          <dbl> 9, 22, 14, 19, 22, 20, 18, 11, 12, 18, 18, ...
## $ pagination         <chr> "312 p. 22 cm.", "xv, 199 p. : ill. ; 24 cm...
## $ title              <chr> "Death of Africa. By Peter Ritner.", "Victi...
## $ recorded_uses_item <dbl> 1, 0, 2, 0, 0, 10, 2, 0, 56, 1, 2, 4, 3, 2,...
## $ worldcat_oclc_nbr  <chr> "412793", "59941146", "148569", "869824175"...
## $ catalog_url        <chr> "https://newcatalog.library.cornell.edu/cat...
## $ us_holdings        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ row_number         <dbl> 1585081, 735421, 2715241, 850681, 84661, 55...
## $ initials           <chr> "mah94", "mah94", "mah94", "mah94", "mah94"...
## $ condition          <chr> "Acceptable", "Excellent", "Acceptable", "E...
## $ barcode_validation <chr> "yes", "yes", "yes", "yes", "yes", "no", "y...
## $ timestamp          <dttm> 2018-07-06 13:56:22, 2018-07-06 13:56:22, ...
## $ item_status_desc   <chr> "Not Charged", "Not Charged", "Not Charged"...
## $ has_circulated     <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1...
## $ is_oversize        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ age                <dbl> 58, 25, 47, 5, 8, 47, 24, 53, 28, 39, 36, 3...
## $ age_group          <fct> 11plus, 11plus, 11plus, 0-10, 0-10, 11plus,...

More data prep and some models

df <- df %>%
  unite("combo", location_code, firstletter, sep = "_", remove = FALSE)

df_location_code_letter_gt30 <- df %>%
  group_by(combo) %>%
  summarise(total = n()) %>%
  filter(total > 50)

df_combo <- df %>%
  inner_join(df_location_code_letter_gt30)
## Joining, by = "combo"
df_combo$combo <- as.factor(df_combo$combo)

df_combo %>%
  group_by(combo) %>%
  nest(-combo) %>%
  mutate(models = map(data, ~ glm(present_or_not ~ length_cn + recorded_uses_item, ., family=binomial))) %>%
  mutate(tidied = map(models,tidy)) %>%
  unnest(tidied) %>%
  filter(term != "(Intercept)") %>%
  arrange(p.value)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## # A tibble: 52 x 6
##    combo  term               estimate std.error statistic p.value
##    <fct>  <chr>                 <dbl>     <dbl>     <dbl>   <dbl>
##  1 uris_T length_cn            0.203     0.0848      2.40  0.0164
##  2 ech_D  length_cn            0.307     0.140       2.19  0.0287
##  3 olin_H length_cn            0.121     0.0559      2.17  0.0300
##  4 math_Q recorded_uses_item   0.0461    0.0215      2.14  0.0320
##  5 olin_K length_cn            0.348     0.175       1.99  0.0461
##  6 olin_F length_cn            0.146     0.0742      1.97  0.0490
##  7 ilr_H  recorded_uses_item  -0.661     0.343      -1.93  0.0538
##  8 olin_K recorded_uses_item   0.434     0.290       1.49  0.135 
##  9 olin_B recorded_uses_item   0.0588    0.0478      1.23  0.219 
## 10 olin_E length_cn            0.103     0.0880      1.17  0.241 
## # ... with 42 more rows
df_combo %>%
  group_by(location_code) %>%
  nest(-location_code) %>%
  mutate(models = map(data, ~ glm(present_or_not ~ is_oversize, ., family=binomial))) %>%
  mutate(tidied = map(models,tidy)) %>%
  unnest(tidied) %>%
  mutate(p_adjust = p.adjust(p.value)) %>%
  filter(term != "(Intercept)") %>%
  arrange(p.value)
## # A tibble: 7 x 7
##   location_code term        estimate std.error statistic p.value p_adjust
##   <fct>         <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>
## 1 olin          is_oversize    0.654     0.300   2.18     0.0294    0.206
## 2 uris          is_oversize    0.750     0.670   1.12     0.263     1    
## 3 ech           is_oversize    0.687     0.619   1.11     0.267     1    
## 4 was           is_oversize   -0.978     1.05   -0.934    0.351     1    
## 5 mann          is_oversize  -14.7    2284.     -0.00642  0.995     1    
## 6 sasa          is_oversize  -13.9    2400.     -0.00579  0.995     1    
## 7 mus           is_oversize  -16.2    5118.     -0.00316  0.997     1
# model 2a
print("model 2a")
## [1] "model 2a"
mod2a <- glm(present_or_not ~ location_code, data=df, family=binomial)
summary(mod2a)
## 
## Call:
## glm(formula = present_or_not ~ location_code, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4334  -0.2819  -0.2273  -0.2273   3.0414  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.6433     0.1119 -32.569  < 2e-16 ***
## location_codewas    0.2299     0.2586   0.889  0.37392    
## location_codeech    0.4379     0.2652   1.651  0.09868 .  
## location_codelaw    0.6753     0.2456   2.750  0.00596 ** 
## location_codemann   1.0784     0.2576   4.186 2.84e-05 ***
## location_codeuris   0.7372     0.2964   2.487  0.01287 *  
## location_codesasa   0.7764     0.3172   2.448  0.01437 *  
## location_codeilr    1.3255     0.3115   4.256 2.08e-05 ***
## location_codemath   0.7346     0.4339   1.693  0.09044 .  
## location_codemus   -0.9718     1.0111  -0.961  0.33652    
## location_codeafr   -0.2279     1.0165  -0.224  0.82264    
## location_codejgsm   1.1044     0.6101   1.810  0.07025 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1823.6  on 5949  degrees of freedom
## Residual deviance: 1784.7  on 5938  degrees of freedom
## AIC: 1808.7
## 
## Number of Fisher Scoring iterations: 7
# augment(mod2a, type.predict = "response") %>%
#   sample_n(100) %>%
#   arrange(.fitted)

# augment(mod2a, type.predict = "response") %>%
#   group_by(location_code) %>%
#   summarize(total = n(),
#             mean_fitted = mean(.fitted),
#             mean_se_fit = mean(.se.fit)) %>%
#   arrange(total)


# augment(mod2a, type.predict = "response") %>%
#   ggplot(aes(x = .fitted)) +
#   geom_density() +
#   facet_wrap(~ location_code, scales = "free")

Model 2b

# model 2b
print("model 2b")
## [1] "model 2b"
mod2b <- glm(present_or_not ~ location_code + length_cn + recorded_uses_item, data=df, family=binomial)
summary(mod2b)
## 
## Call:
## glm(formula = present_or_not ~ location_code + length_cn + recorded_uses_item, 
##     family = binomial, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3232  -0.2889  -0.2374  -0.2172   2.8437  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -4.334379   0.262109 -16.537  < 2e-16 ***
## location_codewas    0.162951   0.260983   0.624 0.532382    
## location_codeech    0.365750   0.267962   1.365 0.172275    
## location_codelaw    0.668289   0.246645   2.710 0.006738 ** 
## location_codemann   1.027301   0.259795   3.954 7.68e-05 ***
## location_codeuris   0.589947   0.300343   1.964 0.049502 *  
## location_codesasa   0.703886   0.319179   2.205 0.027433 *  
## location_codeilr    1.340871   0.315856   4.245 2.18e-05 ***
## location_codemath   0.743028   0.438712   1.694 0.090331 .  
## location_codemus   -0.934293   1.011942  -0.923 0.355868    
## location_codeafr   -0.216119   1.017210  -0.212 0.831746    
## location_codejgsm   0.714289   0.683650   1.045 0.296108    
## length_cn           0.036283   0.012946   2.803 0.005068 ** 
## recorded_uses_item  0.018457   0.005572   3.312 0.000926 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1823.6  on 5949  degrees of freedom
## Residual deviance: 1768.6  on 5936  degrees of freedom
## AIC: 1796.6
## 
## Number of Fisher Scoring iterations: 7
augment(mod2b, type.predict = "response") %>%
  #filter(.fitted > 0.01 & present_or_not == "0") %>%
  filter(.fitted > 0.05) %>%
  arrange(desc(.fitted))
## # A tibble: 956 x 11
##    present_or_not location_code length_cn recorded_uses_i~ .fitted .se.fit
##    <fct>          <fct>             <dbl>            <dbl>   <dbl>   <dbl>
##  1 0              jgsm                 17              204   0.682  0.250 
##  2 1              ilr                  17              147   0.583  0.204 
##  3 1              law                  16              145   0.399  0.196 
##  4 1              jgsm                 18              108   0.274  0.154 
##  5 1              mann                 10               98   0.243  0.103 
##  6 1              law                  18               94   0.218  0.0937
##  7 0              math                 14               89   0.191  0.0943
##  8 1              uris                 15               92   0.182  0.0826
##  9 0              mann                 15               68   0.181  0.0607
## 10 1              mann                 49                1   0.181  0.0696
## # ... with 946 more rows, and 5 more variables: .resid <dbl>, .hat <dbl>,
## #   .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
df %>% filter(present_or_not == "0")
## # A tibble: 211 x 32
##    present_or_not bib_rec_nbr mfhd_id item_control_nbr barcode
##    <fct>          <chr>       <chr>   <chr>            <chr>  
##  1 0              7732244     8149908 9144851          319241~
##  2 0              1130676     1387582 2457742          319240~
##  3 0              2522770     3016635 4427745          319240~
##  4 0              3894667     4465433 5917889          319240~
##  5 0              76363       102907  125824           319240~
##  6 0              4033240     4594846 6113168          319240~
##  7 0              9278443     9613918 9969752          319241~
##  8 0              2117586     2557433 3942405          319240~
##  9 0              2536204     3031081 4442395          319240~
## 10 0              2819311     3339767 4750858          319240~
## # ... with 201 more rows, and 27 more variables: begin_pub_date <dbl>,
## #   combo <chr>, location_code <fct>, firstletter <fct>, class <chr>,
## #   classnumber <dbl>, normalized_call_no <chr>, display_call_no <chr>,
## #   call_nbr_norm_item <dbl>, enumeration <chr>, length_cn <dbl>,
## #   pagination <chr>, title <chr>, recorded_uses_item <dbl>,
## #   worldcat_oclc_nbr <chr>, catalog_url <chr>, us_holdings <dbl>,
## #   row_number <dbl>, initials <chr>, condition <chr>,
## #   barcode_validation <chr>, timestamp <dttm>, item_status_desc <chr>,
## #   has_circulated <dbl>, is_oversize <dbl>, age <dbl>, age_group <fct>