Overview
# prepare data
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(stringr)
library(forcats)
library(readxl)
library(broom)
library(janitor)
# load data
df_raw <- read_excel("data/after_samplecollection/Inventory list with Pub Date 20180817.xlsx")
df <- df_raw %>%
clean_names()
prepare_df <- function(df_raw) {
df$present_or_not <- as.factor(df$present_or_not)
df <- df %>%
mutate(present_or_not = fct_recode(present_or_not,
"1" = "Present",
"1" = "CheckedOut",
"0" = "NotOnShelf-Verified",
"0" = "LMBO")) %>%
mutate(location_code = as.factor(location_code),
location_code = fct_infreq(location_code))
df$has_circulated <- ifelse(df$recorded_uses_item > 0, 1, 0)
df$is_oversize <- ifelse(str_detect(df$display_call_no, "\\+"), 1, 0)
df$begin_pub_date <- as.numeric(df$begin_pub_date)
df <- df %>%
mutate(age = 2018 - begin_pub_date)
df$age_group <- NA
df$age_group[df$age <= 10] <- "0-10"
df$age_group[df$age > 10] <- "11plus"
df$age_group <- as.factor(df$age_group)
df$firstletter <- as.factor(df$firstletter)
return(df)
}
df <- prepare_df(df_raw)
df_location_code_gt30 <- df %>%
group_by(location_code) %>%
summarise(total = n()) %>%
filter(total >= 29) %>%
select(location_code)
df <- df %>%
inner_join(df_location_code_gt30)
## Joining, by = "location_code"
glimpse(df)
## Observations: 5,950
## Variables: 31
## $ present_or_not <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ bib_rec_nbr <chr> "1968678", "2249095", "5689943", "8618953",...
## $ mfhd_id <chr> "2389846", "2702959", "6199187", "8994997",...
## $ item_control_nbr <chr> "3723592", "4103508", "7620171", "9494855",...
## $ barcode <chr> "31924062968908", "31924072130184", "319241...
## $ begin_pub_date <dbl> 1960, 1993, 1971, 2013, 2010, 1971, 1994, 1...
## $ location_code <fct> afr, afr, afr, afr, afr, afr, afr, afr, afr...
## $ firstletter <fct> D, D, D, D, D, D, D, E, E, E, E, E, E, E, E...
## $ class <chr> "DT", "DT", "DT", "DT", "DT", "DT", "DT", "...
## $ classnumber <dbl> 32.000, 328.000, 356.000, 433.285, 433.545,...
## $ normalized_call_no <chr> "DT 32 R 61", "DT 328 ...
## $ display_call_no <chr> "DT32 .R61", "DT328.M53 .H3613x 1993", "DT3...
## $ call_nbr_norm_item <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ enumeration <chr> NA, NA, NA, NA, NA, NA, NA, "c.2", NA, NA, ...
## $ length_cn <dbl> 9, 22, 14, 19, 22, 20, 18, 11, 12, 18, 18, ...
## $ pagination <chr> "312 p. 22 cm.", "xv, 199 p. : ill. ; 24 cm...
## $ title <chr> "Death of Africa. By Peter Ritner.", "Victi...
## $ recorded_uses_item <dbl> 1, 0, 2, 0, 0, 10, 2, 0, 56, 1, 2, 4, 3, 2,...
## $ worldcat_oclc_nbr <chr> "412793", "59941146", "148569", "869824175"...
## $ catalog_url <chr> "https://newcatalog.library.cornell.edu/cat...
## $ us_holdings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ row_number <dbl> 1585081, 735421, 2715241, 850681, 84661, 55...
## $ initials <chr> "mah94", "mah94", "mah94", "mah94", "mah94"...
## $ condition <chr> "Acceptable", "Excellent", "Acceptable", "E...
## $ barcode_validation <chr> "yes", "yes", "yes", "yes", "yes", "no", "y...
## $ timestamp <dttm> 2018-07-06 13:56:22, 2018-07-06 13:56:22, ...
## $ item_status_desc <chr> "Not Charged", "Not Charged", "Not Charged"...
## $ has_circulated <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1...
## $ is_oversize <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ age <dbl> 58, 25, 47, 5, 8, 47, 24, 53, 28, 39, 36, 3...
## $ age_group <fct> 11plus, 11plus, 11plus, 0-10, 0-10, 11plus,...
More data prep and some models
df <- df %>%
unite("combo", location_code, firstletter, sep = "_", remove = FALSE)
df_location_code_letter_gt30 <- df %>%
group_by(combo) %>%
summarise(total = n()) %>%
filter(total > 50)
df_combo <- df %>%
inner_join(df_location_code_letter_gt30)
## Joining, by = "combo"
df_combo$combo <- as.factor(df_combo$combo)
df_combo %>%
group_by(combo) %>%
nest(-combo) %>%
mutate(models = map(data, ~ glm(present_or_not ~ length_cn + recorded_uses_item, ., family=binomial))) %>%
mutate(tidied = map(models,tidy)) %>%
unnest(tidied) %>%
filter(term != "(Intercept)") %>%
arrange(p.value)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## # A tibble: 52 x 6
## combo term estimate std.error statistic p.value
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 uris_T length_cn 0.203 0.0848 2.40 0.0164
## 2 ech_D length_cn 0.307 0.140 2.19 0.0287
## 3 olin_H length_cn 0.121 0.0559 2.17 0.0300
## 4 math_Q recorded_uses_item 0.0461 0.0215 2.14 0.0320
## 5 olin_K length_cn 0.348 0.175 1.99 0.0461
## 6 olin_F length_cn 0.146 0.0742 1.97 0.0490
## 7 ilr_H recorded_uses_item -0.661 0.343 -1.93 0.0538
## 8 olin_K recorded_uses_item 0.434 0.290 1.49 0.135
## 9 olin_B recorded_uses_item 0.0588 0.0478 1.23 0.219
## 10 olin_E length_cn 0.103 0.0880 1.17 0.241
## # ... with 42 more rows
df_combo %>%
group_by(location_code) %>%
nest(-location_code) %>%
mutate(models = map(data, ~ glm(present_or_not ~ is_oversize, ., family=binomial))) %>%
mutate(tidied = map(models,tidy)) %>%
unnest(tidied) %>%
mutate(p_adjust = p.adjust(p.value)) %>%
filter(term != "(Intercept)") %>%
arrange(p.value)
## # A tibble: 7 x 7
## location_code term estimate std.error statistic p.value p_adjust
## <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 olin is_oversize 0.654 0.300 2.18 0.0294 0.206
## 2 uris is_oversize 0.750 0.670 1.12 0.263 1
## 3 ech is_oversize 0.687 0.619 1.11 0.267 1
## 4 was is_oversize -0.978 1.05 -0.934 0.351 1
## 5 mann is_oversize -14.7 2284. -0.00642 0.995 1
## 6 sasa is_oversize -13.9 2400. -0.00579 0.995 1
## 7 mus is_oversize -16.2 5118. -0.00316 0.997 1
# model 2a
print("model 2a")
## [1] "model 2a"
mod2a <- glm(present_or_not ~ location_code, data=df, family=binomial)
summary(mod2a)
##
## Call:
## glm(formula = present_or_not ~ location_code, family = binomial,
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4334 -0.2819 -0.2273 -0.2273 3.0414
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6433 0.1119 -32.569 < 2e-16 ***
## location_codewas 0.2299 0.2586 0.889 0.37392
## location_codeech 0.4379 0.2652 1.651 0.09868 .
## location_codelaw 0.6753 0.2456 2.750 0.00596 **
## location_codemann 1.0784 0.2576 4.186 2.84e-05 ***
## location_codeuris 0.7372 0.2964 2.487 0.01287 *
## location_codesasa 0.7764 0.3172 2.448 0.01437 *
## location_codeilr 1.3255 0.3115 4.256 2.08e-05 ***
## location_codemath 0.7346 0.4339 1.693 0.09044 .
## location_codemus -0.9718 1.0111 -0.961 0.33652
## location_codeafr -0.2279 1.0165 -0.224 0.82264
## location_codejgsm 1.1044 0.6101 1.810 0.07025 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1823.6 on 5949 degrees of freedom
## Residual deviance: 1784.7 on 5938 degrees of freedom
## AIC: 1808.7
##
## Number of Fisher Scoring iterations: 7
# augment(mod2a, type.predict = "response") %>%
# sample_n(100) %>%
# arrange(.fitted)
# augment(mod2a, type.predict = "response") %>%
# group_by(location_code) %>%
# summarize(total = n(),
# mean_fitted = mean(.fitted),
# mean_se_fit = mean(.se.fit)) %>%
# arrange(total)
# augment(mod2a, type.predict = "response") %>%
# ggplot(aes(x = .fitted)) +
# geom_density() +
# facet_wrap(~ location_code, scales = "free")
Model 2b
# model 2b
print("model 2b")
## [1] "model 2b"
mod2b <- glm(present_or_not ~ location_code + length_cn + recorded_uses_item, data=df, family=binomial)
summary(mod2b)
##
## Call:
## glm(formula = present_or_not ~ location_code + length_cn + recorded_uses_item,
## family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3232 -0.2889 -0.2374 -0.2172 2.8437
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.334379 0.262109 -16.537 < 2e-16 ***
## location_codewas 0.162951 0.260983 0.624 0.532382
## location_codeech 0.365750 0.267962 1.365 0.172275
## location_codelaw 0.668289 0.246645 2.710 0.006738 **
## location_codemann 1.027301 0.259795 3.954 7.68e-05 ***
## location_codeuris 0.589947 0.300343 1.964 0.049502 *
## location_codesasa 0.703886 0.319179 2.205 0.027433 *
## location_codeilr 1.340871 0.315856 4.245 2.18e-05 ***
## location_codemath 0.743028 0.438712 1.694 0.090331 .
## location_codemus -0.934293 1.011942 -0.923 0.355868
## location_codeafr -0.216119 1.017210 -0.212 0.831746
## location_codejgsm 0.714289 0.683650 1.045 0.296108
## length_cn 0.036283 0.012946 2.803 0.005068 **
## recorded_uses_item 0.018457 0.005572 3.312 0.000926 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1823.6 on 5949 degrees of freedom
## Residual deviance: 1768.6 on 5936 degrees of freedom
## AIC: 1796.6
##
## Number of Fisher Scoring iterations: 7
augment(mod2b, type.predict = "response") %>%
#filter(.fitted > 0.01 & present_or_not == "0") %>%
filter(.fitted > 0.05) %>%
arrange(desc(.fitted))
## # A tibble: 956 x 11
## present_or_not location_code length_cn recorded_uses_i~ .fitted .se.fit
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 0 jgsm 17 204 0.682 0.250
## 2 1 ilr 17 147 0.583 0.204
## 3 1 law 16 145 0.399 0.196
## 4 1 jgsm 18 108 0.274 0.154
## 5 1 mann 10 98 0.243 0.103
## 6 1 law 18 94 0.218 0.0937
## 7 0 math 14 89 0.191 0.0943
## 8 1 uris 15 92 0.182 0.0826
## 9 0 mann 15 68 0.181 0.0607
## 10 1 mann 49 1 0.181 0.0696
## # ... with 946 more rows, and 5 more variables: .resid <dbl>, .hat <dbl>,
## # .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
df %>% filter(present_or_not == "0")
## # A tibble: 211 x 32
## present_or_not bib_rec_nbr mfhd_id item_control_nbr barcode
## <fct> <chr> <chr> <chr> <chr>
## 1 0 7732244 8149908 9144851 319241~
## 2 0 1130676 1387582 2457742 319240~
## 3 0 2522770 3016635 4427745 319240~
## 4 0 3894667 4465433 5917889 319240~
## 5 0 76363 102907 125824 319240~
## 6 0 4033240 4594846 6113168 319240~
## 7 0 9278443 9613918 9969752 319241~
## 8 0 2117586 2557433 3942405 319240~
## 9 0 2536204 3031081 4442395 319240~
## 10 0 2819311 3339767 4750858 319240~
## # ... with 201 more rows, and 27 more variables: begin_pub_date <dbl>,
## # combo <chr>, location_code <fct>, firstletter <fct>, class <chr>,
## # classnumber <dbl>, normalized_call_no <chr>, display_call_no <chr>,
## # call_nbr_norm_item <dbl>, enumeration <chr>, length_cn <dbl>,
## # pagination <chr>, title <chr>, recorded_uses_item <dbl>,
## # worldcat_oclc_nbr <chr>, catalog_url <chr>, us_holdings <dbl>,
## # row_number <dbl>, initials <chr>, condition <chr>,
## # barcode_validation <chr>, timestamp <dttm>, item_status_desc <chr>,
## # has_circulated <dbl>, is_oversize <dbl>, age <dbl>, age_group <fct>