# prepare data
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(stringr)
library(forcats)
library(readxl)
library(broom)
# load data
df_raw <- read_excel("data/after_samplecollection/complete_collected_dataset.xlsx")
prepare_df <- function(df_raw) {
  df <- df_raw %>%
    rename(present_or_not = `Present or not`)
  # collapse the four survey outcomes into a binary factor:
  # "Present" and "CheckedOut" count as accounted for ("1"),
  # "NotOnShelf-Verified" and "LMBO" as missing ("0")
  df$present_or_not <- as.factor(df$present_or_not)
  df <- df %>%
    mutate(present_or_not = fct_recode(present_or_not,
                                       "1" = "Present",
                                       "1" = "CheckedOut",
                                       "0" = "NotOnShelf-Verified",
                                       "0" = "LMBO"))
  # order location levels by frequency so the most common location
  # is the baseline in models and plots
  df$location_code <- as.factor(df$location_code)
  df$location_code <- fct_infreq(df$location_code)
  # flag items with any recorded circulation
  df$has_circulated <- ifelse(df$recorded_uses_item > 0, 1, 0)
  df <- df %>%
    select(-Initials, -ITEM_STATUS_DESC)
  return(df)
}
df <- prepare_df(df_raw)
glimpse(df)
## Observations: 6,006
## Variables: 19
## $ present_or_not <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ barcode <chr> "31924062968908", "31924072130184", "3192...
## $ location_code <fct> afr, afr, afr, afr, afr, afr, afr, afr, a...
## $ call_nbr_display <chr> "DT32 .R61", "DT328.M53 .H3613x 1993", "D...
## $ call_nbr_norm_item <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ enumeration <chr> NA, NA, NA, NA, NA, NA, NA, "c.2", NA, NA...
## $ title <chr> "Death of Africa. By Peter Ritner.", "Vic...
## $ item_control_nbr <chr> "3723592", "4103508", "7620171", "9494855...
## $ recorded_uses_item <dbl> 1, 0, 2, 0, 0, 10, 2, 0, 56, 1, 2, 4, 3, ...
## $ bib_rec_nbr <chr> "1968678", "2249095", "5689943", "8618953...
## $ worldcat_oclc_nbr <chr> "412793", "59941146", "148569", "86982417...
## $ Catalog_URL <chr> "https://newcatalog.library.cornell.edu/c...
## $ us_holdings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ row_number <dbl> 1585081, 735421, 2715241, 850681, 84661, ...
## $ Condition <chr> "Acceptable", "Excellent", "Acceptable", ...
## $ `Barcode validation` <chr> "yes", "yes", "yes", "yes", "yes", "no", ...
## $ Timestamp <dttm> 2018-07-06 13:56:22, 2018-07-06 13:56:22...
## $ Status <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ has_circulated <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,...
library(infer)
replimit <- 1000  # number of bootstrap replicates
p_hat <- df %>%
summarise(stat = mean(present_or_not == "1")) %>%
pull()
p_hat
## [1] 0.9642025
boot <- df %>%
specify(response = present_or_not, success = "1") %>%
generate(reps = replimit, type = "bootstrap") %>%
calculate(stat = "prop")
boot
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.968
## 2 2 0.965
## 3 3 0.963
## 4 4 0.965
## 5 5 0.964
## 6 6 0.966
## 7 7 0.962
## 8 8 0.965
## 9 9 0.967
## 10 10 0.965
## # ... with 990 more rows
se <- boot %>%
summarize(sd(stat)) %>%
pull()
se
## [1] 0.002359031
ggplot(boot, aes(x = stat)) +
  # geom_density() +
  geom_histogram() +
  geom_vline(xintercept = p_hat, col = "blue") +
  scale_x_continuous(breaks = seq(0.955, 0.972, by = .001))
# geom_vline(xintercept = p_hat + se, col = "green") +
# geom_vline(xintercept = p_hat - se, col = "green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using p_hat ± 2 × SE (0.9642 ± 2 × 0.00236), we are roughly 95% confident that the true proportion of accounted-for monographs across CUL is between 0.959 and 0.969.
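For reference, the interval above is just the point estimate plus or minus two bootstrap standard errors:
# approximate 95% interval
c(lower = p_hat - 2 * se, upper = p_hat + 2 * se)
# ≈ (0.959, 0.969)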
df_groups <- df %>%
group_by(location_code) %>%
summarise(total = n())
df %>%
filter(location_code == "mann") %>%
specify(response = present_or_not, success = "1") %>%
generate(reps = replimit, type = "bootstrap") %>%
calculate(stat = "prop")
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.932
## 2 2 0.922
## 3 3 0.940
## 4 4 0.925
## 5 5 0.907
## 6 6 0.911
## 7 7 0.929
## 8 8 0.936
## 9 9 0.940
## 10 10 0.932
## # ... with 990 more rows
# bootstrap the accounted-for proportion for an arbitrary subset of df
get_boot <- function(df, repnum) {
  df %>%
    specify(response = present_or_not, success = "1") %>%
    generate(reps = repnum, type = "bootstrap") %>%
    calculate(stat = "prop")
}
# run the bootstrap separately for each location
df_boot <- data.frame()
for (j in 1:nrow(df_groups)) {
  df_subset <- df %>%
    filter(location_code == df_groups$location_code[j])
  df_ret <- get_boot(df_subset, replimit)
  df_ret$location_code <- df_groups$location_code[j]
  df_boot <- rbind(df_boot, df_ret)
}
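The same per-location bootstrap can also be written without the explicit loop; a minimal sketch using purrr's map_dfr (loaded with the tidyverse), with the caveat that location_code comes back as character rather than factor here:
df_boot <- map_dfr(as.character(df_groups$location_code), function(loc) {
  df %>%
    filter(location_code == loc) %>%
    get_boot(replimit) %>%
    mutate(location_code = loc)
})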
df_group_se <- df_boot %>%
  group_by(location_code) %>%
  summarize(mean = mean(stat),
            se = sd(stat)) %>%
  inner_join(df_groups) %>%
  filter(total >= 30) %>%  # keep locations with at least 30 sampled items
  mutate(num_missing = total - round(total * mean)) %>%
  arrange(-mean)
## Joining, by = "location_code"
df_group_se
## # A tibble: 12 x 5
## location_code mean se total num_missing
## <fct> <dbl> <dbl> <int> <dbl>
## 1 mus 0.990 0.00938 102 1
## 2 afr 0.980 0.0203 49 1
## 3 olin 0.974 0.00285 3221 84
## 4 was 0.966 0.00723 598 20
## 5 ech 0.960 0.00880 461 18
## 6 law 0.951 0.0104 452 22
## 7 math 0.948 0.0215 116 6
## 8 uris 0.948 0.0140 270 14
## 9 sasa 0.946 0.0151 223 12
## 10 mann 0.929 0.0154 281 20
## 11 jgsm 0.927 0.0415 41 3
## 12 ilr 0.911 0.0245 145 13
ggplot(df_group_se, aes(y = mean, x = fct_reorder(location_code, -mean), label = total)) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
  geom_point(color = "red") +
  scale_y_continuous(limits = c(.88, 1.005), breaks = seq(.88, 1.0, by = .01)) +
  labs(title = "\nCUL monograph accounted-for results, by location") +
  xlab("location") +
  ylab("proportion accounted for") +
  geom_hline(yintercept = p_hat, col = "yellow")
# coord_flip()
ggsave("output/cul_monograph_results_by_loc.png", dpi = 600, width = 8, height = 5)
# load EAST dataset
df_east <- read_excel("data/east_consortium/TattleTapeSurveyResultsToShare.xlsx", skip = 1)
# keep only the first 11 columns of the EAST dataset
df_east <- df_east[, 1:11]
df_east <- df_east %>%
filter(!is.na(library)) %>%
filter(!is.na(validation_score)) %>%
select(library, validation_score) %>%
arrange(desc(validation_score))
df_east
## # A tibble: 37 x 2
## library validation_score
## <chr> <dbl>
## 1 jackal 0.997
## 2 squirrel 0.995
## 3 giraffe 0.994
## 4 nyan cat 0.994
## 5 leopard 0.992
## 6 grizzly 0.99
## 7 otter 0.99
## 8 cheetah 0.99
## 9 dolphin 0.989
## 10 liger 0.988
## # ... with 27 more rows
mean(df_east$validation_score)
## [1] 0.9726216
# run simulation on EAST dataset
p_hat_east <- df_east %>%
summarise(stat = mean(validation_score)) %>%
pull()
p_hat_east
## [1] 0.9726216
boot_east <- df_east %>%
specify(response = validation_score) %>%
generate(reps = replimit, type = "bootstrap") %>%
calculate(stat = "mean")
boot_east
## # A tibble: 1,000 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.972
## 2 2 0.977
## 3 3 0.975
## 4 4 0.976
## 5 5 0.972
## 6 6 0.973
## 7 7 0.973
## 8 8 0.975
## 9 9 0.968
## 10 10 0.972
## # ... with 990 more rows
se_east <- boot_east %>%
summarize(sd(stat)) %>%
pull()
se_east
## [1] 0.002893712
ggplot(boot_east, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = p_hat_east, col = "red") +
geom_vline(xintercept = p_hat, col = "blue") +
annotate("text", x = p_hat, y = 25, label = "Cornell", size = 3) +
annotate("text", x = p_hat_east, y = 25, label = "EAST", size = 3)
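To put the visual gap on a rough scale, here is a back-of-the-envelope sketch (not a formal test: one estimate is a proportion of items, the other a mean of library-level scores) treating the two bootstrap distributions as independent:
# gap between the EAST and Cornell point estimates, in combined-SE units
gap <- p_hat_east - p_hat
gap / sqrt(se^2 + se_east^2)
# ≈ 2.25: the EAST average sits a bit more than two standard errors above Cornell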
# Baseline probability with no explanatory variables: the intercept of the
# null model below (-3.29342) is the log-odds of the "0" (missing) level.
#   exp(-3.29342)                       = 0.03712666  # odds of missing
#   exp(-3.29342) / (1 + exp(-3.29342)) = 0.03579761  # P(missing)
#   1 - 0.03579761                      = 0.9642024   # matches p_hat
print("model 1")
## [1] "model 1"
mod1 <- glm(present_or_not ~ 1, data=df, family=binomial)
summary(mod1)
##
## Call:
## glm(formula = present_or_not ~ 1, family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.270 -0.270 -0.270 -0.270 2.581
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.29342 0.06945 -47.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1854.1 on 6005 degrees of freedom
## Residual deviance: 1854.1 on 6005 degrees of freedom
## AIC: 1856.1
##
## Number of Fisher Scoring iterations: 6
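As a quick check, inverting the intercept's logit recovers p_hat; glm models the probability of the second factor level ("0" = missing), so the accounted-for proportion is the complement:
# plogis() is the inverse logit
1 - plogis(coef(mod1)[["(Intercept)"]])
# ≈ 0.9642, matching p_hat above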
# model 2
print("model 2")
## [1] "model 2"
mod2 <- glm(present_or_not ~ has_circulated, data=df, family=binomial)
summary(mod2)
##
## Call:
## glm(formula = present_or_not ~ has_circulated, family = binomial,
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2777 -0.2777 -0.2625 -0.2625 2.6021
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.35092 0.09976 -33.589 <2e-16 ***
## has_circulated 0.11454 0.13898 0.824 0.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1854.1 on 6005 degrees of freedom
## Residual deviance: 1853.4 on 6004 degrees of freedom
## AIC: 1857.4
##
## Number of Fisher Scoring iterations: 6
# model 3
print("model 3")
## [1] "model 3"
mod3 <- glm(present_or_not ~ location_code, data=df, family=binomial)
summary(mod3)
##
## Call:
## glm(formula = present_or_not ~ location_code, family = binomial,
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4334 -0.2822 -0.2299 -0.2299 3.0414
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6202 0.1106 -32.744 < 2e-16 ***
## location_codewas 0.2564 0.2529 1.014 0.31071
## location_codeech 0.4170 0.2646 1.576 0.11509
## location_codelaw 0.6475 0.2450 2.643 0.00821 **
## location_codemann 1.0514 0.2570 4.091 4.30e-05 ***
## location_codeuris 0.7141 0.2959 2.413 0.01581 *
## location_codesasa 0.7533 0.3167 2.378 0.01738 *
## location_codeilr 1.3024 0.3110 4.188 2.82e-05 ***
## location_codemath 0.7115 0.4336 1.641 0.10080
## location_codemus -0.9949 1.0110 -0.984 0.32507
## location_codeafr -0.2510 1.0164 -0.247 0.80495
## location_codejgsm 1.0812 0.6098 1.773 0.07622 .
## location_codehote -11.9459 297.0818 -0.040 0.96793
## location_codevet 0.5757 1.0295 0.559 0.57603
## location_codeasia -11.9459 1455.3975 -0.008 0.99345
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1854.1 on 6005 degrees of freedom
## Residual deviance: 1814.8 on 5991 degrees of freedom
## AIC: 1844.8
##
## Number of Fisher Scoring iterations: 14
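The hote and asia coefficients (about -11.9, with enormous standard errors) are a symptom of complete separation: every sampled item in those two locations was accounted for, so the estimated log-odds of being missing runs off toward negative infinity. That also explains the 14 Fisher scoring iterations. A quick check:
# both locations should come back with stat = 1 (all items accounted for)
df %>%
  filter(location_code %in% c("hote", "asia")) %>%
  group_by(location_code) %>%
  summarise(stat = mean(present_or_not == "1"))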
# note: with type.predict = "response", .fitted is the predicted
# probability of the "0" (missing) level
aug_mod3 <- augment(mod3, type.predict = "response") %>%
  # mutate(prob = (exp(.fitted) / (1+exp(.fitted)) ) ) %>%
  filter(location_code == "ilr" | location_code == "mus" | location_code == "mann") %>%
  sample_n(50) %>%
  select(location_code, present_or_not, .fitted) %>%
  arrange(.fitted)
aug_mod3
## # A tibble: 50 x 3
## location_code present_or_not .fitted
## <fct> <fct> <dbl>
## 1 mus 1 0.00980
## 2 mus 1 0.00980
## 3 mus 1 0.00980
## 4 mus 1 0.00980
## 5 mus 1 0.00980
## 6 mus 1 0.00980
## 7 mann 0 0.0712
## 8 mann 0 0.0712
## 9 mann 1 0.0712
## 10 mann 0 0.0712
## # ... with 40 more rows
glimpse(df)
## Observations: 6,006
## Variables: 19
## $ present_or_not <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ barcode <chr> "31924062968908", "31924072130184", "3192...
## $ location_code <fct> afr, afr, afr, afr, afr, afr, afr, afr, a...
## $ call_nbr_display <chr> "DT32 .R61", "DT328.M53 .H3613x 1993", "D...
## $ call_nbr_norm_item <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ enumeration <chr> NA, NA, NA, NA, NA, NA, NA, "c.2", NA, NA...
## $ title <chr> "Death of Africa. By Peter Ritner.", "Vic...
## $ item_control_nbr <chr> "3723592", "4103508", "7620171", "9494855...
## $ recorded_uses_item <dbl> 1, 0, 2, 0, 0, 10, 2, 0, 56, 1, 2, 4, 3, ...
## $ bib_rec_nbr <chr> "1968678", "2249095", "5689943", "8618953...
## $ worldcat_oclc_nbr <chr> "412793", "59941146", "148569", "86982417...
## $ Catalog_URL <chr> "https://newcatalog.library.cornell.edu/c...
## $ us_holdings <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ row_number <dbl> 1585081, 735421, 2715241, 850681, 84661, ...
## $ Condition <chr> "Acceptable", "Excellent", "Acceptable", ...
## $ `Barcode validation` <chr> "yes", "yes", "yes", "yes", "yes", "no", ...
## $ Timestamp <dttm> 2018-07-06 13:56:22, 2018-07-06 13:56:22...
## $ Status <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ has_circulated <dbl> 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,...
df %>%
select(call_nbr_display) %>%
top_n(5)
## Selecting by call_nbr_display
## # A tibble: 5 x 1
## call_nbr_display
## <chr>
## 1 maps G6690s 1000 .P6 Map
## 2 oversize HA4587.W47 W49x 1999 +
## 3 oversize HD8653.5 .M37 1997 +
## 4 oversize DS793.A5 A644 1993 v.44 +
## 5 Z989.F36 Y83 2012
# add is oversize 0-1 variable
df %>%
filter(str_detect(call_nbr_display, "\\+") ) %>%
select(call_nbr_display)
## # A tibble: 689 x 1
## call_nbr_display
## <chr>
## 1 + HD5715.5.N5 R57
## 2 + HD9017.S82 J78 1969
## 3 Oversize BL2085 .G84 1995z +
## 4 Oversize BQ972.A33 K94 2008 +
## 5 Oversize BQ4190 .B957 2000 +
## 6 Oversize BQ4570.A4 M83 2007 +
## 7 Oversize BR1260 .L364 2006 +
## 8 Oversize BX4705.S639 S55 2003 ++
## 9 Oversize CN1230.C3 C67 +
## 10 Oversize DS525 .P739 2009 +
## # ... with 679 more rows
df$is_oversize <- ifelse(str_detect(df$call_nbr_display, "\\+"), 1, 0)
df %>%
filter(is_oversize == "1") %>%
select(is_oversize, call_nbr_display)
## # A tibble: 689 x 2
## is_oversize call_nbr_display
## <dbl> <chr>
## 1 1 + HD5715.5.N5 R57
## 2 1 + HD9017.S82 J78 1969
## 3 1 Oversize BL2085 .G84 1995z +
## 4 1 Oversize BQ972.A33 K94 2008 +
## 5 1 Oversize BQ4190 .B957 2000 +
## 6 1 Oversize BQ4570.A4 M83 2007 +
## 7 1 Oversize BR1260 .L364 2006 +
## 8 1 Oversize BX4705.S639 S55 2003 ++
## 9 1 Oversize CN1230.C3 C67 +
## 10 1 Oversize DS525 .P739 2009 +
## # ... with 679 more rows
df %>%
count(is_oversize)
## # A tibble: 2 x 2
## is_oversize n
## <dbl> <int>
## 1 0 5317
## 2 1 689
df %>%
filter(location_code == "olin") %>%
group_by(is_oversize, present_or_not) %>%
count()
## # A tibble: 4 x 3
## # Groups: is_oversize, present_or_not [4]
## is_oversize present_or_not n
## <dbl> <fct> <int>
## 1 0 1 2812
## 2 0 0 69
## 3 1 1 325
## 4 1 0 15
df %>%
filter(location_code == "olin") %>%
group_by(is_oversize) %>%
summarise(stat = mean(present_or_not == "1"))
## # A tibble: 2 x 2
## is_oversize stat
## <dbl> <dbl>
## 1 0 0.976
## 2 1 0.956
df %>%
group_by(is_oversize) %>%
summarise(stat = mean(present_or_not == "1"))
## # A tibble: 2 x 2
## is_oversize stat
## <dbl> <dbl>
## 1 0 0.966
## 2 1 0.952
df %>%
filter(location_code == "uris") %>%
group_by(is_oversize) %>%
summarise(stat = mean(present_or_not == "1"))
## # A tibble: 2 x 2
## is_oversize stat
## <dbl> <dbl>
## 1 0 0.957
## 2 1 0.918
df_o <- df %>%
group_by(location_code, is_oversize) %>%
summarise(count = n(),
stat = mean(present_or_not == "1")) %>%
arrange(stat)
ggplot(df_o, aes(x = as.factor(is_oversize), y = stat)) +
geom_boxplot()
ggplot(df_o, aes(x = location_code, y = stat, color = as.factor(is_oversize), size = count ) ) +
geom_point() +
coord_flip()
# geom_text(aes(label=paste("",count), hjust=1, vjust=-0.5, size = 0.05))
df_o %>%
select(-count) %>%
spread(is_oversize, stat) %>%
filter(!is.na(`1`)) %>%
summarize(difference = abs(`0` -`1`)) %>%
arrange(desc(difference))
## # A tibble: 9 x 2
## location_code difference
## <fct> <dbl>
## 1 mann 0.0976
## 2 sasa 0.0804
## 3 jgsm 0.0750
## 4 uris 0.0389
## 5 afr 0.0213
## 6 was 0.0212
## 7 olin 0.0202
## 8 ech 0.0122
## 9 mus 0.0112
# model 4
print("model 4")
## [1] "model 4"
mod4 <- glm(present_or_not ~ is_oversize, data=df, family=binomial)
summary(mod4)
##
## Call:
## glm(formula = present_or_not ~ is_oversize, family = binomial,
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3133 -0.2639 -0.2639 -0.2639 2.5979
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.33983 0.07543 -44.279 <2e-16 ***
## is_oversize 0.35018 0.19369 1.808 0.0706 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1854.1 on 6005 degrees of freedom
## Residual deviance: 1851.0 on 6004 degrees of freedom
## AIC: 1855
##
## Number of Fisher Scoring iterations: 6
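The is_oversize coefficient is on the log-odds scale; exponentiating it gives an odds ratio (remembering that the model predicts the odds of being missing):
# odds ratio for oversize vs regular items
exp(coef(mod4)[["is_oversize"]])
# ≈ 1.42: oversize items have about 42% higher odds of being missing,
# though the effect is only marginally significant here (p ≈ 0.07)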
# model 5
print("model 5")
## [1] "model 5"
mod5 <- glm(present_or_not ~ location_code + is_oversize, data=df, family=binomial)
summary(mod5)
##
## Call:
## glm(formula = present_or_not ~ location_code + is_oversize, family = binomial,
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4842 -0.3072 -0.2231 -0.2231 3.0651
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6806 0.1150 -32.004 < 2e-16 ***
## location_codewas 0.2042 0.2544 0.803 0.42219
## location_codeech 0.3540 0.2666 1.328 0.18434
## location_codelaw 0.7078 0.2470 2.866 0.00416 **
## location_codemann 1.0996 0.2583 4.257 2.07e-05 ***
## location_codeuris 0.6504 0.2978 2.184 0.02898 *
## location_codesasa 0.7094 0.3177 2.233 0.02558 *
## location_codeilr 1.3627 0.3126 4.359 1.31e-05 ***
## location_codemath 0.7719 0.4347 1.776 0.07581 .
## location_codemus -1.0076 1.0112 -0.996 0.31904
## location_codeafr -0.2144 1.0167 -0.211 0.83295
## location_codejgsm 1.1277 0.6105 1.847 0.06470 .
## location_codehote -11.8855 297.0818 -0.040 0.96809
## location_codevet 0.6361 1.0300 0.618 0.53687
## location_codeasia -11.8855 1455.3975 -0.008 0.99348
## is_oversize 0.4684 0.2036 2.300 0.02145 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1854.1 on 6005 degrees of freedom
## Residual deviance: 1809.9 on 5990 degrees of freedom
## AIC: 1841.9
##
## Number of Fisher Scoring iterations: 14
glance(mod5)
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1854. 6005 -905. 1842. 1949. 1810. 5990
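It also helps to line the five models up side by side; a small sketch using broom::glance via purrr:
# one-row summaries for each model, ranked by AIC (lower is better)
list(mod1 = mod1, mod2 = mod2, mod3 = mod3, mod4 = mod4, mod5 = mod5) %>%
  map_dfr(glance, .id = "model") %>%
  select(model, AIC, BIC, deviance, df.residual) %>%
  arrange(AIC)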
# y_hat (= .fitted) is the predicted probability of the "0" (missing) level
aug_mod5 <- augment(mod5, type.predict = "response") %>%
  # mutate(prob = (exp(.fitted) / (1+exp(.fitted)) ) ) %>%
  mutate(y_hat = .fitted) %>%
  filter(location_code == "sasa" |
           location_code == "jgsm" |
           location_code == "uris" |
           location_code == "mus") %>%
  sample_n(100) %>%
  select(location_code, present_or_not, is_oversize, y_hat) %>%
  arrange(y_hat)
aug_mod5
## # A tibble: 100 x 4
## location_code present_or_not is_oversize y_hat
## <fct> <fct> <dbl> <dbl>
## 1 mus 1 0 0.00912
## 2 mus 1 0 0.00912
## 3 mus 1 0 0.00912
## 4 mus 1 0 0.00912
## 5 mus 1 0 0.00912
## 6 mus 1 0 0.00912
## 7 mus 1 0 0.00912
## 8 mus 1 0 0.00912
## 9 mus 1 0 0.00912
## 10 mus 1 0 0.00912
## # ... with 90 more rows
nd1 <- data.frame(location_code = c("mus","mus", "jgsm", "jgsm"),
is_oversize = c(0, 1, 0, 1))
augment(mod5, newdata = nd1, type.predict = "response")
## # A tibble: 4 x 4
## location_code is_oversize .fitted .se.fit
## * <fct> <dbl> <dbl> <dbl>
## 1 mus 0 0.00912 0.00909
## 2 mus 1 0.0145 0.0145
## 3 jgsm 0 0.0722 0.0402
## 4 jgsm 1 0.111 0.0621
#nd2 <- df[c(10,200,1000,3000),]
nd2 <- df[seq(1,nrow(df), by = 100),]
augment(mod5, newdata = nd2, type.predict = "response") %>%
select(location_code, is_oversize, .fitted, .se.fit) %>%
arrange(.fitted)
## # A tibble: 61 x 4
## location_code is_oversize .fitted .se.fit
## <fct> <dbl> <dbl> <dbl>
## 1 mus 0 0.00912 0.00909
## 2 afr 0 0.0199 0.0197
## 3 olin 0 0.0246 0.00276
## 4 olin 0 0.0246 0.00276
## 5 olin 0 0.0246 0.00276
## 6 olin 0 0.0246 0.00276
## 7 olin 0 0.0246 0.00276
## 8 olin 0 0.0246 0.00276
## 9 olin 0 0.0246 0.00276
## 10 olin 0 0.0246 0.00276
## # ... with 51 more rows
# derive the first letter of the LC call number: strip any "oversize"
# prefix, trim whitespace, then keep the first character
df <- df %>%
  mutate(call_first_letter = call_nbr_display,
         call_first_letter = str_replace_all(call_first_letter, regex("oversize", ignore_case = TRUE), ""),
         call_first_letter = str_trim(call_first_letter),
         call_first_letter = str_sub(call_first_letter, 1, 1),
         call_first_letter = as.factor(call_first_letter))
# this section not working yet
df_loc_call <- df %>%
  group_by(call_first_letter, location_code) %>%
  summarise(count = n(),
            stat = mean(present_or_not == "1")) %>%
  mutate(num_available = round(count * stat)) %>%
  mutate(num_missing = count - num_available) %>%
  mutate(bootmean = NA,
         bootse = NA) %>%
  filter(count >= 5) %>%  # skip groups too small to bootstrap meaningfully
  arrange(stat, location_code)
# bootstrap each location + first-letter group
for (j in 1:nrow(df_loc_call)) {
  df_loc_call_subset <- df %>%
    filter(location_code == df_loc_call$location_code[j] &
             call_first_letter == df_loc_call$call_first_letter[j])
  df_ret <- get_boot(df_loc_call_subset, replimit)
  df_loc_call$bootmean[j] <- mean(df_ret$stat)
  df_loc_call$bootse[j] <- sd(df_ret$stat)
}
df_loc_call
## # A tibble: 81 x 8
## # Groups: call_first_letter [23]
## call_first_lett~ location_code count stat num_available num_missing
## <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 T law 6 0.667 4 2
## 2 L mann 23 0.696 16 7
## 3 J sasa 11 0.727 8 3
## 4 S ech 5 0.8 4 1
## 5 R ilr 5 0.8 4 1
## 6 L was 6 0.833 5 1
## 7 M ech 6 0.833 5 1
## 8 Q ech 7 0.857 6 1
## 9 H law 22 0.864 19 3
## 10 T mann 25 0.88 22 3
## # ... with 71 more rows, and 2 more variables: bootmean <dbl>,
## # bootse <dbl>
df_loc_call %>%
  filter(bootmean < p_hat) %>%
  mutate(loc_firstl = paste(location_code, call_first_letter, sep = "_")) %>%
  arrange(loc_firstl) %>%
  ggplot(aes(y = bootmean, x = fct_reorder(loc_firstl, -bootmean))) +
  geom_errorbar(aes(ymin = bootmean - bootse, ymax = bootmean + bootse), width = 0.1) +
  geom_point(color = "red") +
  coord_flip() +
  scale_y_continuous(limits = c(.470, 1.005), breaks = seq(.60, 1.0, by = .02)) +
  labs(title = paste0("\nCUL monograph accounted for results (< grand mean of ", round(p_hat, 3), ")")) +
  xlab("location and first letter of callnum") +
  ylab("proportion accounted for") +
  theme_minimal()
# geom_hline(yintercept = p_hat, col = "yellow")
p_hat
## [1] 0.9642025
To address the uneven accounted-for rates systematically, and to bring CUL up to the level of EAST, we need a process for regularly sampling each location_code + call_first_letter group across the system (a stratified sample; a sketch of such a draw follows the group counts below). Statisticians often use 30 as a per-group sample size, a reasonable compromise between accuracy and data collection effort.
df %>%
group_by(location_code, call_first_letter) %>%
count() %>%
arrange(n)
## # A tibble: 160 x 3
## # Groups: location_code, call_first_letter [160]
## location_code call_first_letter n
## <fct> <fct> <int>
## 1 olin m 1
## 2 was E 1
## 3 was K 1
## 4 was S 1
## 5 was V 1
## 6 ech A 1
## 7 law C 1
## 8 law F 1
## 9 law P 1
## 10 mann A 1
## # ... with 150 more rows
160 * 30
## [1] 4800
That’s a big number. What if we excluded the location_code + call_first_letter groups that already contained more than 30 items in the original sample?
df %>%
group_by(location_code, call_first_letter) %>%
count() %>%
filter(!n > 30) %>%
arrange(n)
## # A tibble: 125 x 3
## # Groups: location_code, call_first_letter [125]
## location_code call_first_letter n
## <fct> <fct> <int>
## 1 olin m 1
## 2 was E 1
## 3 was K 1
## 4 was S 1
## 5 was V 1
## 6 ech A 1
## 7 law C 1
## 8 law F 1
## 9 law P 1
## 10 mann A 1
## # ... with 115 more rows
125 * 30
## [1] 3750
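A minimal sketch of the stratified draw itself, assuming a hypothetical `catalog` data frame of all CUL holdings that carries the same location_code and call_first_letter columns:
# `catalog` is a hypothetical frame of all holdings, not built above;
# draw up to 30 items per location_code + call_first_letter stratum,
# taking strata smaller than 30 whole
sample_plan <- catalog %>%
  group_by(location_code, call_first_letter) %>%
  do(sample_n(., size = min(30, nrow(.)))) %>%
  ungroup()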
All 6,006 volumes on the list have been checked. Accounted for: 5,791, or 96.4% overall (afr: 98.0%, asia: 100%, ech: 96.1%, hote: 100%, ilr: 91.0%, jgsm: 92.7%, law: 95.1%, mann: 92.9%, math: 94.8%, mus: 99.0%, olin: 97.4%, sasa: 94.6%, uris: 94.8%, vet: 95.5%, was: 96.7%). Of the accounted-for items, 96.8% (5,606 of 5,791) are physically present on the shelf; the rest are checked out.
# cross-check the headline numbers
# accounted_for = Present + CheckedOut
5606 + 185              # Present + CheckedOut
## [1] 5791
5791 / 6006             # proportion accounted for
## [1] 0.9642025
5606 + 185 + 18         # adding the smaller missing category (18 items)
## [1] 5809
5809 / 6006
## [1] 0.9671995
5606 / 5791             # share of accounted-for items physically on the shelf
## [1] 0.9680539
missing <- 18 + 197     # NotOnShelf-Verified + LMBO
1 - (missing / 6006)
## [1] 0.9642025