The goal of this project (from the assignment details):
library(data.table, quietly = T)
library(plyr, quietly = T)
library(ffbase, quietly = T)
library(Hmisc, quietly = T)
library(stringr, quietly = T)
#setwd("~/GitHub/IUassignments")
fread(
input = "Library_Collection_full_dataset_2016-01-23.csv",
# nrows = 10000,
header = T,
select = c(2, 5, 6, 11, 15, 16, 18, 22, 23, 24, 36, 43, 45, 46),
colClasses = c(CallNumber = "character"),
stringsAsFactors = T
) -> what
#as.ffdf(what) -> whatdb
This section excludes a set of items, as requested by a member of the IU Libraries staff.
fread("Library_Collection_full_dataset_2016-01-23.csv", nrows = 10) -> testMe
## Warning in fread("Library_Collection_full_dataset_2016-01-23.csv", nrows
## = 10): Some columns have been read as type 'integer64' but package bit64
## isn't loaded. Those columns will display as strange looking floating point
## data. There is no need to reload the data. Just require(bit64) to obtain
## the integer64 print method and print the data again.
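Per the warning’s own suggestion, loading bit64 restores the integer64 print method:
require(bit64, quietly = TRUE) #gives integer64 columns a proper print method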
#save.ffdf(what, overwrite = T)
#load.ffdf("C:/Users/Zach/Documents/GitHub/IUassignments/ffdb")
load("what")
#locations, item types, and formats to exclude; libraries to keep
excludeHome <- c("_DOCSMFIC", "_DOCSUS", "_DOCSINDRF", "_DOCSINMAP", "_DOCSINOV",
                 "_DOCSMAPS", "_DOCSMFLM", "_DOCSNEW", "_DOCSOFF1", "_DOCSOVER",
                 "_DOCSREF", "_DOCSREFD", "_DOCSREFL", "_DOCSREFS", "_DOCSSEC",
                 "_DOCSSTKL", "_DOCSSTLC", "_DOCSSTS", "_MFMCRD", "_MFMFICHE",
                 "_MFMFLM", "MICROCARD", "MICROFICHE", "MICROFILM", "MICROFORMS",
                 "_MFMPRINT", "_MFMFORM", "DISCARD", "ON-ORDER", "LOST",
                 "_XXLOST", "MISSING", "_XXMISSING")
excludeCurrent <- c("BINDERY", "DISCARDACQ")
excludeFormat <- c("SERI", "SERIAL")
keepLibrary <- c("B-ALF", "B-FINEA", "B-BUSSP", "B-EDUC", "B-CHEM", "B-LIFES", "B-WELLS")
what[!(what$HomeLocation %in% excludeHome) &
     what$ItemType != "NONCIRC" &
     !(what$CurrentLocation %in% excludeCurrent) &
     !(what$Format %in% excludeFormat) &
     what$Library %in% keepLibrary,] -> what
Here’s what we kept as columns for the data:
colnames(testMe) #original
## [1] "TitleControlNumber" "CatalogKey"
## [3] "CallSequence" "CopyNumber"
## [5] "Format" "PubYear"
## [7] "BLvl" "Type"
## [9] "BibForm" "TypeForm"
## [11] "Language" "MARCkey"
## [13] "Author" "Title"
## [15] "Library" "CallNumber"
## [17] "ShelvingKey" "ClasscodeLCSUDOCNLM"
## [19] "CallNumberRangeKey" "ItemCreatedDate"
## [21] "ItemID" "ItemType"
## [23] "HomeLocation" "CurrentLocation"
## [25] "ItemCategory1" "ItemCategory2"
## [27] "Catalogingdepartmentcode" "CatalogingStaffCode"
## [29] "Catalogingdatecode" "PlaceofPublication260a"
## [31] "Publisher260b" "DateofPublication260c"
## [33] "ISSN" "ISBN"
## [35] "GMD245h" "OCLC"
## [37] "TitleCreatedDate" "TitleCatalogedDate"
## [39] "CreatedBy" "Pagination300a"
## [41] "Illustrations300b" "Size300c"
## [43] "TotalCharges" "InHouseCharges"
## [45] "DateLastCharged" "LastActivityDate"
colnames(what) #this one
## [1] "CatalogKey" "Format" "PubYear"
## [4] "Language" "Library" "CallNumber"
## [7] "ClasscodeLCSUDOCNLM" "ItemType" "HomeLocation"
## [10] "CurrentLocation" "OCLC" "TotalCharges"
## [13] "DateLastCharged" "LastActivityDate"
This section is a bit of a duplicate of what Rachel made in SPSS.
The data itself is also a bit dirty, as shown by all of the different language entries. Look for the “|||”, “N/A”, and blank entries:
table(what$Language) -> languageTable
barplot(sort(languageTable, decreasing = T)[1:10] / length(what$Language))
Above are the 10 most common languages. Thankfully the junk entries aren’t a huge portion.
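To quantify that directly, a quick sketch (assuming the junk codes are exactly “|||”, “N/A”, and blank):
junk <- c("|||", "N/A", "")
sum(what$Language %in% junk) / length(what$Language) #proportion of junk language codes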
require(grDevices)
table(what$Format) -> formatTable
barplot(sort(formatTable, decreasing = T)[1:10] / length(what$Format), las = 2)
table(what$ItemType) -> itemtypeTable
barplot(sort(itemtypeTable, decreasing = T)[1:15] / length(what$ItemType), las = 2)
table(what$Library) -> libraryTable
barplot(sort(libraryTable, decreasing = T)[1:15] / length(what$Library), las = 2)
table(what$ClasscodeLCSUDOCNLM) -> classcodeTable
barplot(sort(classcodeTable, decreasing = T)[1:10] / length(what$ClasscodeLCSUDOCNLM), las = 2)
describe(what$TotalCharges)
## what$TotalCharges
## n missing unique Info Mean .05 .10 .25 .50
## 4151063 0 580 0.82 3.608 0 0 0 0
## .75 .90 .95
## 2 10 19
##
## lowest : 0 1 2 3 4, highest: 6555 7452 8209 9105 9124
The above describes the distribution of the check-outs.
Let’s look at total number of checkouts per language:
aggregate(TotalCharges ~ Language, data = what, sum) -> checkoutsbyLanguage
checkoutsbyLanguage[order(checkoutsbyLanguage$TotalCharges, decreasing = T),][1:20,] -> topCheckouts
round(topCheckouts$TotalCharges / sum(what$TotalCharges), 2) -> topCheckouts$percent
barplot(height = topCheckouts$percent[1:10], names.arg = topCheckouts$Language[1:10], main = "Number of Checkouts By Language", ylab = "Proportion of Checkouts")
Task: check on what is and isn’t checked out. Look at activity date by item type, and maybe by library.
What has been done below:
#convert the date strings into proper Date objects
as.Date(what$LastActivityDate, format = "%m/%d/%Y") -> what$LastActivityDate
as.Date(what$DateLastCharged, format = "%m/%d/%Y") -> what$DateLastCharged
#provide a summary of the date distributions
summary(what[,13:14, with = F])
## DateLastCharged LastActivityDate
## Min. :1900-01-01 Min. :1900-01-01
## 1st Qu.:1900-01-01 1st Qu.:1900-01-01
## Median :1900-01-01 Median :2001-09-03
## Mean :1943-12-31 Mean :1967-10-14
## 3rd Qu.:2009-04-09 3rd Qu.:2010-07-09
## Max. :2016-01-22 Max. :2023-07-17
#count items with activity dates in the future
length(what$LastActivityDate[what$LastActivityDate > "2016-03-29"])
## [1] 20
what[what$LastActivityDate < "2016-03-29",] -> what #removing the future items
#flag each item as charged (checked out at least once) or not
what$charged <- what$TotalCharges > 0
summary(what) #new summary, with all data
## CatalogKey Format PubYear Language
## Min. : 6 MARC :3824179 Min. : 0 eng :2524147
## 1st Qu.: 1321652 MAP : 197376 1st Qu.:1963 ger : 258464
## Median : 3200720 VM : 105694 Median :1981 fre : 203126
## Mean : 3921824 MANU : 12937 Mean :1951 rus : 195927
## 3rd Qu.: 5307582 SOUN : 4417 3rd Qu.:1997 spa : 181066
## Max. :15309728 MRDF : 3932 Max. :9999 chi : 119125
## (Other): 2508 (Other): 669188
## Library CallNumber ClasscodeLCSUDOCNLM
## B-ALF :2167724 BROWSING - VIDEO : 628 LC :3792909
## B-WELLS:1762819 FOREIGN FILM - BROWSING: 530 SU : 235706
## B-FINEA: 91549 CLASSIC - BROWSING : 413 AL : 58688
## B-EDUC : 68192 CLASSICS - BROWSING : 403 DEWEY : 20338
## B-BUSSP: 40654 Browsing - DVD : 359 AUTO : 14367
## B-LIFES: 10186 BROWSING - CLASSICS : 322 DE : 8177
## (Other): 9919 (Other) :4148388 (Other): 20858
## ItemType HomeLocation CurrentLocation OCLC
## NORMAL :3899398 _ALF :2010795 _ALF :1995055 Min. :0
## RESTRI : 85125 _RCSTACKS:1320496 _RCSTACKS :1198354 1st Qu.:0
## DAY14 : 58929 STACKS : 172157 _RCMAPS : 168074 Median :0
## NONCIR : 44871 _RCMAPS : 168236 STACKS : 153307 Mean :0
## HOUR02 : 18993 _RCCHINES: 86368 _RCCHINESE: 84212 3rd Qu.:0
## _BMAVI : 14652 _ALFR : 77453 CHECKEDOUT: 80906 Max. :0
## (Other): 29075 (Other) : 315538 (Other) : 471135
## TotalCharges DateLastCharged LastActivityDate
## Min. : 0.000 Min. :1900-01-01 Min. :1900-01-01
## 1st Qu.: 0.000 1st Qu.:1900-01-01 1st Qu.:1900-01-01
## Median : 0.000 Median :1900-01-01 Median :2001-09-03
## Mean : 3.608 Mean :1943-12-31 Mean :1967-10-14
## 3rd Qu.: 2.000 3rd Qu.:2009-04-09 3rd Qu.:2010-07-09
## Max. :9124.000 Max. :2016-01-22 Max. :2016-01-22
##
## charged
## Mode :logical
## FALSE:2345262
## TRUE :1805781
## NA's :0
##
##
##
Essentially, this shows the proportion of items in the two groups I made: items with 0 check-outs and items with more than 0 check-outs. Tableau would have been a better choice for this exploration than an Rpub.
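For reference, that split can be computed directly:
prop.table(table(what$charged)) #proportion never vs. ever checked out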
Maybe the most important part of this document is giving meaning to the call numbers in our dataset, and this code will do exactly that.
The steps below are as follows:
… by subsetting, limiting the data to LC (Library of Congress) coded items. To demonstrate, the following shows 20 sampled call numbers each from the data that did and didn’t meet the subset criteria.
#take a sample of 400000
what[sample(nrow(what), 400000)] -> test
#subset things which aren't LC coded
test[test$ClasscodeLCSUDOCNLM != "LC"] -> test2
#show this
test2[sample(nrow(test2), 20)][,6:7, with=F];
## CallNumber ClasscodeLCSUDOCNLM
## 1: XX(15145426.1) AUTO
## 2: FOREIGN - BROWSING AU
## 3: Y 4.AP 6/2:H 81/2/983/PT.2 SU
## 4: I 19.89:HA-629 SU
## 5: Y 4.IN 8/16:F 76/975 SU
## 6: D 103.22:W 52 SU
## 7: Y 4.IN 8/4:94-85 SU
## 8: Browsing - Series - DVD ALPH
## 9: Y 4.J 89/1:106/146 SU
## 10: PREX 2.22:6 SU
## 11: CB1714 pt. 2 AL
## 12: D 102.25:P 96/1x SU
## 13: JN34 .O373 2013 AUTO
## 14: FW 4.14: S 831/C83/No. No.2 Aike SU
## 15: I 19.88:GQ-1217 SU
## 16: Y 4.J 89/1:98/120 SU
## 17: KC0658 pt. 2 AL
## 18: Y 4.F 49:W 15 SU
## 19: Y 4.SCI 2:95/110 SU
## 20: S 1.127: C 34 1986 SU
#subset things which are LC coded
test[test$ClasscodeLCSUDOCNLM == "LC"] -> test2
#show this, and delete the demos
test2[sample(nrow(test2), 20)][,6:7, with=F]; rm(test, test2)
## CallNumber ClasscodeLCSUDOCNLM
## 1: PS3511.E54 Z56 1997 LC
## 2: JN6719.A58 W6 LC
## 3: BR115.P7 G82 2010 LC
## 4: PT2603.A3 A6 1996 LC
## 5: JN2199.C46 B6 LC
## 6: PC2073 .M35 LC
## 7: PN2598.D19 A3 1970 v.1 LC
## 8: QE185 .A42 no.66-41 LC
## 9: Z675.S3 D72 LC
## 10: E457.91 1907 v.8 LC
## 11: QD1000 .Q565 LC
## 12: PQ6658.E72 E8 1982 LC
## 13: HM22.R9 B65 LC
## 14: PQ6573 .A1 1905 v.47 LC
## 15: DT756 .C5 LC
## 16: PS1418 .W716 1841 v.1/2 LC
## 17: HF5387 .T74 LC
## 18: HV9199.S2 D5 LC
## 19: LB1523 .E9 LC
## 20: PS3515.I19 O5 LC
… to pull out call numbers and apply them to the three-column layout. Very important.
#filter out only LC-coded material
what[(what$ClasscodeLCSUDOCNLM == "LC")] -> what
#find the LC-marked material NOT meeting the call number standard
#(an uppercase letter followed by lowercase letters signals a malformed call number):
what[grep("^[A-Z]{1}[a-z]{1,10}.{1,}", what$CallNumber, perl = T),] -> calltest
paste("we have ", nrow(calltest), " items that are marked as LC but don't meet the format", sep = "")
## [1] "we have 112 items that are marked as LC but don't meet the format"
#print 20 samples of this data:
calltest[sample(nrow(calltest), 20)]
## CatalogKey Format PubYear Language Library
## 1: 3105754 MARC 1949 ara B-ALF
## 2: 6916615 VM 2007 eng B-WELLS
## 3: 7753635 VM 1997 eng B-ALF
## 4: 837201 MARC 1994 mon B-WELLS
## 5: 7753639 VM 1997 eng B-ALF
## 6: 7006071 MARC 2006 hin B-WELLS
## 7: 4424423 MARC 0 eng B-LIFES
## 8: 6797867 VM 2001 eng B-WELLS
## 9: 1171952 MARC 1963 rus B-ALF
## 10: 6633724 MARC 1972 eng B-ALF
## 11: 4077752 MAP 1900 eng B-WELLS
## 12: 4077752 MAP 1900 eng B-WELLS
## 13: 7413307 VM 1995 eng B-ALF
## 14: 4077752 MAP 1900 eng B-WELLS
## 15: 7893019 VM 2007 eng B-WELLS
## 16: 10098961 MARC 2010 wol B-WELLS
## 17: 1194845 MARC 1955 eng B-ALF
## 18: 7893019 VM 2007 eng B-WELLS
## 19: 2838414 MARC 1991 por B-WELLS
## 20: 6493939 MARC 2005 eng B-EDUC
## CallNumber ClasscodeLCSUDOCNLM ItemType HomeLocation
## 1: Ds218.K125 v.2 LC NORMAL _ALF
## 2: Series - DVD Disc 3-Browsing LC CURRICU _MDIABROW
## 3: Series - Browsing LC NORMAL _ALF
## 4: Pl415 .M76 1994 v.91 LC NORMAL _RCSTACKS
## 5: Series - Browsing LC NORMAL _ALF
## 6: Bl1138.27 .J682 2006 LC NORMAL _RCSTACKS
## 7: SchroederM131 LC HOUR02 RESERVES
## 8: Browsing-DVD LC CURRICU _MDIABROW
## 9: Dk63 .T87 v.2 LC NORMAL _ALF
## 10: Map I-709-A 1972 LC PERIOD _ALF
## 11: Cross L Ranch SW 1972V LC NORMAL _RCMAPS
## 12: Cross L Ranch 1972V LC NORMAL _RCMAPS
## 13: Series - Part 2 LC NORMAL _ALF
## 14: Chimayo 1953 PR1977 LC NORMAL _RCMAPS
## 15: Series - DVD LC CURRICU _MDIABROW
## 16: Pl8785.9.S225 N367 2010 LC NORMAL _RCSTACKS
## 17: Ds511 .H17 LC NORMAL _ALF
## 18: Series - DVD LC CURRICU _MDIABROW
## 19: Pq9698.22.U3 Q37 1991 LC NORMAL _RCSTACKS
## 20: Mt1 .S55 2005 Gr.3 Disc 3 LC HOUR02 _EDUCTXBK
## CurrentLocation OCLC TotalCharges DateLastCharged
## 1: _ALF 1.920229e-316 0 1900-01-01
## 2: _MDIABROW 6.083502e-316 23 2008-10-22
## 3: _ALF 2.621375e-316 11 2008-04-12
## 4: _RCSTACKS 1.716673e-316 4 2014-02-11
## 5: _ALF 1.930456e-316 15 2012-04-04
## 6: _RCSTACKS 3.370660e-316 0 1900-01-01
## 7: RESERVES 0.000000e+00 20 1900-01-01
## 8: _MDIABROW 2.370622e-316 24 2015-12-21
## 9: _ALF 1.373211e-316 0 1900-01-01
## 10: _ALF 0.000000e+00 4 2010-01-15
## 11: _RCMAPS 5.720088e-317 0 1900-01-01
## 12: _RCMAPS 5.720088e-317 0 1900-01-01
## 13: _ALF 2.510236e-316 3 2008-11-03
## 14: _RCMAPS 5.720088e-317 0 1900-01-01
## 15: _MDIABROW 7.163038e-316 21 2012-02-05
## 16: CHECKEDOUT 3.735309e-315 3 2015-12-30
## 17: _ALF 1.577151e-317 0 1900-01-01
## 18: _MDIABROW 7.163038e-316 22 2012-02-05
## 19: _RCSTACKS 1.466796e-316 23 2010-01-08
## 20: _EDUCTXBK 0.000000e+00 0 1900-01-01
## LastActivityDate charged
## 1: 1900-01-01 FALSE
## 2: 2008-10-29 TRUE
## 3: 2008-04-16 TRUE
## 4: 2014-03-10 TRUE
## 5: 2012-04-11 TRUE
## 6: 2007-07-05 FALSE
## 7: 1998-09-30 TRUE
## 8: 2015-12-18 TRUE
## 9: 1900-01-01 FALSE
## 10: 2011-10-04 TRUE
## 11: 1900-01-01 FALSE
## 12: 1900-01-01 FALSE
## 13: 2008-11-08 TRUE
## 14: 1900-01-01 FALSE
## 15: 2012-02-13 TRUE
## 16: 2015-12-30 TRUE
## 17: 1900-01-01 FALSE
## 18: 2012-02-13 TRUE
## 19: 2010-02-18 TRUE
## 20: 1900-01-01 FALSE
#filter these out (grepl returns a logical vector, which negates correctly; !grep on integer indices would not):
what[!grepl("^[A-Z]{1}[a-z]{1,10}.{1,}", what$CallNumber, perl = T),] -> what
#split the call number's letter prefix into three cumulative columns (CN1, CN2, CN3):
what$CN1 <- str_match(what$CallNumber, "^[A-Z]{1}")
what[!is.na(what$CN1),] -> what
what$CN2 <- str_match(what$CallNumber, "(?<=^.{1})[A-Z]{1}")
what$CN2[!is.na(what$CN2)] <- paste(what$CN1[!is.na(what$CN2)], what$CN2[!is.na(what$CN2)], sep = "")
what$CN2[is.na(what$CN2)] <- "EMPTY"
what$CN3 <- str_match(what$CallNumber, "(?<=^.{2})[A-Z]{1}")
what$CN3[!is.na(what$CN3)] <- paste(what$CN2[!is.na(what$CN3)], what$CN3[!is.na(what$CN3)], sep = "")
what$CN3[is.na(what$CN3)] <- "EMPTY"
#refactor the call numbers, dropping factor levels that only existed in the removed rows
factor(what$CallNumber) -> what$CallNumber
#what it looks like:
what[sample(nrow(what), 20),][,c(6,7,16:18), with = F]
## CallNumber ClasscodeLCSUDOCNLM CN1 CN2 CN3
## 1: HD7345.A3 A8 LC H HD EMPTY
## 2: B765 .A8 1938 v.1 LC B EMPTY EMPTY
## 3: PN710 .M945 LC P PN EMPTY
## 4: D766.613.S25 R14 LC D EMPTY EMPTY
## 5: F1219.3.M4 M36 1989 LC F EMPTY EMPTY
## 6: PG3949.3.I83 D35 1988 LC P PG EMPTY
## 7: DT57.5.L47 2011 LC D DT EMPTY
## 8: PR2976 .M37 2001 LC P PR EMPTY
## 9: BX8128.W64 E67 2008 LC B BX EMPTY
## 10: P95.82.E852 G76 2002 LC P EMPTY EMPTY
## 11: PC804.2.Z37 A5 1988 v.2 LC P PC EMPTY
## 12: D40 .M125 LC D EMPTY EMPTY
## 13: F232.E5 S7 LC F EMPTY EMPTY
## 14: PJ5055.22.A94 M88 2003 LC P PJ EMPTY
## 15: BX1430.Z2 D25 v.1 LC B BX EMPTY
## 16: DT958 .R196 LC D DT EMPTY
## 17: AM21.A2 N385 1967 LC A AM EMPTY
## 18: PR6073.H49 N8 1973 LC P PR EMPTY
## 19: PL763.6 .N69 v.16 LC P PL EMPTY
## 20: BR141 .B28 v.3 LC B BR EMPTY
paste("the percentage of all items that are empty in the first layer: ", round(length(what$CallNumber[what$CN1 == "EMPTY"])/length(what$CallNumber), 3)*100, "%", sep = "")
## [1] "the percentage of all items that are empty in the first layer: 0%"
paste("the percentage of all items that are empty in the second layer: ", round(length(what$CallNumber[what$CN2 == "EMPTY"])/length(what$CallNumber), 3)*100, "%", sep = "")
## [1] "the percentage of all items that are empty in the second layer: 18.6%"
paste("the percentage of all items that are empty in the third layer: ", round(length(what$CallNumber[what$CN3 == "EMPTY"])/length(what$CallNumber), 3)*100, "%", sep = "")
## [1] "the percentage of all items that are empty in the third layer: 99.2%"
Not such good news for the third layer.
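To make the layering logic concrete, here’s a toy example on one call number; str_match() returns NA where a pattern misses, and NA is what becomes “EMPTY”:
cn <- "BX8128.W64 E67 2008"
str_match(cn, "^[A-Z]{1}") #"B" -> CN1
str_match(cn, "(?<=^.{1})[A-Z]{1}") #"X" -> pasted onto CN1 to give "BX"
str_match(cn, "(?<=^.{2})[A-Z]{1}") #NA -> CN3 stays "EMPTY", since a digit follows "BX"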
… check the call numbers missing the third layer. First, sample 20 lines from this subset and confirm they really are empty in the third position.
what[what$CN3 == "EMPTY",] -> cn3empty
cn3empty[sample(nrow(cn3empty), 20)][,c(6,7,16:18), with = F]
## CallNumber ClasscodeLCSUDOCNLM CN1 CN2 CN3
## 1: GR203.K74 S57 1884a LC G GR EMPTY
## 2: PN56.B7 J65 1998 LC P PN EMPTY
## 3: AC149 .S73 v.333 LC A AC EMPTY
## 4: PN2786.M8 M82 LC P PN EMPTY
## 5: BP165 .B384 1991 LC B BP EMPTY
## 6: PL723 .S5455 Suppl.1 LC P PL EMPTY
## 7: PR2978 .G8 v.2 LC P PR EMPTY
## 8: PQ3919.2.B75 V47 1997 LC P PQ EMPTY
## 9: JF1338.A2 C14 LC J JF EMPTY
## 10: PL248.N15 A6 1967 LC P PL EMPTY
## 11: HC340.2 .R86 2015 LC H HC EMPTY
## 12: PT2625.U85 M4144 2005 LC P PT EMPTY
## 13: PQ7797.B635 Z7965 LC P PQ EMPTY
## 14: PK6116.E5 P3 v.1 LC P PK EMPTY
## 15: PT6432.J75 F48 1970 LC P PT EMPTY
## 16: PN98.S7 S36 LC P PN EMPTY
## 17: PG8664.G3 A4 LC P PG EMPTY
## 18: JV8885.B7 B8 1991 LC J JV EMPTY
## 19: SH331 .F23 no.475 LC S SH EMPTY
## 20: PQ4309 .D87 1990 LC P PQ EMPTY
After repeated runs, it checks out: this isn’t the script’s fault; the data just isn’t there.
There turn out to be many duplicate call numbers in this data, which came as a surprise. A quick primer on what that means: if a book with call number “DT36.7 .R47 1988 v.5 pt.1” shows up 5 times in the data, that counts as 4 duplicates, since duplicated() flags every occurrence after the first.
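A toy illustration of that counting:
x <- rep("DT36.7 .R47 1988 v.5 pt.1", 5)
sum(duplicated(x)) #4: every copy after the first is flagged
Let’s explore the amount in the real data: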
#copy the duplicates into another data set.
what[duplicated(what$CallNumber),] -> dupes
paste("the percentage of all items that are duplicates: ", round(length(dupes$CallNumber)/length(what$CallNumber), 3)*100, "%", " of ", nrow(what), sep = "")
## [1] "the percentage of all items that are duplicates: 6.2% of 3792506"
#20 most popular duplicates:
count(dupes, "CallNumber") -> dupeCounts
head(dupeCounts[order(dupeCounts$freq, decreasing = T),], 20) -> topDupes
as.character(topDupes$CallNumber) -> topDupes$CallNumber
topDupes
## CallNumber freq
## 47715 G4093.M7A43 1967 .U5 AIP-2HH-2 89
## 22909 DS773.89 .C4853 1991 ser.5,pt. 69
## 22785 DS754.19 .C46 1985 v. 127-191 64
## 47714 G4093.M7A43 1967 .U5 AIP-2HH-1 55
## 83566 HG221 .K467 1998 51
## 47916 G4094.I4G475 1887 .S3 v.2 shee 46
## 129705 PL3033 .Y825 2011 no.2 ser.4 v 37
## 47819 G4094 .K6 G475 1927 .S3 Sheet 36
## 47914 G4094.I4G475 1887 .S3 v.1 shee 36
## 42952 G3950 s62 .G4 Mammoth Cave 192 35
## 47817 G4094 .K6 G475 1909 .S3 Sheet 35
## 56329 G4370 s63 .G4 Russian Mission 35
## 58796 G8230 s50 .M6 feuilleNH-29-XXI 34
## 47818 G4094 .K6 G475 1916 .S3 Sheet 33
## 188462 XX(5823191.2) 32
## 188459 XX(5635594.2) 31
## 85484 HJ8106 .T24 30
## 28000 E185.615 .C5842 2006 pt.1,sec.A 27
## 56311 G4370 s63 .G4 Philip Smith Mtn 27
## 87339 HN373 .D5 27
A question that might pop into mind, given how many duplicate items there are: are these many copies of a few items (say, 50 copies each of 4600 different items), or mostly just 2-3 copies of many items? Let’s try to answer this.
First, we’ll make a table counting how often each unique call number shows up in the duplicates list. Because that table has one row per unique call number, its row count tells us how many distinct items have duplicates at all:
count(dupes, "CallNumber") -> dupesTable
factor(dupes$Library) -> dupes$Library
nrow(dupesTable)/length(unique(what$CallNumber)) #divides the number of unique items that have duplicates BY the number of unique call numbers in the data
## [1] 0.05413812
This isn’t a huge difference from the previous calculation, which counted duplicate rows rather than unique call numbers.
A better way to answer this question would be to just look at the distribution by how many times things are duplicated in the data:
describe(dupesTable$freq)
## dupesTable$freq
## n missing unique Info Mean .05 .10 .25 .50
## 192621 0 41 0.35 1.218 1 1 1 1
## .75 .90 .95
## 1 2 2
##
## lowest : 1 2 3 4 5, highest: 51 55 64 69 89
Considering all duplicates, everything obviously has a count of at least 1. But even at the 95th percentile, the count is still only two, so this isn’t just a few items with huge numbers of copies. There are plenty of ways to dig further; I started by showing the distributions by library:
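The loop itself isn’t echoed in this document; a minimal sketch that would print output in the shape below (using plyr’s count() and Hmisc’s describe(), both loaded above):
#for each library, describe how often each duplicated call number recurs
for (lib in levels(dupes$Library)) {
  print(paste(lib, ":", sep = ""))
  print(describe(count(dupes[dupes$Library == lib, ], "CallNumber")$freq))
}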
## [1] "B-ALF" "B-BUSSP" "B-CHEM" "B-EDUC" "B-FINEA" "B-LIFES" "B-WELLS"
## [1] "B-ALF:"
## count(dupes[dupes$Library == levels(dupes$Library)[1], ], "CallNumber")$freq
## n missing unique Info Mean .05 .10 .25 .50
## 55617 0 15 0.19 1.088 1 1 1 1
## .75 .90 .95
## 1 1 2
##
## 1 2 3 4 5 6 7 8 9 13 14 15 19 20 27
## Frequency 51879 3006 534 119 40 15 11 4 1 3 1 1 1 1 1
## % 93 5 1 0 0 0 0 0 0 0 0 0 0 0 0
## [1] "B-BUSSP:"
## count(dupes[dupes$Library == levels(dupes$Library)[2], ], "CallNumber")$freq
## n missing unique Info Mean
## 4210 0 8 0.12 1.067
##
## 1 2 3 4 5 7 12 51
## Frequency 4041 133 23 5 5 1 1 1
## % 96 3 1 0 0 0 0 0
## [1] "B-CHEM:"
## count(dupes[dupes$Library == levels(dupes$Library)[3], ], "CallNumber")$freq
## n missing unique Info Mean
## 710 0 5 0.13 1.058
##
## 1 2 3 4 5
## Frequency 678 28 1 1 2
## % 95 4 0 0 0
## [1] "B-EDUC:"
## count(dupes[dupes$Library == levels(dupes$Library)[4], ], "CallNumber")$freq
## n missing unique Info Mean .05 .10 .25 .50
## 4242 0 15 0.28 1.248 1 1 1 1
## .75 .90 .95
## 1 2 2
##
## 1 2 3 4 5 6 7 8 9 10 11 13 22 25 26
## Frequency 3797 274 36 24 79 5 1 7 7 3 3 1 2 2 1
## % 90 6 1 1 2 0 0 0 0 0 0 0 0 0 0
## [1] "B-FINEA:"
## count(dupes[dupes$Library == levels(dupes$Library)[5], ], "CallNumber")$freq
## n missing unique Info Mean
## 6116 0 7 0.27 1.124
##
## 1 2 3 4 5 6 7
## Frequency 5513 489 90 11 8 4 1
## % 90 8 1 0 0 0 0
## [1] "B-LIFES:"
## count(dupes[dupes$Library == levels(dupes$Library)[6], ], "CallNumber")$freq
## n missing unique Info Mean
## 577 0 5 0.14 1.062
##
## 1 2 3 4 6
## Frequency 548 26 1 1 1
## % 95 5 0 0 0
## [1] "B-WELLS:"
## count(dupes[dupes$Library == levels(dupes$Library)[7], ], "CallNumber")$freq
## n missing unique Info Mean .05 .10 .25 .50
## 127073 0 40 0.36 1.228 1 1 1 1
## .75 .90 .95
## 1 2 2
##
## lowest : 1 2 3 4 5, highest: 46 55 64 69 89
Judging by means and percentiles, there are no wild differences between these libraries.
We’ll try this again, but by format.
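Again the code isn’t echoed; presumably a similar loop over Format levels (the trailing “NA:” in the output suggests the real loop stepped one index past the last level):
for (fmt in levels(dupes$Format)) {
  print(paste(fmt, ":", sep = ""))
  print(describe(count(dupes[dupes$Format == fmt, ], "CallNumber")$freq))
}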
## [1] "MANU" "MANUSC" "MAP" "MARC" "MRDF" "MUSI" "SOUN" "SOUND"
## [9] "VM"
## [1] "MANU:"
## count(dupes[dupes$Format == levels(dupes$Format)[1], ], "CallNumber")$freq
## n missing unique Info Mean
## 3802 0 4 0.03 1.012
##
## 1 (3761, 99%), 2 (39, 1%), 3 (1, 0%), 4 (1, 0%)
## [1] "MANUSC:"
## count(dupes[dupes$Format == levels(dupes$Format)[2], ], "CallNumber")$freq
## n missing unique Info Mean
## 38 0 1 0 1
## [1] "MAP:"
## count(dupes[dupes$Format == levels(dupes$Format)[3], ], "CallNumber")$freq
## n missing unique Info Mean .05 .10 .25 .50
## 22603 0 34 0.61 1.529 1 1 1 1
## .75 .90 .95
## 2 3 3
##
## lowest : 1 2 3 4 5, highest: 35 36 46 55 89
## [1] "MARC:"
## count(dupes[dupes$Format == levels(dupes$Format)[4], ], "CallNumber")$freq
## n missing unique Info Mean .05 .10 .25 .50
## 163959 0 33 0.32 1.18 1 1 1 1
## .75 .90 .95
## 1 2 2
##
## lowest : 1 2 3 4 5, highest: 32 37 51 64 69
## [1] "MRDF:"
## count(dupes[dupes$Format == levels(dupes$Format)[5], ], "CallNumber")$freq
## n missing unique Info Mean
## 56 0 1 0 1
## [1] "MUSI:"
## count(dupes[dupes$Format == levels(dupes$Format)[6], ], "CallNumber")$freq
## n missing unique Info Mean
## 121 0 6 0.33 1.281
##
## 1 2 3 4 6 9
## Frequency 106 8 2 3 1 1
## % 88 7 2 2 1 1
## [1] "SOUN:"
## count(dupes[dupes$Format == levels(dupes$Format)[7], ], "CallNumber")$freq
## n missing unique Info Mean .05 .10 .25 .50
## 260 0 10 0.21 1.458 1 1 1 1
## .75 .90 .95
## 1 1 3
##
## 1 2 3 4 5 6 7 9 16 23
## Frequency 240 3 5 1 1 4 2 1 1 2
## % 92 1 2 0 0 2 1 0 0 1
## [1] "SOUND:"
## count(dupes[dupes$Format == levels(dupes$Format)[8], ], "CallNumber")$freq
## n missing unique Info Mean
## 4 0 1 0 1
## [1] "VM:"
## count(dupes[dupes$Format == levels(dupes$Format)[9], ], "CallNumber")$freq
## n missing unique Info Mean
## 1785 0 8 0.3 1.146
##
## 1 2 3 4 5 6 9 11
## Frequency 1586 169 20 3 3 1 2 1
## % 89 9 1 0 0 0 0 0
## [1] "NA:"
##
## NULL
Again, not a whole lot of variance here.
Taking the previously printed top-20 count of duplicates, we can look at those items and see how many libraries each one is spread across. The idea: maybe each duplicate lives in a different library.
do.call(rbind, lapply(topDupes$CallNumber[1:5], function(cn) count(dupes[dupes$CallNumber == cn], "Library")))
## Library freq
## 1 B-WELLS 89
## 2 B-WELLS 69
## 3 B-WELLS 64
## 4 B-WELLS 55
## 5 B-BUSSP 51
This shows, at least for the top five most frequent duplicates, that they don’t span multiple libraries: each item sits in a single library. Checking every item for how many libraries it spans would need a slightly more involved script. This can be done if needed.
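A minimal sketch of that check, using data.table grouping (untested against this data):
#for every duplicated call number, count how many distinct libraries hold it
dupes[, list(nLibraries = length(unique(Library))), by = CallNumber] -> librarySpread
table(librarySpread$nLibraries) #how many call numbers span 1, 2, 3... libraries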
Curious how many charges would be lost if we threw out all of these duplicate items…
#describe the distribution
describe(dupes$TotalCharges)
## dupes$TotalCharges
## n missing unique Info Mean .05 .10 .25 .50
## 234551 0 264 0.88 5.66 0 0 0 1
## .75 .90 .95
## 5 17 28
##
## lowest : 0 1 2 3 4, highest: 489 558 601 669 4525
#the sum
sum(dupes$TotalCharges)
## [1] 1327478
#how many have 0 charges, vs >0.
paste("the number of dupes that have 0 charges:", nrow(dupes[dupes$TotalCharges == 0,]))
## [1] "the number of dupes that have 0 charges: 116719"
#how many that compares to the total dataset
paste("percent of total charges in dupes to the total in the dataset: ", round(sum(dupes$TotalCharges)/sum(what$TotalCharges),3)*100,"%", sep ="")
## [1] "percent of total charges in dupes to the total in the dataset: 9.7%"
It looks like we really shouldn’t throw these out. Possibly sum the duplicates’ charges into the first entry of each item, then remove everything but that first entry? That would keep the charge totals without skewing the subject counts.
#wip: finding this out
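A minimal sketch of that roll-up, using data.table’s by-reference := (not yet run against this data):
copy(what) -> dedup
dedup[, TotalCharges := sum(TotalCharges), by = CallNumber] #sum charges within each call number
dedup[!duplicated(CallNumber)] -> dedup #keep only the first entry of each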
This is probably one of the most important parts of this wall-of-code – this is where we give some substantive human-readable meaning to the subjects this collection has.
Despite a great deal of searching, there wasn’t a friendly, machine-readable list of the Library of Congress’ codes and their meanings, so it had to be built by hand. A PDF file exists for every initial level of the call number, and each PDF lists the second dimension for the call numbers under it. More granularity is possible once you get into the number ranges, but that extended past what could feasibly be built by hand.
Source: https://www.loc.gov/catdir/cpso/lcco/
Since this data was built with a great deal of copy/pasting, there isn’t much code for that part beyond some regex. Here’s what happens afterward:
readLines("cn2.txt") -> cn2
#first, show what we're working with:
head(cn2)
## [1] "A|General works"
## [2] "AC|Collections. Series. Collected works"
## [3] "AE|Encyclopedias"
## [4] "AG|Dictionaries and other general reference works"
## [5] "AI|Indexes"
## [6] "AM|Museums. Collectors and collecting"
#second, split each line on the pipe into a code column and a meaning column
data.table(code = str_extract(string = cn2, pattern = ".+(?=\\|)")) -> cndecode
cndecode$code[is.na(cndecode$code)] <- "NA"
str_extract(string = cn2, pattern = "(?<=\\|).+") -> cndecode$meaning
#beautiful finished product:
cndecode
## code
## 1: A
## 2: AC
## 3: AE
## 4: AG
## 5: AI
## ---
## 223: VG
## 224: VK
## 225: VM
## 226: Z
## 227: ZA
## meaning
## 1: General works
## 2: Collections. Series. Collected works
## 3: Encyclopedias
## 4: Dictionaries and other general reference works
## 5: Indexes
## ---
## 223: Minor services of navies
## 224: Navigation. Merchant marine
## 225: Naval architecture. Shipbuilding. Marine engineering
## 226: Books (General). Writing. Paleography. Book industries and trade. Libraries. Bibliography
## 227: Information resources (General)
#we'll use merge to line up the codes with the meanings
merge(what, cndecode, by.x = "CN1", by.y = "code", all.x = T, sort = F) -> what
what[,!1,with=F] -> what; setnames(what, "meaning", "CN1")
merge(what, cndecode, by.x = "CN2", by.y = "code", all.x = T, sort = F) -> what
what[,!1,with=F] -> what; setnames(what, "meaning", "CN2")
merge(what, cndecode, by.x = "CN3", by.y = "code", all.x = T, sort = F) -> what
what[,!1,with=F] -> what; setnames(what, "meaning", "CN3")
#factor the new codes
factor(what$CN1) -> what$CN1
factor(what$CN2) -> what$CN2
factor(what$CN3) -> what$CN3
#here's a cut from the top of the data, as an example:
head(what[,c(5,14:16), with=F])
## Library LastActivityDate charged CN1
## 1: B-ALF 1900-01-01 FALSE History (General)
## 2: B-ALF 1993-05-18 TRUE History (General)
## 3: B-ALF 1900-01-01 FALSE History (General)
## 4: B-ALF 2012-03-14 TRUE Philosophy (General)
## 5: B-ALF 1900-01-01 FALSE Philosophy (General)
## 6: B-ALF 2007-11-28 FALSE History (General)
And just like that… the magic has happened.
As usual, there’s some QA to be done. We want to know how many of our rows ended up with a classification meaning at each level. Since this data should match the LC format, 100% should have at least the first level filled.
paste("percent of items getting classified on the first level: ", round(nrow(what[!is.na(what$CN1)])/nrow(what),3)*100, "%", sep = "")
## [1] "percent of items getting classified on the first level: 99.6%"
paste("percent of items getting classified on the second level: ", round(nrow(what[!is.na(what$CN2)])/nrow(what),3)*100, "%", sep = "")
## [1] "percent of items getting classified on the second level: 80%"
paste("percent of items getting classified on the third level: ", round(nrow(what[!is.na(what$CN3)])/nrow(what),3)*100, "%", sep = "")
## [1] "percent of items getting classified on the third level: 0.1%"
What’s the 0.4% not matching in the first level?
what[is.na(what$CN1)] -> cn1empty
cn1empty[sample(nrow(cn1empty), 10)][,5:7,with=F]
## Library CallNumber ClasscodeLCSUDOCNLM
## 1: B-ALF XX(14444697.1) LC
## 2: B-WELLS XX(5823191.2) LC
## 3: B-ALF XX(15194675.2) LC
## 4: B-WELLS XX(15299329.1) v.1 LC
## 5: B-FINEA XX(9970833.1) LC
## 6: B-ALF XX(15205935.2) LC
## 7: B-ALF XX(10098922.2) LC
## 8: B-WELLS XX(20151109BUK-16) LC
## 9: B-WELLS XX(15141018.1) LC
## 10: B-WELLS XX(15215847.1) LC
Looks like these might have been mislabeled as LC-coded. We’ll remove them.
what[!is.na(what$CN1)] -> what
#and checking again...
paste("percent of items getting classified on the first level: ", round(nrow(what[!is.na(what$CN1)])/nrow(what),3)*100, "%", sep = "")
## [1] "percent of items getting classified on the first level: 100%"
paste("percent of items getting classified on the second level: ", round(nrow(what[!is.na(what$CN2)])/nrow(what),3)*100, "%", sep = "")
## [1] "percent of items getting classified on the second level: 80.3%"
paste("percent of items getting classified on the third level: ", round(nrow(what[!is.na(what$CN3)])/nrow(what),3)*100, "%", sep = "")
## [1] "percent of items getting classified on the third level: 0.1%"
write.csv(what[sample(nrow(what), 50000),], "samp50000.csv")
write.csv(what[sample(nrow(what), 100000),], "samp100000.csv")
write.csv(what[sample(nrow(what), 500000),], "samp500000.csv")
write.csv(what[sample(nrow(what), 1000000),], "samp1000000.csv")
write.csv(what, "newdata.csv")
write.csv(what[what$Library == "B-WELLS"], "wellsdata.csv")
This is being worked on…
#wip: aggregate charges by item type ('whatdf' and 'V1' are placeholders from an unfinished chunk)
aggregate(V1 ~ ItemType, data = whatdf, sum) -> typeAggr
typeAggr[order(typeAggr$V1, decreasing = T),] -> typeAggr
To do: number of books checked out in the past year (visualize how?); number of books in general, by subject; write up some sentences on the data.
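For the first to-do item, a sketch (assuming DateLastCharged stands in for “checked out”, and taking the 2016-01-23 pull date from the filename):
#items last charged within a year of the data pull
as.Date("2016-01-23") - 365 -> cutoff
what[what$DateLastCharged >= cutoff,] -> pastYear
nrow(pastYear)
barplot(sort(table(pastYear$CN1), decreasing = T)[1:10], las = 2) #by top-level subject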