I looked for books in the Princeton history catalog with the author “Princeton History Department” filed in PRIN 685 in the archives, which is where dissertations have gone since 1988 or so, and pulled the primary author, the year, and any number appearing before “leaves” or “p.” in the catalog record. That gives a page length for almost all of them.
Parsing the e-mailed pages from the library is done in perl:
perl -ne '
# Collect year, length and author for the current catalog record; a line
# containing a run of "=" characters ends the record and emits one TSV row.
BEGIN { $year = ""; $length = ""; $author = ""; }
# Page count: the number directly before "leaves" or "p."
if (m/(\d+) ?(leaves|p\.)/) { $length = $1 }
# Author heading: skip everything after "Author:" up to the first capital letter.
if (m/Author:[^A-Z]*(.*)/) { $author = $1 }
# The four digits following "PRIN 685 " are taken as the year.
if (m/PRIN 685 (\d\d\d\d)/) { $year = $1 }
if (m/============/) {
    print "$year\t$length\t$author\n";
    # Reset all three fields so nothing leaks into the next record.
    $year = ""; $length = ""; $author = "";
}' all.txt | sort -nrk 2 | uniq > dissLengths.txt
Reading those in:
# Load the tab-separated table written by the perl step (no header row).
# quote = "" and comment.char = "" keep quotes, apostrophes and "#" inside
# author names literal; stringsAsFactors = FALSE keeps authors as character
# regardless of the R version's default.
disses <- read.table(
  "dissLengths.txt",
  sep = "\t",
  header = FALSE,
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)
names(disses) <- c("year", "length", "author")
# library() fails immediately if ggplot2 is missing; require() would only
# return FALSE and let the first ggplot() call fail instead.
library(ggplot2)

# Keep only records dated after 1980 (there was just one from before).
# which() ignores records whose year is NA; a bare logical index would turn
# each of them into an all-NA row.
disses <- disses[which(disses$year > 1980), ]

# Standardised log-length, used to identify outliers.
# scale() returns a one-column matrix; as.numeric() stores a plain vector.
disses$sd <- as.numeric(scale(log(disses$length)))
# Length by year: points for records with |sd| <= 1.4, author names (text up
# to the first digit) for records with |sd| > 1.4, plus a smoother over all
# records. which() keeps rows whose sd is NA out of both subsets instead of
# turning them into all-NA rows.
ggplot(disses, aes(y = length, x = year, label = gsub("\\d.*", "", author))) +
  geom_point(
    data = disses[which(abs(disses$sd) <= 1.4), ],
    position = "jitter"
  ) +
  scale_y_continuous(
    "length in 'leaves' from the catalog",
    breaks = seq(0, 1000, by = 50)
  ) +
  geom_smooth() +
  geom_text(data = disses[which(abs(disses$sd) > 1.4), ], size = 3.5) +
  labs(title = "All Princeton Dissertations from the library catalog with identifiable lengths and years\nwith outliers identified by name")
And a couple of summary statistics:
# Birth year: the four digits immediately before a hyphen in the author
# heading (e.g. "1965-"). Headings without that pattern get NA directly, so
# as.numeric() never sees a non-numeric string.
disses$birth <- as.numeric(ifelse(
  grepl("\\d\\d\\d\\d-", disses$author),
  sub(".*(\\d\\d\\d\\d)-.*", "\\1", disses$author),
  NA_character_
))
# Length against year minus birth year, jittered, with a smoother; the x axis
# is limited to 26-40.
ggplot(disses, aes(x = year - birth, y = length)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "No connection between age and length") +
  scale_x_continuous(limits = c(26, 40))
# Box plot of year minus birth year for each catalogue year, with the overall
# median drawn as a horizontal reference line. TRUE is spelled out because T
# is an ordinary variable that can be reassigned.
ggplot(disses, aes(x = factor(year), y = year - birth)) +
  geom_boxplot() +
  labs(title = "age at dissertation has a stable median of 32") +
  geom_hline(yintercept = median(disses$year - disses$birth, na.rm = TRUE))
# Histogram of lengths on a log10 x axis. The breaks go on the log scale
# itself: a separate scale_x_continuous() is replaced by scale_x_log10() and
# its breaks are lost. Breaks start at 50 because 0 has no position on a log
# axis.
ggplot(disses, aes(x = length)) +
  geom_histogram() +
  scale_x_log10(breaks = seq(50, 1000, by = 50)) +
  labs(title = "300 pages is the normal length")
# Records more than two standard deviations from the mean log-length.
# which() skips rows whose sd is NA; indexing with the bare logical vector
# prints one all-NA row for each of them.
disses[which(abs(disses$sd) > 2), ]
## year length author sd birth
## 1 1996 1157 Gregory, Brad Stephan. 3.857 NA
## 2 1996 796 Mendelsohn, John Andrew, 1965- 2.727 1965
## 3 1995 793 Bellany, Alastair James, 1968- 2.716 1968
## 4 1997 792 Shelford, April. 2.712 NA
## 6 1993 656 Wahrman, Dror. 2.143 NA
## 7 1990 642 Blair, Ann M. 2.078 NA
## 8 2010 638 Tannous, Jack Boulos Victor, 1980- 2.059 1980
## 9 1999 635 Cupples, Cynthia Jean, 1963- 2.045 1963
## 238 2009 137 Schwartz, Daniel Louis, 1973- -2.589 1973
## 239 1989 135 Furuya, Jun. -2.633 NA
## 240 2006 129 Murphy, Jane Holt, 1972- -2.770 1972
## 241 2005 106 Sanders, Holly Vincele, 1974- -3.364 1974
# Last six records that have a standardised length (sd is not NA).
tail(disses[!is.na(disses$sd), ], n = 6L)
## year length author sd birth
## 236 1994 178 Elukin, Jonathan M., 1961- -1.798 1961
## 237 2005 170 Kane, Eileen M., 1972- -1.937 1972
## 238 2009 137 Schwartz, Daniel Louis, 1973- -2.589 1973
## 239 1989 135 Furuya, Jun. -2.633 NA
## 240 2006 129 Murphy, Jane Holt, 1972- -2.770 1972
## 241 2005 106 Sanders, Holly Vincele, 1974- -3.364 1974