How long are Princeton Dissertations?

I searched the Princeton library catalog for items whose author is “Princeton History Department” and which are filed under PRIN 685 in the archives, which is where dissertations have gone since 1988 or so. From each record I pulled the primary author, the year, and any number appearing before “leaves” or “p.” (pages). That gets a page length for almost all of them.

The catalog records e-mailed from the library are parsed with a Perl one-liner:

perl -ne '
# Per-record fields; all three are cleared again after each record is printed,
# so a record with a missing field cannot inherit the previous record value.
BEGIN { $year = ""; $length = ""; $author = ""; }
# Page count: the digits immediately before "leaves" or "p.".
if (m/(\d+) ?(leaves|p\.)/) { $length = $1 }
# Author: skip everything after "Author:" up to the first capital letter.
if (m/Author:[^A-Z]*(.*)/) { $author = $1 }
# Year: the four digits following "PRIN 685".
if (m/PRIN 685 (\d\d\d\d)/) { $year = $1 }
# A run of "=" characters ends a record: emit one tab-separated row.
if (m/============/) {
  print "$year\t$length\t$author\n";
  $year = ""; $length = ""; $author = "";
}' all.txt | sort -nrk 2 | uniq > dissLengths.txt

Reading those in:

library(ggplot2)

# Load the tab-separated year / length / author table from dissLengths.txt.
# quote = "" stops quote characters inside author names from being treated
# as field delimiters.
disses <- read.table("dissLengths.txt", sep = "\t", header = FALSE, quote = "")
names(disses) <- c("year", "length", "author")

# Keep only post-1980 records (there was just one from before). which() drops
# records whose year is NA; indexing with the bare logical vector would turn
# each of them into an all-NA row instead.
disses <- disses[which(disses$year > 1980), ]

# Z-score of log(length): how many standard deviations each dissertation sits
# from the mean log length. as.numeric() flattens the one-column matrix that
# scale() returns, so the data frame holds an ordinary numeric column.
disses$sd <- as.numeric(scale(log(disses$length)))

# Dissertations further than this many standard deviations from the mean are
# labelled with the author's name instead of being drawn as points.
label_cutoff <- 1.4
is_labelled <- abs(disses$sd) > label_cutoff
# which() skips rows whose z-score is NA rather than yielding all-NA rows.
typical <- disses[which(!is_labelled), ]
labelled <- disses[which(is_labelled), ]

# Length against year: jittered points for typical dissertations, a smoother
# over all of them, and author names (digits onward stripped) for the outliers.
ggplot(disses, aes(y = length, x = year, label = gsub("\\d.*", "", author))) +
  geom_point(data = typical, position = "jitter") +
  scale_y_continuous(
    "length in 'leaves' from the catalog",
    breaks = seq(0, 1000, by = 50)
  ) +
  geom_smooth() +
  geom_text(data = labelled, size = 3.5) +
  labs(title = "All Princeton Dissertations from the library catalog with identifiable lengths and years\nwith outliers identified by name")

plot of chunk unnamed-chunk-1

And a couple of summary statistics:


# Pull the four-digit birth year out of author strings that end in "YYYY-".
birth_text <- sub(".*(\\d\\d\\d\\d)-.*", "\\1", disses$author)
# Strings left without any four-digit run carried no birth year.
birth_text[!grepl("\\d\\d\\d\\d", birth_text)] <- NA
disses$birth <- as.numeric(birth_text)

# Length against age at dissertation (year minus birth year), ages 26 to 40.
ggplot(disses, aes(x = year - birth, y = length)) +
  geom_point(position = "jitter") +
  geom_smooth() +
  ggtitle("No connection between age and length") +
  xlim(26, 40)

plot of chunk unnamed-chunk-2


# Age at dissertation (year minus birth year), one box per year. The
# horizontal line marks the median age over every record with a birth year.
ggplot(disses, aes(x = factor(year), y = year - birth)) +
  geom_boxplot() +
  geom_hline(yintercept = with(disses, median(year - birth, na.rm = TRUE))) +
  ggtitle("age at dissertation has a stable median of 32")

plot of chunk unnamed-chunk-2



# Distribution of dissertation lengths on a log10 x axis.
# A plot keeps only one x scale, so the breaks are passed to scale_x_log10()
# directly; a separate scale_x_continuous() would be replaced and its breaks
# lost. The breaks start at 50 because 0 has no position on a log axis.
ggplot(disses, aes(x = length)) +
  geom_histogram() +
  scale_x_log10(breaks = seq(50, 1000, by = 50)) +
  labs(title = "300 pages is the normal length")

plot of chunk unnamed-chunk-2


# Dissertations more than two standard deviations from the mean log length.
# which() keeps only the rows where the comparison is TRUE; indexing with the
# bare logical vector returns an all-NA row for every record whose sd is NA.
disses[which(abs(disses$sd) > 2), ]
##     year length                             author     sd birth
## 1   1996   1157             Gregory, Brad Stephan.  3.857    NA
## 2   1996    796     Mendelsohn, John Andrew, 1965-  2.727  1965
## 3   1995    793     Bellany, Alastair James, 1968-  2.716  1968
## 4   1997    792                   Shelford, April.  2.712    NA
## 6   1993    656                     Wahrman, Dror.  2.143    NA
## 7   1990    642                      Blair, Ann M.  2.078    NA
## 8   2010    638 Tannous, Jack Boulos Victor, 1980-  2.059  1980
## 9   1999    635       Cupples, Cynthia Jean, 1963-  2.045  1963
## 238 2009    137      Schwartz, Daniel Louis, 1973- -2.589  1973
## 239 1989    135                       Furuya, Jun. -2.633    NA
## 240 2006    129           Murphy, Jane Holt, 1972- -2.770  1972
## 241 2005    106      Sanders, Holly Vincele, 1974- -3.364  1974
# Last rows (tail's default count) among the records whose sd is not NA.
tail(disses[complete.cases(disses$sd), ])
##     year length                        author     sd birth
## 236 1994    178    Elukin, Jonathan M., 1961- -1.798  1961
## 237 2005    170        Kane, Eileen M., 1972- -1.937  1972
## 238 2009    137 Schwartz, Daniel Louis, 1973- -2.589  1973
## 239 1989    135                  Furuya, Jun. -2.633    NA
## 240 2006    129      Murphy, Jane Holt, 1972- -2.770  1972
## 241 2005    106 Sanders, Holly Vincele, 1974- -3.364  1974