Synopsis

In this project, we compare the style of the Platonic Philebus, divided into 1000-word blocks, against sets of three Platos we prepared earlier (two initial training sets, followed by two further ones; the three classes are labelled Pl1, Pl2 and Pl3 in the output below), using Delta (Burrows 2002) and the R package stylo (Eder, Rybicki, and Kestemont 2016). The aim of this comparison is to see whether this dialogue is stylistically homogeneous.

Corpus

For these experiments, I used the Diorisis Ancient Greek Corpus. The code I used for extracting the lemmata is available on my RPubs account. I start from the files produced by that code, which are in my working directory.

Packages

library(stylo) 
library(dendextend)
library(ape)

Training Corpora: Load and Parse

Training Corpus 1

# Training set 1: load every file found in "training_corpus1" as plain text.
# Texts are kept whole (no sampling) and letter case is not preserved.
tr_corpus_1 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus1",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

Training Corpus 2

# Training set 2: load every file found in "training_corpus2" as plain text.
# Texts are kept whole (no sampling) and letter case is not preserved.
tr_corpus_2 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus2",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

Test Corpus: Load and Parse

# Test text: the Philebus, sampled into 1000-word pieces with no overlap
# between them; the features are single words (n-gram size 1).
# NOTE(review): no corpus.dir is given, so "Philebus.txt" is presumably read
# from the working directory -- confirm against stylo's default.
test_corpus <- load.corpus.and.parse(
  files = "Philebus.txt",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "normal.sampling",
  sample.size = 1000,
  sample.overlap = 0,
  features = "w",
  ngram.size = 1,
  preserve.case = FALSE,
  encoding = "UTF-8"
)

Most frequent words

# Word lists for the two training sets, as returned by make.frequency.list().
my_freq1 <- make.frequency.list(tr_corpus_1)
my_freq2 <- make.frequency.list(tr_corpus_2)

# Keep only the words that occur in both training sets. intersect() returns
# them in the order of its first argument, i.e. in the order of my_freq1.
my_freq_intersect <- intersect(my_freq1, my_freq2)

# Take the first 100 shared words. head() is used instead of [1:100] so that a
# shared vocabulary of fewer than 100 words does not pad the list with NA.
mfw <- head(my_freq_intersect, 100)

# Drop four entries by their position in that list.
# NOTE(review): which lemmata sit at positions 41, 56, 77 and 86, and why they
# are excluded, is not recorded here -- confirm and document the reason.
mfw <- mfw[-c(41, 56, 77, 86)]
mfw
##  [1] "ὁ"         "καί"       "δέ"        "εἰμί"      "οὗτος"     "ἐγώ"      
##  [7] "τε"        "οὐ"        "ἄν"        "αὐτός"     "μέν"       "ὅς"       
## [13] "τις"       "ἠέ"        "γάρ"       "μή"        "λέγω"      "ἐν"       
## [19] "πᾶς"       "γε"        "δή"        "ὡς"        "ἀλλά"      "φημί"     
## [25] "ὅστις"     "ἄλλος"     "περί"      "γίγνομαι"  "ὦ"         "τίς"      
## [31] "σύ"        "καλός"     "οὖν"       "τοιοῦτος"  "ἔχω"       "πρός"     
## [37] "ἐάν"       "εἰ"        "εἰς"       "κατά"      "ποιέω"     "πολύς"    
## [43] "ἑαυτοῦ"    "οὕτως"     "θεός"      "λόγος"     "δοκέω"     "ἐκ"       
## [49] "διά"       "δέομαι"    "ἐπί"       "νόμος"     "εἶπον"     "οὐδέ"     
## [55] "ὑπό"       "πρότερος"  "ἐκεῖνος"   "ἀίω"       "νῦν"       "οὐδείς"   
## [61] "ἄνθρωπος"  "οἴομαι"    "ἀγαθός"    "ἄρα"       "ἕκαστος"   "μέγας"    
## [67] "ὀρθός"     "αὖ"        "φαίνω"     "οἷος"      "ἕ"         "ὅσος"     
## [73] "παῖς"      "πάνυ"      "κακός"     "ἦ"         "ἀνήρ"      "πῶς"      
## [79] "ὅσιος"     "πού"       "οἶδα"      "παρά"      "ἀρχή"      "βούλομαι" 
## [85] "ἀληθής"    "ἔοικα"     "οὐκοῦν"    "θέα"       "ἑκάτερος"  "μάλιστα"  
## [91] "χρή"       "ἀμφότερος" "μηδείς"    "ἆρα"       "μήτε"      "μετά"

Tables of Frequencies

# Frequency tables restricted to the selected words in `mfw`, converted to
# plain data frames.
# absent.sensitive = FALSE is passed for every table (as is already done for
# the test set and for training sets 3 and 4) so that each table keeps a
# column for every word in `mfw`, even one that never occurs in that corpus.
# The tables are later stacked with rbind(), which needs identical columns.
freq_corp1 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_1, mfw, absent.sensitive = FALSE)
))
freq_corp2 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_2, mfw, absent.sensitive = FALSE)
))
freq_test <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(test_corpus, mfw, absent.sensitive = FALSE)
))

Perform Delta

# Classify each Philebus sample against training set 1 using Delta.
delta1 <- perform.delta(
  training.set = freq_corp1,
  test.set = freq_test,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)

# Class assigned to each sample (the "compact form" of the results).
delta1$y
##  Philebus_1  Philebus_2  Philebus_3  Philebus_4  Philebus_5  Philebus_6 
##       "Pl2"       "Pl2"       "Pl3"       "Pl2"       "Pl2"       "Pl1" 
##  Philebus_7  Philebus_8  Philebus_9 Philebus_10 Philebus_11 Philebus_12 
##       "Pl2"       "Pl2"       "Pl2"       "Pl2"       "Pl2"       "Pl2" 
## Philebus_13 Philebus_14 Philebus_15 Philebus_16 Philebus_17 
##       "Pl2"       "Pl2"       "Pl2"       "Pl2"       "Pl2" 
## attr(,"description")
## [1] "classification results in a compact form"
# Classify each Philebus sample against training set 2 using Delta.
delta2 <- perform.delta(
  training.set = freq_corp2,
  test.set = freq_test,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)

# Class assigned to each sample (the "compact form" of the results).
delta2$y
##  Philebus_1  Philebus_2  Philebus_3  Philebus_4  Philebus_5  Philebus_6 
##       "Pl1"       "Pl1"       "Pl1"       "Pl1"       "Pl2"       "Pl1" 
##  Philebus_7  Philebus_8  Philebus_9 Philebus_10 Philebus_11 Philebus_12 
##       "Pl3"       "Pl2"       "Pl2"       "Pl2"       "Pl1"       "Pl2" 
## Philebus_13 Philebus_14 Philebus_15 Philebus_16 Philebus_17 
##       "Pl2"       "Pl3"       "Pl1"       "Pl1"       "Pl1" 
## attr(,"description")
## [1] "classification results in a compact form"

Dendrogram: Phlb. vs Set 1

# Stack the set-1 training table on top of the Philebus samples and cluster
# all rows hierarchically on their Delta distances.
dataset1 <- rbind(freq_corp1, freq_test)
hc1 <- hclust(dist.delta(dataset1))
hcd1 <- as.dendrogram(hc1)

# Margins are c(bottom, left, top, right); the right one is widened.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a cut into four clusters, then draw the tree
# horizontally.
hcd1 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at x = 1.
abline(v = 1, col = "red", lty = 2)

Dendrogram: Phlb. vs Set 2

# Stack the set-2 training table on top of the Philebus samples and cluster
# all rows hierarchically on their Delta distances.
dataset2 <- rbind(freq_corp2, freq_test)
hc2 <- hclust(dist.delta(dataset2))
hcd2 <- as.dendrogram(hc2)

# Margins are c(bottom, left, top, right); the right one is widened.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a cut into four clusters, then draw the tree
# horizontally.
hcd2 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at x = 1.
abline(v = 1, col = "red", lty = 2)

Training Corpus 3 & 4

# Training sets 3 and 4: load every file in the respective directory as plain
# text, without sampling and without preserving letter case.
tr_corpus_3 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus3",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

tr_corpus_4 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus4",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

# Frequency tables for the same word list `mfw` that was selected from
# training sets 1 and 2, converted to plain data frames.
freq_corp3 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_3, mfw, absent.sensitive = FALSE)
))
freq_corp4 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_4, mfw, absent.sensitive = FALSE)
))

Dendrogram: Phlb. vs Set 3 & 4

# --- Set 3 ---
# Stack the set-3 training table on top of the Philebus samples and cluster
# all rows hierarchically on their Delta distances.
dataset3 <- rbind(freq_corp3, freq_test)
hc3 <- hclust(dist.delta(dataset3))
hcd3 <- as.dendrogram(hc3)

# Margins are c(bottom, left, top, right); the right one is widened.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a cut into four clusters, then draw the tree
# horizontally, with a dashed red vertical reference line at x = 1.
hcd3 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)
abline(v = 1, col = "red", lty = 2)

# --- Set 4 ---
# Same procedure for training set 4.
dataset4 <- rbind(freq_corp4, freq_test)
hc4 <- hclust(dist.delta(dataset4))
hcd4 <- as.dendrogram(hc4)

par(mar = c(2, 2, 2, 6))

# NOTE(review): this plot is coloured by five clusters, whereas the other
# dendrograms use four -- presumably deliberate; confirm.
hcd4 %>%
  set("branches_k_color", k = 5) %>%
  set("labels_col", k = 5) %>%
  plot(horiz = TRUE)
abline(v = 1, col = "red", lty = 2)

Perform Delta: Sets 3 & 4

# Classify each Philebus sample against training set 3 using Delta.
delta3 <- perform.delta(
  training.set = freq_corp3,
  test.set = freq_test,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)

# Class assigned to each sample (the "compact form" of the results).
delta3$y
##  Philebus_1  Philebus_2  Philebus_3  Philebus_4  Philebus_5  Philebus_6 
##       "Pl3"       "Pl3"       "Pl3"       "Pl3"       "Pl3"       "Pl3" 
##  Philebus_7  Philebus_8  Philebus_9 Philebus_10 Philebus_11 Philebus_12 
##       "Pl3"       "Pl3"       "Pl3"       "Pl3"       "Pl3"       "Pl3" 
## Philebus_13 Philebus_14 Philebus_15 Philebus_16 Philebus_17 
##       "Pl3"       "Pl3"       "Pl3"       "Pl3"       "Pl3" 
## attr(,"description")
## [1] "classification results in a compact form"
# Classify each Philebus sample against training set 4 using Delta.
delta4 <- perform.delta(
  training.set = freq_corp4,
  test.set = freq_test,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)

# Class assigned to each sample (the "compact form" of the results).
delta4$y
##  Philebus_1  Philebus_2  Philebus_3  Philebus_4  Philebus_5  Philebus_6 
##       "Pl1"       "Pl3"       "Pl3"       "Pl3"       "Pl3"       "Pl3" 
##  Philebus_7  Philebus_8  Philebus_9 Philebus_10 Philebus_11 Philebus_12 
##       "Pl3"       "Pl3"       "Pl3"       "Pl3"       "Pl1"       "Pl3" 
## Philebus_13 Philebus_14 Philebus_15 Philebus_16 Philebus_17 
##       "Pl3"       "Pl3"       "Pl3"       "Pl3"       "Pl3" 
## attr(,"description")
## [1] "classification results in a compact form"

Results

# Side-by-side view of the four classifications: one row per Philebus sample,
# one column per training set (columns appear in the order sets 1 to 4).
cbind(
  delta1$y,
  delta2$y,
  delta3$y,
  delta4$y
)
##             [,1]  [,2]  [,3]  [,4] 
## Philebus_1  "Pl2" "Pl1" "Pl3" "Pl1"
## Philebus_2  "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_3  "Pl3" "Pl1" "Pl3" "Pl3"
## Philebus_4  "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_5  "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_6  "Pl1" "Pl1" "Pl3" "Pl3"
## Philebus_7  "Pl2" "Pl3" "Pl3" "Pl3"
## Philebus_8  "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_9  "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_10 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_11 "Pl2" "Pl1" "Pl3" "Pl1"
## Philebus_12 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_13 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_14 "Pl2" "Pl3" "Pl3" "Pl3"
## Philebus_15 "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_16 "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_17 "Pl2" "Pl1" "Pl3" "Pl3"
Burrows, John. 2002. “Delta: A Measure of Stylistic Difference and a Guide to Likely Authorship.” Literary and Linguistic Computing 17 (3): 267–87.
Eder, Maciej, Jan Rybicki, and Mike Kestemont. 2016. “Stylometry with R: A Package for Computational Text Analysis.” The R Journal 8 (1).