In this project, we compare the style of the Platonic Philebus, divided into 1000-word blocks, against two sets of three Platos we prepared earlier, using Delta (Burrows 2002) and the R package stylo (Eder, Rybicki, and Kestemont 2016). The aim of this comparison is to see whether this dialogue is stylistically homogeneous.
For these experiments, I used the Diorisis Ancient Greek Corpus. The code I used for extracting the lemmata is available on my RPubs account. I start with the files produced by this code in my working directory.
library(stylo)
library(dendextend)
library(ape)
# Training corpora 1 and 2: every file in the directory is loaded as plain
# UTF-8 text, without sampling and without preserving case.
tr_corpus_1 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus1",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)
tr_corpus_2 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus2",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

# Test text: Philebus.txt, read from the working directory and split into
# non-overlapping samples of 1000 single-word features.
test_corpus <- load.corpus.and.parse(
  files = "Philebus.txt",
  markup.type = "plain",
  corpus.lang = "Other",
  sample.size = 1000,
  sampling = "normal.sampling",
  sample.overlap = 0,
  features = "w",
  ngram.size = 1,
  preserve.case = FALSE,
  encoding = "UTF-8"
)
# Feature (word) lists for the two training corpora.
my_freq1 <- make.frequency.list(tr_corpus_1)
my_freq2 <- make.frequency.list(tr_corpus_2)

# Keep only the words that occur in both corpora.
my_freq_intersect <- intersect(my_freq1, my_freq2)

# Take the first 100 shared words (the "most frequent words", mfw).
# head() is used instead of [1:100] so that a shared vocabulary shorter than
# 100 words is returned as-is rather than padded with NA entries.
mfw <- head(my_freq_intersect, 100)

# Drop four words by their position in the list.
# NOTE(review): the script does not say which words sit at these positions or
# why they are excluded; the indices only hold for this exact corpus -- confirm
# and consider removing them by name instead.
mfw <- mfw[-c(41, 56, 77, 86)]
mfw
## [1] "ὁ" "καί" "δέ" "εἰμί" "οὗτος" "ἐγώ"
## [7] "τε" "οὐ" "ἄν" "αὐτός" "μέν" "ὅς"
## [13] "τις" "ἠέ" "γάρ" "μή" "λέγω" "ἐν"
## [19] "πᾶς" "γε" "δή" "ὡς" "ἀλλά" "φημί"
## [25] "ὅστις" "ἄλλος" "περί" "γίγνομαι" "ὦ" "τίς"
## [31] "σύ" "καλός" "οὖν" "τοιοῦτος" "ἔχω" "πρός"
## [37] "ἐάν" "εἰ" "εἰς" "κατά" "ποιέω" "πολύς"
## [43] "ἑαυτοῦ" "οὕτως" "θεός" "λόγος" "δοκέω" "ἐκ"
## [49] "διά" "δέομαι" "ἐπί" "νόμος" "εἶπον" "οὐδέ"
## [55] "ὑπό" "πρότερος" "ἐκεῖνος" "ἀίω" "νῦν" "οὐδείς"
## [61] "ἄνθρωπος" "οἴομαι" "ἀγαθός" "ἄρα" "ἕκαστος" "μέγας"
## [67] "ὀρθός" "αὖ" "φαίνω" "οἷος" "ἕ" "ὅσος"
## [73] "παῖς" "πάνυ" "κακός" "ἦ" "ἀνήρ" "πῶς"
## [79] "ὅσιος" "πού" "οἶδα" "παρά" "ἀρχή" "βούλομαι"
## [85] "ἀληθής" "ἔοικα" "οὐκοῦν" "θέα" "ἑκάτερος" "μάλιστα"
## [91] "χρή" "ἀμφότερος" "μηδείς" "ἆρα" "μήτε" "μετά"
# Frequency tables restricted to the words in `mfw`, converted to data frames
# (one row per text or sample, one column per word).
# absent.sensitive = FALSE is set on every table, matching the other
# make.table.of.frequencies() calls in this script: it keeps a column for each
# word in `mfw` even when a word never occurs in that particular corpus, so the
# training and test tables always share the same columns when they are
# combined or compared later.
freq_corp1 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_1, mfw, absent.sensitive = FALSE)
))
freq_corp2 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_2, mfw, absent.sensitive = FALSE)
))
freq_test <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(test_corpus, mfw, absent.sensitive = FALSE)
))
# Delta classification of the Philebus samples against training corpus 1.
delta1 <- perform.delta(
  training.set = freq_corp1,
  test.set = freq_test,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)
# Print the compact classification result (one label per Philebus sample).
delta1$y
## Philebus_1 Philebus_2 Philebus_3 Philebus_4 Philebus_5 Philebus_6
## "Pl2" "Pl2" "Pl3" "Pl2" "Pl2" "Pl1"
## Philebus_7 Philebus_8 Philebus_9 Philebus_10 Philebus_11 Philebus_12
## "Pl2" "Pl2" "Pl2" "Pl2" "Pl2" "Pl2"
## Philebus_13 Philebus_14 Philebus_15 Philebus_16 Philebus_17
## "Pl2" "Pl2" "Pl2" "Pl2" "Pl2"
## attr(,"description")
## [1] "classification results in a compact form"
# Delta classification of the Philebus samples against training corpus 2.
delta2 <- perform.delta(
  training.set = freq_corp2,
  test.set = freq_test,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)
# Print the compact classification result (one label per Philebus sample).
delta2$y
## Philebus_1 Philebus_2 Philebus_3 Philebus_4 Philebus_5 Philebus_6
## "Pl1" "Pl1" "Pl1" "Pl1" "Pl2" "Pl1"
## Philebus_7 Philebus_8 Philebus_9 Philebus_10 Philebus_11 Philebus_12
## "Pl3" "Pl2" "Pl2" "Pl2" "Pl1" "Pl2"
## Philebus_13 Philebus_14 Philebus_15 Philebus_16 Philebus_17
## "Pl2" "Pl3" "Pl1" "Pl1" "Pl1"
## attr(,"description")
## [1] "classification results in a compact form"
# Hierarchical clustering of training corpus 1 together with the Philebus
# samples, based on Delta distances.
dataset1 <- rbind(freq_corp1, freq_test)
hc1 <- hclust(dist.delta(dataset1))
hcd1 <- as.dendrogram(hc1)

# Margins are c(bottom, left, top, right); the right one is widened to 6.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a 4-cluster cut and draw the tree sideways.
hcd1 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at 1 on the horizontal axis.
abline(v = 1, col = "red", lty = 2)
# Hierarchical clustering of training corpus 2 together with the Philebus
# samples, based on Delta distances.
dataset2 <- rbind(freq_corp2, freq_test)
hc2 <- hclust(dist.delta(dataset2))
hcd2 <- as.dendrogram(hc2)

# Margins are c(bottom, left, top, right); the right one is widened to 6.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a 4-cluster cut and draw the tree sideways.
hcd2 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at 1 on the horizontal axis.
abline(v = 1, col = "red", lty = 2)
# Training corpora 3 and 4: every file in the directory is loaded as plain
# UTF-8 text, without sampling and without preserving case.
tr_corpus_3 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus3",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)
tr_corpus_4 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus4",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

# Frequency tables over the same `mfw` word list used for corpora 1 and 2;
# absent.sensitive = FALSE keeps a column for every word in `mfw`.
freq_corp3 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_3, mfw, absent.sensitive = FALSE)
))
freq_corp4 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_4, mfw, absent.sensitive = FALSE)
))
# Hierarchical clustering of training corpus 3 together with the Philebus
# samples, based on Delta distances.
dataset3 <- rbind(freq_corp3, freq_test)
hc3 <- hclust(dist.delta(dataset3))
hcd3 <- as.dendrogram(hc3)

# Margins are c(bottom, left, top, right); the right one is widened to 6.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a 4-cluster cut and draw the tree sideways.
hcd3 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at 1 on the horizontal axis.
abline(v = 1, col = "red", lty = 2)
# Hierarchical clustering of training corpus 4 together with the Philebus
# samples, based on Delta distances.
dataset4 <- rbind(freq_corp4, freq_test)
hc4 <- hclust(dist.delta(dataset4))
hcd4 <- as.dendrogram(hc4)

# Margins are c(bottom, left, top, right); the right one is widened to 6.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a 5-cluster cut (the other dendrograms in this
# script use 4) and draw the tree sideways.
hcd4 %>%
  set("branches_k_color", k = 5) %>%
  set("labels_col", k = 5) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at 1 on the horizontal axis.
abline(v = 1, col = "red", lty = 2)
# Delta classification of the Philebus samples against training corpus 3.
delta3 <- perform.delta(
  training.set = freq_corp3,
  test.set = freq_test,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)
# Print the compact classification result (one label per Philebus sample).
delta3$y
## Philebus_1 Philebus_2 Philebus_3 Philebus_4 Philebus_5 Philebus_6
## "Pl3" "Pl3" "Pl3" "Pl3" "Pl3" "Pl3"
## Philebus_7 Philebus_8 Philebus_9 Philebus_10 Philebus_11 Philebus_12
## "Pl3" "Pl3" "Pl3" "Pl3" "Pl3" "Pl3"
## Philebus_13 Philebus_14 Philebus_15 Philebus_16 Philebus_17
## "Pl3" "Pl3" "Pl3" "Pl3" "Pl3"
## attr(,"description")
## [1] "classification results in a compact form"
# Delta classification of the Philebus samples against training corpus 4.
delta4 <- perform.delta(
  training.set = freq_corp4,
  test.set = freq_test,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)
# Print the compact classification result (one label per Philebus sample).
delta4$y
## Philebus_1 Philebus_2 Philebus_3 Philebus_4 Philebus_5 Philebus_6
## "Pl1" "Pl3" "Pl3" "Pl3" "Pl3" "Pl3"
## Philebus_7 Philebus_8 Philebus_9 Philebus_10 Philebus_11 Philebus_12
## "Pl3" "Pl3" "Pl3" "Pl3" "Pl1" "Pl3"
## Philebus_13 Philebus_14 Philebus_15 Philebus_16 Philebus_17
## "Pl3" "Pl3" "Pl3" "Pl3" "Pl3"
## attr(,"description")
## [1] "classification results in a compact form"
# Side-by-side summary: one row per Philebus sample, one column per training
# corpus (1 to 4, left to right), each cell holding the assigned label.
cbind(
  delta1$y,
  delta2$y,
  delta3$y,
  delta4$y
)
## [,1] [,2] [,3] [,4]
## Philebus_1 "Pl2" "Pl1" "Pl3" "Pl1"
## Philebus_2 "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_3 "Pl3" "Pl1" "Pl3" "Pl3"
## Philebus_4 "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_5 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_6 "Pl1" "Pl1" "Pl3" "Pl3"
## Philebus_7 "Pl2" "Pl3" "Pl3" "Pl3"
## Philebus_8 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_9 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_10 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_11 "Pl2" "Pl1" "Pl3" "Pl1"
## Philebus_12 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_13 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_14 "Pl2" "Pl3" "Pl3" "Pl3"
## Philebus_15 "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_16 "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_17 "Pl2" "Pl1" "Pl3" "Pl3"