library(stylo)
library(dendextend)
library(ape)
corpus_all <- load.corpus.and.parse(files = "all", corpus.dir = "corpus_all", markup.type= "plain", corpus.lang = "Other", sampling = "no.sampling", preserve.case = FALSE, encoding = "UTF-8") ## this corpus does not contain Philebus
test_corpus <- load.corpus.and.parse(files = "Philebus.txt", corpus.dir = getwd(), markup.type= "plain", corpus.lang = "Other", sampling = "normal.sampling", sample.size = 3000, preserve.case = FALSE, encoding = "UTF-8")
## delete dialogues < 3000 words
q <- vector()
for(i in 1:length(corpus_all)){
if(length(corpus_all[[i]]) < 3000){q <- c(q, i)}
q
}
corpus3000 <- corpus_all[-q]
## make samples
samples3000 <- make.samples(corpus3000, sample.size = 3000, sampling = "normal.sampling", sample.overlap = 0,
sampling.with.replacement = FALSE)
length(samples3000)
## [1] 151
## add Philebus
corpus3000 <- c(samples3000, test_corpus)
mfw <- make.frequency.list(corpus3000)
mfw <- mfw[1:101]
mfw <- mfw[-71] ## remove Socrates
mfw
## [1] "ὁ" "καί" "εἰμί" "δέ" "οὗτος" "ἐγώ"
## [7] "αὐτός" "οὐ" "τε" "μέν" "ἄν" "τις"
## [13] "ὅς" "λέγω" "γάρ" "ἠέ" "ἐν" "δή"
## [19] "γε" "ἀλλά" "ἄλλος" "πᾶς" "σύ" "φημί"
## [25] "μή" "γίγνομαι" "ὅστις" "ὡς" "περί" "τίς"
## [31] "οὖν" "ὦ" "ἔχω" "εἰ" "πρός" "λόγος"
## [37] "κατά" "πολύς" "εἰς" "τοιοῦτος" "οὕτως" "ἑαυτοῦ"
## [43] "δοκέω" "ἐκ" "ποιέω" "οὐδείς" "νῦν" "εἷς"
## [49] "εἶπον" "οἴομαι" "διά" "ἐκεῖνος" "καλός" "οὔτε"
## [55] "ἐπί" "οὐδέ" "πόλις" "ἀγαθός" "ἐάν" "μέγας"
## [61] "φαίνω" "πρότερος" "οἷος" "δέομαι" "αὖ" "πῶς"
## [67] "ἕτερος" "ἄνθρωπος" "ἀίω" "ἕκαστος" "πάνυ" "ἀληθής"
## [73] "ὑπό" "ὀρθός" "ἕ" "οἶδα" "ὅσος" "ψυχή"
## [79] "οὐκοῦν" "παρά" "ἄρα" "βούλομαι" "μετά" "ἔοικα"
## [85] "ἆρα" "ὥσπερ" "ἔτι" "ἀνήρ" "κακός" "ὅδε"
## [91] "πού" "θεός" "σῶμα" "φύσις" "ποτέ" "νόμος"
## [97] "μήν" "δίκαιος" "ἀεί" "ταυτός"
freq <- as.data.frame.matrix(as.table(make.table.of.frequencies(corpus3000, mfw, absent.sensitive = FALSE)))
dim(freq)
## [1] 156 100
d <- dist.delta(freq)
hc <- hclust(d)
colors = c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#900C3F','#b15928')
clus12 = cutree(hc, 12)
plot(as.phylo(hc), type = "fan", tip.color = colors[clus12])
dm <- as.matrix(d) ## distance as matrix
my_rows <- rownames(dm)[1:151]
## subset columns, save as df
Phlb <- c("Philebus_1", "Philebus_2", "Philebus_3", "Philebus_4", "Philebus_5")
sdm <- as.data.frame(dm[1:151,(colnames(dm) %in% Phlb)])
rownames(sdm) <- my_rows
## select 5 minimal values for each block
n<- ncol(sdm)
x <- c()
for(i in 1:n){
o <- order(sdm[,i])
z <- rownames(sdm)[o]
z <- z[1:5]
x <- rbind(x,z)
}
rownames(x) <- Phlb
x
## [,1] [,2] [,3] [,4]
## Philebus_1 "Laws7_4" "Sophist_4" "Statesman_3" "Statesman_1"
## Philebus_2 "Laws2_2" "Laws1_2" "Laws3_2" "Laws4_1"
## Philebus_3 "Statesman_1" "Laws10_2" "Laws2_1" "Sophist_2"
## Philebus_4 "Sophist_4" "Laws3_2" "Laws10_2" "Statesman_1"
## Philebus_5 "Sophist_2" "Statesman_3" "Laws2_2" "Statesman_4"
## [,5]
## Philebus_1 "Theaetetus_3"
## Philebus_2 "Statesman_1"
## Philebus_3 "Sophist_1"
## Philebus_4 "Laws7_1"
## Philebus_5 "Statesman_1"
Same can be done using perform.delta function; however it does not return the number of the fragment, for only the characters before the underscore are treated as candidates.
delta2 <- perform.delta(training.set = freq[1:151, ], test.set = freq[152:156, ], distance = "delta", no.of.candidates = 5, z.scores.both.sets = TRUE)
delta2$ranking
## 1 2 3 4 5
## Philebus_1 "Laws7" "Sophist" "Statesman" "Statesman" "Theaetetus"
## Philebus_2 "Laws2" "Laws1" "Laws3" "Laws4" "Statesman"
## Philebus_3 "Statesman" "Laws10" "Laws2" "Sophist" "Sophist"
## Philebus_4 "Sophist" "Laws3" "Laws10" "Statesman" "Laws7"
## Philebus_5 "Sophist" "Statesman" "Laws2" "Statesman" "Statesman"
## attr(,"description")
## [1] "predicted classes with their runner-ups"
delta2$scores
## 1 2 3 4 5
## Philebus_1 0.7211966 0.7508173 0.7570131 0.7837819 0.7913422
## Philebus_2 0.6721729 0.7159967 0.7172953 0.7248398 0.7255297
## Philebus_3 0.7819919 0.7935629 0.8015238 0.8083551 0.8387032
## Philebus_4 0.7032408 0.7408127 0.7410721 0.7542996 0.7592156
## Philebus_5 0.6696491 0.7192088 0.7270360 0.7340047 0.7486723
## attr(,"description")
## [1] "Delta scores, ordered according to candidates"