## 5000-word blocks
## Load and parse every file found in the "corpus_all" directory as plain
## text, without sampling and without preserving case.
## This corpus does not contain Philebus.
corpus_all <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "corpus_all",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)
## Delete dialogues < 5000 words.
## `q` keeps the positions of the dropped texts so they can be inspected.
q <- unname(which(lengths(corpus_all) < 5000))
## Keep texts with a logical mask instead of `corpus_all[-q]`: if no text is
## too short, `q` is empty, and negative indexing with an empty vector selects
## nothing, which would silently discard the whole corpus.
corpus5000 <- corpus_all[lengths(corpus_all) >= 5000]
## Draw 3 random samples of 5000 words (with replacement) from each
## remaining dialogue.
## NOTE(review): no set.seed() is visible before this call, so the samples
## (and every result derived from them) presumably change between runs --
## confirm whether a seed is set elsewhere.
samples5000 <- make.samples(
  corpus5000,
  sample.size = 5000,
  sampling = "random.sampling",
  sampling.with.replacement = TRUE,
  number.of.samples = 3
)
## Load Philebus separately and cut it into consecutive, non-overlapping
## 5000-word samples of single-word features.
test_corpus <- load.corpus.and.parse(
  files = "Philebus.txt",
  markup.type = "plain",
  corpus.lang = "Other",
  sample.size = 5000,
  sampling = "normal.sampling",
  sample.overlap = 0,
  features = "w",
  ngram.size = 1,
  preserve.case = FALSE,
  encoding = "UTF-8"
)
## From here on corpus5000 holds the training samples followed by the
## Philebus samples (Philebus comes last).
corpus5000 <- c(samples5000, test_corpus)
## Frequency list over the combined corpus (training samples + Philebus),
## cut down to its first 100 entries.
mfw <- make.frequency.list(corpus5000)
mfw <- mfw[seq_len(100)]
mfw
## [1] "ὁ" "καί" "δέ" "εἰμί" "οὗτος" "ἐγώ"
## [7] "αὐτός" "οὐ" "τε" "μέν" "ἄν" "τις"
## [13] "ὅς" "λέγω" "ἠέ" "γάρ" "ἐν" "γε"
## [19] "δή" "φημί" "ἀλλά" "ἄλλος" "μή" "πᾶς"
## [25] "γίγνομαι" "ὅστις" "σύ" "ὡς" "περί" "τίς"
## [31] "οὖν" "ὦ" "ἔχω" "εἰ" "πρός" "τοιοῦτος"
## [37] "λόγος" "εἰς" "πολύς" "κατά" "οὕτως" "ἑαυτοῦ"
## [43] "ποιέω" "δοκέω" "πόλις" "ἐκ" "νῦν" "οὐδείς"
## [49] "εἶπον" "καλός" "εἷς" "οἴομαι" "ἐάν" "διά"
## [55] "ἐπί" "οὐδέ" "ἐκεῖνος" "ἀγαθός" "οὔτε" "δέομαι"
## [61] "μέγας" "πῶς" "φαίνω" "ἀίω" "αὖ" "οἷος"
## [67] "ὀρθός" "ἕκαστος" "πρότερος" "ὑπό" "ἄνθρωπος" "ψυχή"
## [73] "ἕ" "οἶδα" "ὅσος" "πάνυ" "ἀληθής" "μετά"
## [79] "νόμος" "θεός" "κακός" "ἕτερος" "οὐκοῦν" "ἄρα"
## [85] "ἀνήρ" "σωκράτης" "ἔοικα" "ἆρα" "παρά" "βούλομαι"
## [91] "ἔτι" "φύσις" "πού" "ὥσπερ" "ποτέ" "ὅδε"
## [97] "δίκαιος" "μήν" "σῶμα" "ἦ"
## Table of frequencies of the selected words for every sample, as a plain
## data frame.
freq5000 <- as.data.frame.matrix(as.table(make.table.of.frequencies(corpus5000, mfw, absent.sensitive = FALSE)))
## Pairwise Delta distances between all samples, as a full square matrix.
d5000 <- dist.delta(freq5000)
dm5000 <- as.matrix(d5000)
Phlb <- c("Philebus_1", "Philebus_2", "Philebus_3")
## Identify the Philebus samples by name rather than by a fixed row range
## (previously 1:132), so the split stays correct if the number of training
## samples changes.
is_phlb <- rownames(dm5000) %in% Phlb
my_rows5000 <- rownames(dm5000)[!is_phlb]
## subset columns, save as df: rows = training samples, columns = Philebus
## blocks. drop = FALSE keeps a matrix even if only one column matches.
sdm5000 <- as.data.frame(dm5000[!is_phlb, colnames(dm5000) %in% Phlb, drop = FALSE])
rownames(sdm5000) <- my_rows5000
## select 5 minimal values for each block:
## for every column of sdm5000, take the names of the 5 rows with the
## smallest distance, nearest first.
n <- ncol(sdm5000)
## vapply() replaces the `for (i in 1:n)` loop that grew `x` with rbind():
## it handles zero columns without iterating over c(1, 0), and labels each
## result with the column it was computed from instead of assuming the
## columns come in the same order as a separately maintained name vector.
x <- t(vapply(
  sdm5000,
  function(dists) rownames(sdm5000)[order(dists)][1:5],
  character(5)
))
x
## [,1] [,2] [,3] [,4] [,5]
## Philebus_1 "Statesman_2" "Statesman_3" "Sophist_2" "Sophist_3" "Laws7_3"
## Philebus_2 "Statesman_2" "Laws1_3" "Sophist_1" "Sophist_3" "Sophist_2"
## Philebus_3 "Statesman_2" "Statesman_3" "Sophist_3" "Laws1_3" "Laws1_1"
## Classify the Philebus blocks against the training samples using the
## "delta" distance, keeping the 5 best candidates for each block.
## The training/test split is made by sample name instead of the fixed row
## ranges 1:132 / 133:135, so it does not depend on the corpus size.
is_test <- rownames(freq5000) %in% Phlb
delta5000 <- perform.delta(training.set = freq5000[!is_test, ], test.set = freq5000[is_test, ], distance = "delta", no.of.candidates = 5, z.scores.both.sets = TRUE)
delta5000$ranking
## 1 2 3 4 5
## Philebus_1 "Statesman" "Statesman" "Sophist" "Sophist" "Laws7"
## Philebus_2 "Statesman" "Laws1" "Sophist" "Sophist" "Sophist"
## Philebus_3 "Statesman" "Statesman" "Sophist" "Laws1" "Laws1"
## attr(,"description")
## [1] "predicted classes with their runner-ups"
## Scores belonging to the ranked candidates shown above.
delta5000[["scores"]]
## 1 2 3 4 5
## Philebus_1 0.7770036 0.7780149 0.7787245 0.7857099 0.7880522
## Philebus_2 0.8429860 0.8532213 0.8743187 0.8825589 0.9085966
## Philebus_3 0.7023582 0.7476980 0.7559824 0.7750509 0.7762408
## attr(,"description")
## [1] "Delta scores, ordered according to candidates"