library(stylo)
library(dendextend)
library(ape)
corpus_all <- load.corpus.and.parse(files = "all", corpus.dir = "corpus_all", markup.type= "plain", corpus.lang = "Other", sampling = "no.sampling", preserve.case = FALSE, encoding = "UTF-8") ## this corpus does not contain Philebus
test_corpus <- load.corpus.and.parse(files = "Philebus.txt", markup.type= "plain", corpus.lang = "Other", sample.size = 5000,
sampling = "normal.sampling", sample.overlap = 0, features = "w",
ngram.size = 1, preserve.case = FALSE,encoding = "UTF-8")
corpus <- c(corpus_all, test_corpus)
mfw <- make.frequency.list(corpus)
mfw <- mfw[1:101]
mfw <- mfw[-71] ## remove Socrates
mfw
## [1] "ὁ" "καί" "εἰμί" "δέ" "οὗτος" "ἐγώ"
## [7] "αὐτός" "οὐ" "τε" "μέν" "ἄν" "τις"
## [13] "ὅς" "λέγω" "γάρ" "ἠέ" "ἐν" "δή"
## [19] "γε" "ἀλλά" "ἄλλος" "σύ" "πᾶς" "φημί"
## [25] "μή" "ὅστις" "γίγνομαι" "ὡς" "περί" "τίς"
## [31] "οὖν" "ὦ" "ἔχω" "εἰ" "πρός" "λόγος"
## [37] "πολύς" "κατά" "εἰς" "τοιοῦτος" "οὕτως" "ἑαυτοῦ"
## [43] "δοκέω" "ποιέω" "οὐδείς" "ἐκ" "νῦν" "εἶπον"
## [49] "εἷς" "διά" "οἴομαι" "ἐπί" "καλός" "ἐκεῖνος"
## [55] "ἀγαθός" "πόλις" "οὐδέ" "οὔτε" "ἐάν" "φαίνω"
## [61] "μέγας" "οἷος" "ἄνθρωπος" "πρότερος" "αὖ" "δέομαι"
## [67] "ὑπό" "ἀίω" "πῶς" "ἕτερος" "ἕκαστος" "ὀρθός"
## [73] "πάνυ" "ἀληθής" "οἶδα" "ἕ" "ψυχή" "ὅσος"
## [79] "οὐκοῦν" "ἄρα" "παρά" "κακός" "μετά" "βούλομαι"
## [85] "ἀνήρ" "ἔοικα" "ὥσπερ" "ἔτι" "θεός" "ἆρα"
## [91] "νόμος" "ὅδε" "σῶμα" "πού" "δίκαιος" "φύσις"
## [97] "ποτέ" "μήν" "μᾶλλον" "ἀεί"
freq <- as.data.frame.matrix(as.table(make.table.of.frequencies(corpus, mfw, absent.sensitive = FALSE)))
dim(freq)
## [1] 58 100
d1 <- dist.delta(freq)
hc <- hclust(d1)
colors = c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#900C3F','#b15928')
clus12 = cutree(hc, 12)
plot(as.phylo(hc), type = "fan", tip.color = colors[clus12])
dm1 <- as.matrix(d1) ## distance as matrix
my_rows <- rownames(dm1)[1:55]
## subset columns, save as df
Phlb <- c("Philebus_1", "Philebus_2", "Philebus_3")
sdm1 <- as.data.frame(dm1[1:55,(colnames(dm1) %in% Phlb)])
rownames(sdm1) <- my_rows
## select 5 minimal values for each block
n<- ncol(sdm1)
x <- c()
for(i in 1:n){
o <- order(sdm1[,i])
z <- rownames(sdm1)[o]
z <- z[1:5]
x <- rbind(x,z)
}
rownames(x) <- Phlb
x
## [,1] [,2] [,3] [,4] [,5]
## Philebus_1 "Sophist" "Statesman" "Laws7" "Laws4" "Laws3"
## Philebus_2 "Sophist" "Statesman" "Laws2" "Laws1" "Laws3"
## Philebus_3 "Statesman" "Laws2" "Sophist" "Laws1" "Laws7"
Same can be done using the perform.delta function
delta1 <- perform.delta(training.set = freq[1:55, ], test.set = freq[56:58, ], distance = "delta", no.of.candidates = 5, z.scores.both.sets = TRUE)
delta1$ranking
## 1 2 3 4 5
## Philebus_1 "Sophist" "Statesman" "Laws7" "Laws4" "Laws3"
## Philebus_2 "Sophist" "Statesman" "Laws2" "Laws1" "Laws3"
## Philebus_3 "Statesman" "Laws2" "Sophist" "Laws1" "Laws7"
## attr(,"description")
## [1] "predicted classes with their runner-ups"
## delete dialogues < 5000 words
q <- vector()
for(i in 1:length(corpus_all)){
if(length(corpus_all[[i]]) < 5000){q <- c(q, i)}
q
}
corpus5000 <- corpus_all[-q]
samples5000 <- make.samples(corpus5000, sample.size = 5000,
sampling = "normal.sampling", sample.overlap = 0,
sampling.with.replacement = FALSE)
dim(samples5000)
## NULL
## add Philebus
corpus5000 <- c(samples5000, test_corpus)
freq5000 <- as.data.frame.matrix(as.table(make.table.of.frequencies(corpus5000, mfw, absent.sensitive = FALSE)))
dim(freq)
## [1] 58 100
d2 <- dist.delta(freq5000)
hc2 <- hclust(d2)
colors = c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#900C3F','#b15928')
clus12 = cutree(hc2, 12)
plot(as.phylo(hc2), type = "fan", tip.color = colors[clus12])
dm2 <- as.matrix(d2) ## distance as matrix
my_rows2 <- rownames(dm2)[1:79]
## subset columns, save as df
sdm2 <- as.data.frame(dm2[1:79,(colnames(dm2) %in% Phlb)])
rownames(sdm2) <- my_rows2
## select 5 minimal values for each block
n<- ncol(sdm2)
x <- c()
for(i in 1:n){
o <- order(sdm2[,i])
z <- rownames(sdm2)[o]
z <- z[1:5]
x <- rbind(x,z)
}
rownames(x) <- Phlb
x
## [,1] [,2] [,3] [,4] [,5]
## Philebus_1 "Statesman_2" "Statesman_1" "Laws4_1" "Theaetetus_1" "Laws3_1"
## Philebus_2 "Laws2_1" "Sophist_2" "Statesman_2" "Statesman_1" "Sophist_1"
## Philebus_3 "Laws4_1" "Laws2_1" "Statesman_1" "Laws3_1" "Laws7_1"
## ret
Same can be done using perform.delta function; however it does not return the number of the fragment, for only the characters before the underscore are treated as candidates.
delta2 <- perform.delta(training.set = freq5000[1:79, ], test.set = freq5000[80:82, ], distance = "delta", no.of.candidates = 5, z.scores.both.sets = TRUE)
delta2$ranking
## 1 2 3 4 5
## Philebus_1 "Statesman" "Statesman" "Laws4" "Theaetetus" "Laws3"
## Philebus_2 "Laws2" "Sophist" "Statesman" "Statesman" "Sophist"
## Philebus_3 "Laws4" "Laws2" "Statesman" "Laws3" "Laws7"
## attr(,"description")
## [1] "predicted classes with their runner-ups"
delta2$scores
## 1 2 3 4 5
## Philebus_1 0.7363448 0.7473091 0.7647666 0.7656733 0.7799669
## Philebus_2 0.8063787 0.8384637 0.8694230 0.8741261 0.8820220
## Philebus_3 0.7366366 0.7465235 0.7494409 0.7601559 0.7606171
## attr(,"description")
## [1] "Delta scores, ordered according to candidates"