Packages

library(stylo)
library(dendextend)
library(ape)

Corpus 3000

corpus_all <- load.corpus.and.parse(files = "all", corpus.dir = "corpus_all", markup.type= "plain", corpus.lang = "Other", sampling = "no.sampling", preserve.case = FALSE, encoding = "UTF-8") ## this corpus does not contain Philebus

test_corpus <- load.corpus.and.parse(files = "Philebus.txt", corpus.dir = getwd(), markup.type= "plain", corpus.lang = "Other", sampling = "normal.sampling", sample.size = 3000, preserve.case = FALSE, encoding = "UTF-8")

## delete dialogues < 3000 words
q <- vector()
for(i in 1:length(corpus_all)){
  if(length(corpus_all[[i]]) < 3000){q <- c(q, i)} 
  q
}
corpus3000 <- corpus_all[-q]

## make samples
samples3000 <- make.samples(corpus3000, sample.size = 3000, sampling = "normal.sampling", sample.overlap = 0,
sampling.with.replacement = FALSE)
length(samples3000)
## [1] 151
## add Philebus
corpus3000 <- c(samples3000, test_corpus)

MFW & Frequencies

mfw <- make.frequency.list(corpus3000)
mfw <- mfw[1:101]
mfw <- mfw[-71] ## remove Socrates
mfw
##   [1] "ὁ"        "καί"      "εἰμί"     "δέ"       "οὗτος"    "ἐγώ"     
##   [7] "αὐτός"    "οὐ"       "τε"       "μέν"      "ἄν"       "τις"     
##  [13] "ὅς"       "λέγω"     "γάρ"      "ἠέ"       "ἐν"       "δή"      
##  [19] "γε"       "ἀλλά"     "ἄλλος"    "πᾶς"      "σύ"       "φημί"    
##  [25] "μή"       "γίγνομαι" "ὅστις"    "ὡς"       "περί"     "τίς"     
##  [31] "οὖν"      "ὦ"        "ἔχω"      "εἰ"       "πρός"     "λόγος"   
##  [37] "κατά"     "πολύς"    "εἰς"      "τοιοῦτος" "οὕτως"    "ἑαυτοῦ"  
##  [43] "δοκέω"    "ἐκ"       "ποιέω"    "οὐδείς"   "νῦν"      "εἷς"     
##  [49] "εἶπον"    "οἴομαι"   "διά"      "ἐκεῖνος"  "καλός"    "οὔτε"    
##  [55] "ἐπί"      "οὐδέ"     "πόλις"    "ἀγαθός"   "ἐάν"      "μέγας"   
##  [61] "φαίνω"    "πρότερος" "οἷος"     "δέομαι"   "αὖ"       "πῶς"     
##  [67] "ἕτερος"   "ἄνθρωπος" "ἀίω"      "ἕκαστος"  "πάνυ"     "ἀληθής"  
##  [73] "ὑπό"      "ὀρθός"    "ἕ"        "οἶδα"     "ὅσος"     "ψυχή"    
##  [79] "οὐκοῦν"   "παρά"     "ἄρα"      "βούλομαι" "μετά"     "ἔοικα"   
##  [85] "ἆρα"      "ὥσπερ"    "ἔτι"      "ἀνήρ"     "κακός"    "ὅδε"     
##  [91] "πού"      "θεός"     "σῶμα"     "φύσις"    "ποτέ"     "νόμος"   
##  [97] "μήν"      "δίκαιος"  "ἀεί"      "ταυτός"
freq <- as.data.frame.matrix(as.table(make.table.of.frequencies(corpus3000, mfw, absent.sensitive = FALSE)))
dim(freq)
## [1] 156 100

Plot 3000

d <- dist.delta(freq)
hc <- hclust(d)
colors = c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#900C3F','#b15928')
clus12 = cutree(hc, 12)
plot(as.phylo(hc), type = "fan", tip.color = colors[clus12])

Make Shortlist 3000

dm <- as.matrix(d) ## distance as matrix
my_rows <- rownames(dm)[1:151] 

## subset columns, save as df
Phlb <- c("Philebus_1", "Philebus_2", "Philebus_3", "Philebus_4", "Philebus_5")
sdm <- as.data.frame(dm[1:151,(colnames(dm) %in% Phlb)])
rownames(sdm) <- my_rows

## select 5 minimal values for each block
n<- ncol(sdm)
x <- c()
for(i in 1:n){
   o <- order(sdm[,i])
   z <- rownames(sdm)[o]
   z <- z[1:5]
   x <- rbind(x,z)
}
rownames(x) <- Phlb
x
##            [,1]          [,2]          [,3]          [,4]         
## Philebus_1 "Laws7_4"     "Sophist_4"   "Statesman_3" "Statesman_1"
## Philebus_2 "Laws2_2"     "Laws1_2"     "Laws3_2"     "Laws4_1"    
## Philebus_3 "Statesman_1" "Laws10_2"    "Laws2_1"     "Sophist_2"  
## Philebus_4 "Sophist_4"   "Laws3_2"     "Laws10_2"    "Statesman_1"
## Philebus_5 "Sophist_2"   "Statesman_3" "Laws2_2"     "Statesman_4"
##            [,5]          
## Philebus_1 "Theaetetus_3"
## Philebus_2 "Statesman_1" 
## Philebus_3 "Sophist_1"   
## Philebus_4 "Laws7_1"     
## Philebus_5 "Statesman_1"

Perform Delta 3000

Same can be done using perform.delta function; however it does not return the number of the fragment, for only the characters before the underscore are treated as candidates.

delta2 <- perform.delta(training.set = freq[1:151, ], test.set = freq[152:156, ], distance = "delta", no.of.candidates = 5, z.scores.both.sets = TRUE)
delta2$ranking
##            1           2           3           4           5           
## Philebus_1 "Laws7"     "Sophist"   "Statesman" "Statesman" "Theaetetus"
## Philebus_2 "Laws2"     "Laws1"     "Laws3"     "Laws4"     "Statesman" 
## Philebus_3 "Statesman" "Laws10"    "Laws2"     "Sophist"   "Sophist"   
## Philebus_4 "Sophist"   "Laws3"     "Laws10"    "Statesman" "Laws7"     
## Philebus_5 "Sophist"   "Statesman" "Laws2"     "Statesman" "Statesman" 
## attr(,"description")
## [1] "predicted classes with their runner-ups"
delta2$scores
##                    1         2         3         4         5
## Philebus_1 0.7211966 0.7508173 0.7570131 0.7837819 0.7913422
## Philebus_2 0.6721729 0.7159967 0.7172953 0.7248398 0.7255297
## Philebus_3 0.7819919 0.7935629 0.8015238 0.8083551 0.8387032
## Philebus_4 0.7032408 0.7408127 0.7410721 0.7542996 0.7592156
## Philebus_5 0.6696491 0.7192088 0.7270360 0.7340047 0.7486723
## attr(,"description")
## [1] "Delta scores, ordered according to candidates"