Packages

library(stylo)
library(dendextend)
library(ape)

Corpus ALL (Whole texts)

corpus_all <- load.corpus.and.parse(files = "all", corpus.dir = "corpus_all", markup.type= "plain", corpus.lang = "Other", sampling = "no.sampling", preserve.case = FALSE, encoding = "UTF-8") ## this corpus does not contain Philebus

test_corpus <- load.corpus.and.parse(files = "Philebus.txt", markup.type= "plain", corpus.lang = "Other", sample.size = 5000, 
sampling = "normal.sampling", sample.overlap = 0, features = "w", 
ngram.size = 1, preserve.case = FALSE,encoding = "UTF-8")
corpus <- c(corpus_all, test_corpus)

MFW & Frequencies ALL

mfw <- make.frequency.list(corpus)
mfw <- mfw[1:101]
mfw <- mfw[-71] ## remove Socrates
mfw
##   [1] "ὁ"        "καί"      "εἰμί"     "δέ"       "οὗτος"    "ἐγώ"     
##   [7] "αὐτός"    "οὐ"       "τε"       "μέν"      "ἄν"       "τις"     
##  [13] "ὅς"       "λέγω"     "γάρ"      "ἠέ"       "ἐν"       "δή"      
##  [19] "γε"       "ἀλλά"     "ἄλλος"    "σύ"       "πᾶς"      "φημί"    
##  [25] "μή"       "ὅστις"    "γίγνομαι" "ὡς"       "περί"     "τίς"     
##  [31] "οὖν"      "ὦ"        "ἔχω"      "εἰ"       "πρός"     "λόγος"   
##  [37] "πολύς"    "κατά"     "εἰς"      "τοιοῦτος" "οὕτως"    "ἑαυτοῦ"  
##  [43] "δοκέω"    "ποιέω"    "οὐδείς"   "ἐκ"       "νῦν"      "εἶπον"   
##  [49] "εἷς"      "διά"      "οἴομαι"   "ἐπί"      "καλός"    "ἐκεῖνος" 
##  [55] "ἀγαθός"   "πόλις"    "οὐδέ"     "οὔτε"     "ἐάν"      "φαίνω"   
##  [61] "μέγας"    "οἷος"     "ἄνθρωπος" "πρότερος" "αὖ"       "δέομαι"  
##  [67] "ὑπό"      "ἀίω"      "πῶς"      "ἕτερος"   "ἕκαστος"  "ὀρθός"   
##  [73] "πάνυ"     "ἀληθής"   "οἶδα"     "ἕ"        "ψυχή"     "ὅσος"    
##  [79] "οὐκοῦν"   "ἄρα"      "παρά"     "κακός"    "μετά"     "βούλομαι"
##  [85] "ἀνήρ"     "ἔοικα"    "ὥσπερ"    "ἔτι"      "θεός"     "ἆρα"     
##  [91] "νόμος"    "ὅδε"      "σῶμα"     "πού"      "δίκαιος"  "φύσις"   
##  [97] "ποτέ"     "μήν"      "μᾶλλον"   "ἀεί"
freq <- as.data.frame.matrix(as.table(make.table.of.frequencies(corpus, mfw, absent.sensitive = FALSE)))
dim(freq)
## [1]  58 100

Plot ALL

d1 <- dist.delta(freq)
hc <- hclust(d1)
colors = c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#900C3F','#b15928')
clus12 = cutree(hc, 12)
plot(as.phylo(hc), type = "fan", tip.color = colors[clus12])

Make Shortlist ALL

dm1 <- as.matrix(d1) ## distance as matrix
my_rows <- rownames(dm1)[1:55] 

## subset columns, save as df
Phlb <- c("Philebus_1", "Philebus_2", "Philebus_3")
sdm1 <- as.data.frame(dm1[1:55,(colnames(dm1) %in% Phlb)])
rownames(sdm1) <- my_rows

## select 5 minimal values for each block
n<- ncol(sdm1)
x <- c()
for(i in 1:n){
   o <- order(sdm1[,i])
   z <- rownames(sdm1)[o]
   z <- z[1:5]
   x <- rbind(x,z)
}
rownames(x) <- Phlb
x
##            [,1]        [,2]        [,3]      [,4]    [,5]   
## Philebus_1 "Sophist"   "Statesman" "Laws7"   "Laws4" "Laws3"
## Philebus_2 "Sophist"   "Statesman" "Laws2"   "Laws1" "Laws3"
## Philebus_3 "Statesman" "Laws2"     "Sophist" "Laws1" "Laws7"

Perform Delta ALL

Same can be done using the perform.delta function

delta1 <- perform.delta(training.set = freq[1:55, ], test.set = freq[56:58, ], distance = "delta", no.of.candidates = 5, z.scores.both.sets = TRUE)
delta1$ranking
##            1           2           3         4       5      
## Philebus_1 "Sophist"   "Statesman" "Laws7"   "Laws4" "Laws3"
## Philebus_2 "Sophist"   "Statesman" "Laws2"   "Laws1" "Laws3"
## Philebus_3 "Statesman" "Laws2"     "Sophist" "Laws1" "Laws7"
## attr(,"description")
## [1] "predicted classes with their runner-ups"

Corpus ALL (5000)

## delete dialogues < 5000 words
q <- vector()
for(i in 1:length(corpus_all)){
  if(length(corpus_all[[i]]) < 5000){q <- c(q, i)} 
  q
}
corpus5000 <- corpus_all[-q]

samples5000 <- make.samples(corpus5000, sample.size = 5000, 
             sampling = "normal.sampling", sample.overlap = 0,
             sampling.with.replacement = FALSE)
dim(samples5000)
## NULL
## add Philebus
corpus5000 <- c(samples5000, test_corpus)

Frequencies 5000

freq5000 <- as.data.frame.matrix(as.table(make.table.of.frequencies(corpus5000, mfw, absent.sensitive = FALSE)))
dim(freq)
## [1]  58 100

Plot 5000

d2 <- dist.delta(freq5000)
hc2 <- hclust(d2)
colors = c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#900C3F','#b15928')
clus12 = cutree(hc2, 12)
plot(as.phylo(hc2), type = "fan", tip.color = colors[clus12])

Make Shortlist 5000

dm2 <- as.matrix(d2) ## distance as matrix
my_rows2 <- rownames(dm2)[1:79] 

## subset columns, save as df
sdm2 <- as.data.frame(dm2[1:79,(colnames(dm2) %in% Phlb)])
rownames(sdm2) <- my_rows2

## select 5 minimal values for each block
n<- ncol(sdm2)
x <- c()
for(i in 1:n){
   o <- order(sdm2[,i])
   z <- rownames(sdm2)[o]
   z <- z[1:5]
   x <- rbind(x,z)
}
rownames(x) <- Phlb
x
##            [,1]          [,2]          [,3]          [,4]           [,5]       
## Philebus_1 "Statesman_2" "Statesman_1" "Laws4_1"     "Theaetetus_1" "Laws3_1"  
## Philebus_2 "Laws2_1"     "Sophist_2"   "Statesman_2" "Statesman_1"  "Sophist_1"
## Philebus_3 "Laws4_1"     "Laws2_1"     "Statesman_1" "Laws3_1"      "Laws7_1"
## ret

Perform Delta 5000

Same can be done using perform.delta function; however it does not return the number of the fragment, for only the characters before the underscore are treated as candidates.

delta2 <- perform.delta(training.set = freq5000[1:79, ], test.set = freq5000[80:82, ], distance = "delta", no.of.candidates = 5, z.scores.both.sets = TRUE)
delta2$ranking
##            1           2           3           4            5        
## Philebus_1 "Statesman" "Statesman" "Laws4"     "Theaetetus" "Laws3"  
## Philebus_2 "Laws2"     "Sophist"   "Statesman" "Statesman"  "Sophist"
## Philebus_3 "Laws4"     "Laws2"     "Statesman" "Laws3"      "Laws7"  
## attr(,"description")
## [1] "predicted classes with their runner-ups"
delta2$scores
##                    1         2         3         4         5
## Philebus_1 0.7363448 0.7473091 0.7647666 0.7656733 0.7799669
## Philebus_2 0.8063787 0.8384637 0.8694230 0.8741261 0.8820220
## Philebus_3 0.7366366 0.7465235 0.7494409 0.7601559 0.7606171
## attr(,"description")
## [1] "Delta scores, ordered according to candidates"