Packages

library(stylo) 
library(dendextend)
library(ape)

Training & Test Corpora: Load and Parse

# Training corpora 1 and 2: read every file in the given directory as plain
# text, without sampling, with preserve.case = FALSE and UTF-8 encoding.
tr_corpus_1 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus1",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)
tr_corpus_2 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus2",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

# Test text: a single file (no corpus.dir is given), split with
# "normal.sampling" into samples of 5000 single-word features
# (features = "w", ngram.size = 1) with no overlap between samples.
test_5000 <- load.corpus.and.parse(
  files = "Philebus.txt",
  markup.type = "plain",
  corpus.lang = "Other",
  sample.size = 5000,
  sampling = "normal.sampling",
  sample.overlap = 0,
  features = "w",
  ngram.size = 1,
  preserve.case = FALSE,
  encoding = "UTF-8"
)

Most Frequent Words

# Word frequency lists for the two training corpora.
my_freq1 <- make.frequency.list(tr_corpus_1)
my_freq2 <- make.frequency.list(tr_corpus_2)

# Keep only the words that occur in both corpora. intersect() returns them in
# the order of its first argument, so the ranking follows my_freq1.
my_freq_intersect <- intersect(my_freq1, my_freq2)

# Take the first 100 shared words. head() is used rather than [1:100] so that
# an intersection shorter than 100 entries is not padded with NA values.
mfw <- head(my_freq_intersect, 100)

# Drop four entries by their position in the top-100 list.
# NOTE(review): the reason for excluding positions 41, 56, 77 and 86 is not
# recorded here, and the positions silently point at different words if the
# corpora change -- confirm and consider excluding by word instead.
mfw <- mfw[-c(41, 56, 77, 86)]
mfw
##  [1] "ὁ"         "καί"       "δέ"        "εἰμί"      "οὗτος"     "ἐγώ"      
##  [7] "τε"        "οὐ"        "ἄν"        "αὐτός"     "μέν"       "ὅς"       
## [13] "τις"       "ἠέ"        "γάρ"       "μή"        "λέγω"      "ἐν"       
## [19] "πᾶς"       "γε"        "δή"        "ὡς"        "ἀλλά"      "φημί"     
## [25] "ὅστις"     "ἄλλος"     "περί"      "γίγνομαι"  "ὦ"         "τίς"      
## [31] "σύ"        "καλός"     "οὖν"       "τοιοῦτος"  "ἔχω"       "πρός"     
## [37] "ἐάν"       "εἰ"        "εἰς"       "κατά"      "ποιέω"     "πολύς"    
## [43] "ἑαυτοῦ"    "οὕτως"     "θεός"      "λόγος"     "δοκέω"     "ἐκ"       
## [49] "διά"       "δέομαι"    "ἐπί"       "νόμος"     "εἶπον"     "οὐδέ"     
## [55] "ὑπό"       "πρότερος"  "ἐκεῖνος"   "ἀίω"       "νῦν"       "οὐδείς"   
## [61] "ἄνθρωπος"  "οἴομαι"    "ἀγαθός"    "ἄρα"       "ἕκαστος"   "μέγας"    
## [67] "ὀρθός"     "αὖ"        "φαίνω"     "οἷος"      "ἕ"         "ὅσος"     
## [73] "παῖς"      "πάνυ"      "κακός"     "ἦ"         "ἀνήρ"      "πῶς"      
## [79] "ὅσιος"     "πού"       "οἶδα"      "παρά"      "ἀρχή"      "βούλομαι" 
## [85] "ἀληθής"    "ἔοικα"     "οὐκοῦν"    "θέα"       "ἑκάτερος"  "μάλιστα"  
## [91] "χρή"       "ἀμφότερος" "μηδείς"    "ἆρα"       "μήτε"      "μετά"

Tables of Frequencies

# Frequency tables restricted to the shared MFW list, converted to plain data
# frames (one row per text/sample, one column per word).
#
# absent.sensitive = FALSE is set on every table so that a word missing from
# one set is kept as a column rather than dropped. The tables are later
# combined with rbind() and compared by perform.delta(), so all of them must
# expose exactly the same columns. This also matches how the tables for the
# test set and for training corpora 3 and 4 are built.
freq_corp1 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_1, mfw, absent.sensitive = FALSE)
))
freq_corp2 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_2, mfw, absent.sensitive = FALSE)
))
freq_test_5000 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(test_5000, mfw, absent.sensitive = FALSE)
))

Perform Delta (Sets 1 & 2)

# Classify the Philebus samples against training set 1 with the "delta"
# distance; z-scores are computed over both sets (z.scores.both.sets = TRUE).
delta1 <- perform.delta(
  training.set = freq_corp1,
  test.set = freq_test_5000,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)
# Print the class assigned to each test sample.
delta1$y
## Philebus_1 Philebus_2 Philebus_3 
##      "Pl2"      "Pl2"      "Pl2" 
## attr(,"description")
## [1] "classification results in a compact form"
# Same classification as for set 1, this time against training set 2.
delta2 <- perform.delta(
  training.set = freq_corp2,
  test.set = freq_test_5000,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)
# Print the class assigned to each test sample.
delta2$y
## Philebus_1 Philebus_2 Philebus_3 
##      "Pl1"      "Pl2"      "Pl2" 
## attr(,"description")
## [1] "classification results in a compact form"

Clusters: Phlb. vs. Set 1

# Stack training set 1 and the Philebus samples into one table, cluster the
# rows on their dist.delta() distances, and draw a horizontal dendrogram.
dataset1 <- rbind(freq_corp1, freq_test_5000)

# Right margin widened to 6 lines; the other three margins are 2 lines.
par(mar = c(2, 2, 2, 6))

hc1 <- hclust(dist.delta(dataset1))
hcd1 <- as.dendrogram(hc1)

# Colour branches and labels by a 4-cluster cut, then plot sideways.
hcd1 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at 1 on the horizontal axis.
abline(v = 1, col = "red", lty = 2)

Clusters: Phlb. vs. Set 2

# Stack training set 2 and the Philebus samples into one table, cluster the
# rows on their dist.delta() distances, and draw a horizontal dendrogram.
dataset2 <- rbind(freq_corp2, freq_test_5000)

hc2 <- hclust(dist.delta(dataset2))
hcd2 <- as.dendrogram(hc2)

# Right margin widened to 6 lines; the other three margins are 2 lines.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a 4-cluster cut, then plot sideways.
hcd2 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at 1 on the horizontal axis.
abline(v = 1, col = "red", lty = 2)

Training Corpora 3 & 4

# Training corpora 3 and 4: loaded with the same settings as corpora 1 and 2
# (all files, plain text, no sampling, preserve.case = FALSE, UTF-8).
tr_corpus_3 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus3",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

tr_corpus_4 <- load.corpus.and.parse(
  files = "all",
  corpus.dir = "training_corpus4",
  markup.type = "plain",
  corpus.lang = "Other",
  sampling = "no.sampling",
  preserve.case = FALSE,
  encoding = "UTF-8"
)

# Frequency tables over the existing `mfw` list (derived from corpora 1 and 2).
# absent.sensitive = FALSE keeps a column for every word in `mfw`, even one
# that does not occur in these corpora.
freq_corp3 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_3, mfw, absent.sensitive = FALSE)
))
freq_corp4 <- as.data.frame.matrix(as.table(
  make.table.of.frequencies(tr_corpus_4, mfw, absent.sensitive = FALSE)
))

Clusters: Phlb. vs. Sets 3 & 4

# Stack training set 3 and the Philebus samples into one table, cluster the
# rows on their dist.delta() distances, and draw a horizontal dendrogram.
dataset3 <- rbind(freq_corp3, freq_test_5000)

hc3 <- hclust(dist.delta(dataset3))
hcd3 <- as.dendrogram(hc3)

# Right margin widened to 6 lines; the other three margins are 2 lines.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a 4-cluster cut, then plot sideways.
hcd3 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at 1 on the horizontal axis.
abline(v = 1, col = "red", lty = 2)

# Stack training set 4 and the Philebus samples into one table, cluster the
# rows on their dist.delta() distances, and draw a horizontal dendrogram.
dataset4 <- rbind(freq_corp4, freq_test_5000)

hc4 <- hclust(dist.delta(dataset4))
hcd4 <- as.dendrogram(hc4)

# Right margin widened to 6 lines; the other three margins are 2 lines.
par(mar = c(2, 2, 2, 6))

# Colour branches and labels by a 4-cluster cut, then plot sideways.
hcd4 %>%
  set("branches_k_color", k = 4) %>%
  set("labels_col", k = 4) %>%
  plot(horiz = TRUE)

# Dashed red vertical reference line at 1 on the horizontal axis.
abline(v = 1, col = "red", lty = 2)

Perform Delta (Sets 3 & 4)

# Classify the Philebus samples against training set 3 with the "delta"
# distance; z-scores are computed over both sets (z.scores.both.sets = TRUE).
delta3 <- perform.delta(
  training.set = freq_corp3,
  test.set = freq_test_5000,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)
# Print the class assigned to each test sample.
delta3$y
## Philebus_1 Philebus_2 Philebus_3 
##      "Pl3"      "Pl3"      "Pl3" 
## attr(,"description")
## [1] "classification results in a compact form"
# Same classification as for set 3, this time against training set 4.
delta4 <- perform.delta(
  training.set = freq_corp4,
  test.set = freq_test_5000,
  distance = "delta",
  no.of.candidates = 3,
  z.scores.both.sets = TRUE
)
# Print the class assigned to each test sample.
delta4$y
## Philebus_1 Philebus_2 Philebus_3 
##      "Pl3"      "Pl3"      "Pl3" 
## attr(,"description")
## [1] "classification results in a compact form"
# Side-by-side summary: one row per Philebus sample, one unnamed column per
# training set, in the order set 1, set 2, set 3, set 4.
cbind(
  delta1$y,
  delta2$y,
  delta3$y,
  delta4$y
)
##            [,1]  [,2]  [,3]  [,4] 
## Philebus_1 "Pl2" "Pl1" "Pl3" "Pl3"
## Philebus_2 "Pl2" "Pl2" "Pl3" "Pl3"
## Philebus_3 "Pl2" "Pl2" "Pl3" "Pl3"

Distance tables (Sets 1-4)

# Full pairwise dist.delta() distance matrix for set 1 plus the Philebus samples.
as.matrix(dist.delta(dataset1))
##                  Pl1_Euthyphro Pl1_HippiasMajor Pl2_Republic2 Pl2_Republic3
## Pl1_Euthyphro        0.0000000        0.7982304     1.1324481     1.1371370
## Pl1_HippiasMajor     0.7982304        0.0000000     1.0653264     1.0543927
## Pl2_Republic2        1.1324481        1.0653264     0.0000000     0.6271313
## Pl2_Republic3        1.1371370        1.0543927     0.6271313     0.0000000
## Pl3_Laws11           1.6299021        1.6566945     1.3089634     1.1282576
## Pl3_Laws6            1.5568739        1.6097192     1.2174111     1.1233399
## Philebus_1           1.2236525        1.1317020     1.0560632     1.0207462
## Philebus_2           1.1565279        1.2034091     1.1539192     1.1055276
## Philebus_3           1.1746438        1.1530257     1.0076794     0.8882321
##                  Pl3_Laws11 Pl3_Laws6 Philebus_1 Philebus_2 Philebus_3
## Pl1_Euthyphro     1.6299021 1.5568739  1.2236525  1.1565279  1.1746438
## Pl1_HippiasMajor  1.6566945 1.6097192  1.1317020  1.2034091  1.1530257
## Pl2_Republic2     1.3089634 1.2174111  1.0560632  1.1539192  1.0076794
## Pl2_Republic3     1.1282576 1.1233399  1.0207462  1.1055276  0.8882321
## Pl3_Laws11        0.0000000 0.7304595  1.3278116  1.4338780  1.2012549
## Pl3_Laws6         0.7304595 0.0000000  1.1507042  1.2296972  1.0871940
## Philebus_1        1.3278116 1.1507042  0.0000000  0.8806585  0.7948277
## Philebus_2        1.4338780 1.2296972  0.8806585  0.0000000  0.6753248
## Philebus_3        1.2012549 1.0871940  0.7948277  0.6753248  0.0000000
# Full pairwise dist.delta() distance matrix for set 2 plus the Philebus samples.
as.matrix(dist.delta(dataset2))
##                Pl1_Gorgias Pl1_Protagoras Pl2_Republic8 Pl2_Republic9 Pl3_Laws8
## Pl1_Gorgias       0.000000       0.678999      1.229048      1.112928 1.5203558
## Pl1_Protagoras    0.678999       0.000000      1.127090      1.003995 1.3565115
## Pl2_Republic8     1.229048       1.127090      0.000000      0.644581 1.2449511
## Pl2_Republic9     1.112928       1.003995      0.644581      0.000000 1.3141871
## Pl3_Laws8         1.520356       1.356512      1.244951      1.314187 0.0000000
## Pl3_Laws9         1.514058       1.434185      1.207720      1.360203 0.7472828
## Philebus_1        1.146171       1.075359      1.191393      1.178378 1.2776216
## Philebus_2        1.237692       1.258824      1.191461      1.194538 1.3855281
## Philebus_3        1.056840       1.079718      1.019441      1.046299 1.1524515
##                Pl3_Laws9 Philebus_1 Philebus_2 Philebus_3
## Pl1_Gorgias    1.5140575  1.1461713  1.2376919  1.0568399
## Pl1_Protagoras 1.4341850  1.0753588  1.2588242  1.0797184
## Pl2_Republic8  1.2077201  1.1913926  1.1914614  1.0194409
## Pl2_Republic9  1.3602027  1.1783782  1.1945378  1.0462986
## Pl3_Laws8      0.7472828  1.2776216  1.3855281  1.1524515
## Pl3_Laws9      0.0000000  1.3045668  1.4093791  1.2131989
## Philebus_1     1.3045668  0.0000000  1.0133291  0.9066277
## Philebus_2     1.4093791  1.0133291  0.0000000  0.7796193
## Philebus_3     1.2131989  0.9066277  0.7796193  0.0000000
# Full pairwise dist.delta() distance matrix for set 3 plus the Philebus samples.
as.matrix(dist.delta(dataset3))
##                  Pl1_Euthyphro Pl1_HippiasMajor Pl2_Republic2 Pl2_Republic3
## Pl1_Euthyphro        0.0000000        0.9080436     1.3344141     1.3457935
## Pl1_HippiasMajor     0.9080436        0.0000000     1.2410607     1.2306269
## Pl2_Republic2        1.3344141        1.2410607     0.0000000     0.7606477
## Pl2_Republic3        1.3457935        1.2306269     0.7606477     0.0000000
## Pl3_Laws1            1.3647860        1.3923041     1.0870141     0.9845900
## Pl3_Laws2            1.3176924        1.2876555     1.0000289     0.9560728
## Philebus_1           1.4431512        1.3038777     1.2249446     1.1982442
## Philebus_2           1.3463579        1.3866101     1.3640438     1.3187183
## Philebus_3           1.3826356        1.3499426     1.2209616     1.0673194
##                  Pl3_Laws1 Pl3_Laws2 Philebus_1 Philebus_2 Philebus_3
## Pl1_Euthyphro    1.3647860 1.3176924  1.4431512  1.3463579  1.3826356
## Pl1_HippiasMajor 1.3923041 1.2876555  1.3038777  1.3866101  1.3499426
## Pl2_Republic2    1.0870141 1.0000289  1.2249446  1.3640438  1.2209616
## Pl2_Republic3    0.9845900 0.9560728  1.1982442  1.3187183  1.0673194
## Pl3_Laws1        0.0000000 0.6402198  1.0146754  1.0655754  0.9104088
## Pl3_Laws2        0.6402198 0.0000000  0.9209539  1.0197191  0.9071319
## Philebus_1       1.0146754 0.9209539  0.0000000  1.0567281  0.9713519
## Philebus_2       1.0655754 1.0197191  1.0567281  0.0000000  0.7900575
## Philebus_3       0.9104088 0.9071319  0.9713519  0.7900575  0.0000000
# Full pairwise dist.delta() distance matrix for set 4 plus the Philebus samples.
as.matrix(dist.delta(dataset4))
##                Pl1_Gorgias Pl1_Protagoras Pl2_Republic8 Pl2_Republic9 Pl3_Laws1
## Pl1_Gorgias      0.0000000      0.7652905     1.3573703     1.2197161 1.2619296
## Pl1_Protagoras   0.7652905      0.0000000     1.2502423     1.1152130 1.1573086
## Pl2_Republic8    1.3573703      1.2502423     0.0000000     0.7146617 1.1239121
## Pl2_Republic9    1.2197161      1.1152130     0.7146617     0.0000000 1.1973106
## Pl3_Laws1        1.2619296      1.1573086     1.1239121     1.1973106 0.0000000
## Pl3_Laws2        1.2417717      1.1907925     1.0919095     1.2015376 0.7014496
## Philebus_1       1.2485689      1.1923176     1.3271783     1.3096179 1.0698683
## Philebus_2       1.3619493      1.3970653     1.3290277     1.3332667 1.1702709
## Philebus_3       1.1865464      1.2355921     1.1636352     1.1950003 1.0115704
##                Pl3_Laws2 Philebus_1 Philebus_2 Philebus_3
## Pl1_Gorgias    1.2417717   1.248569  1.3619493  1.1865464
## Pl1_Protagoras 1.1907925   1.192318  1.3970653  1.2355921
## Pl2_Republic8  1.0919095   1.327178  1.3290277  1.1636352
## Pl2_Republic9  1.2015376   1.309618  1.3332667  1.1950003
## Pl3_Laws1      0.7014496   1.069868  1.1702709  1.0115704
## Pl3_Laws2      0.0000000   1.018049  1.1573076  1.0281623
## Philebus_1     1.0180490   0.000000  1.1258875  1.0368510
## Philebus_2     1.1573076   1.125888  0.0000000  0.8680829
## Philebus_3     1.0281623   1.036851  0.8680829  0.0000000