This file contains the code necessary for parsing Plato’s Republic from the Diorisis Ancient Greek Corpus (Vatri, Alessandro and Barbara McGillivray. 2018. ‘The Diorisis Ancient Greek Corpus’, Research Data Journal for the Humanities and Social Sciences 3, 55–65). With this code, each book is saved as a separate file; to parse the whole text at once, please refer to our Corpus_Platonicum (Diorisis): Lemmata Extraction. Here I assume that you already have the “Republic.xml” file in your working directory; if not, please refer to the link above.
First, load the file from the working directory.
library(XML)
doc_R <- xmlParse("Republic.xml", useInternalNodes = TRUE, isURL = FALSE)
rootnode_R <- xmlRoot(doc_R)
Here’s what the XML looks like. The book number is stored in the “location” attribute of each “sentence” element; each sentence contains a number of “word” elements, and the lemmata are stored in the “lemma” elements (we need the “entry” attribute, which holds the lemma in Greek).
<body>
<sentence id="1" location="1.327">
<word form="kate/bhn" id="1">
<lemma id="53651" entry="καταβαίνω" POS="verb" TreeTagger="false" disambiguated="n/a">
<analysis morph="aor ind act 3rd pl (epic doric aeolic)"/>
<analysis morph="aor ind act 1st sg"/>
</lemma>
</word>
<word form="xqe\s" id="2">
<lemma id="113946" entry="χθές" POS="adverb" TreeTagger="false" disambiguated="n/a">
<analysis morph="indeclform (adverb)"/>
</lemma>
</word>
We need to subset sentences according to the book they belong to.
library(stringr)
all_sent <- getNodeSet(rootnode_R, "//text/body/sentence") ## 6389 elements
b1 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 1
b2 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 2
b3 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 3
b4 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 4
b5 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 5
b6 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 6
b7 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 7
b8 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 8
b9 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 9
b10 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 10
sent_b1 <- all_sent[b1]
sent_b2 <- all_sent[b2]
sent_b3 <- all_sent[b3]
sent_b4 <- all_sent[b4]
sent_b5 <- all_sent[b5]
sent_b6 <- all_sent[b6]
sent_b7 <- all_sent[b7]
sent_b8 <- all_sent[b8]
sent_b9 <- all_sent[b9]
sent_b10 <- all_sent[b10]
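As an aside, the book number can be extracted once and the sentence nodes split in a single step. This is only a sketch of an alternative (assuming split() handles the node set the same way as the bracket subsetting above); sent_by_book[["1"]] would then correspond to sent_b1, while the rest of this notebook keeps working with the ten sent_b* objects.
book_no <- str_extract(sapply(all_sent, xmlGetAttr, "location"), "\\d+(?=\\.)") ## book number of every sentence
sent_by_book <- split(all_sent, factor(book_no, levels = as.character(1:10))) ## one list element per book, in order 1-10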
Before we extract the information stored under the “lemma” elements in each sentence, we need to know the number of sentences in each book.
lb1 <- length(sent_b1)
lb2 <- length(sent_b2)
lb3 <- length(sent_b3)
lb4 <- length(sent_b4)
lb5 <- length(sent_b5)
lb6 <- length(sent_b6)
lb7 <- length(sent_b7)
lb8 <- length(sent_b8)
lb9 <- length(sent_b9)
lb10 <- length(sent_b10)
Book 1 is the longest (847 sentences), and book 9 is the shortest (525 sentences). There are 6389 sentences in the “Republic” in total.
c(lb1, lb2, lb3, lb4, lb5, lb6, lb7, lb8, lb9, lb10)
## [1] 847 607 670 637 817 578 572 571 525 565
We first write a loop that goes through all the sentences of book 1 and extracts all the lemmata.
text_b1 <- character()
for (n in 1:lb1) {
words <- sent_b1[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b1 <- c(text_b1, lemma)
}
}
length(text_b1)
## [1] 9469
text_b1 <- as.character(text_b1)
head(text_b1)
## [1] "καταβαίνω" "χθές" "εἰς" "Πειραιεύς" "μετά" "Γλαύκων"
write.table(text_b1, "Republic1.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
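The same result can be obtained without the nested loop: an XPath query collects every “entry” attribute under the selected sentences in one pass. This is a sketch, assuming the word/lemma nesting shown above holds throughout the file.
text_b1_alt <- unlist(lapply(sent_b1, xpathSApply, ".//lemma", xmlGetAttr, "entry")) ## all lemmata of book 1 in document order
identical(as.character(text_b1_alt), text_b1) ## should be TRUE if both approaches visit the same nodes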
text_b2 <- character()
for (n in 1:lb2) {
words <- sent_b2[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b2 <- c(text_b2, lemma)
}
}
length(text_b2)
## [1] 8920
text_b2 <- as.character(text_b2)
head(text_b2)
## [1] "ἐγώ" "μέν" "οὖν" "οὗτος" "εἶπον" "οἴομαι"
write.table(text_b2, "Republic2.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b3 <- character()
for (n in 1:lb3) {
words <- sent_b3[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b3 <- c(text_b3, lemma)
}
}
length(text_b3)
## [1] 10276
text_b3 <- as.character(text_b3)
head(text_b3)
## [1] "καί" "οἴομαι" "γε" "φημί" "ὀρθός" "ἐγώ"
write.table(text_b3, "Republic3.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
Note that 386a1-4 (Τὰ μὲν δὴ περὶ θεούς, ἦν δ’ ἐγώ, τοιαῦτ’ ἄττα, ὡς ἔοικεν, ἀκουστέον τε καὶ οὐκ ἀκουστέον εὐθὺς ἐκ παίδων τοῖς θεούς τε τιμήσουσιν καὶ γονέας τήν τε ἀλλήλων φιλίαν μὴ περὶ σμικροῦ ποιησομένοις.) is parsed as the last sentence of book 2. Book 3 in this file starts with 386a5 (Καὶ οἶμαί γ’, ἔφη, ὀρθῶς ἡμῖν φαίνεσθαι.).
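To verify this boundary programmatically, the “location” attributes of the two sentences in question can be compared (a quick optional check; output not shown).
xmlGetAttr(sent_b2[[lb2]], "location") ## last sentence assigned to book 2 (386a1-4)
xmlGetAttr(sent_b3[[1]], "location") ## first sentence assigned to book 3 (386a5)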
text_b4 <- character()
for (n in 1:lb4) {
words <- sent_b4[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b4 <- c(text_b4, lemma)
}
}
length(text_b4)
## [1] 8898
text_b4 <- as.character(text_b4)
head(text_b4)
## [1] "καί" "ὁ" "Ἀδείμαντος" "ὑπολαμβάνω" "τίς"
## [6] "οὖν"
write.table(text_b4, "Republic4.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b5 <- character()
for (n in 1:lb5) {
words <- sent_b5[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b5 <- c(text_b5, lemma)
}
}
length(text_b5)
## [1] 10239
text_b5 <- as.character(text_b5)
head(text_b5)
## [1] "ἀγαθός" "μέν" "τοίνυν" "ὁ" "τοιοῦτος" "πόλις"
write.table(text_b5, "Republic5.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b6 <- character()
for (n in 1:lb6) {
words <- sent_b6[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b6 <- c(text_b6, lemma)
}
}
length(text_b6)
## [1] 8718
text_b6 <- as.character(text_b6)
head(text_b6)
## [1] "ἕ" "μέν" "δή" "φιλόσοφος" "εἰμί" "δέ"
write.table(text_b6, "Republic6.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b7 <- character()
for (n in 1:lb7) {
words <- sent_b7[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b7 <- c(text_b7, lemma)
}
}
length(text_b7)
## [1] 8592
text_b7 <- as.character(text_b7)
head(text_b7)
## [1] "μετά" "οὗτος" "δή" "εἶπον" "ἀπεικάζω" "τοιοῦτος"
write.table(text_b7, "Republic7.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b8 <- character()
for (n in 1:lb8) {
words <- sent_b8[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b8 <- c(text_b8, lemma)
}
}
length(text_b8)
## [1] 8195
text_b8 <- as.character(text_b8)
head(text_b8)
## [1] "εἰμί" "οὗτος" "μέν" "δή" "ὁμολογέω" "ὦ"
write.table(text_b8, "Republic8.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b9 <- character()
for (n in 1:lb9) {
words <- sent_b9[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b9 <- c(text_b9, lemma)
}
}
length(text_b9)
## [1] 6858
text_b9 <- as.character(text_b9)
head(text_b9)
## [1] "αὐτός" "δή" "λοιπός" "εἰμί" "δέ" "ἐγώ"
write.table(text_b9, "Republic9.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b10 <- character()
for (n in 1:lb10) {
words <- sent_b10[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b10 <- c(text_b10, lemma)
}
}
length(text_b10)
## [1] 8713
text_b10 <- as.character(text_b10)
head(text_b10)
## [1] "καί" "μήν" "εἰμί" "δέ" "ἐγώ" "πολύς"
write.table(text_b10, "Republic10.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
We now have all books of the “Republic” saved as txt files in the working directory.
list.files(pattern = "^Rep")
## [1] "Republic.txt" "Republic.xml" "Republic1.txt" "Republic10.txt"
## [5] "Republic2.txt" "Republic3.txt" "Republic4.txt" "Republic5.txt"
## [9] "Republic6.txt" "Republic7.txt" "Republic8.txt" "Republic9.txt"
There is probably a more economical way to solve the problem, but the output is as expected (the head of each book above confirms this).
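One shorter variant might look like the sketch below, which loops over the ten books and reuses the XPath shortcut sketched after book 1; it should write the same ten files, but the explicit per-book code above is kept for clarity.
for (b in 1:10) {
sents <- get(paste0("sent_b", b)) ## sent_b1, ..., sent_b10 built above
lemmata <- unlist(lapply(sents, xpathSApply, ".//lemma", xmlGetAttr, "entry"))
write.table(as.character(lemmata), paste0("Republic", b, ".txt"), row.names = FALSE, col.names = FALSE, quote = FALSE)
}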
Please note that the original XML file contains some lemmatization errors; this issue will be addressed separately.
With questions, corrections, and suggestions please write to Olga Alieva (oalieva@hse.ru)