This file contains the code necessary for parsing Plato’s Republic from the Diorisis Ancient Greek Corpus (Vatri, Alessandro and Barbara McGillivray. 2018. ‘The Diorisis Ancient Greek Corpus’, Research Data Journal for the Humanities and Social Sciences 3, 55–65). With this code, each book is saved as a separate file; to parse the whole text at once, please refer to our Corpus_Platonicum (Diorisis): Lemmata Extraction. Here I assume that you already have the “Republic.xml” file in your working directory; if not, please refer to the link above.
First, load the file from the working directory.
library(XML)
doc_R <- xmlParse("Republic.xml", useInternalNodes = TRUE, isURL = FALSE)
rootnode_R <- xmlRoot(doc_R)
Here’s what the XML looks like. The book number is stored in the “location” attribute of each “sentence” element; each sentence contains a number of “word” elements, and the lemmata are stored in the “lemma” elements (we need the “entry” attribute, which holds the lemma in Greek).
<body>
<sentence id="1" location="1.327">
<word form="kate/bhn" id="1">
<lemma id="53651" entry="καταβαίνω" POS="verb" TreeTagger="false" disambiguated="n/a">
<analysis morph="aor ind act 3rd pl (epic doric aeolic)"/>
<analysis morph="aor ind act 1st sg"/>
</lemma>
</word>
<word form="xqe\s" id="2">
<lemma id="113946" entry="χθές" POS="adverb" TreeTagger="false" disambiguated="n/a">
<analysis morph="indeclform (adverb)"/>
</lemma>
</word>
We need to subset sentences according to the book they belong to.
library(stringr)
all_sent <- getNodeSet(rootnode_R, "//text/body/sentence") ## 6389 elements
b1 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 1
b2 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 2
b3 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 3
b4 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 4
b5 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 5
b6 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 6
b7 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 7
b8 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 8
b9 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 9
b10 <- str_extract((sapply(all_sent, xmlGetAttr, 'location')), "\\d+(?=\\.)") == 10
sent_b1 <- all_sent[b1]
sent_b2 <- all_sent[b2]
sent_b3 <- all_sent[b3]
sent_b4 <- all_sent[b4]
sent_b5 <- all_sent[b5]
sent_b6 <- all_sent[b6]
sent_b7 <- all_sent[b7]
sent_b8 <- all_sent[b8]
sent_b9 <- all_sent[b9]
sent_b10 <- all_sent[b10]
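As an aside, the book number can be extracted once and the sentence nodes split in a single step. This is only a sketch of an alternative (assuming split() handles the node set the same way as the bracket subsetting above); sent_by_book[["1"]] would then correspond to sent_b1, while the rest of this notebook keeps working with the ten sent_b* objects.
book_no <- str_extract(sapply(all_sent, xmlGetAttr, "location"), "\\d+(?=\\.)") ## book number of every sentence
sent_by_book <- split(all_sent, factor(book_no, levels = as.character(1:10))) ## one list element per book, in order 1-10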
Before we extract the information stored under the “lemma” elements in each sentence, we need to know the number of sentences in each book.
lb1 <- length(sent_b1)
lb2 <- length(sent_b2)
lb3 <- length(sent_b3)
lb4 <- length(sent_b4)
lb5 <- length(sent_b5)
lb6 <- length(sent_b6)
lb7 <- length(sent_b7)
lb8 <- length(sent_b8)
lb9 <- length(sent_b9)
lb10 <- length(sent_b10)
Book 1 is the longest (847 sentences), and book 9 is the shortest (525 sentences). There are 6389 sentences in the “Republic” in total.
c(lb1, lb2, lb3, lb4, lb5, lb6, lb7, lb8, lb9, lb10)
## [1] 847 607 670 637 817 578 572 571 525 565
We first write a loop that goes through all the sentences of book 1 and extracts all the lemmata.
text_b1 <- character()
for (n in 1:lb1) {
words <- sent_b1[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b1 <- c(text_b1, lemma)
}
}
length(text_b1)
## [1] 9469
text_b1 <- as.character(text_b1)
head(text_b1)
## [1] "καταβαίνω" "χθές" "εἰς" "Πειραιεύς" "μετά" "Γλαύκων"
write.table(text_b1, "Republic1.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
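The same result can be obtained without the nested loop: an XPath query collects every “entry” attribute under the selected sentences in one pass. This is a sketch, assuming the word/lemma nesting shown above holds throughout the file.
text_b1_alt <- unlist(lapply(sent_b1, xpathSApply, ".//lemma", xmlGetAttr, "entry")) ## all lemmata of book 1 in document order
identical(as.character(text_b1_alt), text_b1) ## should be TRUE if both approaches visit the same nodes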
text_b2 <- character()
for (n in 1:lb2) {
words <- sent_b2[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b2 <- c(text_b2, lemma)
}
}
length(text_b2)
## [1] 8920
text_b2 <- as.character(text_b2)
head(text_b2)
## [1] "ἐγώ" "μέν" "οὖν" "οὗτος" "εἶπον" "οἴομαι"
write.table(text_b2, "Republic2.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b3 <- character()
for (n in 1:lb3) {
words <- sent_b3[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b3 <- c(text_b3, lemma)
}
}
length(text_b3)
## [1] 10276
text_b3 <- as.character(text_b3)
head(text_b3)
## [1] "καί" "οἴομαι" "γε" "φημί" "ὀρθός" "ἐγώ"
write.table(text_b3, "Republic3.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
Note that 386a1-4 (Τὰ μὲν δὴ περὶ θεούς, ἦν δ’ ἐγώ, τοιαῦτ’ ἄττα, ὡς ἔοικεν, ἀκουστέον τε καὶ οὐκ ἀκουστέον εὐθὺς ἐκ παίδων τοῖς θεούς τε τιμήσουσιν καὶ γονέας τήν τε ἀλλήλων φιλίαν μὴ περὶ σμικροῦ ποιησομένοις.) is parsed as the last sentence of book 2. Book 3 in this file starts with 386a5 (Καὶ οἶμαί γ’, ἔφη, ὀρθῶς ἡμῖν φαίνεσθαι.).
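To verify this boundary programmatically, the “location” attributes of the two sentences in question can be compared (a quick optional check; output not shown).
xmlGetAttr(sent_b2[[lb2]], "location") ## last sentence assigned to book 2 (386a1-4)
xmlGetAttr(sent_b3[[1]], "location") ## first sentence assigned to book 3 (386a5)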
text_b4 <- character()
for (n in 1:lb4) {
words <- sent_b4[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b4 <- c(text_b4, lemma)
}
}
length(text_b4)
## [1] 8898
text_b4 <- as.character(text_b4)
head(text_b4)
## [1] "καί" "ὁ" "Ἀδείμαντος" "ὑπολαμβάνω" "τίς"
## [6] "οὖν"
write.table(text_b4, "Republic4.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b5 <- character()
for (n in 1:lb5) {
words <- sent_b5[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b5 <- c(text_b5, lemma)
}
}
length(text_b5)
## [1] 10239
text_b5 <- as.character(text_b5)
head(text_b5)
## [1] "ἀγαθός" "μέν" "τοίνυν" "ὁ" "τοιοῦτος" "πόλις"
write.table(text_b5, "Republic5.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b6 <- character()
for (n in 1:lb6) {
words <- sent_b6[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b6 <- c(text_b6, lemma)
}
}
length(text_b6)
## [1] 8718
text_b6 <- as.character(text_b6)
head(text_b6)
## [1] "ἕ" "μέν" "δή" "φιλόσοφος" "εἰμί" "δέ"
write.table(text_b6, "Republic6.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b7 <- character()
for (n in 1:lb7) {
words <- sent_b7[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b7 <- c(text_b7, lemma)
}
}
length(text_b7)
## [1] 8592
text_b7 <- as.character(text_b7)
head(text_b7)
## [1] "μετά" "οὗτος" "δή" "εἶπον" "ἀπεικάζω" "τοιοῦτος"
write.table(text_b7, "Republic7.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b8 <- character()
for (n in 1:lb8) {
words <- sent_b8[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b8 <- c(text_b8, lemma)
}
}
length(text_b8)
## [1] 8195
text_b8 <- as.character(text_b8)
head(text_b8)
## [1] "εἰμί" "οὗτος" "μέν" "δή" "ὁμολογέω" "ὦ"
write.table(text_b8, "Republic8.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b9 <- character()
for (n in 1:lb9) {
words <- sent_b9[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b9 <- c(text_b9, lemma)
}
}
length(text_b9)
## [1] 6858
text_b9 <- as.character(text_b9)
head(text_b9)
## [1] "αὐτός" "δή" "λοιπός" "εἰμί" "δέ" "ἐγώ"
write.table(text_b9, "Republic9.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
text_b10 <- character()
for (n in 1:lb10) {
words <- sent_b10[[n]]["word"]
sl <- length(words)
for (y in 1:sl) {
lemma <- sapply(words[[y]]["lemma"], xmlGetAttr, "entry")
text_b10 <- c(text_b10, lemma)
}
}
length(text_b10)
## [1] 8713
text_b10 <- as.character(text_b10)
head(text_b10)
## [1] "καί" "μήν" "εἰμί" "δέ" "ἐγώ" "πολύς"
write.table(text_b10, "Republic10.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
We now have all books of the “Republic” saved as txt files in the working directory.
list.files(pattern = "^Rep")
## [1] "Republic.txt" "Republic.xml" "Republic1.txt" "Republic10.txt"
## [5] "Republic2.txt" "Republic3.txt" "Republic4.txt" "Republic5.txt"
## [9] "Republic6.txt" "Republic7.txt" "Republic8.txt" "Republic9.txt"
There is probably a more economical way to solve the problem, but the output is as expected (the head of each book above confirms this).
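One shorter variant might look like the sketch below, which loops over the ten books and reuses the XPath shortcut sketched after book 1; it should write the same ten files, but the explicit per-book code above is kept for clarity.
for (b in 1:10) {
sents <- get(paste0("sent_b", b)) ## sent_b1, ..., sent_b10 built above
lemmata <- unlist(lapply(sents, xpathSApply, ".//lemma", xmlGetAttr, "entry"))
write.table(as.character(lemmata), paste0("Republic", b, ".txt"), row.names = FALSE, col.names = FALSE, quote = FALSE)
}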
Please note that the original XML file contains some lemmatization errors; this issue will be addressed separately.
With questions, corrections, and suggestions please write to Olga Alieva (oalieva@hse.ru)