Synopsis

This file contains the code necessary for parsing Plato’s Laws from the Diorisis Ancient Greek Corpus (Vatri, Alessandro and Barbara McGillivray. 2018. ‘The Diorisis Ancient Greek Corpus’, Research Data Journal for the Humanities and Social Sciences 3, 55–65). With this code, each book is saved as a separate file; to parse the whole text, please refer to our Corpus_Platonicum (Diorisis): Lemmata Extraction. I assume here that you already have the “Laws.xml” file in your working directory; if not, please refer to the link above.

Loading the file

First, load the file from the working directory.

library(XML)
# Parse "Laws.xml" from the working directory into an internal (C-level)
# document; `isURL = FALSE` states that the argument is a local file name.
doc_Lg <- xmlParse("Laws.xml", useInternalNodes = TRUE, isURL = FALSE)
# Top-level element of the parsed document; the XPath queries start from here.
rootnode_Lg <- xmlRoot(doc_Lg)

Here’s what the XML tree looks like. The number of the book is stored in the “location” attribute of each “sentence” element, as the digits before the dot.

<body>
      <sentence id="1" location="1.624">
        <word form="qeo\s" id="1">
          <lemma id="48291" entry="θεός" POS="noun" TreeTagger="false" disambiguated="n/a">
            <analysis morph="masc nom sg"/>
          </lemma>

Subsetting the sentences

We need to subset sentences according to the book they belong to.

library(stringr)
# All <sentence> nodes of the text body.
all_sent <- getNodeSet(rootnode_Lg, "//text/body/sentence")

# Book number of every sentence: the first run of digits in the `location`
# attribute that is directly followed by a "." (e.g. "1.624" -> 1).
# The attribute is read and parsed once and reused for all twelve masks.
# A sentence without a `location` attribute gets NA.
sent_location <- vapply(
  all_sent, xmlGetAttr, character(1), "location",
  default = NA_character_
)
sent_book <- as.integer(str_extract(sent_location, "\\d+(?=\\.)"))

# One logical mask per book, parallel to `all_sent`.
b1 <- sent_book == 1L
b2 <- sent_book == 2L
b3 <- sent_book == 3L
b4 <- sent_book == 4L
b5 <- sent_book == 5L
b6 <- sent_book == 6L
b7 <- sent_book == 7L
b8 <- sent_book == 8L
b9 <- sent_book == 9L
b10 <- sent_book == 10L
b11 <- sent_book == 11L
b12 <- sent_book == 12L

# The sentences of each book.
sent_b1 <- all_sent[b1]
sent_b2 <- all_sent[b2]
sent_b3 <- all_sent[b3]
sent_b4 <- all_sent[b4]
sent_b5 <- all_sent[b5]
sent_b6 <- all_sent[b6]
sent_b7 <- all_sent[b7]
sent_b8 <- all_sent[b8]
sent_b9 <- all_sent[b9]
sent_b10 <- all_sent[b10]
sent_b11 <- all_sent[b11]
sent_b12 <- all_sent[b12]

Length of books

Before we extract the information stored under the “lemma” elements in each sentence, we need to know the number of sentences in each book.

# Sentence count of each book, gathered in book order and then unpacked into
# the individual lb* variables.
sentence_counts <- vapply(
  list(
    sent_b1, sent_b2, sent_b3, sent_b4, sent_b5, sent_b6,
    sent_b7, sent_b8, sent_b9, sent_b10, sent_b11, sent_b12
  ),
  length,
  integer(1)
)
lb1 <- sentence_counts[[1]]
lb2 <- sentence_counts[[2]]
lb3 <- sentence_counts[[3]]
lb4 <- sentence_counts[[4]]
lb5 <- sentence_counts[[5]]
lb6 <- sentence_counts[[6]]
lb7 <- sentence_counts[[7]]
lb8 <- sentence_counts[[8]]
lb9 <- sentence_counts[[9]]
lb10 <- sentence_counts[[10]]
lb11 <- sentence_counts[[11]]
lb12 <- sentence_counts[[12]]

Book 7 is the longest (597 sentences), book 5 is the shortest (272 sentences). There are 5018 sentences in the “Laws”.

# Print the sentence counts of books 1-12, in book order.
c(lb1, lb2, lb3, lb4, lb5, lb6, lb7, lb8, lb9, lb10, lb11, lb12)
##  [1] 496 409 494 364 272 465 597 310 407 504 315 385

Book 1

We first want to write a loop that goes through all the sentences of book 1 and extracts all their lemmata.

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-1 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b1 <- lapply(sent_b1, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b1 <- do.call(c, entries_b1)
length(text_b1)
## [1] 8169
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b1 <- as.character(text_b1)
head(text_b1)
## [1] "θεός"     "ἠέ"       "τις"      "ἄνθρωπος" "σύ"       "ὦ"
# One lemma per line, no header, no quoting.
write.table(text_b1, "Laws1.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 2

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-2 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b2 <- lapply(sent_b2, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b2 <- do.call(c, entries_b2)
length(text_b2)
## [1] 7160
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b2 <- as.character(text_b2)
head(text_b2)
## [1] "ὁ"     "δή"    "μετά"  "οὗτος" "ὡς"    "ἔοικα"
# One lemma per line, no header, no quoting.
write.table(text_b2, "Laws2.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 3

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-3 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b3 <- lapply(sent_b3, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b3 <- do.call(c, entries_b3)
length(text_b3)
## [1] 8296
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b3 <- as.character(text_b3)
head(text_b3)
## [1] "οὗτος"    "μέν"      "οὖν"      "δή"       "ταύτῃ"    "πολιτεία"
# One lemma per line, no header, no quoting.
write.table(text_b3, "Laws3.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 4

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-4 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b4 <- lapply(sent_b4, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b4 <- do.call(c, entries_b4)
length(text_b4)
## [1] 6425
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b4 <- as.character(text_b4)
head(text_b4)
## [1] "φέρω"       "δή"         "τίς"        "δέομαι"     "διανοέομαι"
## [6] "ποτέ"
# One lemma per line, no header, no quoting.
write.table(text_b4, "Laws4.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 5

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-5 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b5 <- lapply(sent_b5, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b5 <- do.call(c, entries_b5)
length(text_b5)
## [1] 7066
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b5 <- as.character(text_b5)
head(text_b5)
## [1] "ἀκούω" "δή"    "πᾶς"   "ὅς"    "νυνδί" "ὁ"
# One lemma per line, no header, no quoting.
write.table(text_b5, "Laws5.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 6

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-6 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b6 <- lapply(sent_b6, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b6 <- do.call(c, entries_b6)
length(text_b6)
## [1] 10935
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b6 <- as.character(text_b6)
head(text_b6)
## [1] "ἀλλά" "μήν"  "μετά" "γε"   "πᾶς"  "ὁ"
# One lemma per line, no header, no quoting.
write.table(text_b6, "Laws6.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 7

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-7 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b7 <- lapply(sent_b7, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b7 <- do.call(c, entries_b7)
length(text_b7)
## [1] 12031
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b7 <- as.character(text_b7)
head(text_b7)
## [1] "γίγνομαι" "δέ"       "παῖς"     "ἄρσην"    "καί"      "θῆλυς"
# One lemma per line, no header, no quoting.
write.table(text_b7, "Laws7.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 8

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-8 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b8 <- lapply(sent_b8, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b8 <- do.call(c, entries_b8)
length(text_b8)
## [1] 7388
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b8 <- as.character(text_b8)
head(text_b8)
## [1] "οὗτος" "μήν"   "ἔχω"   "εἰμί"  "τάσσω" "μέν"
# One lemma per line, no header, no quoting.
write.table(text_b8, "Laws8.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 9

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-9 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b9 <- lapply(sent_b9, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b9 <- do.call(c, entries_b9)
length(text_b9)
## [1] 9697
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b9 <- as.character(text_b9)
head(text_b9)
## [1] "δίκη"      "δή"        "ὁ"         "μετά"      "οὗτος"     "ἀκόλουθος"
# One lemma per line, no header, no quoting.
write.table(text_b9, "Laws9.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 10

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-10 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b10 <- lapply(sent_b10, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b10 <- do.call(c, entries_b10)
length(text_b10)
## [1] 8424
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b10 <- as.character(text_b10)
head(text_b10)
## [1] "μετά"  "δέ"    "ὁ"     "αἰκία" "περί"  "πᾶς"
# One lemma per line, no header, no quoting.
write.table(text_b10, "Laws10.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 11

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-11 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b11 <- lapply(sent_b11, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b11 <- do.call(c, entries_b11)
# NOTE(review): the count recorded below is identical to the one recorded for
# book 10 -- re-run to confirm it was not carried over by mistake.
length(text_b11)
## [1] 8424
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b11 <- as.character(text_b11)
head(text_b11)
## [1] "ὁ"          "δή"         "μετά"       "οὗτος"      "ἀίω"       
## [6] "συμβόλαιον"
# One lemma per line, no header, no quoting.
write.table(text_b11, "Laws11.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 12

# Gather the `entry` attribute of every <lemma> child of every <word> child of
# each book-12 sentence, in document order. One XPath query per sentence avoids
# growing the result with c() inside a nested loop, and an empty sentence list
# or a sentence without <word> children simply contributes nothing.
entries_b12 <- lapply(sent_b12, function(sentence) {
  xpathApply(sentence, "./word/lemma", xmlGetAttr, "entry")
})
# Concatenate the per-sentence lists into one flat list. A <lemma> without an
# `entry` attribute yields NULL and stays in the list, so the count below
# covers every <lemma> node.
text_b12 <- do.call(c, entries_b12)
length(text_b12)
## [1] 9178
# Convert the list to a plain character vector; a NULL element becomes the
# literal string "NULL".
# NOTE(review): confirm whether such placeholders should be dropped instead.
text_b12 <- as.character(text_b12)
head(text_b12)
## [1] "ἐάν"        "ὡς"         "πρεσβευτής" "τις"        "ἠέ"        
## [6] "κῆρυξ"
# One lemma per line, no header, no quoting.
write.table(text_b12, "Laws12.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

List of files

We now have all books of the “Laws” saved as txt files in the working directory.

# List the files in the working directory whose names start with "Laws".
list.files(pattern = "^Laws")
##  [1] "Laws.txt"   "Laws.xml"   "Laws1.txt"  "Laws10.txt" "Laws11.txt"
##  [6] "Laws12.txt" "Laws2.txt"  "Laws3.txt"  "Laws4.txt"  "Laws5.txt" 
## [11] "Laws6.txt"  "Laws7.txt"  "Laws8.txt"  "Laws9.txt"

P.S.

There is probably a more economical way to solve the problem, but the output is correct (as the head of each book above confirms) and can be used for further analysis.

P.P.S.

Please note that the original file may contain lemmatization errors, but this problem is to be addressed separately.

Please send questions, corrections, and suggestions to Olga Alieva.