Parsing Plato’s Laws (Separate Books)

Synopsis

This file contains the code necessary for parsing Plato’s Laws from the Diorisis Ancient Greek Corpus (Vatri, Alessandro and Barbara McGillivray. 2018. ‘The Diorisis Ancient Greek Corpus’, Research Data Journal for the Humanities and Social Sciences 3, 55–65). With this code, each book is saved as a separate file; to parse the whole text, please refer to our Corpus_Platonicum (Diorisis): Lemmata Extraction. I assume here that you already have the “Laws.xml” file in your working directory; if not, please refer to the link above.

Loading the file

First load the file from the working directory

library(XML)
# Fail early with a clear message if the corpus file is not in the working
# directory.
stopifnot(file.exists("Laws.xml"))
# Parse the file into an internal document (needed for the XPath query run
# later with getNodeSet) and keep a handle on its root node.
doc_Lg <- xmlParse("Laws.xml", useInternalNodes = TRUE, isURL = FALSE)
rootnode_Lg <- xmlRoot(doc_Lg)

Here’s what the XML tree looks like. The number of the book is stored under the “location” attribute in “sentence”.

<body>
      <sentence id="1" location="1.624">
        <word form="qeo\s" id="1">
          <lemma id="48291" entry="θεός" POS="noun" TreeTagger="false" disambiguated="n/a">
            <analysis morph="masc nom sg"/>
          </lemma>

Subsetting the sentences

We need to subset sentences according to the book they belong to.

library(stringr)
# Every <sentence> node found under text/body.
all_sent <- getNodeSet(rootnode_Lg, "//text/body/sentence")

# Book number of each sentence: the digits that precede the "." in its
# "location" attribute (e.g. "1.624" -> 1). Extracted once and reused for all
# twelve masks instead of re-reading the attributes for every book.
book_no <- as.integer(str_extract(
  vapply(all_sent, xmlGetAttr, character(1), "location",
         default = NA_character_),
  "\\d+(?=\\.)"
))

# One logical mask per book. %in% never returns NA, so a sentence whose
# location is missing or has no "." is simply left out of every book.
b1 <- book_no %in% 1L
b2 <- book_no %in% 2L
b3 <- book_no %in% 3L
b4 <- book_no %in% 4L
b5 <- book_no %in% 5L
b6 <- book_no %in% 6L
b7 <- book_no %in% 7L
b8 <- book_no %in% 8L
b9 <- book_no %in% 9L
b10 <- book_no %in% 10L
b11 <- book_no %in% 11L
b12 <- book_no %in% 12L

# Split the sentence nodes by book. which() turns each logical mask into
# positions and drops NA entries, so a sentence whose book number could not be
# determined does not show up as a NULL element in the per-book lists.
sent_b1 <- all_sent[which(b1)]
sent_b2 <- all_sent[which(b2)]
sent_b3 <- all_sent[which(b3)]
sent_b4 <- all_sent[which(b4)]
sent_b5 <- all_sent[which(b5)]
sent_b6 <- all_sent[which(b6)]
sent_b7 <- all_sent[which(b7)]
sent_b8 <- all_sent[which(b8)]
sent_b9 <- all_sent[which(b9)]
sent_b10 <- all_sent[which(b10)]
sent_b11 <- all_sent[which(b11)]
sent_b12 <- all_sent[which(b12)]

Length of books

Before we extract the information stored under the “lemma” elements in each sentence, we need to know the number of sentences in each book.

# Number of sentence nodes in each book; these counts are the upper bounds of
# the per-book extraction loops.
lb1 <- length(sent_b1)
lb2 <- length(sent_b2)
lb3 <- length(sent_b3)
lb4 <- length(sent_b4)
lb5 <- length(sent_b5)
lb6 <- length(sent_b6)
lb7 <- length(sent_b7)
lb8 <- length(sent_b8)
lb9 <- length(sent_b9)
lb10 <- length(sent_b10)
lb11 <- length(sent_b11)
lb12 <- length(sent_b12)

Book 7 is the longest (597 sentences), book 5 is the shortest (272 sentences). There are 5018 sentences in the “Laws”.

# Print the sentence counts of books 1 to 12, in order.
c(lb1, lb2, lb3, lb4, lb5, lb6, lb7, lb8, lb9, lb10, lb11, lb12)

##  [1] 496 409 494 364 272 465 597 310 407 504 315 385

Book 1

We first want to write a loop that goes through all the sentences and extracts all the lemmata for book 1.

# Book 1: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b1 <- unlist(lapply(sent_b1, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b1)

## [1] 8169

# Guarantees a character vector even when no lemma was found at all.
text_b1 <- as.character(text_b1)
head(text_b1)

## [1] "θεός"     "ἠέ"       "τις"      "ἄνθρωπος" "σύ"       "ὦ"

# One lemma per line, without header, row names or quoting.
write.table(text_b1, "Laws1.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 2

# Book 2: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b2 <- unlist(lapply(sent_b2, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b2)

## [1] 7160

# Guarantees a character vector even when no lemma was found at all.
text_b2 <- as.character(text_b2)
head(text_b2)

## [1] "ὁ"     "δή"    "μετά"  "οὗτος" "ὡς"    "ἔοικα"

# One lemma per line, without header, row names or quoting.
write.table(text_b2, "Laws2.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 3

# Book 3: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b3 <- unlist(lapply(sent_b3, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b3)

## [1] 8296

# Guarantees a character vector even when no lemma was found at all.
text_b3 <- as.character(text_b3)
head(text_b3)

## [1] "οὗτος"    "μέν"      "οὖν"      "δή"       "ταύτῃ"    "πολιτεία"

# One lemma per line, without header, row names or quoting.
write.table(text_b3, "Laws3.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 4

# Book 4: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b4 <- unlist(lapply(sent_b4, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b4)

## [1] 6425

# Guarantees a character vector even when no lemma was found at all.
text_b4 <- as.character(text_b4)
head(text_b4)

## [1] "φέρω"       "δή"         "τίς"        "δέομαι"     "διανοέομαι"
## [6] "ποτέ"

# One lemma per line, without header, row names or quoting.
write.table(text_b4, "Laws4.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 5

# Book 5: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b5 <- unlist(lapply(sent_b5, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b5)

## [1] 7066

# Guarantees a character vector even when no lemma was found at all.
text_b5 <- as.character(text_b5)
head(text_b5)

## [1] "ἀκούω" "δή"    "πᾶς"   "ὅς"    "νυνδί" "ὁ"

# One lemma per line, without header, row names or quoting.
write.table(text_b5, "Laws5.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 6

# Book 6: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b6 <- unlist(lapply(sent_b6, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b6)

## [1] 10935

# Guarantees a character vector even when no lemma was found at all.
text_b6 <- as.character(text_b6)
head(text_b6)

## [1] "ἀλλά" "μήν"  "μετά" "γε"   "πᾶς"  "ὁ"

# One lemma per line, without header, row names or quoting.
write.table(text_b6, "Laws6.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 7

# Book 7: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b7 <- unlist(lapply(sent_b7, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b7)

## [1] 12031

# Guarantees a character vector even when no lemma was found at all.
text_b7 <- as.character(text_b7)
head(text_b7)

## [1] "γίγνομαι" "δέ"       "παῖς"     "ἄρσην"    "καί"      "θῆλυς"

# One lemma per line, without header, row names or quoting.
write.table(text_b7, "Laws7.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 8

# Book 8: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b8 <- unlist(lapply(sent_b8, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b8)

## [1] 7388

# Guarantees a character vector even when no lemma was found at all.
text_b8 <- as.character(text_b8)
head(text_b8)

## [1] "οὗτος" "μήν"   "ἔχω"   "εἰμί"  "τάσσω" "μέν"

# One lemma per line, without header, row names or quoting.
write.table(text_b8, "Laws8.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 9

# Book 9: collect the "entry" attribute of every <lemma> under every <word> of
# every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b9 <- unlist(lapply(sent_b9, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b9)

## [1] 9697

# Guarantees a character vector even when no lemma was found at all.
text_b9 <- as.character(text_b9)
head(text_b9)

## [1] "δίκη"      "δή"        "ὁ"         "μετά"      "οὗτος"     "ἀκόλουθος"

# One lemma per line, without header, row names or quoting.
write.table(text_b9, "Laws9.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 10

# Book 10: collect the "entry" attribute of every <lemma> under every <word>
# of every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b10 <- unlist(lapply(sent_b10, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b10)

## [1] 8424

# Guarantees a character vector even when no lemma was found at all.
text_b10 <- as.character(text_b10)
head(text_b10)

## [1] "μετά"  "δέ"    "ὁ"     "αἰκία" "περί"  "πᾶς"

# One lemma per line, without header, row names or quoting.
write.table(text_b10, "Laws10.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 11

# Book 11: collect the "entry" attribute of every <lemma> under every <word>
# of every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b11 <- unlist(lapply(sent_b11, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
# NOTE(review): the recorded count below equals the one printed for Book 10,
# although the two books differ widely in sentence count - confirm on re-run.
length(text_b11)

## [1] 8424

# Guarantees a character vector even when no lemma was found at all.
text_b11 <- as.character(text_b11)
head(text_b11)

## [1] "ὁ"          "δή"         "μετά"       "οὗτος"      "ἀίω"       
## [6] "συμβόλαιον"

# One lemma per line, without header, row names or quoting.
write.table(text_b11, "Laws11.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

Book 12

# Book 12: collect the "entry" attribute of every <lemma> under every <word>
# of every sentence. Iterating over the node lists themselves (instead of 1:n
# index ranges) copes with an empty book or a sentence without <word>
# children, and avoids re-copying the growing result on every append.
text_b12 <- unlist(lapply(sent_b12, function(sentence) {
  lapply(sentence["word"], function(word) {
    # A <lemma> without "entry" yields NA rather than the string "NULL".
    vapply(word["lemma"], xmlGetAttr, character(1), "entry",
           default = NA_character_, USE.NAMES = FALSE)
  })
}), use.names = FALSE)
length(text_b12)

## [1] 9178

# Guarantees a character vector even when no lemma was found at all.
text_b12 <- as.character(text_b12)
head(text_b12)

## [1] "ἐάν"        "ὡς"         "πρεσβευτής" "τις"        "ἠέ"        
## [6] "κῆρυξ"

# One lemma per line, without header, row names or quoting.
write.table(text_b12, "Laws12.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

List of files

We now have all books of the “Laws” saved as txt files in the working directory.

# List the files in the working directory whose names start with "Laws".
list.files(pattern = "^Laws")

##  [1] "Laws.txt"   "Laws.xml"   "Laws1.txt"  "Laws10.txt" "Laws11.txt"
##  [6] "Laws12.txt" "Laws2.txt"  "Laws3.txt"  "Laws4.txt"  "Laws5.txt" 
## [11] "Laws6.txt"  "Laws7.txt"  "Laws8.txt"  "Laws9.txt"

P.S.

There is probably a more economical way to solve the problem, but the output is correct (as the head of each book above confirms) and can be used for further analysis.

P.P.S.

Please note that the original file may contain lemmatization errors, but this problem is to be addressed separately.

With questions, corrections, and suggestions please write to Olga Alieva (oalieva@hse.ru)