Corpus_Platonicum (Diorisis): Lemmata Extraction

Synopsis

This code documents the process of downloading and parsing Plato’s dialogues from the Diorisis Ancient Greek Corpus (Vatri, Alessandro and Barbara McGillivray. 2018. ‘The Diorisis Ancient Greek Corpus’, Research Data Journal for the Humanities and Social Sciences 3, 55–65). We shall extract the lemmata from the 36 works of Plato included in the corpus and save them as separate .txt files in the working directory.

Downloading the Corpus

# Download the Diorisis archive and extract only the files for Plato.
url <- "https://figshare.com/ndownloader/files/11296247"
file_name <- "Diorisis.zip"
# mode = "wb" forces a binary transfer. The URL has no ".zip" suffix, so R
# cannot infer this on its own, and a text-mode download corrupts the
# archive on Windows.
download.file(url, file_name, mode = "wb")
# Table describing every member of the archive; nothing is extracted yet.
diorisis_list <- unzip(file_name, files = NULL, list = TRUE)
# Keep the members whose names contain the literal text "(0059)", the
# author number that follows "Plato" in the corpus file names.
# fixed = TRUE makes the parentheses literal instead of a regex group.
plato_list <- grep("(0059)", diorisis_list$Name, fixed = TRUE, value = TRUE)
unzip(file_name, files = plato_list)

Here is the list of the files in our working directory; let’s rename them for convenience.

# Show everything currently present in the working directory.
list.files(path = ".")

##  [1] "Diorisis.zip"                          
##  [2] "Plato (0059) - Alcibiades 1 (013).xml" 
##  [3] "Plato (0059) - Alcibiades 2 (014).xml" 
##  [4] "Plato (0059) - Apology (002).xml"      
##  [5] "Plato (0059) - Charmides (018).xml"    
##  [6] "Plato (0059) - Cleitophon (029).xml"   
##  [7] "Plato (0059) - Cratylus (005).xml"     
##  [8] "Plato (0059) - Critias (032).xml"      
##  [9] "Plato (0059) - Crito (003).xml"        
## [10] "Plato (0059) - Epinomis (035).xml"     
## [11] "Plato (0059) - Epistles (036).xml"     
## [12] "Plato (0059) - Euthydemus (021).xml"   
## [13] "Plato (0059) - Euthyphro (001).xml"    
## [14] "Plato (0059) - Gorgias (023).xml"      
## [15] "Plato (0059) - Hipparchus (015).xml"   
## [16] "Plato (0059) - Hippias Major (025).xml"
## [17] "Plato (0059) - Hippias Minor (026).xml"
## [18] "Plato (0059) - Ion (027).xml"          
## [19] "Plato (0059) - Laches (019).xml"       
## [20] "Plato (0059) - Laws (034).xml"         
## [21] "Plato (0059) - Lovers (016).xml"       
## [22] "Plato (0059) - Lysis (020).xml"        
## [23] "Plato (0059) - Menexenus (028).xml"    
## [24] "Plato (0059) - Meno (024).xml"         
## [25] "Plato (0059) - Minos (033).xml"        
## [26] "Plato (0059) - Parmenides (009).xml"   
## [27] "Plato (0059) - Phaedo (004).xml"       
## [28] "Plato (0059) - Phaedrus (012).xml"     
## [29] "Plato (0059) - Philebus (010).xml"     
## [30] "Plato (0059) - Protagoras (022).xml"   
## [31] "Plato (0059) - Republic (030).xml"     
## [32] "Plato (0059) - Sophist (007).xml"      
## [33] "Plato (0059) - Statesman (008).xml"    
## [34] "Plato (0059) - Symposium (011).xml"    
## [35] "Plato (0059) - Theaetetus (006).xml"   
## [36] "Plato (0059) - Theages (017).xml"      
## [37] "Plato (0059) - Timaeus (031).xml"      
## [38] "Plato_Lemmata.Rmd"

library(stringr)
library(filesstrings)
# Remove the spaces from the names of the files in the working directory
# ("."), e.g. "Plato (0059) - Apology (002).xml" becomes
# "Plato(0059)-Apology(002).xml".
remove_filename_spaces(".")

## 36 files required renaming and this was done successfully.

# Record the file names as they are now, before the renaming step.
file_names_old <- list.files(path = ".")
# Preview the first six names.
head(file_names_old, n = 6L)

## [1] "Diorisis.zip"                     "Plato_Lemmata.Rmd"               
## [3] "Plato(0059)-Alcibiades1(013).xml" "Plato(0059)-Alcibiades2(014).xml"
## [5] "Plato(0059)-Apology(002).xml"     "Plato(0059)-Charmides(018).xml"

# Shorten each name to the bare title of the work, e.g.
# "Plato(0059)-Apology(002).xml" becomes "Apology.xml".
# The parentheses and the hyphen are matched literally (not with the "."
# wildcard), so no unrelated characters can be removed by accident.
# Names that contain neither part are left exactly as they are.
file_names_new <- str_remove_all(file_names_old, "Plato\\(0059\\)-")
# Drop the three-digit work number in parentheses, e.g. "(002)".
file_names_new <- str_remove_all(file_names_new, "\\(\\d{3}\\)")
# Returns one logical value per file: TRUE where the rename succeeded.
file.rename(from = file_names_old, to = file_names_new)

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

# Check the result of the renaming: list the working directory again.
list.files(path = ".")

##  [1] "Alcibiades1.xml"   "Alcibiades2.xml"   "Apology.xml"      
##  [4] "Charmides.xml"     "Cleitophon.xml"    "Cratylus.xml"     
##  [7] "Critias.xml"       "Crito.xml"         "Diorisis.zip"     
## [10] "Epinomis.xml"      "Epistles.xml"      "Euthydemus.xml"   
## [13] "Euthyphro.xml"     "Gorgias.xml"       "Hipparchus.xml"   
## [16] "HippiasMajor.xml"  "HippiasMinor.xml"  "Ion.xml"          
## [19] "Laches.xml"        "Laws.xml"          "Lovers.xml"       
## [22] "Lysis.xml"         "Menexenus.xml"     "Meno.xml"         
## [25] "Minos.xml"         "Parmenides.xml"    "Phaedo.xml"       
## [28] "Phaedrus.xml"      "Philebus.xml"      "Plato_Lemmata.Rmd"
## [31] "Protagoras.xml"    "Republic.xml"      "Sophist.xml"      
## [34] "Statesman.xml"     "Symposium.xml"     "Theaetetus.xml"   
## [37] "Theages.xml"       "Timaeus.xml"

Parsing the Corpus

Here is what the XML tree looks like.

 <text>
    <body>
      <sentence id="1" location="11">
        <word form="o(/ra" id="1">
          <lemma id="74735" entry="ὁράω" POS="verb" TreeTagger="false" disambiguated="n/a">
            <analysis morph="pres imperat act 2nd sg (epic)"/>
            <analysis morph="imperf ind act 3rd sg (homeric ionic)"/>
          </lemma>

library(XML)
# All XML files in the working directory. The dot is escaped and the
# pattern is anchored, so only names that END in ".xml" are selected.
filenames <- list.files(pattern = "\\.xml$")

# For every XML file, collect the `entry` attribute of each <lemma> node
# and write the values, one per line, to a .txt file with the same base name.
for (xml_file in filenames) {
  doc <- xmlTreeParse(
    file.path(".", xml_file),
    useInternalNodes = TRUE,
    isURL = FALSE
  )
  rootnode <- xmlRoot(doc)
  text <- xpathSApply(
    rootnode,
    "//TEI.2/text/body/sentence/word/lemma",
    xmlGetAttr,
    "entry"
  )
  text <- as.character(text)
  # Swap only the trailing extension; an unanchored ".xml" pattern would
  # also delete any other "<char>xml" sequence inside the file name.
  outname <- sub("\\.xml$", ".txt", xml_file)
  write.table(
    text,
    outname,
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE
  )
}

Our working directory should now have the .txt files for all the dialogues. To check, let’s list files once again.

# List the generated lemma files. The pattern is a regular expression, so
# the dot is escaped and the match is anchored to the end of the name.
list.files(pattern = "\\.txt$")

##  [1] "Alcibiades1.txt"  "Alcibiades2.txt"  "Apology.txt"      "Charmides.txt"   
##  [5] "Cleitophon.txt"   "Cratylus.txt"     "Critias.txt"      "Crito.txt"       
##  [9] "Epinomis.txt"     "Epistles.txt"     "Euthydemus.txt"   "Euthyphro.txt"   
## [13] "Gorgias.txt"      "Hipparchus.txt"   "HippiasMajor.txt" "HippiasMinor.txt"
## [17] "Ion.txt"          "Laches.txt"       "Laws.txt"         "Lovers.txt"      
## [21] "Lysis.txt"        "Menexenus.txt"    "Meno.txt"         "Minos.txt"       
## [25] "Parmenides.txt"   "Phaedo.txt"       "Phaedrus.txt"     "Philebus.txt"    
## [29] "Protagoras.txt"   "Republic.txt"     "Sophist.txt"      "Statesman.txt"   
## [33] "Symposium.txt"    "Theaetetus.txt"   "Theages.txt"      "Timaeus.txt"

Please send questions, corrections, and suggestions to Olga Alieva (oalieva@hse.ru).

Corpus_Platonicum (Diorisis): Lemmata Extraction

locusclassicus

17 10 2021

Synopsis

Downloading the Corpus

Parsing the Corpus