This file imports a collection of text files sourced from Gutenberg into a corpus object; the exception is Zofloya, or The Moor (1806), by Charlotte Dacre, which was compiled from hand-scraped HathiTrust textual data (note 1: https://catalog.hathitrust.org/Record/100207374). Information about how to complete a basic text analysis workflow in Quanteda is available online (note 2: https://quanteda.io/articles/design.html), as is documentation of the readtext package (note 3: https://cran.r-project.org/web/packages/readtext/readtext.pdf).
---
title: "Corpus Development with Quanteda"
author: "Tonya Howe"
output:
  tufte::tufte_handout: default
  tufte::tufte_html: default
---
# Unpack the corpus archive and read every .txt file under gothic_corpus/
# into `txts`.
# NOTE(review): the absolute path is machine-specific; change `project_dir`
# when running on another computer.
project_dir <- "/Users/tonya-mariehowe/Documents/LSE"
corpus_zip <- file.path(project_dir, "gothic_corpus.zip")
if (!file.exists(corpus_zip)) {
  stop("Corpus archive not found: ", corpus_zip, call. = FALSE)
}
# The working directory is still switched: unzip() extracts into it by default
# and the relative glob passed to readtext() is resolved against it.
setwd(project_dir)
unzip(corpus_zip)
#filenames <- list.files("Bunch_of_Gothic_Novels", "\\.txt$")
#filenames <- gsub(".txt$", "", filenames)
txts <- readtext(file.path("gothic_corpus", "*.txt"))
# Document-level metadata, one entry per novel. Entries are positional: the
# i-th element of every vector describes the i-th text file, so all vectors
# must stay the same length and in the same order as the files.
yrs <- c(
  1764, 1777, 1786, 1790, 1791, 1794, 1794, 1796, 1798, 1806,
  1818, 1818, 1818, 1819, 1821, 1824, 1847, 1859, 1871, 1897
)
# NOTE(review): "Thomas De Quincy" is usually spelled "De Quincey"; left
# unchanged here so the docvars keep matching previously printed output.
author <- c(
  "Horace Walpole",
  "Clara Reeve",
  "William Beckford",
  "Ann Radcliffe",
  "Ann Radcliffe",
  "William Godwin",
  "Ann Radcliffe",
  "Matthew Lewis",
  "Charles Brockden Brown",
  "Charlotte Dacre",
  "Mary Shelley",
  "Thomas Love Peacock",
  "Jane Austen",
  "John Polidori",
  "Thomas De Quincy",
  "James Hogg",
  "Emily Bronte",
  "Wilkie Collins",
  "Sheridan Le Fanu",
  "Bram Stoker"
)
title <- c(
  "The Castle of Otranto",
  "The Old English Baron",
  "The History of the Caliph Vathek",
  "The Sicilian Romance",
  "The Romance of the Forest",
  "Things as They Are; or, The Adventures of Caleb Williams",
  "The Mysteries of Udolpho",
  "The Monk: A Romance",
  "Wieland: or, The Transformation: An American Tale",
  "Zofloya; or, The Moor",
  "Frankenstein; or, The Modern Prometheus",
  "Nightmare Abbey",
  "Northanger Abbey",
  "The Vampyre",
  "Confessions of an English Opium-Eater",
  "The Private Memoirs and Confessions of a Justified Sinner: Written by Himself",
  "Wuthering Heights",
  "The Woman in White",
  "Carmilla",
  "Dracula"
)
sex <- c(
  "M", "F", "M", "F", "F", "M", "F", "M", "M", "F",
  "F", "M", "F", "M", "M", "M", "F", "M", "M", "M"
)
# Spelled-out TRUE/FALSE: the shorthands T and F are ordinary variables that
# can be reassigned, which would silently corrupt these flags.
fiction <- c(
  TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
  TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE
)
satire <- c(
  FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
  FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE
)
# Column order is kept as before because it determines the docvar order.
docvars_df <- data.frame(yrs, author, sex, title, satire, fiction)
#docvars(corpus_txts, "year") <- yrs
#docvars(corpus_txts)
# Attach the metadata to the texts, build the full corpus, and derive the
# female- and male-authored sub-corpora.
# The metadata are matched to the files purely by row position, so check the
# alignment first: cbind() can otherwise recycle rows without complaint.
if (nrow(txts) != nrow(docvars_df)) {
  stop(
    "Found ", nrow(txts), " text files but ", nrow(docvars_df),
    " rows of metadata.",
    call. = FALSE
  )
}
# Diagnostic only: when file names begin with the publication year, a mismatch
# with `yrs` means the files are not in the order the metadata assume.
if (!all(substr(as.character(txts$doc_id), 1, 4) == as.character(yrs))) {
  warning(
    "File-name year prefixes do not match `yrs`; check the metadata order.",
    call. = FALSE
  )
}
txts <- cbind(txts, docvars_df)
corpus_txts <- corpus(txts)
corpus_f <- corpus_subset(corpus_txts, sex == "F", drop_docid = TRUE)
corpus_m <- corpus_subset(corpus_txts, sex == "M", drop_docid = TRUE)
summary(corpus_txts)
## Corpus consisting of 20 documents, showing 20 documents:
##
## Text Types Tokens Sentences yrs
## 1764_OTRANTO.txt 5045 43932 2175 1764
## 1777_OLD_ENGLISH_BARON.txt 4813 67099 2592 1777
## 1786_VATHEK.txt 6227 42882 1120 1786
## 1790_SICILIAN_ROMANCE.txt 6465 77475 2839 1790
## 1791_ROMANCE_OF_THE_FOREST.txt 9056 155144 5334 1791
## 1794_CALEB_WILLIAMS.txt 10672 167606 7105 1794
## 1794_UDOLPHO.txt 12424 349551 10636 1794
## 1796_MONK.txt 10222 159503 7994 1796
## 1798_WIELAND.txt 7983 95330 5262 1798
## 1806_ZOFLOYA.txt 9637 117940 3189 1806
## 1818_FRANKENSTEIN.txt 7606 85320 3355 1818
## 1818_NIGHTMARE_ABBEY.txt 5411 32626 1260 1818
## 1818_NORTHANGER_ABBEY.txt 6921 90653 3596 1818
## 1819_VAMPYRE.txt 3167 14534 368 1819
## 1821_OPIUM_EATER.txt 6442 44347 1087 1821
## 1824_JUSTIFIED_SINNER.txt 8769 98242 3244 1824
## 1847_WUTHERING_HEIGHTS.txt 10830 141359 6773 1847
## 1859_WOMAN_IN_WHITE.txt 13738 284438 13518 1859
## 1871_CARMILLA.txt 4334 33139 1436 1871
## 1897_DRACULA.txt 11533 187604 8994 1897
## author sex
## Horace Walpole M
## Clara Reeve F
## William Beckford M
## Ann Radcliffe F
## Ann Radcliffe F
## William Godwin M
## Ann Radcliffe F
## Matthew Lewis M
## Charles Brockden Brown M
## Charlotte Dacre F
## Mary Shelley F
## Thomas Love Peacock M
## Jane Austen F
## John Polidori M
## Thomas De Quincy M
## James Hogg M
## Emily Bronte F
## Wilkie Collins M
## Sheridan Le Fanu M
## Bram Stoker M
## title
## The Castle of Otranto
## The Old English Baron
## The History of the Caliph Vathek
## The Sicilian Romance
## The Romance of the Forest
## Things as They Are; or, The Adventures of Caleb Williams
## The Mysteries of Udolpho
## The Monk: A Romance
## Wieland: or, The Transformation: An American Tale
## Zofloya; or, The Moor
## Frankenstein; or, The Modern Prometheus
## Nightmare Abbey
## Northanger Abbey
## The Vampyre
## Confessions of an English Opium-Eater
## The Private Memoirs and Confessions of a Justified Sinner: Written by Himself
## Wuthering Heights
## The Woman in White
## Carmilla
## Dracula
## satire fiction
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## TRUE TRUE
## TRUE TRUE
## FALSE TRUE
## FALSE FALSE
## TRUE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
## FALSE TRUE
These texts have been stripped of PG intro and outro text by hand, there being too much variation in the textual patterns to do so with exclude_pattern.
#corpus_txts <- corpus_trim(corpus_txts, exclude_pattern="Project Gutenberg EBook")
#gothic_tokens <- tokens_tolower(tokens_remove(tokens(corpus_txts, remove_punct = TRUE), pattern = stopwords("en")))
# Lower-cased tokens for the whole corpus and for each author-sex sub-corpus.
# First set: punctuation kept.
gothic_tokens <- tokens(corpus_txts, remove_punct = FALSE)
gothic_tokens <- tokens_tolower(gothic_tokens)
gothic_f_tokens <- tokens(corpus_f, remove_punct = FALSE)
gothic_f_tokens <- tokens_tolower(gothic_f_tokens)
gothic_m_tokens <- tokens(corpus_m, remove_punct = FALSE)
gothic_m_tokens <- tokens_tolower(gothic_m_tokens)
# Second set (suffix `_p`): punctuation removed during tokenisation.
gothic_tokens_p <- tokens(corpus_txts, remove_punct = TRUE)
gothic_tokens_p <- tokens_tolower(gothic_tokens_p)
gothic_f_tokens_p <- tokens(corpus_f, remove_punct = TRUE)
gothic_f_tokens_p <- tokens_tolower(gothic_f_tokens_p)
gothic_m_tokens_p <- tokens(corpus_m, remove_punct = TRUE)
gothic_m_tokens_p <- tokens_tolower(gothic_m_tokens_p)
This KWIC pulls all strings starting with “wild,” and then limits them with the stringr library to a subset–only those whose docname contains “FRANKENSTEIN,” to show only those lines relevant to Mary Shelley’s 1818 novel.4 In Frankenstein, words beginning with “wild” appear 21 times, usually either to capture elements of the natural landscape or the turbulent landscape of the human mind.
library(stringr)
# Keyword-in-context hits for every token matching "wild*" across the corpus.
kw_wild <- kwic(x = gothic_tokens, pattern = "wild*")
# Keep only the rows whose document name contains "FRANKENSTEIN".
wild_shelley <- kw_wild[
  str_detect(string = kw_wild$docname, pattern = "FRANKENSTEIN"),
]
print(wild_shelley)
## Keyword-in-context with 21 matches.
## [1818_FRANKENSTEIN.txt, 1719] of my life i ran | wild |
## [1818_FRANKENSTEIN.txt, 2776] men, even to the | wild |
## [1818_FRANKENSTEIN.txt, 4119] have generally an expression of | wildness |
## [1818_FRANKENSTEIN.txt, 5931] will appear possible in these | wild |
## [1818_FRANKENSTEIN.txt, 9673] i read and studied the | wild |
## [1818_FRANKENSTEIN.txt, 16915] i was disturbed by the | wildest |
## [1818_FRANKENSTEIN.txt, 18445] attentively, he saw a | wildness |
## [1818_FRANKENSTEIN.txt, 49064] . i was like a | wild |
## [1818_FRANKENSTEIN.txt, 53399] will go to the vast | wilds |
## [1818_FRANKENSTEIN.txt, 53561] , to dwell in those | wilds |
## [1818_FRANKENSTEIN.txt, 54565] darkness. ” these were | wild |
## [1818_FRANKENSTEIN.txt, 54680] family. my haggard and | wild |
## [1818_FRANKENSTEIN.txt, 57526] of nature. ” his | wild |
## [1818_FRANKENSTEIN.txt, 64193] land. it had a | wild |
## [1818_FRANKENSTEIN.txt, 76781] not how. amidst the | wilds |
## [1818_FRANKENSTEIN.txt, 77102] i generally subsisted on the | wild |
## [1818_FRANKENSTEIN.txt, 77876] from land by its superior | wildness |
## [1818_FRANKENSTEIN.txt, 78556] could be and uttered a | wild |
## [1818_FRANKENSTEIN.txt, 79450] to an expression of the | wildest |
## [1818_FRANKENSTEIN.txt, 83347] gesture seemed instigated by the | wildest |
## [1818_FRANKENSTEIN.txt, 83520] the monster continued to utter | wild |
##
## on a common and read
## sea and unvisited regions i
## , and even madness,
## and mysterious regions which would
## fancies of these writers with
## dreams. i thought i
## in my eyes for which
## beast that had broken the
## of south america. my
## where the beasts of the
## and miserable thoughts, but
## appearance awoke intense alarm,
## and enthusiastic imagination was chastened
## and rocky appearance, but
## of tartary and russia,
## animals that crossed my path
## and ruggedness. the greeks
## cry of ecstasy when i
## rage as he shrieked out
## rage of some uncontrollable passion
## and incoherent self-reproaches. at
# Whole corpus: drop English stop words from the punctuation-free tokens, then
# build document-feature matrices for the unstemmed and the stemmed tokens.
gothic_tokens_stop <- tokens_remove(
  x = gothic_tokens_p,
  pattern = stopwords("en")
)
gothic_cleaned_dfmat <- dfm(x = gothic_tokens_stop)
gothic_stemmed <- tokens_wordstem(x = gothic_tokens_stop)
gothic_stemmed_dfmat <- dfm(x = gothic_stemmed)

# Female-authored texts.
gothic_f_tokens_stop <- tokens_remove(
  x = gothic_f_tokens_p,
  pattern = stopwords("en")
)
gothic_f_dfmat <- dfm(x = gothic_f_tokens_stop)

# Male-authored texts.
gothic_m_tokens_stop <- tokens_remove(
  x = gothic_m_tokens_p,
  pattern = stopwords("en")
)
gothic_m_dfmat <- dfm(x = gothic_m_tokens_stop)
# Make Georgia available to extrafont for the word cloud.
# font_import()'s first argument is `paths` (directories to scan), so the font
# name has to be given as `pattern`. `prompt = FALSE` skips the interactive
# "Continue? [y/n]" question, which cannot be answered while knitting and made
# the earlier run exit without importing (see the transcript that follows).
# Importing is slow, so it is skipped once Georgia is already in the font table.
if (!"Georgia" %in% extrafont::fonts()) {
  extrafont::font_import(pattern = "Georgia", prompt = FALSE)
  # Newly imported fonts must be registered in the session that imported them.
  extrafont::loadfonts(quiet = TRUE)
}
## Importing fonts may take a few minutes, depending on the number of fonts and the speed of the system.
## Continue? [y/n]
## Exiting.
# Word cloud of the 150 most frequent features in the cleaned corpus,
# drawn in black Georgia type.
textplot_wordcloud(
  x = gothic_cleaned_dfmat,
  max_words = 150,
  color = "black",
  font = "Georgia",
  random_color = FALSE
)
What are the top features of 6 or more characters associated with satires?5 Nonsatirical texts seem to include more emphasis on mistaken appearances or beliefs–note the presence of three such words in the top ten: “seemed,” “looked,” and “appeared.” By contrast, satirical texts have none of these, instead focusing on characters. There are, however, fewer satirical than nonsatirical texts in the corpus. And, when we use stemming, we still see more emphasis on characters in satirical texts and a much higher use of “thought” and “appear” in nonsatirical texts, though other elements of appearance have decreased.
# Top unstemmed features of six or more characters, grouped by `satire`.
print(
  topfeatures(
    x = dfm_select(x = gothic_cleaned_dfmat, min_nchar = 6),
    groups = satire
  )
)
## $`FALSE`
## little without though seemed thought moment however nothing
## 1849 1697 1573 1546 1537 1480 1266 1221
## looked appeared
## 1218 1168
##
## $`TRUE`
## catherine friend nothing thought without though little brother
## 426 221 219 209 206 198 196 193
## tilney scythrop
## 193 165
# Top stemmed features of six or more characters, grouped by `satire`.
print(
  topfeatures(
    x = dfm_select(x = gothic_stemmed_dfmat, min_nchar = 6),
    groups = satire
  )
)
## $`FALSE`
## thought appear return moment without friend though present person answer
## 2144 2107 1859 1853 1697 1585 1574 1510 1415 1309
##
## $`TRUE`
## catherin friend thought tilney brother without though general
## 485 314 250 238 237 206 198 189
## scythrop appear
## 182 177
With female- and male-authored texts?6 The balance here is more even. Note the emphasis on “heart” and “mind” in female-authored gothic texts, versus “man,” “shall,” and “must” in male-authored gothic texts. This may suggest that female gothic authors are more invested in interiority, whereas male gothic authors are more invested in pushing outward. I would be interested in looking at the use of “might” in context. Is it about power, or more like the word “may”? But, when we extend to longer words and with stemming, these patterns change.
# Top unstemmed features of any length, grouped by author sex.
print(
  topfeatures(x = gothic_cleaned_dfmat, groups = sex)
)
## $F
## said now upon emily one time la heart mind might
## 3293 2709 2108 2023 1654 1397 1197 1174 1162 1151
##
## $M
## said one upon time mr now man us shall must
## 2897 2841 2172 2122 2015 2004 1581 1497 1443 1400
# Top unstemmed features of six or more characters, grouped by author sex.
print(
  topfeatures(
    x = dfm_select(x = gothic_cleaned_dfmat, min_nchar = 6),
    groups = sex
  )
)
## $F
## though thought madame little adeline appeared without seemed
## 958 928 919 854 840 837 822 818
## montoni however
## 815 798
##
## $M
## little without moment nothing seemed thought though looked friend however
## 1191 1081 894 855 849 818 813 730 692 619
# Top stemmed features of any length, grouped by author sex.
print(
  topfeatures(x = gothic_stemmed_dfmat, groups = sex)
)
## $F
## said now upon emili one time look appear even thought
## 3293 2709 2108 2023 1703 1558 1497 1376 1329 1258
##
## $M
## one said time upon mr now look man hand know
## 2902 2897 2432 2172 2015 2004 1801 1729 1546 1516
# Top stemmed features of six or more characters, grouped by author sex.
print(
  topfeatures(
    x = dfm_select(x = gothic_stemmed_dfmat, min_nchar = 6),
    groups = sex
  )
)
## $F
## appear thought return moment though adelin friend catherin
## 1376 1258 1169 978 959 896 882 860
## father without
## 858 822
##
## $M
## thought without moment friend appear person present return though answer
## 1136 1081 1041 1017 908 845 828 819 813 793
With 8 or more characters, and when looking at document frequency instead of feature frequency?7 These are more stable between stemmed and unstemmed. There are only about 300 more features in the unstemmed selection by document frequency. There is a clear focus on thematically connected features like “picture,” “work,” and “library”.
# Unstemmed matrix restricted to features of eight or more characters
# (auto-printed).
dfm_select(
  x = gothic_cleaned_dfmat,
  min_nchar = 8
)
## Document-feature matrix of: 20 documents, 24,750 features (85.61% sparse) and 6 docvars.
## features
## docs decorative _london_ melbourne_ following
## 1764_OTRANTO.txt 1 1 1 6
## 1777_OLD_ENGLISH_BARON.txt 0 0 0 8
## 1786_VATHEK.txt 0 1 1 4
## 1790_SICILIAN_ROMANCE.txt 0 0 0 25
## 1791_ROMANCE_OF_THE_FOREST.txt 0 0 0 60
## 1794_CALEB_WILLIAMS.txt 0 0 0 15
## features
## docs catholic principal incidents believed
## 1764_OTRANTO.txt 1 10 1 5
## 1777_OLD_ENGLISH_BARON.txt 0 8 1 0
## 1786_VATHEK.txt 0 0 3 3
## 1790_SICILIAN_ROMANCE.txt 0 4 3 22
## 1791_ROMANCE_OF_THE_FOREST.txt 0 1 1 34
## 1794_CALEB_WILLIAMS.txt 0 15 14 37
## features
## docs christianity language
## 1764_OTRANTO.txt 1 5
## 1777_OLD_ENGLISH_BARON.txt 0 0
## 1786_VATHEK.txt 0 3
## 1790_SICILIAN_ROMANCE.txt 0 5
## 1791_ROMANCE_OF_THE_FOREST.txt 1 5
## 1794_CALEB_WILLIAMS.txt 0 17
## [ reached max_ndoc ... 14 more documents, reached max_nfeat ... 24,740 more features ]
# Stemmed matrix restricted to features of eight or more characters
# (auto-printed).
dfm_select(
  x = gothic_stemmed_dfmat,
  min_nchar = 8
)
## Document-feature matrix of: 20 documents, 13,083 features (90.26% sparse) and 6 docvars.
## features
## docs _london_ melbourne_ christian afterward
## 1764_OTRANTO.txt 1 1 5 1
## 1777_OLD_ENGLISH_BARON.txt 0 0 8 7
## 1786_VATHEK.txt 1 1 0 4
## 1790_SICILIAN_ROMANCE.txt 0 0 0 11
## 1791_ROMANCE_OF_THE_FOREST.txt 0 0 2 24
## 1794_CALEB_WILLIAMS.txt 0 0 4 34
## features
## docs circumst establish arragonian familiar
## 1764_OTRANTO.txt 14 2 1 1
## 1777_OLD_ENGLISH_BARON.txt 34 5 0 3
## 1786_VATHEK.txt 2 0 0 1
## 1790_SICILIAN_ROMANCE.txt 77 1 0 2
## 1791_ROMANCE_OF_THE_FOREST.txt 159 3 0 4
## 1794_CALEB_WILLIAMS.txt 112 12 0 13
## features
## docs singular judgment
## 1764_OTRANTO.txt 2 8
## 1777_OLD_ENGLISH_BARON.txt 1 6
## 1786_VATHEK.txt 4 0
## 1790_SICILIAN_ROMANCE.txt 12 4
## 1791_ROMANCE_OF_THE_FOREST.txt 8 15
## 1794_CALEB_WILLIAMS.txt 16 20
## [ reached max_ndoc ... 14 more documents, reached max_nfeat ... 13,073 more features ]
# Unstemmed features of eight or more characters that occur at least 10 times
# overall and in at least 15 documents.
# dfm_trim() has no `min_nchar` argument, so the length filter was previously
# ignored; it is now applied with dfm_select() before trimming.
# NOTE(review): the matrix printed after this call was produced while the
# length filter was being ignored (it lists short features such as "work");
# re-knit to refresh it.
dfm_trim(
  dfm_select(gothic_cleaned_dfmat, min_nchar = 8),
  min_termfreq = 10,
  min_docfreq = 15
)
## Document-feature matrix of: 20 documents, 2,393 features (12.28% sparse) and 6 docvars.
## features
## docs picture company first following work found
## 1764_OTRANTO.txt 8 12 36 6 13 28
## 1777_OLD_ENGLISH_BARON.txt 3 40 49 8 18 32
## 1786_VATHEK.txt 1 10 33 4 5 19
## 1790_SICILIAN_ROMANCE.txt 5 16 53 25 1 66
## 1791_ROMANCE_OF_THE_FOREST.txt 16 5 142 60 7 124
## 1794_CALEB_WILLIAMS.txt 12 23 196 15 33 155
## features
## docs library ancient family north
## 1764_OTRANTO.txt 1 5 8 1
## 1777_OLD_ENGLISH_BARON.txt 0 3 86 8
## 1786_VATHEK.txt 1 2 4 1
## 1790_SICILIAN_ROMANCE.txt 2 3 25 8
## 1791_ROMANCE_OF_THE_FOREST.txt 5 9 93 4
## 1794_CALEB_WILLIAMS.txt 8 1 39 1
## [ reached max_ndoc ... 14 more documents, reached max_nfeat ... 2,383 more features ]
# Stemmed features of eight or more characters that occur at least 10 times
# overall and in at least 15 documents.
# dfm_trim() has no `min_nchar` argument, so the length filter was previously
# ignored; it is now applied with dfm_select() before trimming.
# NOTE(review): the matrix printed after this call was produced while the
# length filter was being ignored (it lists short features such as "work");
# re-knit to refresh it.
dfm_trim(
  dfm_select(gothic_stemmed_dfmat, min_nchar = 8),
  min_termfreq = 10,
  min_docfreq = 15
)
## Document-feature matrix of: 20 documents, 2,063 features (10.19% sparse) and 6 docvars.
## features
## docs pictur compani limit first follow work found
## 1764_OTRANTO.txt 10 12 1 36 25 13 33
## 1777_OLD_ENGLISH_BARON.txt 4 40 2 49 45 21 34
## 1786_VATHEK.txt 4 10 3 33 33 6 19
## 1790_SICILIAN_ROMANCE.txt 5 16 3 53 70 5 68
## 1791_ROMANCE_OF_THE_FOREST.txt 18 5 1 142 145 13 127
## 1794_CALEB_WILLIAMS.txt 13 23 7 196 48 51 157
## features
## docs librari ancient famili
## 1764_OTRANTO.txt 1 5 10
## 1777_OLD_ENGLISH_BARON.txt 0 4 87
## 1786_VATHEK.txt 1 2 4
## 1790_SICILIAN_ROMANCE.txt 2 3 26
## 1791_ROMANCE_OF_THE_FOREST.txt 5 9 93
## 1794_CALEB_WILLIAMS.txt 8 1 40
## [ reached max_ndoc ... 14 more documents, reached max_nfeat ... 2,053 more features ]
With 18th century vs 19th century texts?