This article was created by working through the book “Text Analysis with R for Students of Literature” by Matthew L. Jockers.
The first step is to create a bag of words from the novel “Jude the Obscure”.
library(dplyr)
library(thomashardyr)
library(tidytext)
data(stop_words)
# Tokenise the novel into one word per row (the printed tokens below show
# that the words come out lower-cased).
# `tibble()` replaces the deprecated `data_frame()` constructor.
jude.tbl <- tibble(text = jude, book = "Jude the Obscure") %>%
  unnest_tokens(word, text)
# Bag of words: every token of the novel as a plain character vector.
jude.words.v <- jude.tbl$word
print(jude.words.v[1:10])
#> [1] "jude" "the" "obscure" "by" "thomas" "hardy"
#> [7] "contents" "part" "first" "at"
# Drop every token that appears in `stop_words`. Naming the join column
# makes the key explicit and silences dplyr's "Joining with" message.
jude.tbl.stop <- jude.tbl %>%
  anti_join(stop_words, by = "word")
# Bag of words without stop words.
jude.words.v.stop <- jude.tbl.stop$word
Once we have the bag of words as a vector we can easily count word occurrences. For example, to count the number of times the word “Jude” appears in the novel:
# Number of tokens equal to "jude". Counting the filtered rows directly
# yields 0 when the word is absent, whereas coercing an empty one-column
# tibble with as.numeric() would fail.
jude.hits.v <- jude.tbl %>%
  filter(word == "jude") %>%
  nrow()
print(jude.hits.v)
#> [1] 840
Then we can ask what proportion of the novel is made of the word “Jude”:
# Size of the novel, measured in tokens.
total.words.v <- length(jude.words.v)
# Share of all tokens that are the word "jude".
jude.hits.v / total.words.v
#> [1] 0.005734258
Generalising, we can count the occurrences of every word in the novel.
# Frequency table with one count per distinct word in the bag of words.
jude.freqs.t <- table(jude.words.v)
And then find the most commonly occurring words by sorting.
# Order the frequency table from the most to the least frequent word.
sorted.jude.freqs.t <- sort(
  jude.freqs.t,
  decreasing = TRUE
)
# Show the ten most frequent words together with their counts.
sorted.jude.freqs.t[seq_len(10)]
#> jude.words.v
#> the and to of a i he in it you
#> 7096 4517 4392 3493 3216 2903 2512 2406 1976 1941
The most frequent words are all common function words such as “the” and “and”, so we really want to remove stop words first.
# Frequency table built from the bag of words without stop words,
# then ordered from most to least frequent.
jude.freqs.t.stop <- table(jude.words.v.stop)
sorted.jude.freqs.t.stop <- sort(
  jude.freqs.t.stop,
  decreasing = TRUE
)
# Pronoun ratios, computed on the unfiltered frequency table.
sorted.jude.freqs.t["him"] / sorted.jude.freqs.t["her"]
#> him
#> 0.535877
sorted.jude.freqs.t["he"] / sorted.jude.freqs.t["she"]
#> he
#> 1.501494
# Relative frequencies: each count as a percentage of all counted tokens,
# once for the full table and once for the stop-word-free table.
sorted.jude.rel.freqs.t <-
  (sorted.jude.freqs.t / sum(sorted.jude.freqs.t)) * 100
sorted.jude.rel.freqs.t.stop <-
  (sorted.jude.freqs.t.stop / sum(sorted.jude.freqs.t.stop)) * 100
library(ggplot2)
# Dot plot of the 25 most frequent non-stop words. The slice is taken in
# reverse (25:1) so that, after coord_flip(), the most frequent word is
# drawn at the top.
ggplot(
  data.frame(sorted.jude.rel.freqs.t.stop[25:1]),
  aes(x = jude.words.v.stop, y = Freq)
) +
  geom_point(colour = "cornflowerblue") +
  coord_flip() +
  labs(
    x = "",
    y = "Percentage of Full Text",
    title = "Word frequencies in Thomas Hardy's Jude the Obscure",
    subtitle = "based on Chapter 3 of Jockers' Text Analysis with R for Students of Literature"
  )
# Token positions 1..N of the whole novel ("novel time").
# seq_along() stays correct for an empty vector, unlike seq(1:length(x)).
n.time.v <- seq_along(jude.words.v)
# Positions in the novel at which the word "jude" occurs.
jude.v <- which(jude.words.v == "jude")
# Dispersion plot: one thin vertical line per occurrence, placed at its
# token position. seq_along() keeps the data frame valid even when the
# word never occurs (1:length(x) would yield c(1, 0) in that case).
ggplot(data.frame(x = seq_along(jude.v), y = jude.v)) +
  geom_linerange(
    aes(x = y, ymin = 0, ymax = 1),
    colour = "midnightblue",
    size = 0.1
  ) +
  xlab("Novel Time") +
  ylab("jude") +
  # The y axis carries no information, so all of its decoration is removed.
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  ) +
  labs(
    title = "Dispersion plot of 'jude' in Thomas Hardy's Jude the Obscure",
    subtitle = "based on Chapter 4 of Jockers' Text Analysis with R for Students of Literature"
  )
# Positions in the novel at which the word "sue" occurs.
sue.v <- which(jude.words.v == "sue")
# Dispersion plot: one thin vertical line per occurrence, placed at its
# token position. seq_along() keeps the data frame valid even when the
# word never occurs (1:length(x) would yield c(1, 0) in that case).
ggplot(data.frame(x = seq_along(sue.v), y = sue.v)) +
  geom_linerange(
    aes(x = y, ymin = 0, ymax = 1),
    colour = "midnightblue",
    size = 0.1
  ) +
  xlab("Novel Time") +
  ylab("sue") +
  # The y axis carries no information, so all of its decoration is removed.
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  ) +
  labs(
    title = "Dispersion plot of 'sue' in Thomas Hardy's Jude the Obscure",
    subtitle = "based on Chapter 4 of Jockers' Text Analysis with R for Students of Literature"
  )
# Indices of the lines of the raw text that begin with "Part".
parts <- grep("^Part", jude)
# Lines from the first "Part" line up to the line before the second one.
first_part <- jude[parts[1]:(parts[2] - 1)]
# Lines consisting solely of a Roman numeral from I to XXXIX (presumably
# the chapter headings -- confirm against the text). The previous pattern
# could not match a bare "V" or any numeral from "X" upwards.
p1_chapters <- grep("^X{0,3}(IX|IV|V?I{1,3}|V)$|^X{1,3}$", first_part)