Basic Text Analysis of “Jude the Obscure” by Thomas Hardy

Matthew Henderson

30 July, 2017

This article was created by working through the book “Text Analysis with R for Students of Literature” by Matthew L. Jockers.

Chapter 2 - First Foray into Text Analysis with R

The first step is to create a bag of words from the novel “Jude the Obscure”.

library(dplyr)
library(thomashardyr)
library(tidytext)

data(stop_words)

# Tokenise the novel into one word per row, keeping the book title alongside.
# `tibble()` replaces the deprecated `data_frame()` constructor.
jude.tbl <- tibble(text = jude, book = "Jude the Obscure") %>%
  unnest_tokens(word, text)

# The bag of words as a plain character vector.
jude.words.v <- jude.tbl$word
print(head(jude.words.v, 10))
#>  [1] "jude"     "the"      "obscure"  "by"       "thomas"   "hardy"   
#>  [7] "contents" "part"     "first"    "at"
# Drop every token that appears in `stop_words`. Naming the join key
# explicitly silences the "Joining, by = ..." message and stops the join from
# silently picking up any other column the two tables might come to share.
jude.tbl.stop <- jude.tbl %>%
  anti_join(stop_words, by = "word")

jude.words.v.stop <- jude.tbl.stop$word

Once we have the bag of words as a vector we can easily count word occurrences. For example, to count the number of times the word “Jude” appears in the novel:

# Number of tokens equal to "jude". Summing the logical match vector gives 0
# when the word never occurs; the previous filter/count/select chain produced
# a zero-row table in that case, which as.numeric() cannot coerce.
# na.rm = TRUE mirrors filter(), which drops rows whose comparison is NA.
# as.numeric() keeps the result a double, as before.
jude.hits.v <- as.numeric(sum(jude.tbl$word == "jude", na.rm = TRUE))

print(jude.hits.v)
#> [1] 840

Then we can ask what proportion of the novel is made of the word “Jude”:

# Size of the full bag of words (stop words included) ...
total.words.v <- length(jude.words.v)
# ... and the fraction of those tokens that are the word "jude".
jude.hits.v / total.words.v
#> [1] 0.005734258

Generalising, we can count the occurrences of every word in the novel.

# Tabulate how many times each distinct word occurs in the full token vector.
jude.freqs.t <- table(jude.words.v)

And then find the most commonly occurring words by sorting.

# Rank the frequency table from most to least common, then show the top ten.
sorted.jude.freqs.t <- jude.freqs.t %>%
  sort(decreasing = TRUE)
sorted.jude.freqs.t[seq_len(10)]
#> jude.words.v
#>  the  and   to   of    a    i   he   in   it  you 
#> 7096 4517 4392 3493 3216 2903 2512 2406 1976 1941

The ten most frequent words are all common function words that say little about the novel itself, so we really want to remove stop words first.

# Same tabulate-and-rank steps, applied to the tokens left after stop-word
# removal. table() is called directly (not piped into) so the resulting
# dimension keeps the name `jude.words.v.stop`.
jude.freqs.t.stop <- table(jude.words.v.stop)
sorted.jude.freqs.t.stop <- jude.freqs.t.stop %>%
  sort(decreasing = TRUE)

Chapter 3 - Accessing and Comparing Word Frequency Data

# How often "him" occurs relative to "her".
sorted.jude.freqs.t["him"] / sorted.jude.freqs.t["her"]
#>      him 
#> 0.535877
# How often "he" occurs relative to "she".
sorted.jude.freqs.t["he"] / sorted.jude.freqs.t["she"]
#>       he 
#> 1.501494
# Convert raw counts to percentages of the respective token totals.
# prop.table() with no margin divides every cell by the table's sum.
sorted.jude.rel.freqs.t <- 100 * prop.table(sorted.jude.freqs.t)
sorted.jude.rel.freqs.t.stop <- 100 * prop.table(sorted.jude.freqs.t.stop)
library(ggplot2)

# Dot plot of the 25 most frequent non-stop words. The slice is taken in
# reverse (25:1) so that, once the axes are flipped, the most frequent word
# ends up at the top. Converting the table to a data frame yields the columns
# `jude.words.v.stop` (the word) and `Freq` (its percentage).
ggplot(
  data.frame(sorted.jude.rel.freqs.t.stop[25:1]),
  aes(x = jude.words.v.stop, y = Freq)
) +
  geom_point(colour = "cornflowerblue") +
  coord_flip() +
  labs(
    x = "",
    y = "Percentage of Full Text",
    title = "Word frequencies in Thomas Hardy's Jude the Obscure",
    subtitle = "based on Chapter 3 of Jockers' Text Analysis with R for Students of Literature"
  )

Chapter 4 - Token Distribution Analysis

# Draw a dispersion ("barcode") plot: one thin vertical line at every token
# position where a word occurs.
#
# positions: integer vector of token indices (e.g. from which()); may be
#            empty, in which case the plot simply has no lines.
# term:      the word being plotted; used in the y-axis label and the title.
#
# Returns a ggplot object, which is drawn when evaluated at top level.
dispersion_plot <- function(positions, term) {
  ggplot(data.frame(position = positions)) +
    geom_linerange(
      aes(x = position, ymin = 0, ymax = 1),
      colour = "midnightblue",
      size = 0.1
    ) +
    xlab("Novel Time") +
    ylab(term) +
    # The y axis carries no information, so strip it and the panel decoration.
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank()
    ) +
    labs(
      title = paste0(
        "Dispersion plot of '", term, "' in Thomas Hardy's Jude the Obscure"
      ),
      subtitle = "based on Chapter 4 of Jockers' Text Analysis with R for Students of Literature"
    )
}

# "Novel time": the index of every token. seq_along() stays correct for an
# empty vector, where 1:length(x) would yield c(1, 0).
n.time.v <- seq_along(jude.words.v)

# Token positions at which "jude" occurs, and their dispersion plot.
jude.v <- which(jude.words.v == "jude")
dispersion_plot(jude.v, "jude")

# Token positions at which "sue" occurs, and their dispersion plot.
sue.v <- which(jude.words.v == "sue")
dispersion_plot(sue.v, "sue")

4.2.2 Identify the chapter break positions …

# Indices of the elements of `jude` that start with "Part".
# NOTE(review): assumes `jude` holds one line of text per element and that the
# first two matches bracket Part First; a table of contents could also match,
# so confirm against the data.
parts <- grep("^Part", jude)
# Two matches are needed to delimit the first part; fail clearly otherwise
# instead of with an NA error from `:`.
stopifnot(length(parts) >= 2)
first_part <- jude[parts[1]:(parts[2] - 1)]
# Chapter headings are taken to be lines consisting solely of a Roman numeral.
# The pattern accepts I through XXXIX and never matches an empty line. The
# earlier pattern required at least one "I" after an optional "V", so it could
# not match "V", "X" or "XI".
p1_chapters <- grep(
  "^(X{1,3}(IX|IV|V?I{0,3})|IX|IV|VI{0,3}|I{1,3})$",
  first_part
)