1.1 Contrasting tidy text with other data structures
packages = c("dplyr","ggplot2", "sentimentr","tidytext","wordcloud","textdata","janeaustenr","stringr","tidyr","reshape2","forcats","igraph","ggraph","widyr","scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(tidytext)
library(dplyr)
library(ggplot2)
library(sentimentr)
library(wordcloud)
library(textdata)
library(janeaustenr)
library(stringr)
library(tidyr)
library(reshape2)
library(forcats)
library(igraph)
library(ggraph)
library(widyr)
library(scales)
1.2 The unnest_tokens function
text <- c("Because I could not stop for Death -",
"He kindly stopped for me -",
"The Carriage held but just Ourselves -",
"and Immortality")
text
## [1] "Because I could not stop for Death -"
## [2] "He kindly stopped for me -"
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
text_df <- tibble(line = 1:4, text = text)
text_df
## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 Because I could not stop for Death -
## 2 2 He kindly stopped for me -
## 3 3 The Carriage held but just Ourselves -
## 4 4 and Immortality
text_df %>%
unnest_tokens(word, text)
## # A tibble: 20 x 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 i
## 3 1 could
## 4 1 not
## 5 1 stop
## 6 1 for
## 7 1 death
## 8 2 he
## 9 2 kindly
## 10 2 stopped
## 11 2 for
## 12 2 me
## 13 3 the
## 14 3 carriage
## 15 3 held
## 16 3 but
## 17 3 just
## 18 3 ourselves
## 19 4 and
## 20 4 immortality
1.3 Tidying the works of Jane Austen
original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup()
original_books
## # A tibble: 73,422 x 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## # ... with 73,412 more rows
tidy_books <- original_books %>%
unnest_tokens(word, text)
tidy_books
## # A tibble: 725,055 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ... with 725,045 more rows
data(stop_words)
tidy_books <- tidy_books %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_books %>%
count(word, sort = TRUE)
## # A tibble: 13,914 x 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # ... with 13,904 more rows
tidy_books %>%
count(word, sort = TRUE) %>%
filter(n > 600) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(y = NULL)

1.5 Word frequencies
## Warning: package 'gutenbergr' was built under R version 4.0.4
hgwells <- gutenberg_download(c(35, 36, 5230, 159))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
tidy_hgwells <- hgwells %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_hgwells %>%
count(word, sort = TRUE)
## # A tibble: 11,830 x 2
## word n
## <chr> <int>
## 1 time 461
## 2 people 302
## 3 door 260
## 4 heard 249
## 5 black 232
## 6 stood 229
## 7 white 224
## 8 hand 218
## 9 kemp 213
## 10 eyes 210
## # ... with 11,820 more rows
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))
tidy_bronte <- bronte %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_bronte %>%
count(word, sort = TRUE)
## # A tibble: 23,303 x 2
## word n
## <chr> <int>
## 1 time 1064
## 2 miss 854
## 3 day 826
## 4 hand 767
## 5 eyes 713
## 6 don’t 666
## 7 night 648
## 8 heart 638
## 9 looked 601
## 10 door 591
## # ... with 23,293 more rows
frequency <- bind_rows(mutate(tidy_bronte, author = "Brontë Sisters"),
mutate(tidy_hgwells, author = "H.G. Wells"),
mutate(tidy_books, author = "Jane Austen")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(author, word) %>%
group_by(author) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = author, values_from = proportion) %>%
pivot_longer(`Brontë Sisters`:`H.G. Wells`,
names_to = "author", values_to = "proportion")
frequency
## # A tibble: 57,252 x 4
## word `Jane Austen` author proportion
## <chr> <dbl> <chr> <dbl>
## 1 a 0.00000919 Bronte Sisters 0.0000587
## 2 a 0.00000919 H.G. Wells 0.0000148
## 3 aback NA Bronte Sisters 0.00000391
## 4 aback NA H.G. Wells 0.0000148
## 5 abaht NA Bronte Sisters 0.00000391
## 6 abaht NA H.G. Wells NA
## 7 abandon NA Bronte Sisters 0.0000313
## 8 abandon NA H.G. Wells 0.0000148
## 9 abandoned 0.00000460 Bronte Sisters 0.0000900
## 10 abandoned 0.00000460 H.G. Wells 0.000178
## # ... with 57,242 more rows
# expect a warning about rows with missing values being removed
ggplot(frequency, aes(x = proportion, y = `Jane Austen`,
color = abs(`Jane Austen` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001),
low = "darkslategray4", high = "gray75") +
facet_wrap(~author, ncol = 2) +
theme(legend.position="none") +
labs(y = "Jane Austen", x = NULL)
## Warning: Removed 40857 rows containing missing values (geom_point).
## Warning: Removed 40859 rows containing missing values (geom_text).

cor.test(data = frequency[frequency$author == "Brontë Sisters",],
~ proportion + `Jane Austen`)
##
## Pearson's product-moment correlation
##
## data: proportion and Jane Austen
## t = 111.09, df = 10345, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7286567 0.7462329
## sample estimates:
## cor
## 0.7375697
cor.test(data = frequency[frequency$author == "H.G. Wells",],
~ proportion + `Jane Austen`)
##
## Pearson's product-moment correlation
##
## data: proportion and Jane Austen
## t = 36.083, df = 6046, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3999815 0.4414612
## sample estimates:
## cor
## 0.4209414