This is a practice R project to have a go at text analysis.
I used Project Gutenberg to source the text of four of Anthony Trollope’s novels.
I then used R to plot:
The most frequent words were all the names of the characters in the books, some of which appear in more than one book. Trollope’s novels have many characters in each book and I recognised these.
The most uniquely characteristic words in each book were still characters but not those who were also strongly featured in other books. These results chimed well with my memory of the plots.
The most frequent bigrams were also character names, which was informative because some characters were consistently referred to by both names, e.g. Lord Lufton, whereas others were most frequently referred to by just one name, e.g. Eleanor, or by one or the other.
The frequency of negative sentiments was generally higher than that of positive sentiments throughout all the books. When each book was chunked into 100 word chunks the negatives outweighed the positives hugely and there was a pattern of lots of negative sentiments at the last stage of each book. All in all Trollope was a bit of a misery it would seem.
library(tidyverse)
library(tidytext)
library(gutenbergr)
library(RColorBrewer)
Get the data from Project Gutenberg and put it in a local csv file. (This chunk should be run just once, manually, by setting eval = FALSE; however, this seemed to cause the file to be unavailable later in the process - need to check this out.)
# Project Gutenberg ids of the four novels:
#   619  - The Warden
#   3409 - Barchester Towers
#   2860 - Framley Parsonage
#   3166 - Doctor Thorne
# Fetch all four books into a single data frame, keeping each book's title
trollope_raw <- c(619, 3409, 2860, 3166) %>%
  gutenberg_download(meta_fields = "title")
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Save a local copy of the raw text as a csv file in the working directory
trollope_raw %>%
  write_csv("trollope_raw.csv")
Now to clean the data by removing blank lines with drop_na, then reshaping it so that each row holds one token - in this case a word.
# Tokenise: discard rows whose text is missing, then split the text so that
# every row holds a single word
trollope_words <- unnest_tokens(drop_na(trollope_raw, text), word, text)
# Peek at the first 30 tokens
trollope_words %>%
  head(n = 30)
## # A tibble: 30 x 3
## gutenberg_id title word
## <int> <chr> <chr>
## 1 619 The Warden the
## 2 619 The Warden warden
## 3 619 The Warden by
## 4 619 The Warden anthony
## 5 619 The Warden trollope
## 6 619 The Warden contents
## 7 619 The Warden i
## 8 619 The Warden hiram's
## 9 619 The Warden hospital
## 10 619 The Warden ii
## # ... with 20 more rows
# Step 1: strip stop words ('a', 'the', ...) and tally every remaining word
# per book, most frequent first
top_words_trollope <- trollope_words %>%
  anti_join(stop_words) %>%
  count(title, word, sort = TRUE)
# Step 2: within each book keep only the 15 most frequent words
top_words_trollope <- top_words_trollope %>%
  group_by(title) %>%
  top_n(15) %>%
  ungroup()
# Step 3: freeze the current row order into the factor levels so the words
# plot in this order
top_words_trollope <- mutate(top_words_trollope, word = fct_inorder(word))
## Joining, by = "word"
## Selecting by n
top_words_trollope %>%
  head(n = 200)
## # A tibble: 60 x 3
## title word n
## <chr> <fct> <int>
## 1 Doctor Thorne frank 958
## 2 Framley Parsonage lufton 951
## 3 Barchester Towers slope 920
## 4 Framley Parsonage lady 869
## 5 Doctor Thorne mary 850
## 6 Doctor Thorne doctor 771
## 7 Doctor Thorne lady 746
## 8 Framley Parsonage robarts 644
## 9 Doctor Thorne thorne 572
## 10 Framley Parsonage lord 570
## # ... with 50 more rows
Now to plot them in a column chart, mapping the frequency to x-axis, and the words to the y-axis, facetting by book title and colouring the bars by book title also
# Column chart of the 15 most frequent words per novel: counts run along the
# x-axis and words down the y-axis, with one facet (and one colour) per book.
ggplot(top_words_trollope,
       aes(x = n, y = fct_rev(word), fill = title)) +
  geom_col() +
  # The numeric count is mapped to x, so "Count" labels the x-axis; the words
  # on the y-axis need no axis title.
  labs(x = "Count", y = NULL,
       title = "15 most frequent words in four Anthony Trollope novels") +
  facet_wrap(vars(title), scales = "free_y") +
  # The facet strips already identify each book, so hide the fill legend
  # ("none" replaces the deprecated guides(fill = FALSE)).
  guides(fill = "none") +
  theme_bw()
Now for the most unique words - so the ones that are uniquely characteristic of that particular book.
# Rebuild trollope_words as per-book word counts (columns: title, word, n).
# Note: this overwrites the one-row-per-token table of the same name that was
# created earlier, so everything below sees counts, not individual tokens.
trollope_words <- trollope_raw %>%
  # Drop only the rows whose text is missing - the same rule as the first
  # tokenisation - rather than rows with an NA in any column
  drop_na(text) %>%
  # Split into word tokens
  unnest_tokens(word, text) %>%
  # Remove stop words
  anti_join(stop_words) %>%
  # Tally each word within each book, most frequent first
  count(title, word, sort = TRUE)
## Joining, by = "word"
# Attach tf, idf and tf-idf columns to the per-book word counts, treating each
# title as one document
trollope_tf_idf <- bind_tf_idf(trollope_words, word, title, n)
# Rank all words by tf-idf, highest first ...
trollope_tf_idf_plot <- trollope_tf_idf %>%
  arrange(desc(tf_idf))
# ... keep the 10 most characteristic words of each book ...
trollope_tf_idf_plot <- trollope_tf_idf_plot %>%
  group_by(title) %>%
  top_n(10) %>%
  ungroup()
# ... and fix the factor levels in that order so the words plot in rank order
trollope_tf_idf_plot <- mutate(trollope_tf_idf_plot, word = fct_inorder(word))
## Selecting by tf_idf
# Column chart of the 10 highest tf-idf words per novel, one facet per book.
ggplot(trollope_tf_idf_plot,
       aes(x = tf_idf, y = fct_rev(word), fill = title)) +
  geom_col() +
  # The facet strips already identify each book, so hide the fill legend
  # ("none" replaces the deprecated guides(fill = FALSE)).
  guides(fill = "none") +
  labs(x = "tf-idf", y = NULL,
       title = "The top 10 most unique words in each novel") +
  # Free scales: each facet gets its own words and its own tf-idf range
  facet_wrap(~ title, scales = "free") +
  theme_bw()
# Bigrams:
Now to look at the frequencies of bigrams (or pairs of words)
# Build a table with one row per bigram (pair of consecutive words), keeping
# only bigrams in which neither word is a stop word.
trollope_bigrams <- trollope_raw %>%
  drop_na(text) %>%
  # n = 2 here means bigrams. We could also make trigrams (n = 3) or any type
  # of n-gram
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Text too short to form a bigram can yield an NA token; drop those rows so
  # they are not later reassembled into the literal string "NA NA"
  filter(!is.na(bigram)) %>%
  # Split the bigrams into two words so that stop words can be removed
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  # Put the two word columns back together
  unite(bigram, word1, word2, sep = " ")
head(trollope_bigrams, n = 200)
## # A tibble: 200 x 3
## gutenberg_id title bigram
## <int> <chr> <chr>
## 1 619 The Warden anthony trollope
## 2 619 The Warden trollope contents
## 3 619 The Warden hiram's hospital
## 4 619 The Warden hospital ii
## 5 619 The Warden barchester reformer
## 6 619 The Warden reformer iii
## 7 619 The Warden barchester iv
## 8 619 The Warden iv hiram's
## 9 619 The Warden hiram's bedesmen
## 10 619 The Warden dr grantly
## # ... with 190 more rows
and plotting the most frequent bigrams:
# Tally every bigram within each novel, most frequent first
top_bigrams <- count(trollope_bigrams, title, bigram, sort = TRUE)
# Keep only the 15 most frequent bigrams of each novel
top_bigrams <- top_bigrams %>%
  group_by(title) %>%
  top_n(15) %>%
  ungroup()
# Freeze the current row order into the factor levels so the bigrams plot in
# this order
top_bigrams <- mutate(top_bigrams, bigram = fct_inorder(bigram))
## Selecting by n
# Column chart of the 15 most frequent bigrams per novel: counts run along the
# x-axis and bigrams down the y-axis, with one facet per book.
ggplot(top_bigrams, aes(x = n, y = fct_rev(bigram), fill = title)) +
  geom_col() +
  # The facet strips already identify each book, so hide the fill legend
  # ("none" replaces the deprecated guides(fill = FALSE)).
  guides(fill = "none") +
  # The numeric count is mapped to x, so "Count" labels the x-axis; the
  # bigrams on the y-axis need no axis title.
  labs(x = "Count", y = NULL,
       title = "15 most frequent bigrams in four Anthony Trollope novels") +
  facet_wrap(vars(title), scales = "free") +
  theme_bw()
Now I am going to look at the sentiments in the novels
Starting by joining the sentiment dictionary:
# Tag each counted word with its "bing" polarity; words that are absent from
# the lexicon are dropped by the inner join
trollope_sentiments <- inner_join(trollope_words, get_sentiments("bing"))
## Joining, by = "word"
trollope_sentiments %>%
  head()
## # A tibble: 6 x 4
## title word n sentiment
## <chr> <chr> <int> <chr>
## 1 Framley Parsonage miss 458 negative
## 2 Doctor Thorne miss 402 negative
## 3 Doctor Thorne love 309 positive
## 4 Framley Parsonage love 217 positive
## 5 Barchester Towers miss 212 negative
## 6 Barchester Towers love 198 positive
and now to plot these:
# Total positive and negative word occurrences per novel.
# trollope_sentiments holds one row per distinct word with its frequency in n,
# so the tally must be weighted by n; an unweighted count would only report
# how many distinct sentiment words each novel contains.
trollope_sentiment_plot <- trollope_sentiments %>%
  count(title, sentiment, wt = n)
# One facet per novel, with a negative and a positive bar in each; the bar's
# transparency distinguishes the two sentiments.
ggplot(trollope_sentiment_plot,
       aes(x = sentiment, y = n, fill = title, alpha = sentiment)) +
  geom_col(position = position_dodge()) +
  labs(y = NULL, x = NULL,
       title = "Frequency of positive and negative sentiments in each novel") +
  scale_alpha_manual(values = c(0.5, 1)) +
  facet_wrap(vars(title)) +
  theme_bw()
and now another plot looking at the sentiments in chunks of 100 lines from each book:
# Net sentiment (positive minus negative word count) for every 100-line chunk
# of each novel.
# This starts from the raw text rather than from trollope_sentiments: that
# table is a frequency-sorted list of distinct words across all books, so its
# row numbers say nothing about where a word occurs in a novel.
trollope_split_into_lines <- trollope_raw %>%
  # Number the lines within each novel and divide them into groups of 100
  group_by(title) %>%
  mutate(line = row_number(),
         line_chunk = line %/% 100) %>%
  ungroup() %>%
  # One row per word, carrying the chunk of the line it came from
  drop_na(text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  # Keep only words that appear in the bing lexicon, tagged with polarity
  inner_join(get_sentiments("bing"), by = "word") %>%
  # Get a count of positive and negative words in each 100-line chunk in each
  # novel
  count(title, line_chunk, sentiment) %>%
  # Convert the sentiment column into two columns named "positive" and
  # "negative"; a chunk with no words of one polarity gets 0 rather than NA
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  # Calculate net sentiment
  mutate(sentiment = positive - negative)
# Net sentiment per chunk, plotted in chunk order, with one facet per novel.
ggplot(trollope_split_into_lines,
       aes(x = line_chunk, y = sentiment, fill = sentiment)) +
  geom_col() +
  labs(y = NULL, x = NULL,
       title = "Totals of positive and negative sentiments in 100-line chunks throughout each novel") +
  # The bars are coloured through the fill aesthetic, so the diverging scale
  # must be a fill scale; a colour scale would leave the bars on the default
  # palette.
  scale_fill_gradient2() +
  # Alternatives tried:
  #scale_fill_viridis(option = "magma", end = 0.9) +
  #scale_color_brewer(palette = "RdBu") + #This sets the color palette
  facet_wrap(vars(title), scales = "free_x") +
  theme_bw()
## Warning: Removed 2 rows containing missing values (position_stack).