Below, I will be sourcing Huckleberry Finn from the gutenbergr package.
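The chunks below assume several packages are attached. As a minimal setup sketch (package names inferred from the function calls used throughout this post; your setup chunk may differ):
library(gutenbergr)   #gutenberg_works(), gutenberg_download()
library(tidyverse)    #dplyr, readr, stringr, tidyr, ggplot2
library(tidytext)     #unnest_tokens(), get_sentiments()
library(janeaustenr)  #austen_books()
library(syuzhet)      #get_sentiment()
library(rmarkdown)    #paged_table()
library(wordcloud)    #wordcloud(), comparison.cloud()
library(reshape2)     #acast()
library(RColorBrewer) #brewer.pal()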
works <- gutenberg_works()
works
## # A tibble: 44,042 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 1 "The De… Jeffe… 1638 en "Politics/American…
## 2 2 "The Un… Unite… 1 en "Politics/American…
## 3 3 "John F… Kenne… 1666 en ""
## 4 4 "Lincol… Linco… 3 en "US Civil War"
## 5 5 "The Un… Unite… 1 en "United States/Pol…
## 6 6 "Give M… Henry… 4 en "American Revoluti…
## 7 7 "The Ma… <NA> NA en ""
## 8 8 "Abraha… Linco… 3 en "US Civil War"
## 9 9 "Abraha… Linco… 3 en "US Civil War"
## 10 10 "The Ki… <NA> NA en "Banned Books List…
## # ℹ 44,032 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
works %>% filter(title == 'Adventures of Huckleberry Finn')
## # A tibble: 1 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 19640 Adventur… Twain… 53 en Banned Books List …
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
#huckleberry <- gutenberg_download('76') #no longer works, so we read a local copy instead
huckleberry <- read_csv("Huck.csv") %>% rename(text = value, linenumber = ...1) #give the CSV columns sensible names
huckleberry <- na.omit(huckleberry) #drop empty rows
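As an aside, gutenberg_download() also accepts a mirror argument, so pointing it at an alternate Project Gutenberg mirror can sometimes work around download failures. The mirror URL below is only an example and may not be current:
#huckleberry <- gutenberg_download(76, mirror = "http://mirrors.xmission.com/gutenberg/")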
From here, we want to clean our data following tidytext principles, using code adapted from Text Mining with R (Silge, J., & Robinson, D. 2017, Sec. 2.2).
tidyHuck <- huckleberry %>% #note: the first 380 rows are an index and interfere with the chapter regex
  mutate(
    #linenumber = row_number(), #not needed: linenumber already comes from the CSV
    chapter = cumsum(str_detect(text, #add an index of chapters
                                regex("CHAPTER ([IXVLC]*\\.|THE LAST)", #match lines starting with "CHAPTER <numeral>." or "CHAPTER THE LAST"; the escaped dot keeps in-text mentions of "chapter" from matching
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text) #notice we use 'word' as the unit of token
paged_table(tidyHuck)
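As a quick aside, here is a toy illustration (not part of the original analysis) of how the str_detect()/cumsum() pairing assigns chapter numbers: the detector returns TRUE on heading lines, and the running sum carries that count forward onto every following line.
toy <- tibble(text = c("CHAPTER I.", "You don't know about me...", "CHAPTER II.", "We went tiptoeing along..."))
toy %>%
  mutate(chapter = cumsum(str_detect(text, regex("CHAPTER [IVXLC]+\\.", ignore_case = TRUE))))
#chapter comes out 1, 1, 2, 2: each heading bumps the count, tagging every line with its chapter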
#Copy of the book's Austen code, kept for use with unnest_tokens by regex
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
paged_table(get_sentiments("nrc"))
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy") #filter nrc lexicon for words classified as joy
tidyHuck %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE) #join tidyHuck to nrc_joy and count to identify how many joy words are used in the book
## # A tibble: 170 × 2
## word n
## <chr> <int>
## 1 good 285
## 2 pretty 160
## 3 kind 137
## 4 money 85
## 5 found 73
## 6 mighty 71
## 7 glad 46
## 8 luck 30
## 9 safe 28
## 10 satisfied 26
## # ℹ 160 more rows
paged_table(get_sentiments("bing"))
huckSentiment <- tidyHuck %>%
inner_join(get_sentiments("bing")) %>% #join words that match bing lexicon
count(index = linenumber %/% 80, sentiment) %>% #Removed 'book' as initial argument since we are only working with one book
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
paged_table(huckSentiment)
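If the index = linenumber %/% 80 and pivot_wider() steps above feel opaque: integer division buckets the lines into 80-line chunks, and pivoting wide turns the positive/negative counts into columns so one can be subtracted from the other. A small sketch with made-up counts:
c(0, 79, 80, 159, 160) %/% 80 #returns 0 0 1 1 2: each line maps to its 80-line chunk
tibble(index = c(0, 0, 1),
       sentiment = c("positive", "negative", "positive"),
       n = c(5, 8, 3)) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) #index 0: 5 - 8 = -3; index 1: 3 - 0 = 3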
ggplot(huckSentiment, aes(x = index, y = sentiment, fill = sentiment)) +
  geom_col(show.legend = FALSE) + #geom_col is equivalent to geom_bar(stat = 'identity'), so one bar layer is enough
  scale_fill_gradient2(low = 'red', mid = 'snow3', high = 'darkgreen', space = 'Lab')
We can see quite a bit of variance in sentiment across Huckleberry Finn. I would imagine this is natural for works of fiction and could be an indicator of an entertaining read. From here, let's check four different lexicons (AFINN, Bing, NRC, and syuzhet) and compare them against each other.
afinn2 <- get_sentiments("afinn") #AFINN scores each word on an integer scale
unique(afinn2$value)
## [1] -2 -3 2 1 -1 3 4 -4 -5 5 0
#Create tibble for AFINN values
afinn <- tidyHuck %>%
inner_join(get_sentiments("afinn")) %>% #look up sentiment
group_by(index = linenumber %/% 80) %>% #chunk by 80 lines
summarise(sentiment = sum(value)) %>% #sum the values within each 80-line chunk
mutate(method = "AFINN") #label the lexicon used
bing_and_nrc <- bind_rows(
tidyHuck %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
tidyHuck %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))#only grab the positive and negative qualities in NRC and avoid anger, joy, etc.
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>% #pivot wider to easily subtract the positive count from the negative count
mutate(sentiment = positive - negative)#create overall sentiment value for the group
paged_table(afinn)
paged_table(bing_and_nrc)
The syuzhet lexicon has come under criticism, most notably from Swafford, but it remains extremely popular and still holds value.
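As a sanity check before scoring the whole book, get_sentiment() can be tried on a handful of words; it returns one numeric score per element. The exact magnitudes depend on the dictionary version, so treat this as illustrative:
get_sentiment(c("terrible", "fine", "wonderful"), method = "syuzhet")
#one score per word: negative for "terrible", positive for the other two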
value <- get_sentiment(tidyHuck$word, method = "syuzhet") #calling syuzhet's get_sentiment command
syuzhetHuck <- cbind(tidyHuck, value) #attaching to tidyHuck to match AFINN's tibble structure
#Below code finishes matching AFINN's structure
syuzhet <- syuzhetHuck %>%
  group_by(index = linenumber %/% 80) %>% #re-using AFINN's 80-line chunks for comparison
  summarise(sentiment = sum(value)) %>%
  mutate(method = "syuzhet")
Let’s see how they all compare against each other visually
(Silge, J., & Robinson, D. 2017, Sec. 2.3).
bind_rows(afinn, bing_and_nrc, syuzhet) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
Overall, it looks like syuzhet is most similar to AFINN or NRC. The syuzhet lexicon doesn't seem as sensitive to mildly negative words, but it still captures anything strongly negative.
bing_word_counts <- tidyHuck %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
bing_word_counts
## # A tibble: 797 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 well positive 409
## 2 right positive 301
## 3 good positive 285
## 4 like positive 267
## 5 pretty positive 160
## 6 enough positive 112
## 7 better positive 94
## 8 trouble negative 81
## 9 dead negative 72
## 10 miss negative 72
## # ℹ 787 more rows
To match this with the syuzhet package, we will need to classify our words into positive or negative and filter out any neutrals.
syuzhetHuckSentiment <- syuzhetHuck %>%
  mutate(sentiment = if_else(value > 0, 'positive',
                             if_else(value < 0, 'negative', 'neutral'))) #classify each score by its sign
syuzhet_word_counts <- syuzhetHuckSentiment %>% #count words by sentiment
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
syuzhetHuckLookup <- syuzhetHuckSentiment %>% #create a lookup to join word values for display
  select(word, value, sentiment)
syuzhetHuckLookup <- unique(syuzhetHuckLookup) #remove repeats
syuzhetCounts <- left_join(syuzhet_word_counts[, c("word", "n")], syuzhetHuckLookup, by = "word") #attach each word's value to the counts to give the reader an idea of its weight
nonNeutralSyuzhetCounts <- syuzhetCounts %>%
  filter(sentiment != 'neutral') %>% #filter out neutral words (value 0)
  mutate(method = 'syuzhet')
paged_table(nonNeutralSyuzhetCounts)
Now that we have these counts, it is easy to compare visually
(Silge, J., & Robinson, D. 2017, Sec. 2.4).
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
bing_word_counts <- bing_word_counts %>%
mutate(method='bing')
bind_rows(bing_word_counts, nonNeutralSyuzhetCounts) %>%
  group_by(method, sentiment) %>%
  slice_max(n, n = 5) %>%
  arrange(desc(n)) %>%
  ggplot(aes(n, word, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ method)
custom_stop_words <- bind_rows(tibble(word = c("well","nigger"),
lexicon = c("custom")),
stop_words)
paged_table(custom_stop_words)
pal <- brewer.pal(9, "BuGn")
tidyHuck %>%
  anti_join(custom_stop_words) %>% #custom_stop_words already includes stop_words, so one anti_join covers both
  count(word) %>%
  with(wordcloud(word, n, min.freq = 50, scale = c(8, .3), random.color = FALSE, colors = pal))
pal2 = brewer.pal(8, "RdYlGn")
tidyHuck %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("red", "darkgreen"),
max.words = 100)
huckSentences <- tibble(text = huckleberry$text) %>%
unnest_tokens(sentence, text, token = "sentences")
#huckSentences$sentence[111]
#Notes on unnest_tokens(chapter, text, token = "regex", pattern = ...)
# For this to work in a legible manner, group by book before summarising, which requires a 'book' column next to the 'text' column. Also make sure your regex is correct, and remove any excess lines such as indexes or prefaces.
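# A toy illustration (not from the original analysis) of unnest_tokens(token = "regex"):
# the pattern splits the text at each heading, so each output row holds the text
# between two headings. Anything before the first heading becomes its own row,
# which is why the index/preface lines are dropped below.
tibble(book = "toy",
       text = "Preface. CHAPTER I. It begins. CHAPTER II. It continues.") %>%
  unnest_tokens(chapter, text, token = "regex", pattern = "CHAPTER [IVXLC]+\\.")
#three rows: the preface fragment plus one row of text per chapter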
#Prepping Huck text to format well
huckText <- as_tibble(huckleberry$text[-(1:384)]) #removing the first 384 lines (preface and index) so they don't confuse the chapter regex
colnames(huckText)[1] <- 'text'
huckText <- huckText %>% mutate(book = 'Huckleberry')
## Altered code to keep only two books: Sense & Sensibility and Huckleberry Finn
austen_chapters <- austen_books() %>%
  filter(book == 'Sense & Sensibility') %>%
  rbind(huckText) %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "CHAPTER ([IVXLC]*\\.|THE LAST|[\\dIVXLC])") %>%
  ungroup()
## Single book
huckChapters <- huckText %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "CHAPTER ([IVXLC]*\\.|THE LAST|[\\dIVXLC])") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 2 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Huckleberry 41
huckChapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 1 × 2
## book chapters
## <chr> <int>
## 1 Huckleberry 41
## Original code from the book, covering all Austen novels
# austen_chapters <- austen_books() %>%
# group_by(book) %>%
# unnest_tokens(chapter, text, token = "regex",
# pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
# ungroup()
#
# austen_chapters %>%
# group_by(book) %>%
# summarise(chapters = n())
#Bing Chapter Negativity
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidyHuck %>%
group_by(chapter) %>%
summarize(words = n())
paged_table(wordcounts)
#Chapter Sentiment
tidyHuck %>%
semi_join(bingnegative) %>% #join/filter for negative word matches
group_by(chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = "chapter") %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>% #show only most negative
ungroup()
## # A tibble: 1 × 4
## chapter negativewords words ratio
## <int> <int> <int> <dbl>
## 1 13 76 2059 0.0369
syuzhetNegative <- nonNeutralSyuzhetCounts %>%
filter(sentiment == "negative")
wordcounts <- tidyHuck %>% #recomputing the same per-chapter word counts as above
  group_by(chapter) %>%
  summarize(words = n())
paged_table(wordcounts)
#Chapter Sentiment
tidyHuck %>%
semi_join(syuzhetNegative) %>% #join/filter for negative word matches
group_by(chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = "chapter") %>% #pulls word count sum to create ratio
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>% #show only most negative
ungroup()
## # A tibble: 1 × 4
## chapter negativewords words ratio
## <int> <int> <int> <dbl>
## 1 22 112 2075 0.0540
# Negative Score
negativeChapters <- tidyHuck %>%
  semi_join(syuzhetNegative) %>% #join/filter for negative word matches
  left_join(syuzhetHuckLookup) %>% #attach each word's syuzhet value
  group_by(chapter) %>%
  summarize(score = sum(value)) %>%
  arrange(score) #most negative chapters first
paged_table(negativeChapters)
syuzhetPositive <- nonNeutralSyuzhetCounts %>%
filter(sentiment == "positive")
syuzhetNeutral <- syuzhetCounts %>%
filter(sentiment == 'neutral')
tidyHuck %>%
  semi_join(syuzhetPositive) %>% #join/filter for positive word matches
  group_by(chapter) %>%
  summarize(positivewords = n()) %>%
  left_join(wordcounts, by = "chapter") %>% #pulls word count sum to create ratio
  mutate(ratio = positivewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% #show only the most positive
  ungroup()
## # A tibble: 1 × 4
## chapter positivewords words ratio
## <int> <int> <int> <dbl>
## 1 25 189 2878 0.0657
# Positive Score
tidyHuck %>%
  semi_join(syuzhetPositive) %>% #join/filter for positive word matches
  left_join(syuzhetHuckLookup) %>% #attach each word's syuzhet value
  group_by(chapter) %>%
  summarize(score = sum(value)) %>%
  arrange(desc(score)) %>%
  left_join(negativeChapters, by = 'chapter') %>%
  mutate(netScore = score.x + score.y) %>% #score.y is already negative, so this is positive minus negative
  arrange(netScore)
## # A tibble: 43 × 4
## chapter score.x score.y netScore
## <int> <dbl> <dbl> <dbl>
## 1 2 52.4 -67.3 -14.9
## 2 6 58.4 -72.0 -13.5
## 3 13 53.3 -60.3 -7.00
## 4 31 99.0 -104. -4.75
## 5 39 54.9 -58.3 -3.4
## 6 38 66.2 -69.0 -2.90
## 7 3 29.0 -31.7 -2.75
## 8 22 57.8 -60.0 -2.20
## 9 27 74.3 -73.2 1.10
## 10 7 55.4 -53.8 1.60
## # ℹ 33 more rows
Finally, we can combine the two for a net score, and by that measure Chapter 2 could be considered the most negative!
Silge, J., & Robinson, D. (2017). Text Mining with R: A Tidy Approach. O'Reilly Media.