# Knitr chunk defaults: echo the code, hide package messages and warnings
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
library(tidytext)
library(textdata)
Citation: Chapter 2 of Text Mining with R covers sentiment analysis. For the assignment below, all of the code was copied from that source and rerun.
Assignment:
Part 1. Get the base code from Text Mining with R, chapter 2 working as-is in this notebook.
Part 2. Extend the code in two ways:
a. Work with a different corpus
b. Incorporate at least one additional sentiment lexicon
# Load the AFINN, Bing and NRC sentiment lexicons
# AFINN
afinn_sentiments <- tidytext::get_sentiments(lexicon = "afinn")
# Bing
bing_sentiments <- tidytext::get_sentiments(lexicon = "bing")
# NRC
nrc_sentiments <- tidytext::get_sentiments(lexicon = "nrc")
library(janeaustenr)
library(dplyr)
library(stringr)
# Put the text in a tidy format: one row per word, with the line number and
# a running chapter counter recorded for every book
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # each line that looks like a chapter heading bumps the counter by one
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
# Complete the sentiment analysis.
# Keep only the NRC "joy" words, restrict the tidy text to the words from
# Emma, and use inner_join() so that only joy words occurring in Emma remain;
# then count how often each one appears.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  # name the join key explicitly instead of relying on the shared-column guess
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
library(tidyr)
# Net Bing sentiment (positive minus negative word count) for each 80-line
# section of every book
jane_austen_sentiment <- tidy_books %>%
  # explicit key: the only column shared with the Bing lexicon is `word`
  inner_join(get_sentiments("bing"), by = "word") %>%
  # integer division groups the lines into consecutive 80-line sections
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
library(ggplot2)
# Net sentiment per section, one panel per book
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# Section 2.3: Comparing the three sentiment dictionaries
# Work with the words of a single novel, Pride & Prejudice
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
# AFINN: sum the per-word scores within each 80-line section
afinn <- pride_prejudice %>%
  # explicit key: the only column shared with the AFINN lexicon is `word`
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC: tag each matched word with the lexicon it came from, stack
# the two results, then compute positive minus negative counts per 80-line
# section for each method
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      # only the positive/negative categories of NRC are used here
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
# Positive and negative words in each lexicon
# NRC: number of words labelled positive vs. negative
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
# Bing: number of words labelled positive vs. negative
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# Bing word counts: how often each sentiment-bearing word occurs across all
# of the books, most frequent first
bing_word_counts <- tidy_books %>%
  # explicit key: the only column shared with the Bing lexicon is `word`
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
# Bing word count plots: the ten most frequent words for each sentiment
bing_word_counts %>%
  group_by(sentiment) %>%
  # rank by the count column explicitly rather than by "the last column"
  top_n(10, wt = n) %>%
  ungroup() %>%
  # order the bars by frequency instead of alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()
# How to exclude words that should not be scored: "miss" is treated as a
# negative word, but it could just be a title (e.g. "Miss Jones"), so put it
# in front of the standard stop-word list
custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)

custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ... with 1,140 more rows
#Word Clouds
library(wordcloud)
# Word cloud of the 100 most common non-stop-words across the Austen novels
tidy_books %>%
  # explicit key: the only column shared with stop_words is `word`
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
# 2.6 Looking at units beyond just words
# Tokenize Pride & Prejudice into sentences instead of single words
PandP_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(output = sentence, input = text, token = "sentences")

# Show the second sentence as a spot check
PandP_sentences[["sentence"]][2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
# Split into a data frame by chapter: the regex marks the chapter headings,
# and the text between consecutive headings becomes one row
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of chapter-level rows per book
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
# For each book, which chapter has the highest proportion of negative words?
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

# Total words per chapter, used as the denominator of the ratio
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  # explicit key: keep only words that appear in the negative Bing list
  semi_join(bingnegative, by = "word") %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  # chapter 0 is the text that precedes the first chapter heading
  filter(chapter != 0) %>%
  # the data is still grouped by book here, so this keeps one chapter per
  # book; rank by ratio explicitly rather than by "the last column"
  top_n(1, wt = ratio) %>%
  ungroup()
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
This section extends the above code by working with a different corpus, a work from Project Gutenberg obtained with library(gutenbergr) instead of the Jane Austen novels, and by incorporating the “syuzhet” sentiment lexicon to score the sentiment of each sentence.
library(gutenbergr)
library(dplyr)
library(stringr)
library(syuzhet)
Obtain a work from the Gutenberg Corpus
# Obtain Plato's "The Republic" corpus
plato <- gutenberg_works(author == "Plato", title == "The Republic")

# Fail early with a clear message when the metadata lookup finds nothing
if (nrow(plato) == 0) {
  stop(
    "No Project Gutenberg work matched Plato's \"The Republic\".",
    call. = FALSE
  )
}

# Keep a single ID so that a duplicate metadata match cannot pull several
# editions into the same corpus
therepublicid <- plato$gutenberg_id[1]
republic_corpus <- gutenberg_download(therepublicid)

# Tokenize on non-word characters and report the number of tokens
republic_words <- get_tokens(republic_corpus$text, pattern = "\\W")
cat("Word count of The Republic is", length(republic_words), "words")
## Word count of The Republic is 125208 words
Get Sentiment using syuzhet
# Get sentiment using syuzhet
# The downloaded corpus holds one element per line of the book, so join the
# lines into a single string before splitting into sentences; otherwise every
# line fragment (and every blank line) is scored as if it were a sentence.
# NOTE: counts produced before this change were based on line fragments and
# will differ from the counts this code prints.
republic_sentences <- get_sentences(
  paste(republic_corpus$text, collapse = " ")
)
republic_sentiment <- get_sentiment(republic_sentences, method = "syuzhet")

# Count the number of positive, neutral and negative sentences in The Republic.
# In syuzhet positives get a value > 0, negatives get a value < 0, and 0 is
# neutral.
pos_sentences <- sum(republic_sentiment > 0, na.rm = TRUE)
neutral_sentences <- sum(republic_sentiment == 0, na.rm = TRUE)
neg_sentences <- sum(republic_sentiment < 0, na.rm = TRUE)
tot_sentences <- length(republic_sentiment)

cat(
  "Number of positive, neutral, and negative sentences in The Republic is",
  pos_sentences, neutral_sentences, "and", neg_sentences,
  "respectively for a total of", tot_sentences, "sentences"
)
## Number of positive, neutral, and negative sentences in The Republic is 6010 9043 and 2737 respectively for a total of 17790 sentences
Histogram of the Positive and Negative Sentences Count
# Histogram of the sentence scores, leaving out the neutral (zero) sentences
republic_sentiment_df <- data.frame(republic_sentiment = republic_sentiment)
non_neutral_rows <- which(republic_sentiment_df$republic_sentiment != 0)
republic_sentiment_df <- republic_sentiment_df[non_neutral_rows, , drop = FALSE]

ggplot(
  data = republic_sentiment_df,
  mapping = aes(x = republic_sentiment)
) +
  geom_histogram(bins = 90)
Create a Word Cloud
# Create a word cloud of "The Republic"
# Re-tokenize the corpus into a tidy one-word-per-row table (this replaces
# the earlier token vector stored under the same name)
republic_words <- republic_corpus %>%
  unnest_tokens(word, text)

republic_words %>%
  # explicit key: the only column shared with stop_words is `word`
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))