For this assignment we were asked to work from Chapter 2 of Text Mining with R, which looks at sentiment analysis: get the primary example code from Chapter 2 working in an R Markdown document, and provide a citation to this base code. Lastly, we were asked to extend the code in two ways:
Work with a different corpus of your choosing, and
Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research). The corpus used for this extension is the friends package: https://cran.r-project.org/web/packages/friends/index.html
library(tidytext)
# Print the `sentiments` dataset made available by library(tidytext).
sentiments
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
The three general-purpose lexicons are:
• AFINN from Finn Årup Nielsen
• Bing from Bing Liu and collaborators
• NRC from Saif Mohammad and Peter Turney
# AFINN lexicon: one row per word with a numeric `value` score.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
# Bing lexicon: one row per word with a categorical `sentiment` label.
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# NRC lexicon: a word can appear on several rows, one per sentiment category.
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
#2. Sentiment analysis of Jane Austen books
library(janeaustenr)
library(dplyr)
library(stringr)
# Build the tidy (one-word-per-row) version of the Austen novels.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    # position of each text line within its own book
    linenumber = row_number(),
    # running count of chapter headings seen so far (0 before the first one)
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ℹ 725,045 more rows
#3. Sentiment analysis of positive words using NRC dictionary
# Keep only the NRC entries tagged as "joy".
nrcjoy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Most frequent joy words in "Emma".
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrcjoy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
#4. Sentiment analysis of positive and negative using Bing dictionary
library(tidyr)

# Net Bing sentiment (positive minus negative word counts) for every
# 80-line section of every book.
# NOTE(review): spread() is superseded by pivot_wider(); it is kept here so the
# result matches the printed output that follows.
janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)

janeaustensentiment
## # A tibble: 920 × 5
## book index negative positive sentiment
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Sense & Sensibility 0 16 32 16
## 2 Sense & Sensibility 1 19 53 34
## 3 Sense & Sensibility 2 12 31 19
## 4 Sense & Sensibility 3 15 31 16
## 5 Sense & Sensibility 4 16 34 18
## 6 Sense & Sensibility 5 16 51 35
## 7 Sense & Sensibility 6 24 40 16
## 8 Sense & Sensibility 7 23 51 28
## 9 Sense & Sensibility 8 30 40 10
## 10 Sense & Sensibility 9 15 19 4
## # ℹ 910 more rows
#5. Plot of negative and positive words
library(ggplot2)

# One panel per book; each bar is the net sentiment of one index section.
ggplot(
  data = janeaustensentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
#6. Filter for Pride and Prejudice
# Words belonging to "Pride & Prejudice" only.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ℹ 122,194 more rows
#7. Comparing sentiment analysis of Pride and Prejudice by the 3 lexicons, AFINN, Bing, and NRC
# AFINN: sum the numeric word scores within each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

# Bing and NRC: label each match with its lexicon, count positive and negative
# words per 80-line section, and take positive minus negative.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)
#8. Plot of the 3 sentiment dictionaries
# Stack the three per-section results and draw one panel per method.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

# How many positive vs. negative entries the NRC lexicon contains.
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# How many positive vs. negative entries the Bing lexicon contains.
get_sentiments("bing") %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
#9. Most common negative and positive words
# Count how often each Bing-scored word occurs across all books,
# most frequent first.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ℹ 2,575 more rows
#10. Plot of the most common positive and negative words
# Keep the ten highest-count words per sentiment (ties are kept).
# slice_max() names the ranking column explicitly; the superseded top_n(10)
# picked the last column implicitly.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # order the bars by count instead of alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
# Extend the standard stop-word list with "miss", which tops the "negative"
# words in bing_word_counts (presumably because it is used as a title in the
# novels -- confirm). tibble() replaces the deprecated data_frame().
custom_stop_words <- bind_rows(
  tibble(word = c("miss"), lexicon = c("custom")),
  stop_words
)

custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ℹ 1,140 more rows
#11. Word Cloud
library(wordcloud)

# Cloud of up to 100 words from the novels, after dropping stop words.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

library(reshape2)

# Comparison cloud: reshape the Bing counts into a word x sentiment matrix
# and draw the two sentiments in contrasting colours.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("darkblue", "red"), max.words = 100)
# Tokenize "Pride & Prejudice" into sentences instead of words.
# tibble() replaces the deprecated data_frame().
PandP_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")

# Inspect the second sentence token.
PandP_sentences$sentence[2]
## [1] "by jane austen"
# Split each book into chunks wherever the chapter-heading pattern matches.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    chapter, text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of chunks produced for each book.
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
# Negative half of the Bing lexicon.
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

# Total word count for each chapter of each book.
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n(), .groups = "drop_last")

# For each book, find the chapter with the highest share of negative words.
tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  group_by(book, chapter) %>%
  # drop_last leaves the data grouped by book, which slice_max() relies on
  summarize(negativewords = n(), .groups = "drop_last") %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  # chapter 0 holds the lines before the first chapter heading
  filter(chapter != 0) %>%
  # rank explicitly by ratio; the superseded top_n(1) picked the last
  # column implicitly
  slice_max(ratio, n = 1) %>%
  ungroup()
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
For the corpus of my choosing, I will run a sentiment analysis on the friends package, found via this URL: (“https://cran.r-project.org/web/packages/available_packages_by_name.html#available-packages-T”). The friends package contains the complete scripts from the American sitcom Friends in tibble format. I will use this package to practice data wrangling, text analysis and network analysis alongside sentiment analysis, following Chapter 2 of “Text Mining with R: A Tidy Approach”.
First, I will load and inspect the friends corpus.
library(friends)
# Structural overview of the `friends` data: column names, types and first values.
glimpse(friends)
## Rows: 67,373
## Columns: 6
## $ text <chr> "There's nothing to tell! He's just some guy I work with!", …
## $ speaker <chr> "Monica Geller", "Joey Tribbiani", "Chandler Bing", "Phoebe …
## $ season <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ episode <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ scene <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ utterance <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
For this part I will tidy the friends scripts into one word per row, numbering the lines within each season.
# Build the tidy (one-word-per-row) version of the Friends scripts.
tidy_friends <- friends %>%
  group_by(season) %>%
  mutate(
    # position of each utterance within its own season
    linenumber = row_number(),
    # NOTE(review): chapter-heading counter carried over from the Austen code;
    # the printed rows that follow all show 0 -- confirm it is needed here.
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_friends
## # A tibble: 716,519 × 8
## speaker season episode scene utterance linenumber chapter word
## <chr> <int> <int> <int> <int> <int> <int> <chr>
## 1 Monica Geller 1 1 1 1 1 0 there's
## 2 Monica Geller 1 1 1 1 1 0 nothing
## 3 Monica Geller 1 1 1 1 1 0 to
## 4 Monica Geller 1 1 1 1 1 0 tell
## 5 Monica Geller 1 1 1 1 1 0 he's
## 6 Monica Geller 1 1 1 1 1 0 just
## 7 Monica Geller 1 1 1 1 1 0 some
## 8 Monica Geller 1 1 1 1 1 0 guy
## 9 Monica Geller 1 1 1 1 1 0 i
## 10 Monica Geller 1 1 1 1 1 0 work
## # ℹ 716,509 more rows
# Keep only the NRC entries tagged as "joy".
nrcjoy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Most frequent joy words spoken by Chandler Bing.
tidy_friends %>%
  filter(speaker == "Chandler Bing") %>%
  inner_join(nrcjoy) %>%
  count(word, sort = TRUE)
## # A tibble: 207 × 2
## word n
## <chr> <int>
## 1 good 231
## 2 love 140
## 3 god 135
## 4 pretty 61
## 5 baby 57
## 6 money 53
## 7 sex 52
## 8 kind 48
## 9 friend 45
## 10 happy 43
## # ℹ 197 more rows
Sentiment analysis of positive and negative using Bing dictionary
library(tidyr)
# Net Bing sentiment (positive minus negative word counts) per speaker and
# 80-line section.
# NOTE(review): linenumber restarts in every season (tidy_friends is built
# with group_by(season)), so one index value covers lines from several seasons.
friends_sentiment <- tidy_friends %>%
  inner_join(get_sentiments("bing")) %>%
  count(speaker, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

# Words spoken by Chandler Bing only.
Chandler_Bing <- filter(tidy_friends, speaker == "Chandler Bing")

Chandler_Bing
## # A tibble: 87,609 × 8
## speaker season episode scene utterance linenumber chapter word
## <chr> <int> <int> <int> <int> <int> <int> <chr>
## 1 Chandler Bing 1 1 1 3 3 0 all
## 2 Chandler Bing 1 1 1 3 3 0 right
## 3 Chandler Bing 1 1 1 3 3 0 joey
## 4 Chandler Bing 1 1 1 3 3 0 be
## 5 Chandler Bing 1 1 1 3 3 0 nice
## 6 Chandler Bing 1 1 1 3 3 0 so
## 7 Chandler Bing 1 1 1 3 3 0 does
## 8 Chandler Bing 1 1 1 3 3 0 he
## 9 Chandler Bing 1 1 1 3 3 0 have
## 10 Chandler Bing 1 1 1 3 3 0 a
## # ℹ 87,599 more rows
# Words spoken by Phoebe Buffay only.
Phoebe_Buffay <- filter(tidy_friends, speaker == "Phoebe Buffay")

Phoebe_Buffay
## # A tibble: 82,454 × 8
## speaker season episode scene utterance linenumber chapter word
## <chr> <int> <int> <int> <int> <int> <int> <chr>
## 1 Phoebe Buffay 1 1 1 4 4 0 wait
## 2 Phoebe Buffay 1 1 1 4 4 0 does
## 3 Phoebe Buffay 1 1 1 4 4 0 he
## 4 Phoebe Buffay 1 1 1 4 4 0 eat
## 5 Phoebe Buffay 1 1 1 4 4 0 chalk
## 6 Phoebe Buffay 1 1 1 6 6 0 just
## 7 Phoebe Buffay 1 1 1 6 6 0 cause
## 8 Phoebe Buffay 1 1 1 6 6 0 i
## 9 Phoebe Buffay 1 1 1 6 6 0 don't
## 10 Phoebe Buffay 1 1 1 6 6 0 want
## # ℹ 82,444 more rows
Comparing the sentiment of Phoebe Buffay's lines across the three lexicons: AFINN, Bing, and NRC
# AFINN: sum the numeric word scores within each 80-line section.
# This reassigns `afinn`, replacing the Pride & Prejudice result.
afinn <- Phoebe_Buffay %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

# Bing and NRC: label each match with its lexicon, count positive and negative
# words per 80-line section, and take positive minus negative.
bing_and_nrc_Friends <- bind_rows(
  Phoebe_Buffay %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  Phoebe_Buffay %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
# Plot Phoebe's net sentiment per section, one panel per lexicon.
# Uses bing_and_nrc_Friends (Phoebe's lines); the previous code combined
# Phoebe's AFINN scores with bing_and_nrc, the Pride & Prejudice result.
bind_rows(afinn, bing_and_nrc_Friends) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
# Count how often each Bing-scored word occurs across the Friends scripts,
# most frequent first. This reassigns bing_word_counts from the Austen section.
bing_word_counts <- tidy_friends %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
## # A tibble: 1,946 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 well positive 4229
## 2 right positive 3569
## 3 like positive 3238
## 4 good positive 1756
## 5 sorry negative 1460
## 6 great positive 1353
## 7 love positive 1040
## 8 thank positive 756
## 9 wow positive 750
## 10 fine positive 648
## # ℹ 1,936 more rows
# Bar chart of the ten highest-count words for each sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # order the bars by count instead of alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)
library(wordcloud)
# Cloud of up to 100 words from the Friends scripts, after dropping stop words.
tidy_friends %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
library(reshape2)
# Comparison cloud of negative vs. positive Bing words in the Friends scripts.
# Starts from tidy_friends; the previous code used tidy_books, so this section
# re-drew the Jane Austen cloud in different colours.
tidy_friends %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  # reshape into a word x sentiment matrix, as comparison.cloud() expects
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("green", "purple"), max.words = 100)
Text Book: The base code used here is taken from: “Text Mining with R: A Tidy Approach” by Julia Silge and David Robinson licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 United States License.
Friends: https://cran.r-project.org/web/packages/friends/index.html.