In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:
Work with a different corpus of your choosing, and Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research). As usual, please submit links to both an .Rmd file posted in your GitHub repository and to your code on rpubs.com. You make work on a small team on this assignment.
library(tidyverse)
library(tidytext)
library(janeaustenr)
library(wordcloud)
library(gutenbergr)
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
#download the book from gutenberg
gutenberg_metadata
## # A tibble: 72,569 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 1 "The De… Jeffe… 1638 en "Politics/American…
## 2 2 "The Un… Unite… 1 en "Politics/American…
## 3 3 "John F… Kenne… 1666 en ""
## 4 4 "Lincol… Linco… 3 en "US Civil War"
## 5 5 "The Un… Unite… 1 en "United States/Pol…
## 6 6 "Give M… Henry… 4 en "American Revoluti…
## 7 7 "The Ma… <NA> NA en ""
## 8 8 "Abraha… Linco… 3 en "US Civil War"
## 9 9 "Abraha… Linco… 3 en "US Civil War"
## 10 10 "The Ki… <NA> NA en "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
dark <- gutenberg_works(title == "Heart of Darkness") %>%
gutenberg_download(meta_fields = "title")
# Add a column called chapters based on the I,II, III in the book
dark2<- dark %>%
mutate( linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^([\\divxlc])+$", ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
dark3 <- dark2 %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
dark3
## # A tibble: 42 × 3
## index sentiment method
## <dbl> <dbl> <chr>
## 1 0 24 AFINN
## 2 1 18 AFINN
## 3 2 -6 AFINN
## 4 3 -20 AFINN
## 5 4 41 AFINN
## 6 5 47 AFINN
## 7 6 -14 AFINN
## 8 7 -35 AFINN
## 9 8 11 AFINN
## 10 9 -25 AFINN
## # ℹ 32 more rows
# chart negative and positive sentiment
dark3 %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE)
# count the words in the book
count_dark <- dark2 %>%
group_by(chapter) %>%
count(word, sort = TRUE)
count_dark
## # A tibble: 8,379 × 3
## # Groups: chapter [4]
## chapter word n
## <int> <chr> <int>
## 1 1 the 868
## 2 2 the 778
## 3 3 the 646
## 4 1 a 532
## 5 1 of 520
## 6 3 i 456
## 7 3 of 432
## 8 2 of 421
## 9 1 and 392
## 10 1 i 352
## # ℹ 8,369 more rows
#remove stop words
remove_dark <- dark2 %>%
anti_join(stop_words)
remove_dark
## # A tibble: 12,943 × 5
## gutenberg_id title linenumber chapter word
## <int> <chr> <int> <int> <chr>
## 1 526 Heart of Darkness 1 0 note
## 2 526 Heart of Darkness 1 0 etext
## 3 526 Heart of Darkness 1 0 219
## 4 526 Heart of Darkness 1 0 version
## 5 526 Heart of Darkness 1 0 ebook
## 6 526 Heart of Darkness 3 0 heart
## 7 526 Heart of Darkness 3 0 darkness
## 8 526 Heart of Darkness 6 0 joseph
## 9 526 Heart of Darkness 6 0 conrad
## 10 526 Heart of Darkness 12 1 nellie
## # ℹ 12,933 more rows
#count the words in the book after removing stop words
remove_dark %>%
count(word, sort = TRUE)
## # A tibble: 4,978 × 2
## word n
## <chr> <int>
## 1 kurtz 101
## 2 time 78
## 3 river 65
## 4 looked 56
## 5 eyes 49
## 6 station 48
## 7 head 47
## 8 heard 47
## 9 black 43
## 10 manager 42
## # ℹ 4,968 more rows
# create custom words to remove ("time")
custom_stop_words <- bind_rows(tibble(word = c("time"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 time custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ℹ 1,140 more rows
# remove time from word list
remove_dark %>%
anti_join(custom_stop_words) %>%
count(word, sort = TRUE)
## # A tibble: 4,977 × 2
## word n
## <chr> <int>
## 1 kurtz 101
## 2 river 65
## 3 looked 56
## 4 eyes 49
## 5 station 48
## 6 head 47
## 7 heard 47
## 8 black 43
## 9 manager 42
## 10 earth 39
## # ℹ 4,967 more rows
#download the book from gutenberg
gutenberg_metadata
## # A tibble: 72,569 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 1 "The De… Jeffe… 1638 en "Politics/American…
## 2 2 "The Un… Unite… 1 en "Politics/American…
## 3 3 "John F… Kenne… 1666 en ""
## 4 4 "Lincol… Linco… 3 en "US Civil War"
## 5 5 "The Un… Unite… 1 en "United States/Pol…
## 6 6 "Give M… Henry… 4 en "American Revoluti…
## 7 7 "The Ma… <NA> NA en ""
## 8 8 "Abraha… Linco… 3 en "US Civil War"
## 9 9 "Abraha… Linco… 3 en "US Civil War"
## 10 10 "The Ki… <NA> NA en "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
heart <- gutenberg_works(title == "Heart of Darkness") %>%
gutenberg_download(meta_fields = "title")
# Add a column called chapters based on the I,II, III in the book
heart2<- heart %>%
mutate( linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^([\\divxlc])+$", ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
heart3 <- heart2 %>%
inner_join(get_sentiments("bing")) %>%
count(title, index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
heart3
## # A tibble: 34 × 5
## title index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 Heart of Darkness 0 34 40 6
## 2 Heart of Darkness 1 51 38 -13
## 3 Heart of Darkness 2 38 39 1
## 4 Heart of Darkness 3 36 44 8
## 5 Heart of Darkness 4 48 46 -2
## 6 Heart of Darkness 5 60 29 -31
## 7 Heart of Darkness 6 56 24 -32
## 8 Heart of Darkness 7 57 36 -21
## 9 Heart of Darkness 8 48 38 -10
## 10 Heart of Darkness 9 31 34 3
## # ℹ 24 more rows
# chart negative and positive sentiment
ggplot(heart3, aes(index, sentiment, fill = title)) +
geom_col(show.legend = FALSE)
# count the words in the book
count_heart <- heart2 %>%
group_by(chapter) %>%
count(word, sort = TRUE)
count_heart
## # A tibble: 8,379 × 3
## # Groups: chapter [4]
## chapter word n
## <int> <chr> <int>
## 1 1 the 868
## 2 2 the 778
## 3 3 the 646
## 4 1 a 532
## 5 1 of 520
## 6 3 i 456
## 7 3 of 432
## 8 2 of 421
## 9 1 and 392
## 10 1 i 352
## # ℹ 8,369 more rows
# remove stop words
remove_heart <- heart2 %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
remove_heart
## # A tibble: 12,943 × 5
## gutenberg_id title linenumber chapter word
## <int> <chr> <int> <int> <chr>
## 1 526 Heart of Darkness 1 0 note
## 2 526 Heart of Darkness 1 0 etext
## 3 526 Heart of Darkness 1 0 219
## 4 526 Heart of Darkness 1 0 version
## 5 526 Heart of Darkness 1 0 ebook
## 6 526 Heart of Darkness 3 0 heart
## 7 526 Heart of Darkness 3 0 darkness
## 8 526 Heart of Darkness 6 0 joseph
## 9 526 Heart of Darkness 6 0 conrad
## 10 526 Heart of Darkness 12 1 nellie
## # ℹ 12,933 more rows
#count the words in the book after removing stop words
remove_heart %>%
count(word, sort = TRUE)
## # A tibble: 4,978 × 2
## word n
## <chr> <int>
## 1 kurtz 101
## 2 time 78
## 3 river 65
## 4 looked 56
## 5 eyes 49
## 6 station 48
## 7 head 47
## 8 heard 47
## 9 black 43
## 10 manager 42
## # ℹ 4,968 more rows
# get sentiment
heart4 <- remove_heart %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining with `by = join_by(word)`
heart4
## # A tibble: 1,121 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 darkness negative 26
## 2 lost negative 26
## 3 dark negative 25
## 4 dead negative 23
## 5 death negative 19
## 6 slowly negative 18
## 7 devil negative 15
## 8 fool negative 14
## 9 lying negative 14
## 10 doubt negative 13
## # ℹ 1,111 more rows
#chart both negative and positive sentiment
heart4 %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
# Wordcloud postive words
pos <- heart4 %>%
filter(sentiment == "positive")
wordcloud(
words = pos$word,
freq = pos$n,
max.words = 30,
colors = "blue")
## Warning in wordcloud(words = pos$word, freq = pos$n, max.words = 30, colors =
## "blue"): profound could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = pos$word, freq = pos$n, max.words = 30, colors =
## "blue"): remarkable could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = pos$word, freq = pos$n, max.words = 30, colors =
## "blue"): extraordinary could not be fit on page. It will not be plotted.
neg <- heart4 %>%
filter(sentiment == "negative")
# Wordcloud Negative words
wordcloud(
words = neg$word,
freq = neg$n,
max.words = 30,
colors = "blue")
#download the book from gutenberg
gutenberg_metadata
## # A tibble: 72,569 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 1 "The De… Jeffe… 1638 en "Politics/American…
## 2 2 "The Un… Unite… 1 en "Politics/American…
## 3 3 "John F… Kenne… 1666 en ""
## 4 4 "Lincol… Linco… 3 en "US Civil War"
## 5 5 "The Un… Unite… 1 en "United States/Pol…
## 6 6 "Give M… Henry… 4 en "American Revoluti…
## 7 7 "The Ma… <NA> NA en ""
## 8 8 "Abraha… Linco… 3 en "US Civil War"
## 9 9 "Abraha… Linco… 3 en "US Civil War"
## 10 10 "The Ki… <NA> NA en "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
lou <- gutenberg_works(title == "Heart of Darkness") %>%
gutenberg_download(meta_fields = "title")
# Add a column called chapters based on the I,II, III in the book
lou2<- lou %>%
mutate( linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^([\\divxlc])+$", ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
# table with sentiments
lou3 <- lou2 %>%
inner_join(get_sentiments("loughran")) %>%
count(title, index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("loughran")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 285 of `x` matches multiple rows in `y`.
## ℹ Row 2526 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
lou3
## # A tibble: 34 × 8
## title index constraining litigious negative positive uncertainty sentiment
## <chr> <dbl> <int> <int> <int> <int> <int> <int>
## 1 Heart o… 0 3 3 7 12 7 5
## 2 Heart o… 1 0 0 17 7 13 -10
## 3 Heart o… 2 0 1 15 7 9 -8
## 4 Heart o… 3 1 5 12 9 7 -3
## 5 Heart o… 4 0 1 7 16 13 9
## 6 Heart o… 5 0 2 11 4 13 -7
## 7 Heart o… 6 0 6 21 5 12 -16
## 8 Heart o… 7 1 1 22 8 10 -14
## 9 Heart o… 8 0 2 17 9 18 -8
## 10 Heart o… 9 1 0 12 10 15 -2
## # ℹ 24 more rows
#add sentiment to loughran
loughran_lexicon <- get_sentiments("loughran")
# chart negative and positive sentiment
ggplot(lou3, aes(index, sentiment, fill = title)) +
geom_col(show.legend = FALSE)
#custom lexicon words
custom_lexicon <- loughran_lexicon %>%
bind_rows(tribble(~word, ~sentiment,
"black", "negative",
"eyes", "positive"))
# look at new sentiment with customer lexicon
custom_words <- lou2 %>%
inner_join(custom_lexicon) %>%
count(word, sentiment, sort = TRUE)
## Joining with `by = join_by(word)`
## Warning in inner_join(., custom_lexicon): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 285 of `x` matches multiple rows in `y`.
## ℹ Row 2526 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
custom_words
## # A tibble: 421 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 could uncertainty 111
## 2 eyes positive 49
## 3 great positive 46
## 4 black negative 43
## 5 good positive 42
## 6 suddenly uncertainty 35
## 7 perhaps uncertainty 29
## 8 lost negative 26
## 9 sometimes uncertainty 24
## 10 believe uncertainty 22
## # ℹ 411 more rows