library(tidytext)
library(textdata)
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(harrypotter)
library(RCurl)
##
## Attaching package: 'RCurl'
##
## The following object is masked from 'package:tidyr':
##
## complete
# Print the AFINN lexicon: one row per word with a numeric `value` score.
# The lexicon has to be downloaded before first use; when asked, answer 1
# to confirm the download.
get_sentiments("afinn") # Specify 1 to download
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
# Print the Bing lexicon: one row per word with a `sentiment` label
# (the counts further down show the labels are "negative" and "positive").
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# Print the NRC lexicon: `word` / `sentiment` pairs where the same word can
# appear under several labels (e.g. "abandon" is fear, negative and sadness).
# The lexicon has to be downloaded before first use; when asked, answer 1
# to confirm the download.
get_sentiments("nrc") # Specify 1 to download
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
# Print the Loughran lexicon: one `sentiment` label per `word` row.
# The lexicon has to be downloaded before first use; when asked, answer 1
# to confirm the download.
get_sentiments("loughran") # Specify 1 to download
## # A tibble: 4,150 × 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ℹ 4,140 more rows
# Turn the Austen novels into a tidy table with one word per row.
# Line numbers and a running chapter counter are computed per book *before*
# tokenising, so every word remembers the line and chapter it came from.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = seq_along(text),
    # Every line that starts with "chapter" followed by a digit or a roman
    # numeral character (any case) advances the chapter counter by one.
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  # The token column is named `word` so it matches the key column of the
  # sentiment lexicons and can be joined against them directly.
  unnest_tokens(output = word, input = text)
# # # # # # # # # # # # # # # #
# Keep only the NRC lexicon entries labelled "joy".
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Tally how often each joy word occurs in 'Emma', most frequent first.
Emma_joy_words_nrc <- tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`

# Show the ten most frequent joy words.
head(Emma_joy_words_nrc, n = 10)
## # A tibble: 10 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
# Net Bing sentiment (positive minus negative word count) for every
# 80-line section of each Austen novel.
#
# * The join key is spelled out instead of being guessed from column names.
# * The Bing lexicon holds more than one row for some words, so a token can
#   legitimately match several lexicon rows; the relationship is declared
#   explicitly rather than left to trigger dplyr's many-to-many warning.
# * `pivot_wider()` replaces the superseded `spread()`; sections that lack
#   one polarity get a 0 count so the subtraction is always defined.
jane_austen_sentiment <- tidy_books %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

# One panel per novel; the x axis is free because the books differ in length.
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

# Restrict the tokenised corpus to 'Pride & Prejudice' and show it as a
# paged table.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
rmarkdown::paged_table(pride_prejudice)

# AFINN scores words numerically, so the sentiment of an 80-line section is
# simply the sum of the scores of the words found in it.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value), method = "AFINN")
## Joining with `by = join_by(word)`
# Net sentiment of 'Pride & Prejudice' per 80-line section according to the
# Bing and NRC lexicons (positive minus negative word count).
#
# * Join keys are explicit rather than inferred from shared column names.
# * After filtering NRC to its positive/negative labels a word may still
#   match more than one lexicon row, so the many-to-many relationship is
#   declared up front instead of surfacing as a dplyr warning. The same
#   declaration is used for Bing for consistency.
# * `pivot_wider()` replaces the superseded `spread()`; a section with no
#   words of one polarity gets 0 so the subtraction is always defined.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(
      get_sentiments("bing"),
      by = "word",
      relationship = "many-to-many"
    ) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word",
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

# Stack the three methods; the y axis is free because AFINN sums scores
# while the other two count words, so their magnitudes differ.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

# How many NRC entries are labelled positive vs. negative?
count(
  filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
  sentiment
)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# The same tally for the Bing lexicon, for comparison.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
library(gutenbergr)
# Print the Project Gutenberg metadata table; its columns include
# `gutenberg_id`, `title` and `author`.
gutenberg_metadata
## # A tibble: 72,569 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 1 "The De… Jeffe… 1638 en "Politics/American…
## 2 2 "The Un… Unite… 1 en "Politics/American…
## 3 3 "John F… Kenne… 1666 en ""
## 4 4 "Lincol… Linco… 3 en "US Civil War"
## 5 5 "The Un… Unite… 1 en "United States/Pol…
## 6 6 "Give M… Henry… 4 en "American Revoluti…
## 7 7 "The Ma… <NA> NA en ""
## 8 8 "Abraha… Linco… 3 en "US Civil War"
## 9 9 "Abraha… Linco… 3 en "US Civil War"
## 10 10 "The Ki… <NA> NA en "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
# Download Project Gutenberg book 11 ("Alice's Adventures in Wonderland").
# Despite its name, this object holds the book text, one row per line.
count_of_Alice_Wonderland <- gutenberg_download(gutenberg_id = 11)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Quick look at the raw download.
count_of_Alice_Wonderland
## # A tibble: 3,380 × 2
## gutenberg_id text
## <int> <chr>
## 1 11 "[Illustration]"
## 2 11 ""
## 3 11 ""
## 4 11 ""
## 5 11 ""
## 6 11 "Alice’s Adventures in Wonderland"
## 7 11 ""
## 8 11 "by Lewis Carroll"
## 9 11 ""
## 10 11 "THE MILLENNIUM FULCRUM EDITION 3.0"
## # ℹ 3,370 more rows
# Drop blank lines, number the remaining lines, and attach a running
# chapter counter to every line of the book.
#
# The heading pattern is anchored with `^` (as the Austen pattern is).
# Without the anchor, the indented table-of-contents entries
# (" CHAPTER I. Down the Rabbit-Hole", ...) were each counted as the start
# of a chapter, which shifted every real chapter number upwards.
# NOTE(review): this assumes the real chapter headings start in the first
# column - confirm that max(chapter) equals the number of chapters.
Alice_Wonderland_Chapters <- count_of_Alice_Wonderland %>%
  filter(text != "") %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^CHAPTER [\\dIVXLC]", ignore_case = TRUE))
    )
  )
Alice_Wonderland_Chapters
## # A tibble: 2,494 × 4
## gutenberg_id text linenumber chapter
## <int> <chr> <int> <int>
## 1 11 "[Illustration]" 1 0
## 2 11 "Alice’s Adventures in Wonderland" 2 0
## 3 11 "by Lewis Carroll" 3 0
## 4 11 "THE MILLENNIUM FULCRUM EDITION 3.0" 4 0
## 5 11 "Contents" 5 0
## 6 11 " CHAPTER I. Down the Rabbit-Hole" 6 0
## 7 11 " CHAPTER II. The Pool of Tears" 7 0
## 8 11 " CHAPTER III. A Caucus-Race and a Long Ta… 8 0
## 9 11 " CHAPTER IV. The Rabbit Sends in a Littl… 9 0
## 10 11 " CHAPTER V. Advice from a Caterpillar" 10 0
## # ℹ 2,484 more rows
# Ten most frequent Loughran-lexicon words per sentiment category in
# "Alice's Adventures in Wonderland", stored with columns `word`,
# `sentiment` and `Freq`.
#
# * Stop words are removed right after tokenising (as the word-cloud code
#   in this script does). Previously they were removed after the top-10 cut
#   and after `word` had become a factor, so categories could end up with
#   fewer than ten words.
# * A Loughran word may carry several sentiment labels, so the
#   many-to-many join is declared explicitly; join keys are spelled out.
# * `slice_max()` replaces the superseded `top_n()`; ties at the cut-off
#   are still kept.
# * The count column is renamed by name rather than by overwriting all
#   column names positionally.
Alice_Wonderland_tidy <- Alice_Wonderland_Chapters %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(
    get_sentiments("loughran"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # Order the factor levels by frequency so the bars are sorted in the plot.
  mutate(word = reorder(word, n)) %>%
  rename(Freq = n)

# Horizontal bars, one panel per sentiment category.
ggplot(Alice_Wonderland_tidy, aes(x = word, y = Freq, fill = sentiment)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL)

# Frequency of every Bing-lexicon word in the book, split by polarity and
# sorted from most to least frequent.
# The join key and relationship are explicit. `count()` on ungrouped input
# already returns an ungrouped table, so no trailing `ungroup()` is needed.
Alice_Wonderland_Sentiment_total <- Alice_Wonderland_Chapters %>%
  unnest_tokens(word, text) %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE)

# Plot the ten most frequent words per polarity with their counts.
# `slice_max()` replaces the superseded `top_n()` (ties are still kept).
# `hjust` is a constant, so it is set on the layer instead of being mapped
# inside `aes()`.
Alice_Wonderland_Sentiment_total %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n), hjust = 1) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

# Net Bing sentiment (positive minus negative word count) per chapter and
# 80-line section of the book.
# The join key and relationship are explicit, and `pivot_wider()` replaces
# the superseded `spread()`. Sections with no words of one polarity get 0
# so the subtraction is always defined.
Alice_Wonderland_Sentiment <- Alice_Wonderland_Chapters %>%
  unnest_tokens(word, text) %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(chapter, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

# One panel per chapter value; the x axis is free because each chapter
# covers a different range of sections.
ggplot(Alice_Wonderland_Sentiment, aes(index, sentiment, fill = chapter)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~chapter, ncol = 2, scales = "free_x")

# Summed AFINN score per chapter and 80-line section of the book.
# `.groups = "drop"` returns an ungrouped table. Previously the result
# stayed grouped by `index`, which silently affects later dplyr verbs.
# The join key is explicit; no relationship is declared, so dplyr will
# still warn if the lexicon ever contains duplicate words.
Positive_Negative_Count <- Alice_Wonderland_Chapters %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(index = linenumber %/% 80, chapter) %>%
  summarise(sentiment = sum(value), .groups = "drop")

# Bars per chapter, stacked from the sections in it and shaded by section
# index.
Positive_Negative_Count %>%
  ggplot(aes(chapter, sentiment, fill = index)) +
  geom_col()

# Word frequencies for the whole book with stop words removed, most
# frequent first; the token "thomas" is excluded as well.
total_word_count <- Alice_Wonderland_Chapters %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(word) %>%
  arrange(desc(n)) %>%
  filter(word != "thomas")
## Joining with `by = join_by(word)`
# Word cloud of (at most) the 100 most frequent remaining words.
wordcloud(
  words = total_word_count$word,
  freq = total_word_count$n,
  max.words = 100
)
