\(~\)
\(~\)
\(~\)
\(~\)
# load libraries
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("tidytext")
library("janeaustenr")
library("tidyr")
library("dplyr")
library("stringr")
library("ggplot2")
\(~\)
The sentiments datasets
# get sentiments for afinn, bing and nrc
# AFINN lexicon: one row per word with a numeric score in `value`.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
# Bing lexicon: one row per word with a label in `sentiment`.
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
# NRC lexicon: a word can appear on several rows, one per `sentiment` label.
get_sentiments("nrc")
## # A tibble: 13,875 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,865 more rows
\(~\)
# get data tidied
# Tokenise the Austen novels into one word per row. Before tokenising, each
# line of text is tagged with its row number within its book and a running
# chapter count.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # cumsum() over the match flags bumps the counter on every line that
    # matches the chapter-heading pattern (case-insensitive).
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# sentiment analysis
# Keep only the NRC entries labelled "joy".
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Count how often each joy word occurs in Emma, most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
# Positive and Negative sentiments
# Net Bing sentiment per book and per 80-line section: count the positive and
# negative words in each section, spread them into two columns (0 when a
# section has none of one kind), then take positive minus negative.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# Plot
# One panel per book, two panels per row; bars show the net sentiment of
# each section along that book's own x axis.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
\(~\)
# Filter book Pride and Prejudice
# Keep only the tokens that belong to Pride & Prejudice.
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
# AFINN: sum the numeric word scores within each 80-line section and label
# the result so it can be stacked with the other lexicons.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
# Bing and NRC: tag the matched words with the lexicon they came from, stack
# both sets, then compute positive minus negative per lexicon and 80-line
# section. NRC is restricted to its "positive"/"negative" labels first.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
      filter(sentiment %in% c(
        "positive",
        "negative"
      ))) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Stack the three lexicon results and draw one panel per method, each with
# its own y scale.
bind_rows(
  afinn,
  bing_and_nrc
) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
# Number of NRC entries labelled positive vs. negative.
get_sentiments("nrc") %>%
  filter(sentiment %in% c(
    "positive",
    "negative"
  )) %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
# Number of Bing entries per sentiment label.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
\(~\)
# Count how often each Bing-matched word occurs across all books, together
# with its sentiment label, most frequent first.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
# Plot word counts
# Top 10 words per sentiment (top_n() picks the last column, n), ordered by
# count and drawn as horizontal bars, one panel per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()
## Selecting by n
# Stop words
# Put "miss" (tagged with lexicon "custom") in front of the standard
# stop-word list.
custom_stop_words <- bind_rows(
  tibble(
    word = c("miss"),
    lexicon = c("custom")
  ),
  stop_words
)

custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
\(~\)
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of up to 100 words from all books after dropping stop words.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Comparison cloud: cast the word/sentiment counts into a word-by-sentiment
# matrix (0 where a word lacks a sentiment) and pass it to comparison.cloud().
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"
\(~\)
# Tokenise Pride & Prejudice into sentences instead of words.
PandP_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")

# Show the second sentence.
PandP_sentences$sentence[2]
## [1] "by jane austen"
# Split each book's text at the chapter-heading regex, giving one row per
# resulting piece.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of pieces per book.
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
# Negative words from the Bing lexicon.
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

# Total number of words in each chapter of each book.
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
# For every chapter, compute the share of words that are in bingnegative.
# Chapter 0 is dropped. summarize() leaves the result grouped by book, so
# top_n(1) keeps the chapter with the highest ratio (the last column) in each
# book.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## Selecting by ratio
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
\(~\)
The loughran lexicon, found on rdocumentation.org
\(~\)
\(~\)
library(devtools)
## Loading required package: usethis
install_github("EmilHvitfeldt/hcandersenr")
## Skipping install of 'hcandersenr' from a github remote, the SHA1 (a40ebfba) has not changed since last install.
## Use `force = TRUE` to force installation
library(hcandersenr)
# install.packages("hcandersenr")
library("hcandersenr")
# Sentiment loughran
get_sentiments("loughran")
## # A tibble: 4,150 × 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # … with 4,140 more rows
\(~\)
# Code was taken from https://github.com/EmilHvitfeldt/hcandersenr
# Plot the distinct book/language pairs as a raster, with the language
# levels put in a fixed order and the x axis drawn on top.
# The releveled factor overwrites `language` (the original stored it in a
# misspelled `langauge` column, which then showed up as the axis title).
hca_fairytales() %>%
  select(book, language) %>%
  unique() %>%
  mutate(language = fct_relevel(
    language,
    c("English", "Spanish", "German", "Danish", "French")
  )) %>%
  top_n(60, book) %>%
  ggplot(aes(language, book)) +
  geom_raster(alpha = 0.3) +
  scale_x_discrete(position = "top")
# Tokenise the fairy tales into one word per row.
tidy_books_hca <- hca_fairytales() %>%
  group_by(book) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# Filter by Thumbelina book and by language
thumbelina <- tidy_books_hca %>%
  filter(book == "Thumbelina") %>%
  filter(language == "English")

thumbelina
## # A tibble: 4,381 × 3
## book language word
## <chr> <chr> <chr>
## 1 Thumbelina English there
## 2 Thumbelina English was
## 3 Thumbelina English once
## 4 Thumbelina English a
## 5 Thumbelina English woman
## 6 Thumbelina English who
## 7 Thumbelina English wished
## 8 Thumbelina English very
## 9 Thumbelina English much
## 10 Thumbelina English to
## # … with 4,371 more rows
\(~\)
AFINN and LOUGHRAN
# AFINN: sum of the scores of every matched word in the English Thumbelina.
afinn_hca <- thumbelina %>%
  inner_join(get_sentiments("afinn")) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
# Loughran: number of matched words per sentiment category, largest first.
loughran_hca <- thumbelina %>%
  inner_join(get_sentiments("loughran")) %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE)
## Joining, by = "word"
loughran_hca
## # A tibble: 5 × 2
## sentiment n
## <chr> <int>
## 1 positive 45
## 2 uncertainty 37
## 3 negative 30
## 4 litigious 6
## 5 constraining 3
# Loughran: how often each matched word occurs, with its sentiment category,
# most frequent first.
loughran_hca_word_counts <- thumbelina %>%
  inner_join(get_sentiments("loughran")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
loughran_hca_word_counts
## # A tibble: 53 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 could uncertainty 20
## 2 beautiful positive 16
## 3 poor negative 11
## 4 might uncertainty 6
## 5 shall litigious 5
## 6 good positive 4
## 7 pleased positive 3
## 8 appeared uncertainty 2
## 9 beautifully positive 2
## 10 broken negative 2
## # … with 43 more rows
# Up to 10 highest-count words per Loughran category (ties are kept),
# ordered by count, one panel per category.
loughran_hca_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )
# Put "the" (tagged with lexicon "custom") in front of the standard
# stop-word list.
hca_custom_stop_words <- bind_rows(
  tibble(
    word = c("the"),
    lexicon = c("custom")
  ),
  stop_words
)

hca_custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 the custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
\(~\)
# Word cloud of up to 100 words from the English Thumbelina after dropping
# stop words.
tidy_books_hca %>%
  filter(book == "Thumbelina") %>%
  filter(language == "English") %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
# Comparison cloud for the English Thumbelina: cast the Bing word/sentiment
# counts into a word-by-sentiment matrix (0 where a word lacks a sentiment).
tidy_books_hca %>%
  filter(book == "Thumbelina") %>%
  inner_join(get_sentiments("bing")) %>%
  filter(language == "English") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"
\(~\)
\(~\)
\(~\)
Silge, J., & Robinson, D. (n.d.). 2 Sentiment analysis with tidy data. Text Mining with R. Retrieved October 25, 2021, from https://www.tidytextmining.com/sentiment.html.
Get_sentiments: Get a tidy data frame of a single sentiment lexicon. RDocumentation. (n.d.). Retrieved October 28, 2021, from https://www.rdocumentation.org/packages/tidytext/versions/0.2.3/topics/get_sentiments.
EmilHvitfeldt. (n.d.). Emilhvitfeldt/hcandersenr: An R package for H.C. Andersens Fairy tales. GitHub. Retrieved October 29, 2021, from https://github.com/EmilHvitfeldt/hcandersenr.