The goal of this week’s assignment is to practice text processing. First, we reproduce the example code from Chapter 2: Sentiment analysis with tidy data. Then, we choose another text and apply the same code to analyze its sentiment.
library(tidytext)
library(janeaustenr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
# Preview the AFINN lexicon: one row per word with a numeric `value` score.
get_sentiments(lexicon = "afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
# Preview the Bing lexicon: one row per word with a `sentiment` label.
get_sentiments(lexicon = "bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# Preview the NRC lexicon: a word can appear on several rows, one per
# `sentiment` label it carries.
get_sentiments(lexicon = "nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
In order to perform sentiment analysis, the text must be in tidy format:
# Convert the Austen novels to tidy format (one word per row) while recording
# where each word came from.
tidy_books <- austen_books() %>%
  mutate(
    # Line number within each book; `.by = book` restarts the count per book.
    linenumber = row_number(),
    # Running chapter number: incremented on every line that looks like a
    # chapter heading ("chapter" followed by a digit or roman numeral).
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    ),
    .by = book
  ) %>%
  # Split the `text` column into individual words stored in `word`.
  unnest_tokens(output = word, input = text)
library(textdata)
# Keep only the NRC entries labelled "joy".
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Most frequent joy words in "Emma", most common first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
# Net Bing sentiment (positive minus negative word count) for every
# 80-line section of each novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>% # section number within the book
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# One panel per novel; each bar is the net sentiment of one 80-line section.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")
# Restrict the tidy words to a single novel for the lexicon comparison.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ℹ 122,194 more rows
# AFINN: sum the numeric word scores within each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value), method = "AFINN")
## Joining with `by = join_by(word)`
# Bing and NRC: both label words as positive/negative, so stack the two
# joined tables, then compute positive minus negative per 80-line section.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive", "negative"))) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Compare the three lexicons section by section, one panel per method.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
# How many positive vs. negative entries does the NRC lexicon contain?
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# Same question for the Bing lexicon.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# How often each Bing word occurs across all novels, most frequent first.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Display the word/sentiment counts computed above.
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ℹ 2,575 more rows
# Top 10 contributing words for each sentiment, ordered by frequency.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>% # order bars by count
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the 100 most common non-stop-words in the novels.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
## Joining with `by = join_by(word)`
The text I chose for the sentiment analysis is the lyrics of one of Celine Dion’s songs, “That’s The Way It Is”.
First, let’s read the text from my GitHub repository:
# Read the lyrics from the GitHub source, one element per line of the file.
Text_D <- readLines(
  con = "https://raw.githubusercontent.com/SalouaDaouki/Data607/main/Celine%20Dion%3A%20That's%20the%20way%20it%20is"
)
Second, in order to perform sentiment analysis, the text should be tidy. To tidy the text, I first need to convert it to a data frame:
# Wrap the character vector of lyric lines in a one-column tibble.
text_df <- tibble(text = Text_D)
Then, I need to tokenize the text, meaning break it into individual words, to be able to analyze it:
# Tidy the lyrics: one word per row, with stop words dropped.
text_df1 <- text_df %>%
  unnest_tokens(output = word, input = text) %>% # split lines into words
  anti_join(stop_words) # drop words that carry no meaning for the analysis
## Joining with `by = join_by(word)`
After tokenization, the text is a tibble with 1 column and 338 rows (one word per row). After removing the stop words, the number of rows decreases to 76.
Now let’s count the most repeated words in the lyrics:
# Frequency of every remaining word, most frequent first.
text_df1 %>%
  count(word) %>%
  arrange(desc(n))
## # A tibble: 40 × 2
## word n
## <chr> <int>
## 1 faith 9
## 2 love 7
## 3 yeah 5
## 4 doubt 3
## 5 easy 3
## 6 heart's 3
## 7 left 3
## 8 ready 3
## 9 baby 2
## 10 call 2
## # ℹ 30 more rows
We can plot the tibble above and focus only on the words that are repeated more than twice.
# Bar chart of the words that occur more than twice, ordered by frequency.
text_df1 %>%
  count(word) %>%
  arrange(desc(n)) %>%
  filter(n > 2) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  labs(y = NULL)
We can see that the word faith is the most repeated in the song, followed by love.
We can also create a new tibble that has separate columns for the negative and positive counts, as well as the net sentiment of each word:
# Per-word Bing sentiment for the lyrics: positive and negative counts side
# by side, plus their difference.
text_df1_sentiment <- text_df1 %>%
  inner_join(get_sentiments("bing")) %>%
  # Chunk number based on the position among the matched words (76 per chunk).
  mutate(index = row_number() %/% 76) %>%
  count(word, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
To visualize the sentiment of each word, we can plot text_df1_sentiment with geom_col to show how positive or negative each word is:
library(ggplot2)
# One panel per word; the bar height is the word's net Bing sentiment.
text_df1_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = word)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(word), ncol = 2, scales = "free_x")
Based on the plots above, faith and love have the tallest bars on the positive side, which means that they contribute the most positive sentiment in the lyrics.
Now, we can compare the three sentiment lexicons:
# AFINN score for the lyrics: sum of the numeric word scores per chunk of
# 76 matched words.
afinn <- text_df1 %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = row_number() %/% 76) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value), method = "AFINN")
## Joining with `by = join_by(word)`
# Bing and NRC net sentiment (positive minus negative word count) for the
# lyrics.
#
# The chunk index is computed separately for each lexicon, *before* the two
# tables are stacked. Computing row_number() after bind_rows() would number
# the NRC rows starting after the last Bing row, so the NRC chunk boundaries
# would depend on how many Bing matches there happen to be.
bing_and_nrc <- bind_rows(
  text_df1 %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al.",
           index = row_number() %/% 76),
  text_df1 %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive",
                                         "negative"))
    ) %>%
    mutate(method = "NRC",
           index = row_number() %/% 76)) %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 9 of `x` matches multiple rows in `y`.
## ℹ Row 3131 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
Visualization will allow us to compare the three methods easily:
# One panel per lexicon, each with its own y-axis scale.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
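For Celine Dion’s song, all three methods appear to show the same result (or maybe I did something wrong in the code that I couldn’t figure out?). I am sure something is wrong, because the numbers of positive and negative words in each lexicon (according to Tidy Text Mining) are different, so the plots should be different. A likely reason is that the song is so short that each method ends up with essentially a single bar, and because the panels use scales = "free_y", each panel rescales its own y-axis, so the bars look alike even when their values differ.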
# How often each Bing word occurs in the lyrics, most frequent first.
bing_word_counts1 <- text_df1 %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
# Display the lyrics' word/sentiment counts computed above.
bing_word_counts1
## # A tibble: 10 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 faith positive 9
## 2 love positive 7
## 3 doubt negative 3
## 4 easy positive 3
## 5 ready positive 3
## 6 loneliness negative 2
## 7 surrender negative 2
## 8 win positive 2
## 9 sorrow negative 1
## 10 worry negative 1
# Words contributing to each sentiment in the lyrics (up to 10 per
# sentiment), ordered by frequency.
bing_word_counts1 %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)
Based on the plot above, positive words are mentioned more often in the song than negative words (24 occurrences versus 9), even though there are five distinct words on each side.
# Word cloud of up to 40 words from the lyrics.
# Note: text_df1 already had stop words removed when it was built, so this
# anti_join() does not drop anything further.
text_df1 %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 40))
## Joining with `by = join_by(word)`
When I typed “get_sentiments” in the R chunk below, four sentiment lexicons appeared, so I picked loughran because it is the only one I haven’t used yet. Let’s perform an additional sentiment analysis:
First let’s get the sentiment for loughran:
# Preview the loughran lexicon: one row per word/sentiment pair.
get_sentiments(lexicon = "loughran")
## # A tibble: 4,150 × 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ℹ 4,140 more rows
The sentiment of each word using the loughran sentiment lexicon:
# Per-word loughran sentiment for the lyrics. Every sentiment label found
# among the matches becomes its own count column; the net score is
# positive minus negative.
text_df1_sentiment1 <- text_df1 %>%
  inner_join(get_sentiments("loughran")) %>%
  mutate(index = row_number() %/% 76) %>%
  count(word, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("loughran")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 21 of `x` matches multiple rows in `y`.
## ℹ Row 2705 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
To visualize the sentiment of each word, we can plot text_df1_sentiment1 with geom_col to show how positive or negative each word is:
library(ggplot2)
# One panel per word; the bar height is the word's net loughran sentiment.
text_df1_sentiment1 %>%
  ggplot(aes(x = index, y = sentiment, fill = word)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(word), ncol = 2, scales = "free_x")
Based on the plots above, easy and win are the only words whose bars are above zero, which means that they are the only positive words found in the lyrics using the loughran lexicon.
Let’s compare afinn and loughran methods:
# AFINN score for the lyrics: sum of the numeric word scores per chunk.
afinn1 <- text_df1 %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = row_number() %/% 76) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value), method = "AFINN")
## Joining with `by = join_by(word)`
# loughran score for the lyrics.
#
# This previously joined the AFINN lexicon and only *labelled* the result
# "loughran", so it was an exact copy of afinn1. loughran has no numeric
# `value` column; its words carry a `sentiment` label instead, so the net
# score is the number of positive matches minus the number of negative ones.
loughran1 <- text_df1 %>%
  inner_join(get_sentiments("loughran") %>%
               filter(sentiment %in% c("positive", "negative"))) %>%
  group_by(index = row_number() %/% 76) %>%
  # Counting with sum() keeps this working even if one polarity never occurs.
  summarise(sentiment = sum(sentiment == "positive") -
              sum(sentiment == "negative")) %>%
  mutate(method = "loughran")
## Joining with `by = join_by(word)`
# AFINN vs. loughran, one panel per method.
bind_rows(afinn1, loughran1) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
Based on the plot and the data in afinn1 and loughran1, both methods report a sentiment score of 37 for the lyrics.
Let’s compare all four:
# AFINN and loughran scores for the lyrics.
#
# The two lexicons are scored separately before stacking because they have
# different shapes: AFINN gives a numeric `value` per word, loughran gives a
# `sentiment` label. Counting AFINN rows by `sentiment` (a column AFINN does
# not have) would file every AFINN match under NA and yield a score of 0.
# The chunk index is also computed per lexicon, so one method's chunk
# boundaries do not depend on how many rows another method matched.
Afinn_and_loughran <- bind_rows(
  text_df1 %>%
    inner_join(get_sentiments("afinn")) %>%
    mutate(method = "AFINN",
           index = row_number() %/% 76) %>%
    group_by(method, index) %>%
    summarise(sentiment = sum(value), .groups = "drop"),
  text_df1 %>%
    inner_join(get_sentiments("loughran") %>%
                 filter(sentiment %in% c("positive",
                                         "negative"))
    ) %>%
    mutate(method = "loughran",
           index = row_number() %/% 76) %>%
    group_by(method, index) %>%
    # positive matches minus negative matches
    summarise(sentiment = sum(sentiment == "positive") -
                sum(sentiment == "negative"),
              .groups = "drop"))
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
# Bing and NRC scores for the lyrics (positive minus negative word count),
# again indexing each lexicon's matches independently before stacking.
bing_and_nrc1 <- bind_rows(
  text_df1 %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al.",
           index = row_number() %/% 76),
  text_df1 %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive",
                                         "negative"))
    ) %>%
    mutate(method = "NRC",
           index = row_number() %/% 76)) %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 9 of `x` matches multiple rows in `y`.
## ℹ Row 3131 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Plot all four lexicons, one panel each. Afinn_and_loughran already holds
# the AFINN and loughran scores, so afinn1/loughran1 are not added again:
# stacking them on top would count those two methods twice.
bind_rows(Afinn_and_loughran,
          bing_and_nrc1) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
The sentiment score obtained with each method is different: afinn and loughran give 37, while bing gives 15 and nrc only around 9.
Each method gives different results because the number of negative and positive words in each sentiment lexicon is different, and the words themselves are different too. For example, bing detected 10 distinct words in the lyrics, while loughran included only 4 of them plus the word question, which bing did not detect. The most common words using bing were faith and love, but using loughran, the words were easy and win.
Tidy Text Mining: Chapter 1: The tidy Text format (for tidying the text in part II)
Tidy Text Mining: Chapter 2: Sentiment analysis with tidy data