This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidytext)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v forcats 0.5.1
## v readr 2.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(tidytext)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## Recreate Base Analysis
## Re-create and analyze the primary code from the chapter "Sentiment analysis with tidy data" in the textbook "Text Mining with R" by Julia Silge and David Robinson.
## Reference: https://www.tidytextmining.com/sentiment.html
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,875 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,865 more rows
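## Each lexicon encodes sentiment differently: AFINN assigns an integer score from -5 to 5, Bing et al. uses a binary positive/negative label, and NRC tags words with one or more of ten categories (eight emotions plus positive and negative). A minimal sketch of checking one word across all three (the lookup_word helper is ours, for illustration; a word absent from a lexicon simply returns zero rows):
lookup_word <- "abandon" # illustrative helper word, not from the book
get_sentiments("afinn") %>% filter(word == lookup_word)
get_sentiments("bing") %>% filter(word == lookup_word)
get_sentiments("nrc") %>% filter(word == lookup_word)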
## Load the corpus (janeaustenr) and visualization (wordcloud) packages
library(janeaustenr)
library(wordcloud)
## Loading required package: RColorBrewer
## Convert the data to tidy format using the austen_books() function
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
tidy_books
## # A tibble: 725,055 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ... with 725,045 more rows
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
## Most common joy words in "Emma"
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ... with 291 more rows
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
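## The index = linenumber %/% 80 step above bins the text into 80-line sections so net sentiment can be tracked across each narrative. A quick illustration of how integer division assigns lines to bins:
c(1, 79, 80, 159, 160) %/% 80
## [1] 0 0 1 1 2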
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
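## Both lexicons contain more negative than positive words, but the ratio differs; computing it from the counts above makes the comparison concrete:
3318 / 2308 # NRC: about 1.44 negative words per positive word
4781 / 2005 # Bing: about 2.38, a noticeably higher share of negative words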
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
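## The custom_stop_words table above adds "miss" (a frequent but sentiment-neutral honorific in Austen) to the standard stop word list. One way it could be applied before counting (a sketch, not part of the original pipeline):
tidy_books %>%
anti_join(custom_stop_words, by = "word") %>% # drops "miss" along with the standard stop words
count(word, sort = TRUE)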
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
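## acast() from reshape2 pivots the tidy counts into the word-by-sentiment matrix that comparison.cloud() expects. A toy illustration with made-up counts:
toy_counts <- tibble(word = c("good", "bad"), sentiment = c("positive", "negative"), n = c(2, 3))
acast(toy_counts, word ~ sentiment, value.var = "n", fill = 0)
##      negative positive
## bad         3        0
## good        0        2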
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
## Analysis
## Sentiment analysis provides a way to understand the attitudes and opinions expressed in texts. In this section, we explored how to approach sentiment analysis using tidy data principles: when text data is in a tidy data structure, sentiment analysis can be implemented as an inner join.
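## A minimal, self-contained illustration of that inner-join idea, using a made-up three-word text (the toy_text tibble is hypothetical, chosen only to show the mechanics):
toy_text <- tibble(line = 1, word = c("good", "weather", "bad"))
toy_text %>% inner_join(get_sentiments("bing"), by = "word")
## Expected: rows for "good" (positive) and "bad" (negative); tokens absent from the lexicon drop out of the join.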
## Extend the analysis to a new corpus and a new lexicon
## We introduce a corpus based on the book "Narrative of the Life of Frederick Douglass, an American Slave" by Frederick Douglass.
## Reference: https://docsouth.unc.edu/neh/douglass/douglass.html
# Load the gutenbergr package
library(gutenbergr)
# Find the gutenberg_id for the text
gutenberg_metadata %>% filter(author == "Douglass, Frederick")
## # A tibble: 6 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 23 "Narra~ Dougl~ 34510 en Slavery/African~ Public~
## 2 99 "Colle~ Dougl~ 34510 en Slavery/African~ Public~
## 3 202 "My Bo~ Dougl~ 34510 en Slavery/African~ Public~
## 4 6545 "Frede~ Dougl~ 34510 en African America~ Copyri~
## 5 31839 "John ~ Dougl~ 34510 en African America~ Public~
## 6 34915 "Aboli~ Dougl~ 34510 en African America~ Public~
## # ... with 1 more variable: has_text <lgl>
gutenberg_metadata %>%
filter(author == "Douglass, Frederick", title == "Narrative of the Life of Frederick Douglass, an American Slave")
## # A tibble: 1 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 23 Narrat~ Dougla~ 34510 en Slavery/African~ Publi~
## # ... with 1 more variable: has_text <lgl>
count_of_Narrative_of_the_Life <- gutenberg_download(23)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
count_of_Narrative_of_the_Life
## # A tibble: 3,712 x 2
## gutenberg_id text
## <int> <chr>
## 1 23 "Narrative"
## 2 23 "of the"
## 3 23 "Life"
## 4 23 "of"
## 5 23 "FREDERICK DOUGLASS"
## 6 23 ""
## 7 23 "AN"
## 8 23 "AMERICAN SLAVE."
## 9 23 "WRITTEN BY HIMSELF."
## 10 23 ""
## # ... with 3,702 more rows
## Convert the data to tidy format
# Keep rows 663 onward, dropping the header material before the passage analyzed
count_Narrative_of_the_Life <- count_of_Narrative_of_the_Life[663:nrow(count_of_Narrative_of_the_Life), ]
Narrative_of_the_Life_Chapters <- count_Narrative_of_the_Life %>%
filter(text != "") %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("CHAPTER [\\dIVXLC]", ignore_case = TRUE))))
Narrative_of_the_Life_Chapters
## # A tibble: 2,811 x 4
## gutenberg_id text linenumber chapter
## <int> <chr> <int> <int>
## 1 23 "women. I had therefore been, until now, out~ 1 0
## 2 23 "scenes that often occurred on the plantatio~ 2 0
## 3 23 " CHAPTER II" 3 1
## 4 23 "My master’s family consisted of two sons, A~ 4 1
## 5 23 "daughter, Lucretia, and her husband, Captai~ 5 1
## 6 23 "one house, upon the home plantation of Colo~ 6 1
## 7 23 "was Colonel Lloyd’s clerk and superintenden~ 7 1
## 8 23 "called the overseer of the overseers. I spe~ 8 1
## 9 23 "this plantation in my old master’s family. ~ 9 1
## 10 23 "the bloody transaction recorded in the firs~ 10 1
## # ... with 2,801 more rows
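## The chapter counter above relies on str_detect() with a case-insensitive regex. A small check on hypothetical lines shows what does and does not match:
str_detect(c("  CHAPTER II", "chapter 4", "in this chapter we"),
regex("CHAPTER [\\dIVXLC]", ignore_case = TRUE))
## [1]  TRUE  TRUE FALSE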
### Perform sentiment analysis using the Loughran lexicon
### This lexicon labels words with six possible sentiments important in financial contexts: "negative", "positive", "litigious", "uncertainty", "constraining", or "superfluous".
### Reference: https://rdrr.io/cran/textdata/man/lexicon_loughran.html
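## Before joining, it is worth seeing how many words fall under each of the six categories (a quick sketch; exact counts depend on the installed textdata version):
get_sentiments("loughran") %>%
count(sentiment, sort = TRUE)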
Narrative_of_the_Life_tidy <- Narrative_of_the_Life_Chapters %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>% # remove stop words before ranking, so the top 10 per sentiment are all meaningful terms
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>% ungroup() %>% mutate(word = reorder(word, n))
## Joining, by = "word"
## Joining, by = "word"
## Selecting by n
names(Narrative_of_the_Life_tidy) <- c("word", "sentiment", "Freq")
ggplot(data = Narrative_of_the_Life_tidy, aes(x = word, y = Freq, fill = sentiment)) +
geom_col() + coord_flip() + facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment", x = NULL)
### Analysis
### The resulting dataset consists of word, sentiment, and Freq columns.
### Most frequently used positive and negative words
### Below, the most frequently used words for positive and negative sentiments (Bing lexicon):
Narrative_of_the_Life_Sentiment_total <- Narrative_of_the_Life_Chapters %>%
unnest_tokens(word, text) %>% inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
Narrative_of_the_Life_Sentiment_total %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip() +
geom_text(aes(label = n), hjust = 1)
## Selecting by n
## Chapter-wise positive and negative words (Bing lexicon)
Narrative_of_the_Life_Sentiment <- Narrative_of_the_Life_Chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("bing")) %>%
count(chapter, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(Narrative_of_the_Life_Sentiment, aes(index, sentiment, fill = chapter)) +
geom_col(show.legend = FALSE) +
facet_wrap(~chapter, ncol = 2, scales = "free_x")
## AFINN lexicon: see which chapters have more positive words and which have more negative words
Positive_Negative_Count<- Narrative_of_the_Life_Chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80, chapter)%>%
summarise(sentiment = sum(value))
## Joining, by = "word"
## `summarise()` has grouped output by 'index'. You can override using the `.groups` argument.
Positive_Negative_Count%>%
ggplot(aes(chapter, sentiment, fill=index)) +
geom_col()
## Wordcloud
## Most common words in the book
total_word_count <- Narrative_of_the_Life_Chapters %>% unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(word, sort = TRUE) %>% filter(word != "" )
## Joining, by = "word"
total_word_count %>% with(wordcloud(word, n, max.words = 100))
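## One optional tweak, not in the original analysis: wordcloud() places words with some randomness, so fixing the seed makes the figure reproducible across knits.
set.seed(1234) # any fixed seed works
total_word_count %>% with(wordcloud(word, n, max.words = 100))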
## TF-IDF
## The statistic tf-idf is intended to measure how important a word is to a document in a collection (or corpus) of documents.
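## As a concrete sketch of the arithmetic: term frequency (tf) is a word's count divided by the document's total words, inverse document frequency (idf) is ln(documents / documents containing the word), and tf-idf is their product. Reconstructing the "gore" row from the output further below (its tf of 0.00885 implies a chapter of roughly 1,582 words, and its idf of 2.40 implies the word occurs in 1 of 11 chapters; both values are inferred from the table, not computed fresh):
tf <- 14 / 1582 # term frequency within the chapter
idf <- log(11 / 1) # inverse document frequency across chapters
tf * idf # about 0.0212, matching the tf_idf column from bind_tf_idf()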
book_words <- Narrative_of_the_Life_Chapters %>%
unnest_tokens(word, text) %>%
count(chapter, word, sort = TRUE)
total_words <- book_words %>%
group_by(chapter) %>%
dplyr::summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "chapter"
book_words <- book_words %>%
bind_tf_idf(word, chapter, n)
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 9,177 x 6
## chapter word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 0 occurred 1 0.0476 1.70 0.0812
## 2 0 scenes 1 0.0476 1.70 0.0812
## 3 0 often 1 0.0476 0.788 0.0375
## 4 0 therefore 1 0.0476 0.788 0.0375
## 5 0 women 1 0.0476 0.788 0.0375
## 6 0 until 1 0.0476 0.606 0.0289
## 7 0 bloody 1 0.0476 0.452 0.0215
## 8 3 gore 14 0.00885 2.40 0.0212
## 9 0 way 1 0.0476 0.318 0.0152
## 10 6 write 12 0.00484 2.40 0.0116
## # ... with 9,167 more rows
## Plotting the highest tf-idf words per chapter
book_words %>%
arrange(desc(tf_idf)) %>%
group_by(chapter) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(reorder(word, tf_idf), tf_idf, fill = chapter)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~chapter, scales = "free") +
coord_flip()
## Selecting by tf_idf
## Analysis:
## In this assignment, we added a new corpus from the gutenbergr package and applied sentiment analysis to it. From the analysis, we identified the most frequently used positive and negative words and examined sentiment chapter by chapter: Chapters 8 and 9 have more negative words, while Chapters 3 and 4 have more positive words.
Note that the echo = FALSE parameter can be added to a code chunk to prevent printing of the R code that generated a plot.