library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(textdata)
## Warning: package 'textdata' was built under R version 3.6.3
# Preview the AFINN lexicon: each word carries a numeric `value` score.
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
# Preview the Bing lexicon: each word is labelled with a `sentiment` string
# (e.g. "negative").
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
# Preview the NRC lexicon: a word can appear on several rows, one per
# associated `sentiment` category (e.g. "abandon" -> fear, negative, sadness).
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 3.6.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Full citation of the base code:
# The base code is taken from the book 'Text Mining with R', chapter 2 'Sentiment analysis with tidy data', sections 2.1 and 2.2.
# Tokenise the Jane Austen novels into one word per row. Before tokenising,
# tag every source line with its position within its book (`linenumber`) and
# a running chapter counter (`chapter`) that increments on each line matching
# the chapter-heading pattern.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
# Keep only the NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")
# Most frequent "joy" words in Emma: restrict the tokens to that novel, keep
# the words present in the joy lexicon, then tally them in descending order.
count(
  inner_join(filter(tidy_books, book == "Emma"), nrc_joy),
  word,
  sort = TRUE
)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
library(tidyr)
# Net sentiment per 80-line section of each novel: tally the Bing-labelled
# words per (book, section, label), widen to one column per label with
# missing combinations filled as 0, and subtract negative from positive.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)
# One panel per novel (two columns, independent x axes), with a bar per
# section showing its net sentiment.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  geom_col(show.legend = FALSE)

# Preview the Loughran lexicon, for which a reference was found on this page:
# "https://www.datacamp.com/community/tutorials/sentiment-analysis-R#lexiconsandlyrics"
get_sentiments("loughran")
## # A tibble: 4,150 x 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ... with 4,140 more rows
# Selecting a new corpus.
# We use David Robinson’s gutenbergr package to gather the text of Shakespeare's plays.
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 3.6.3
# Look up the Project Gutenberg metadata for works whose author field is
# "Shakespeare, William", then print it.
bardofavon <- gutenberg_works(author == "Shakespeare, William")
bardofavon
## # A tibble: 79 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 1041 Shak~ Shake~ 65 en <NA> Publi~
## 2 1045 Venu~ Shake~ 65 en <NA> Publi~
## 3 1500 King~ Shake~ 65 en <NA> Publi~
## 4 1501 Hist~ Shake~ 65 en <NA> Publi~
## 5 1502 The ~ Shake~ 65 en <NA> Publi~
## 6 1503 The ~ Shake~ 65 en <NA> Publi~
## 7 1504 The ~ Shake~ 65 en <NA> Publi~
## 8 1505 The ~ Shake~ 65 en <NA> Publi~
## 9 1507 The ~ Shake~ 65 en <NA> Publi~
## 10 1508 The ~ Shake~ 65 en <NA> Publi~
## # ... with 69 more rows, and 1 more variable: has_text <lgl>
# Project Gutenberg IDs of the ten plays to analyse. The IDs are pinned
# directly rather than picked by row position in the metadata table
# (previously rows 15, 16, 17, 21, 23, 24, 33, 40, 56 and 58), because row
# positions shift whenever the metadata is updated, which would silently
# select different works.
play_id <- c(
  1513L, 1514L, 1515L, 1519L, 1522L,
  1523L, 1533L, 1540L, 2265L, 2267L
)
play_id
## [1] 1513 1514 1515 1519 1522 1523 1533 1540 2265 2267
# Show the metadata rows for the selected plays (displayed only, not stored).
filter(bardofavon, gutenberg_id %in% play_id)
## # A tibble: 10 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 1513 Rome~ Shake~ 65 en <NA> Publi~
## 2 1514 A Mi~ Shake~ 65 en <NA> Publi~
## 3 1515 The ~ Shake~ 65 en Banned Books fr~ Publi~
## 4 1519 Much~ Shake~ 65 en <NA> Publi~
## 5 1522 Juli~ Shake~ 65 en <NA> Publi~
## 6 1523 As Y~ Shake~ 65 en <NA> Publi~
## 7 1533 Macb~ Shake~ 65 en Opera Publi~
## 8 1540 The ~ Shake~ 65 en <NA> Publi~
## 9 2265 Haml~ Shake~ 65 en Best Books Ever~ Publi~
## 10 2267 Othe~ Shake~ 65 en <NA> Publi~
## # ... with 1 more variable: has_text <lgl>
# Print the full metadata table again; the filter above was not assigned, so
# `bardofavon` still holds every work.
bardofavon
## # A tibble: 79 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 1041 Shak~ Shake~ 65 en <NA> Publi~
## 2 1045 Venu~ Shake~ 65 en <NA> Publi~
## 3 1500 King~ Shake~ 65 en <NA> Publi~
## 4 1501 Hist~ Shake~ 65 en <NA> Publi~
## 5 1502 The ~ Shake~ 65 en <NA> Publi~
## 6 1503 The ~ Shake~ 65 en <NA> Publi~
## 7 1504 The ~ Shake~ 65 en <NA> Publi~
## 8 1505 The ~ Shake~ 65 en <NA> Publi~
## 9 1507 The ~ Shake~ 65 en <NA> Publi~
## 10 1508 The ~ Shake~ 65 en <NA> Publi~
## # ... with 69 more rows, and 1 more variable: has_text <lgl>
# Download the text of the selected plays into a tibble, attaching each
# work's title as an extra column.
bard_plays <- gutenberg_download(play_id, meta_fields = "title")
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Inspect the download: one row per line of text, with its work ID and title.
bard_plays
## # A tibble: 44,450 x 3
## gutenberg_id text title
## <int> <chr> <chr>
## 1 1513 "ROMEO AND JULIET" Romeo and Juliet
## 2 1513 "" Romeo and Juliet
## 3 1513 "by William Shakespeare" Romeo and Juliet
## 4 1513 "" Romeo and Juliet
## 5 1513 "" Romeo and Juliet
## 6 1513 "" Romeo and Juliet
## 7 1513 "" Romeo and Juliet
## 8 1513 "PERSONS REPRESENTED" Romeo and Juliet
## 9 1513 "" Romeo and Juliet
## 10 1513 "Escalus, Prince of Verona." Romeo and Juliet
## # ... with 44,440 more rows
# Analyse the sentiment of the plays with the Loughran lexicon: number the
# lines within each play, split the text into one word per row, and keep
# only the words found in the lexicon (one row per word/label match).
# The per-title grouping is needed only for the line numbering, so it is
# dropped straight afterwards instead of leaking a grouped tibble into later
# steps; the join key is spelled out rather than inferred from shared names.
plays_senti <- bard_plays %>%
  group_by(title) %>%
  mutate(line = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("loughran"), by = "word")
## Joining, by = "word"
# Inspect the matched words and their Loughran sentiment labels.
plays_senti
## # A tibble: 7,603 x 5
## # Groups: title [10]
## gutenberg_id title line word sentiment
## <int> <chr> <int> <chr> <chr>
## 1 1513 Romeo and Juliet 12 variance uncertainty
## 2 1513 Romeo and Juliet 42 greater positive
## 3 1513 Romeo and Juliet 52 break negative
## 4 1513 Romeo and Juliet 60 could uncertainty
## 5 1513 Romeo and Juliet 63 shall litigious
## 6 1513 Romeo and Juliet 63 miss negative
## 7 1513 Romeo and Juliet 63 shall litigious
## 8 1513 Romeo and Juliet 98 shall litigious
## 9 1513 Romeo and Juliet 102 weak negative
## 10 1513 Romeo and Juliet 102 weakest negative
## # ... with 7,593 more rows
# Plot how sentiment evolves through each of the selected plays
# (Loughran lexicon).
plays_senti %>%
  # tally every sentiment label within each 100-line chunk of a play
  count(title, index = line %/% 100, sentiment) %>%
  # one column per sentiment label; a label absent from a chunk counts as 0
  spread(key = sentiment, value = n, fill = 0) %>%
  # net score of the chunk: positive words minus negative words
  mutate(sentiment = positive - negative) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = sentiment)) +
  facet_wrap(~title, scales = "free_x") +
  geom_col(show.legend = FALSE)

# Having read most of the plays selected for the analysis, I find that the sentiment score evolution does not tie up well with my perception of a few of them. For example, 'A Midsummer Night's Dream', 'As You Like It' and 'Much Ado about Nothing' are widely accepted as comedies, yet their sentiment scores sit mostly in the negative zone. Likewise, 'Merchant of Venice' and 'The Tempest' both have happy endings, so such negative scores are not expected.
# Repeat the analysis with a different lexicon (NRC): number the lines
# within each play, split the text into one word per row, and keep only the
# words found in the lexicon (one row per word/label match).
# The per-title grouping is needed only for the line numbering, so it is
# dropped straight afterwards instead of leaking a grouped tibble into later
# steps; the join key is spelled out rather than inferred from shared names.
plays_senti1 <- bard_plays %>%
  group_by(title) %>%
  mutate(line = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("nrc"), by = "word")
## Joining, by = "word"
# Inspect the matched words and their NRC sentiment labels.
plays_senti1
## # A tibble: 67,538 x 5
## # Groups: title [10]
## gutenberg_id title line word sentiment
## <int> <chr> <int> <chr> <chr>
## 1 1513 Romeo and Juliet 8 represented positive
## 2 1513 Romeo and Juliet 10 prince positive
## 3 1513 Romeo and Juliet 11 young anticipation
## 4 1513 Romeo and Juliet 11 young joy
## 5 1513 Romeo and Juliet 11 young positive
## 6 1513 Romeo and Juliet 11 young surprise
## 7 1513 Romeo and Juliet 11 nobleman positive
## 8 1513 Romeo and Juliet 11 nobleman trust
## 9 1513 Romeo and Juliet 11 prince positive
## 10 1513 Romeo and Juliet 16 prince positive
## # ... with 67,528 more rows
# Plot how sentiment evolves through each of the selected plays
# (NRC lexicon).
plays_senti1 %>%
  # tally every sentiment label within each 100-line chunk of a play
  count(title, index = line %/% 100, sentiment) %>%
  # one column per sentiment label; a label absent from a chunk counts as 0
  spread(key = sentiment, value = n, fill = 0) %>%
  # net score of the chunk: positive words minus negative words
  mutate(sentiment = positive - negative) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = sentiment)) +
  facet_wrap(~title, scales = "free_x") +
  geom_col(show.legend = FALSE)

# There is a significant change in the sentiment evolution scores for plays such as 'A Midsummer Night's Dream', 'As You Like It' and 'Much Ado about Nothing', which are widely accepted as comedies. The sentiments are now in the net positive domain, which is in line with the nature of these plays. A point to note is that Macbeth, a world-renowned tragedy, tends to maintain a sombre sentiment throughout, with extreme negative emotions seen towards the end of the play.