DATA 607 - Week 10

library(tidytext)

## Warning: package 'tidytext' was built under R version 3.6.3

library(textdata)

## Warning: package 'textdata' was built under R version 3.6.3

# Preview the AFINN lexicon: one row per word with a numeric score (`value`).
get_sentiments("afinn")

## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows

# Preview the Bing lexicon: one row per word with a categorical `sentiment`
# label (the first rows shown are all "negative").
get_sentiments("bing")

## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows

# Preview the NRC lexicon: a word can appear on several rows, one for each
# `sentiment` category it is tagged with (e.g. "abandon" is tagged fear,
# negative and sadness).
get_sentiments("nrc")

## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,891 more rows

library(janeaustenr)

## Warning: package 'janeaustenr' was built under R version 3.6.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
#Please find below the full citation of the base code:
  
#The base code has been taken from the book - 'Text Mining with R', Chapter '2 Sentiment analysis with tidy data', sections 2.1 and 2.2

# Tokenise the Austen novels into one word per row. Before tokenising, each
# line of text is tagged with its position inside its book (`linenumber`) and
# with a running chapter count (`chapter`).
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # A line that starts with "chapter" followed by a digit or one of the
    # letters i, v, x, l, c opens a new chapter; the cumulative sum of those
    # matches numbers the chapters within each book.
    chapter = cumsum(
      str_detect(
        text,
        regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# Keep only the NRC lexicon entries tagged "joy". This filters the lexicon,
# not the corpus.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Count how often each joy word occurs in "Emma", most frequent first.
# The join key is named explicitly so the intent is documented and dplyr does
# not have to guess it (which is what triggers the "Joining, by = ..." message).
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)

## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # ... with 293 more rows

library(tidyr)

# Net Bing sentiment (positive minus negative word count) for every 80-line
# section of each novel.
jane_austen_sentiment <- tidy_books %>%
  # Explicit key: match corpus words to lexicon words.
  inner_join(get_sentiments("bing"), by = "word") %>%
  # `index` is the 80-line section a word falls in.
  count(book, index = linenumber %/% 80, sentiment) %>%
  # One column per sentiment label; sections with no words for a label get 0.
  # pivot_wider() replaces the superseded spread().
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  mutate(sentiment = positive - negative)

library(ggplot2)

# Bar chart of net sentiment against section index, one panel per novel laid
# out two panels per row. Each panel keeps its own x-axis range, and the
# legend is hidden.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = ~book, ncol = 2, scales = "free_x")

# Using the Loughran lexicon, which I found referenced on this page: "https://www.datacamp.com/community/tutorials/sentiment-analysis-R#lexiconsandlyrics"

# Preview the Loughran lexicon: one row per word with a `sentiment` label.
get_sentiments("loughran")

## # A tibble: 4,150 x 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # ... with 4,140 more rows

# Selecting a new corpus
# We use David Robinson’s gutenbergr package to gather the text of Shakespeare's plays.

library(gutenbergr)

## Warning: package 'gutenbergr' was built under R version 3.6.3

# Look up the catalogue entries whose author is "Shakespeare, William" and
# print them so the works of interest can be picked out.
bardofavon <- gutenberg_works(author == "Shakespeare, William")
bardofavon

## # A tibble: 79 x 8
##    gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
##           <int> <chr> <chr>             <int> <chr>    <chr>            <chr> 
##  1         1041 Shak~ Shake~               65 en       <NA>             Publi~
##  2         1045 Venu~ Shake~               65 en       <NA>             Publi~
##  3         1500 King~ Shake~               65 en       <NA>             Publi~
##  4         1501 Hist~ Shake~               65 en       <NA>             Publi~
##  5         1502 The ~ Shake~               65 en       <NA>             Publi~
##  6         1503 The ~ Shake~               65 en       <NA>             Publi~
##  7         1504 The ~ Shake~               65 en       <NA>             Publi~
##  8         1505 The ~ Shake~               65 en       <NA>             Publi~
##  9         1507 The ~ Shake~               65 en       <NA>             Publi~
## 10         1508 The ~ Shake~               65 en       <NA>             Publi~
## # ... with 69 more rows, and 1 more variable: has_text <lgl>

# Gutenberg IDs of the ten plays to analyse.
# They are listed explicitly instead of being picked by row position in
# `bardofavon`: row positions are not a stable key, so a change in the
# catalogue's row order would silently select different works.
# Titles in the comments are taken from the (truncated) catalogue printout;
# where the truncation is ambiguous the title is marked as presumed.
play_id <- c(
  1513L, # Romeo and Juliet
  1514L, # "A Mi~" - presumably A Midsummer Night's Dream
  1515L, # "The ~" - presumably The Merchant of Venice (TODO confirm)
  1519L, # "Much~" - presumably Much Ado About Nothing
  1522L, # "Juli~" - presumably Julius Caesar
  1523L, # "As Y~" - presumably As You Like It
  1533L, # "Macb~" - presumably Macbeth
  1540L, # "The ~" - presumably The Tempest (TODO confirm)
  2265L, # "Haml~" - presumably Hamlet
  2267L  # "Othe~" - presumably Othello
)
play_id

##  [1] 1513 1514 1515 1519 1522 1523 1533 1540 2265 2267

# Show the catalogue rows whose ID is in `play_id`, as a check on the
# selection.
bardofavon %>% filter(gutenberg_id %in% play_id)

## # A tibble: 10 x 8
##    gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
##           <int> <chr> <chr>             <int> <chr>    <chr>            <chr> 
##  1         1513 Rome~ Shake~               65 en       <NA>             Publi~
##  2         1514 A Mi~ Shake~               65 en       <NA>             Publi~
##  3         1515 The ~ Shake~               65 en       Banned Books fr~ Publi~
##  4         1519 Much~ Shake~               65 en       <NA>             Publi~
##  5         1522 Juli~ Shake~               65 en       <NA>             Publi~
##  6         1523 As Y~ Shake~               65 en       <NA>             Publi~
##  7         1533 Macb~ Shake~               65 en       Opera            Publi~
##  8         1540 The ~ Shake~               65 en       <NA>             Publi~
##  9         2265 Haml~ Shake~               65 en       Best Books Ever~ Publi~
## 10         2267 Othe~ Shake~               65 en       <NA>             Publi~
## # ... with 1 more variable: has_text <lgl>

# Print the full catalogue tibble again (it has not been modified since it
# was created).
bardofavon

## # A tibble: 79 x 8
##    gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
##           <int> <chr> <chr>             <int> <chr>    <chr>            <chr> 
##  1         1041 Shak~ Shake~               65 en       <NA>             Publi~
##  2         1045 Venu~ Shake~               65 en       <NA>             Publi~
##  3         1500 King~ Shake~               65 en       <NA>             Publi~
##  4         1501 Hist~ Shake~               65 en       <NA>             Publi~
##  5         1502 The ~ Shake~               65 en       <NA>             Publi~
##  6         1503 The ~ Shake~               65 en       <NA>             Publi~
##  7         1504 The ~ Shake~               65 en       <NA>             Publi~
##  8         1505 The ~ Shake~               65 en       <NA>             Publi~
##  9         1507 The ~ Shake~               65 en       <NA>             Publi~
## 10         1508 The ~ Shake~               65 en       <NA>             Publi~
## # ... with 69 more rows, and 1 more variable: has_text <lgl>

# Download the text of the selected plays into a tibble. `meta_fields` adds
# each work's title as an extra column next to `gutenberg_id` and `text`.
bard_plays <- gutenberg_download(play_id, meta_fields = "title")

## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

# Preview the downloaded text (one row per line of text).
bard_plays

## # A tibble: 44,450 x 3
##    gutenberg_id text                         title           
##           <int> <chr>                        <chr>           
##  1         1513 "ROMEO AND JULIET"           Romeo and Juliet
##  2         1513 ""                           Romeo and Juliet
##  3         1513 "by William Shakespeare"     Romeo and Juliet
##  4         1513 ""                           Romeo and Juliet
##  5         1513 ""                           Romeo and Juliet
##  6         1513 ""                           Romeo and Juliet
##  7         1513 ""                           Romeo and Juliet
##  8         1513 "PERSONS REPRESENTED"        Romeo and Juliet
##  9         1513 ""                           Romeo and Juliet
## 10         1513 "Escalus, Prince of Verona." Romeo and Juliet
## # ... with 44,440 more rows

# Analyse the sentiment of the selected plays with the Loughran lexicon.
# Each word is kept together with `line`, the row number of its source line
# within its play, and the Loughran category it matched.
plays_senti <- bard_plays %>%
  group_by(title) %>%
  mutate(line = row_number()) %>%
  # Grouping is only needed for the per-play line numbers; drop it so that it
  # does not leak into later steps.
  ungroup() %>%
  unnest_tokens(word, text) %>%
  # Explicit join key instead of letting dplyr guess it.
  inner_join(get_sentiments("loughran"), by = "word")

# Preview the words that matched the Loughran lexicon.
plays_senti

## # A tibble: 7,603 x 5
## # Groups:   title [10]
##    gutenberg_id title             line word     sentiment  
##           <int> <chr>            <int> <chr>    <chr>      
##  1         1513 Romeo and Juliet    12 variance uncertainty
##  2         1513 Romeo and Juliet    42 greater  positive   
##  3         1513 Romeo and Juliet    52 break    negative   
##  4         1513 Romeo and Juliet    60 could    uncertainty
##  5         1513 Romeo and Juliet    63 shall    litigious  
##  6         1513 Romeo and Juliet    63 miss     negative   
##  7         1513 Romeo and Juliet    63 shall    litigious  
##  8         1513 Romeo and Juliet    98 shall    litigious  
##  9         1513 Romeo and Juliet   102 weak     negative   
## 10         1513 Romeo and Juliet   102 weakest  negative   
## # ... with 7,593 more rows

# Plot how net sentiment (positive minus negative word count) evolves through
# each of the selected plays.
plays_senti %>%
  # Drop any grouping carried by the input so the tallies depend only on the
  # columns named in count().
  ungroup() %>%
  # Tally the words in every sentiment category for each 100-line section.
  count(title, index = line %/% 100, sentiment) %>%
  # One column per category; sections with no words in a category get 0.
  # pivot_wider() replaces the superseded spread().
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  # Only the positive and negative columns feed the net score.
  mutate(sentiment = positive - negative) %>%
  ggplot(aes(index, sentiment, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, scales = "free_x")

# Having read most of the plays selected for the analysis, I find that the evolution of the sentiment scores does not match my perception of several of them. For example, 'A Midsummer Night's Dream', 'As You Like It' and 'Much Ado About Nothing' are widely regarded as comedies, yet their sentiment scores sit mostly in the negative zone. Likewise, 'The Merchant of Venice' and 'The Tempest' both have happy endings, so such negative scores are not expected.

# Repeat the analysis with a different lexicon: NRC.
# Each word is kept together with `line`, the row number of its source line
# within its play, and every NRC category it matched.
plays_senti1 <- bard_plays %>%
  group_by(title) %>%
  mutate(line = row_number()) %>%
  # Grouping is only needed for the per-play line numbers; drop it so that it
  # does not leak into later steps.
  ungroup() %>%
  unnest_tokens(word, text) %>%
  # Explicit join key instead of letting dplyr guess it.
  inner_join(get_sentiments("nrc"), by = "word")

# Preview the words that matched the NRC lexicon.
plays_senti1

## # A tibble: 67,538 x 5
## # Groups:   title [10]
##    gutenberg_id title             line word        sentiment   
##           <int> <chr>            <int> <chr>       <chr>       
##  1         1513 Romeo and Juliet     8 represented positive    
##  2         1513 Romeo and Juliet    10 prince      positive    
##  3         1513 Romeo and Juliet    11 young       anticipation
##  4         1513 Romeo and Juliet    11 young       joy         
##  5         1513 Romeo and Juliet    11 young       positive    
##  6         1513 Romeo and Juliet    11 young       surprise    
##  7         1513 Romeo and Juliet    11 nobleman    positive    
##  8         1513 Romeo and Juliet    11 nobleman    trust       
##  9         1513 Romeo and Juliet    11 prince      positive    
## 10         1513 Romeo and Juliet    16 prince      positive    
## # ... with 67,528 more rows

# Plot how net NRC sentiment (positive minus negative word count) evolves
# through each of the selected plays.
plays_senti1 %>%
  # Drop any grouping carried by the input so the tallies depend only on the
  # columns named in count().
  ungroup() %>%
  # Tally the words in every sentiment category for each 100-line section.
  count(title, index = line %/% 100, sentiment) %>%
  # One column per category; sections with no words in a category get 0.
  # pivot_wider() replaces the superseded spread().
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  # Only the positive and negative columns feed the net score.
  mutate(sentiment = positive - negative) %>%
  ggplot(aes(index, sentiment, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~title, scales = "free_x")

# There is a significant change in the sentiment scores for 'A Midsummer Night's Dream', 'As You Like It' and 'Much Ado About Nothing', which are widely regarded as comedies: their sentiment is now net positive, in line with the nature of these plays. Note also that Macbeth, a world-renowned tragedy, maintains a sombre sentiment throughout, with extreme negative emotions towards the end of the play.

DATA 607 - Week 10

Chitrarth Kaushik

4/5/2020