Chapter Summary Example

Source: Text Mining with R by Julia Silge and David Robinson, O’Reilly Media, Inc., June 2017.

Note: this source citation was added on 4/16/21

#install.packages('tidytext')
#install.packages('psych')
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.5
library(dplyr)
library(stringr)
library(tidyr)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.5
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
## 
## Attaching package: 'openintro'
## The following object is masked from 'package:reshape2':
## 
##     tips
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,891 more rows

Joy Words In Emma

a<-austen_books()

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # detect chapter headings such as "Chapter 1" or "CHAPTER IV", and use a
    # cumulative sum to assign a chapter number to every line of text
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # ... with 293 more rows

Jane Austen Sentiment Analysis Throughout Her Novels

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  # count sentiment words within 80-line sections of each book (integer division)
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Comparing the Dictionaries

Here we compare the results of the three lexicons on Pride and Prejudice. All three show roughly the same peaks and dips, but the NRC lexicon skews more positive (a quick numeric check follows the plot below).

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ... with 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
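
As a quick numeric check of the claim that NRC skews more positive, here is a small sketch using the afinn and bing_and_nrc data frames created above:

bind_rows(afinn, bing_and_nrc) %>%
  group_by(method) %>%
  # average net sentiment across the 80-line sections of Pride & Prejudice;
  # a higher mean indicates a lexicon that scores the text more positively
  summarise(sections = n(), mean_sentiment = mean(sentiment))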

Positive and negative words per lexicon

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

Most Common Positive and Negative Words

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ... with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Add a word to the stop-words list

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words
## # A tibble: 1,150 x 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ... with 1,140 more rows

Word Clouds

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): miss could not be fit on page.
## It will not be plotted.

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  # reshape to a word-by-sentiment matrix, which comparison.cloud() expects
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

Looking at Units Beyond Just Words

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25

We will use tidy text analysis to ask which chapters in each of Jane Austen’s novels are the most negative:

  1. Get the list of negative words from the Bing lexicon.
  2. Make a data frame of how many words are in each chapter so we can normalize for the length of chapters.
  3. Find the number of negative words in each chapter and divide by the total words in each chapter.

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 x 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Sentiment Analysis On Broadway

I am analyzing a dataset from Kaggle (https://www.kaggle.com/jessemostipak/broadway-weekly-grosses), which contains a table of synopses and a table of gross receipts for Broadway plays over the past several decades. I will use the package SentimentAnalysis, which contains four dictionaries: Harvard-IV, a general-purpose dictionary developed at Harvard; Henry’s finance-specific dictionary; the Loughran-McDonald dictionary (another finance dictionary); and QDAP, which analyzes discourse. Given the specific foci of the latter three dictionaries, I expect the Harvard dictionary to be the most useful.

The reason I am using the SentimentAnalysis package is that it operates differently from the tidytext approach: it provides summary statistics and weighted scores using a more complex algorithm (one that employs LASSO regularization), which considers not only a term’s presence in a discourse but also its position. This may help us more reliably determine whether an observation is predominantly positive or negative.

library(SentimentAnalysis)
## Warning: package 'SentimentAnalysis' was built under R version 4.0.5
## 
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
## 
##     write
dfGrosses<-read.csv("https://raw.githubusercontent.com/ericonsi/CUNY_607/main/Assigments/Assignment%2010/grosses.csv")
dfSynopses<-read.csv("https://raw.githubusercontent.com/ericonsi/CUNY_607/main/Assigments/Assignment%2010/synopses.csv", fileEncoding = "UTF-8-BOM")
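
To see the kind of output analyzeSentiment() produces, here is a minimal sketch run on two invented sentences (hypothetical text, not drawn from the dataset):

toy <- c("A delightful, joyous celebration of love and friendship.",
         "A grim tragedy of betrayal, loss, and despair.")
toy_sentiment <- analyzeSentiment(toy)
# one score per document per dictionary (GI = Harvard-IV, HE = Henry,
# LM = Loughran-McDonald, QDAP = discourse analysis)
round(toy_sentiment[, c("SentimentGI", "SentimentHE", "SentimentLM", "SentimentQDAP")], 2)
# collapse the numeric Harvard-IV scores into positive/negative labels
convertToBinaryResponse(toy_sentiment$SentimentGI)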

We compare the performance of the four dictionaries on a simple pass-through of the play synopses:

sentiment <- analyzeSentiment(dfSynopses$synopsis)

table(convertToBinaryResponse(sentiment$SentimentGI))
## 
## negative positive 
##      214      731
summary(sentiment$SentimentGI)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.36364  0.00000  0.06667  0.06235  0.13636  0.54545      177
ggplot(sentiment, aes(SentimentGI)) +
  geom_histogram() +
  ggtitle("Harvard-IV")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 177 rows containing non-finite values (stat_bin).

table(convertToBinaryResponse(sentiment$SentimentGI))
## 
## negative positive 
##      214      731
summary(sentiment$SentimentHE)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.11765  0.00000  0.00000  0.00509  0.00000  0.20000      177
ggplot(sentiment, aes(SentimentHE)) +
  geom_histogram() +
   ggtitle("Henry")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 177 rows containing non-finite values (stat_bin).

table(convertToBinaryResponse(sentiment$SentimentGI))
## 
## negative positive 
##      214      731
summary(sentiment$SentimentLM)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.21053 -0.04545  0.00000 -0.01085  0.00000  0.23529      177
ggplot(sentiment, aes(SentimentLM)) +
  geom_histogram() +
   ggtitle("LM")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 177 rows containing non-finite values (stat_bin).

table(convertToBinaryResponse(sentiment$SentimentGI))
## 
## negative positive 
##      214      731
summary(sentiment$SentimentQDAP)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.45455 -0.04348  0.04348  0.03430  0.11111  0.37500      177
ggplot(sentiment, aes(SentimentQDAP)) +
  geom_histogram() +
   ggtitle("QDAP") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 177 rows containing non-finite values (stat_bin).

Not surprisingly, the finance dictionaries find few matches among the words in the play synopses. QDAP does better, and the Harvard analysis has the most matches. The QDAP and Harvard analyses have relatively similar distributions; both find the plays, on average, to be mildly positive.
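
As a rough check on coverage, here is a sketch counting how many synopses receive a nonzero score from each dictionary (a zero can also mean perfectly balanced matches, so this is only a proxy for how many matches each dictionary finds):

sentiment %>%
  summarise(
    Harvard_GI = sum(SentimentGI   != 0, na.rm = TRUE),
    Henry      = sum(SentimentHE   != 0, na.rm = TRUE),
    LM         = sum(SentimentLM   != 0, na.rm = TRUE),
    QDAP       = sum(SentimentQDAP != 0, na.rm = TRUE)
  )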

Next, we compare the Harvard dictionary to the lexicons we used from tidytext.

summary(DictionaryGI)
##          Length Class  Mode     
## negative 2005   -none- character
## positive 1637   -none- character
get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

The Harvard dictionary (DictionaryGI in the R code) has fewer words than the NRC and Bing lexicons. In addition, there is much more parity between positive and negative words in the Harvard dictionary than in the others.
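
To make the parity point concrete, here is a small sketch computing the negative-to-positive ratio for each lexicon, reading the Harvard word lists directly from DictionaryGI and reusing the counts printed above for NRC and Bing:

tibble(
  lexicon  = c("Harvard-IV", "NRC", "Bing"),
  negative = c(length(DictionaryGI$negative), 3324, 4781),
  positive = c(length(DictionaryGI$positive), 2312, 2005)
) %>%
  mutate(neg_to_pos = round(negative / positive, 2))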

Using sentiment analysis to predict show success

Now that we have a simple tool for evaluating synopsis sentiment, we can use it to do some analysis. For example, is there an association between a show’s synopsis sentiment and its gross revenue or seats sold?

The boxplots and scatterplots below suggest there is.

dfSynopses2 <- cbind(dfSynopses, sentiment)

dfTotalSeats <- dfGrosses %>%
  group_by(show) %>%
  summarise(total_seats_sold  = sum(seats_sold), total_revenues = sum(weekly_gross))

dfJoin <- dfSynopses2 %>% 
  inner_join(dfTotalSeats, by="show") 

dfJoin <- na.omit(dfJoin)

dfSeatAnalysis <- dfJoin %>%
  # restrict to shows with fewer than 1,000,000 total seats sold and more
  # than $20,000,000 in total revenues
  filter(total_seats_sold < 1000000) %>%
  filter(total_revenues > 20000000) %>%
  # label each show by the sign of its Harvard-IV (GI) sentiment score
  mutate(Sentiment_Valence = case_when(SentimentGI > 0 ~ 'positive',
                                       SentimentGI <= 0 ~ 'neutral or negative'))
ggplot(dfSeatAnalysis, aes(x =Sentiment_Valence, y=total_seats_sold)) +
  geom_boxplot()

ggplot(dfSeatAnalysis, aes(SentimentGI, total_seats_sold)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(dfSeatAnalysis, aes(x =Sentiment_Valence, y=total_revenues)) +
  geom_boxplot()

ggplot(dfSeatAnalysis, aes(SentimentGI, total_revenues)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

While the data are suggestive that more positive shows do better, regression analysis does not allow us to reject the possibility that the apparent association is due to sampling error (at a 95% confidence level).

m3 <- lm(total_seats_sold ~ SentimentGI, data = dfSeatAnalysis)
summary(m3)
## 
## Call:
## lm(formula = total_seats_sold ~ SentimentGI, data = dfSeatAnalysis)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -487533 -196771  -18893  179514  453624 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   531794      23338  22.787   <2e-16 ***
## SentimentGI   297676     175827   1.693   0.0931 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 229200 on 118 degrees of freedom
## Multiple R-squared:  0.02371,    Adjusted R-squared:  0.01544 
## F-statistic: 2.866 on 1 and 118 DF,  p-value: 0.09309
m <- lm(total_revenues ~ SentimentGI, data = dfSeatAnalysis)
summary(m)
## 
## Call:
## lm(formula = total_revenues ~ SentimentGI, data = dfSeatAnalysis)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -28122826 -16020108  -5858789   7159369  88739954 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41732283    2330535  17.907   <2e-16 ***
## SentimentGI 28257644   17558076   1.609     0.11    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22890000 on 118 degrees of freedom
## Multiple R-squared:  0.02148,    Adjusted R-squared:  0.01319 
## F-statistic:  2.59 on 1 and 118 DF,  p-value: 0.1102
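
As a complementary check, 95% confidence intervals for the SentimentGI slopes (a quick sketch using the models fit above) span zero, consistent with failing to reject the null hypothesis at the 95% level:

confint(m3, "SentimentGI", level = 0.95)
confint(m,  "SentimentGI", level = 0.95)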

Comparing the Harvard results and the Bing results

Here we analyze the synopses dataset with the Bing lexicon to see whether it reaches the same conclusions about the plays’ sentiment.

tidy_x <- dfSynopses %>%
  unnest_tokens(word, synopsis)

bing_word_counts2 <- tidy_x %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
x <- bing_word_counts2 %>%
  # count distinct matched words per sentiment and compute each one's share
  count(sentiment) %>%
  mutate(percentage = n / sum(n))
x
##   sentiment   n percentage
## 1  negative 622  0.6056475
## 2  positive 405  0.3943525

The Bing lexicon comes to the astonishing conclusion that roughly 60% of the distinct sentiment words matched in the Broadway synopses are negative! This is not the Broadway I know.

This word cloud helps us understand why:

tidy_x %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"
## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words = 100):
## unforgettable could not be fit on page. It will not be plotted.

Consider this: a synopsis for a satirical farce about an eccentric millionaire who falls intensely in love with a mysterious stranger has seven negative words and only one positive one. Even “funny” is considered negative here.
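
To see which Bing words drive a score like this, here is a sketch that tokenizes a made-up synopsis (hypothetical text, not the actual Kaggle entry) and joins it against the Bing lexicon:

toy_synopsis <- tibble(text = paste(
  "A satirical farce about an eccentric millionaire who falls intensely",
  "in love with a mysterious stranger. Funny and unexpected."))

toy_synopsis %>%
  unnest_tokens(word, text) %>%
  # list each matched word with the sentiment Bing assigns it
  inner_join(get_sentiments("bing"), by = "word") %>%
  arrange(sentiment, word)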

Removing a handful of the offending words does not help all that much:

bing_word_counts3 <- tidy_x %>%
  filter(word !="falls" & word !="fall" & word!="unexpected" & word!="mysterious" & word!="farce") %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
x1 <- bing_word_counts3 %>%
  count(sentiment) %>%
  mutate(percentage = n / sum(n))
x1
##   sentiment   n percentage
## 1  negative 617  0.6037182
## 2  positive 405  0.3962818

Conclusion

Sentiment analysis is an interesting tool but has its limitations. As the Bing vs Harvard analysis shows, different dictionaries and algorithms can produce different results. As more context-specific dictionaries emerge, analyses will surely improve.