Source: Text Mining with R by Julia Silge and David Robinson (O'Reilly Media, June 2017).
Note: this source citation was added on 4/16/21
#install.packages('tidytext')
#install.packages('psych')
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.5
library(dplyr)
library(stringr)
library(tidyr)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.5
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
##
## Attaching package: 'openintro'
## The following object is masked from 'package:reshape2':
##
## tips
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
a <- austen_books()
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
Comparing the three lexicons on Pride & Prejudice below, we see the same peaks and dips, but the NRC lexicon skews more positive.
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ... with 1,140 more rows
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): miss could not be fit on page.
## It will not be plotted.
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
p_and_p_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
I am analyzing a dataset from Kaggle (https://www.kaggle.com/jessemostipak/broadway-weekly-grosses) which has a table of synopses and a table of gross receipts for Broadway plays over the past several decades. I will use the package SentimentAnalysis, which ships with four dictionaries: Harvard-IV, a general-purpose dictionary developed at Harvard; Henry's finance-specific dictionary; the Loughran-McDonald dictionary (another finance dictionary); and QDAP, a dictionary for discourse analysis. Given the specific foci of the latter three, I expect the Harvard dictionary to be the most useful here.
I am using the SentimentAnalysis package because it operates differently from tidytext: rather than tokenizing and joining word by word, it scores each document as a whole against several dictionaries at once, and it also supports weighted dictionaries estimated via LASSO regularization. This may help us more reliably determine whether a synopsis is predominantly positive or negative.
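As a minimal sketch of how the package scores documents (the two one-line "synopses" below are invented for illustration, not taken from the Broadway data): analyzeSentiment() returns one row per document with a score under each dictionary, and convertToBinaryResponse() collapses a continuous score into a positive/negative label. The weighted, LASSO-based route goes through the package's generateDictionary() function, which estimates word weights from a numeric response; that route is not used in this analysis.
library(SentimentAnalysis)
# Two invented one-line synopses, purely for illustration.
toy <- c("A joyful, triumphant love story full of hope",
         "A bleak tragedy of loss, betrayal, and revenge")
scores <- analyzeSentiment(toy)
# SentimentGI, SentimentHE, SentimentLM, and SentimentQDAP hold the score
# for each document under each dictionary.
scores[, c("SentimentGI", "SentimentHE", "SentimentLM", "SentimentQDAP")]
# Collapse the Harvard-IV score into a positive/negative label.
convertToBinaryResponse(scores$SentimentGI)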
library(SentimentAnalysis)
## Warning: package 'SentimentAnalysis' was built under R version 4.0.5
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
dfGrosses <- read.csv("https://raw.githubusercontent.com/ericonsi/CUNY_607/main/Assigments/Assignment%2010/grosses.csv")
dfSynopses <- read.csv("https://raw.githubusercontent.com/ericonsi/CUNY_607/main/Assigments/Assignment%2010/synopses.csv", fileEncoding = "UTF-8-BOM")
sentiment <- analyzeSentiment(dfSynopses$synopsis)
table(convertToBinaryResponse(sentiment$SentimentGI))
##
## negative positive
## 214 731
summary(sentiment$SentimentGI)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.36364 0.00000 0.06667 0.06235 0.13636 0.54545 177
ggplot(sentiment, aes(SentimentGI)) +
geom_histogram() +
ggtitle("Harvard-IV")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 177 rows containing non-finite values (stat_bin).
summary(sentiment$SentimentHE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.11765 0.00000 0.00000 0.00509 0.00000 0.20000 177
ggplot(sentiment, aes(SentimentHE)) +
geom_histogram() +
ggtitle("Henry")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 177 rows containing non-finite values (stat_bin).
summary(sentiment$SentimentLM)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.21053 -0.04545 0.00000 -0.01085 0.00000 0.23529 177
ggplot(sentiment, aes(SentimentLM)) +
geom_histogram() +
ggtitle("LM")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 177 rows containing non-finite values (stat_bin).
summary(sentiment$SentimentQDAP)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.45455 -0.04348 0.04348 0.03430 0.11111 0.37500 177
ggplot(sentiment, aes(SentimentQDAP)) +
geom_histogram() +
ggtitle("QDAP")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 177 rows containing non-finite values (stat_bin).
Not surprisingly, the finance dictionaries match few of the words in the play synopses. QDAP does better, and the Harvard dictionary has the most matches. The QDAP and Harvard analyses have fairly similar distributions: both find the plays, on average, to be mildly positive.
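A rough way to quantify this (a sketch, assuming the Positivity*/Negativity* columns returned by analyzeSentiment() above): for each dictionary, compute the share of synopses in which at least one dictionary word was matched.
# Share of synopses with at least one matched word, per dictionary,
# using the `sentiment` data frame computed above.
match_rate <- function(pos, neg) mean((pos + neg) > 0, na.rm = TRUE)
with(sentiment, c(
  HarvardIV = match_rate(PositivityGI, NegativityGI),
  Henry     = match_rate(PositivityHE, NegativityHE),
  LM        = match_rate(PositivityLM, NegativityLM),
  QDAP      = match_rate(PositivityQDAP, NegativityQDAP)
))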
summary(DictionaryGI)
## Length Class Mode
## negative 2005 -none- character
## positive 1637 -none- character
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
The Harvard dictionary (DictionaryGI in the R code) has fewer words than the NRC and Bing dictionaries. In addition, there is much more parity between positive and negative words in the Harvard dictionary than in the other two.
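To put the parity point in numbers (a quick sketch using the counts reported above), the negative-to-positive ratios are roughly:
# Negative-to-positive entry ratios, from the counts shown above.
c(HarvardIV = 2005 / 1637,   # ~1.22
  NRC       = 3324 / 2312,   # ~1.44
  Bing      = 4781 / 2005)   # ~2.38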
Now that we have a simple tool for evaluating synopsis sentiment we can use it to do some analysis. For example, is there an association between play sentiment and gross revenue or seats sold?
The boxplots and scatterplots below suggest there is.
dfSynopses2 <- cbind(dfSynopses, sentiment)
dfTotalSeats <- dfGrosses %>%
group_by(show) %>%
summarise(total_seats_sold = sum(seats_sold), total_revenues = sum(weekly_gross))
dfJoin <- dfSynopses2 %>%
inner_join(dfTotalSeats, by="show")
dfJoin <- na.omit(dfJoin)
dfSeatAnalysis <- dfJoin %>%
filter(total_seats_sold<1000000) %>%
filter(total_revenues>20000000) %>%
mutate(Sentiment_Valence = case_when(SentimentGI > 0 ~ 'positive',
SentimentGI <= 0 ~ 'neutral or negative'))
ggplot(dfSeatAnalysis, aes(x = Sentiment_Valence, y = total_seats_sold)) +
geom_boxplot()
ggplot(dfSeatAnalysis, aes(SentimentGI, total_seats_sold)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
ggplot(dfSeatAnalysis, aes(x = Sentiment_Valence, y = total_revenues)) +
geom_boxplot()
ggplot(dfSeatAnalysis, aes(SentimentGI, total_revenues)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
While the data remain suggestive that more positively scored shows do better, the regressions below do not let us reject, at the 95% confidence level, the possibility that the apparent association is due to sampling error.
m3 <- lm(total_seats_sold ~ SentimentGI, data = dfSeatAnalysis)
summary(m3)
##
## Call:
## lm(formula = total_seats_sold ~ SentimentGI, data = dfSeatAnalysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -487533 -196771 -18893 179514 453624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 531794 23338 22.787 <2e-16 ***
## SentimentGI 297676 175827 1.693 0.0931 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 229200 on 118 degrees of freedom
## Multiple R-squared: 0.02371, Adjusted R-squared: 0.01544
## F-statistic: 2.866 on 1 and 118 DF, p-value: 0.09309
m <- lm(total_revenues ~ SentimentGI, data = dfSeatAnalysis)
summary(m)
##
## Call:
## lm(formula = total_revenues ~ SentimentGI, data = dfSeatAnalysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28122826 -16020108 -5858789 7159369 88739954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41732283 2330535 17.907 <2e-16 ***
## SentimentGI 28257644 17558076 1.609 0.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22890000 on 118 degrees of freedom
## Multiple R-squared: 0.02148, Adjusted R-squared: 0.01319
## F-statistic: 2.59 on 1 and 118 DF, p-value: 0.1102
Here we analyze the synopses with the Bing lexicon to see whether it reaches the same conclusions about the plays' sentiment.
tidy_x <- dfSynopses %>%
unnest_tokens(word, synopsis)
bing_word_counts2 <- tidy_x %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
x <- bing_word_counts2 %>%
count(sentiment) %>%
mutate(percentage = n / sum(n))
x
## sentiment n percentage
## 1 negative 622 0.6056475
## 2 positive 405 0.3943525
The Bing lexicon reaches the astonishing conclusion that Broadway skews negative: roughly 60% of the distinct sentiment words matched in the synopses are negative. This is not the Broadway I know.
This word cloud helps us understand why:
tidy_x %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
## Warning in comparison.cloud(., colors = c("gray20", "gray80"), max.words = 100):
## unforgettable could not be fit on page. It will not be plotted.
Consider this: a satirical farce about an eccentric millionaire who falls intensely in love with a mysterious stranger has seven negative words and only one positive one. Even “funny” is counted as negative here.
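To see this concretely, we can score a single invented synopsis line (not a real synopsis from the dataset) against Bing; most of the matched words land on the negative side even though the sentence describes a comedy. This uses the tidytext and dplyr functions already loaded above.
# An invented one-line synopsis built from words Bing matches in the real
# synopses (see the filter in the next code block).
toy_synopsis <- tibble(text = "a funny, mysterious farce in which an eccentric millionaire falls in love")
toy_synopsis %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment)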
It does not help all that much to remove some of the words:
bing_word_counts3 <- tidy_x %>%
filter(word != "falls" & word != "fall" & word != "unexpected" & word != "mysterious" & word != "farce") %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
x1 <- bing_word_counts3 %>%
count(sentiment) %>%
mutate(percentage = n / sum(n))
x1
## sentiment n percentage
## 1 negative 617 0.6037182
## 2 positive 405 0.3962818
Sentiment analysis is an interesting tool, but it has its limitations. As the Bing vs. Harvard comparison shows, different dictionaries and algorithms can produce quite different results. As more context-specific dictionaries emerge, these analyses should improve.