The Assignment:

In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:

Work with a different corpus of your choosing, and

Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).

Setup

knitr::opts_chunk$set(echo = TRUE)
library(tidytext)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janeaustenr)
library(stringr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

Setting up the example code from ‘Text Mining with R’ Chapter 2.

This was copied directly from:

“2 Sentiment Analysis with Tidy Data.” Text Mining with R: a Tidy Approach, by Julia Silge and David Robinson, O’Reilly Media, 2017.

I did, however, load the libraries in the setup chunk and limit the output to the heads of the tables.

2.1

get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,891 more rows

2.2

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE) %>%
  head()
## Joining, by = "word"
## # A tibble: 6 x 2
##   word       n
##   <chr>  <int>
## 1 good     359
## 2 young    192
## 3 friend   166
## 4 hope     143
## 5 happy    125
## 6 love     117
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

2.3

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice%>%
  head()
## # A tibble: 6 x 4
##   book              linenumber chapter word     
##   <fct>                  <int>   <int> <chr>    
## 1 Pride & Prejudice          1       0 pride    
## 2 Pride & Prejudice          1       0 and      
## 3 Pride & Prejudice          1       0 prejudice
## 4 Pride & Prejudice          3       0 by       
## 5 Pride & Prejudice          3       0 jane     
## 6 Pride & Prejudice          3       0 austen
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(pride_prejudice %>% 
                            inner_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          pride_prejudice %>% 
                            inner_join(get_sentiments("nrc") %>% 
                                         filter(sentiment %in% c("positive", 
                                                                 "negative"))) %>%
                            mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

2.4

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ... with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

custom_stop_words <- bind_rows(tibble(word = c("miss"), 
                                          lexicon = c("custom")), 
                               stop_words)

custom_stop_words
## # A tibble: 1,150 x 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ... with 1,140 more rows

2.5

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): miss could not be fit on page.
## It will not be plotted.

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

2.6

PandP_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio
## # A tibble: 6 x 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

My Extension

Below is my analysis of dialogue between characters in movies, using the same analysis as above (largely copied directly, with the variables changed), plus an additional analysis using the Loughran-McDonald lexicon (also included in the ‘tidytext’ package).
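
Before diving in, here is a quick exploratory peek at the Loughran-McDonald lexicon (this is not part of the book's example code). Unlike Bing, it tags words with several categories beyond positive and negative, including the ‘litigious’ category used at the end of this document.

#Quick look at the sentiment categories in the Loughran-McDonald lexicon
get_sentiments("loughran") %>%
  count(sentiment, sort = TRUE)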

Loading a dataframe of movie lines

Cornell published a corpus of movie dialogue, from which we are going to take dialogue lines from many different movies. https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

Actually loading the data was trickier than I’d have thought: R does not handle multi-character delimiters gracefully, especially when they are made up of special regex characters.
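
As a small illustration (not part of the assignment code; the sample line below simply follows the corpus’s documented format), the delimiter has to be treated literally, because its plus signs and dollar sign would otherwise be interpreted as regex operators:

#Hypothetical one-line example in the corpus format
example_line <- "L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!"

#Without fixed = TRUE the "+" and "$" get regex meaning and the split breaks,
#so either treat the delimiter literally or escape every metacharacter
strsplit(example_line, " +++$+++ ", fixed = TRUE)
strsplit(example_line, " \\+\\+\\+\\$\\+\\+\\+ ")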

#Reads in the dataframe
movieraw <- "https://raw.githubusercontent.com//SudharshanShanmugasundaram//Chatbot//master//data//cornell%20movie-dialogs%20corpus//movie_lines.txt"

movielinesall <- data.frame(raw = readLines(movieraw), stringsAsFactors = FALSE)

#Split each raw line on the literal " +++$+++ " delimiter; fixed = TRUE stops R from treating it as a regex
movielinesall <- as.data.frame(do.call(rbind, strsplit(movielinesall$raw, " +++$+++ ", fixed = TRUE)),
                               stringsAsFactors = FALSE)
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 539)
#Sets column names
colnames(movielinesall) <- c("LineID", "CharacterID", "MovieID", "Character_Name", "Line_Text")

#Removing "L" from the line number

movielinesall$LineID <- as.numeric(gsub("L","",movielinesall$LineID))

#Changing the data types where necessary

movielinesall$MovieID <- as.factor(movielinesall$MovieID)
movielinesall$CharacterID <- as.factor(movielinesall$CharacterID)
movielinesall$LineID <- as.numeric(movielinesall$LineID)

#Trimming it down a little (optional)
movielines <- movielinesall
##movielines <- subset.data.frame(movielinesall, MovieID %in% c("m284", "m5", "m264", "m595"), select = c(LineID:Line_Text))

Let’s look at the positive-negative sentiment across several randomly chosen movies.

#Tidying up the dataset to include words in each line
tidylines <- movielines %>%
  unnest_tokens(word, Line_Text)
rownames(tidylines) <- 1:nrow(tidylines)

#Looking for the count of the joy words in the whole dataset
tidylines %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE) %>%
  head()
## Joining, by = "word"
## # A tibble: 6 x 2
##   word       n
##   <chr>  <int>
## 1 good    7489
## 2 love    3221
## 3 money   2733
## 4 god     2441
## 5 kind    2078
## 6 mother  1472
#Changed this to look at sentiment by line
moviesentiment <- tidylines %>%
  inner_join(get_sentiments("bing")) %>%
  count(MovieID, index = LineID %/% 1, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
#Going to take a subset of the movies, for the sake of brevity
#Use %in% rather than == so every row from the four movies is kept (== recycles and silently drops rows)
moviesentimentsub <- subset(moviesentiment, MovieID %in% c("m100", "m103", "m207", "m519"), select = c(MovieID:sentiment))
ggplot(moviesentimentsub, aes(index, sentiment, fill = MovieID)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~MovieID, ncol = 2, scales = "free_x")

We can see above how the sentiment changes across dialogue lines for the four movies that I selected at random. The swings in sentiment are not as large as in the textbook example because each chunk of text here corresponds to a single line of dialogue, rather than a block of 80 lines from a book.
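
If the per-line series feels too noisy, one option (a sketch, not part of the original assignment code) is to bin the dialogue into larger chunks, similar to the 80-line chunks used for the books:

#Sketch: bin dialogue into chunks of 25 lines (25 is arbitrary) for a smoother sentiment series
moviesentiment_binned <- tidylines %>%
  inner_join(get_sentiments("bing")) %>%
  count(MovieID, index = LineID %/% 25, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)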

Sentiment over time

Let’s see how sentiment changes throughout a film, in this case the movie ‘Heathers’.

#We've changed some of the code from the example, but it's mostly just replacing the variables


heathers <- tidylines %>% 
  filter(MovieID == "m383")

afinn <- heathers %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = LineID %/% 1) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(heathers %>% 
                            inner_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          heathers %>% 
                            inner_join(get_sentiments("nrc") %>% 
                                         filter(sentiment %in% c("positive", 
                                                                 "negative"))) %>%
                            mutate(method = "NRC")) %>%
  count(method, index = LineID %/% 1, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

The sentiment appears fairly similar across the different lexicons, although there are differences. Note how some of the spikes in sentiment are captured differently across lexicons.

Word Counts

Let’s see which words are most frequent across all the movies in the dataset.

#We've mostly changed variables again

bing_word_counts <- tidylines %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts %>%
  head()
## # A tibble: 6 x 3
##   word  sentiment     n
##   <chr> <chr>     <int>
## 1 like  positive  15041
## 2 right positive  10060
## 3 well  positive   9958
## 4 good  positive   7489
## 5 sorry negative   3290
## 6 love  positive   3221
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

#The stop words for movies are probably different from those for Jane Austen.  Maybe something more like this:
custom_stop_words <- bind_rows(tibble(word = c("like", "right", "well", "work", "yeah"), 
                                          lexicon = c("custom")), 
                               stop_words)

custom_stop_words %>%
  head()
## # A tibble: 6 x 2
##   word  lexicon
##   <chr> <chr>  
## 1 like  custom 
## 2 right custom 
## 3 well  custom 
## 4 work  custom 
## 5 yeah  custom 
## 6 a     SMART

We can see above the most common positive and negative words used across all movies in the dataset. I suspect that words such as ‘like’, ‘right’, ‘yeah’ and ‘well’ could be removed, as they are fairly neutral in the context of modern spoken dialogue.
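
As a sketch of how that filtering would look (not applied to the plots above), the custom stop word list can be dropped with an anti_join before the sentiment join:

#Sketch: remove the dialogue filler words in custom_stop_words before counting sentiment contributions
bing_word_counts_filtered <- tidylines %>%
  anti_join(custom_stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

head(bing_word_counts_filtered)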

Wordclouds

The above, but in two wordclouds

#Pretty much no changes here from the example code aside from some variables

tidylines %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

tidylines %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

Looking into dialogue sentences and characters

Again, using the movie ‘Heathers’

#This code is based on the example, but there are significant changes

#We'll separate out the sentences of dialogue, again for the movie 'Heathers'
#It's easier in this case to just subset our main dataset 'movielines' than it is to pull a script and break it down
heatherslines <- as.data.frame(subset(movielines, MovieID == "m383"))


#Changing this to group by character instead of chapter
heatherschars <- heatherslines %>%
  group_by(CharacterID) %>%
  mutate(Character_Dialogue = paste0(Line_Text, collapse = " ")) %>%
  distinct(CharacterID, Character_Dialogue)
#No changes here
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

#Changed this to reflect characters' words instead of chapters in books
heatherswordcounts <- heathers %>%
  group_by(CharacterID, Character_Name) %>%
  summarize(words = n())


heathers %>%
  semi_join(bingnegative) %>%
  group_by(CharacterID) %>%
  summarize(negativewords = n()) %>%
  left_join(heatherswordcounts, by = "CharacterID") %>%
  mutate(ratio = negativewords/words) %>%
  arrange(desc(ratio)) %>%
  ungroup() 
## Joining, by = "word"
## # A tibble: 15 x 5
##    CharacterID negativewords Character_Name   words  ratio
##    <fct>               <int> <chr>            <int>  <dbl>
##  1 u5796                   8 KURT                90 0.0889
##  2 u5790                   6 DAD                 69 0.0870
##  3 u5801                   8 WHITNEY JAMES      140 0.0571
##  4 u5797                   3 MOM                 55 0.0545
##  5 u5793                  17 HEATHER DUKE       323 0.0526
##  6 u5791                   3 EARL                61 0.0492
##  7 u5800                  94 VERONICA          2014 0.0467
##  8 u5787                   4 BETTY               88 0.0455
##  9 u5795                  59 J.D.              1299 0.0454
## 10 u5794                   7 HEATHER MCNAMARA   161 0.0435
## 11 u5799                   4 RAM                104 0.0385
## 12 u5789                   1 COURTNEY            28 0.0357
## 13 u5792                  18 HEATHER CHANDLER   544 0.0331
## 14 u5798                   2 PAULINE             71 0.0282
## 15 u5788                   1 BRAD                81 0.0123

It looks like ‘Kurt’ is the most negative character in the film, though looking at the dialogue this may be due more to his tendency to use more expletives than the other characters than to genuine negativity (he’s pretty happy about cow-tipping).
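
To sanity-check that hunch (a quick sketch, using Kurt's character ID u5796 from the table above), we can list which Bing-negative words he actually uses:

#Sketch: which Bing-negative words does Kurt (CharacterID u5796) use, and how often?
heathers %>%
  filter(CharacterID == "u5796") %>%
  inner_join(bingnegative, by = "word") %>%
  count(word, sort = TRUE)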

Adding an additional lexicon to use for an analysis

First, let’s use the Loughran-McDonald lexicon on the Heathers dialogue in the same way as above and see if there are any differences.

#Nearly the same code from the last chunk
loughrannegative <- get_sentiments("loughran") %>% 
  filter(sentiment == "negative")


heatherswordcounts <- heathers %>%
  group_by(CharacterID, Character_Name) %>%
  summarize(words = n())


heathers %>%
  semi_join(loughrannegative) %>%
  group_by(CharacterID) %>%
  summarize(negativewords = n()) %>%
  left_join(heatherswordcounts, by = "CharacterID") %>%
  mutate(ratio = negativewords/words) %>%
  arrange(desc(ratio)) %>%
  ungroup()
## Joining, by = "word"
## # A tibble: 12 x 5
##    CharacterID negativewords Character_Name   words   ratio
##    <fct>               <int> <chr>            <int>   <dbl>
##  1 u5801                   6 WHITNEY JAMES      140 0.0429 
##  2 u5789                   1 COURTNEY            28 0.0357 
##  3 u5790                   2 DAD                 69 0.0290 
##  4 u5796                   2 KURT                90 0.0222 
##  5 u5797                   1 MOM                 55 0.0182 
##  6 u5791                   1 EARL                61 0.0164 
##  7 u5795                  20 J.D.              1299 0.0154 
##  8 u5794                   2 HEATHER MCNAMARA   161 0.0124 
##  9 u5793                   4 HEATHER DUKE       323 0.0124 
## 10 u5787                   1 BETTY               88 0.0114 
## 11 u5800                  21 VERONICA          2014 0.0104 
## 12 u5792                   4 HEATHER CHANDLER   544 0.00735

It turns out there are far fewer negative words in total across all characters, and the negative ranking of the characters also changes: an interesting difference between the lexicons.
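
One way to see where that gap comes from (a sketch) is to count how many distinct negative words from each lexicon actually appear in the Heathers dialogue:

#Sketch: distinct negative words from each lexicon that appear in the Heathers dialogue
bind_rows(heathers %>% semi_join(bingnegative, by = "word") %>%
            distinct(word) %>% mutate(lexicon = "Bing"),
          heathers %>% semi_join(loughrannegative, by = "word") %>%
            distinct(word) %>% mutate(lexicon = "Loughran-McDonald")) %>%
  count(lexicon)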

For Giggles

What are the most litigious movies?

litigiousness <- tidylines %>%
  inner_join(get_sentiments("loughran")) %>%
  count(MovieID, index = LineID %/% 1, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  select(MovieID, litigious) %>%
  group_by(MovieID) %>%
  summarise(sum = sum(litigious), n = n()) %>%
  mutate(Litigiousness = sum / n) %>%
  arrange(desc(Litigiousness)) %>%
  ungroup()
## Joining, by = "word"
ggplot(head(litigiousness, 5), aes(y = reorder(MovieID, -Litigiousness), x = Litigiousness, fill = Litigiousness)) +
  geom_col(show.legend = FALSE) +
  scale_y_discrete(labels = c("Verdict", "The Verdict", "True Believer", "The Usual Suspects", "Romeo and Juliet")) +
  labs(x = "Litigiousness",
       y = "Movie Title") +
  coord_flip()

It turns out the most litigious movie is one titled “Verdict” (1974), followed closely by “The Verdict” (1982). I’ve never seen them, but it’s not surprising given the titles.
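
The titles above were hard-coded from a manual lookup. As a sketch (assuming the corpus’s companion movie_titles_metadata.txt file has been downloaded locally; the path below is hypothetical, and I am assuming the file uses the same " +++$+++ " delimiter with the movie ID and title as its first two fields), the titles could instead be joined in programmatically:

#Sketch (hypothetical local path): join movie titles from the corpus metadata file
titlesraw <- readLines("movie_titles_metadata.txt")
movietitles <- as.data.frame(do.call(rbind, strsplit(titlesraw, " +++$+++ ", fixed = TRUE)),
                             stringsAsFactors = FALSE)[, 1:2]
colnames(movietitles) <- c("MovieID", "Title")

litigiousness %>%
  left_join(movietitles, by = "MovieID") %>%
  head(5)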