Sentiment Analysis: The Primary Example Code from Chapter 2 of Text Mining with R, Extended

Introduction

For this assignment we were asked to work through Chapter 2 of Text Mining with R, which covers sentiment analysis: get the primary example code from the chapter working in an R Markdown document and provide a citation to this base code. Lastly, we were asked to extend the code in two ways:

Work with a different corpus of your choosing, and

Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).
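
Before running the chapter code, the supporting packages need to be installed once. A minimal setup sketch (the package list mirrors the library() calls used throughout this document; textdata is included because get_sentiments("afinn") and get_sentiments("nrc") download their lexicons through it, with an interactive license prompt, on first use):

# run once per machine; accept the textdata license prompts when asked
install.packages(c("tidytext", "textdata", "janeaustenr", "dplyr", "stringr",
                   "tidyr", "ggplot2", "wordcloud", "reshape2", "friends"))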

#1. Loading the sentiments datasets

library(tidytext)

sentiments
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

The three general-purpose lexicons are:

• AFINN from Finn Årup Nielsen

• Bing from Bing Liu and collaborators

• NRC from Saif Mohammad and Peter Turney

get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

#2. Sentiment analysis of Jane Austen books

library(janeaustenr)
library(dplyr)
library(stringr)

tidy_books <- austen_books() %>%
 group_by(book) %>%
 mutate(linenumber = row_number(),
 chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
 ignore_case = TRUE)))) %>%
 ungroup() %>%
 unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ℹ 725,045 more rows
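
As a quick sanity check of the chapter-detection regex used above, here is a sketch with made-up sample strings (they are not taken from the corpus):

# a line counts as a chapter heading if it starts with "chapter"
# followed by a digit or a roman-numeral letter (i, v, x, l, c)
str_detect(c("Chapter 1", "CHAPTER XII", "in this chapter we learn"),
           regex("^chapter [\\divxlc]", ignore_case = TRUE))
# expected: TRUE TRUE FALSE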

#3. Sentiment analysis of joy words using the NRC lexicon

nrcjoy <- get_sentiments("nrc") %>%
 filter(sentiment == "joy")
tidy_books %>%
 filter(book == "Emma") %>%
 inner_join(nrcjoy) %>%
 count(word, sort = TRUE)
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows

#4. Sentiment analysis of positive and negative words using the Bing lexicon

library(tidyr)

janeaustensentiment <- tidy_books %>%
 inner_join(get_sentiments("bing")) %>%
 count(book, index = linenumber %/% 80, sentiment) %>%
 spread(sentiment, n, fill = 0) %>%
 mutate(sentiment = positive - negative)

janeaustensentiment
## # A tibble: 920 × 5
##    book                index negative positive sentiment
##    <fct>               <dbl>    <dbl>    <dbl>     <dbl>
##  1 Sense & Sensibility     0       16       32        16
##  2 Sense & Sensibility     1       19       53        34
##  3 Sense & Sensibility     2       12       31        19
##  4 Sense & Sensibility     3       15       31        16
##  5 Sense & Sensibility     4       16       34        18
##  6 Sense & Sensibility     5       16       51        35
##  7 Sense & Sensibility     6       24       40        16
##  8 Sense & Sensibility     7       23       51        28
##  9 Sense & Sensibility     8       30       40        10
## 10 Sense & Sensibility     9       15       19         4
## # ℹ 910 more rows
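
The index = linenumber %/% 80 step above uses integer division to assign each line to an 80-line section of text, so sentiment is summed over chunks large enough to smooth out line-to-line noise. A quick illustration of the bucketing:

# integer division maps line numbers to 80-line section indexes
c(0, 79, 80, 159, 160) %/% 80
# expected: 0 0 1 1 2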

#5. Plot of net sentiment across each novel

library(ggplot2)

ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
 geom_col(show.legend = FALSE) +
 facet_wrap(~book, ncol = 2, scales = "free_x")

#6. Filter for Pride and Prejudice

pride_prejudice <- tidy_books %>%
 filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ℹ 122,194 more rows

#7. Comparing sentiment analysis of Pride and Prejudice with the three lexicons: AFINN, Bing, and NRC

afinn <- pride_prejudice %>%
 inner_join(get_sentiments("afinn")) %>%
 group_by(index = linenumber %/% 80) %>%
 summarise(sentiment = sum(value)) %>%
 mutate(method = "AFINN")
bing_and_nrc <- bind_rows(
 pride_prejudice %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(method = "Bing et al."),
 pride_prejudice %>%
  inner_join(get_sentiments("nrc") %>%
              filter(sentiment %in% c("positive", "negative"))) %>%
  mutate(method = "NRC")) %>%
 count(method, index = linenumber %/% 80, sentiment) %>%
 spread(sentiment, n, fill = 0) %>%
 mutate(sentiment = positive - negative)

#8. Plot comparing the three sentiment lexicons

bind_rows(afinn,
 bing_and_nrc) %>%
 ggplot(aes(index, sentiment, fill = method)) +
 geom_col(show.legend = FALSE) +
 facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>%
 filter(sentiment %in% c("positive",
 "negative")) %>%
 count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
get_sentiments("bing") %>%
 count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

Both lexicons have more negative than positive words, but the ratio of negative to positive words is higher in the Bing lexicon than in the NRC lexicon, which contributes to the systematic differences between the methods seen in the plot above.

#9. Most common negative and positive words

bing_word_counts <- tidy_books %>%
 inner_join(get_sentiments("bing")) %>%
 count(word, sentiment, sort = TRUE) %>%
 ungroup()
bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows

#10. Plot of most common positive and negative words

bing_word_counts %>%
 group_by(sentiment) %>%
 top_n(10) %>%
 ungroup() %>%
 mutate(word = reorder(word, n)) %>%
 ggplot(aes(word, n, fill = sentiment)) +
 geom_col(show.legend = FALSE) +
 facet_wrap(~sentiment, scales = "free_y") +
 labs(y = "Contribution to sentiment",
 x = NULL) +
 coord_flip()

custom_stop_words <- bind_rows(tibble(word = c("miss"),
 lexicon = c("custom")),
 stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ℹ 1,140 more rows

#11. Word Cloud

library(wordcloud)
tidy_books %>%
 anti_join(stop_words) %>%
 count(word) %>%
 with(wordcloud(word, n, max.words = 100))

library(reshape2)

tidy_books %>%
 inner_join(get_sentiments("bing")) %>%
 count(word, sentiment, sort = TRUE) %>%
 acast(word ~ sentiment, value.var = "n", fill = 0) %>%
 comparison.cloud(colors = c("darkblue", "red"),
 max.words = 100)

#12. Looking at units beyond just words

PandP_sentences <- tibble(text = prideprejudice) %>%
 unnest_tokens(sentence, text, token = "sentences")

PandP_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
 group_by(book) %>%
 unnest_tokens(chapter, text, token = "regex",
 pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
 ungroup()
austen_chapters %>%
 group_by(book) %>%
 summarise(chapters = n())
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>%
 filter(sentiment == "negative")
wordcounts <- tidy_books %>%
 group_by(book, chapter) %>%
 summarize(words = n())
tidy_books %>%
 semi_join(bingnegative) %>%
 group_by(book, chapter) %>%
 summarize(negativewords = n()) %>%
 left_join(wordcounts, by = c("book", "chapter")) %>%
 mutate(ratio = negativewords/words) %>%
 filter(chapter != 0) %>%
 top_n(1) %>%
 ungroup()
## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

A different corpus of my choosing

For the corpus of my choosing I will perform sentiment analysis on the friends package (https://cran.r-project.org/web/packages/friends/index.html). The friends package contains the complete scripts of the American sitcom Friends in tibble format. I will use this package to practice data wrangling and text analysis, following the sentiment analysis workflow of Chapter 2 of “Text Mining with R: A Tidy Approach”.

First, I load and inspect the friends corpus.

library(friends)

glimpse(friends)
## Rows: 67,373
## Columns: 6
## $ text      <chr> "There's nothing to tell! He's just some guy I work with!", …
## $ speaker   <chr> "Monica Geller", "Joey Tribbiani", "Chandler Bing", "Phoebe …
## $ season    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ episode   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ scene     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ utterance <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…

Next I tokenize the friends scripts into one word per row, grouping by season. (The chapter-detection step is carried over from the Austen example; the scripts contain no chapter headings, so the chapter column simply stays 0.)

tidy_friends <- friends %>%
  group_by(season) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_friends
## # A tibble: 716,519 × 8
##    speaker       season episode scene utterance linenumber chapter word   
##    <chr>          <int>   <int> <int>     <int>      <int>   <int> <chr>  
##  1 Monica Geller      1       1     1         1          1       0 there's
##  2 Monica Geller      1       1     1         1          1       0 nothing
##  3 Monica Geller      1       1     1         1          1       0 to     
##  4 Monica Geller      1       1     1         1          1       0 tell   
##  5 Monica Geller      1       1     1         1          1       0 he's   
##  6 Monica Geller      1       1     1         1          1       0 just   
##  7 Monica Geller      1       1     1         1          1       0 some   
##  8 Monica Geller      1       1     1         1          1       0 guy    
##  9 Monica Geller      1       1     1         1          1       0 i      
## 10 Monica Geller      1       1     1         1          1       0 work   
## # ℹ 716,509 more rows

Sentiment analysis of joy words spoken by Chandler Bing, using the NRC lexicon

nrcjoy <- get_sentiments("nrc") %>%
 filter(sentiment == "joy")
tidy_friends %>%
 filter(speaker == "Chandler Bing") %>%
 inner_join(nrcjoy) %>%
 count(word, sort = TRUE)
## # A tibble: 207 × 2
##    word       n
##    <chr>  <int>
##  1 good     231
##  2 love     140
##  3 god      135
##  4 pretty    61
##  5 baby      57
##  6 money     53
##  7 sex       52
##  8 kind      48
##  9 friend    45
## 10 happy     43
## # ℹ 197 more rows

Sentiment analysis of positive and negative words using the Bing lexicon

library(tidyr)

friends_sentiment <- tidy_friends %>%
  inner_join(get_sentiments("bing")) %>%
  count(speaker, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
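
friends_sentiment can be plotted the same way as the Austen novels in section 5. A sketch, where restricting to the six main characters (speaker names as they appear in the friends tibble) is my own choice, to keep the facets readable:

main_six <- c("Chandler Bing", "Joey Tribbiani", "Monica Geller",
              "Phoebe Buffay", "Rachel Green", "Ross Geller")

friends_sentiment %>%
  filter(speaker %in% main_six) %>%
  ggplot(aes(index, sentiment, fill = speaker)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~speaker, ncol = 2, scales = "free_x")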

Filter for speaker Chandler Bing

Chandler_Bing <- tidy_friends %>% 
  filter(speaker == "Chandler Bing")

Chandler_Bing
## # A tibble: 87,609 × 8
##    speaker       season episode scene utterance linenumber chapter word 
##    <chr>          <int>   <int> <int>     <int>      <int>   <int> <chr>
##  1 Chandler Bing      1       1     1         3          3       0 all  
##  2 Chandler Bing      1       1     1         3          3       0 right
##  3 Chandler Bing      1       1     1         3          3       0 joey 
##  4 Chandler Bing      1       1     1         3          3       0 be   
##  5 Chandler Bing      1       1     1         3          3       0 nice 
##  6 Chandler Bing      1       1     1         3          3       0 so   
##  7 Chandler Bing      1       1     1         3          3       0 does 
##  8 Chandler Bing      1       1     1         3          3       0 he   
##  9 Chandler Bing      1       1     1         3          3       0 have 
## 10 Chandler Bing      1       1     1         3          3       0 a    
## # ℹ 87,599 more rows

Filter for speaker Phoebe Buffay

Phoebe_Buffay <- tidy_friends %>% 
  filter(speaker == "Phoebe Buffay")

Phoebe_Buffay
## # A tibble: 82,454 × 8
##    speaker       season episode scene utterance linenumber chapter word 
##    <chr>          <int>   <int> <int>     <int>      <int>   <int> <chr>
##  1 Phoebe Buffay      1       1     1         4          4       0 wait 
##  2 Phoebe Buffay      1       1     1         4          4       0 does 
##  3 Phoebe Buffay      1       1     1         4          4       0 he   
##  4 Phoebe Buffay      1       1     1         4          4       0 eat  
##  5 Phoebe Buffay      1       1     1         4          4       0 chalk
##  6 Phoebe Buffay      1       1     1         6          6       0 just 
##  7 Phoebe Buffay      1       1     1         6          6       0 cause
##  8 Phoebe Buffay      1       1     1         6          6       0 i    
##  9 Phoebe Buffay      1       1     1         6          6       0 don't
## 10 Phoebe Buffay      1       1     1         6          6       0 want 
## # ℹ 82,444 more rows

Comparing sentiment analysis of Phoebe Buffay with the three lexicons: AFINN, Bing, and NRC

afinn <- Phoebe_Buffay %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
bing_and_nrc_Friends <- bind_rows(
  Phoebe_Buffay %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  Phoebe_Buffay %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)

Plot comparing the three sentiment lexicons

bind_rows(afinn, 
          bing_and_nrc_Friends) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Most common negative and positive words

bing_word_counts <- tidy_friends %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
## # A tibble: 1,946 × 3
##    word  sentiment     n
##    <chr> <chr>     <int>
##  1 well  positive   4229
##  2 right positive   3569
##  3 like  positive   3238
##  4 good  positive   1756
##  5 sorry negative   1460
##  6 great positive   1353
##  7 love  positive   1040
##  8 thank positive    756
##  9 wow   positive    750
## 10 fine  positive    648
## # ℹ 1,936 more rows

Plot of most common positive and negative words

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Word Cloud

library(wordcloud)

tidy_friends %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

library(reshape2)

tidy_friends %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("green", "purple"),
                   max.words = 100)
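
Additional sentiment lexicon

The assignment also asks for at least one additional sentiment lexicon. Here is a sketch using the Loughran-McDonald lexicon, which tidytext exposes as get_sentiments("loughran") (downloaded via textdata on first use). It was built for financial documents, so its coverage of sitcom dialogue is an open question rather than a given:

# the Loughran lexicon labels words with six categories,
# including "positive" and "negative"
get_sentiments("loughran") %>%
  count(sentiment)

# most common positive and negative Loughran words in the Friends scripts
tidy_friends %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment, sort = TRUE)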

Citation

Textbook: The base code used here is taken from “Text Mining with R: A Tidy Approach” by Julia Silge and David Robinson (https://www.tidytextmining.com/), licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 United States License.

Friends: https://cran.r-project.org/web/packages/friends/index.html.