library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(stringr)
library(XML)
library(jsonlite)
library(rvest)
library(httr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ purrr::flatten()        masks jsonlite::flatten()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janeaustenr)
library(tidyr)
library(SentimentAnalysis)
## 
## Attaching package: 'SentimentAnalysis'
## 
## The following object is masked from 'package:base':
## 
##     write
#library(qdap)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## The following object is masked from 'package:httr':
## 
##     content
library(qdapDictionaries)

Introduction

This assignment consists of two parts. Part 1 loads and tests the existing code from Chapter 2 of https://www.tidytextmining.com/sentiment. Part 2 extends that code and analysis to the Brown Corpus using additional lexicons: mpqa, SentimentAnalysis, and qdapDictionaries.

The Brown Corpus is composed of 500 text samples. More information on the Brown Corpus can be found at http://icame.uib.no/brown/bcm.html.

Part 1: Sentiment analysis of the Jane Austen Corpus

Part 1 of this assignment runs the primary example code from https://www.tidytextmining.com/sentiment.

Load the data from the Jane Austen Corpus

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, 
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup()

head(original_books)
## # A tibble: 6 × 4
##   text                    book                linenumber chapter
##   <chr>                   <fct>                    <int>   <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
## 2 ""                      Sense & Sensibility          2       0
## 3 "by Jane Austen"        Sense & Sensibility          3       0
## 4 ""                      Sense & Sensibility          4       0
## 5 "(1811)"                Sense & Sensibility          5       0
## 6 ""                      Sense & Sensibility          6       0

Tokenize the text of the six books in the Jane Austen corpus

tidy_books <- original_books %>%
  unnest_tokens(word, text)

head(tidy_books)
## # A tibble: 6 × 4
##   book                linenumber chapter word       
##   <fct>                    <int>   <int> <chr>      
## 1 Sense & Sensibility          1       0 sense      
## 2 Sense & Sensibility          1       0 and        
## 3 Sense & Sensibility          1       0 sensibility
## 4 Sense & Sensibility          3       0 by         
## 5 Sense & Sensibility          3       0 jane       
## 6 Sense & Sensibility          3       0 austen

Remove stop words from the tokenized words

data(stop_words)

tidy_books <- tidy_books %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

Display the words in the afinn lexicon

head(get_sentiments("afinn"))
## # A tibble: 6 × 2
##   word       value
##   <chr>      <dbl>
## 1 abandon       -2
## 2 abandoned     -2
## 3 abandons      -2
## 4 abducted      -2
## 5 abduction     -2
## 6 abductions    -2

Display the words in the bing lexicon

head(get_sentiments("bing"))
## # A tibble: 6 × 2
##   word       sentiment
##   <chr>      <chr>    
## 1 2-faces    negative 
## 2 abnormal   negative 
## 3 abolish    negative 
## 4 abominable negative 
## 5 abominably negative 
## 6 abominate  negative

Display the words in the nrc lexicon

#head(get_sentiments("nrc"))

Sentiment analysis using the bing lexicon

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 131015 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
head(jane_austen_sentiment)
## # A tibble: 6 × 5
##   book                index negative positive sentiment
##   <fct>               <dbl>    <int>    <int>     <int>
## 1 Sense & Sensibility     0       16       26        10
## 2 Sense & Sensibility     1       19       44        25
## 3 Sense & Sensibility     2       12       23        11
## 4 Sense & Sensibility     3       15       22         7
## 5 Sense & Sensibility     4       16       29        13
## 6 Sense & Sensibility     5       16       39        23

Visualization of “jane_austen_sentiment” analysis

library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Part 2: Extension of Sentiment analysis

We are using the Brown Corpus; data and information about it are located at http://icame.uib.no/brown/bcm.html. We used Python code to download the corpus, tokenize it, and save it in a data frame. We then sorted that data frame by token count and saved the top 12 texts in a separate data frame, which we use for the analysis below.
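
The Python step is not reproduced here, but the selection logic can be sketched in R. In this sketch, brown_lines is a hypothetical data frame holding the full corpus, with columns text_id and text (one row per line of text):

# Sketch only: brown_lines (columns text_id, text) is a hypothetical data
# frame holding the full Brown Corpus, one row per line of text
top_12_ids <- brown_lines %>%
  unnest_tokens(word, text) %>%              # one row per token
  count(text_id, name = "token_count") %>%   # tokens per text
  slice_max(token_count, n = 12) %>%         # the 12 largest texts
  pull(text_id)

brown_corpus_top_12 <- brown_lines %>%
  filter(text_id %in% top_12_ids)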

Part 2.1

In this extension, we are using 6886 words from the mpqa word lexicon. The lexicon is read from a JSON file in Prof. William L. Hamilton's GitHub repository. William L. Hamilton is an Assistant Professor at McGill University and Mila, working on machine learning, NLP, and network analysis. His GitHub repo can be found at https://github.com/williamleif.

This code loads the mpqa sentiment lexicon.

# Load the JSON lexicon data into an R data frame
mpqa_sentiment <- fromJSON("https://raw.githubusercontent.com/williamleif/socialsent/master/socialsent/data/lexicons/mpqa.json")
mpqa_df <- as.data.frame(mpqa_sentiment)

# This data frame is in a wide format with only one row 
# The columns represent the words and the single row contains the sentiment
# values: -1 for negative and 1 for positive sentiment.

# Converting the data frame from wide to long format
mpqa_df <- pivot_longer(
  data = mpqa_df, 
  cols = everything(),        
  names_to = "word",           
  values_to = "value"         
)

mpqa <- mpqa_df |>
  mutate(
    sentiment = if_else(value == 1, "positive", "negative")
  )

head(mpqa)
## # A tibble: 6 × 3
##   word       value sentiment
##   <chr>      <int> <chr>    
## 1 fawn          -1 negative 
## 2 foul          -1 negative 
## 3 mirage        -1 negative 
## 4 aggression    -1 negative 
## 5 eligible       1 positive 
## 6 chatter       -1 negative
mpqa |> group_by(sentiment) |>
summarise(n = n())
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4590
## 2 positive   2296

Load the selection of texts from the Brown corpus.

The texts from the Brown corpus were read using Python code, and a data frame was created from the 12 texts with the highest token counts. The Python code and the resulting data frame are in the GitHub repo.

brown_corpus_top_12_token_count <- read_csv('https://raw.githubusercontent.com/hawa1983/Week-10-Assignment/main/brown_corpus_top_12_token_count.csv')
## Rows: 1833 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): text_id, text
## dbl (1): linenumber
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(brown_corpus_top_12_token_count)
## # A tibble: 6 × 3
##   text_id text                                                        linenumber
##   <chr>   <chr>                                                            <dbl>
## 1 cn29    `` Bastards '' , he would say , `` all I did was put a bea…          1
## 2 cn29    Since then , and since the pure grain had gotten him divor…          2
## 3 cn29    `` There ain't nothin' faster , or lonelier , or more dire…          3
## 4 cn29    `` The accommodations may not be the poshest , but man !             4
## 5 cn29    There ain't nobody askin' for your ticket stub , neither '…          5
## 6 cn29    He had been conning the freights for a long , long time no…          6

Tokenize the texts

tidy_brown_text <- brown_corpus_top_12_token_count %>%
  unnest_tokens(word, text)

head(tidy_brown_text)
## # A tibble: 6 × 3
##   text_id linenumber word    
##   <chr>        <dbl> <chr>   
## 1 cn29             1 bastards
## 2 cn29             1 he      
## 3 cn29             1 would   
## 4 cn29             1 say     
## 5 cn29             1 all     
## 6 cn29             1 i

Remove stop words from the text

data(stop_words)

tidy_brown_text <- tidy_brown_text %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
head(tidy_brown_text)
## # A tibble: 6 × 3
##   text_id linenumber word     
##   <chr>        <dbl> <chr>    
## 1 cn29             1 bastards 
## 2 cn29             1 beat     
## 3 cn29             1 vivaldi  
## 4 cn29             1 stuff    
## 5 cn29             1 chair    
## 6 cn29             1 clobbered

Join the mpqa lexicon and the tidy_brown_text tables.

This code joins the mpqa lexicon and the tidy_brown_text tables and calculates the sentiment for every 10 lines of text.

tidy_brown_sentiment <- tidy_brown_text %>%
  inner_join(mpqa) %>%
  count(text_id, index = linenumber %/% 10, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
head(tidy_brown_sentiment)
## # A tibble: 6 × 5
##   text_id index negative positive sentiment
##   <chr>   <dbl>    <int>    <int>     <int>
## 1 cc14      135        2        1        -1
## 2 cc14      136       12       15         3
## 3 cc14      137        4       16        12
## 4 cc14      138        9       13         4
## 5 cc14      139        5       13         8
## 6 cc14      140       11        7        -4

Sentiments using the mpqa lexicon

The plots show how the sentiment changes throughout each text. The texts with ids cc14, ch17, and cn17 have mostly positive sentiment, with cn17 starting on positive sentiments. The texts cp16, cp23, cp24, and cr03 are composed mostly of negative sentiments, and cn29, cp04, cp06, and cp15 are similarly mostly negative.

library(ggplot2)

ggplot(tidy_brown_sentiment, aes(index, sentiment, fill = text_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~text_id, ncol = 4, scales = "free_x")
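
As a complement to the faceted plot, the total net sentiment per text can be summarised directly (a sketch):

# Total net sentiment per text under the mpqa lexicon
tidy_brown_sentiment %>%
  group_by(text_id) %>%
  summarise(net_sentiment = sum(sentiment)) %>%
  arrange(desc(net_sentiment))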

Part 2.2

This analysis uses the lexicons from the SentimentAnalysis package.

Data sets in package ‘SentimentAnalysis’:

- DictionaryGI: opinionated words from the Harvard-IV dictionary as used in the General Inquirer software
- DictionaryHE: opinionated words from Henry's Financial dictionary
- DictionaryLM: opinionated words from the Loughran-McDonald Financial dictionary
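
Each of these dictionaries is a list containing positive and negative character vectors; their sizes can be checked before combining them (a quick sketch):

# Number of negative and positive words in each SentimentAnalysis dictionary
sapply(list(GI = DictionaryGI, HE = DictionaryHE, LM = DictionaryLM),
       function(d) c(negative = length(d$negative), positive = length(d$positive)))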

These data sets are combined below to create a single data frame.

data <- data(package = 'SentimentAnalysis')
sentiment_df <-rbind(
  tibble(word = DictionaryGI$negative) |>
  mutate(sentiment = "negative"),
  tibble(word = DictionaryGI$positive) |>
  mutate(sentiment = "positive"),
  tibble(word = DictionaryHE$negative) |>
  mutate(sentiment = "negative"),
  tibble(word = DictionaryHE$positive) |>
  mutate(sentiment = "positive"),
  tibble(word = DictionaryLM$negative) |>
  mutate(sentiment = "negative"),
  tibble(word = DictionaryLM$positive) |>
  mutate(sentiment = "positive")
  )

head(sentiment_df)
## # A tibble: 6 × 2
##   word        sentiment
##   <chr>       <chr>    
## 1 abandon     negative 
## 2 abandonment negative 
## 3 abate       negative 
## 4 abdicate    negative 
## 5 abhor       negative 
## 6 abject      negative
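
Because the same word can occur in more than one of these dictionaries, sentiment_df contains duplicate entries, which is what triggers the many-to-many warning in the join below. If desired, the duplicates can be removed with distinct() (a sketch; the analysis below keeps the original data frame):

# Keep each word/sentiment pair only once so words shared by several
# dictionaries are not counted multiple times in the join
sentiment_df_distinct <- sentiment_df %>%
  distinct(word, sentiment)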

Join the SentimentAnalysis lexicon and the tidy_brown_text tables.

This code joins the SentimentAnalysis lexicon and the tidy_brown_text tables and calculates the sentiment for every 10 lines of text.

tidy_brown_GI_sentiment <- tidy_brown_text %>%
  inner_join(sentiment_df) %>%
  count(text_id, index = linenumber %/% 10, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., sentiment_df): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 167 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
head(tidy_brown_GI_sentiment)
## # A tibble: 6 × 5
##   text_id index negative positive sentiment
##   <chr>   <dbl>    <int>    <int>     <int>
## 1 cc14      135        2        0        -2
## 2 cc14      136        9       14         5
## 3 cc14      137        5       22        17
## 4 cc14      138        8       17         9
## 5 cc14      139        6       19        13
## 6 cc14      140        4        7         3

Sentiments using the SentimentAnalysis package

The plots show how the sentiment changes throughout each text. The text with id cc14 has mostly positive sentiment, while the remaining texts are composed mostly of negative sentiments. Overall, the sentiment analysis is similar to that of the mpqa lexicon. However, this lexicon shows more positive sentiment in cr03 than the mpqa lexicon did.

library(ggplot2)

ggplot(tidy_brown_GI_sentiment, aes(index, sentiment, fill = text_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~text_id, ncol = 4, scales = "free_x")

Part 2.3 Analysis of the sentiments using the qdap lexicon

The qdapDictionaries package contains word lists for action, amplifying, de-amplifying, negation, negative, positive, power, strength, weakness, and submission words. In this extension, we create a separate data frame for each word list, with a second column giving the sentiment label for that list. We then combine all the data frames into a single qdap lexicon and mutate the joined result to create the following net sentiments:

- sentiment = positive - negative
- strength = strong - weak
- amplify = amplification - deamplification

data <-  data(package = "qdapDictionaries")

action_df <- as.data.frame(action.verbs) |> #Action Word List
  mutate(sentiment = "action") |>
  rename(word = action.verbs)

amplification_df <- as.data.frame(amplification.words) |> #Amplifying Words
  mutate(sentiment = "amplification") |>
  rename(word = amplification.words)

deamplification_df <- as.data.frame(deamplification.words) |> #De-amplifying Words
  mutate(sentiment = "deamplification") |>
  rename(word = deamplification.words)


negation_df <- as.data.frame(negation.words) |> #negation Words
  mutate(sentiment = "negation") |>
  rename(word = negation.words)

negative_df <- as.data.frame(negative.words) |> #negative Words
  mutate(sentiment = "negative") |>
  rename(word = negative.words)

positive_df <- as.data.frame(positive.words) |> #Positive Words
  mutate(sentiment = "positive") |>
  rename(word = positive.words)

power_df <- as.data.frame(power.words) |> #Words that Indicate Power
  mutate(sentiment = "power") |>
  rename(word = power.words)

strong_df <- as.data.frame(strong.words) |> #Words that Indicate Strength
  mutate(sentiment = "strong") |>
  rename(word = strong.words)

submit_df <- as.data.frame(submit.words) |> #Words that Indicate Submission
  mutate(sentiment = "submit") |>
  rename(word = submit.words)

weak_df <- as.data.frame(weak.words) |> #Words that Indicate Weakness
  mutate(sentiment = "weak") |>
  rename(word = weak.words)

qdap_lex <- bind_rows(action_df,
                         amplification_df, 
                         deamplification_df, 
                         negation_df, 
                         negative_df,
                         positive_df,
                         power_df,
                         strong_df,
                         submit_df,
                         weak_df)
head(qdap_lex)
##         word sentiment
## 1     abduct    action
## 2      abide    action
## 3    abolish    action
## 4    abscond    action
## 5      abuse    action
## 6 accelerate    action

Join the qdap lexicon and the tidy_brown_text tables.

This code joins the qdap lexicon and the tidy_brown_text tables and calculates the sentiment for every 10 lines of text.

tidy_brown_qdap_sentiment <- tidy_brown_text %>%
  inner_join(qdap_lex) %>%
  count(text_id, index = linenumber %/% 10, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative, strength = strong - weak, amplify = amplification - deamplification)
## Joining with `by = join_by(word)`
## Warning in inner_join(., qdap_lex): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 1424 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
head(tidy_brown_qdap_sentiment)
## # A tibble: 6 × 14
##   text_id index action negative positive power strong submit  weak amplification
##   <chr>   <dbl>  <int>    <int>    <int> <int>  <int>  <int> <int>         <int>
## 1 cc14      135      1        2        1     1      0      0     0             0
## 2 cc14      136     14        9       12     5      8      3     5             0
## 3 cc14      137     12        2       12     1      4      0     2             0
## 4 cc14      138     10        8        8     5      8      2     2             0
## 5 cc14      139      7        4       12     4     10      1     4             0
## 6 cc14      140      3        3        3     1      2      4     1             2
## # ℹ 4 more variables: deamplification <int>, sentiment <int>, strength <int>,
## #   amplify <int>

Overall sentiments

The overall sentiment is similar to that depicted by the mpqa lexicon.

ggplot(tidy_brown_qdap_sentiment, aes(index, sentiment, fill = text_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~text_id, ncol = 4, scales = "free_x") + 
  labs(title = 'Overall sentiments (positive - negative)')
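
To make this comparison concrete, the per-text net sentiment totals from the three lexicons can be joined side by side (a sketch):

# Net sentiment per text under the mpqa, SentimentAnalysis (GI/HE/LM),
# and qdap lexicons
comparison <- tidy_brown_sentiment %>%
  group_by(text_id) %>% summarise(mpqa_net = sum(sentiment)) %>%
  full_join(tidy_brown_GI_sentiment %>%
              group_by(text_id) %>% summarise(sa_net = sum(sentiment)),
            by = "text_id") %>%
  full_join(tidy_brown_qdap_sentiment %>%
              group_by(text_id) %>% summarise(qdap_net = sum(sentiment)),
            by = "text_id")

comparison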

Strength Sentiment analysis

The plots show that strength words are used more often than words expressing weakness throughout each of the texts.

ggplot(tidy_brown_qdap_sentiment, aes(index, strength, fill = text_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~text_id, ncol = 4, scales = "free_x") + 
  labs(title = 'Distribution of strength sentiments (strong - weak)')

Amplification Sentiment analysis

The plots show that the texts contain few amplifying or de-amplifying words.

 ggplot(tidy_brown_qdap_sentiment, aes(index, amplify, fill = text_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~text_id, ncol = 4, scales = "free_x") + 
  labs(title = 'Distribution of amplification Sentiments (amplification - de-amplification)')