library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(stringr)
library(XML)
library(jsonlite)
library(rvest)
library(httr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janeaustenr)
library(tidyr)
library(SentimentAnalysis)
##
## Attaching package: 'SentimentAnalysis'
##
## The following object is masked from 'package:base':
##
## write
#library(qdap)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
##
## The following object is masked from 'package:httr':
##
## content
library(qdapDictionaries)
This assignment consists of two parts. Part 1 loads and tests the existing code in Chapter 2 of https://www.tidytextmining.com/sentiment. Part 2 extends that code and analysis to the Brown Corpus using the mpqa, SentimentAnalysis, and qdapDictionaries lexicons.
The Brown Corpus is composed of 500 texts. More information on the Brown Corpus can be found at http://icame.uib.no/brown/bcm.html.
Part 1 of this assignment reproduces the primary example code from https://www.tidytextmining.com/sentiment.

## Load the data from the Jane Austen Corpus
original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup()
head(original_books)
## # A tibble: 6 × 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
tidy_books <- original_books %>%
unnest_tokens(word, text)
head(tidy_books)
## # A tibble: 6 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
data(stop_words)
tidy_books <- tidy_books %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
head(get_sentiments("afinn"))
## # A tibble: 6 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
head(get_sentiments("bing"))
## # A tibble: 6 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
#head(get_sentiments("nrc"))
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 131015 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
head(jane_austen_sentiment)
## # A tibble: 6 × 5
## book index negative positive sentiment
## <fct> <dbl> <int> <int> <int>
## 1 Sense & Sensibility 0 16 26 10
## 2 Sense & Sensibility 1 19 44 25
## 3 Sense & Sensibility 2 12 23 11
## 4 Sense & Sensibility 3 15 22 7
## 5 Sense & Sensibility 4 16 29 13
## 6 Sense & Sensibility 5 16 39 23
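The many-to-many warning above comes from a few duplicated entries in the bing lexicon (a handful of words are listed under both sentiments). It is harmless here, but it can be addressed; a minimal sketch of two options, assuming dplyr >= 1.1.0:

# Option 1: declare the relationship explicitly to acknowledge the duplicates
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Option 2: keep a single sentiment per word before joining
# (this arbitrarily keeps the first sentiment listed for duplicated words)
bing_dedup <- get_sentiments("bing") %>%
  distinct(word, .keep_all = TRUE)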
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
We are using the Brown Corpus described above. We used Python code to download the corpus, tokenize it, and save it in a data frame. We then sorted the data frame by token count and saved the 12 texts with the most tokens in a separate data frame. That data frame is used for the analysis below.
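The Python code itself is kept in the GitHub repo and is not reproduced here. For reference, a rough R sketch of the selection step, assuming a hypothetical data frame brown_corpus with one row per line and columns text_id and text, might look like this:

# Sketch only: count tokens per text and keep the 12 texts with the most tokens
# (assumes a hypothetical data frame `brown_corpus` with columns text_id and text)
top_12_ids <- brown_corpus %>%
  unnest_tokens(word, text) %>%
  count(text_id, name = "tokens") %>%
  slice_max(tokens, n = 12)

brown_corpus_top_12 <- brown_corpus %>%
  semi_join(top_12_ids, by = "text_id")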
In this extension, we use 6886 words from the mpqa sentiment lexicon. The lexicon is read from a JSON file in Prof. William L. Hamilton's GitHub repository. William L. Hamilton is an Assistant Professor at McGill University and Mila, working on machine learning, NLP, and network analysis. His GitHub profile can be found at https://github.com/williamleif.
This code loads the mpqa sentiment lexicon.
# Load the JSON lexicon data into an R data frame
mpqa_sentiment <- fromJSON("https://raw.githubusercontent.com/williamleif/socialsent/master/socialsent/data/lexicons/mpqa.json")
mpqa_df <- as.data.frame(mpqa_sentiment)
# This data frame is in wide format with a single row:
# the column names are the words, and the row holds the sentiment values,
# -1 for negative and 1 for positive
# Converting the data frame from wide to long format
mpqa_df <- pivot_longer(
data = mpqa_df,
cols = everything(),
names_to = "word",
values_to = "value"
)
mpqa <- mpqa_df |>
mutate(
sentiment = if_else(value == 1, "positive", "negative")
)
head(mpqa)
## # A tibble: 6 × 3
## word value sentiment
## <chr> <int> <chr>
## 1 fawn -1 negative
## 2 foul -1 negative
## 3 mirage -1 negative
## 4 aggression -1 negative
## 5 eligible 1 positive
## 6 chatter -1 negative
mpqa |> group_by(sentiment) |>
summarise(n = n())
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4590
## 2 positive 2296
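Since fromJSON() returns the lexicon as a named list (which is why as.data.frame() produced a single wide row), the wide-to-long pivot can also be avoided by building the data frame directly from the names; a small sketch of an equivalent construction:

# Equivalent construction without the wide-to-long pivot (sketch)
mpqa_alt <- tibble(
  word  = names(mpqa_sentiment),
  value = unlist(mpqa_sentiment, use.names = FALSE)
) |>
  mutate(sentiment = if_else(value == 1, "positive", "negative"))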
The texts from the Brown Corpus were read using a Python script. A data frame was created from the 12 texts with the highest token counts. The Python code and the data frame are in the GitHub repo.
brown_corpus_top_12_token_count <- read_csv('https://raw.githubusercontent.com/hawa1983/Week-10-Assignment/main/brown_corpus_top_12_token_count.csv')
## Rows: 1833 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): text_id, text
## dbl (1): linenumber
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(brown_corpus_top_12_token_count)
## # A tibble: 6 × 3
## text_id text linenumber
## <chr> <chr> <dbl>
## 1 cn29 `` Bastards '' , he would say , `` all I did was put a bea… 1
## 2 cn29 Since then , and since the pure grain had gotten him divor… 2
## 3 cn29 `` There ain't nothin' faster , or lonelier , or more dire… 3
## 4 cn29 `` The accommodations may not be the poshest , but man ! 4
## 5 cn29 There ain't nobody askin' for your ticket stub , neither '… 5
## 6 cn29 He had been conning the freights for a long , long time no… 6
tidy_brown_text <- brown_corpus_top_12_token_count %>%
unnest_tokens(word, text)
head(tidy_brown_text)
## # A tibble: 6 × 3
## text_id linenumber word
## <chr> <dbl> <chr>
## 1 cn29 1 bastards
## 2 cn29 1 he
## 3 cn29 1 would
## 4 cn29 1 say
## 5 cn29 1 all
## 6 cn29 1 i
data(stop_words)
tidy_brown_text <- tidy_brown_text %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
head(tidy_brown_text)
## # A tibble: 6 × 3
## text_id linenumber word
## <chr> <dbl> <chr>
## 1 cn29 1 bastards
## 2 cn29 1 beat
## 3 cn29 1 vivaldi
## 4 cn29 1 stuff
## 5 cn29 1 chair
## 6 cn29 1 clobbered
This code joins the mpqa lexicon with the tidy_brown_text table and calculates the net sentiment for every 10 sentences.
tidy_brown_sentiment <- tidy_brown_text %>%
inner_join(mpqa) %>%
count(text_id, index = linenumber %/% 10, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
head(tidy_brown_sentiment)
## # A tibble: 6 × 5
## text_id index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 cc14 135 2 1 -1
## 2 cc14 136 12 15 3
## 3 cc14 137 4 16 12
## 4 cc14 138 9 13 4
## 5 cc14 139 5 13 8
## 6 cc14 140 11 7 -4
The plots show how sentiment changes throughout each text. The texts with ids cc14, ch17, and cn17 have mostly positive sentiment; cn17 starts with positive sentiment. The texts cp16, cp23, cp24, and cr03 are composed mostly of negative sentiment, as are cn29, cp04, cp06, and cp15.
library(ggplot2)
ggplot(tidy_brown_sentiment, aes(index, sentiment, fill = text_id)) +
geom_col(show.legend = FALSE) +
facet_wrap(~text_id, ncol = 4, scales = "free_x")
## Part 2.2
This analysis uses lexicons from the SentimentAnalysis package.
Data sets in package ‘SentimentAnalysis’:
- DictionaryGI: opinionated words from the Harvard-IV dictionary as used in the General Inquirer software
- DictionaryHE: opinionated words from Henry's Financial dictionary
- DictionaryLM: opinionated words from the Loughran-McDonald Financial dictionary

These data sets are combined to create a single data frame.
# List the data sets bundled with the SentimentAnalysis package
sa_datasets <- data(package = "SentimentAnalysis")
sentiment_df <-rbind(
tibble(word = DictionaryGI$negative) |>
mutate(sentiment = "negative"),
tibble(word = DictionaryGI$positive) |>
mutate(sentiment = "positive"),
tibble(word = DictionaryHE$negative) |>
mutate(sentiment = "negative"),
tibble(word = DictionaryHE$positive) |>
mutate(sentiment = "positive"),
tibble(word = DictionaryLM$negative) |>
mutate(sentiment = "negative"),
tibble(word = DictionaryLM$positive) |>
mutate(sentiment = "positive")
)
head(sentiment_df)
## # A tibble: 6 × 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandonment negative
## 3 abate negative
## 4 abdicate negative
## 5 abhor negative
## 6 abject negative
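Because the GI, HE, and LM dictionaries share many words, the combined data frame contains duplicate word/sentiment pairs (and a few words with conflicting sentiments), which is what triggers the many-to-many warning in the join below and inflates some counts. A possible cleanup step, sketched here, is to deduplicate before joining:

# Drop exact duplicates so each word/sentiment pair appears at most once
sentiment_df_dedup <- sentiment_df |>
  distinct(word, sentiment)

# Optionally also drop words that appear with both sentiments across dictionaries
sentiment_df_unambiguous <- sentiment_df_dedup |>
  add_count(word) |>
  filter(n == 1) |>
  select(-n)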
This code joins the SentimentAnalysis lexicon with the tidy_brown_text table and calculates the net sentiment for every 10 sentences.
tidy_brown_GI_sentiment <- tidy_brown_text %>%
inner_join(sentiment_df) %>%
count(text_id, index = linenumber %/% 10, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., sentiment_df): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 167 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
head(tidy_brown_GI_sentiment)
## # A tibble: 6 × 5
## text_id index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 cc14 135 2 0 -2
## 2 cc14 136 9 14 5
## 3 cc14 137 5 22 17
## 4 cc14 138 8 17 9
## 5 cc14 139 6 19 13
## 6 cc14 140 4 7 3
The plots show how sentiment changes throughout each text. The text cc14 has mostly positive sentiment, while the remaining texts are composed mostly of negative sentiment. Overall, the sentiment analysis is similar to that of the mpqa lexicon; however, this lexicon shows more positive sentiment in cr03 than the mpqa lexicon did.
library(ggplot2)
ggplot(tidy_brown_GI_sentiment, aes(index, sentiment, fill = text_id)) +
geom_col(show.legend = FALSE) +
facet_wrap(~text_id, ncol = 4, scales = "free_x")
The qdapDictionaries package contains word lists for action, amplifying, de-amplifying, negation, negative, positive, power, strength, weakness, and submission words. In this extension, we create a separate data frame for each word list, with a second column giving the sentiment label for that list. We then combine all the data frames into a qdap lexicon and mutate the result to create the following net sentiments:

- sentiment = positive - negative
- strength = strong - weak
- amplify = amplification - deamplification
# List the data sets bundled with the qdapDictionaries package
qdap_datasets <- data(package = "qdapDictionaries")
action_df <- as.data.frame(action.verbs) |> #Action Word List
mutate(sentiment = "action") |>
rename(word = action.verbs)
amplification_df <- as.data.frame(amplification.words) |> #Amplifying Words
mutate(sentiment = "amplification") |>
rename(word = amplification.words)
deamplification_df <- as.data.frame(deamplification.words) |> #De-amplifying Words
mutate(sentiment = "deamplification") |>
rename(word = deamplification.words)
negation_df <- as.data.frame(negation.words) |> #negation Words
mutate(sentiment = "negation") |>
rename(word = negation.words)
negative_df <- as.data.frame(negative.words) |> #negative Words
mutate(sentiment = "negative") |>
rename(word = negative.words)
positive_df <- as.data.frame(positive.words) |> #Positive Words
mutate(sentiment = "positive") |>
rename(word = positive.words)
power_df <- as.data.frame(power.words) |> #Words that Indicate Power
mutate(sentiment = "power") |>
rename(word = power.words)
strong_df <- as.data.frame(strong.words) |> #Words that Indicate Strength
mutate(sentiment = "strong") |>
rename(word = strong.words)
submit_df <- as.data.frame(submit.words) |> #Words that Indicate Submission
mutate(sentiment = "submit") |>
rename(word = submit.words)
weak_df <- as.data.frame(weak.words) |> #Words that Indicate Weakness
mutate(sentiment = "weak") |>
rename(word = weak.words)
qdap_lex <- bind_rows(action_df,
amplification_df,
deamplification_df,
negation_df,
negative_df,
positive_df,
power_df,
strong_df,
submit_df,
weak_df)
head(qdap_lex)
## word sentiment
## 1 abduct action
## 2 abide action
## 3 abolish action
## 4 abscond action
## 5 abuse action
## 6 accelerate action
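As an aside, the same lexicon can be built more compactly by putting the word lists in a named list and stacking them; a sketch, assuming the same qdapDictionaries vectors used above:

# Compact alternative: name each word list by its sentiment label, then stack
word_lists <- list(
  action = action.verbs,
  amplification = amplification.words,
  deamplification = deamplification.words,
  negation = negation.words,
  negative = negative.words,
  positive = positive.words,
  power = power.words,
  strong = strong.words,
  submit = submit.words,
  weak = weak.words
)

qdap_lex_alt <- tibble::enframe(word_lists, name = "sentiment", value = "word") |>
  tidyr::unnest(word) |>
  dplyr::select(word, sentiment)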
This code joins the qdap lexicon with the tidy_brown_text table and calculates the net sentiments for every 10 sentences.
tidy_brown_qdap_sentiment <- tidy_brown_text %>%
inner_join(qdap_lex) %>%
count(text_id, index = linenumber %/% 10, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative, strength = strong - weak, amplify = amplification - deamplification)
## Joining with `by = join_by(word)`
## Warning in inner_join(., qdap_lex): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 1424 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
head(tidy_brown_qdap_sentiment)
## # A tibble: 6 × 14
## text_id index action negative positive power strong submit weak amplification
## <chr> <dbl> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 cc14 135 1 2 1 1 0 0 0 0
## 2 cc14 136 14 9 12 5 8 3 5 0
## 3 cc14 137 12 2 12 1 4 0 2 0
## 4 cc14 138 10 8 8 5 8 2 2 0
## 5 cc14 139 7 4 12 4 10 1 4 0
## 6 cc14 140 3 3 3 1 2 4 1 2
## # ℹ 4 more variables: deamplification <int>, sentiment <int>, strength <int>,
## # amplify <int>
The overall sentiment is similar to that depicted by the mpqa lexicon.
ggplot(tidy_brown_qdap_sentiment, aes(index, sentiment, fill = text_id)) +
geom_col(show.legend = FALSE) +
facet_wrap(~text_id, ncol = 4, scales = "free_x") +
labs(title = 'Overall sentiments (positive - negative)')
### Strength sentiment analysis

The plots show a predominant use of strength words over words expressing weakness throughout each of the texts.
ggplot(tidy_brown_qdap_sentiment, aes(index, strength, fill = text_id)) +
geom_col(show.legend = FALSE) +
facet_wrap(~text_id, ncol = 4, scales = "free_x") +
labs(title = 'Distribution of strength sentiments (strong - weak)')
### Amplification sentiment analysis

The plots show that the texts contain few amplifying or de-amplifying words.
ggplot(tidy_brown_qdap_sentiment, aes(index, amplify, fill = text_id)) +
geom_col(show.legend = FALSE) +
facet_wrap(~text_id, ncol = 4, scales = "free_x") +
labs(title = 'Distribution of amplification Sentiments (amplification - de-amplification)')