library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.3.3
In this sentiment analysis, three different corpora are analyzed, each with a different lexicon (Bing, AFINN, and NRC).
The base code for this assignment is taken directly from Chapter 2 of Text Mining with R: A Tidy Approach, https://www.tidytextmining.com/sentiment.html
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
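The three lexicons differ in scheme as well as size: Bing labels each word positive or negative, AFINN assigns a graded integer score, and NRC tags words with one or more emotions in addition to polarity (hence one word can appear on several rows). As an optional sanity check beyond the assignment, the vocabulary overlap between two lexicons can be counted directly:
# Count words shared between the Bing and NRC vocabularies
inner_join(get_sentiments("bing") %>% distinct(word),
           get_sentiments("nrc") %>% distinct(word),
           by = "word") %>%
  nrow()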
all_books <- gutenberg_works() %>%
  select(title)
all_books
## # A tibble: 44,042 × 1
## title
## <chr>
## 1 "The Declaration of Independence of the United States of America"
## 2 "The United States Bill of Rights\r\nThe Ten Original Amendments to the Cons…
## 3 "John F. Kennedy's Inaugural Address"
## 4 "Lincoln's Gettysburg Address\r\nGiven November 19, 1863 on the battlefield …
## 5 "The United States Constitution"
## 6 "Give Me Liberty or Give Me Death"
## 7 "The Mayflower Compact"
## 8 "Abraham Lincoln's Second Inaugural Address"
## 9 "Abraham Lincoln's First Inaugural Address"
## 10 "The King James Version of the Bible"
## # ℹ 44,032 more rows
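gutenberg_works() forwards filter expressions to the catalog metadata, so the listing can be narrowed by any column rather than scanned by title alone; two illustrative queries (not used below):
# Filter the catalog by author, or by a pattern in the title
gutenberg_works(author == "Milton, John")
gutenberg_works(str_detect(title, "Paradise"))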
paradise <- gutenberg_works(title == "Paradise Regained") %>%
  gutenberg_download(meta_fields = "title")
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
paradise
## # A tibble: 2,111 × 3
## gutenberg_id text title
## <int> <chr> <chr>
## 1 58 "Paradise Regained" Paradise Regained
## 2 58 "" Paradise Regained
## 3 58 "by John Milton" Paradise Regained
## 4 58 "" Paradise Regained
## 5 58 "" Paradise Regained
## 6 58 "" Paradise Regained
## 7 58 "" Paradise Regained
## 8 58 "Contents" Paradise Regained
## 9 58 "" Paradise Regained
## 10 58 " THE FIRST BOOK" Paradise Regained
## # ℹ 2,101 more rows
# Mutate linenumber and chapter, then tokenize the text.
# The anchored heading pattern follows the tidytext book; the original
# unanchored [IVXLCDM]+ would increment on any line containing one of
# those letters. The chapter column is not used in the analysis below.
paradise1 <- paradise %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
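unnest_tokens() reshapes the data to one word per row, lowercasing each token and stripping punctuation along the way; a minimal illustration on a single made-up line:
# Toy example: tokenization lowercases words and drops punctuation
tibble(linenumber = 1, text = "Recovered Paradise to all mankind,") %>%
  unnest_tokens(word, text)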
paradise_sentiments <- paradise1 %>%
  inner_join(get_sentiments("bing")) %>%
  count(title, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative,
         lexicon = "BING")
## Joining with `by = join_by(word)`
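The index created by linenumber %/% 80 uses integer division, so each run of 80 consecutive lines shares one index and the net sentiment (positive minus negative) is computed per 80-line section. For example:
# Integer division groups consecutive 80-line blocks under one index
c(1, 79, 80, 159, 160) %/% 80
## [1] 0 0 1 1 2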
ggplot(paradise_sentiments, aes(index, sentiment, fill = title)) +
  geom_col() +
  scale_fill_manual(values = "skyblue") +
  theme_minimal()
# AFINN lexicon (previewed above)
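Unlike Bing's binary labels, AFINN scores each word on an integer scale from -5 (most negative) to +5 (most positive), which is why the scores below can be summed directly per section. A quick check of the range:
# AFINN values span -5 to +5
get_sentiments("afinn") %>%
  summarise(min = min(value), max = max(value))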
tragedy <- gutenberg_works(title == "The Tragedy of Pudd'nhead Wilson") %>%
  gutenberg_download(meta_fields = "title")
# Add line numbers and a rough chapter counter, then tokenize
tragedy1 <- tragedy %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
sentiments_afinn <- tragedy1 %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(lexicon = "AFINN")
## Joining with `by = join_by(word)`
ggplot(sentiments_afinn, aes(x = index, y = sentiment)) +
  geom_line(color = "blue") +
  geom_point(color = "blue", size = 2) +
  labs(title = "Sentiment Analysis Using the AFINN Lexicon",
       x = "Index (80-Line Sections)",
       y = "Sentiment Score",
       caption = "Method: AFINN") +
  theme_minimal()
# Filter the words associated with "sadness" from the NRC lexicon
# and assign them to nrc_lexicon
nrc_lexicon <- get_sentiments("nrc") %>%
  filter(sentiment == "sadness")
the_poison <- gutenberg_works(title == "The Poison Belt") %>%
  gutenberg_download(meta_fields = "title")
the_poison1 <- the_poison %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
sadness_words <- the_poison1 %>%
  inner_join(nrc_lexicon, by = "word") %>%  # nrc_lexicon is already sadness-only
  count(word, sort = TRUE) %>%
  mutate(lexicon = "NRC")
# To avoid crowded labels on the y axis, keep only the top 20 words
top_n_words <- 20
# Filter the top N most frequent words
top_sadness_words <- sadness_words %>%
  slice_max(n, n = top_n_words)
# Create a ggplot to visualize the top N word frequencies
ggplot(top_sadness_words, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "skyblue") +
  labs(title = "Top 20 Words Associated with Sadness",
       x = "Word",
       y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
        plot.title = element_text(size = 14)) +
  coord_flip()  # flip the coordinates so the bars read horizontally
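The wordcloud package loaded at the top is otherwise unused; the same frequency table lends itself to a word cloud. A minimal sketch (the 50-word cap and the palette are arbitrary choices):
# Word cloud of sadness words, sized by frequency
wordcloud(words = sadness_words$word,
          freq = sadness_words$n,
          max.words = 50,
          colors = brewer.pal(8, "Dark2"))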
# Combine the three result sets. Note that sadness_words holds word
# frequencies rather than indexed scores, so its rows have NA for
# index and sentiment.
all_sentiments <- bind_rows(sadness_words, sentiments_afinn, paradise_sentiments)
ggplot(all_sentiments, aes(x = index, y = sentiment, color = lexicon)) +
  geom_line() +
  labs(title = "Comparison of Sentiment Scores from Different Lexicons",
       x = "Index",
       y = "Sentiment Score") +
  theme_minimal()
## Warning: Removed 225 rows containing missing values (`geom_line()`).
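The warning arises because the NRC rows carry NA in index and sentiment, so geom_line() drops them. A sketch that restricts the plot to the two series sharing the index/sentiment structure:
# Compare only the lexicons that produce indexed sentiment scores
all_sentiments %>%
  filter(lexicon %in% c("AFINN", "BING")) %>%
  ggplot(aes(x = index, y = sentiment, color = lexicon)) +
  geom_line() +
  labs(title = "AFINN vs. Bing Net Sentiment",
       x = "Index (80-Line Sections)",
       y = "Sentiment Score") +
  theme_minimal()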