library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidytext)
library(textdata)
library(janeaustenr)
library(dplyr)
library(stringr)
Source: “Text Mining with R: A Tidy Approach” by Julia Silge and David Robinson
as shown in Text Mining with R
# Preview the AFINN lexicon: each word is paired with a numeric value.
get_sentiments(lexicon = "afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
# Build a one-word-per-row table of the Austen novels. Before tokenising,
# each line is tagged with its position within the book and with a running
# chapter counter that increments on every line matching the chapter-heading
# pattern.
tidy_books <- austen_books() |>
  group_by(book) |>
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) |>
  ungroup() |>
  unnest_tokens(output = word, input = text)
as shown in Text Mining with R
# Keep only the NRC lexicon entries whose sentiment is "joy".
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Count the joy words used in "Emma", most frequent first.
tidy_books |>
  filter(book == "Emma") |>
  inner_join(nrc_joy) |>
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
# Net Bing sentiment (positive minus negative word counts) for every
# 80-line section of each book.
jane_austen_sentiment <- tidy_books |>
  inner_join(get_sentiments("bing")) |>
  count(book, index = linenumber %/% 80, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# One panel per book: net sentiment of each section along the narrative.
jane_austen_sentiment |>
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")
as shown in Text Mining with R
# Restrict the tokenised corpus to a single novel and print it.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
# AFINN assigns a numeric value per word, so the score of an 80-line section
# is the sum of the values of its matched words.
afinn <- pride_prejudice |>
  inner_join(get_sentiments("afinn")) |>
  group_by(index = linenumber %/% 80) |>
  summarise(sentiment = sum(value)) |>
  mutate(method = "AFINN")
## Joining, by = "word"
# Bing and NRC label words as positive/negative, so the score of an 80-line
# section is the count of positive words minus the count of negative words.
# NRC is first restricted to its "positive"/"negative" entries.
bing_and_nrc <- bind_rows(
  pride_prejudice |>
    inner_join(get_sentiments("bing")) |>
    mutate(method = "Bing et al."),
  pride_prejudice |>
    inner_join(
      get_sentiments("nrc") |>
        filter(sentiment %in% c("positive", "negative"))
    ) |>
    mutate(method = "NRC")
) |>
  count(method, index = linenumber %/% 80, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Stack the three lexicon results and draw one panel per method.
bind_rows(afinn, bing_and_nrc) |>
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
Potts, Christopher. 2011. On the negativity of negation. In Nan Li and David Lutz, eds., Proceedings of Semantics and Linguistic Theory 20, 636-659.
#' Load one review file into a one-row data frame.
#'
#' @param filepath Path to a single review text file. The name of the
#'   directory that directly contains the file determines the label.
#' @param id Identifier stored in the `id` column of the result.
#' @return A one-row data frame with columns `id`, `filename` (the file name
#'   without its directory), `text` (the full file contents) and
#'   `load_sentiment` ("positive" when the parent directory is named `pos`,
#'   otherwise "negative").
load_reviews <- function(filepath, id) {
  file_str <- readr::read_file(filepath)

  # Label from the parent directory only. Searching the whole path for "pos"
  # mislabels negative reviews stored under any directory whose name happens
  # to contain "pos" (for example ".../repos/.../neg/").
  sentiment <- if (basename(dirname(filepath)) == "pos") {
    "positive"
  } else {
    "negative"
  }

  data.frame(
    id = id,
    filename = basename(filepath),
    text = file_str,
    load_sentiment = sentiment
  )
}
# Comment out the line below if the data is already loaded.
mov_reviews <- textdata::dataset_imdb(dir = "./Data", split = "train")

# `pattern` is a regular expression, not a glob, so anchor on the literal
# ".txt" suffix. head() takes at most 10 files and, unlike `[1:10]`, does not
# pad the result with NA when a directory holds fewer than 10 files.
pos_files <- head(
  list.files(
    "./Data/imdb/aclImdb/train/pos",
    pattern = "\\.txt$",
    full.names = TRUE
  ),
  10
)
neg_files <- head(
  list.files(
    "./Data/imdb/aclImdb/train/neg",
    pattern = "\\.txt$",
    full.names = TRUE
  ),
  10
)

# Read every selected review into one data frame; ids run 1..n over the
# positive files followed by the negative files.
df_reviews <- bind_rows(
  map2(
    c(pos_files, neg_files),
    seq_along(c(pos_files, neg_files)),
    load_reviews
  )
)
# Tokenise the review text into one word per row. `linenumber` numbers the
# rows within each filename before the text is split into words.
df_word_detail <- df_reviews |>
  group_by(filename) |>
  mutate(linenumber = row_number()) |>
  ungroup() |>
  unnest_tokens(output = word, input = text)
# Score every review with the Bing and NRC lexicons: count positive and
# negative words per method and file, take the difference as the net
# sentiment, then attach the label recorded when the review was loaded.
sentiment_reviews <- bind_rows(
  df_word_detail |>
    inner_join(get_sentiments("bing")) |>
    mutate(method = "Bing et al."),
  df_word_detail |>
    inner_join(
      get_sentiments("nrc") |>
        filter(sentiment %in% c("positive", "negative"))
    ) |>
    mutate(method = "NRC")
) |>
  count(method, index = filename, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative) |>
  inner_join(df_reviews, by = c("index" = "filename")) |>
  select(-c(id, text))
## Joining, by = "word"
## Joining, by = "word"
# One bar per review file showing its net sentiment, annotated with the
# label assigned at load time; one panel per lexicon.
sentiment_reviews |>
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  geom_text(
    mapping = aes(label = load_sentiment),
    color = "black",
    size = 2,
    position = position_dodge(0.9)
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.9)) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
# Share of reviews, per lexicon, whose net sentiment sign agrees with the
# label assigned at load time. The result is an ungrouped tibble.
sentiment_reviews |>
  mutate(
    # A net score of exactly zero agrees with neither label and counts as "N".
    sentiment_match = case_when(
      sentiment < 0 & load_sentiment == "negative" ~ "Y",
      sentiment > 0 & load_sentiment == "positive" ~ "Y",
      TRUE ~ "N"
    )
  ) |>
  count(sentiment_match, method, name = "cnt") |>
  # Use the number of reviews scored by the same method as the denominator
  # instead of assuming exactly two methods with equally many reviews each.
  group_by(method) |>
  mutate(match_perc = cnt / sum(cnt)) |>
  ungroup() |>
  arrange(method)
## `summarise()` has grouped output by 'sentiment_match', 'method'. You can
## override using the `.groups` argument.
## # A tibble: 4 × 4
## # Groups: sentiment_match, method [4]
## sentiment_match method cnt match_perc
## <chr> <chr> <int> <dbl>
## 1 N Bing et al. 4 0.2
## 2 Y Bing et al. 16 0.8
## 3 N NRC 8 0.4
## 4 Y NRC 12 0.6
The graph displays the net sentiment score of each review, computed from its individual words. Although it also presents the pre-labeled sentiment, it is not intuitive from the graph alone which sentiment model performed better relative to the expected results. Therefore, as a separate output, the share of correctly classified reviews was calculated; by that measure the Bing model far outperformed the NRC model on this specific movie review corpus (80% versus 60% of reviews matched).
There could be several reasons for the different results. Perhaps the NRC lexicon does not cover as many of the words contained in these reviews, thereby missing impactful words that might correctly drive sentiment. Alternatively, there could have been slightly unexpected usage of words, or potential sarcasm, that a model which considers each word individually cannot take into account.