One way to analyze the sentiment of a text is to treat the text as a combination of its individual words, and the sentiment content of the whole text as the sum of the sentiment content of those individual words.
In this guide, we apply three general-purpose sentiment lexicons, AFINN, Bing, and NRC, available through the
tidytext package in R. We also show how to summarize the
results of the sentiment analysis.
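Before any text is processed, it can be useful to glance at the three lexicons themselves. The short sketch below assumes the textdata package is installed, since the AFINN and NRC lexicons are downloaded interactively the first time they are requested:
library(tidytext)
get_sentiments("afinn")   # word + integer score from -5 (very negative) to +5 (very positive)
get_sentiments("bing")    # word + a positive/negative label
get_sentiments("nrc")     # word + one or more emotion/sentiment categories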
WC <- read.csv("C:\\Users\\Asus\\Downloads\\YouTube-World-Cup-Comments.csv")
set.seed(193)
WC <- WC[sample(nrow(WC), 1000), ]  # Keep a random sample of 1000 comments (for illustration only)
WC$ID <- 1:1000                     # Comment ID, used later to group word scores by comment
library(tidyverse)
library(tidytext)
library(dplyr)
tidy_comment <- WC %>%
  unnest_tokens(word, Comment) %>%     # One row per word from the Comment column
  anti_join(stop_words, by = "word")   # Remove common stop words
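As a quick check that tokenization and stop-word removal behaved as expected, a sketch like the following (not required for the rest of the analysis) lists the most frequent remaining words:
tidy_comment %>%
  count(word, sort = TRUE) %>%   # Frequency of each remaining token
  head(10)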
# Use the NRC sentiment lexicon
nrc_word_counts <- tidy_comment %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%  # NRC maps words to emotion/sentiment categories
  count(word, sentiment, sort = TRUE)
# Find the top words and the associated sentiment from NRC
nrc_word_counts %>%
  slice_max(n, n = 40) %>%              # Keep the 40 most frequent word-sentiment pairs
  mutate(word = reorder(word, n)) %>%   # Order words by frequency for the plot
  ggplot(aes(word, n, fill = sentiment, color = sentiment)) +
  geom_col() +
  coord_flip() +
  ggtitle("Common Words (Frequency)")
# Find the distribution of sentiments using NRC
nrc_word_counts %>%
  ggplot(aes(sentiment, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Sentiments (Distribution)") +
  coord_flip()
# Use the Bing sentiment lexicon
bing_sentiments <- tidy_comment %>%
  inner_join(get_sentiments("bing"), by = "word") %>%  # Bing labels each word positive or negative
  count(word, sentiment, sort = TRUE)
bing_sentiments %>%
  ggplot(aes(sentiment, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Sentiments (Distribution)") +
  coord_flip()
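The Bing lexicon can also be pushed back to the comment level, in the spirit of treating a comment's sentiment as the sum of its words' sentiments. The sketch below is an illustration (it assumes both sentiment labels occur somewhere in the sample): it counts positive and negative words per comment and takes the difference.
# Sketch: net Bing sentiment per comment (positive minus negative word counts)
bing_net <- tidy_comment %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(ID, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net = positive - negative)   # Positive net = more positive than negative words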
# Use the AFINN sentiment lexicon
afinn_sentiments <- tidy_comment %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%  # AFINN scores each word from -5 to +5
  group_by(ID) %>%                                      # One average score per comment
  summarise(sentiment.score = mean(value))
afinn_sentiments %>%
  ggplot(aes(sentiment.score)) +
  geom_histogram() +
  ggtitle("Comment Sentiment Score Distribution")