\[\\[0.2in]\]

Sentiment analysis

\[\\[0.1in]\]

Housekeeping

Loading needed libraries

library(SentimentAnalysis)
library(lexicon)
library(data.table)
library(dplyr)
library(ggplot2)
library(ggraph)
library(Hmisc)
library(igraph)
library(psych)
library(quanteda)
library(rtweet)
library(tidyr)
library(tidytext)
library(tidyverse)
library(vader)

\[\\[0.1in]\]

Read YouTube data from a CSV file

youtube <- read.csv("tut1b.youtube.csv", header = T)

# The number of variables in the youtube dataframe at this point:
ncol(youtube)

## [1] 17

\[\\[0.001in]\]

\[\\[0.01in]\]

Analyze sentiment direction

ytsent <- analyzeSentiment(youtube$cleaner_text)

names(ytsent)

##  [1] "WordCount"          "SentimentGI"        "NegativityGI"      
##  [4] "PositivityGI"       "SentimentHE"        "NegativityHE"      
##  [7] "PositivityHE"       "SentimentLM"        "NegativityLM"      
## [10] "PositivityLM"       "RatioUncertaintyLM" "SentimentQDAP"     
## [13] "NegativityQDAP"     "PositivityQDAP"

# GI = Harvard-IV dictionary
# HE = Henry’s finance-specific dictionary
# LM = Loughran-McDonald dictionar
# RationUncercaintyLM = Uncertainty words from Loughran-McDonald
# QDAP = Polarity words from qdap

# Add a row number to the sentiment analysis result
ytsent <- ytsent %>%
  mutate(row_num = row_number())

# Convert sentiment scores to sentiment direction
ytsent <- ytsent %>%
  mutate(direction = convertToDirection(ytsent$SentimentQDAP))

# Filter out rows with missing sentiment direction
youtube_filtered <- youtube[!is.na(youtube$direction), ]

# Create a boxplot for sentiment distributions over categories

ytsent1 <- ytsent[!is.na(ytsent$SentimentQDAP), ]

ggplot(ytsent1, aes(x = direction, y = SentimentQDAP)) +
  geom_boxplot() +
  labs(x = "Direction", y = "SentimentQDAP") +
  ggtitle("Distribution of SentimentQDAP by Direction") +
  theme_minimal()

# Gathering the data into a long-form (stacked) for plotting
ytsent1_long <- ytsent1 %>%
  select(c("SentimentGI", "SentimentHE", "SentimentLM", "SentimentQDAP")) %>%
  gather(key = "variable", value = "value")

# Comparing different sentiment scores
ggplot(ytsent1_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free") +
  labs(title = "Multiple Boxplots for different sentiment scores",
       x = "Variables", y = "Values") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none")

# Correlates for different scores:
corPlot(cor(ytsent1[2:14]))

# Join sentiment analysis results with the YouTube data
youtube <- youtube %>%
  left_join(ytsent1)

# The number of variables in the youtube dataframe at this point:
ncol(youtube)

## [1] 32

Analyze emotions

Three lexicons are available: AFINN - Classifies words on a scale ranging from -5 to +5; Bing - Classifies words as positive or negative NRC - Classifies words as emotions and sentiments: Emotions: Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust Sentiments: Positive, Negative

NRC (National Research Council) Sentiment Analysis

# Load NRC sentiment lexicon
nrc <- (get_sentiments("nrc"))

# Words can express multiple sentiment, so it's a many-to-many relations
# You can see an example of different sentiments words load to in the table below:

nrc %>%
  distinct(word, sentiment) %>%
  mutate(present = "yes") %>%
  spread(sentiment, present, fill = "no") %>%
  print(n=50)

## # A tibble: 6,468 x 11
##    word         anger anticipation disgust fear  joy   negative positive sadness
##    <chr>        <chr> <chr>        <chr>   <chr> <chr> <chr>    <chr>    <chr>  
##  1 abacus       no    no           no      no    no    no       no       no     
##  2 abandon      no    no           no      yes   no    yes      no       yes    
##  3 abandoned    yes   no           no      yes   no    yes      no       yes    
##  4 abandonment  yes   no           no      yes   no    yes      no       yes    
##  5 abba         no    no           no      no    no    no       yes      no     
##  6 abbot        no    no           no      no    no    no       no       no     
##  7 abduction    no    no           no      yes   no    yes      no       yes    
##  8 aberrant     no    no           no      no    no    yes      no       no     
##  9 aberration   no    no           yes     no    no    yes      no       no     
## 10 abhor        yes   no           yes     yes   no    yes      no       no     
## 11 abhorrent    yes   no           yes     yes   no    yes      no       no     
## 12 ability      no    no           no      no    no    no       yes      no     
## 13 abject       no    no           yes     no    no    yes      no       no     
## 14 abnormal     no    no           yes     no    no    yes      no       no     
## 15 abolish      yes   no           no      no    no    yes      no       no     
## 16 abolition    no    no           no      no    no    yes      no       no     
## 17 abominable   no    no           yes     yes   no    yes      no       no     
## 18 abomination  yes   no           yes     yes   no    yes      no       no     
## 19 abort        no    no           no      no    no    yes      no       no     
## 20 abortion     no    no           yes     yes   no    yes      no       yes    
## 21 abortive     no    no           no      no    no    yes      no       yes    
## 22 abovementio~ no    no           no      no    no    no       yes      no     
## 23 abrasion     no    no           no      no    no    yes      no       no     
## 24 abrogate     no    no           no      no    no    yes      no       no     
## 25 abrupt       no    no           no      no    no    no       no       no     
## 26 abscess      no    no           no      no    no    yes      no       yes    
## 27 absence      no    no           no      yes   no    yes      no       yes    
## 28 absent       no    no           no      no    no    yes      no       yes    
## 29 absentee     no    no           no      no    no    yes      no       yes    
## 30 absenteeism  no    no           no      no    no    yes      no       no     
## 31 absolute     no    no           no      no    no    no       yes      no     
## 32 absolution   no    no           no      no    yes   no       yes      no     
## 33 absorbed     no    no           no      no    no    no       yes      no     
## 34 absurd       no    no           no      no    no    yes      no       no     
## 35 absurdity    no    no           no      no    no    yes      no       no     
## 36 abundance    no    yes          yes     no    yes   yes      yes      no     
## 37 abundant     no    no           no      no    yes   no       yes      no     
## 38 abuse        yes   no           yes     yes   no    yes      no       yes    
## 39 abysmal      no    no           no      no    no    yes      no       yes    
## 40 abyss        no    no           no      yes   no    yes      no       yes    
## 41 academic     no    no           no      no    no    no       yes      no     
## 42 academy      no    no           no      no    no    no       yes      no     
## 43 accelerate   no    yes          no      no    no    no       no       no     
## 44 acceptable   no    no           no      no    no    no       yes      no     
## 45 acceptance   no    no           no      no    no    no       yes      no     
## 46 accessible   no    no           no      no    no    no       yes      no     
## 47 accident     no    no           no      yes   no    yes      no       yes    
## 48 accidental   no    no           no      yes   no    yes      no       no     
## 49 accidentally no    no           no      no    no    no       no       no     
## 50 accolade     no    yes          no      no    yes   no       yes      no     
## # i 6,418 more rows
## # i 2 more variables: surprise <chr>, trust <chr>

# Count the most common NRC sentiments
nrc %>% 
  count(sentiment, sort=TRUE) %>%
  ungroup() %>%
  top_n(10) %>%
  ggplot(aes(x= reorder(sentiment, n), y= n)) +
  geom_bar(stat ="identity", fill = rainbow(10))+
  coord_flip()+
  theme_minimal()+
  labs(title = "NRC Word-Emotion Association Lexicon",
       subtitle = "Developed by Saif Mohammad and Peter Turney",
       x = "Construct",
       y = "Number of words in dictionary")

# Analyze NRC sentiment for YouTube data
ytnrc <- youtube %>%
  group_by(row_num) %>%
  unnest_tokens(word, cleaner_text) %>%
  full_join(get_sentiments("nrc")) %>%
  count(sentiment) %>%
  spread(sentiment, n, fill=0)

# Let's understand the central tendencies...
describe(ytnrc[2:11])

##              vars    n mean    sd median trimmed  mad min  max range  skew
## anger           1 9634 0.68  7.29      0    0.31 0.00   0  699   699 91.43
## anticipation    2 9634 0.62  5.08      0    0.27 0.00   0  472   472 83.44
## disgust         3 9634 0.59  6.35      0    0.27 0.00   0  613   613 92.98
## fear            4 9634 0.72  8.84      0    0.31 0.00   0  850   850 92.16
## joy             5 9634 0.62  3.94      0    0.30 0.00   0  362   362 80.33
## negative        6 9634 1.51 21.06      1    0.79 1.48   0 2048  2048 95.24
## positive        7 9634 1.52 13.88      1    0.81 1.48   0 1319  1319 89.02
## sadness         8 9634 0.63  6.93      0    0.28 0.00   0  666   666 91.88
## surprise        9 9634 0.55  3.44      0    0.27 0.00   0  300   300 70.00
## trust          10 9634 1.02  7.19      0    0.49 0.00   0  653   653 78.17
##              kurtosis   se
## anger         8748.37 0.07
## anticipation  7706.78 0.05
## disgust       8953.34 0.06
## fear          8842.68 0.09
## joy           7319.59 0.04
## negative      9248.06 0.21
## positive      8429.90 0.14
## sadness       8808.79 0.07
## surprise      5960.76 0.04
## trust         7028.36 0.07

# Create a histogram of polarity using gathered data
ggplot(gather(ytnrc[,c("negative", "positive")], polarity, factor_key=TRUE), 
       aes(x = value, fill = polarity)) +
  geom_histogram(position = "identity", alpha = 0.7, bins = 30) + 
  theme_minimal() +
  labs(title = "Polarity in comments to Trump's speech",
       x = "Polarity Score",
       y = "Frequency") +
  facet_wrap(~ polarity) +  
  theme(legend.position = "top")

# Combine NRC sentiment analysis results with the YouTube data
youtube <- youtube %>%
  left_join(ytnrc)

# The number of variables in the youtube dataframe at this point:
ncol(youtube)

## [1] 43

VADER Sentiment Analysis

Valence Aware Dictionary and sEntiment Reasoner

# Perform VADER sentiment analysis on the YouTube data
# NB: the comments used are the clean ones, not the cleaner ones
vader <- vader_df(youtube$clean_text)

# Let's understand the central tendencies...
describe(vader[3:7])

##           vars    n mean   sd median trimmed  mad min max range  skew kurtosis
## compound     1 9559 0.01 0.51   0.00    0.02 0.62  -1   1     2 -0.05    -0.91
## pos          2 9559 0.13 0.16   0.09    0.10 0.13   0   1     1  1.68     3.58
## neu          3 9559 0.74 0.20   0.75    0.76 0.19   0   1     1 -0.67     0.57
## neg          4 9559 0.12 0.16   0.07    0.09 0.11   0   1     1  1.82     4.23
## but_count    5 9559 0.12 0.42   0.00    0.00 0.00   0   8     8  6.00    61.53
##             se
## compound  0.01
## pos       0.00
## neu       0.00
## neg       0.00
## but_count 0.00

# Add a row number to the VADER sentiment analysis result
vader <- vader %>%
  mutate(row_num = row_number())

# Join VADER sentiment analysis results with the YouTube data
youtube <- youtube %>%
  left_join(vader)

# The number of variables in the youtube dataframe at this point:
ncol(youtube)

## [1] 50

# Save the combined data to a CSV file
write.csv(youtube, "tut3a.youtube.csv")

DSO2008 2023-2024 Working with Big Data

Eliyahu V. Sapir

20 November 2023