\[\\[0.2in]\]
\[\\[0.1in]\]
library(SentimentAnalysis)
library(lexicon)
library(data.table)
library(dplyr)
library(ggplot2)
library(ggraph)
library(Hmisc)
library(igraph)
library(psych)
library(quanteda)
library(rtweet)
library(tidyr)
library(tidytext)
library(tidyverse)
library(vader)
\[\\[0.1in]\]
youtube <- read.csv("tut1b.youtube.csv", header = T)
# The number of variables in the youtube dataframe at this point:
ncol(youtube)
## [1] 17
\[\\[0.001in]\]
\[\\[0.01in]\]
ytsent <- analyzeSentiment(youtube$cleaner_text)
names(ytsent)
## [1] "WordCount" "SentimentGI" "NegativityGI"
## [4] "PositivityGI" "SentimentHE" "NegativityHE"
## [7] "PositivityHE" "SentimentLM" "NegativityLM"
## [10] "PositivityLM" "RatioUncertaintyLM" "SentimentQDAP"
## [13] "NegativityQDAP" "PositivityQDAP"
# GI = Harvard-IV dictionary
# HE = Henry’s finance-specific dictionary
# LM = Loughran-McDonald dictionar
# RationUncercaintyLM = Uncertainty words from Loughran-McDonald
# QDAP = Polarity words from qdap
# Add a row number to the sentiment analysis result
ytsent <- ytsent %>%
mutate(row_num = row_number())
# Convert sentiment scores to sentiment direction
ytsent <- ytsent %>%
mutate(direction = convertToDirection(ytsent$SentimentQDAP))
# Filter out rows with missing sentiment direction
youtube_filtered <- youtube[!is.na(youtube$direction), ]
# Create a boxplot for sentiment distributions over categories
ytsent1 <- ytsent[!is.na(ytsent$SentimentQDAP), ]
ggplot(ytsent1, aes(x = direction, y = SentimentQDAP)) +
geom_boxplot() +
labs(x = "Direction", y = "SentimentQDAP") +
ggtitle("Distribution of SentimentQDAP by Direction") +
theme_minimal()
# Gathering the data into a long-form (stacked) for plotting
ytsent1_long <- ytsent1 %>%
select(c("SentimentGI", "SentimentHE", "SentimentLM", "SentimentQDAP")) %>%
gather(key = "variable", value = "value")
# Comparing different sentiment scores
ggplot(ytsent1_long, aes(x = variable, y = value, fill = variable)) +
geom_boxplot() +
facet_wrap(~ variable, scales = "free") +
labs(title = "Multiple Boxplots for different sentiment scores",
x = "Variables", y = "Values") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
# Correlates for different scores:
corPlot(cor(ytsent1[2:14]))
# Join sentiment analysis results with the YouTube data
youtube <- youtube %>%
left_join(ytsent1)
# The number of variables in the youtube dataframe at this point:
ncol(youtube)
## [1] 32
Three lexicons are available: AFINN - Classifies words on a scale ranging from -5 to +5; Bing - Classifies words as positive or negative NRC - Classifies words as emotions and sentiments: Emotions: Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust Sentiments: Positive, Negative
# Load NRC sentiment lexicon
nrc <- (get_sentiments("nrc"))
# Words can express multiple sentiment, so it's a many-to-many relations
# You can see an example of different sentiments words load to in the table below:
nrc %>%
distinct(word, sentiment) %>%
mutate(present = "yes") %>%
spread(sentiment, present, fill = "no") %>%
print(n=50)
## # A tibble: 6,468 x 11
## word anger anticipation disgust fear joy negative positive sadness
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 abacus no no no no no no no no
## 2 abandon no no no yes no yes no yes
## 3 abandoned yes no no yes no yes no yes
## 4 abandonment yes no no yes no yes no yes
## 5 abba no no no no no no yes no
## 6 abbot no no no no no no no no
## 7 abduction no no no yes no yes no yes
## 8 aberrant no no no no no yes no no
## 9 aberration no no yes no no yes no no
## 10 abhor yes no yes yes no yes no no
## 11 abhorrent yes no yes yes no yes no no
## 12 ability no no no no no no yes no
## 13 abject no no yes no no yes no no
## 14 abnormal no no yes no no yes no no
## 15 abolish yes no no no no yes no no
## 16 abolition no no no no no yes no no
## 17 abominable no no yes yes no yes no no
## 18 abomination yes no yes yes no yes no no
## 19 abort no no no no no yes no no
## 20 abortion no no yes yes no yes no yes
## 21 abortive no no no no no yes no yes
## 22 abovementio~ no no no no no no yes no
## 23 abrasion no no no no no yes no no
## 24 abrogate no no no no no yes no no
## 25 abrupt no no no no no no no no
## 26 abscess no no no no no yes no yes
## 27 absence no no no yes no yes no yes
## 28 absent no no no no no yes no yes
## 29 absentee no no no no no yes no yes
## 30 absenteeism no no no no no yes no no
## 31 absolute no no no no no no yes no
## 32 absolution no no no no yes no yes no
## 33 absorbed no no no no no no yes no
## 34 absurd no no no no no yes no no
## 35 absurdity no no no no no yes no no
## 36 abundance no yes yes no yes yes yes no
## 37 abundant no no no no yes no yes no
## 38 abuse yes no yes yes no yes no yes
## 39 abysmal no no no no no yes no yes
## 40 abyss no no no yes no yes no yes
## 41 academic no no no no no no yes no
## 42 academy no no no no no no yes no
## 43 accelerate no yes no no no no no no
## 44 acceptable no no no no no no yes no
## 45 acceptance no no no no no no yes no
## 46 accessible no no no no no no yes no
## 47 accident no no no yes no yes no yes
## 48 accidental no no no yes no yes no no
## 49 accidentally no no no no no no no no
## 50 accolade no yes no no yes no yes no
## # i 6,418 more rows
## # i 2 more variables: surprise <chr>, trust <chr>
# Count the most common NRC sentiments
nrc %>%
count(sentiment, sort=TRUE) %>%
ungroup() %>%
top_n(10) %>%
ggplot(aes(x= reorder(sentiment, n), y= n)) +
geom_bar(stat ="identity", fill = rainbow(10))+
coord_flip()+
theme_minimal()+
labs(title = "NRC Word-Emotion Association Lexicon",
subtitle = "Developed by Saif Mohammad and Peter Turney",
x = "Construct",
y = "Number of words in dictionary")
# Analyze NRC sentiment for YouTube data
ytnrc <- youtube %>%
group_by(row_num) %>%
unnest_tokens(word, cleaner_text) %>%
full_join(get_sentiments("nrc")) %>%
count(sentiment) %>%
spread(sentiment, n, fill=0)
# Let's understand the central tendencies...
describe(ytnrc[2:11])
## vars n mean sd median trimmed mad min max range skew
## anger 1 9634 0.68 7.29 0 0.31 0.00 0 699 699 91.43
## anticipation 2 9634 0.62 5.08 0 0.27 0.00 0 472 472 83.44
## disgust 3 9634 0.59 6.35 0 0.27 0.00 0 613 613 92.98
## fear 4 9634 0.72 8.84 0 0.31 0.00 0 850 850 92.16
## joy 5 9634 0.62 3.94 0 0.30 0.00 0 362 362 80.33
## negative 6 9634 1.51 21.06 1 0.79 1.48 0 2048 2048 95.24
## positive 7 9634 1.52 13.88 1 0.81 1.48 0 1319 1319 89.02
## sadness 8 9634 0.63 6.93 0 0.28 0.00 0 666 666 91.88
## surprise 9 9634 0.55 3.44 0 0.27 0.00 0 300 300 70.00
## trust 10 9634 1.02 7.19 0 0.49 0.00 0 653 653 78.17
## kurtosis se
## anger 8748.37 0.07
## anticipation 7706.78 0.05
## disgust 8953.34 0.06
## fear 8842.68 0.09
## joy 7319.59 0.04
## negative 9248.06 0.21
## positive 8429.90 0.14
## sadness 8808.79 0.07
## surprise 5960.76 0.04
## trust 7028.36 0.07
# Create a histogram of polarity using gathered data
ggplot(gather(ytnrc[,c("negative", "positive")], polarity, factor_key=TRUE),
aes(x = value, fill = polarity)) +
geom_histogram(position = "identity", alpha = 0.7, bins = 30) +
theme_minimal() +
labs(title = "Polarity in comments to Trump's speech",
x = "Polarity Score",
y = "Frequency") +
facet_wrap(~ polarity) +
theme(legend.position = "top")
# Combine NRC sentiment analysis results with the YouTube data
youtube <- youtube %>%
left_join(ytnrc)
# The number of variables in the youtube dataframe at this point:
ncol(youtube)
## [1] 43
# Perform VADER sentiment analysis on the YouTube data
# NB: the comments used are the clean ones, not the cleaner ones
vader <- vader_df(youtube$clean_text)
# Let's understand the central tendencies...
describe(vader[3:7])
## vars n mean sd median trimmed mad min max range skew kurtosis
## compound 1 9559 0.01 0.51 0.00 0.02 0.62 -1 1 2 -0.05 -0.91
## pos 2 9559 0.13 0.16 0.09 0.10 0.13 0 1 1 1.68 3.58
## neu 3 9559 0.74 0.20 0.75 0.76 0.19 0 1 1 -0.67 0.57
## neg 4 9559 0.12 0.16 0.07 0.09 0.11 0 1 1 1.82 4.23
## but_count 5 9559 0.12 0.42 0.00 0.00 0.00 0 8 8 6.00 61.53
## se
## compound 0.01
## pos 0.00
## neu 0.00
## neg 0.00
## but_count 0.00
# Add a row number to the VADER sentiment analysis result
vader <- vader %>%
mutate(row_num = row_number())
# Join VADER sentiment analysis results with the YouTube data
youtube <- youtube %>%
left_join(vader)
# The number of variables in the youtube dataframe at this point:
ncol(youtube)
## [1] 50
# Save the combined data to a CSV file
write.csv(youtube, "tut3a.youtube.csv")