In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways: Work with a different corpus of your choosing, and Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).
set.seed(123)
# Data manipulation
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DT)
# Time manipulation
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Visualization
library(ggplot2)
library(plotrix)
library(corrplot)
## corrplot 0.92 loaded
library(ggdendro)
library(ggrepel)
# Wordcloud
library(wordcloud)
## Loading required package: RColorBrewer
# Text manipulation
library(tidytext)
library(stringr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(sentimentr)
library(wordcloud)
library(RSentiment)
#Reading and Preparing Data
gb <- tail(fread("~/GBvideos.csv",encoding = "UTF-8"),20000)
gb[,"Location":="GB"]
video <- as.data.table(gb)
We can see that between views and likes we have a high correlation, I thought that we will have a similar correlation between views and dislikes, but is almost half of the like correlation.
corrplot.mixed(corr = cor(video[,c("category_id","views","likes","dislikes","comment_total"),with=F]))
mvideo <- video[,.("Total_Views"=round(max(views,na.rm = T),digits = 2)),by=.(title,thumbnail_link)][order(-Total_Views)]
mvideo %>%
mutate(image = paste0('<img width="80%" height="80%" src="', thumbnail_link , '"></img>')) %>%
arrange(-Total_Views) %>%
top_n(10,wt = Total_Views) %>%
select(image, title, Total_Views) %>%
datatable(class = "nowrap hover row-border", escape = FALSE, options = list(dom = 't',scrollX = TRUE, autoWidth = TRUE))
mvideo <- video[,.("Total_Likes"=round(max(likes,na.rm = T),digits = 2)),by=.(title,thumbnail_link)][order(-Total_Likes)]
mvideo %>%
mutate(image = paste0('<img width="80%" height="80%" src="', thumbnail_link , '"></img>')) %>%
arrange(-Total_Likes) %>%
top_n(10,wt = Total_Likes) %>%
select(image, title, Total_Likes) %>%
datatable(class = "nowrap hover row-border", escape = FALSE, options = list(dom = 't',scrollX = TRUE, autoWidth = TRUE))
mvideo <- video[,.("Total_Dislikes"=round(max(dislikes,na.rm = T),digits = 2)),by=.(title,thumbnail_link)][order(-Total_Dislikes)]
mvideo %>%
mutate(image = paste0('<img width="80%" height="80%" src="', thumbnail_link , '"></img>')) %>%
arrange(-Total_Dislikes) %>%
top_n(10,wt = Total_Dislikes) %>%
select(image, title, Total_Dislikes) %>%
datatable(class = "nowrap hover row-border", escape = FALSE, options = list(dom = 't',scrollX = TRUE, autoWidth = TRUE))
ggplot(video[,.("views"=max(views),"likes"=max(likes)),by=title],aes(views,likes,colour=likes,size=likes))+
geom_jitter()+
geom_smooth()+
guides(fill="none")+
labs(caption=NULL,title="Views Vs Likes",subtitle="In days")+
theme(legend.position ="none")+ geom_text_repel(data=subset(video[,.("views"=max(views),"likes"=max(likes)),by=title], views > 5e+07), aes(views,likes,label=title))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour, size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
sents_eap <- sentiment(video$title)
## Warning: Each time `sentiment` is run it has to do sentence boundary disambiguation when a
## raw `character` vector is passed to `text.var`. This may be costly of time and
## memory. It is highly recommended that the user first runs the raw `character`
## vector through the `get_sentences` function.
sents_eap <- sents_eap[,.("word_count"=sum(word_count),"sentiment"=sum(sentiment)),by=element_id]
ggplot(data=sents_eap)+
geom_histogram(mapping = aes(x=sentiment),binwidth = .1)+
theme_bw()+scale_fill_brewer(palette = "Set1")+
geom_vline(xintercept = 0, color = "red", size = 1.5, alpha = 0.6, linetype = "longdash") +
labs(title="Video Title Score",caption=NULL)+coord_cartesian(xlim = c(-4, 4))
biga <- unnest_tokens(video,bigram, title, token = "ngrams", n = 2)
biga <- as.data.table(biga)
ggplot(biga[,.N,by=bigram][order(-N)][1:19],aes(reorder(bigram,-N),N,fill=bigram))+geom_bar(stat="identity")+geom_label(aes(label=N))+guides(fill="none")+theme(axis.text.x = element_text(angle = 45,hjust = 1))+ labs(caption=NULL,title="Top Video Title bigrams")+xlab(NULL)+ylab(NULL)
SentimentScore == 0 refers to “Neutral”, SentimentScore > 0 means “Positive”, and SentimentScore < 0 stands for “Negative”.
# Perform sentiment analysis (using SentimentIntensityAnalyzer from syuzhet package)
library(syuzhet)
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:sentimentr':
##
## get_sentences
## The following object is masked from 'package:plotrix':
##
## rescale
sentiment_scores <- get_sentiment(video$title, method = "afinn")
# Combine sentiment scores with the original data
video$SentimentScore <- sentiment_scores
# View summary statistics of sentiment variable
summary(video$SentimentScore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.0000 0.0000 0.0000 0.1325 0.0000 5.0000
# Create a bar plot of sentiment scores
ggplot(video, aes(x = channel_title, y = sentiment_scores, fill = sentiment_scores)) +
geom_bar(stat = "identity") +
ggtitle("Description Sentiments")+xlab("Sentiment")+ylab("SentimentScore")+
theme(axis.text.x = element_text(angle = 45, size=8,hjust = 1))
video[,"Word_len":= str_length(title)]
ggplot(video[,.N,keyby=Word_len],aes(Word_len,N,fill=N))+geom_bar(stat = "identity")+guides(fill="none")+labs(caption="Words Length",title="Word Length in Ttitle")+xlab(NULL)+ylab("Frequency of Words")
# Create a data frame for stop words (common words to exclude)
stop_words <- data.frame(
word = c("a", "an", "the", "and", "is", "it", "this")
)
# Perform text preprocessing using tm package
corpus <- Corpus(VectorSource(video$title))
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus <- tm_map(corpus, removeWords, stopwords("en"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("en")):
## transformation drops documents
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
# Create a document-term matrix
dtm <- DocumentTermMatrix(corpus)
# Convert the document-term matrix to a data frame
wordcloud_data <- as.data.frame(as.matrix(dtm))
# Calculate word frequencies
word_frequencies <- colSums(wordcloud_data)
# Generate a word cloud
wordcloud(names(word_frequencies), word_frequencies, max.words = 100)
Sentiment analysis of comparing online videos is to find the viewed, liked, disliked, commented videos and emotions of users such as positive , negative and neutral depending on the sentiment_scores which they express on social media and other online resources. The revolution of social media sites has also attracted the users towards video sharing sites, such as YouTube. We can also see which words in videos’ titles and viewers comments led to higher numbers of word counts and favorites. Finally, All of these top 100 words are displayed in word cloud diagram.