# Install packages tm, SnowballC, wordcloud, RColorBrewer, syuzhet, and ggplot2.
# Install any missing packages once, up front, and attach them before use.
# Previously the install.packages() calls were scattered through the code
# (tm was installed only after Corpus() had already been called) and none of
# the packages were attached with library().
required_pkgs <- c("tm", "SnowballC", "wordcloud", "RColorBrewer",
                   "syuzhet", "ggplot2")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(syuzhet)
library(ggplot2)

# Read the text file from the local machine; the file is chosen interactively
text <- readLines(file.choose())
# Load the data as a corpus (each element of `text` becomes one document)
TextDoc <- Corpus(VectorSource(text))
# Replace "/", "@" and "|" with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
TextDoc <- tm_map(TextDoc, toSpace, "/")
TextDoc <- tm_map(TextDoc, toSpace, "@")
# "|" is a regex metacharacter, so it has to be escaped
TextDoc <- tm_map(TextDoc, toSpace, "\\|")
# Lower-case the text and strip digits
lowercase <- content_transformer(tolower)
TextDoc <- tm_map(TextDoc, lowercase)
TextDoc <- tm_map(TextDoc, removeNumbers)
# Drop common English stopwords, then the project-specific ones
# (custom stopwords are given as a character vector)
english_stopwords <- stopwords("english")
custom_stopwords <- c("s", "company", "team")
TextDoc <- tm_map(TextDoc, removeWords, english_stopwords)
TextDoc <- tm_map(TextDoc, removeWords, custom_stopwords)
# Remaining clean-up, applied in this order: remove punctuation, collapse
# extra white space, then stem each word to its root form
for (cleanup_step in list(removePunctuation, stripWhitespace, stemDocument)) {
  TextDoc <- tm_map(TextDoc, cleanup_step)
}
# Build a term-document matrix and convert it to a dense matrix
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
dtm_m <- as.matrix(TextDoc_dtm)
# Total count of every term, ordered from most to least frequent
term_totals <- rowSums(dtm_m)
dtm_v <- sort(term_totals, decreasing = TRUE)
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
# Display the top 500 most frequent words and save the same rows to disk
top_500 <- head(dtm_d, 500)
top_500
write.csv(top_500, file = "youtube_top_500.csv")
# Plot the most frequent words.
# head() is used instead of dtm_d[1:50, ] so that a corpus with fewer than
# 50 distinct terms does not introduce NA rows into the plotted data.
top_words <- head(dtm_d, 50)
barplot(top_words$freq, las = 2, names.arg = top_words$word,
        col = "lightgreen", main = "Top 50 most frequent words",
        ylab = "Word frequencies")
# Generate a word cloud (seed set first so repeated runs use the same
# random number stream)
set.seed(1234)
cloud_palette <- brewer.pal(8, "Dark2")
wordcloud(
  words = dtm_d$word,
  freq = dtm_d$freq,
  min.freq = 5,
  max.words = 150,
  random.order = FALSE,
  rot.per = 0.40,
  colors = cloud_palette
)
# Find associations for a hand-picked list of terms.
# The corpus was stemmed with stemDocument() before the term-document matrix
# was built, so the matrix only contains stems (e.g. "ukrain", not "ukraine").
# The query terms are stemmed the same way here; unstemmed spellings that
# differ from their stem never match a row of the matrix and always come
# back as numeric(0).
assoc_terms <- c("putin", "ukraine", "ukrainian", "power", "war", "russia",
                 "russian", "prigozhin", "money", "pay", "work", "job",
                 "military", "conscript", "kids", "children", "death")
findAssocs(TextDoc_dtm, terms = unique(stemDocument(assoc_terms)),
           corlimit = 0.5)
$putin
numeric(0)
$ukraine
numeric(0)
$ukrainian
numeric(0)
$power
numeric(0)
$war
numeric(0)
$russia
numeric(0)
$russian
numeric(0)
$prigozhin
kaput kuil urka woke
0.55 0.55 0.55 0.55
$money
numeric(0)
$pay
numeric(0)
$work
awar replac thrown apocalyps crise growth hinder pandem satan trepid
0.52 0.51 0.51 0.50 0.50 0.50 0.50 0.50 0.50 0.50
unbridl
0.50
$job
satanist bracelet kharitonov yaroslav legal scott undeclar
0.53 0.52 0.52 0.52 0.52 0.52 0.52
$military
numeric(0)
$conscript
numeric(0)
$kids
numeric(0)
$children
eighteen unpunish userepwcxpt
0.55 0.55 0.54
$death
numeric(0)
# Find associations for every term that occurs at least 92 times
frequent_terms <- findFreqTerms(TextDoc_dtm, lowfreq = 92)
findAssocs(TextDoc_dtm, terms = frequent_terms, corlimit = 0.5)
$peopl
numeric(0)
$power
numeric(0)
$countri
numeric(0)
$everyth
numeric(0)
$like
numeric(0)
$live
numeric(0)
$now
numeric(0)
$russian
numeric(0)
$time
numeric(0)
$will
numeric(0)
$fool
numeric(0)
$way
apocalyps crise growth hinder pandem trepid unbridl artifici dark manmad
0.79 0.79 0.79 0.79 0.79 0.79 0.79 0.77 0.77 0.77
persist thrown await revolut satan awar impun involv rather due
0.77 0.77 0.76 0.76 0.76 0.73 0.73 0.72 0.71 0.70
earth otherwis collect decad replac sane els level evil civil
0.69 0.68 0.67 0.67 0.66 0.66 0.64 0.64 0.63 0.63
rais real stop larg problem possibl done infinit without rule
0.62 0.60 0.59 0.58 0.56 0.56 0.53 0.52 0.51 0.50
$ukrain
numeric(0)
$ukrainian
numeric(0)
$war
numeric(0)
$world
numeric(0)
$idiot
numeric(0)
$can
numeric(0)
$person
cult forese notori specul mediocr desir
0.52 0.52 0.52 0.52 0.51 0.50
$understand
numeric(0)
$well
numeric(0)
$work
awar replac thrown apocalyps crise growth hinder pandem satan trepid
0.52 0.51 0.51 0.50 0.50 0.50 0.50 0.50 0.50 0.50
unbridl
0.50
$one
numeric(0)
$year
numeric(0)
$russia
numeric(0)
$need
numeric(0)
$putin
numeric(0)
$just
numeric(0)
$think
numeric(0)
$even
apocalyps crise dark growth hinder pandem trepid unbridl artifici manmad
0.73 0.73 0.73 0.73 0.73 0.73 0.73 0.73 0.72 0.72
persist thrown await revolut satan awar impun rather involv replac
0.72 0.72 0.71 0.71 0.71 0.68 0.68 0.67 0.67 0.64
collect earth decad els sane due real otherwis civil stop
0.63 0.62 0.61 0.61 0.61 0.60 0.59 0.59 0.58 0.56
level rais problem larg realli
0.56 0.55 0.53 0.53 0.51
# Sentiment score for each element of `text` via get_sentiment(), here with
# the "syuzhet" method. Different methods may use different scales, so raw
# scores should not be compared directly across methods.
syuzhet_vector <- get_sentiment(text, method = "syuzhet")
# Show the first six scores
head(syuzhet_vector, n = 6L)
[1] 0.00 -3.25 0.05 -1.75 -1.75 -0.55
# Summary statistics (min, quartiles, median, mean, max) of the syuzhet scores
summary(syuzhet_vector)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-7.750 -2.300 -0.975 -1.199 0.000 6.000
# Same scoring with the "bing" method; show the first six scores
bing_vector <- get_sentiment(text, method = "bing")
head(bing_vector, n = 6L)
[1] 0 -4 0 -3 -2 0
# Summary statistics of the bing scores
summary(bing_vector)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-12.000 -3.000 -1.000 -1.473 0.000 7.000
# Same scoring with the "afinn" method; show the first six scores
afinn_vector <- get_sentiment(text, method = "afinn")
head(afinn_vector, n = 6L)
[1] 0 -9 2 1 -4 5
# Summary statistics of the afinn scores
summary(afinn_vector)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-26.000 -7.000 -3.000 -3.971 0.000 14.000
# Compare the direction (-1 / 0 / 1) of the first six scores from each method:
# one row per method (syuzhet, bing, afinn), one column per input element
first_scores <- rbind(
  head(syuzhet_vector),
  head(bing_vector),
  head(afinn_vector)
)
sign(first_scores)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0 -1 1 -1 -1 -1
[2,] 0 -1 0 -1 -1 0
[3,] 0 -1 1 1 -1 1
# Run the NRC sentiment analysis. Rather than a single score, it returns a
# data frame with one row per element of `text` and one column per emotion:
# anger, anticipation, disgust, fear, joy, sadness, surprise, trust.
# It also counts the number of positive and negative emotions found in each row.
d <- get_nrc_sentiment(text)
# Save the NRC data frame and browse it as an interactive table
write.csv(d, file = "youtube_nrc.csv")
# DT is only attached further down the file, so the call is namespace-qualified
DT::datatable(d)
# Transpose: sentiments become rows, input elements become columns
td <- data.frame(t(d))
# rowSums() totals each sentiment across the remaining columns. The first
# column is dropped, exactly as the former hard-coded td[2:923] did, but
# negative indexing keeps working when the number of input lines changes.
# NOTE(review): skipping the first input line may be a template leftover -
# confirm that it is intended.
td_new <- data.frame(rowSums(td[-1]))
# Transformation and cleaning
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
# Keep the first eight rows (the emotions listed above)
td_new2 <- td_new[1:8, ]
# Plot one - count of words associated with each sentiment.
# Written with ggplot() because quickplot()/qplot() is deprecated in ggplot2.
ggplot(td_new2, aes(x = sentiment, weight = count, fill = sentiment)) +
  geom_bar() +
  ylab("count") +
  ggtitle("YouTube Comment Sentiments")
# Plot two - each emotion's share of all emotion counts, sorted ascending
barplot(
  sort(colSums(prop.table(d[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in YouTube Comments", xlab = "Percentage"
)
# Attach DT (for datatable()) and readr (for read_csv())
library(DT)
library(readr)
# Read youtube_text_t.csv from the working directory
youtube_text_t <- read_csv("youtube_text_t.csv")
New names:Rows: 923 Columns: 2── Column specification ────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): x
dbl (1): ...1
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the imported data with DT's datatable()
datatable(youtube_text_t)