# Install packages tm, SnowballC, wordcloud, RColorBrewer, syuzhet, and ggplot2.
# One-time setup (run manually if a package is missing):
# install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer",
#                    "syuzhet", "ggplot2"))

# Attach every package the analysis calls into before any of its functions
# are used; previously only ggplot2 was attached, so Corpus() could not be found.
library("tm")            # Corpus(), tm_map(), TermDocumentMatrix(), findAssocs()
library("SnowballC")     # stemming support for stemDocument()
library("wordcloud")     # wordcloud()
library("RColorBrewer")  # brewer.pal()
library("syuzhet")       # get_sentiment(), get_nrc_sentiment()
library("ggplot2")       # sentiment bar chart

# Read the text file from the local machine; the file is chosen interactively.
# Each line of the file becomes one element of `text`.
text <- readLines(file.choose())
# Load the data as a corpus (one document per element of `text`)
TextDoc <- Corpus(VectorSource(text))
# Replace "/", "@" and "|" with a space so the words they join are split apart
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# The install.packages("tm") call that used to sit here was removed: tm is
# already in use by this point, and installing a package in the middle of the
# pipeline is a network side effect that belongs in one-time setup instead.
TextDoc <- tm_map(TextDoc, toSpace, "/")
TextDoc <- tm_map(TextDoc, toSpace, "@")
TextDoc <- tm_map(TextDoc, toSpace, "\\|")
# Convert the text to lower case
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
# Remove numbers
TextDoc <- tm_map(TextDoc, removeNumbers)
# Remove common English stopwords
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
# Remove your own stopwords, specified as a character vector
TextDoc <- tm_map(TextDoc, removeWords, c("s", "company", "team"))
# Remove punctuation
# NOTE(review): non-ASCII punctuation such as curly quotes appears to survive
# this step (terms like “ show up among the frequent terms); consider
# removePunctuation's `ucp = TRUE` option -- confirm against the tm docs.
TextDoc <- tm_map(TextDoc, removePunctuation)
# Eliminate extra white space
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Text stemming - reduces words to their root form
TextDoc <- tm_map(TextDoc, stemDocument)
# Term-document matrix built from the cleaned corpus, converted to a plain
# matrix so the row totals can be computed
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
dtm_m <- as.matrix(TextDoc_dtm)
# Total count per term, most frequent first
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
# Show the 500 most frequent words as an interactive table and save them
library("DT")
top_500 <- head(dtm_d, n = 500)
datatable(top_500)
write.csv(top_500, file = "tme_top_500.csv")
# Plot the most frequent words.
# head() caps the selection at the number of terms actually available, so a
# corpus with fewer than 50 distinct terms no longer produces NA bars and
# labels (dtm_d[1:50, ] pads the result with NA rows in that case).
top_words <- head(dtm_d, 50)
barplot(top_words$freq, las = 2, names.arg = top_words$word,
        col = "lightgreen", main = "Top 50 most frequent words",
        ylab = "Word frequencies")
# Generate the word cloud; the RNG is seeded right before drawing
set.seed(1234)
wordcloud(
  words = dtm_d$word,
  freq = dtm_d$freq,
  min.freq = 5,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  colors = brewer.pal(8, "Dark2")
)
# Find associations for a hand-picked list of terms.
# The corpus was stemmed with stemDocument(), so the matrix only holds stems
# (e.g. "ukrain", "militari"). Unstemmed spellings such as "ukraine",
# "military" or "kids" can never match and always return numeric(0), so the
# query terms are stemmed the same way before the lookup.
# NOTE: previously recorded output keyed by the unstemmed spellings will not
# match after this change; re-run to refresh it.
assoc_terms <- c(
  "putin", "ukraine", "ukrainian", "power", "war", "russia", "russian",
  "prigozhin", "money", "pay", "work", "job", "military", "conscript",
  "kids", "children", "death"
)
findAssocs(TextDoc_dtm, terms = unique(stemDocument(assoc_terms)),
           corlimit = 0.5)
$putin
numeric(0)
$ukraine
numeric(0)
$ukrainian
numeric(0)
$power
numeric(0)
$war
neocon aggress depend superpow pandem
0.56 0.55 0.52 0.52 0.51
$russia
numeric(0)
$russian
numeric(0)
$prigozhin
numeric(0)
$money
numeric(0)
$pay
numeric(0)
$work
numeric(0)
$job
rtvi ⭕️ erad fratern greatest ottoman rebuild
0.53 0.53 0.53 0.53 0.53 0.53 0.53
$military
numeric(0)
$conscript
enlist eroshenko kavkaz rite yeroshenko “satanic” commissar
0.71 0.71 0.71 0.71 0.71 0.71 0.71
$kids
numeric(0)
$children
indigen irrespons kabanov multiethn sadovod tsargrad migrant parent
0.63 0.63 0.63 0.63 0.63 0.63 0.62 0.60
academ ghetto kotelniki school greedi
0.60 0.60 0.60 0.53 0.53
$death
berlusconi silvio
0.54 0.51
# Find associations for every term that occurs at least 180 times
frequent_terms <- findFreqTerms(TextDoc_dtm, lowfreq = 180)
findAssocs(TextDoc_dtm, terms = frequent_terms, corlimit = 0.5)
$author
numeric(0)
$day
apostol lent pentecost
0.6 0.6 0.6
$fact
numeric(0)
$just
numeric(0)
$militari
numeric(0)
$can
numeric(0)
$countri
numeric(0)
$even
numeric(0)
$governor
numeric(0)
$offic
numeric(0)
$one
numeric(0)
$power
numeric(0)
$will
numeric(0)
$first
numeric(0)
$`“`
numeric(0)
$putin
numeric(0)
$said
numeric(0)
$ukrainian
numeric(0)
$vladimir
numeric(0)
$year
numeric(0)
$russia
numeric(0)
$russian
numeric(0)
$state
numeric(0)
$time
numeric(0)
$work
numeric(0)
$accord
numeric(0)
$case
crimin
0.57
$new
numeric(0)
$peopl
numeric(0)
$main
numeric(0)
$say
numeric(0)
$deputi
numeric(0)
$https
numeric(0)
$everyth
numeric(0)
$forc
arm
0.57
$region
numeric(0)
$territori
numeric(0)
$ukrain
numeric(0)
$war
neocon aggress depend superpow pandem
0.56 0.55 0.52 0.52 0.51
$also
numeric(0)
$feder
numeric(0)
$attack
uav
0.54
$ministri
numeric(0)
$now
numeric(0)
$presid
numeric(0)
$citi
numeric(0)
$public
tighten alrosa bike candidaci chemezov kinder nas nasa
0.51 0.50 0.50 0.50 0.50 0.50 0.50 0.50
siluanov smut thievish vaino vouch
0.50 0.50 0.50 0.50 0.50
$alreadi
primari conceptu sidyakin turchak idioci
0.52 0.52 0.52 0.52 0.51
$call
boorish
0.51
$well
numeric(0)
$person
numeric(0)
$head
numeric(0)
$moscow
numeric(0)
$offici
numeric(0)
$hous
numeric(0)
$court
numeric(0)
$rubl
billion million
0.56 0.54
# Regular sentiment score using the get_sentiment() function and a method of your choice.
# Please note that different methods may have different scales.
# `text` holds the raw input lines, so one score is produced per line.
syuzhet_vector <- get_sentiment(text, method="syuzhet")
# See the first six elements of the vector (head() default)
head(syuzhet_vector)
[1] 0.00 -0.10 2.30 1.50 2.65 1.50
# See summary statistics of the vector
summary(syuzhet_vector)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-10.0500 -1.9500 -0.3250 -0.4793 1.1500 8.0000
# Same scoring with the "bing" method
bing_vector <- get_sentiment(text, method="bing")
head(bing_vector)
[1] 0 0 -1 3 1 3
summary(bing_vector)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-12.000 -3.000 -1.000 -1.335 0.000 7.000
# Same scoring with the "afinn" method
afinn_vector <- get_sentiment(text, method="afinn")
head(afinn_vector)
[1] 0 -2 0 9 4 9
summary(afinn_vector)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-27.000 -7.000 -3.000 -3.587 1.000 19.000
# Compare the sign of the first six scores across the three methods.
# sign() maps each score to -1, 0 or 1, which puts the differently scaled
# methods on a common footing; rows are syuzhet, bing, afinn in that order.
rbind(
sign(head(syuzhet_vector)),
sign(head(bing_vector)),
sign(head(afinn_vector))
)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0 -1 1 1 1 1
[2,] 0 0 -1 1 1 1
[3,] 0 -1 0 1 1 1
# Run the NRC sentiment analysis on the raw input lines. Instead of a single
# score, the result is a data frame with per-row counts for the emotions
# anger, anticipation, disgust, fear, joy, sadness, surprise and trust,
# plus the number of negative and positive words found in each row.
d <- get_nrc_sentiment(text)
[WARNING] This document format requires a nonempty <title> element.
Defaulting to 'tme.knit' as the title.
To specify a title, use 'title' in metadata or --metadata title="...".
# Save the NRC sentiment data frame to CSV and show it as an interactive table
write.csv(x = d, file = "tme_nrc.csv")
datatable(data = d)
# Transpose so that each row is a sentiment and each column is one input line
td <- data.frame(t(d))
# rowSums() adds up all columns of each row, i.e. the total per sentiment.
# Sum over ALL columns: the former hard-coded td[2:1881] skipped the first
# input line, failed for inputs shorter than 1881 lines and silently ignored
# everything beyond line 1881.
td_new <- data.frame(rowSums(td))
# Transformation and cleaning: name the total column and turn the row names
# into a regular `sentiment` column
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
# Keep the first eight rows (the emotions); the remaining rows are presumably
# the negative/positive totals -- verify against the columns of `d`
td_new2 <- td_new[1:8, ]
# Plot one - count of words associated with each sentiment.
# quickplot()/qplot() is deprecated as of ggplot2 3.4.0; the explicit ggplot()
# call builds the same weighted bar chart (bar height = sum of `count`).
ggplot(td_new2, aes(x = sentiment, weight = count, fill = sentiment)) +
  geom_bar() +
  ylab("count") +
  ggtitle("Posts on Telegram sentiments")
# Plot two - share of each of the eight emotions among all emotion words,
# sorted ascending and drawn as horizontal bars.
# NOTE(review): prop.table() yields proportions in [0, 1] while the axis label
# says "Percentage" -- confirm which is intended.
emotion_share <- sort(colSums(prop.table(d[, 1:8])))
barplot(
  emotion_share,
  horiz = TRUE, cex.names = 0.7, las = 1,
  main = "Emotions in Posts on Telegram",
  xlab = "Percentage"
)
library(DT)
library(readr)
# Load the text-only CSV, skipping the column readr names `...1`, and show
# the result as an interactive table
text_only_cols <- cols(...1 = col_skip())
tme_textonly_t <- read_csv("tme_textonly_t.csv", col_types = text_only_cols)
datatable(tme_textonly_t)