library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(syuzhet)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.6 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(tidytext)
library(harrypotter)
text <- readLines(file.choose())
TextDoc <- Corpus(VectorSource(text))
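# Optional: for a reproducible run, the interactive chooser above can be swapped for a
# fixed path. "responses.txt" below is only a placeholder filename, not the actual file.
# text <- readLines("responses.txt", encoding = "UTF-8")
# TextDoc <- Corpus(VectorSource(text))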
# Replace any '/', '@', or '|' in the text with a space
addspaceinstead <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
textindoc <- tm_map(TextDoc, addspaceinstead, "/")
## Warning in tm_map.SimpleCorpus(TextDoc, addspaceinstead, "/"): transformation
## drops documents
textindoc <- tm_map(TextDoc, addspaceinstead, "@")
## Warning in tm_map.SimpleCorpus(TextDoc, addspaceinstead, "@"): transformation
## drops documents
textindoc <- tm_map(textindoc, addspaceinstead, "\\|")
## Warning in tm_map.SimpleCorpus(textindoc, addspaceinstead, "\\|"): transformation
## drops documents
# Convert the text to lower case
textindoc <- tm_map(textindoc, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(textindoc, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
textindoc <- tm_map(textindoc, removeNumbers)
## Warning in tm_map.SimpleCorpus(textindoc, removeNumbers): transformation drops
## documents
# Remove common English stopwords
textindoc <- tm_map(textindoc, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(textindoc, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stopwords,
# specified as a character vector
textindoc <- tm_map(textindoc, removeWords, c("s", "company", "team"))
## Warning in tm_map.SimpleCorpus(textindoc, removeWords, c("s", "company", :
## transformation drops documents
# Remove punctuations
textindoc <- tm_map(textindoc, removePunctuation)
## Warning in tm_map.SimpleCorpus(textindoc, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
textindoc <- tm_map(textindoc, stripWhitespace)
## Warning in tm_map.SimpleCorpus(textindoc, stripWhitespace): transformation drops
## documents
# Text stemming, which reduces words to their root form
textindoc <- tm_map(textindoc, stemDocument)
## Warning in tm_map.SimpleCorpus(textindoc, stemDocument): transformation drops
## documents
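# Optional sanity check (a minimal sketch): compare one raw response with its cleaned,
# stemmed version to confirm the transformations above behaved as expected
writeLines(as.character(TextDoc[[1]]))
writeLines(as.character(textindoc[[1]]))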
# Build a term-document matrix
textindoc_matrix <- TermDocumentMatrix(textindoc)
tdm_matrix <- as.matrix(textindoc_matrix)
# Sort by decreasing frequency value
tdm_matrix_value <- sort(rowSums(tdm_matrix), decreasing = TRUE)
tdm_matrix_value_dec <- data.frame(word = names(tdm_matrix_value), freq = tdm_matrix_value)
# Display the top 5 most frequent words
head(tdm_matrix_value_dec, 5)
## word freq
## team team 300
## the the 270
## and and 244
## are are 150
## our our 114
# Plot the most frequent words
barplot(tdm_matrix_value_dec[1:5, ]$freq, las = 2, names.arg = tdm_matrix_value_dec[1:5, ]$word,
        col = "purple", main = "Top Five Words",
        ylab = "Word Frequency")
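
# Optional ggplot2 alternative (a minimal sketch using the packages already loaded above):
# the same five words, reordered by frequency, drawn with geom_col instead of barplot()
ggplot(tdm_matrix_value_dec[1:5, ], aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "purple") +
  labs(title = "Top Five Words", x = NULL, y = "Word Frequency")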

# Generate the word cloud
set.seed(123)
wordcloud(words = tdm_matrix_value_dec$word, freq = tdm_matrix_value_dec$freq, min.freq = 5,
          max.words = 100, random.order = FALSE, rot.per = 0.40,
          colors = brewer.pal(8, "Dark2"))

# Find associations
findAssocs(textindoc_matrix, terms = c("good","work","health"), corlimit = 0.25)
## $good
## time job synergi
## 0.30 0.26 0.26
##
## $work
## together. togeth with enjoy
## 0.32 0.30 0.26 0.26
##
## $health
## ""give ""green"" "current decline." happen noth real sentiment
## 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## suppli up"" wors becaus team pressur
## 0.35 0.35 0.35 0.30 0.25 0.25
# Find associations for words that occur at least 50 times
findAssocs(textindoc_matrix, terms = findFreqTerms(textindoc_matrix, lowfreq = 50), corlimit = 0.25)
## $and
## accordingly" collabor current disciplin ensur evolve,
## 0.33 0.33 0.33 0.33 0.33 0.33
## product. proper sound under scope vision
## 0.33 0.33 0.33 0.33 0.32 0.28
## opportun
## 0.27
##
## $are
## there
## 0.34
##
## $have
## teamwork. fun.
## 0.26 0.25
##
## $team
## cross member work.
## 0.31 0.27 0.26
##
## $that
## educ go. quit surprizing. unaware. was
## 0.45 0.45 0.45 0.45 0.45 0.31
## agile. compani
## 0.30 0.30
##
## $with
## numeric(0)
##
## $work
## together. togeth enjoy
## 0.32 0.30 0.26
##
## $`for`
## room improvement. opportun scope clariti solut
## 0.39 0.38 0.27 0.26 0.25 0.25
##
## $the
## accordingly" current disciplin evolve, product. proper
## 0.36 0.36 0.36 0.36 0.36 0.36
## sound under ensur vision scope will
## 0.36 0.36 0.33 0.30 0.28 0.26
## aspect contu euc respons tension transit
## 0.26 0.26 0.26 0.26 0.26 0.26
##
## $good
## time job synergi
## 0.30 0.26 0.26
##
## $health
## ""give ""green"" "current decline." happen noth real sentiment
## 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## suppli up"" wors becaus pressur
## 0.35 0.35 0.35 0.30 0.25
##
## $our
## efficiency. gotten itâ’ productivity, success."
## 0.30 0.30 0.30 0.30 0.30
## tasks. though overall, follow principl
## 0.30 0.25 0.25 0.25 0.25
## day
## 0.25
##
## $great
## journey satisfact with, learning." march
## 0.57 0.57 0.57 0.39 0.39
## goals. "love ahead." opportunities, toward
## 0.38 0.34 0.34 0.34 0.29
## hard
## 0.29
##
## $feel
## across board, despit harsh ourselv system healthy. somewhat
## 0.33 0.32 0.32 0.32 0.32 0.32 0.29 0.29
## about
## 0.28
## Sentiment scores
# syuzhet
syuzhetvector <- get_sentiment(text, method="syuzhet")
head(syuzhetvector)
## [1] 0.00 2.60 4.65 2.55 1.05 1.00
summary(syuzhetvector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4500 0.8625 1.5750 1.8709 2.6375 9.0000
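# Optional (a minimal sketch): plot the syuzhet score of each response in order, to spot
# the responses that fall below zero (net negative sentiment)
plot(syuzhetvector, type = "h", xlab = "Response number", ylab = "Syuzhet sentiment score")
abline(h = 0, lty = 2)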
# bing
bingvector <- get_sentiment(text, method="bing")
head(bingvector)
## [1] 0 3 1 4 -1 1
summary(bingvector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.000 1.000 2.000 1.993 3.000 9.000
# afinn
afinnvector <- get_sentiment(text, method="afinn")
head(afinnvector)
## [1] 0 4 8 6 5 6
summary(afinnvector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.000 2.000 4.000 4.391 7.000 18.000
# Bind the syuzhet, bing, and afinn vectors together and
# compare the sign of the first six scores from each method
rbind(
sign(head(syuzhetvector)),
sign(head(bingvector)),
sign(head(afinnvector))
)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0 1 1 1 1 1
## [2,] 0 1 1 1 -1 1
## [3,] 0 1 1 1 1 1
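# Optional (a minimal sketch): beyond the first six responses, compute the share of all
# responses where the three lexicons agree on the sign of the sentiment
mean(sign(syuzhetvector) == sign(bingvector) & sign(bingvector) == sign(afinnvector))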
## Classify emotions in the text
# get_nrc_sentiment counts the eight NRC emotions plus positive and negative sentiment in each response
emotionanalysis <- get_nrc_sentiment(text)
# Let's view the first 10 rows of the data frame
head(emotionanalysis, 10)
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 1 0 0 2 1 2
## 3 0 3 0 1 0 0 0 1 1 5
## 4 0 1 0 0 1 0 0 1 0 2
## 5 0 3 0 0 2 1 1 3 2 4
## 6 0 2 0 0 2 0 1 4 1 3
## 7 0 0 0 0 0 0 0 0 0 1
## 8 0 2 0 0 2 0 0 4 0 6
## 9 0 4 0 0 4 0 1 4 0 5
## 10 0 3 0 0 3 0 1 3 0 5
# Transpose it
transpose_emotionanalysis_lite <- data.frame(t(emotionanalysis))
transpose_emotionanalysis <- data.frame(rowSums(transpose_emotionanalysis_lite[2:253]))
# Transformation and cleaning
names(transpose_emotionanalysis)[1] <- "frequency"
transpose_emotionanalysis <- cbind("sentiment" = rownames(transpose_emotionanalysis), transpose_emotionanalysis)
rownames(transpose_emotionanalysis) <- NULL
transpose_emotionanalysis_new <- transpose_emotionanalysis[1:8, ]
ggplot(transpose_emotionanalysis_new) +
  geom_point(aes(x = sentiment, y = frequency, col = frequency))

quickplot(sentiment, data = transpose_emotionanalysis_new, weight = frequency, geom = "bar", fill = sentiment,
          ylab = "frequency") + ggtitle("Key survey sentiments")
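
# Optional alternative (a minimal sketch): the emotion totals can also be obtained without
# transposing, by summing the eight NRC emotion columns directly. Note the transpose above
# sums columns 2:253, so the totals may differ slightly if the first response was deliberately excluded there.
emotion_totals <- colSums(emotionanalysis[, 1:8])
emotion_totals_df <- data.frame(sentiment = names(emotion_totals), frequency = as.numeric(emotion_totals))
ggplot(emotion_totals_df, aes(x = sentiment, y = frequency, fill = sentiment)) +
  geom_col() +
  ggtitle("Key survey sentiments")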
