library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(syuzhet)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(tidyverse)     
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.6     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
library(stringr)        
library(tidytext)       
library(harrypotter) 
# Read the survey responses, one per line (interactive file picker)
text <- readLines(file.choose())
# Load the responses into a tm corpus, one document per line
TextDoc <- Corpus(VectorSource(text))
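# A quick sanity check (optional sketch; output not shown): confirm how many
# documents the corpus holds and peek at the first two responses before cleaning
print(TextDoc)
inspect(TextDoc[1:2])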
# Replace any '/', '@' and '|' in the text with a space
addspaceinstead <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
textindoc <- tm_map(TextDoc, addspaceinstead, "/") 
## Warning in tm_map.SimpleCorpus(TextDoc, addspaceinstead, "/"): transformation
## drops documents
textindoc <- tm_map(TextDoc, addspaceinstead, "@")
## Warning in tm_map.SimpleCorpus(TextDoc, addspaceinstead, "@"): transformation
## drops documents
textindoc <- tm_map(TextDoc, addspaceinstead, "\\|")
## Warning in tm_map.SimpleCorpus(TextDoc, addspaceinstead, "\\|"): transformation
## drops documents
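# Since gsub() accepts a regular expression, the three replacements above could
# also be collapsed into a single call with a character class (an equivalent
# sketch, not run here):
# textindoc <- tm_map(TextDoc, addspaceinstead, "[/@|]")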
# Convert the text to lower case
textindoc <- tm_map(textindoc, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(textindoc, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
textindoc <- tm_map(textindoc, removeNumbers)
## Warning in tm_map.SimpleCorpus(textindoc, removeNumbers): transformation drops
## documents
# Remove common English stopwords
textindoc <- tm_map(textindoc, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(textindoc, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stopwords, specified as a character vector
textindoc <- tm_map(textindoc, removeWords, c("s", "company", "team")) 
## Warning in tm_map.SimpleCorpus(textindoc, removeWords, c("s", "company", :
## transformation drops documents
# Remove punctuations
textindoc <- tm_map(textindoc, removePunctuation)
## Warning in tm_map.SimpleCorpus(textindoc, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
textindoc <- tm_map(textindoc, stripWhitespace)
## Warning in tm_map.SimpleCorpus(textindoc, stripWhitespace): transformation drops
## documents
# Text stemming, which reduces words to their root form
textindoc <- tm_map(textindoc, stemDocument)
## Warning in tm_map.SimpleCorpus(textindoc, stemDocument): transformation drops
## documents
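# For reuse on other survey exports, the cleaning steps above can be wrapped in
# one helper (a minimal sketch; the function name and default stopwords are
# illustrative only):
clean_corpus <- function(corpus, extra_stopwords = c("s", "company", "team")) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeWords, extra_stopwords)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}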
# Build a term-document matrix
textindoc_matrix <- TermDocumentMatrix(textindoc)
tdm_matrix <- as.matrix(textindoc_matrix)
# Sort by decreasing frequency value
tdm_matrix_value <- sort(rowSums(tdm_matrix), decreasing = TRUE)
tdm_matrix_value_dec <- data.frame(word = names(tdm_matrix_value), freq = tdm_matrix_value)
# Display the top 5 most frequent words
head(tdm_matrix_value_dec, 5)
##      word freq
## team team  300
## the   the  270
## and   and  244
## are   are  150
## our   our  114
# Plot the most frequent words
barplot(tdm_matrix_value_dec[1:5,]$freq, las = 2, names.arg = tdm_matrix_value_dec[1:5,]$word,
        col ="purple", main ="Top Five Words",
        ylab = "Word Frequency")

# Generate the word cloud; set.seed() keeps the random layout reproducible
set.seed(123)
wordcloud(words = tdm_matrix_value_dec$word, freq = tdm_matrix_value_dec$freq, min.freq = 5,
          max.words=100, random.order=FALSE, rot.per=0.40, 
          colors=brewer.pal(8, "Dark2"))
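# An interactive alternative (a sketch, assuming the wordcloud2 package is
# installed; not run here). wordcloud2() expects a data frame whose first two
# columns are word and freq, which tdm_matrix_value_dec already provides.
# library(wordcloud2)
# wordcloud2(tdm_matrix_value_dec, size = 0.7)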

# Find associations 
findAssocs(textindoc_matrix, terms = c("good","work","health"), corlimit = 0.25)
## $good
##    time     job synergi 
##    0.30    0.26    0.26 
## 
## $work
## together.    togeth      with     enjoy 
##      0.32      0.30      0.26      0.26 
## 
## $health
##    ""give ""green""  "current decline."    happen      noth      real sentiment 
##      0.35      0.35      0.35      0.35      0.35      0.35      0.35      0.35 
##    suppli      up""      wors    becaus      team   pressur 
##      0.35      0.35      0.35      0.30      0.25      0.25
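# findAssocs() reports the Pearson correlation of term occurrence across the
# documents, so the figures above can be recovered directly from the
# term-document matrix, e.g. for "good" and "time" (a sketch; output not shown,
# and it assumes both terms exist as rows of tdm_matrix):
cor(tdm_matrix["good", ], tdm_matrix["time", ])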
# Find associations for words that occur at least 50 times
findAssocs(textindoc_matrix, terms = findFreqTerms(textindoc_matrix, lowfreq = 50), corlimit = 0.25)
## $and
## accordingly"     collabor      current    disciplin        ensur      evolve, 
##         0.33         0.33         0.33         0.33         0.33         0.33 
##     product.       proper        sound        under        scope       vision 
##         0.33         0.33         0.33         0.33         0.32         0.28 
##     opportun 
##         0.27 
## 
## $are
## there 
##  0.34 
## 
## $have
## teamwork.      fun. 
##      0.26      0.25 
## 
## $team
##  cross member  work. 
##   0.31   0.27   0.26 
## 
## $that
##        educ         go.        quit surprizing.    unaware.         was 
##        0.45        0.45        0.45        0.45        0.45        0.31 
##      agile.     compani 
##        0.30        0.30 
## 
## $with
## numeric(0)
## 
## $work
## together.    togeth     enjoy 
##      0.32      0.30      0.26 
## 
## $`for`
##         room improvement.     opportun        scope      clariti        solut 
##         0.39         0.38         0.27         0.26         0.25         0.25 
## 
## $the
## accordingly"      current    disciplin      evolve,     product.       proper 
##         0.36         0.36         0.36         0.36         0.36         0.36 
##        sound        under        ensur       vision        scope         will 
##         0.36         0.36         0.33         0.30         0.28         0.26 
##       aspect        contu          euc      respons      tension      transit 
##         0.26         0.26         0.26         0.26         0.26         0.26 
## 
## $good
##    time     job synergi 
##    0.30    0.26    0.26 
## 
## $health
##    ""give ""green""  "current decline."    happen      noth      real sentiment 
##      0.35      0.35      0.35      0.35      0.35      0.35      0.35      0.35 
##    suppli      up""      wors    becaus   pressur 
##      0.35      0.35      0.35      0.30      0.25 
## 
## $our
##   efficiency.        gotten          itâ’ productivity,     success." 
##          0.30          0.30          0.30          0.30          0.30 
##        tasks.        though      overall,        follow      principl 
##          0.30          0.25          0.25          0.25          0.25 
##           day 
##          0.25 
## 
## $great
##        journey      satisfact          with,     learning."          march 
##           0.57           0.57           0.57           0.39           0.39 
##         goals.          "love        ahead." opportunities,         toward 
##           0.38           0.34           0.34           0.34           0.29 
##           hard 
##           0.29 
## 
## $feel
##   across   board,   despit    harsh  ourselv   system healthy. somewhat 
##     0.33     0.32     0.32     0.32     0.32     0.32     0.29     0.29 
##    about 
##     0.28
## Sentiment Score

# Score each response with the syuzhet lexicon
syuzhetvector <- get_sentiment(text, method="syuzhet")
head(syuzhetvector)
## [1] 0.00 2.60 4.65 2.55 1.05 1.00
summary(syuzhetvector)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.4500  0.8625  1.5750  1.8709  2.6375  9.0000
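# The scores can also be read as a trajectory across the responses, in file
# order (a sketch using base graphics):
plot(syuzhetvector, type = "l", xlab = "Response", ylab = "Syuzhet sentiment")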
# Score each response with the bing lexicon
bingvector <- get_sentiment(text, method="bing")
head(bingvector)
## [1]  0  3  1  4 -1  1
summary(bingvector)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -3.000   1.000   2.000   1.993   3.000   9.000
# Score each response with the afinn lexicon
afinnvector <- get_sentiment(text, method="afinn")
head(afinnvector)
## [1] 0 4 8 6 5 6
summary(afinnvector)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -6.000   2.000   4.000   4.391   7.000  18.000
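# syuzhet also ships an "nrc" method; a sketch of scoring the same text with it
# for comparison (output not shown):
nrcvector <- get_sentiment(text, method = "nrc")
summary(nrcvector)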
# Bind the syuzhet, bing, and afinn vectors together and
# compare the sign of the first six values of each
rbind(
  sign(head(syuzhetvector)),
  sign(head(bingvector)),
  sign(head(afinnvector))
)
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    0    1    1    1    1    1
## [2,]    0    1    1    1   -1    1
## [3,]    0    1    1    1    1    1
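# Beyond the first six responses, the share of lines on which two lexicons
# agree in sign can be computed directly (a sketch; output not shown):
mean(sign(syuzhetvector) == sign(bingvector))
mean(sign(syuzhetvector) == sign(afinnvector))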
## Classify emotions in text
# get_nrc_sentiment() scores each response against the eight NRC emotions and
# also counts the positive and negative words found in each row
emotionanalysis <- get_nrc_sentiment(text)
# View the first 10 rows of the data frame
head(emotionanalysis, 10)
##    anger anticipation disgust fear joy sadness surprise trust negative positive
## 1      0            0       0    0   0       0        0     0        0        0
## 2      0            1       0    0   1       0        0     2        1        2
## 3      0            3       0    1   0       0        0     1        1        5
## 4      0            1       0    0   1       0        0     1        0        2
## 5      0            3       0    0   2       1        1     3        2        4
## 6      0            2       0    0   2       0        1     4        1        3
## 7      0            0       0    0   0       0        0     0        0        1
## 8      0            2       0    0   2       0        0     4        0        6
## 9      0            4       0    0   4       0        1     4        0        5
## 10     0            3       0    0   3       0        1     3        0        5
# Transpose the data frame so that the sentiments become rows
transpose_emotionanalaysis_lite <- data.frame(t(emotionanalysis))
# Sum each sentiment across responses (columns 2 to 253 of the transposed frame)
transpose_emotionanalaysis <- data.frame(rowSums(transpose_emotionanalaysis_lite[2:253]))
# Transformation and cleaning
names(transpose_emotionanalaysis)[1] <- "frequency"
transpose_emotionanalaysis <- cbind("sentiment" = rownames(transpose_emotionanalaysis), transpose_emotionanalaysis)
rownames(transpose_emotionanalaysis) <- NULL
# Keep only the eight emotions, dropping the negative/positive rows
transpose_emotionanalaysis_new <- transpose_emotionanalaysis[1:8, ]
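# The transpose-and-rowSums steps above can also be written as a single
# colSums() call (an equivalent sketch, assuming 253 responses as the
# hard-coded subset implies):
emotion_totals <- data.frame(
  sentiment = names(emotionanalysis)[1:8],
  frequency = colSums(emotionanalysis[-1, 1:8])
)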
ggplot(transpose_emotionanalaysis_new) +
  geom_point(aes(x = sentiment, y = frequency, col = frequency)) 

quickplot(sentiment, data=transpose_emotionanalaysis_new, weight=frequency, geom="bar", fill=sentiment, 
          ylab="frequency")+ggtitle("Key survey sentiments")