df <- read.csv(file = "Exit Survey 2016.csv", stringsAsFactors = FALSE)
df$QAll <- paste(df$Q1, df$Q2, df$Q3, df$Q4, df$Q5, sep = " ")
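# Caveat: paste() renders NA answers as the literal string "NA", which then
# ends up in the concatenated text. An optional pre-step (a sketch, assuming
# unanswered questions are stored as NA) blanks them out and rebuilds QAll:
# for (q in paste0("Q", 1:5)) df[[q]][is.na(df[[q]])] <- ""
# df$QAll <- paste(df$Q1, df$Q2, df$Q3, df$Q4, df$Q5, sep = " ")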
# Sentiment Analysis
#*******************************************************************************************
library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(RCurl)
## Loading required package: bitops
library(ggplot2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(twitteR)
library(ROAuth)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
## 
##     id
library(stringr)
library(base64enc)
# install.packages("topicmodels")
library(topicmodels)
library(data.table)
library(stringi)
library(devtools)
# install_github("okugami79/sentiment140")
library(sentiment)
## Loading required package: rjson
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## Loading required package: qdapTools
## 
## Attaching package: 'qdapTools'
## The following object is masked from 'package:data.table':
## 
##     shift
## The following object is masked from 'package:plyr':
## 
##     id
## The following object is masked from 'package:twitteR':
## 
##     id
## 
## Attaching package: 'qdap'
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:base':
## 
##     Filter
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdap':
## 
##     %>%
## The following object is masked from 'package:qdapTools':
## 
##     id
## The following object is masked from 'package:qdapRegex':
## 
##     explain
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:twitteR':
## 
##     id, location
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
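# Note: with this many attached packages, several names are masked (see the
# messages above). A masked function can always be called with an explicit
# namespace if needed, e.g. (illustrative):
# dplyr::filter(df, !is.na(Q1))   # dplyr's filter, not stats::filter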
# Sentiment analysis to identify positive/negative responses.
# calculate.sentiment.score() below takes each response's text plus custom
# files of positive and negative words, and returns a score per response.

calculate.sentiment.score <- function(sentences, positive.words, negative.words, .progress = 'none')
{
  require(plyr)
  require(stringr)
  
  # we get a vector of sentences; plyr will handle a list or a vector as an
  # "l" for us, and we want a simple array of scores back, so we use
  # "l" + "a" + "ply" = laply:
  scores <- laply(sentences, function(sentence, positive.words, negative.words)
  {
    ## clean up the sentence with R's regex-driven global substitute, gsub():
    
    # remove control characters
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    
    # remove retweet markers (a holdover from the Twitter-oriented original)
    sentence <- gsub('(RT|via)((?:\\b\\W*@\\W+)+)', '', sentence)
    
    # remove @-mentions
    sentence <- gsub('@\\w+', '', sentence)
    
    # remove links (before punctuation stripping, while "://" is still intact)
    sentence <- gsub('https?://\\S+', '', sentence)
    
    # remove punctuation
    sentence <- gsub('[[:punct:]]', '', sentence)
    
    # remove numbers
    sentence <- gsub('[[:digit:]]', '', sentence)
    
    # collapse runs of whitespace to a single space, then trim the ends
    sentence <- gsub('[ \t]{2,}', ' ', sentence)
    sentence <- gsub('^\\s+|\\s+$', '', sentence)
    
    # drop NA's
    sentence <- sentence[!is.na(sentence)]
    
    # convert to lower case:
    sentence <- tolower(sentence)
    
    # split into words; str_split is in the stringr package
    word.list <- str_split(sentence, '\\s+')
    
    # sometimes a list() is one level of hierarchy too much
    words <- unlist(word.list)
    
    # compare our words to the dictionaries of positive & negative terms
    negative.matches <- match(words, negative.words)
    positive.matches <- match(words, positive.words)
    
    # match() returns the position of the matched term or NA;
    # we just want TRUE/FALSE:
    positive.matches <- !is.na(positive.matches)
    negative.matches <- !is.na(negative.matches)
    
    # conveniently, TRUE/FALSE is treated as 1/0 by sum():
    score <- sum(positive.matches) - sum(negative.matches)
    
    return(score)
  }, positive.words, negative.words, .progress = .progress)
  
  scores.df <- data.frame(score = scores, text = sentences)
  return(scores.df)
}
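# Quick illustrative check of the scorer on toy input (toy lexicons below,
# not the word lists used in the analysis):
# calculate.sentiment.score(c("Great and helpful team", "Poor, bad management"),
#                           c("great", "helpful"), c("poor", "bad"))
# expected scores: 2 and -2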

# Read the positive and negative word lists from the custom files
positive <- scan("positive-words.txt", what = "character", comment.char = ";")
negative <- scan("negative-words.txt", what = "character", comment.char = ";")
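# The lexicons can be extended with domain-specific terms if coverage is poor
# (the words below are illustrative assumptions, not part of the shipped lists):
# positive <- c(positive, "empowering", "mentorship")
# negative <- c(negative, "micromanagement", "burnout")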
Responses_text <- as.character(df$QAll)

#calculate the sentiment score for each Response
Employee_Sentiment <- calculate.sentiment.score(Responses_text, positive, negative, .progress="none")


# Responses with score > 0 are positive, < 0 negative, and 0 neutral
Employee_Sentiment$sentiment[Employee_Sentiment$score == 0] <- "Neutral" 
Employee_Sentiment$sentiment[Employee_Sentiment$score < 0] <- "Negative"
Employee_Sentiment$sentiment[Employee_Sentiment$score > 0] <- "Positive"
Employee_Sentiment$sentiment <- factor(Employee_Sentiment$sentiment)


# Check the response counts at each sentiment score
table(Employee_Sentiment$score)
## 
##  -8  -6  -5  -3  -2  -1   0   1   2   3   4   5   6   7   8   9  10  11 
##   1   1   5  17  46 104 232 283 229 185 140  89  77  48  27  24  10   6 
##  12  13  14  15 
##   1   1   3   1
mean(Employee_Sentiment$score) 
## [1] 2.283007
median(Employee_Sentiment$score)
## [1] 2
#Display the sentiment Summary of Responses Analysed
summary(Employee_Sentiment$sentiment) 
## Negative  Neutral Positive 
##      174      232     1124
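# Share of each sentiment class, derived from the summary above:
round(100 * prop.table(table(Employee_Sentiment$sentiment)), 1)
## Negative  Neutral Positive 
##     11.4     15.2     73.5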
#Plot the Sentiment summary of Responses Analysed
ggplot(data = Employee_Sentiment, aes(x = score, fill = sentiment)) + 
  geom_bar() + 
  labs(title = "Sentiment Score Bar Plot", x = "Sentiment Score", y = "Response Count") +
  scale_x_continuous(breaks = seq(-8, 15, 1)) + 
  scale_y_continuous(breaks = seq(0, 300, 50)) + 
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))
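# Optionally save the plot to disk (the filename here is illustrative):
# ggsave("sentiment_overall.png", width = 8, height = 5)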

#############################################################################################
##############                         Question 1                         ###################
#############################################################################################

# Reuse calculate.sentiment.score() and the positive/negative word lists
# defined above; only the input column changes for each question.
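# A compact alternative to repeating this pipeline per question (a sketch
# using the Q1..Q5 columns built above):
# for (q in paste0("Q", 1:5)) {
#   s <- calculate.sentiment.score(as.character(df[[q]]), positive, negative)
#   cat(q, "mean sentiment score:", round(mean(s$score), 3), "\n")
# }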
Responses_text <- as.character(df$Q1)

#calculate the sentiment score for each Response
Employee_Sentiment <- calculate.sentiment.score(Responses_text, positive, negative, .progress="none")

# Responses with score > 0 are positive, < 0 negative, and 0 neutral
Employee_Sentiment$sentiment[Employee_Sentiment$score == 0] <- "Neutral" 
Employee_Sentiment$sentiment[Employee_Sentiment$score < 0] <- "Negative"
Employee_Sentiment$sentiment[Employee_Sentiment$score > 0] <- "Positive"
Employee_Sentiment$sentiment <- factor(Employee_Sentiment$sentiment)


# Check the response counts at each sentiment score
table(Employee_Sentiment$score)
## 
##  -5  -4  -3  -2  -1   0   1   2   3   4   5   7   8 
##   3   5  21  44 213 714 381 103  25  12   7   1   1
mean(Employee_Sentiment$score) 
## [1] 0.2359477
median(Employee_Sentiment$score)
## [1] 0
#Display the sentiment Summary of Responses Analysed
summary(Employee_Sentiment$sentiment) 
## Negative  Neutral Positive 
##      286      714      530
#Plot the Sentiment summary of Responses Analysed
ggplot(data = Employee_Sentiment, aes(x = score, fill = sentiment)) + 
  geom_bar() + 
  labs(title = "Sentiment Score Bar Plot", x = "Sentiment Score", y = "Response Count") +
  scale_x_continuous(breaks = seq(-5, 8, 1)) + 
  scale_y_continuous(breaks = seq(0, 700, 100)) + 
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))

#############################################################################################
##############                         Question 2                         ###################
#############################################################################################

# Reuse calculate.sentiment.score() and the word lists defined above.
Responses_text <- as.character(df$Q2)

#calculate the sentiment score for each Response
Employee_Sentiment <- calculate.sentiment.score(Responses_text, positive, negative, .progress="none")



# Responses with score > 0 are positive, < 0 negative, and 0 neutral
Employee_Sentiment$sentiment[Employee_Sentiment$score == 0] <- "Neutral" 
Employee_Sentiment$sentiment[Employee_Sentiment$score < 0] <- "Negative"
Employee_Sentiment$sentiment[Employee_Sentiment$score > 0] <- "Positive"
Employee_Sentiment$sentiment <- factor(Employee_Sentiment$sentiment)


# Check the response counts at each sentiment score
table(Employee_Sentiment$score)
## 
##  -4  -3  -2  -1   0   1   2   3   4   5   6 
##   2   2  29 130 555 431 235  95  36  13   2
mean(Employee_Sentiment$score) 
## [1] 0.7875817
median(Employee_Sentiment$score)
## [1] 1
#Display the sentiment Summary of Responses Analysed
summary(Employee_Sentiment$sentiment) 
## Negative  Neutral Positive 
##      163      555      812
#Plot the Sentiment summary of Responses Analysed
ggplot(data = Employee_Sentiment, aes(x = score, fill = sentiment)) + 
  geom_bar() + 
  labs(title = "Sentiment Score Bar Plot", x = "Sentiment Score", y = "Response Count") +
  scale_x_continuous(breaks = seq(-4, 6, 1)) + 
  scale_y_continuous(breaks = seq(0, 600, 100)) + 
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))

#############################################################################################
##############                         Question 3                         ###################
#############################################################################################

# Reuse calculate.sentiment.score() and the word lists defined above.
Responses_text <- as.character(df$Q3)

#calculate the sentiment score for each Response
Employee_Sentiment <- calculate.sentiment.score(Responses_text, positive, negative, .progress="none")


# Responses with score > 0 are positive, < 0 negative, and 0 neutral
Employee_Sentiment$sentiment[Employee_Sentiment$score == 0] <- "Neutral" 
Employee_Sentiment$sentiment[Employee_Sentiment$score < 0] <- "Negative"
Employee_Sentiment$sentiment[Employee_Sentiment$score > 0] <- "Positive"
Employee_Sentiment$sentiment <- factor(Employee_Sentiment$sentiment)


# Check the response counts at each sentiment score
table(Employee_Sentiment$score)
## 
##  -7  -4  -3  -2  -1   0   1   2   3   4   5   6   8 
##   1   2   4  20 124 826 360 128  41  15   7   1   1
mean(Employee_Sentiment$score) 
## [1] 0.4294118
median(Employee_Sentiment$score)
## [1] 0
#Display the sentiment Summary of Responses Analysed
summary(Employee_Sentiment$sentiment) 
## Negative  Neutral Positive 
##      151      826      553
#Plot the Sentiment summary of Responses Analysed
ggplot(data = Employee_Sentiment, aes(x = score, fill = sentiment)) + 
  geom_bar() + 
  labs(title = "Sentiment Score Bar Plot", x = "Sentiment Score", y = "Response Count") +
  scale_x_continuous(breaks = seq(-7, 8, 1)) + 
  scale_y_continuous(breaks = seq(0, 800, 100)) + 
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))

#############################################################################################
##############                         Question 4                         ###################
#############################################################################################
# Reuse calculate.sentiment.score() and the word lists defined above.
Responses_text <- as.character(df$Q4)

#calculate the sentiment score for each Response
Employee_Sentiment <- calculate.sentiment.score(Responses_text, positive, negative, .progress="none")

# Responses with score > 0 are positive, < 0 negative, and 0 neutral
Employee_Sentiment$sentiment[Employee_Sentiment$score == 0] <- "Neutral" 
Employee_Sentiment$sentiment[Employee_Sentiment$score < 0] <- "Negative"
Employee_Sentiment$sentiment[Employee_Sentiment$score > 0] <- "Positive"
Employee_Sentiment$sentiment <- factor(Employee_Sentiment$sentiment)


# Check the response counts at each sentiment score
table(Employee_Sentiment$score)
## 
##  -4  -3  -2  -1   0   1   2   3   4   5   6 
##   1   8  15  57 762 452 176  41  11   6   1
mean(Employee_Sentiment$score) 
## [1] 0.5830065
median(Employee_Sentiment$score)
## [1] 0
#Display the sentiment Summary of Responses Analysed
summary(Employee_Sentiment$sentiment) 
## Negative  Neutral Positive 
##       81      762      687
#Plot the Sentiment summary of Responses Analysed
ggplot(data = Employee_Sentiment, aes(x = score, fill = sentiment)) + 
  geom_bar() + 
  labs(title = "Sentiment Score Bar Plot", x = "Sentiment Score", y = "Response Count") +
  scale_x_continuous(breaks = seq(-4, 6, 1)) + 
  scale_y_continuous(breaks = seq(0, 800, 100)) + 
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))

#############################################################################################
##############                         Question 5                         ###################
#############################################################################################
# Reuse calculate.sentiment.score() and the word lists defined above.
Responses_text <- as.character(df$Q5)

#calculate the sentiment score for each Response
Employee_Sentiment <- calculate.sentiment.score(Responses_text, positive, negative, .progress="none")


# Responses with score > 0 are positive, < 0 negative, and 0 neutral
Employee_Sentiment$sentiment[Employee_Sentiment$score == 0] <- "Neutral" 
Employee_Sentiment$sentiment[Employee_Sentiment$score < 0] <- "Negative"
Employee_Sentiment$sentiment[Employee_Sentiment$score > 0] <- "Positive"
Employee_Sentiment$sentiment <- factor(Employee_Sentiment$sentiment)


# Check the response counts at each sentiment score
table(Employee_Sentiment$score)
## 
##  -3  -2  -1   0   1   2   3   4   5   6   7 
##   1   4  31 914 310 160  68  31   9   1   1
mean(Employee_Sentiment$score) 
## [1] 0.6366013
median(Employee_Sentiment$score)
## [1] 0
#Display the sentiment Summary of Responses Analysed
summary(Employee_Sentiment$sentiment) 
## Negative  Neutral Positive 
##       36      914      580
#Plot the Sentiment summary of Responses Analysed
ggplot(data = Employee_Sentiment, aes(x = score, fill = sentiment)) + 
  geom_bar() + 
  labs(title = "Sentiment Score Bar Plot", x = "Sentiment Score", y = "Response Count") +
  scale_x_continuous(breaks = seq(-3, 7, 1)) + 
  scale_y_continuous(breaks = seq(0, 900, 100)) + 
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))

#############################################################################################
##############                         Word Cloud                         ###################
#############################################################################################