require(plyr)
## Loading required package: plyr
require(stringr)
## Loading required package: stringr
require(tidyverse)
## Loading required package: tidyverse
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## arrange():   dplyr, plyr
## compact():   purrr, plyr
## count():     dplyr, plyr
## failwith():  dplyr, plyr
## filter():    dplyr, stats
## id():        dplyr, plyr
## lag():       dplyr, stats
## mutate():    dplyr, plyr
## rename():    dplyr, plyr
## summarise(): dplyr, plyr
## summarize(): dplyr, plyr
require(stringi)
## Loading required package: stringi
#result from sentistrengh
sentimentout <- read.table("sentimentout.txt", 
               sep="\t", 
               col.names=c("Positive", "Negative","Text"), 
               fill=FALSE, 
               strip.white=TRUE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
sentimentout <- sentimentout[3:691,]

#clean data, replace smiley with words
smiles <- data.frame(s=c( ">:(", ":<", ">:", "=(", "=[", "='(",  "^_^",":)","=)","(=" ,"=]","^.^",":(",";)",";-)",":D","XD",":P",";P","<3"),
                     r=c("unhappyface","unhappyface","unhappyface","unhappyface","unhappyface","unhappyface","happyface","happyface","happyface","happyface","happyface","happyface","unhappyface","happyface","happyface","happyface","happyface","happyface","happyface","kiss"))
sentimentout2 <- sentimentout
# #clean data
sentimentout$Text %>%
        as_vector() %>%
        str_replace_all(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ")%>%
        str_replace_all("\\d", " ")%>%
        str_replace_all("bobsnewline", " ")%>%
        stri_replace_all_fixed(pattern = smiles$s,replacement = smiles$r,vectorize_all = FALSE)%>%
        str_replace_all("[^[:alnum:]']", " ")%>%
        str_replace_all('\\s\\b[^i]{1}\\s', " ")%>%
        gsub("\\s+", " ",.) -> sentimentout2$Text 

# import positive and negative words
pos = readLines("positive_words.txt")
neg = readLines("negative_words.txt")

score.sentiment = function(sentences, pos, neg, .progress='none')
{
        require(plyr)
        require(stringr)
        
        # we got a vector of sentences. plyr will handle a list or a vector as an "l" for us
        # we want a simple array of scores back, so we use "l" + "a" + "ply" = laply:
        scores = laply(sentences, function(sentence, pos, neg) {
                
                # clean up sentences with R's regex-driven global substitute, gsub():
                sentence = gsub('[[:punct:]]', '', sentence)
                sentence = gsub('[[:cntrl:]]', '', sentence)
                sentence = gsub('\\d+', '', sentence)
                # and convert to lower case:
                sentence = tolower(sentence)
                
                # split into words. str_split is in the stringr package
                word.list = str_split(sentence, '\\s+')
                # sometimes a list() is one level of hierarchy too much
                words = unlist(word.list)
                
                # compare our words to the dictionaries of positive & negative terms
                pos.matches = match(words, pos)
                neg.matches = match(words, neg)
                
                # match() returns the position of the matched term or NA
                # we just want a TRUE/FALSE:
                pos.matches = !is.na(pos.matches)
                neg.matches = !is.na(neg.matches)
                
                # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
                score = sum(pos.matches) - sum(neg.matches)
                
                return(score)
        }, pos, neg, .progress=.progress )
        
        scores.df = data.frame(score=scores, text=sentences)
        return(scores.df)
}

result_text <- score.sentiment(sentimentout$Text, pos, neg)

sentimentout2[,4] <- result_text[,1]
colnames(sentimentout2)[4] <- "senti.wordlist"

#add up sentiment score
sentimentout2$Positive <- as.numeric(sentimentout2$Positive)
sentimentout2$Negative <- as.numeric(sentimentout2$Negative)

sentimentout2 %>%
        mutate(sentistrength = Positive - Negative) ->sentimentout3

cor.test(sentimentout3$sentistrength,sentimentout3$senti.wordlist)
## 
##  Pearson's product-moment correlation
## 
## data:  sentimentout3$sentistrength and sentimentout3$senti.wordlist
## t = 8.9448, df = 687, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2544219 0.3883017
## sample estimates:
##       cor 
## 0.3229766
#0.3229766