require(plyr)
## Loading required package: plyr
require(stringr)
## Loading required package: stringr
require(tidyverse)
## Loading required package: tidyverse
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## arrange(): dplyr, plyr
## compact(): purrr, plyr
## count(): dplyr, plyr
## failwith(): dplyr, plyr
## filter(): dplyr, stats
## id(): dplyr, plyr
## lag(): dplyr, stats
## mutate(): dplyr, plyr
## rename(): dplyr, plyr
## summarise(): dplyr, plyr
## summarize(): dplyr, plyr
require(stringi)
## Loading required package: stringi
#result from sentistrengh
sentimentout <- read.table("sentimentout.txt",
sep="\t",
col.names=c("Positive", "Negative","Text"),
fill=FALSE,
strip.white=TRUE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
sentimentout <- sentimentout[3:691,]
#clean data, replace smiley with words
smiles <- data.frame(s=c( ">:(", ":<", ">:", "=(", "=[", "='(", "^_^",":)","=)","(=" ,"=]","^.^",":(",";)",";-)",":D","XD",":P",";P","<3"),
r=c("unhappyface","unhappyface","unhappyface","unhappyface","unhappyface","unhappyface","happyface","happyface","happyface","happyface","happyface","happyface","unhappyface","happyface","happyface","happyface","happyface","happyface","happyface","kiss"))
sentimentout2 <- sentimentout
# #clean data
sentimentout$Text %>%
as_vector() %>%
str_replace_all(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", " ")%>%
str_replace_all("\\d", " ")%>%
str_replace_all("bobsnewline", " ")%>%
stri_replace_all_fixed(pattern = smiles$s,replacement = smiles$r,vectorize_all = FALSE)%>%
str_replace_all("[^[:alnum:]']", " ")%>%
str_replace_all('\\s\\b[^i]{1}\\s', " ")%>%
gsub("\\s+", " ",.) -> sentimentout2$Text
# import positive and negative words
pos = readLines("positive_words.txt")
neg = readLines("negative_words.txt")
score.sentiment = function(sentences, pos, neg, .progress='none')
{
require(plyr)
require(stringr)
# we got a vector of sentences. plyr will handle a list or a vector as an "l" for us
# we want a simple array of scores back, so we use "l" + "a" + "ply" = laply:
scores = laply(sentences, function(sentence, pos, neg) {
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos)
neg.matches = match(words, neg)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
}, pos, neg, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
result_text <- score.sentiment(sentimentout$Text, pos, neg)
sentimentout2[,4] <- result_text[,1]
colnames(sentimentout2)[4] <- "senti.wordlist"
#add up sentiment score
sentimentout2$Positive <- as.numeric(sentimentout2$Positive)
sentimentout2$Negative <- as.numeric(sentimentout2$Negative)
sentimentout2 %>%
mutate(sentistrength = Positive - Negative) ->sentimentout3
cor.test(sentimentout3$sentistrength,sentimentout3$senti.wordlist)
##
## Pearson's product-moment correlation
##
## data: sentimentout3$sentistrength and sentimentout3$senti.wordlist
## t = 8.9448, df = 687, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2544219 0.3883017
## sample estimates:
## cor
## 0.3229766
#0.3229766