Simple Sentiment Analysis from Twitter

Step 1: Define what is positive and what is negative:

Lists of positive and negative words can also be found at http://ptrckprry.com/course/ssd/data/positive-words.txt and http://ptrckprry.com/course/ssd/data/negative-words.txt

# Seed lexicon: a small hand-picked set of positive and negative terms.
# Multi-word phrases ("great improvement", "not good") are listed alongside
# the single words they contain.
posWords <- c("yes","great","improvement","love","great improvement","very good","good","right","very")
negWords <- c("no", "hate","bad","not good","horrible")

# One row per term: the term itself, its sentiment value (+1 / -1) and its
# length in characters. Both halves are built with stringsAsFactors = FALSE so
# `words` is a plain character column on every R version.
wordsDF <- rbind(
  data.frame(words = posWords, value = 1, stringsAsFactors = FALSE),
  data.frame(words = negWords, value = -1, stringsAsFactors = FALSE)
)
wordsDF$lengths <- nchar(wordsDF$words)

# Longest terms first: scoreSentence() walks the rows in order and strips what
# it matched, so a phrase must be tried before the shorter words inside it.
wordsDF <- wordsDF[order(-wordsDF$lengths), ]

Step 2: We will collect writings from social media regarding the subject matter:

Here we use Twitter. You need to use your own API key and access token, which can be obtained from Twitter when you create an app on it.

# Twitter client and HTTP layer.
# (When this was last run, both packages warned that they had been built
# under R version 3.3.3.)
library(twitteR)
library(httr)

# Fill in the four credentials of your own Twitter app before running:
# api_key <- "xxxxxxxxxxxxxxxxxxxxxxxxx"
# api_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Authenticate the session with those credentials.
# Recorded console output: [1] "Using direct authentication"
setup_twitter_oauth(
  consumer_key = api_key,
  consumer_secret = api_secret,
  access_token = access_token,
  access_secret = access_token_secret
)

# Request up to 500 tweets mentioning each university's handle.
tweets_UCBerkeley <- searchTwitter(searchString = "@UCBerkeley", n = 500)
tweets_WUSTL <- searchTwitter(searchString = "@WUSTL", n = 500)


# Loop over tweets and extract text
library(plyr)
# Note: attaching plyr masks `id` from twitteR (the recorded console output
# also warned that plyr was built under R version 3.3.3).

# Pull the text out of every status object returned by the searches.
feed_UCBerkeley <- laply(tweets_UCBerkeley, function(tweet) tweet$getText())
feed_WUSTL <- laply(tweets_WUSTL, function(tweet) tweet$getText())

# One row per tweet. stringsAsFactors = FALSE keeps the tweet text as
# character data (not a factor) on R versions before 4.0 as well.
sent_UCBerkeley <- data.frame(
  words = feed_UCBerkeley,
  user = "UC_Berkeley",
  stringsAsFactors = FALSE
)
sent_WUSTL <- data.frame(
  words = feed_WUSTL,
  user = "WUSTL",
  stringsAsFactors = FALSE
)

Step 3: Run the comparison and give a score

# options(warn=-1)

#' Score the sentiment of a piece of text against a term lexicon
#'
#' Walks the lexicon row by row, in the order given. For each term, every
#' literal occurrence in the text adds the term's value to the score, and all
#' of those occurrences are then removed so that shorter terms contained in an
#' already-counted phrase are not counted again. The lexicon should therefore
#' be ordered longest term first.
#'
#' Terms are matched as literal text, not as regular expressions, so entries
#' containing characters such as `+` or `*` only match themselves.
#'
#' NOTE(review): matching is case-sensitive and not restricted to whole words
#' ("no" is found inside "know"); this is unchanged from the previous version.
#'
#' @param sentence Text to score; coerced with `as.character()`. `NA` scores 0.
#' @param lexicon Data frame whose first column holds the terms and whose
#'   second column holds their numeric sentiment values. Defaults to the
#'   global `wordsDF`.
#' @return A single number: the sum of `occurrences * value` over all terms.
scoreSentence <- function(sentence, lexicon = wordsDF) {
  text <- as.character(sentence)
  # Extract both columns once instead of row-indexing the data frame in the
  # loop.
  terms <- as.character(lexicon[[1]])
  values <- lexicon[[2]]

  score <- 0
  for (i in seq_along(terms)) {
    term <- terms[i]
    # Missing or empty terms cannot be matched meaningfully.
    if (is.na(term) || !nzchar(term)) {
      next
    }
    # gregexpr() reports -1 (or NA for NA text) when there is no match, so
    # counting the positive start positions gives the number of occurrences.
    hits <- unlist(gregexpr(term, text, fixed = TRUE))
    count <- sum(hits > 0, na.rm = TRUE)
    if (count > 0) {
      score <- score + count * values[i]
      # Remove every counted occurrence, not just the first one.
      text <- gsub(term, "", text, fixed = TRUE)
    }
  }
  score
}


# Score every tweet. vapply() guarantees one number per tweet and still
# returns a numeric vector when there are no tweets at all.
SentimentScore_UCBerkeley <- vapply(
  as.character(sent_UCBerkeley$words), scoreSentence, numeric(1),
  USE.NAMES = FALSE
)
SentimentScore_WUSTL <- vapply(
  as.character(sent_WUSTL$words), scoreSentence, numeric(1),
  USE.NAMES = FALSE
)

# Label each score with its university. The frames are built column by column
# (rather than via cbind() of a string and a numeric vector) so that `score`
# stays numeric instead of being coerced to text.
score_UCBerkeley <- data.frame(
  name = rep("UCBerkeley", length(SentimentScore_UCBerkeley)),
  score = SentimentScore_UCBerkeley,
  stringsAsFactors = FALSE
)
score_WUSTL <- data.frame(
  name = rep("WUSTL", length(SentimentScore_WUSTL)),
  score = SentimentScore_WUSTL,
  stringsAsFactors = FALSE
)

Step 4: Visualization

# Stack both universities' scores into one frame for tabulation.
plotdat <- rbind(score_UCBerkeley, score_WUSTL)
# Not used by the plot below; kept in case later code reads them.
y <- as.factor(score_UCBerkeley$score)
x <- as.factor(score_WUSTL$score)

# Tweets per (university, score). The scores may arrive as character or factor
# values, so convert them to numbers first; otherwise table() orders the
# columns as text (e.g. "-1" before "-2", "10" before "2").
counts <- table(plotdat$name, as.numeric(as.character(plotdat$score)))
barplot(counts, main="Sentiment Analysis on 500 Tweets: University",
        xlab="Sentiment score", ylab="Number of tweets",
        col=c("darkblue","red"),
        legend.text = rownames(counts), beside=TRUE)

# Re-run the analysis with larger published word lists in place of the
# hand-written seed lexicon.
# NOTE(review): nothing in this section calls an RCurl function by name, and
# read.csv() can open http:// URLs on its own — confirm whether this library
# is still needed.
library(RCurl)
## Loading required package: bitops
# Download the positive and negative word lists. read.csv() uses its defaults
# here, so the first line of each file is consumed as the header row.
postive_word <- read.csv("http://ptrckprry.com/course/ssd/data/positive-words.txt")
negative_word <- read.csv("http://ptrckprry.com/course/ssd/data/negative-words.txt")

# Drop rows 515, 516 and 1607 of the negative list, then re-wrap the result in
# a data frame. NOTE(review): the reason for excluding these rows is not
# recorded — presumably they hold entries that break pattern matching in
# scoreSentence(); confirm against the downloaded file.
negative_word <- data.frame(negative_word[-c(515,516,1607), ])
# Keep rows 48 onward of each list as plain character vectors.
# NOTE(review): presumably the first 47 rows are the files' header commentary,
# not words — TODO confirm, since the index is hard-coded and will silently go
# stale if the remote files change.
posWords <- as.character(postive_word[48:nrow(postive_word),])
negWords <- as.character(negative_word[48:nrow(negative_word),])

# Rebuild the lexicon from the downloaded lists: one row per term with its
# sentiment value (+1 / -1) and its length in characters. Both halves are
# built with stringsAsFactors = FALSE so `words` is a plain character column
# on every R version.
wordsDF <- rbind(
  data.frame(words = posWords, value = 1, stringsAsFactors = FALSE),
  data.frame(words = negWords, value = -1, stringsAsFactors = FALSE)
)
wordsDF$lengths <- nchar(wordsDF$words)

# Longest terms first: scoreSentence() walks the rows in order and strips what
# it matched, so longer terms must be tried before shorter ones they contain.
wordsDF <- wordsDF[order(-wordsDF$lengths), ]


# Re-score every tweet against the rebuilt lexicon. vapply() guarantees one
# number per tweet and still returns a numeric vector when there are no tweets.
SentimentScore_UCBerkeley <- vapply(
  as.character(sent_UCBerkeley$words), scoreSentence, numeric(1),
  USE.NAMES = FALSE
)
SentimentScore_WUSTL <- vapply(
  as.character(sent_WUSTL$words), scoreSentence, numeric(1),
  USE.NAMES = FALSE
)

# Label each score with its university. The frames are built column by column
# (rather than via cbind() of a string and a numeric vector) so that `score`
# stays numeric instead of being coerced to text.
score_UCBerkeley <- data.frame(
  name = rep("UCBerkeley", length(SentimentScore_UCBerkeley)),
  score = SentimentScore_UCBerkeley,
  stringsAsFactors = FALSE
)
score_WUSTL <- data.frame(
  name = rep("WUSTL", length(SentimentScore_WUSTL)),
  score = SentimentScore_WUSTL,
  stringsAsFactors = FALSE
)

# Stack both universities' scores into one frame for tabulation.
plotdat <- rbind(score_UCBerkeley, score_WUSTL)
# Not used by the plot below; kept in case later code reads them.
y <- as.factor(score_UCBerkeley$score)
x <- as.factor(score_WUSTL$score)

# Tweets per (university, score). The scores may arrive as character or factor
# values, so convert them to numbers first; otherwise table() orders the
# columns as text (e.g. "-1" before "-2", "10" before "2").
counts <- table(plotdat$name, as.numeric(as.character(plotdat$score)))
barplot(counts, main="Sentiment Analysis on 500 Tweets: University",
        xlab="Sentiment score", ylab="Number of tweets",
        col=c("darkblue","red"),
        legend.text = rownames(counts), beside=TRUE)