The scripts below carry out sentiment analysis on a collection of text.
The analysis requires the following R packages.
library(twitteR)
## Loading required package: ROAuth
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: digest
## Loading required package: rjson
library(sentiment)
## Loading required package: tm
## Loading required package: NLP
## Loading required package: Rstem
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(stringr)
The first step is to prepare the data for sentiment analysis. In this exercise, I will analyse tweets carrying the hashtag “#Wipro”. The script for downloading the tweets from Twitter is not included here, as it contains user-specific Twitter account secret keys and tokens. The result of that download is a data frame with the required data.
The tweets on Wipro are loaded into a data frame called “wipro_senti”.
# Load the previously downloaded tweets from disk and preview the first rows.
wipro_senti <- read.csv(file = "wipro_senti.csv")
head(wipro_senti, n = 3)
## X
## 1 1
## 2 2
## 3 3
## text
## 1 Poll: Have the #Infosys results set the course for other IT firms such as #TCS, #Wipro and #HCL? http://t.co/mHGyb42fsY
## 2 RT @Brands_India: #Wipro wins $400 million contract from #Swiss engineering giant #ABB.\nhttp://t.co/ZQiftK2jwA http://t.co/7lJzUlaHQU
## 3 #Wipro wins a big deal from ABB\n\nhttp://t.co/ha2JIXhgyp
## favorited favoriteCount replyToSN created truncated
## 1 FALSE 0 NA 2015-01-09 10:08:46 FALSE
## 2 FALSE 0 NA 2015-01-09 06:54:12 FALSE
## 3 FALSE 0 NA 2015-01-09 01:14:06 FALSE
## replyToSID id replyToUID
## 1 NA 5.535e+17 NA
## 2 NA 5.534e+17 NA
## 3 NA 5.534e+17 NA
## statusSource
## 1 <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
## 2 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## 3 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
## screenName retweetCount isRetweet retweeted longitude latitude
## 1 bsindia 0 FALSE FALSE NA NA
## 2 chai2kul 2 TRUE FALSE NA NA
## 3 abhaymhaskar 0 FALSE FALSE NA NA
As seen, the data frame extracted from Twitter has a lot of extra information. All we need is the data under the variable text. Let us first extract the text data and then clean it.
# Keep only the tweet bodies; the remaining columns are not used below.
wipro_txt <- wipro_senti[["text"]]
For cleaning the data, we will create a function which will remove all unwanted elements within the twitter data.
# Strip Twitter-specific noise (links, retweet headers, hashtags, mentions)
# from a vector of tweets.
#
# tweets: character vector (or factor) of raw tweet text.
# Returns a character vector of the same length. Removed tokens are replaced
# by a space and whitespace runs are then collapsed to a single space, so
# neighbouring words are never glued together. Leading/trailing spaces are
# left for the caller to trim.
Cleantweet <- function(tweets) {
  # read.csv() may deliver text as a factor; work on the labels, not the codes.
  tweets <- as.character(tweets)
  # Remove URLs: http or https, any host, any length (not just 8-char t.co ids).
  tweets <- gsub("https?://\\S+", " ", tweets, perl = TRUE)
  # Take out the retweet header; \w also covers digits and underscores,
  # which are legal in screen names (e.g. "RT @Brands_India: ").
  tweets <- gsub("\\bRT @\\w+:?\\s*", " ", tweets, perl = TRUE)
  # Remove the hashtags (a lone "#" is removed as well).
  tweets <- gsub("#\\w*", " ", tweets, perl = TRUE)
  # Remove all references to screen names.
  tweets <- gsub("@\\w*", " ", tweets, perl = TRUE)
  # Collapse the whitespace runs left behind by the removals above.
  tweets <- gsub("\\s+", " ", tweets, perl = TRUE)
  tweets
}
Let us now get the cleaned up text for the Wipro data
# Run the tweet cleaner over the extracted text column.
Wipro_cleantxt <- Cleantweet(tweets = wipro_txt)
We will have to do some more cleaning on this data
# Second cleaning pass over the tweet text: drop punctuation and digits,
# then normalise whitespace.
# remove punctuation
Wipro_cleantxt <- gsub("[[:punct:]]", "", Wipro_cleantxt)
# remove numbers
Wipro_cleantxt <- gsub("[[:digit:]]", "", Wipro_cleantxt)
# collapse runs of spaces/tabs into ONE space; replacing them with "" would
# glue the neighbouring words together (e.g. "the  results" -> "theresults")
Wipro_cleantxt <- gsub("[ \t]{2,}", " ", Wipro_cleantxt)
# trim leading and trailing whitespace
Wipro_cleantxt <- gsub("^\\s+|\\s+$", "", Wipro_cleantxt)
After this round of cleaning, let us convert the text to lower case
# define "tolower error handling" function
# Lower-case `x`, tolerating inputs that make tolower() fail.
# Returns tolower(x) on success, or a logical NA if tolower() raised an error.
Wipro.error <- function(x) {
  # Capture the condition object instead of letting the error propagate.
  lowered <- tryCatch(tolower(x), error = function(cond) cond)
  # Guard clause: any error maps to a missing value.
  if (inherits(lowered, "error")) {
    return(NA)
  }
  lowered
}
# Lower-case every tweet through the error-tolerant wrapper.
Wiprosenti <- sapply(X = Wipro_cleantxt, FUN = Wipro.error)
# Drop tweets that could not be lower-cased (NA), then discard the element
# names that sapply() attached.
keep <- !is.na(Wiprosenti)
Wiprosenti <- Wiprosenti[keep]
names(Wiprosenti) <- NULL
After all the cleaning up, let us now do the sentiment analysis. We use the `sentiment` package for this.
library(sentiment)
# Classify the emotion of each tweet.
Wiproemo <- classify_emotion(Wiprosenti, algorithm = "bayes", prior = 1.0)
# Column 7 of the result is taken as the best-fit emotion label.
emotion_Wipro <- Wiproemo[, 7]
# Tweets with no best fit come back as NA; label them "unknown" instead.
emotion_Wipro <- replace(emotion_Wipro, is.na(emotion_Wipro), "unknown")
head(emotion_Wipro, n = 20)
## [1] "unknown" "unknown" "unknown" "unknown" "joy" "unknown" "unknown"
## [8] "unknown" "unknown" "unknown" "unknown" "unknown" "unknown" "unknown"
## [15] "unknown" "unknown" "unknown" "unknown" "unknown" "joy"
# Classify the polarity of each tweet.
Wipro_pol <- classify_polarity(Wiprosenti, algorithm = "bayes")
# Column 4 of the result is taken as the best-fit polarity label.
polarity_Wipro <- Wipro_pol[, 4]
head(polarity_Wipro, n = 20)
## [1] "positive" "positive" "positive" "positive" "negative" "positive"
## [7] "positive" "positive" "positive" "positive" "positive" "positive"
## [13] "positive" "positive" "positive" "positive" "positive" "positive"
## [19] "positive" "positive"
Let us consolidate these results into a dataframe and validate the results
# Combine the cleaned text with its emotion and polarity labels.
# The option is spelled `stringsAsFactors`; the misspelling `stringAsFactors`
# is not recognised as an option and instead becomes an extra column of FALSE.
Wipro_senti <- data.frame(
  text = Wiprosenti,
  emotion = emotion_Wipro,
  polarity = polarity_Wipro,
  stringsAsFactors = FALSE
)
head(Wipro_senti, 50)
## text
## 1 poll have theresults set the course for other it firms such asandsy
## 2 rt indiawinsmillion contract fromengineering giant \nwa qu
## 3 wins a big deal from abb\n\nyp
## 4 wipro plans to focus on banking services forbu
## 5 no close competitor in d donar list azim premji tops indiaâs philanthropistsâ listthe economic times jv
## 6 winsmillion contract from swiss engineering giant abb le
## 7 for applicationtestingtesting role atf
## 8 it services firms to remain on strong growth path inksenindia
## 9 rallies overonmillion contract win from abb
## 10 rt indiawinsmillion contract fromengineering giant \nwa qu
## 11 topcompanies likeare waiting to hire\nto know more about it click in am
## 12 rt sitwatwinsmillion deal from abb\ner via
## 13 patent issued for methods and systems for minimizing decoding delay i n wipro
## 14 gainson talks of rs crdeal\nclick here\ncf up
## 15 winsmillion deal from abb\ner via
## 16 walkin on th january in hyderabad forb tech freshers ef
## 17 wipro gainson talks of rs cr abb dealclick hereq r
## 18 rt indiawinsmillion contract fromengineering giant \nwa qu
## 19 winsmillion contract fromengineering giant \nwa qu
## 20 system and method for subscribing to a content stream in patent app ez wipro
## 21 gains as media report suggests that it exporter gets ordermillion contract from \nbz
## 22 gainson reports ofmillion order
## 23 rallies overonmillion contract win from abb
## 24 rallies overonmillion contract win fromthe economic times ds
## 25 rallies overonmillion contract win fromthe economic times ds
## 26 markets may open higherin focus r
## 27 minimumoff onampte
## 28 minimumoff onampsr
## 29 led bulbs uptooff frmamponm spread lights fâ
## 30 led bulbs uptooff frmamponm spread lights fâ
## 31 rt bms improvements in performance reliability amp compression revolutionised by databse on silica
## 32 improvements in performance reliability amp compression revolutionised by databse on silica
## 33 led bulbs uptooff frmamponm spread lights f
## 34 technologies walkin drive for freshers on jan th â th qualificationany graduate fc
## 35 technologies walkin drive for freshers on jan th â th qualificationany graduate ph
## 36 technologies walkin drive for freshers on jan th â th qualificationany graduate kx
## 37 technologies walkin drive for freshers on jan th â th qualificationany graduate on
## 38 wipro plans to focus on banking services forac
## 39 sample wipro placement papers\n\nfy
## 40 offcampus drive for freshers on th janhy tcs walkin drive for freshers onbs
## 41 walkin drive for freshersexp on th to th janfs apply for letest jobsbz
## 42 walkin drive for freshers as associate from th â th januaryâ â â m lc
## 43 azim premjibecomes the most generous indian yet again by pledgingcrore to philanthropy
## 44 walkin drive for freshersexp on th to th janfs apply for more walkinâs rz
## 45 walkin drive for freshers as associate from th â th januaryâ â â m yz
## 46 walkin drive online apply link heregtgt rh qualificationany degree graduate jo sc
## 47 s top philanthropistgivestimes more than next on list nd
## 48 walkin drive for freshers from th to th januarywl hp
## 49 wipro plans to focus on banking services forbu
## 50 to india í í \ntripts
## emotion polarity stringAsFactors
## 1 unknown positive FALSE
## 2 unknown positive FALSE
## 3 unknown positive FALSE
## 4 unknown positive FALSE
## 5 joy negative FALSE
## 6 unknown positive FALSE
## 7 unknown positive FALSE
## 8 unknown positive FALSE
## 9 unknown positive FALSE
## 10 unknown positive FALSE
## 11 unknown positive FALSE
## 12 unknown positive FALSE
## 13 unknown positive FALSE
## 14 unknown positive FALSE
## 15 unknown positive FALSE
## 16 unknown positive FALSE
## 17 unknown positive FALSE
## 18 unknown positive FALSE
## 19 unknown positive FALSE
## 20 joy positive FALSE
## 21 unknown positive FALSE
## 22 unknown positive FALSE
## 23 unknown positive FALSE
## 24 unknown positive FALSE
## 25 unknown positive FALSE
## 26 unknown positive FALSE
## 27 unknown positive FALSE
## 28 unknown positive FALSE
## 29 unknown positive FALSE
## 30 unknown positive FALSE
## 31 unknown positive FALSE
## 32 unknown positive FALSE
## 33 unknown positive FALSE
## 34 unknown positive FALSE
## 35 unknown positive FALSE
## 36 unknown positive FALSE
## 37 unknown positive FALSE
## 38 unknown positive FALSE
## 39 unknown positive FALSE
## 40 unknown positive FALSE
## 41 unknown positive FALSE
## 42 unknown positive FALSE
## 43 unknown positive FALSE
## 44 unknown positive FALSE
## 45 unknown positive FALSE
## 46 unknown positive FALSE
## 47 surprise negative FALSE
## 48 unknown positive FALSE
## 49 unknown positive FALSE
## 50 unknown positive FALSE
Let us now do some plotting of these sentiments
# plot distribution of emotions
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
# Bar chart of tweet counts per emotion category.
# The aesthetics map the `emotion` column of Wipro_senti rather than the
# free-standing global vector, so the plot is driven by the data frame passed
# to ggplot(). geom_bar() counts rows by default, so no explicit y is needed
# (this also avoids the deprecated `..count..` notation).
ggplot(Wipro_senti, aes(x = emotion)) +
  geom_bar(aes(fill = emotion)) +
  scale_fill_brewer(palette = "Dark2") +
  labs(x = "emotion categories", y = "number of tweets")