Sentiment Analysis

Sentiment analysis is the interpretation and classification of emotions (positive, negative, and neutral) within text data using text-analysis techniques. It lets businesses identify customer sentiment toward products, brands, or services in online conversations and feedback. In this post we apply it to a set of tweets about Apple ($AAPL).
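
As a quick illustration of the idea, the syuzhet package used later in this post can assign a numeric polarity score to short pieces of text; the three sentences below are invented for illustration, and positive, negative, and roughly neutral text should score above, below, and near zero respectively.

library(syuzhet)
# Invented example sentences: expected to score positive, negative, and near zero.
get_sentiment(c("I love my new iPhone",
                "The battery life is terrible",
                "Apple reports earnings today"))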

Import Libraries

library(tm)
library(ggplot2)
library(wordcloud)
library(wordcloud2)
library(syuzhet)
library(lubridate)
library(scales)
library(reshape2)
library(dplyr)
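
If any of these packages are not yet installed, a one-time install.packages() call with the same names takes care of it.

# One-time setup, only needed for packages that are not already installed.
install.packages(c("tm", "ggplot2", "wordcloud", "wordcloud2",
                   "syuzhet", "lubridate", "scales", "reshape2", "dplyr"))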

Read File

appleData <- read.csv("apple.csv",header = TRUE,sep = ",")
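
One note on reproducing the output below: the text columns appear as factors, which was the read.csv() default before R 4.0. On R 4.0 or later, stringsAsFactors = TRUE has to be set explicitly to get the same structure; the later steps work either way because the text is converted with iconv() before use.

# On R >= 4.0, set this explicitly to reproduce the factor columns shown by str().
appleData <- read.csv("apple.csv", header = TRUE, sep = ",", stringsAsFactors = TRUE)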

Structure Of The Data

str(appleData)
## 'data.frame':    1000 obs. of  16 variables:
##  $ text         : Factor w/ 629 levels "#Apple #earnings: How long will #iPhone sales be on â\200\230pauseâ\200\231? $AAPL  #iPhone8 #Retail #applenews"| __truncated__,..: 515 515 395 542 17 479 479 525 527 499 ...
##  $ favorited    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ favoriteCount: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ replyToSN    : Factor w/ 36 levels "AdamBuschbacher",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ created      : Factor w/ 826 levels "2017-08-01 18:37:59",..: 826 825 825 825 824 824 823 823 823 822 ...
##  $ truncated    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ id           : num  8.92e+17 8.92e+17 8.92e+17 8.92e+17 8.92e+17 ...
##  $ replyToUID   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ statusSource : Factor w/ 51 levels "<a href=\"http://127.0.0.1:3000/\" rel=\"nofollow\">Twitter tweets 111</a>",..: 14 14 7 13 7 13 14 14 11 11 ...
##  $ screenName   : Factor w/ 736 levels "__v4gue__","_davidelman",..: 368 423 82 411 395 397 462 173 713 367 ...
##  $ retweetCount : int  3 3 0 85 0 30 30 9 10 1 ...
##  $ isRetweet    : logi  TRUE TRUE FALSE TRUE FALSE TRUE ...
##  $ retweeted    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ longitude    : logi  NA NA NA NA NA NA ...
##  $ latitude     : logi  NA NA NA NA NA NA ...

Build Corpus

corpus <- iconv(appleData$text,to = "UTF-8")
corpus <- Corpus(VectorSource(corpus))
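
A quick optional sanity check: the corpus should contain one document per row of apple.csv.

# Should report 1000 documents, matching the 1000 rows shown by str() above.
length(corpus)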

Inspect Documents

inspect(corpus[1:7])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 7
## 
## [1] RT @option_snipper: $AAPL beat on both eps and revenues. SEES 4Q REV. $49B-$52B, EST. $49.1B https://t.co/hfHXqj0IOB                            
## [2] RT @option_snipper: $AAPL beat on both eps and revenues. SEES 4Q REV. $49B-$52B, EST. $49.1B https://t.co/hfHXqj0IOB                            
## [3] Let's see this break all timers. $AAPL 156.89                                                                                                   
## [4] RT @SylvaCap: Things might get ugly for $aapl with the iphone delay. With $aapl down that means almost all of the FANG stocks were down posâ\200¦  
## [5] $AAPL - wow! This was supposed to be a throw-away quarter and AAPL beats by over 500 million in revenue! Trillion dollar company by 2018!       
## [6] RT @CNBCnow: EARNINGS: Apple Q3 EPS $1.67 vs. $1.57 Est.; Q3 Revs. $45.4B vs. $44.89B Est. â\200¢ $AAPL https://t.co/UzI8Uh9GJI https://t.co/WzXâ\200¦
## [7] RT @CNBCnow: EARNINGS: Apple Q3 EPS $1.67 vs. $1.57 Est.; Q3 Revs. $45.4B vs. $44.89B Est. â\200¢ $AAPL https://t.co/UzI8Uh9GJI https://t.co/WzXâ\200¦
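
Documents 1-2 and 6-7 above are identical because retweets repeat the original text. The analysis below keeps them, but if each distinct tweet should only be counted once, the duplicates could be dropped before building the corpus; a sketch (not used in what follows):

# Optional (not used below): keep one copy of each distinct tweet text.
uniqueText <- unique(iconv(appleData$text, to = "UTF-8"))
corpusUnique <- Corpus(VectorSource(uniqueText))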

Data Cleaning

Next we clean the text: convert everything to lower case; remove numbers, punctuation, English stop words, and URLs; drop the ticker terms 'aapl' and 'aaple'; stem the remaining words; and strip extra whitespace.

corpus <- tm_map(corpus, content_transformer(tolower))          # lower-case all text
corpus <- tm_map(corpus, removeNumbers)                          # drop digits
corpus <- tm_map(corpus, removePunctuation)                      # drop punctuation
cleanData <- tm_map(corpus, removeWords, stopwords("english"))   # drop common stop words
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)         # helper: strip URLs
cleanData <- tm_map(cleanData, content_transformer(removeURL))
cleanData <- tm_map(cleanData, removeWords, c("aapl", "aaple"))  # drop the ticker terms
cleanData <- tm_map(cleanData, stemDocument)                     # reduce words to their stems
cleanData <- tm_map(cleanData, stripWhitespace)                  # collapse repeated spaces
inspect(cleanData[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] rt optionsnipp beat ep revenu see q rev bb est b                          
## [2] rt optionsnipp beat ep revenu see q rev bb est b                          
## [3] let see break timer                                                       
## [4] rt sylvacap thing might get ugli iphon delay mean almost fang stock posâ\200¦
## [5] wow suppos throwaway quarter beat million revenu trillion dollar compani
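
As an optional spot check, one raw tweet can be compared with its cleaned counterpart; document 4 is used here only because it appears in both inspect() listings above.

# Optional check: raw tweet 4 versus its cleaned version.
as.character(appleData$text[4])
as.character(cleanData[[4]])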

Convert Into Matrix

Now we convert the cleaned corpus into a document-term matrix with the DocumentTermMatrix function: one row per tweet, one column per term, with each cell counting how often that term occurs in that tweet. Note that tm drops terms shorter than three characters by default, which is why short tokens such as rt and q from the cleaned tweets do not appear as columns.

dtm <- DocumentTermMatrix(cleanData)
dtm <- as.data.frame(as.matrix(dtm))
dtm[1:5,1:10]
##   beat est optionsnipp rev revenu see break let timer almost
## 1    1   1           1   1      1   1     0   0     0      0
## 2    1   1           1   1      1   1     0   0     0      0
## 3    0   0           0   0      0   1     1   1     1      0
## 4    0   0           0   0      0   0     0   0     0      1
## 5    1   0           0   0      1   0     0   0     0      0
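
Since dtm has been converted to a dense data frame above, tm's findFreqTerms() offers a lighter-weight way to list terms above a frequency threshold directly from the sparse matrix; a sketch, with an arbitrary cutoff of 50:

# Optional (a sketch): terms occurring at least 50 times, from the sparse matrix.
findFreqTerms(DocumentTermMatrix(cleanData), lowfreq = 50)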

Check How Often Each Word Appears

wordAppear <- colSums(dtm)
head(wordAppear,34)
##        beat         est optionsnipp         rev      revenu         see 
##          35         136           5          66          27          17 
##       break         let       timer      almost       delay        fang 
##          38          12           1          88          87          87 
##         get       iphon        mean       might      posâ\200¦       stock 
##         143         140          89          95          85         136 
##    sylvacap       thing        ugli     compani      dollar     million 
##         170          90          86           6          12           5 
##     quarter      suppos   throwaway    trillion         wow        appl 
##          19           2           3           2           2         257 
##     cnbcnow        earn         â\200¢          \200¦ 
##          29         388          54         105

Plots

The terms are sorted by frequency first; the 27 most frequent go into a horizontal bar chart and the top 7 into a polar chart.

wordAppear <- sort(wordAppear, decreasing = TRUE)   # sort terms by frequency before pairing with names
wordPlot <- data.frame(Words = names(wordAppear), Frequency = wordAppear)
wf <- wordPlot[1:27, ]
wf2 <- wordPlot[1:7, ]
ggplot(wf, aes(reorder(Words, Frequency), Frequency, fill = Words)) + geom_bar(stat = "identity") + theme_light() + coord_flip() + xlab("Words")

ggplot(wf2, aes(Words, Frequency, fill = Words)) + geom_bar(stat = "identity") + coord_polar() + theme_light()
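
If the charts should be written to disk rather than only displayed, ggplot2's ggsave() saves the most recently printed plot; the file name and size below are arbitrary.

# Optional: save the last plot shown above.
ggsave("word_frequency_polar.png", width = 8, height = 6)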

Word Clouds

cloudPlot <- sort(colSums(dtm),decreasing = TRUE)
wordcloud(words=names(cloudPlot),freq = cloudPlot,max.words = 600,random.order = FALSE,min.freq = 5,colors = brewer.pal(8,'Dark2'),scale = c(5,0.3),rot.per = 0.5)
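
wordcloud() places words with an element of randomness, so the exact layout differs between runs; calling set.seed() immediately before it makes the cloud reproducible (a small convenience, not part of the original code).

# Optional: run before wordcloud() above to get the same layout every time.
set.seed(1234)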

wordcloud2(wordPlot,size = 0.5,shape = "star",minSize = 2)

Performing Sentiment Analysis

For the sentiment scoring we return to the raw tweet text, since the syuzhet functions work on plain character vectors rather than on the tm corpus. We read the original apple.csv again along with a second file of Apple tweets, apple2.csv, so the two sets can be compared.

appleTweets <- read.csv("apple.csv", header = TRUE)
tweets <- iconv(appleTweets$text, to = "UTF-8")
appleTweets2 <- read.csv("apple2.csv", header = TRUE)
tweets2 <- iconv(appleTweets2$text, to = "UTF-8")

Obtain Sentiment Scores Of Both Files

scores <- get_nrc_sentiment(tweets)
scores2 <- get_nrc_sentiment(tweets2)
head(scores)
##   anger anticipation disgust fear joy sadness surprise trust negative
## 1     0            0       0    0   0       0        0     0        0
## 2     0            0       0    0   0       0        0     0        0
## 3     0            0       0    0   0       0        1     0        0
## 4     1            0       2    2   0       1        0     0        3
## 5     0            0       0    0   0       0        0     0        0
## 6     0            0       0    0   0       0        0     0        0
##   positive
## 1        1
## 2        1
## 3        0
## 4        0
## 5        0
## 6        0
Row 4 is the only row above with anger, disgust, fear, sadness, and negative counts; the corresponding raw tweet shows where they come from.

tweets[4]
## [1] "RT @SylvaCap: Things might get ugly for $aapl with the iphone delay. With $aapl down that means almost all of the FANG stocks were down pos…"

Bar Plot Of 1st File Sentiment

dfScore <- data.frame(sentiment=names(scores),Frequency = colSums(scores))
ggplot(dfScore,aes(sentiment,Frequency,fill=sentiment))+geom_bar(stat = "identity")+coord_flip()+ggtitle("Sentiment Scores For Apple Tweets")+theme_light()+xlab("Twitter Sentiments")

Bar Plot Of 2nd File Sentiment

df2 <- data.frame(sentiments =names(scores2),Frequency=colSums(scores2))
ggplot(df2,aes(sentiments,Frequency,fill=sentiments))+geom_bar(stat = "identity")+coord_flip()+theme_light()+ggtitle("Sentiment Scores For Apple Tweets")+xlab("Twitter Sentiments")
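
The two bar plots are easier to read side by side. Below is a sketch (not part of the original analysis) that stacks the two score summaries into one data frame with dplyr::bind_rows(), which is already loaded, and plots them with dodged bars; the file labels are only for the legend.

# A sketch (not in the original analysis): compare the two files in a single plot.
combined <- bind_rows(
  data.frame(sentiment = names(scores),  Frequency = colSums(scores),  file = "apple.csv"),
  data.frame(sentiment = names(scores2), Frequency = colSums(scores2), file = "apple2.csv")
)
ggplot(combined, aes(sentiment, Frequency, fill = file)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() + theme_light() +
  ggtitle("Sentiment Scores For Apple Tweets: apple.csv vs apple2.csv") +
  xlab("Twitter Sentiments")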