Sentiment analysis is the interpretation and classification of emotions (positive, negative and neutral) within text data using text analysis techniques. Sentiment analysis allows businesses to identify customer sentiment toward products, brands or services in online conversations and feedback.
library(tm)
library(ggplot2)
library(wordcloud)
library(wordcloud2)
library(syuzhet)
library(lubridate)
library(scales)
library(reshape2)
library(dplyr)
# Load the exported tweets; the first row of the comma-separated file holds
# the column names. Then print a compact summary of the columns.
appleData <- read.csv(file = "apple.csv", header = TRUE, sep = ",")
str(object = appleData)
## 'data.frame': 1000 obs. of 16 variables:
## $ text : Factor w/ 629 levels "#Apple #earnings: How long will #iPhone sales be on â\200\230pauseâ\200\231? $AAPL #iPhone8 #Retail #applenews"| __truncated__,..: 515 515 395 542 17 479 479 525 527 499 ...
## $ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ favoriteCount: int 0 0 0 0 0 0 0 0 0 0 ...
## $ replyToSN : Factor w/ 36 levels "AdamBuschbacher",..: NA NA NA NA NA NA NA NA NA NA ...
## $ created : Factor w/ 826 levels "2017-08-01 18:37:59",..: 826 825 825 825 824 824 823 823 823 822 ...
## $ truncated : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSID : num NA NA NA NA NA NA NA NA NA NA ...
## $ id : num 8.92e+17 8.92e+17 8.92e+17 8.92e+17 8.92e+17 ...
## $ replyToUID : num NA NA NA NA NA NA NA NA NA NA ...
## $ statusSource : Factor w/ 51 levels "<a href=\"http://127.0.0.1:3000/\" rel=\"nofollow\">Twitter tweets 111</a>",..: 14 14 7 13 7 13 14 14 11 11 ...
## $ screenName : Factor w/ 736 levels "__v4gue__","_davidelman",..: 368 423 82 411 395 397 462 173 713 367 ...
## $ retweetCount : int 3 3 0 85 0 30 30 9 10 1 ...
## $ isRetweet : logi TRUE TRUE FALSE TRUE FALSE TRUE ...
## $ retweeted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ longitude : logi NA NA NA NA NA NA ...
## $ latitude : logi NA NA NA NA NA NA ...
# Convert the tweet text to UTF-8 and wrap it in a tm corpus built from a
# vector source, then preview the first seven documents.
corpus <- appleData$text %>%
  iconv(to = "UTF-8") %>%
  VectorSource() %>%
  Corpus()
inspect(corpus[seq_len(7)])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7
##
## [1] RT @option_snipper: $AAPL beat on both eps and revenues. SEES 4Q REV. $49B-$52B, EST. $49.1B https://t.co/hfHXqj0IOB
## [2] RT @option_snipper: $AAPL beat on both eps and revenues. SEES 4Q REV. $49B-$52B, EST. $49.1B https://t.co/hfHXqj0IOB
## [3] Let's see this break all timers. $AAPL 156.89
## [4] RT @SylvaCap: Things might get ugly for $aapl with the iphone delay. With $aapl down that means almost all of the FANG stocks were down posâ\200¦
## [5] $AAPL - wow! This was supposed to be a throw-away quarter and AAPL beats by over 500 million in revenue! Trillion dollar company by 2018!
## [6] RT @CNBCnow: EARNINGS: Apple Q3 EPS $1.67 vs. $1.57 Est.; Q3 Revs. $45.4B vs. $44.89B Est. â\200¢ $AAPL https://t.co/UzI8Uh9GJI https://t.co/WzXâ\200¦
## [7] RT @CNBCnow: EARNINGS: Apple Q3 EPS $1.67 vs. $1.57 Est.; Q3 Revs. $45.4B vs. $44.89B Est. â\200¢ $AAPL https://t.co/UzI8Uh9GJI https://t.co/WzXâ\200¦
Next we clean the data: convert every document to lower case, remove numbers, punctuation, stop words and URLs, then stem the remaining words and strip extra whitespace.
# Normalise the raw tweets: lower-case everything, then drop digits and
# punctuation. `corpus` keeps this intermediate state.
corpus <- corpus %>%
  tm_map(tolower) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation)

# Deletes "http" together with any run of letters/digits directly after it.
# It is applied after punctuation has been stripped, so the pieces of a link
# have already been joined into a single alphanumeric token.
removeURL <- function(x) {
  gsub(pattern = "http[[:alnum:]]*", replacement = "", x = x)
}

# Remove English stop words, leftover links and the ticker-style words, then
# stem what remains and collapse repeated whitespace.
cleanData <- corpus %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(content_transformer(removeURL)) %>%
  tm_map(removeWords, c("aapl", "aaple")) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace)

inspect(cleanData[seq_len(5)])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] rt optionsnipp beat ep revenu see q rev bb est b
## [2] rt optionsnipp beat ep revenu see q rev bb est b
## [3] let see break timer
## [4] rt sylvacap thing might get ugli iphon delay mean almost fang stock posâ\200¦
## [5] wow suppos throwaway quarter beat million revenu trillion dollar compani
Now we convert the cleaned documents into a matrix using the DocumentTermMatrix function.
# Build the document-term matrix from the cleaned corpus and expand it into
# an ordinary data frame (documents in rows, terms in columns).
dtm <- cleanData %>%
  DocumentTermMatrix() %>%
  as.matrix() %>%
  as.data.frame()

# Peek at the first five documents and the first ten terms.
dtm[seq_len(5), seq_len(10)]
## beat est optionsnipp rev revenu see break let timer almost
## 1 1 1 1 1 1 1 0 0 0 0
## 2 1 1 1 1 1 1 0 0 0 0
## 3 0 0 0 0 0 1 1 1 1 0
## 4 0 0 0 0 0 0 0 0 0 1
## 5 1 0 0 0 1 0 0 0 0 0
# Total number of occurrences of each term across all documents
# (column sums of the document-term data frame).
wordAppear <- colSums(x = dtm)
head(x = wordAppear, n = 34)
## beat est optionsnipp rev revenu see
## 35 136 5 66 27 17
## break let timer almost delay fang
## 38 12 1 88 87 87
## get iphon mean might posâ\200¦ stock
## 143 140 89 95 85 136
## sylvacap thing ugli compani dollar million
## 170 90 86 6 12 5
## quarter suppos throwaway trillion wow appl
## 19 2 3 2 2 257
## cnbcnow earn â\200¢ \200¦
## 29 388 54 105
# Rank the terms by total frequency. Sort once and take BOTH the labels and
# the counts from the sorted vector: pairing names(wordAppear) (original
# column order) with the sorted counts attaches each count to the wrong word.
sortedFreq <- sort(wordAppear, decreasing = TRUE)
wordPlot <- data.frame(Words = names(sortedFreq), Frequency = sortedFreq)

# Most frequent terms for the two charts (top 27 and top 7). head() returns
# fewer rows instead of NA-filled ones when the vocabulary is smaller.
wf <- head(wordPlot, 27)
wf2 <- head(wordPlot, 7)

# Horizontal bar chart of the top 27 terms.
ggplot(wf, aes(Words, Frequency, fill = Words)) +
  geom_bar(stat = "identity") +
  theme_light() +
  coord_flip()

# Polar bar chart of the top 7 terms.
ggplot(wf2, aes(Words, Frequency, fill = Words)) +
  geom_bar(stat = "identity") +
  coord_polar() +
  theme_light()
# Term totals, most frequent first, used as the weights for the word cloud.
cloudPlot <- sort(colSums(dtm), decreasing = TRUE)

# Classic word cloud: terms seen at least 5 times, at most 600 of them,
# placed in decreasing-frequency order with half of the words rotated.
wordcloud(
  words = names(cloudPlot),
  freq = cloudPlot,
  scale = c(5, 0.3),
  min.freq = 5,
  max.words = 600,
  random.order = FALSE,
  rot.per = 0.5,
  colors = brewer.pal(8, "Dark2")
)

# Star-shaped HTML word cloud drawn from the word/frequency data frame.
wordcloud2(wordPlot, size = 0.5, minSize = 2, shape = "star")
# First data set: read the tweets, convert the text to UTF-8 and score every
# tweet with get_nrc_sentiment() (one row per tweet, one column per
# emotion/polarity).
appleTweets <- read.csv(file = "apple.csv", header = TRUE)
tweets <- iconv(appleTweets$text, to = "utf-8")
scores <- get_nrc_sentiment(tweets)

# Second data set, processed the same way.
appleTweets2 <- read.csv(file = "apple2.csv", header = TRUE)
tweets2 <- iconv(appleTweets2$text, to = "utf-8")
scores2 <- get_nrc_sentiment(tweets2)

# Preview the scores of the first few tweets of the first data set.
head(x = scores)
## anger anticipation disgust fear joy sadness surprise trust negative
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 1 0 0
## 4 1 0 2 2 0 1 0 0 3
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## positive
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
# Show the raw text of the fourth tweet, to compare it with its score row.
tweets[4L]
## [1] "RT @SylvaCap: Things might get ugly for $aapl with the iphone delay. With $aapl down that means almost all of the FANG stocks were down pos…"
# Sum every sentiment column over all tweets of the first data set and draw
# the totals as a horizontal bar chart.
dfScore <- data.frame(
  sentiment = names(scores),
  Frequency = colSums(scores)
)
ggplot(dfScore, aes(x = sentiment, y = Frequency, fill = sentiment)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Sentiment Scores For Apple Tweets") +
  theme_light() +
  xlab("Twitter Sentiments")

# Same summary and chart for the second data set.
df2 <- data.frame(
  sentiments = names(scores2),
  Frequency = colSums(scores2)
)
ggplot(df2, aes(x = sentiments, y = Frequency, fill = sentiments)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_light() +
  ggtitle("Sentiment Scores For Apple Tweets") +
  xlab("Twitter Sentiments")