if(!require("textmineR"))install.packages("textmineR")
## Loading required package: textmineR
## Loading required package: Matrix
##
## Attaching package: 'textmineR'
## The following object is masked from 'package:Matrix':
##
## update
## The following object is masked from 'package:stats':
##
## update
if(!require("e1071"))install.packages("e1071")
## Loading required package: e1071
if(!require("dplyr"))install.packages("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if(!require("SnowballC"))install.packages("SnowballC")
## Loading required package: SnowballC
if(!require("tm"))install.packages("tm")
## Loading required package: tm
## Loading required package: NLP
library(textmineR)
library(e1071)
library(dplyr)
library(SnowballC)
library(tm)
tweets<- "'I love this car', 'positive',
'This view is amazing', 'positive',
'I feel great this morning', 'positive',
'I like her as she is kind', 'positive',
'He is my best friend', 'positive'
'I do not like this car', 'negative',
'This view is horrible', 'negative',
'I feel tired this morning', 'negative',
'I am not looking forward to the concert', 'negative',
'He is my. enemy', 'negative'
'feel happy this morning', 'positive',
'larry friend', 'positive',
'Dogs are kind', 'positive',
'house not great', 'negative',
'your song annoying', 'negative'"
tweets
## [1] "'I love this car', 'positive',\n'This view is amazing', 'positive',\n'I feel great this morning', 'positive',\n'I like her as she is kind', 'positive',\n'He is my best friend', 'positive'\n'I do not like this car', 'negative',\n'This view is horrible', 'negative',\n'I feel tired this morning', 'negative',\n'I am not looking forward to the concert', 'negative',\n'He is my. enemy', 'negative'\n'feel happy this morning', 'positive',\n'larry friend', 'positive',\n'Dogs are kind', 'positive',\n'house not great', 'negative',\n'your song annoying', 'negative'"
tweets<-gsub("'","", tweets)
tweets
## [1] "I love this car, positive,\nThis view is amazing, positive,\nI feel great this morning, positive,\nI like her as she is kind, positive,\nHe is my best friend, positive\nI do not like this car, negative,\nThis view is horrible, negative,\nI feel tired this morning, negative,\nI am not looking forward to the concert, negative,\nHe is my. enemy, negative\nfeel happy this morning, positive,\nlarry friend, positive,\nDogs are kind, positive,\nhouse not great, negative,\nyour song annoying, negative"
fileConn<-file("tweets.txt")
writeLines(tweets, fileConn)
close(fileConn)
# the trailing comma after most sentiment labels creates a third, empty column
tweets_df<- read.csv("tweets.txt",header = FALSE,col.names = c("tweets","Sentiment",""))
# keep only the tweet text and the sentiment label
tweets_df<- subset(tweets_df, select = c(1,2) )
tweets_df
tweets_text_vs <- VectorSource(tweets_df$tweets)
tweets_corpus <-Corpus(tweets_text_vs)
#inspect(tweets_corpus)
tweets_clean <-tm_map(tweets_corpus,tolower)
## Warning in tm_map.SimpleCorpus(tweets_corpus, tolower): transformation drops
## documents
tweets_clean <-tm_map(tweets_clean,removePunctuation)
## Warning in tm_map.SimpleCorpus(tweets_clean, removePunctuation): transformation
## drops documents
tweets_clean <-tm_map(tweets_clean,stripWhitespace)
## Warning in tm_map.SimpleCorpus(tweets_clean, stripWhitespace): transformation
## drops documents
inspect(tweets_clean)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 15
##
## [1] i love this car
## [2] this view is amazing
## [3] i feel great this morning
## [4] i like her as she is kind
## [5] he is my best friend
## [6] i do not like this car
## [7] this view is horrible
## [8] i feel tired this morning
## [9] i am not looking forward to the concert
## [10] he is my enemy
## [11] feel happy this morning
## [12] larry friend
## [13] dogs are kind
## [14] house not great
## [15] your song annoying
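The warnings above come from calling tm_map() on a SimpleCorpus, which is what Corpus() returns for a VectorSource. A minimal sketch of a quieter alternative, assuming the same tm version: build a VCorpus and wrap base functions such as tolower in content_transformer(). The names tweets_vcorpus and tweets_vclean are illustrative.
tweets_vcorpus <- VCorpus(VectorSource(tweets_df$tweets))
tweets_vclean <- tm_map(tweets_vcorpus, content_transformer(tolower))
tweets_vclean <- tm_map(tweets_vclean, removePunctuation)
tweets_vclean <- tm_map(tweets_vclean, stripWhitespace)
as.character(tweets_vclean[[1]])   # should print "i love this car"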
Approach 1: Using the tm_map function
getTweetsMatrix <- function(tweets_df){
tweets_text_vs <- VectorSource(tweets_df$tweets)
tweets_corpus <-Corpus(tweets_text_vs)
#remove numbers from text
tweets_clean <-tm_map(tweets_corpus,removeNumbers)
# remove Punctuation marks like ?, . etc
tweets_clean <-tm_map(tweets_clean,removePunctuation)
# optionally stem words to their root form (disabled here)
#tweets_clean <-tm_map(tweets_clean,stemDocument)
# remove stop words
tweets_clean <-tm_map(tweets_clean,removeWords,stopwords())
# to remove extra white space characters
tweets_clean <-tm_map(tweets_clean,stripWhitespace)
return(DocumentTermMatrix(tweets_clean))
}
tweets_dtm <- getTweetsMatrix(tweets_df)
## Warning in tm_map.SimpleCorpus(tweets_corpus, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(tweets_clean, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(tweets_clean, removeWords, stopwords()):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(tweets_clean, stripWhitespace): transformation
## drops documents
inspect(tweets_dtm)
## <<DocumentTermMatrix (documents: 15, terms: 24)>>
## Non-/sparse entries: 35/325
## Sparsity : 90%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs car feel friend great kind like love morning this view
## 1 1 0 0 0 0 0 1 0 0 0
## 11 0 1 0 0 0 0 0 1 0 0
## 2 0 0 0 0 0 0 0 0 1 1
## 3 0 1 0 1 0 0 0 1 0 0
## 4 0 0 0 0 1 1 0 0 0 0
## 5 0 0 1 0 0 0 0 0 0 0
## 6 1 0 0 0 0 1 0 0 0 0
## 7 0 0 0 0 0 0 0 0 1 1
## 8 0 1 0 0 0 0 0 1 0 0
## 9 0 0 0 0 0 0 0 0 0 0
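To sanity-check the matrix before modelling, tm provides findFreqTerms(); the lowfreq = 2 threshold below is an arbitrary illustrative choice.
findFreqTerms(tweets_dtm, lowfreq = 2)                   # terms used in at least two tweets
sort(colSums(as.matrix(tweets_dtm)), decreasing = TRUE)  # overall term frequencies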
Approach 2: Using a control list with DocumentTermMatrix
tweets_dtm2<- DocumentTermMatrix(tweets_corpus,control=list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removePunctuation = TRUE,
stripWhitespace = TRUE
))
inspect(tweets_dtm2)
## <<DocumentTermMatrix (documents: 15, terms: 23)>>
## Non-/sparse entries: 33/312
## Sparsity : 90%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs amazing car feel friend great kind like love morning view
## 1 0 1 0 0 0 0 0 1 0 0
## 11 0 0 1 0 0 0 0 0 1 0
## 2 1 0 0 0 0 0 0 0 0 1
## 3 0 0 1 0 1 0 0 0 1 0
## 4 0 0 0 0 0 1 1 0 0 0
## 5 0 0 0 1 0 0 0 0 0 0
## 6 0 1 0 0 0 0 1 0 0 0
## 7 0 0 0 0 0 0 0 0 0 1
## 8 0 0 1 0 0 0 0 0 1 0
## 9 0 0 0 0 0 0 0 0 0 0
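The control list can also change the weighting scheme. A sketch, assuming tm's weightTfIdf function (the object name tweets_dtm_tfidf is illustrative):
tweets_dtm_tfidf<- DocumentTermMatrix(tweets_corpus,control=list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removePunctuation = TRUE,
weighting = weightTfIdf
))
inspect(tweets_dtm_tfidf)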
Approach 3: Using textmineR::CreateDtm
textminer_dtm <- function(text, doc_id){
dtm <- textmineR::CreateDtm(doc_vec = text,
doc_names = doc_id,
stopword_vec = c(stopwords::stopwords("en")),
remove_punctuation = TRUE,
remove_numbers = TRUE,
verbose = FALSE,
cpus = 1)
}
# tweets_df has no doc_id column, so CreateDtm warns and assigns 1:length(doc_vec) as names
dtm_miner <- textminer_dtm(tweets_df$tweets, tweets_df$doc_id)
## Warning in textmineR::CreateDtm(doc_vec = text, doc_names = doc_id,
## stopword_vec = c(stopwords::stopwords("en")), : No document names detected.
## Assigning 1:length(doc_vec) as names.
dtm_miner
## 15 x 23 sparse Matrix of class "dgCMatrix"
## [[ suppressing 23 column names 'amazing', 'annoying', 'best' ... ]]
##
## 1 . . . . . . . . . . . . 1 . . 1 . . . . . . .
## 2 1 . . . . . . . . . . . . . . . . . . . 1 . .
## 3 . . . . . . . . . . . . . . . . . 1 . . . 1 1
## 4 . . . . . . . . . . . . . . . . . . 1 1 . . .
## 5 . . 1 . . . . . . . . . . . . . 1 . . . . . .
## 6 . . . . . . . . . . . . . . . 1 . . . 1 . . .
## 7 . . . . . . . . 1 . . . . . . . . . . . 1 . .
## 8 . . . . . . . . . . . . . . 1 . . . . . . 1 1
## 9 . . . 1 . . 1 . . . . 1 . . . . . . . . . . .
## 10 . . . . . 1 . . . . . . . . . . . . . . . . .
## 11 . . . . . . . 1 . . . . . . . . . . . . . 1 1
## 12 . . . . . . . . . . 1 . . . . . 1 . . . . . .
## 13 . . . . 1 . . . . . . . . . . . . . 1 . . . .
## 14 . . . . . . . . . 1 . . . . . . . 1 . . . . .
## 15 . 1 . . . . . . . . . . . 1 . . . . . . . . .
str(dtm_miner)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:33] 1 14 4 8 12 9 8 10 6 13 ...
## ..@ p : int [1:24] 0 1 2 3 4 5 6 7 8 9 ...
## ..@ Dim : int [1:2] 15 23
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:15] "1" "2" "3" "4" ...
## .. ..$ : chr [1:23] "amazing" "annoying" "best" "concert" ...
## ..@ x : num [1:33] 1 1 1 1 1 1 1 1 1 1 ...
## ..@ factors : list()
## ..$ args:List of 8
## .. ..$ doc_names : NULL
## .. ..$ ngram_window : num [1:2] 1 1
## .. ..$ stopword_vec : chr [1:150] "i" "me" "my" "myself" ...
## .. ..$ lower : logi TRUE
## .. ..$ remove_punctuation : logi TRUE
## .. ..$ remove_numbers : logi TRUE
## .. ..$ stem_lemma_function: NULL
## .. ..$ verbose : logi FALSE
## ..$ call: chr "CreateDtm"
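The warning about missing document names can be avoided by passing an explicit id vector; the tweet_ prefix below is just an illustrative naming choice.
tweet_ids <- paste0("tweet_", seq_len(nrow(tweets_df)))
dtm_named <- textminer_dtm(tweets_df$tweets, tweet_ids)
rownames(dtm_named)   # should be "tweet_1" ... "tweet_15" instead of "1" ... "15"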
tweets_dataset <- data.frame(tweets_df$Sentiment, as.matrix(dtm_miner))
colnames(tweets_dataset)
## [1] "tweets_df.Sentiment" "amazing" "annoying"
## [4] "best" "concert" "dogs"
## [7] "enemy" "forward" "happy"
## [10] "horrible" "house" "larry"
## [13] "looking" "love" "song"
## [16] "tired" "car" "friend"
## [19] "great" "kind" "like"
## [22] "view" "feel" "morning"
names(tweets_dataset)[1]<-"sentiment"
colnames(tweets_dataset)
## [1] "sentiment" "amazing" "annoying" "best" "concert" "dogs"
## [7] "enemy" "forward" "happy" "horrible" "house" "larry"
## [13] "looking" "love" "song" "tired" "car" "friend"
## [19] "great" "kind" "like" "view" "feel" "morning"
tweets_dataset$sentiment<-factor(tweets_dataset$sentiment)
#str(tweets_dataset)
tweets_dataset
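e1071's naiveBayes() treats numeric columns as Gaussian, which is why the model below reports means and standard deviations rather than word probabilities. A common alternative, sketched here (the helper convert_counts and the object tweets_dataset_bin are illustrative, not part of the original workflow), recodes counts as presence/absence factors:
convert_counts <- function(x) factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
tweets_dataset_bin <- tweets_dataset
tweets_dataset_bin[ , -1] <- lapply(tweets_dataset_bin[ , -1], convert_counts)
str(tweets_dataset_bin$love)   # factor with levels "No"/"Yes"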
Split into training (first 10 rows) and test (last 5 rows) sets
training_data = tweets_dataset[1:10,]
testing_data = tweets_dataset[11:15,]
dim(training_data)
## [1] 10 24
dim(testing_data)
## [1] 5 24
training_data
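The fixed row split above keeps the five extra tweets as the test set. A sketch of a random, reproducible split as an alternative (train_idx, training_random and testing_random are illustrative names):
set.seed(123)
train_idx <- sample(seq_len(nrow(tweets_dataset)), size = 10)
training_random <- tweets_dataset[train_idx, ]
testing_random <- tweets_dataset[-train_idx, ]
dim(training_random)
dim(testing_random)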
if(!require("e1071"))install.packages("e1071")
library(e1071)
model <- naiveBayes(sentiment ~ ., data = training_data)
(model)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## negative positive
## 0.5 0.5
##
## Conditional probabilities:
## amazing
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## annoying
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## best
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## concert
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## dogs
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## enemy
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## forward
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## happy
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## horrible
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## house
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## larry
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## looking
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## love
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## song
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## tired
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## car
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
##
## friend
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## great
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## kind
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## like
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
##
## view
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
##
## feel
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
##
## morning
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
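Note that the laplace argument of naiveBayes() only smooths factor predictors, so it has no effect on the numeric counts used above. A hedged sketch combining it with the "Yes"/"No" recoding sketched earlier (model_bin is an illustrative name):
model_bin <- naiveBayes(sentiment ~ ., data = tweets_dataset_bin[1:10, ], laplace = 1)
predict(model_bin, newdata = tweets_dataset_bin[11:15, -1])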
#y_pred <- predict(model, newdata = testing_data[ , names(testing_data) != "sentiment"])
y_pred <- predict(model, newdata = testing_data[-1])
#option1: Generate Confusion matrix manually by comparing Actual vs predicted values for Sentiment
table(y_pred, testing_data$sentiment,dnn=c("Prediction","Actual"))
## Actual
## Prediction negative positive
## negative 1 1
## positive 1 2
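The overall accuracy can also be read off directly; from the table above it should be (1 + 2) / 5 = 0.6.
mean(y_pred == testing_data$sentiment)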
if(!require('caret'))install.packages('caret')
## Loading required package: caret
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: lattice
library(caret)
cfm <- confusionMatrix(y_pred, testing_data$sentiment)
cfm
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 1 1
## positive 1 2
##
## Accuracy : 0.6
## 95% CI : (0.1466, 0.9473)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.6826
##
## Kappa : 0.1667
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5000
## Specificity : 0.6667
## Pos Pred Value : 0.5000
## Neg Pred Value : 0.6667
## Prevalence : 0.4000
## Detection Rate : 0.2000
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.5833
##
## 'Positive' Class : negative
##
A plot of the confusion matrix
if(!require('ggplot2'))install.packages('ggplot2')
if(!require('scales'))install.packages('scales')
## Loading required package: scales
library(ggplot2)
library(scales)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
p <-
ggplot(data = as.data.frame(m$table) ,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "steelblue") +
geom_text(aes(x = Reference, y = Prediction, label = Freq)) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
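One caveat with filling tiles by log(Freq): an empty cell gives log(0) = -Inf and a missing fill value. A sketch of a variant that fills by the raw counts instead (ggplotConfusionMatrix2 is an illustrative name):
ggplotConfusionMatrix2 <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
ggplot(data = as.data.frame(m$table),
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = Freq), colour = "white") +
scale_fill_gradient(low = "white", high = "steelblue") +
geom_text(aes(label = Freq)) +
theme(legend.position = "none") +
ggtitle(mytitle)
}
ggplotConfusionMatrix2(cfm)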