These chunks are adapted from the original linked source. This document is not my own work; anyone may copy this syntax for educational purposes only, provided the original sources are credited.

If the required packages are not installed, uncomment the lines below (remove the #) and install them before loading the libraries.

#install.packages("RTextTools")
#install.packages("e1071")

Load the libraries:

library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(e1071)

I. Using Naive Bayes Techniques

Create the positive text data set:

pos_tweets =  rbind(
  c('I love this car', 'positive'),
  c('This view is amazing', 'positive'),
  c('I feel great this morning', 'positive'),
  c('I am so excited about the concert', 'positive'),
  c('He is my best friend', 'positive')
)

Create the negative text data set:

neg_tweets = rbind(
  c('I do not like this car', 'negative'),
  c('This view is horrible', 'negative'),
  c('I feel tired this morning', 'negative'),
  c('I am not looking forward to the concert', 'negative'),
  c('He is my enemy', 'negative')
)

Create the testing tweets data set:

test_tweets = rbind(
  c('feel happy this morning', 'positive'),
  c('larry friend', 'positive'),
  c('not like that man', 'negative'),
  c('house not great', 'negative'),
  c('your song annoying', 'negative')
)

Combine the positive, negative, and testing data frames row-wise:

tweets = rbind(pos_tweets, neg_tweets, test_tweets)

Inspect the combined data by uncommenting (removing the #):

#View(tweets)
#dim(tweets)

Build the document-term matrix (DTM):

# build the dtm (note: the name "matrix" shadows the base::matrix function)
matrix = create_matrix(tweets[,1], language="english", 
                       removeStopwords=FALSE, removeNumbers=TRUE, 
                       stemWords=FALSE) 
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom tokenizer is
## ignored
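
Before training, one can sanity-check the DTM's dimensions and vocabulary (a minimal sketch, not in the original; matrix is the tm DocumentTermMatrix created above):

dim(matrix)              # documents x terms; 15 documents here
head(tm::Terms(matrix))  # first few terms in the vocabulary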

Before training the naive Bayes model, the response variable must be categorical, so convert the character labels to a factor in this chunk:

mat = as.matrix(matrix)
classifier = naiveBayes(mat[1:15,], as.factor(tweets[1:15,2]) )

Note that 15 is the number of rows in the combined data set, derived from the dim() function.
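
For reference (a quick check, not shown in the original output):

dim(tweets)
## [1] 15  2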

Make predictions with the trained classifier:

predicted = predict(classifier, mat[1:15,]); predicted
##  [1] positive positive positive positive positive positive positive positive
##  [9] negative positive positive positive negative positive negative
## Levels: negative positive

Construct the contingency table, which shows the false positive and false negative counts:

table(tweets[1:15, 2], predicted)
##           predicted
##            negative positive
##   negative        3        5
##   positive        0        7

Compute the recall accuracy of the model:

recall_accuracy(tweets[1:15, 2], predicted)
## [1] 0.6666667
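
As a quick sanity check (a sketch, not in the original), the recall accuracy here equals the fraction of correct predictions, 10 out of 15:

mean(predicted == tweets[1:15, 2])
## [1] 0.6666667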

II. Random Forest

First, build the container, specifying the response variable, the training set, and the testing set:

container = create_container(matrix, as.numeric(as.factor(tweets[,2])),
                             trainSize=1:10, testSize=11:15,virgin=FALSE)

Train the model using Random Forest machine learning algorithm:

models_RF = train_models(container, algorithms="RF")

Classify the test set with the trained model:

results_RF = classify_models(container, models_RF)
# accuracy table
table(as.numeric(as.factor(tweets[11:15, 2])), results_RF[,"FORESTS_LABEL"])
##    
##     1 2
##   1 2 1
##   2 1 1
# recall accuracy
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results_RF[,"FORESTS_LABEL"])
## [1] 0.6
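
For comparison, the same forest can be fit directly with the randomForest package that RTextTools wraps (an illustrative sketch, assuming randomForest is installed; not part of the original workflow):

library(randomForest)
rf = randomForest(x = mat[1:10, ], y = as.factor(tweets[1:10, 2]))  # rows 1:10 are the training set
predict(rf, mat[11:15, ])                                           # rows 11:15 are the test set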

To summarize the model's validity more formally:

# model summary
analytics = create_analytics(container, results_RF)
summary(analytics)
## ENSEMBLE SUMMARY
## 
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                   1               0.6
## 
## 
## ALGORITHM PERFORMANCE
## 
## FORESTS_PRECISION    FORESTS_RECALL    FORESTS_FSCORE 
##             0.585             0.585             0.585
head(analytics@document_summary)
##   FORESTS_LABEL FORESTS_PROB MANUAL_CODE CONSENSUS_CODE CONSENSUS_AGREE
## 1             1        0.510           2              1               1
## 2             2        0.595           2              2               1
## 3             1        0.710           1              1               1
## 4             1        0.540           1              1               1
## 5             2        0.520           1              2               1
##   CONSENSUS_INCORRECT PROBABILITY_CODE PROBABILITY_INCORRECT
## 1                   1                1                     1
## 2                   0                2                     0
## 3                   0                1                     0
## 4                   0                1                     0
## 5                   1                2                     1
analytics@ensemble_summary
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                   1               0.6

To analyze the model with cross-validation:

N=4
set.seed(2014)
cross_validate(container,N,"RF")
## Fold 1 Out of Sample Accuracy = 1
## Fold 2 Out of Sample Accuracy = 0.5
## Fold 3 Out of Sample Accuracy = 0.5
## Fold 4 Out of Sample Accuracy = 0.3333333
## [[1]]
## [1] 1.0000000 0.5000000 0.5000000 0.3333333
## 
## $meanAccuracy
## [1] 0.5833333
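
The reported meanAccuracy is simply the average of the four per-fold accuracies (a quick check):

mean(c(1, 0.5, 0.5, 0.3333333))
## [1] 0.5833333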

III. Other machine learning methods (Streaming Linear Discriminant Analysis, Support Vector Machine, Bagging, and Decision Tree)

# build the container specifying the response variable, training set, and testing set
container = create_container(matrix, as.numeric(as.factor(tweets[,2])),
                             trainSize=1:10, testSize=11:15,virgin=FALSE)

Train the model with multiple machine learning algorithms:

models = train_models(container, algorithms=c("SLDA", "SVM", "BAGGING", "TREE"))
results = classify_models(container, models)
# accuracy table
table(as.numeric(as.factor(tweets[11:15, 2])), results[,"SLDA_LABEL"])
##    
##     1 2
##   1 1 2
##   2 0 2
table(as.numeric(as.factor(tweets[11:15, 2])), results[,"SVM_LABEL"])
##    
##     1 2
##   1 2 1
##   2 2 0
table(as.numeric(as.factor(tweets[11:15, 2])), results[,"BAGGING_LABEL"])
##    
##     1
##   1 3
##   2 2
table(as.numeric(as.factor(tweets[11:15, 2])), results[,"TREE_LABEL"])
##    
##     1
##   1 3
##   2 2
# recall accuracy
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[,"SLDA_LABEL"])
## [1] 0.6
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[,"SVM_LABEL"])
## [1] 0.4
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[,"BAGGING_LABEL"])
## [1] 0.6
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[,"TREE_LABEL"])
## [1] 0.6
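# (sketch, not in the original) the four recall_accuracy calls above can be
# collapsed into one sapply over the label columns:
labs  = c("SLDA_LABEL", "SVM_LABEL", "BAGGING_LABEL", "TREE_LABEL")
truth = as.numeric(as.factor(tweets[11:15, 2]))
sapply(labs, function(l) recall_accuracy(truth, results[, l]))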
# model summary
analytics = create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
## 
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                   1               0.6
## n >= 2                   1               0.6
## n >= 3                   1               0.6
## 
## 
## ALGORITHM PERFORMANCE
## 
##     SVM_PRECISION        SVM_RECALL        SVM_FSCORE    SLDA_PRECISION 
##             0.250             0.335             0.285             0.750 
##       SLDA_RECALL       SLDA_FSCORE BAGGING_PRECISION    BAGGING_RECALL 
##             0.665             0.585             0.300             0.500 
##    BAGGING_FSCORE    TREE_PRECISION       TREE_RECALL       TREE_FSCORE 
##             0.375             0.300             0.500             0.375
head(analytics@document_summary)
##   SLDA_LABEL SLDA_PROB SVM_LABEL  SVM_PROB BAGGING_LABEL BAGGING_PROB
## 1          2 0.6401045         1 0.5817286             1         0.52
## 2          2 0.5185177         1 0.8223025             1         0.56
## 3          1 0.6835875         2 0.9613499             1         0.88
## 4          2 0.5889474         1 0.6145932             1         0.64
## 5          2 0.5876664         1 0.5007519             1         0.56
##   TREE_LABEL TREE_PROB MANUAL_CODE CONSENSUS_CODE CONSENSUS_AGREE
## 1          1       0.5           2              1               3
## 2          1       0.5           2              1               3
## 3          1       0.5           1              1               3
## 4          1       0.5           1              1               3
## 5          1       0.5           1              1               3
##   CONSENSUS_INCORRECT PROBABILITY_CODE PROBABILITY_INCORRECT
## 1                   1                2                     0
## 2                   1                1                     1
## 3                   0                2                     1
## 4                   0                1                     0
## 5                   0                2                     1
analytics@ensemble_summary
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                   1               0.6
## n >= 2                   1               0.6
## n >= 3                   1               0.6

IV. Empirical Context

Using the master data from GitHub, set the working directory to where the data set is placed:

setwd("~/Documents/2022/14 - Twitter-Sentimental-Analysis-master")

Load the dataset:

happy = readLines("./happy.txt")
sad = readLines("./sad.txt")
happy_test = readLines("./happy_test.txt")
sad_test = readLines("./sad_test.txt")

Pre-process the data:

tweet = c(happy, sad)
tweet_test= c(happy_test, sad_test)
tweet_all = c(tweet, tweet_test)
sentiment = c(rep("happy", length(happy) ), 
              rep("sad", length(sad)))
sentiment_test = c(rep("happy", length(happy_test) ), 
                   rep("sad", length(sad_test)))
sentiment_all = as.factor(c(sentiment, sentiment_test))
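
A quick consistency check on sizes (a sketch; the counts follow from the train/test indices and the test-set confusion table shown below):

length(tweet_all)       # 180 documents in total (160 training + 20 test)
table(sentiment_test)   # 10 happy, 10 sad in the held-out set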

Load the library (if not already loaded):

library(RTextTools)

A. Naive Bayes Machine Learning Methods

mat = create_matrix(tweet_all, language="english", 
                    removeStopwords=FALSE, removeNumbers=TRUE, 
                    stemWords=FALSE, weighting=tm::weightTfIdf)

mat = as.matrix(mat)

classifier = naiveBayes(mat[1:160,], as.factor(sentiment_all[1:160]))
predicted = predict(classifier, mat[161:180,]); predicted
##  [1] sad   happy sad   happy happy sad   happy sad   happy sad   sad   sad  
## [13] sad   sad   sad   sad   sad   sad   happy happy
## Levels: happy sad
table(sentiment_test, predicted)
##               predicted
## sentiment_test happy sad
##          happy     5   5
##          sad       2   8
recall_accuracy(sentiment_test, predicted)
## [1] 0.65
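
Again, the recall accuracy is the fraction of correct test predictions, (5 + 8) / 20 (a quick check):

mean(predicted == sentiment_test)
## [1] 0.65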

B. Other Machine Learning Methods (Support Vector Machine / SVM, Streaming Linear Discriminant Analysis / SLDA, Bootstrap Aggregating / BAGGING, Random Forest / RF, and Decision Tree / TREE)

mat = create_matrix(tweet_all, language="english", 
                    removeStopwords=FALSE, removeNumbers=TRUE, 
                    stemWords=FALSE, weighting=tm::weightTfIdf)

container = create_container(mat, as.numeric(sentiment_all),
                             trainSize=1:160, testSize=161:180,virgin=FALSE) #removeSparseTerms

models = train_models(container, algorithms=c("SVM",
                                              "SLDA","BAGGING", 
                                              "RF",
                                              "TREE" 
))

Test the models:

results = classify_models(container, models)
table(as.numeric(sentiment_all[161:180]), results[,"FORESTS_LABEL"])
##    
##      1  2
##   1 10  0
##   2  1  9
recall_accuracy(as.numeric(sentiment_all[161:180]), results[,"FORESTS_LABEL"])
## [1] 0.95
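
The same check can be run for all five algorithms at once (a sketch; the *_LABEL column names match those in the document_summary output below):

labs  = c("SVM_LABEL", "SLDA_LABEL", "BAGGING_LABEL", "FORESTS_LABEL", "TREE_LABEL")
truth = as.numeric(sentiment_all[161:180])
sapply(labs, function(l) recall_accuracy(truth, results[, l]))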

Get formal results and run cross-validation:

# formal tests
analytics = create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
## 
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                1.00              0.95
## n >= 2                1.00              0.95
## n >= 3                1.00              0.95
## n >= 4                1.00              0.95
## n >= 5                0.95              1.00
## 
## 
## ALGORITHM PERFORMANCE
## 
##     SVM_PRECISION        SVM_RECALL        SVM_FSCORE    SLDA_PRECISION 
##             0.955             0.950             0.950             0.955 
##       SLDA_RECALL       SLDA_FSCORE BAGGING_PRECISION    BAGGING_RECALL 
##             0.950             0.950             0.955             0.950 
##    BAGGING_FSCORE FORESTS_PRECISION    FORESTS_RECALL    FORESTS_FSCORE 
##             0.950             0.955             0.950             0.950 
##    TREE_PRECISION       TREE_RECALL       TREE_FSCORE 
##             1.000             1.000             1.000
head(analytics@algorithm_summary)
##   SVM_PRECISION SVM_RECALL SVM_FSCORE SLDA_PRECISION SLDA_RECALL SLDA_FSCORE
## 1          0.91        1.0       0.95           0.91         1.0        0.95
## 2          1.00        0.9       0.95           1.00         0.9        0.95
##   BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE FORESTS_PRECISION
## 1              0.91            1.0           0.95              0.91
## 2              1.00            0.9           0.95              1.00
##   FORESTS_RECALL FORESTS_FSCORE TREE_PRECISION TREE_RECALL TREE_FSCORE
## 1            1.0           0.95              1           1           1
## 2            0.9           0.95              1           1           1
head(analytics@label_summary)
##   NUM_MANUALLY_CODED NUM_CONSENSUS_CODED NUM_PROBABILITY_CODED
## 1                 10                  11                    11
## 2                 10                   9                     9
##   PCT_CONSENSUS_CODED PCT_PROBABILITY_CODED PCT_CORRECTLY_CODED_CONSENSUS
## 1                 110                   110                           100
## 2                  90                    90                            90
##   PCT_CORRECTLY_CODED_PROBABILITY
## 1                             100
## 2                              90
head(analytics@document_summary)
##   SVM_LABEL  SVM_PROB SLDA_LABEL SLDA_PROB BAGGING_LABEL BAGGING_PROB
## 1         1 0.9998395          1         1             1         1.00
## 2         1 0.9997711          1         1             1         1.00
## 3         1 0.9868957          1         1             1         1.00
## 4         1 0.9872106          1         1             1         1.00
## 5         1 0.9921540          1         1             1         1.00
## 6         1 0.9723660          1         1             1         0.96
##   FORESTS_LABEL FORESTS_PROB TREE_LABEL TREE_PROB MANUAL_CODE CONSENSUS_CODE
## 1             1        0.910          1         1           1              1
## 2             1        0.920          1         1           1              1
## 3             1        0.925          1         1           1              1
## 4             1        0.940          1         1           1              1
## 5             1        0.950          1         1           1              1
## 6             1        0.720          1         1           1              1
##   CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE PROBABILITY_INCORRECT
## 1               5                   0                1                     0
## 2               5                   0                1                     0
## 3               5                   0                1                     0
## 4               5                   0                1                     0
## 5               5                   0                1                     0
## 6               5                   0                1                     0
analytics@ensemble_summary # Ensemble Agreement
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                1.00              0.95
## n >= 2                1.00              0.95
## n >= 3                1.00              0.95
## n >= 4                1.00              0.95
## n >= 5                0.95              1.00
# Cross Validation
N=3
cross_SVM = cross_validate(container,N,"SVM")
## Fold 1 Out of Sample Accuracy = 0.9137931
## Fold 2 Out of Sample Accuracy = 0.9565217
## Fold 3 Out of Sample Accuracy = 1
cross_GLMNET = cross_validate(container,N,"GLMNET")
## Fold 1 Out of Sample Accuracy = 0.8378378
## Fold 2 Out of Sample Accuracy = 1.153846
## Fold 3 Out of Sample Accuracy = 1.314815
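
Note that the GLMNET fold accuracies above exceed 1, which is impossible for a true accuracy, so those numbers should be treated with caution. As in Section II, the mean per-fold accuracy is available from the returned object (a small sketch using the meanAccuracy element shown earlier):

cross_SVM$meanAccuracy  # mean of the three SVM fold accuracies above (~0.957)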