Load the packages

library(SparseM)
## Warning: package 'SparseM' was built under R version 3.2.2
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.2.2
library(e1071)

Read the data

# Positive training examples: one row per tweet, with the text in column 1
# and the sentiment label in column 2.
PositiveTweets <- matrix(
  c(
    "I love this shirt", "positive",
    "This movie is amazing", "positive",
    "I feel great this morning", "positive",
    "I am very excited about the upcoming ceremony", "positive",
    "He is my best friend", "positive"
  ),
  ncol = 2,
  byrow = TRUE
)

# Show the structure of the resulting character matrix.
str(PositiveTweets)
##  chr [1:5, 1:2] "I love this shirt" "This movie is amazing" ...
# Negative training examples, assembled column-wise: the tweet texts first,
# then the matching sentiment label for every row.
NegativeTweets <- cbind(
  c(
    "I do not like this shirt",
    "This movie is bogus",
    "I feel tired this morning",
    "I am not looking forward to the upcoming ceremony",
    "He is my foe"
  ),
  rep("negative", 5L)
)

# Show the structure of the resulting character matrix.
str(NegativeTweets)
##  chr [1:5, 1:2] "I do not like this shirt" "This movie is bogus" ...
# Held-out tweets used for evaluation; each entry is c(text, label) and the
# entries are stacked row by row into a character matrix.
TestTweets <- do.call(
  rbind,
  list(
    c("I feel happy this morning", "positive"),
    c("Mohit is my friend", "positive"),
    c("I do not like that man", "negative"),
    c("This house is not great", "negative"),
    c("Your song is annoying me", "negative")
  )
)

# Show the structure of the resulting character matrix.
str(TestTweets)
##  chr [1:5, 1:2] "I feel happy this morning" "Mohit is my friend" ...
# Stack everything into one matrix: positive training rows, then negative
# training rows, then the test rows.
tweets <- do.call(rbind, list(PositiveTweets, NegativeTweets, TestTweets))

str(tweets)
##  chr [1:15, 1:2] "I love this shirt" "This movie is amazing" ...

Build the document-term matrix

# Build the document-term matrix from the tweet texts (column 1 of `tweets`).
# Stop words are kept, numbers are removed and words are not stemmed.
# The removeSparseTerms argument could be supplied here as well.
dtm <- create_matrix(
  tweets[, 1],
  language = "english",
  removeStopwords = FALSE,
  removeNumbers = TRUE,
  stemWords = FALSE
)

Now we can train the naive Bayes model on the training set. Note that e1071 requires the response variable to be numeric or a factor, so we convert the character labels to a factor here. This is a little trick.

Train the model

# Convert the document-term matrix to an ordinary matrix so it can be
# row-indexed, then fit naive Bayes on the first ten rows (the training
# tweets) with their labels converted to a factor.
dtm1 <- as.matrix(dtm)
classifier <- naiveBayes(
  dtm1[1:10, ],
  as.factor(tweets[1:10, 2])
)

Test the Validity

# Predict labels for the held-out rows (11-15), then print them.
predicted <- predict(classifier, dtm1[11:15, ])
predicted
## [1] negative positive negative negative negative
## Levels: negative positive
# Confusion table: true test labels in rows, predicted labels in columns.
# The dimension names are spelled out; they equal what table() derives by
# default (no name for the indexed expression, "predicted" for the symbol).
table(tweets[11:15, 2], predicted, dnn = c("", "predicted"))
##           predicted
##            negative positive
##   negative        3        0
##   positive        1        1
# Accuracy of the naive Bayes predictions on the test rows, computed once and
# reused for the threshold check below.
nb_accuracy <- recall_accuracy(tweets[11:15, 2], predicted)
nb_accuracy
## [1] 0.8
# Is the accuracy strictly greater than 0.8?
nb_accuracy > 0.8
## [1] FALSE

How about the other machine learning methods? As I mentioned, we can do it using RTextTools.

First, we specify our data.

Build the data to specify response variable, training set, testing set.

# Bundle the document-term matrix with numeric label codes (the factor's
# integer codes). Rows 1-10 form the training set and rows 11-15 the test set;
# virgin = FALSE is passed because the test rows carry labels too.
container <- create_container(
  dtm,
  as.numeric(as.factor(tweets[, 2])),
  trainSize = 1:10,
  testSize = 11:15,
  virgin = FALSE
)

Second, we train the models with multiple machine learning algorithms:

# Train five classifiers on the container's training rows.
# Random forests and bagging resample the training data, so the RNG is seeded
# first; without a seed every run can produce different labels and accuracies.
# NOTE: the printed results in this document were recorded without this seed,
# so a fresh run may not match them exactly.
set.seed(2015)
models <- train_models(
  container,
  algorithms = c("MAXENT", "SVM", "RF", "BAGGING", "TREE")
)

Now, we can classify the testing set using the trained models.

# Classify the container's test rows with every trained model, then list the
# columns of the result (a *_LABEL / *_PROB pair per algorithm).
results <- classify_models(container, models)
names(results)
##  [1] "MAXENTROPY_LABEL" "MAXENTROPY_PROB"  "SVM_LABEL"       
##  [4] "SVM_PROB"         "FORESTS_LABEL"    "FORESTS_PROB"    
##  [7] "BAGGING_LABEL"    "BAGGING_PROB"     "TREE_LABEL"      
## [10] "TREE_PROB"

How about the accuracy?

accuracy table

# Confusion tables: actual label codes (rows) against each model's predicted
# codes (columns) for the test tweets in rows 11-15.
# The actual codes are taken from the factor built on the FULL label column,
# i.e. the same coding that was passed to create_container(). Re-factoring
# only the test slice would renumber the levels whenever a class is missing
# from that slice, silently misaligning actual and predicted codes.
actual_codes <- as.numeric(as.factor(tweets[, 2]))[11:15]
# deparse.level = 0 keeps both table dimensions unnamed, as in the output.
table(actual_codes, results[, "FORESTS_LABEL"], deparse.level = 0)
##    
##     1 2
##   1 3 0
##   2 1 1
table(actual_codes, results[, "MAXENTROPY_LABEL"], deparse.level = 0)
##    
##     1 2
##   1 2 1
##   2 0 2
table(actual_codes, results[, "SVM_LABEL"], deparse.level = 0)
##    
##     1 2
##   1 1 2
##   2 2 0
table(actual_codes, results[, "BAGGING_LABEL"], deparse.level = 0)
##    
##     1
##   1 3
##   2 2
table(actual_codes, results[, "TREE_LABEL"], deparse.level = 0)
##    
##     1
##   1 3
##   2 2

recall accuracy

# Share of test tweets each model labelled correctly.
# The actual codes come from the factor built on the FULL label column (the
# coding given to create_container()); factoring only rows 11-15 would shift
# the codes if a class were absent from the test slice.
actual_codes <- as.numeric(as.factor(tweets[, 2]))[11:15]
recall_accuracy(actual_codes, results[, "FORESTS_LABEL"])
## [1] 0.8
recall_accuracy(actual_codes, results[, "MAXENTROPY_LABEL"])
## [1] 0.8
recall_accuracy(actual_codes, results[, "TREE_LABEL"])
## [1] 0.6
recall_accuracy(actual_codes, results[, "BAGGING_LABEL"])
## [1] 0.6
recall_accuracy(actual_codes, results[, "SVM_LABEL"])
## [1] 0.2

To summarize the results (especially the validity) in a formal way:

model summary

# Combine the container and the classification results into an analytics
# object, then print its summary (ensemble and per-algorithm performance).
analytics <- create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
## 
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                 1.0              0.60
## n >= 2                 1.0              0.60
## n >= 3                 1.0              0.60
## n >= 4                 0.8              0.75
## 
## 
## ALGORITHM PERFORMANCE
## 
##        SVM_PRECISION           SVM_RECALL           SVM_FSCORE 
##                0.165                0.165                0.165 
##    BAGGING_PRECISION       BAGGING_RECALL       BAGGING_FSCORE 
##                0.300                0.500                0.375 
##    FORESTS_PRECISION       FORESTS_RECALL       FORESTS_FSCORE 
##                0.875                0.750                0.765 
##       TREE_PRECISION          TREE_RECALL          TREE_FSCORE 
##                0.300                0.500                0.375 
## MAXENTROPY_PRECISION    MAXENTROPY_RECALL    MAXENTROPY_FSCORE 
##                0.835                0.835                0.800
# First rows of the per-document summary (head()'s default of six rows,
# written out explicitly).
head(analytics@document_summary, n = 6L)
##   MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL  SVM_PROB FORESTS_LABEL
## 1                2       0.6333311         1 0.5232675             1
## 2                2       0.9924826         1 0.8185693             2
## 3                1       0.9999965         2 0.9863175             1
## 4                2       0.9314378         1 0.7165028             1
## 5                1       0.5000000         2 0.5987326             1
##   FORESTS_PROB BAGGING_LABEL BAGGING_PROB TREE_LABEL TREE_PROB MANUAL_CODE
## 1        0.515             1         0.72          1       0.5           2
## 2        0.610             1         0.68          1       0.5           2
## 3        0.725             1         0.92          1       0.5           1
## 4        0.555             1         0.64          1       0.5           1
## 5        0.530             1         0.68          1       0.5           1
##   CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE
## 1              1               4                   1                1
## 2              1               3                   1                2
## 3              1               4                   0                1
## 4              1               4                   0                2
## 5              1               4                   0                1
##   PROBABILITY_INCORRECT
## 1                     1
## 2                     0
## 3                     0
## 4                     1
## 5                     0
# Ensemble coverage and recall by number of agreeing algorithms.
print(analytics@ensemble_summary)
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                 1.0              0.60
## n >= 2                 1.0              0.60
## n >= 3                 1.0              0.60
## n >= 4                 0.8              0.75

To cross validate the results:

# Number of folds used by every cross-validation run below.
N <- 5
# Seed the RNG once, before the first cross-validation run.
set.seed(2015)
cross_validate(container, N, "MAXENT")
cross_validate(container, N, "TREE")
## Fold 1 Out of Sample Accuracy = 0.5
## Fold 2 Out of Sample Accuracy = 0.3333333
## Fold 3 Out of Sample Accuracy = 0
## Fold 4 Out of Sample Accuracy = 0
## Fold 5 Out of Sample Accuracy = 0.2
## [[1]]
## [1] 0.5000000 0.3333333 0.0000000 0.0000000 0.2000000
## 
## $meanAccuracy
## [1] 0.2066667
# N-fold cross-validation with the support vector machine.
cross_validate(container, N, "SVM")
## Fold 1 Out of Sample Accuracy = 0
## Fold 2 Out of Sample Accuracy = 0
## Fold 3 Out of Sample Accuracy = 0.2
## Fold 4 Out of Sample Accuracy = 0.75
## Fold 5 Out of Sample Accuracy = 0.75
## [[1]]
## [1] 0.00 0.00 0.20 0.75 0.75
## 
## $meanAccuracy
## [1] 0.34
# N-fold cross-validation with random forests.
cross_validate(container, N, "RF")
## Fold 2 Out of Sample Accuracy = 0
## Fold 3 Out of Sample Accuracy = 0
## Fold 4 Out of Sample Accuracy = 0.2
## Fold 5 Out of Sample Accuracy = 0
## [[1]]
## [1]  NA 0.0 0.0 0.2 0.0
## 
## $meanAccuracy
## [1] NA

It seems that maxent and random forests reached the same recall accuracy (0.8) as naive Bayes on the test set, while the other methods did a worse job. This is understandable, since we have only a very small data set. By enlarging the training set, we can get much better results for sentiment analysis of tweets using more sophisticated methods. I will show the results with another example.