Sentiment analysis using machine learning in R

Cheng-Jun Wang 2014 April 6 @cmc

########################################### 
# Sentiment analysis with machine learning

## [1] "Sentiment analysis with machine learning"

library(RTextTools)

## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009

library(e1071)

# Each corpus is a two-column character matrix: column 1 holds the tweet text,
# column 2 its sentiment label. matrix() fills column by column, so the texts
# are listed first and the labels second.

# Five positive training examples.
pos_tweets <- matrix(
    c("I love this car",
      "This view is amazing",
      "I feel great this morning",
      "I am so excited about the concert",
      "He is my best friend",
      rep("positive", 5)),
    ncol = 2)

# Five negative training examples.
neg_tweets <- matrix(
    c("I do not like this car",
      "This view is horrible",
      "I feel tired this morning",
      "I am not looking forward to the concert",
      "He is my enemy",
      rep("negative", 5)),
    ncol = 2)

# Five held-out examples used only for evaluation.
test_tweets <- matrix(
    c("feel happy this morning",
      "larry friend",
      "not like that man",
      "house not great",
      "your song annoying",
      "positive", "positive", "negative", "negative", "negative"),
    ncol = 2)

# Rows 1-5 positive, 6-10 negative, 11-15 test.
tweets <- rbind(pos_tweets, neg_tweets, test_tweets)

# naive Bayes
# Build one document-term matrix over all 15 tweets so that the training and
# test rows share the same vocabulary (columns).
# The weighting function has to be passed by name: as a bare positional
# argument after the named ones, R matches it to the next unmatched formal of
# create_matrix (minDocFreq) instead of `weighting`, so TF-IDF is never requested.
# NOTE: any output recorded with the old positional call may differ from what
# this corrected call produces.
matrix <- create_matrix(tweets[, 1], language = "english",
    removeStopwords = FALSE, removeNumbers = TRUE, stemWords = FALSE,
    weighting = tm::weightTfIdf)
mat <- as.matrix(matrix)

# Fit naive Bayes on the 10 labelled training rows, then predict the 5 test rows.
classifier <- naiveBayes(mat[1:10, ], as.factor(tweets[1:10, 2]))
predicted <- predict(classifier, mat[11:15, ])
predicted

## [1] positive positive negative negative positive
## Levels: negative positive


# True labels of the five held-out tweets (rows 11-15).
actual <- tweets[11:15, 2]

# Confusion table: true labels in rows, naive Bayes predictions in columns.
# dnn spells out the same headers the unnamed-expression call would produce.
table(actual, predicted, dnn = c("", "predicted"))

##           predicted
##            negative positive
##   negative        2        1
##   positive        0        2

# Share of test tweets whose predicted label equals the true label.
recall_accuracy(actual, predicted)

## [1] 0.8


# the other methods
# Wrap the document-term matrix and the numeric label codes in an RTextTools
# container. as.factor() orders the levels alphabetically, so negative = 1 and
# positive = 2. Rows 1-10 are the training set, rows 11-15 the test set;
# virgin = FALSE marks the test rows as having known labels.
container <- create_container(matrix, as.numeric(as.factor(tweets[, 2])),
    trainSize = 1:10, testSize = 11:15, virgin = FALSE)  #removeSparseTerms

# Random forest and bagging rely on random resampling, so seed the RNG before
# training to make the fitted models and their reported accuracies reproducible.
set.seed(2014)
models <- train_models(container, algorithms = c("MAXENT", "SVM", "RF",
    "BAGGING", "TREE"))

# Apply every trained model to the test rows; the result has one *_LABEL and
# one *_PROB column per algorithm.
results <- classify_models(container, models)

# accuracy
# Numeric codes of the true labels for the five test tweets (rows 11-15),
# computed once and reused for every comparison below.
test_codes <- as.numeric(as.factor(tweets[11:15, 2]))

# Confusion tables: true codes in rows, predicted codes in columns.
# dnn = c("", "") keeps the dimension headers blank.
table(test_codes, results[, "FORESTS_LABEL"], dnn = c("", ""))

##    
##     1 2
##   1 3 0
##   2 1 1

table(test_codes, results[, "MAXENTROPY_LABEL"], dnn = c("", ""))

##    
##     1 2
##   1 1 2
##   2 0 2


# Fraction of test tweets classified correctly, one call per algorithm.
recall_accuracy(test_codes, results[, "FORESTS_LABEL"])

## [1] 0.8

recall_accuracy(test_codes, results[, "MAXENTROPY_LABEL"])

## [1] 0.6

recall_accuracy(test_codes, results[, "TREE_LABEL"])

## [1] 0.6

recall_accuracy(test_codes, results[, "BAGGING_LABEL"])

## [1] 0.4

recall_accuracy(test_codes, results[, "SVM_LABEL"])

## [1] 0.2

# Model summary: combine the container with the classification results, then
# print the ensemble agreement table and the per-algorithm
# precision / recall / F-score figures.
analytics <- create_analytics(container, results)
summary(analytics)

## ENSEMBLE SUMMARY
## 
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                 1.0               0.6
## n >= 2                 1.0               0.6
## n >= 3                 1.0               0.6
## n >= 4                 0.4               0.5
## 
## 
## ALGORITHM PERFORMANCE
## 
##        SVM_PRECISION           SVM_RECALL           SVM_FSCORE 
##                0.165                0.165                0.165 
##    BAGGING_PRECISION       BAGGING_RECALL       BAGGING_FSCORE 
##                0.250                0.335                0.285 
##    FORESTS_PRECISION       FORESTS_RECALL       FORESTS_FSCORE 
##                0.875                0.750                0.765 
##       TREE_PRECISION          TREE_RECALL          TREE_FSCORE 
##                0.300                0.500                0.375 
## MAXENTROPY_PRECISION    MAXENTROPY_RECALL    MAXENTROPY_FSCORE 
##                0.750                0.665                0.585


# Show the first rows of the per-document summary slot: each algorithm's label
# and probability alongside the manual code and the consensus columns.
head(analytics@document_summary)

##   MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB FORESTS_LABEL
## 1                2          0.6444         1   0.5655             1
## 2                2          0.9922         1   0.8446             2
## 3                1          1.0000         2   0.9808             1
## 4                2          0.7347         1   0.6059             1
## 5                2          0.5000         2   0.5314             1
##   FORESTS_PROB BAGGING_LABEL BAGGING_PROB TREE_LABEL TREE_PROB MANUAL_CODE
## 1        0.505             1         0.72          1       0.5           2
## 2        0.560             1         0.68          1       0.5           2
## 3        0.790             1         0.84          1       0.5           1
## 4        0.630             2         0.60          1       0.5           1
## 5        0.540             1         0.68          1       0.5           1
##   CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE
## 1              1               4                   1                1
## 2              1               3                   1                2
## 3              1               4                   0                1
## 4              1               3                   0                2
## 5              1               3                   0                1
##   PROBABILITY_INCORRECT
## 1                     1
## 2                     0
## 3                     0
## 4                     1
## 5                     0

# Print the ensemble summary slot: coverage and recall for each minimum number
# (n) of agreeing algorithms.
analytics@ensemble_summary

##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                 1.0               0.6
## n >= 2                 1.0               0.6
## n >= 3                 1.0               0.6
## n >= 4                 0.4               0.5


# Number of folds shared by every cross-validation run in this script.
N <- 4
# Seed the RNG once so the cross-validation runs are repeatable
# (presumably the fold assignment is random -- confirm in the RTextTools docs).
set.seed(2014)
cross_validate(container, nfold = N, algorithm = "MAXENT")
cross_validate(container, nfold = N, algorithm = "TREE")

## Fold 1 Out of Sample Accuracy = 0.3333
## Fold 2 Out of Sample Accuracy = 0.3333
## Fold 3 Out of Sample Accuracy = 0.25
## Fold 4 Out of Sample Accuracy = 0.6

## [[1]]
## [1] 0.3333 0.3333 0.2500 0.6000
## 
## $meanAccuracy
## [1] 0.3792

# N-fold cross-validation of the support vector machine on the container.
cross_validate(container, nfold = N, algorithm = "SVM")

## Fold 1 Out of Sample Accuracy = 0.3333
## Fold 2 Out of Sample Accuracy = 0.6
## Fold 3 Out of Sample Accuracy = 1
## Fold 4 Out of Sample Accuracy = 0.6667

## [[1]]
## [1] 0.3333 0.6000 1.0000 0.6667
## 
## $meanAccuracy
## [1] 0.65

# N-fold cross-validation of the random forest on the container.
cross_validate(container, nfold = N, algorithm = "RF")

## Fold 1 Out of Sample Accuracy = 0.8
## Fold 2 Out of Sample Accuracy = 0.3333
## Fold 3 Out of Sample Accuracy = 0.4
## Fold 4 Out of Sample Accuracy = 0.5

## [[1]]
## [1] 0.8000 0.3333 0.4000 0.5000
## 
## $meanAccuracy
## [1] 0.5083