library(SparseM)
## Warning: package 'SparseM' was built under R version 3.2.2
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.2.2
library(e1071)
# Hand-labelled example tweets. Each object is a two-column character matrix:
# column 1 holds the text, column 2 the sentiment label.

# Five positive training examples (the single label is recycled down the rows).
PositiveTweets <- cbind(
  c(
    "I love this shirt",
    "This movie is amazing",
    "I feel great this morning",
    "I am very excited about the upcoming ceremony",
    "He is my best friend"
  ),
  "positive"
)
str(PositiveTweets)
## chr [1:5, 1:2] "I love this shirt" "This movie is amazing" ...

# Five negative training examples.
NegativeTweets <- cbind(
  c(
    "I do not like this shirt",
    "This movie is bogus",
    "I feel tired this morning",
    "I am not looking forward to the upcoming ceremony",
    "He is my foe"
  ),
  "negative"
)
str(NegativeTweets)
## chr [1:5, 1:2] "I do not like this shirt" "This movie is bogus" ...

# Five held-out examples with mixed labels, used only for evaluation.
TestTweets <- cbind(
  c(
    "I feel happy this morning",
    "Mohit is my friend",
    "I do not like that man",
    "This house is not great",
    "Your song is annoying me"
  ),
  c("positive", "positive", "negative", "negative", "negative")
)
str(TestTweets)
## chr [1:5, 1:2] "I feel happy this morning" "Mohit is my friend" ...

# Stack everything: rows 1-5 positive, 6-10 negative, 11-15 test.
tweets <- rbind(PositiveTweets, NegativeTweets, TestTweets)
str(tweets)
## chr [1:15, 1:2] "I love this shirt" "This movie is amazing" ...
# Build one document-term matrix over all 15 texts (training and test rows
# together). Numbers are removed; stop words are kept and no stemming is
# applied. create_matrix() also offers removeSparseTerms, which is not used here.
dtm <- create_matrix(
  tweets[, 1],
  language = "english",
  removeStopwords = FALSE,
  removeNumbers = TRUE,
  stemWords = FALSE
)
Now we can train the naive Bayes model on the training set. Note that e1071 requires the response variable to be numeric or a factor, so we convert the character labels to a factor here. This is a small trick.
# Convert the document-term matrix to an ordinary matrix for e1071.
dtm1 <- as.matrix(dtm)

# Fit naive Bayes on rows 1-10 (the labelled training tweets); the character
# labels are converted to a factor for the response.
classifier <- naiveBayes(
  dtm1[1:10, ],
  as.factor(tweets[1:10, 2])
)

# Predict the class of the five held-out tweets (rows 11-15) and show them.
predicted <- predict(classifier, newdata = dtm1[11:15, ])
predicted
## [1] negative positive negative negative negative
## Levels: negative positive

# Confusion table: rows are the true labels, columns the predictions.
table(tweets[11:15, 2], predicted)
## predicted
## negative positive
## negative 3 0
## positive 1 1

# Share of test tweets classified correctly.
recall_accuracy(tweets[11:15, 2], predicted)
## [1] 0.8

# Strict comparison: an accuracy of exactly 0.8 does not exceed 0.8.
recall_accuracy(tweets[11:15, 2], predicted) > 0.8
## [1] FALSE
What about other machine learning methods? As mentioned above, we can try them using RTextTools.
First, build a container that specifies the response variable, the training set, and the test set.
# Wrap the document-term matrix and the labels in an RTextTools container.
# Labels are passed as numeric codes; with the factor levels sorted as
# "negative", "positive", that is negative = 1 and positive = 2.
# Rows 1-10 are used for training and rows 11-15 for testing.
container <- create_container(
  dtm,
  as.numeric(as.factor(tweets[, 2])),
  trainSize = 1:10,
  testSize = 11:15,
  virgin = FALSE
)

# Train five classifiers on the training rows.
# NOTE(review): no seed is set before this call; RF and BAGGING are presumably
# randomized, so the figures shown further down may not reproduce exactly.
models <- train_models(
  container,
  algorithms = c("MAXENT", "SVM", "RF", "BAGGING", "TREE")
)

# Classify the test rows with every trained model; each algorithm contributes
# a *_LABEL and a *_PROB column.
results <- classify_models(container, models)
names(results)
## [1] "MAXENTROPY_LABEL" "MAXENTROPY_PROB" "SVM_LABEL"
## [4] "SVM_PROB" "FORESTS_LABEL" "FORESTS_PROB"
## [7] "BAGGING_LABEL" "BAGGING_PROB" "TREE_LABEL"
## [10] "TREE_PROB"
# Confusion tables for the five test tweets, one per algorithm.
# Rows are the true codes (1 = negative, 2 = positive), columns the predictions.

# Random forest
table(as.numeric(as.factor(tweets[11:15, 2])), results[, "FORESTS_LABEL"])
##
## 1 2
## 1 3 0
## 2 1 1

# Maximum entropy
table(as.numeric(as.factor(tweets[11:15, 2])), results[, "MAXENTROPY_LABEL"])
##
## 1 2
## 1 2 1
## 2 0 2

# Support vector machine
table(as.numeric(as.factor(tweets[11:15, 2])), results[, "SVM_LABEL"])
##
## 1 2
## 1 1 2
## 2 2 0

# Bagging: only class 1 was predicted, hence the single column.
table(as.numeric(as.factor(tweets[11:15, 2])), results[, "BAGGING_LABEL"])
##
## 1
## 1 3
## 2 2

# Decision tree: only class 1 was predicted, hence the single column.
table(as.numeric(as.factor(tweets[11:15, 2])), results[, "TREE_LABEL"])
##
## 1
## 1 3
## 2 2
# Test-set accuracy of each algorithm: true numeric codes against the
# predicted label column.

# Random forest
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[, "FORESTS_LABEL"])
## [1] 0.8

# Maximum entropy
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[, "MAXENTROPY_LABEL"])
## [1] 0.8

# Decision tree
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[, "TREE_LABEL"])
## [1] 0.6

# Bagging
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[, "BAGGING_LABEL"])
## [1] 0.6

# Support vector machine
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[, "SVM_LABEL"])
## [1] 0.2
# Combine the container and the classification results into an analytics
# object, then print its summary: ensemble coverage/recall followed by
# precision, recall and F-score for each algorithm.
analytics <- create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.60
## n >= 2 1.0 0.60
## n >= 3 1.0 0.60
## n >= 4 0.8 0.75
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.165 0.165 0.165
## BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 0.300 0.500 0.375
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 0.875 0.750 0.765
## TREE_PRECISION TREE_RECALL TREE_FSCORE
## 0.300 0.500 0.375
## MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
## 0.835 0.835 0.800

# Per-document view: every algorithm's label and probability, the manual
# code, and the consensus / probability-based codes.
head(analytics@document_summary)
## MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB FORESTS_LABEL
## 1 2 0.6333311 1 0.5232675 1
## 2 2 0.9924826 1 0.8185693 2
## 3 1 0.9999965 2 0.9863175 1
## 4 2 0.9314378 1 0.7165028 1
## 5 1 0.5000000 2 0.5987326 1
## FORESTS_PROB BAGGING_LABEL BAGGING_PROB TREE_LABEL TREE_PROB MANUAL_CODE
## 1 0.515 1 0.72 1 0.5 2
## 2 0.610 1 0.68 1 0.5 2
## 3 0.725 1 0.92 1 0.5 1
## 4 0.555 1 0.64 1 0.5 1
## 5 0.530 1 0.68 1 0.5 1
## CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE
## 1 1 4 1 1
## 2 1 3 1 2
## 3 1 4 0 1
## 4 1 4 0 2
## 5 1 4 0 1
## PROBABILITY_INCORRECT
## 1 1
## 2 0
## 3 0
## 4 1
## 5 0

# The ensemble table on its own (same figures as in the summary above).
analytics@ensemble_summary
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.60
## n >= 2 1.0 0.60
## n >= 3 1.0 0.60
## n >= 4 0.8 0.75
# N-fold cross-validation of individual algorithms on the container.
N <- 5

# The seed is set once, before the first call only.
# NOTE(review): the later runs presumably depend on the random state left by
# the earlier ones, so reordering the calls may change their folds - confirm.
set.seed(2015)

# Maximum entropy (no output for this call appears in the captured transcript).
cross_validate(container, N, "MAXENT")

# Decision tree
cross_validate(container, N, "TREE")
## Fold 1 Out of Sample Accuracy = 0.5
## Fold 2 Out of Sample Accuracy = 0.3333333
## Fold 3 Out of Sample Accuracy = 0
## Fold 4 Out of Sample Accuracy = 0
## Fold 5 Out of Sample Accuracy = 0.2
## [[1]]
## [1] 0.5000000 0.3333333 0.0000000 0.0000000 0.2000000
##
## $meanAccuracy
## [1] 0.2066667

# Support vector machine
cross_validate(container, N, "SVM")
## Fold 1 Out of Sample Accuracy = 0
## Fold 2 Out of Sample Accuracy = 0
## Fold 3 Out of Sample Accuracy = 0.2
## Fold 4 Out of Sample Accuracy = 0.75
## Fold 5 Out of Sample Accuracy = 0.75
## [[1]]
## [1] 0.00 0.00 0.20 0.75 0.75
##
## $meanAccuracy
## [1] 0.34

# Random forest. Fold 1 is absent from the log and its accuracy is NA, which
# makes the mean NA as well.
# NOTE(review): presumably that fold was empty or failed on this tiny data
# set - verify.
cross_validate(container, N, "RF")
## Fold 2 Out of Sample Accuracy = 0
## Fold 3 Out of Sample Accuracy = 0
## Fold 4 Out of Sample Accuracy = 0.2
## Fold 5 Out of Sample Accuracy = 0
## [[1]]
## [1] NA 0.0 0.0 0.2 0.0
##
## $meanAccuracy
## [1] NA
It seems that maxent and random forest reached the same recall accuracy on the test set as naive Bayes (0.8 in this run), while the other methods did worse. This is understandable, since we have only a very small data set. By enlarging the training set and using more sophisticated methods, we can get much better results for sentiment analysis of tweets. I will show the results with another example.