# Cheng-Jun Wang 2014 April 6 @cmc
###########################################
# Sentiment analysis with machine learning
## [1] "Sentiment analysis with machine learning"
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
##
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
library(e1071)
# Hand-labelled toy corpus. Every object below is a two-column character
# matrix: column 1 is the tweet text, column 2 its sentiment label.
pos_tweets <- matrix(
  c(
    "I love this car", "positive",
    "This view is amazing", "positive",
    "I feel great this morning", "positive",
    "I am so excited about the concert", "positive",
    "He is my best friend", "positive"
  ),
  ncol = 2, byrow = TRUE
)
neg_tweets <- matrix(
  c(
    "I do not like this car", "negative",
    "This view is horrible", "negative",
    "I feel tired this morning", "negative",
    "I am not looking forward to the concert", "negative",
    "He is my enemy", "negative"
  ),
  ncol = 2, byrow = TRUE
)
test_tweets <- matrix(
  c(
    "feel happy this morning", "positive",
    "larry friend", "positive",
    "not like that man", "negative",
    "house not great", "negative",
    "your song annoying", "negative"
  ),
  ncol = 2, byrow = TRUE
)
# Stack everything: rows 1-5 positive, rows 6-10 negative, rows 11-15 test.
tweets <- rbind(pos_tweets, neg_tweets, test_tweets)
# naive Bayes
# Build the document-term matrix over all 15 tweets.
# The weighting function has to be passed by name. Supplied positionally after
# the named arguments, R matches it to the next unfilled formal of
# create_matrix() (minDocFreq, per the RTextTools documentation) rather than
# to `weighting`, so TF-IDF weighting was never actually requested.
# NOTE: the variable shares its name with base::matrix(); the name is kept
# because later statements in this script refer to `matrix`.
matrix <- create_matrix(tweets[, 1],
  language = "english", removeStopwords = FALSE,
  removeNumbers = TRUE, stemWords = FALSE, weighting = tm::weightTfIdf
)
# naiveBayes() needs a dense matrix rather than the sparse document-term matrix.
mat <- as.matrix(matrix)
# Fit on the 10 labelled tweets (rows 1-10), then predict the 5 held-out ones.
classifier <- naiveBayes(mat[1:10, ], as.factor(tweets[1:10, 2]))
predicted <- predict(classifier, mat[11:15, ])
predicted
# NOTE: every "##" output line below was recorded before `weighting` was passed
# by name; re-run the script to refresh the figures.
## [1] positive positive negative negative positive
## Levels: negative positive
# Confusion table: rows are the true labels, columns the predictions.
table(tweets[11:15, 2], predicted)
## predicted
## negative positive
## negative 2 1
## positive 0 2
recall_accuracy(tweets[11:15, 2], predicted)
## [1] 0.8
# The other methods (RTextTools)
# Bundle the document-term matrix with numeric label codes; rows 1-10 are the
# training set and rows 11-15 the test set.
container <- create_container(
  matrix = matrix,
  labels = as.numeric(as.factor(tweets[, 2])),
  trainSize = 1:10,
  testSize = 11:15,
  virgin = FALSE
) # removeSparseTerms
# Train one model per listed algorithm, then classify the test rows with each.
models <- train_models(
  container = container,
  algorithms = c("MAXENT", "SVM", "RF", "BAGGING", "TREE")
)
results <- classify_models(container = container, models = models)
# accuracy
# Numeric codes of the true test labels. They are taken from the factor over
# ALL rows and then subset to rows 11-15, so they use the same level-to-code
# mapping as the labels handed to create_container(). Re-factoring only the
# test rows would renumber the levels if a class were missing from the test
# set. For this data both classes occur in rows 11-15, so the values are
# unchanged.
true_codes <- as.numeric(as.factor(tweets[, 2]))[11:15]
# deparse.level = 0 keeps the table dimensions unnamed, matching the recorded
# output. Rows are the true codes, columns the predicted codes.
table(true_codes, results[, "FORESTS_LABEL"], deparse.level = 0)
##
## 1 2
## 1 3 0
## 2 1 1
table(true_codes, results[, "MAXENTROPY_LABEL"], deparse.level = 0)
##
## 1 2
## 1 1 2
## 2 0 2
# Share of test tweets each algorithm labelled correctly.
recall_accuracy(true_codes, results[, "FORESTS_LABEL"])
## [1] 0.8
recall_accuracy(true_codes, results[, "MAXENTROPY_LABEL"])
## [1] 0.6
recall_accuracy(true_codes, results[, "TREE_LABEL"])
## [1] 0.6
recall_accuracy(true_codes, results[, "BAGGING_LABEL"])
## [1] 0.4
recall_accuracy(true_codes, results[, "SVM_LABEL"])
## [1] 0.2
# Model summary
# Combine the container and the classification results into an analytics
# object; its summary prints the ensemble and per-algorithm tables below.
analytics <- create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.6
## n >= 2 1.0 0.6
## n >= 3 1.0 0.6
## n >= 4 0.4 0.5
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.165 0.165 0.165
## BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 0.250 0.335 0.285
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 0.875 0.750 0.765
## TREE_PRECISION TREE_RECALL TREE_FSCORE
## 0.300 0.500 0.375
## MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
## 0.750 0.665 0.585
# Per-document view: each algorithm's label and probability next to the
# manual code and the consensus columns (first six rows at most).
head(analytics@document_summary, n = 6L)
## MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB FORESTS_LABEL
## 1 2 0.6444 1 0.5655 1
## 2 2 0.9922 1 0.8446 2
## 3 1 1.0000 2 0.9808 1
## 4 2 0.7347 1 0.6059 1
## 5 2 0.5000 2 0.5314 1
## FORESTS_PROB BAGGING_LABEL BAGGING_PROB TREE_LABEL TREE_PROB MANUAL_CODE
## 1 0.505 1 0.72 1 0.5 2
## 2 0.560 1 0.68 1 0.5 2
## 3 0.790 1 0.84 1 0.5 1
## 4 0.630 2 0.60 1 0.5 1
## 5 0.540 1 0.68 1 0.5 1
## CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE
## 1 1 4 1 1
## 2 1 3 1 2
## 3 1 4 0 1
## 4 1 3 0 2
## 5 1 3 0 1
## PROBABILITY_INCORRECT
## 1 1
## 2 0
## 3 0
## 4 1
## 5 0
# The ensemble table on its own (same figures as in the summary above).
analytics@ensemble_summary
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.6
## n >= 2 1.0 0.6
## n >= 3 1.0 0.6
## n >= 4 0.4 0.5
# N-fold cross-validation of individual algorithms on the container.
N <- 4
# Seed once so the whole sequence of runs below is reproducible.
set.seed(2014)
# The recorded output starts after the TREE call; none is shown for MAXENT.
cross_validate(container, nfold = N, algorithm = "MAXENT")
cross_validate(container, nfold = N, algorithm = "TREE")
## Fold 1 Out of Sample Accuracy = 0.3333
## Fold 2 Out of Sample Accuracy = 0.3333
## Fold 3 Out of Sample Accuracy = 0.25
## Fold 4 Out of Sample Accuracy = 0.6
## [[1]]
## [1] 0.3333 0.3333 0.2500 0.6000
##
## $meanAccuracy
## [1] 0.3792
cross_validate(container, nfold = N, algorithm = "SVM")
## Fold 1 Out of Sample Accuracy = 0.3333
## Fold 2 Out of Sample Accuracy = 0.6
## Fold 3 Out of Sample Accuracy = 1
## Fold 4 Out of Sample Accuracy = 0.6667
## [[1]]
## [1] 0.3333 0.6000 1.0000 0.6667
##
## $meanAccuracy
## [1] 0.65
cross_validate(container, nfold = N, algorithm = "RF")
## Fold 1 Out of Sample Accuracy = 0.8
## Fold 2 Out of Sample Accuracy = 0.3333
## Fold 3 Out of Sample Accuracy = 0.4
## Fold 4 Out of Sample Accuracy = 0.5
## [[1]]
## [1] 0.8000 0.3333 0.4000 0.5000
##
## $meanAccuracy
## [1] 0.5083