Sentiment analysis for tweets in R
###################
"load data"
## [1] "load data"
###################
setwd("D:/Twitter-Sentimental-Analysis-master/")
happy = readLines("./happy.txt")
sad = readLines("./sad.txt")
happy_test = readLines("./happy_test.txt")
sad_test = readLines("./sad_test.txt")
tweet = c(happy, sad)
tweet_test = c(happy_test, sad_test)
tweet_all = c(tweet, tweet_test)
sentiment = c(rep("happy", length(happy)), rep("sad", length(sad)))
sentiment_test = c(rep("happy", length(happy_test)), rep("sad", length(sad_test)))
sentiment_all = as.factor(c(sentiment, sentiment_test))
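# (optional sketch, not part of the original pipeline) raw tweets often
# carry URLs, @mentions, and stray whitespace; a minimal base-R cleanup
# could be applied before building the document-term matrix:
clean_tweets = function(x) {
    x = gsub("http\\S+", "", x)       # strip URLs
    x = gsub("@\\w+", "", x)          # strip @mentions
    x = gsub("\\s+", " ", trimws(x))  # collapse whitespace
    tolower(x)
}
# e.g. tweet_all = clean_tweets(tweet_all)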
##############
"RTextTools"
## [1] "RTextTools"
##############
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
##
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
library(e1071)
# naive Bayes (from e1071)
mat = create_matrix(tweet_all, language = "english", removeStopwords = FALSE,
    removeNumbers = TRUE, stemWords = FALSE, weighting = tm::weightTfIdf)
mat = as.matrix(mat)  # naiveBayes() needs a dense matrix; fine for 180 tweets
classifier = naiveBayes(mat[1:160, ], as.factor(sentiment_all[1:160]))
predicted = predict(classifier, mat[161:180, ])
predicted
## [1] sad happy sad happy happy sad happy sad happy sad sad
## [12] sad sad sad sad sad sad sad happy happy
## Levels: happy sad
table(sentiment_test, predicted)
## predicted
## sentiment_test happy sad
## happy 5 5
## sad 2 8
recall_accuracy(sentiment_test, predicted)
## [1] 0.65
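# sanity check (not in the original): recall_accuracy() here is just the
# share of correctly classified test tweets, so base R should agree (0.65):
mean(as.character(predicted) == sentiment_test)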
# the other classifiers available through RTextTools
mat = create_matrix(tweet_all, language = "english", removeStopwords = FALSE,
    removeNumbers = TRUE, stemWords = FALSE, weighting = tm::weightTfIdf)
container = create_container(mat, as.numeric(sentiment_all), trainSize = 1:160,
    testSize = 161:180, virgin = FALSE)  # removeSparseTerms can also be set in create_matrix()
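# note: create_container() takes numeric labels; as.numeric(sentiment_all)
# encodes the factor levels alphabetically, so "happy" = 1 and "sad" = 2.
# This is why the *_LABEL and *_CODE columns further down show 1s and 2s.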
models = train_models(container, algorithms = c("MAXENT", "SVM", "SLDA", "BAGGING",
"RF", "TREE"))
results = classify_models(container, models)
analytics = create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.95
## n >= 2 1.0 0.95
## n >= 3 1.0 0.95
## n >= 4 1.0 0.95
## n >= 5 1.0 0.95
## n >= 6 0.9 1.00
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.955 0.950 0.950
## SLDA_PRECISION SLDA_RECALL SLDA_FSCORE
## 0.955 0.950 0.950
## BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 0.915 0.900 0.900
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 0.955 0.950 0.950
## TREE_PRECISION TREE_RECALL TREE_FSCORE
## 1.000 1.000 1.000
## MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
## 0.955 0.950 0.950
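# keep in mind the test set holds only 20 tweets, so each misclassified
# tweet moves these scores by 0.05; TREE's perfect 1.000 should not be
# over-interpreted on a sample this small.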
# analytics@algorithm_summary: summary of precision, recall, F-scores, and
#   accuracy sorted by topic code for each algorithm
# analytics@label_summary: summary of label (e.g. topic) accuracy
# analytics@document_summary: raw summary of all data and scoring
# analytics@ensemble_summary: summary of ensemble precision/coverage;
#   uses the n variable passed into create_analytics()
head(analytics@algorithm_summary)
## SVM_PRECISION SVM_RECALL SVM_FSCORE SLDA_PRECISION SLDA_RECALL
## 1 0.91 1.0 0.95 0.91 1.0
## 2 1.00 0.9 0.95 1.00 0.9
## SLDA_FSCORE BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 1 0.95 0.83 1.0 0.91
## 2 0.95 1.00 0.8 0.89
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE TREE_PRECISION
## 1 0.91 1.0 0.95 1
## 2 1.00 0.9 0.95 1
## TREE_RECALL TREE_FSCORE MAXENTROPY_PRECISION MAXENTROPY_RECALL
## 1 1 1 0.91 1.0
## 2 1 1 1.00 0.9
## MAXENTROPY_FSCORE
## 1 0.95
## 2 0.95
head(analytics@label_summary)
## NUM_MANUALLY_CODED NUM_CONSENSUS_CODED NUM_PROBABILITY_CODED
## 1 10 11 11
## 2 10 9 9
## PCT_CONSENSUS_CODED PCT_PROBABILITY_CODED PCT_CORRECTLY_CODED_CONSENSUS
## 1 110 110 100
## 2 90 90 90
## PCT_CORRECTLY_CODED_PROBABILITY
## 1 100
## 2 90
head(analytics@document_summary)
## MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB SLDA_LABEL SLDA_PROB
## 1 1 1 1 0.9998 1 1
## 2 1 1 1 0.9998 1 1
## 3 1 1 1 0.9870 1 1
## 4 1 1 1 0.9874 1 1
## 5 1 1 1 0.9922 1 1
## 6 1 1 1 0.9726 1 1
## BAGGING_LABEL BAGGING_PROB FORESTS_LABEL FORESTS_PROB TREE_LABEL
## 1 1 1 1 0.935 1
## 2 1 1 1 0.895 1
## 3 1 1 1 0.940 1
## 4 1 1 1 0.950 1
## 5 1 1 1 0.945 1
## 6 1 1 1 0.700 1
## TREE_PROB MANUAL_CODE CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT
## 1 1 1 1 6 0
## 2 1 1 1 6 0
## 3 1 1 1 6 0
## 4 1 1 1 6 0
## 5 1 1 1 6 0
## 6 1 1 1 6 0
## PROBABILITY_CODE PROBABILITY_INCORRECT
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
# Ensemble Agreement
ensemble = create_ensembleSummary(analytics@document_summary)
ensemble
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.95
## n >= 2 1.0 0.95
## n >= 3 1.0 0.95
## n >= 4 1.0 0.95
## n >= 5 1.0 0.95
## n >= 6 0.9 1.00
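# CONSENSUS_AGREE (shown in document_summary above) counts how many of the
# six algorithms assigned the same label; a sketch of filtering down to the
# documents where at least four of them agree:
doc = analytics@document_summary
confident = doc[doc$CONSENSUS_AGREE >= 4, ]
nrow(confident)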
# analytics@ensemble_summary
# Cross Validation
N = 5
cross_SVM = cross_validate(container, N, "SVM")
## Fold 1 Out of Sample Accuracy = 0.9643
## Fold 2 Out of Sample Accuracy = 0.9574
## Fold 3 Out of Sample Accuracy = 0.9487
## Fold 4 Out of Sample Accuracy = 0.9429
## Fold 5 Out of Sample Accuracy = 0.9355
cross_GLMNET = cross_validate(container, N, "GLMNET")
## Fold 1 Out of Sample Accuracy = 2.121
## Fold 2 Out of Sample Accuracy = 1.75
## Fold 3 Out of Sample Accuracy = 1.976
## Fold 4 Out of Sample Accuracy = 2.424
## Fold 5 Out of Sample Accuracy = 1.757
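# note: an "accuracy" above 1 is not a valid accuracy; the GLMNET fold
# scores printed by cross_validate() here look unreliable and should be
# treated with caution.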
cross_MAXENT = cross_validate(container, N, "MAXENT")