Sentiment analysis for tweets in R
###################
"load data"
## [1] "load data"
###################
setwd("D:/Twitter-Sentimental-Analysis-master/")
happy = readLines("./happy.txt")
sad = readLines("./sad.txt")
happy_test = readLines("./happy_test.txt")
sad_test = readLines("./sad_test.txt")
tweet = c(happy, sad)
tweet_test = c(happy_test, sad_test)
tweet_all = c(tweet, tweet_test)
sentiment = c(rep("happy", length(happy)), rep("sad", length(sad)))
sentiment_test = c(rep("happy", length(happy_test)), rep("sad", length(sad_test)))
sentiment_all = as.factor(c(sentiment, sentiment_test))
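# (optional sketch, not part of the original pipeline) raw tweets often
# carry URLs, @mentions, and stray whitespace; a minimal base-R cleanup
# could be applied before building the document-term matrix:
clean_tweets = function(x) {
    x = gsub("http\\S+", "", x)       # strip URLs
    x = gsub("@\\w+", "", x)          # strip @mentions
    x = gsub("\\s+", " ", trimws(x))  # collapse whitespace
    tolower(x)
}
# e.g. tweet_all = clean_tweets(tweet_all)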
##############
"RTextTools"
## [1] "RTextTools"
##############
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
##
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
library(e1071)
# naive Bayes (from e1071)
mat = create_matrix(tweet_all, language = "english", removeStopwords = FALSE,
    removeNumbers = TRUE, stemWords = FALSE, weighting = tm::weightTfIdf)
mat = as.matrix(mat)  # naiveBayes() needs a dense matrix; fine for 180 tweets
classifier = naiveBayes(mat[1:160, ], as.factor(sentiment_all[1:160]))
predicted = predict(classifier, mat[161:180, ])
predicted
## [1] sad happy sad happy happy sad happy sad happy sad sad
## [12] sad sad sad sad sad sad sad happy happy
## Levels: happy sad
table(sentiment_test, predicted)
## predicted
## sentiment_test happy sad
## happy 5 5
## sad 2 8
recall_accuracy(sentiment_test, predicted)
## [1] 0.65
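# sanity check (not in the original): recall_accuracy() here is just the
# share of correctly classified test tweets, so base R should agree (0.65):
mean(as.character(predicted) == sentiment_test)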
# the other classifiers available through RTextTools
mat = create_matrix(tweet_all, language = "english", removeStopwords = FALSE,
    removeNumbers = TRUE, stemWords = FALSE, weighting = tm::weightTfIdf)
container = create_container(mat, as.numeric(sentiment_all), trainSize = 1:160,
    testSize = 161:180, virgin = FALSE)  # removeSparseTerms can also be set in create_matrix()
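# note: create_container() takes numeric labels; as.numeric(sentiment_all)
# encodes the factor levels alphabetically, so "happy" = 1 and "sad" = 2.
# This is why the *_LABEL and *_CODE columns further down show 1s and 2s.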
models = train_models(container, algorithms = c("MAXENT", "SVM", "SLDA", "BAGGING",
"RF", "TREE"))
results = classify_models(container, models)
analytics = create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.95
## n >= 2 1.0 0.95
## n >= 3 1.0 0.95
## n >= 4 1.0 0.95
## n >= 5 1.0 0.95
## n >= 6 0.9 1.00
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.955 0.950 0.950
## SLDA_PRECISION SLDA_RECALL SLDA_FSCORE
## 0.955 0.950 0.950
## BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 0.915 0.900 0.900
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 0.955 0.950 0.950
## TREE_PRECISION TREE_RECALL TREE_FSCORE
## 1.000 1.000 1.000
## MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
## 0.955 0.950 0.950
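# keep in mind the test set holds only 20 tweets, so each misclassified
# tweet moves these scores by 0.05; TREE's perfect 1.000 should not be
# over-interpreted on a sample this small.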
# analytics@algorithm_summary: summary of precision, recall, F-scores, and
#   accuracy sorted by topic code for each algorithm
# analytics@label_summary: summary of label (e.g. topic) accuracy
# analytics@document_summary: raw summary of all data and scoring
# analytics@ensemble_summary: summary of ensemble precision/coverage;
#   uses the n variable passed into create_analytics()
head(analytics@algorithm_summary)
## SVM_PRECISION SVM_RECALL SVM_FSCORE SLDA_PRECISION SLDA_RECALL
## 1 0.91 1.0 0.95 0.91 1.0
## 2 1.00 0.9 0.95 1.00 0.9
## SLDA_FSCORE BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 1 0.95 0.83 1.0 0.91
## 2 0.95 1.00 0.8 0.89
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE TREE_PRECISION
## 1 0.91 1.0 0.95 1
## 2 1.00 0.9 0.95 1
## TREE_RECALL TREE_FSCORE MAXENTROPY_PRECISION MAXENTROPY_RECALL
## 1 1 1 0.91 1.0
## 2 1 1 1.00 0.9
## MAXENTROPY_FSCORE
## 1 0.95
## 2 0.95
head(analytics@label_summary)
## NUM_MANUALLY_CODED NUM_CONSENSUS_CODED NUM_PROBABILITY_CODED
## 1 10 11 11
## 2 10 9 9
## PCT_CONSENSUS_CODED PCT_PROBABILITY_CODED PCT_CORRECTLY_CODED_CONSENSUS
## 1 110 110 100
## 2 90 90 90
## PCT_CORRECTLY_CODED_PROBABILITY
## 1 100
## 2 90
head(analytics@document_summary)
## MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB SLDA_LABEL SLDA_PROB
## 1 1 1 1 0.9998 1 1
## 2 1 1 1 0.9998 1 1
## 3 1 1 1 0.9870 1 1
## 4 1 1 1 0.9874 1 1
## 5 1 1 1 0.9922 1 1
## 6 1 1 1 0.9726 1 1
## BAGGING_LABEL BAGGING_PROB FORESTS_LABEL FORESTS_PROB TREE_LABEL
## 1 1 1 1 0.935 1
## 2 1 1 1 0.895 1
## 3 1 1 1 0.940 1
## 4 1 1 1 0.950 1
## 5 1 1 1 0.945 1
## 6 1 1 1 0.700 1
## TREE_PROB MANUAL_CODE CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT
## 1 1 1 1 6 0
## 2 1 1 1 6 0
## 3 1 1 1 6 0
## 4 1 1 1 6 0
## 5 1 1 1 6 0
## 6 1 1 1 6 0
## PROBABILITY_CODE PROBABILITY_INCORRECT
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
# Ensemble Agreement
ensemble = create_ensembleSummary(analytics@document_summary)
ensemble
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 0.95
## n >= 2 1.0 0.95
## n >= 3 1.0 0.95
## n >= 4 1.0 0.95
## n >= 5 1.0 0.95
## n >= 6 0.9 1.00
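# CONSENSUS_AGREE (shown in document_summary above) counts how many of the
# six algorithms assigned the same label; a sketch of filtering down to the
# documents where at least four of them agree:
doc = analytics@document_summary
confident = doc[doc$CONSENSUS_AGREE >= 4, ]
nrow(confident)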
# analytics@ensemble_summary
# Cross Validation
N = 5
cross_SVM = cross_validate(container, N, "SVM")
## Fold 1 Out of Sample Accuracy = 0.9643
## Fold 2 Out of Sample Accuracy = 0.9574
## Fold 3 Out of Sample Accuracy = 0.9487
## Fold 4 Out of Sample Accuracy = 0.9429
## Fold 5 Out of Sample Accuracy = 0.9355
cross_GLMNET = cross_validate(container, N, "GLMNET")
## Fold 1 Out of Sample Accuracy = 2.121
## Fold 2 Out of Sample Accuracy = 1.75
## Fold 3 Out of Sample Accuracy = 1.976
## Fold 4 Out of Sample Accuracy = 2.424
## Fold 5 Out of Sample Accuracy = 1.757
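# note: an "accuracy" above 1 is not a valid accuracy; the GLMNET fold
# scores printed by cross_validate() here look unreliable and should be
# treated with caution.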
cross_MAXENT = cross_validate(container, N, "MAXENT")