library(twitteR)
## Warning: package 'twitteR' was built under R version 3.3.3
library(stringr)
library(tm)
## Loading required package: NLP
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
consumerKey <- 'XXXXXXXXXXXXXXXXXXXXXXXXX'      # the Consumer Key from your Twitter application
consumerSecret <- 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'  # the Consumer Secret from your Twitter application

setup_twitter_oauth(consumer_key = consumerKey, consumer_secret = consumerSecret)
## [1] "Using browser based authentication"

Extracting tweets

tweets = searchTwitter('trump', n = 100)
Tweets.text = lapply(tweets, function(t) t$getText())
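
Equivalently, twListToDF() from twitteR flattens the list of status objects into a data frame, keeping the metadata alongside the text:

tweets.df = twListToDF(tweets)   # one row per tweet, text plus metadata
Tweets.text = tweets.df$text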

The extracted tweets are written to a CSV file and then hand-labelled as Positive, Negative, or Neutral:

# write one tweet per row so the file can be hand-labelled conveniently
write.csv(data.frame(Tweet = unlist(Tweets.text)), "tweets.csv", row.names = FALSE)
# read back the hand-labelled tweets (tagged Positive, Negative, or Neutral)
trump_tweets = read.csv("tweets_tag.csv")
trump_tweets[1:10,]
##                                                                                                                                           Tweet
## 1  RT @kylegriffin1: Trump’s U.S. businesses have received at least $15,100,000 in revenue from political groups and federal agencies since 20…
## 2  RT @morten: @FoxNews @Franklin_Graham Donald Trump cheated on his first wife Ivana Trump with Marla Maples. Thoughts and prayers. https://t…
## 3  RT @AviWoolf: The GOP has nothing to sell the American public. No legislative agenda, no programs, no ideas. Just deregulation, judges, and…
## 4  @sarahkendzior Screw the dossier! So what!! Even if the dossier never happened Trump would still be on the verge of… https://t.co/A8BfACAyMN
## 5  RT @benwikler: As Trump’s lifetime of corruption and crime comes fully into public view, remember that every tweet, every speech, every ang…
## 6  RT @ABC: James Comey says the Trump presidency is a 'forest fire': "His presidency is doing, and will do, tremendous damage to our norms an…
## 7  RT @SethAbramson: God, the horrible irony in James Comey saying he was trying to save the FBI when he violated FBI protocols to inadvertent…
## 8          Tiki Torch manufacturers keep a close eye on the strongly approve of Trump numbers. That's their best market these days. #MorningJoe
## 9  RT @Partisangirl: We heard #Trump didn’t eat any chocolate cake while bombing #Syria this time unlike last year. so we made him a cake full…
## 10  Trump wants to review material seized from personal lawyer before federal investigators.That’s not how the rule of… https://t.co/goBVTJ1SOi
##         Tag
## 1   Neutral
## 2  Negative
## 3  Negative
## 4  Negative
## 5  Negative
## 6  Negative
## 7  Negative
## 8  Positive
## 9  Negative
## 10 Negative
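
The stringr and tm packages loaded earlier are often used for a light cleaning pass at this point. The sketch below (optional; the results that follow were produced on the raw text) strips retweet markers, links, and @mentions with illustrative patterns:

clean_text = function(x) {
  x = str_replace_all(x, "(RT|via)\\s*@\\w+", "")  # retweet markers
  x = str_replace_all(x, "https?://\\S+", "")      # links
  x = str_replace_all(x, "@\\w+", "")              # @mentions
  str_trim(x)
}
trump_tweets$Tweet = clean_text(as.character(trump_tweets$Tweet))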

Next, we keep only the tweets tagged Positive or Negative and split them into training and test sets:

####### Splitting data into train and test data
t.tweets <- as.matrix(trump_tweets[trump_tweets$Tag %in% c("Positive", "Negative"), ])
library(caret)
## Loading required package: lattice
indexes <- createDataPartition(t.tweets[,2], p=0.78, list = FALSE)
train.data <- t.tweets[indexes,]
test.data <- t.tweets[-indexes,]
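
createDataPartition() samples at random, so the exact split (and every number that follows) will vary between runs. Fixing the RNG seed beforehand makes the split reproducible; 123 is an arbitrary choice:

set.seed(123)  # any fixed seed gives a reproducible partition
indexes <- createDataPartition(t.tweets[,2], p=0.78, list = FALSE)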

Naive Bayes Classifier

##Naive Bayes classifier
library(e1071)
classifier = naiveBayes(train.data, as.factor(train.data[,2]))
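
This fits naiveBayes() directly on the character matrix, so each column (including the Tag itself) is treated as one categorical predictor. A more common formulation in text classification builds a document-term matrix first and uses term presence/absence as features; a minimal sketch with the tm package loaded earlier:

# sketch: Naive Bayes on term presence/absence features
corpus = VCorpus(VectorSource(train.data[, 1]))
dtm = as.matrix(DocumentTermMatrix(corpus))
features = as.data.frame(lapply(as.data.frame(dtm), function(col)
  factor(col > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))))
nb_dtm = naiveBayes(features, as.factor(train.data[, 2]))

Predicting with nb_dtm would additionally require projecting the test tweets onto the same vocabulary, e.g. via the dictionary control option of DocumentTermMatrix().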

library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.3.3
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
# predict labels for the held-out test tweets
predicted = predict(classifier, test.data[,1]); predicted
##  [1] Negative Negative Negative Negative Negative Negative Negative
##  [8] Negative Negative Negative Negative Negative Negative Negative
## Levels: Negative Positive
table(test.data[, 2], predicted)
##           predicted
##            Negative Positive
##   Negative       12        0
##   Positive        2        0
nb_classifier = classifier  # the same model as above; no need to refit
summary(nb_classifier)
##         Length Class  Mode     
## apriori 2      table  numeric  
## tables  2      -none- list     
## levels  2      -none- character
## call    3      -none- call

Confusion Matrix

library(caret)
# caret's convention is confusionMatrix(data = predictions, reference = truth);
# the arguments here are reversed, which transposes the table relative to that
# convention but leaves the overall accuracy unchanged
result = confusionMatrix(test.data[, 2], predicted)
result
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative       12        0
##   Positive        2        0
##                                           
##                Accuracy : 0.8571          
##                  95% CI : (0.5719, 0.9822)
##     No Information Rate : 1               
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.8571          
##             Specificity :     NA          
##          Pos Pred Value :     NA          
##          Neg Pred Value :     NA          
##              Prevalence : 1.0000          
##          Detection Rate : 0.8571          
##    Detection Prevalence : 0.8571          
##       Balanced Accuracy :     NA          
##                                           
##        'Positive' Class : Negative        
## 

The classifier predicts Negative for every test tweet, so the 85.7% accuracy largely reflects the class balance (12 of the 14 test tweets are negative); this is also why Kappa is 0.

Precision and Recall

prec = result$byClass["Precision"]; prec
## Precision 
##         1
recall_acc = result$byClass["Recall"]; recall_acc
##    Recall 
## 0.8571429
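
Both figures can be reproduced by hand from the stored table, which caret reads with rows as predictions and columns as reference:

tp = result$table[1, 1]; fp = result$table[1, 2]; fn = result$table[2, 1]
tp / (tp + fp)   # precision = 12 / (12 + 0) = 1
tp / (tp + fn)   # recall    = 12 / (12 + 2) = 0.8571429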

Now we try other classifiers: maximum entropy (MAXENT), SVM, random forest (RF), and a decision tree, all trained through RTextTools.

########### MAXENT, SVM, RF, TREE CLASSIFIERS ###########
tweets_1 = rbind(train.data, test.data)
# build the document-term matrix from the tweet text (column 1)
matrix = create_matrix(tweets_1[,1], language="english",
                       removeStopwords=FALSE, removeNumbers=TRUE,
                       stemWords=FALSE)
mat = as.matrix(matrix)
# build the container: the response comes from the Tag column (column 2);
# rows 1:56 are the training tweets and 57:70 the test tweets, matching the
# earlier split
container = create_container(mat, as.numeric(as.factor(tweets_1[,2])),
                             trainSize=1:56, testSize=57:70, virgin=FALSE)
# Second, train the models with multiple machine learning algorithms:
models = train_models(container, algorithms = c("MAXENT", "SVM", "RF", "TREE"))
# Now, we can classify the testing set using the trained models.

results = classify_models(container, models)
results
##    MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL  SVM_PROB FORESTS_LABEL
## 1                 1       0.9997172         1 0.9903849             1
## 2                 1       0.9999622         1 0.9697792             1
## 3                 1       0.9999442         1 0.9679987             1
## 4                 1       0.9999440         1 0.9692131             1
## 5                 1       0.9999592         1 0.9650132             1
## 6                 1       0.9999893         1 0.9664737             1
## 7                 1       0.9999675         1 0.9822457             1
## 8                 1       0.9994954         1 0.9644738             1
## 9                 1       0.9999323         1 0.9644039             1
## 10                1       0.9999535         1 0.9659185             1
## 11                1       0.9998858         1 0.9667536             1
## 12                1       0.9989233         1 0.9685813             1
## 13                1       0.9999884         1 0.9652324             1
## 14                1       0.9965962         1 0.9667593             1
##    FORESTS_PROB TREE_LABEL TREE_PROB
## 1         0.915          1       0.8
## 2         1.000          1       1.0
## 3         1.000          1       1.0
## 4         1.000          1       1.0
## 5         1.000          1       1.0
## 6         1.000          1       1.0
## 7         0.935          1       1.0
## 8         1.000          1       1.0
## 9         1.000          1       1.0
## 10        1.000          1       1.0
## 11        1.000          1       1.0
## 12        1.000          1       1.0
## 13        1.000          1       1.0
## 14        1.000          1       1.0
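
The numeric labels above come from as.numeric(as.factor(tweets_1[,2])) in the container; as.factor() orders levels alphabetically, so 1 corresponds to Negative and 2 to Positive:

levels(as.factor(tweets_1[, 2]))  # "Negative" "Positive", coded 1 and 2
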
# accuracy tables; the predicted labels use the container's 1/2 coding while
# ifelse() codes the reference as 0/1, hence the refactoring warnings below
res_rf = confusionMatrix( results[,"FORESTS_LABEL"],ifelse(factor(tweets_1[57:70, 2])=="Positive",1,0))
## Warning in confusionMatrix.default(results[, "FORESTS_LABEL"],
## ifelse(factor(tweets_1[57:70, : Levels are not in the same order for
## reference and data. Refactoring data to match.
res_maxent = confusionMatrix( results[,"MAXENTROPY_LABEL"],ifelse(factor(tweets_1[57:70, 2])=="Positive",1,0))
## Warning in confusionMatrix.default(results[, "MAXENTROPY_LABEL"],
## ifelse(factor(tweets_1[57:70, : Levels are not in the same order for
## reference and data. Refactoring data to match.
res_tree = confusionMatrix( results[,"TREE_LABEL"],ifelse(factor(tweets_1[57:70, 2])=="Positive",1,0))
## Warning in confusionMatrix.default(results[, "TREE_LABEL"],
## ifelse(factor(tweets_1[57:70, : Levels are not in the same order for
## reference and data. Refactoring data to match.
res_svm = confusionMatrix( results[,"SVM_LABEL"],ifelse(factor(tweets_1[57:70, 2])=="Positive",1,0))
## Warning in confusionMatrix.default(results[, "SVM_LABEL"],
## ifelse(factor(tweets_1[57:70, : Levels are not in the same order for
## reference and data. Refactoring data to match.
# recall computed manually from each 2x2 table
res_rf$table[4]/(res_rf$table[4]+res_rf$table[2]) 
## [1] 0.1428571
res_maxent$table[4]/(res_maxent$table[4]+res_maxent$table[2])
## [1] 0.1428571
res_tree$table[4]/(res_tree$table[4]+res_tree$table[2])
## [1] 0.1428571
res_svm$table[4]/(res_svm$table[4]+res_svm$table[2])
## [1] 0.1428571
res_rf$byClass["Recall"]
## Recall 
##      0
res_maxent$byClass["Recall"] 
## Recall 
##      0
res_tree$byClass["Recall"]
## Recall 
##      0
res_svm$byClass["Recall"] 
## Recall 
##      0
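
RTextTools can also summarize all four models in one step via create_analytics(), which reports per-algorithm precision, recall, and F-scores:

# one-stop summary of the trained models (precision, recall, F-score)
analytics = create_analytics(container, results)
summary(analytics)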