In this post we will analyse some comments on Donald Trump’s Facebook post “I WILL NEVER, EVER STOP FIGHTING FOR YOU. -DJT” using machine learning techniques. We will use two packages: RTextTools and e1071.
We will extract some comments from the page and try to analyse them by applying some techniques of machine learning.
I was inspired by the tutorial at the link below: https://datascienceplus.com/sentiment-analysis-with-machine-learning-in-r/
Now let’s start the analysis !
The first step is to prepare the data
# Hand-labelled Facebook comments. Each object is a two-column character
# matrix: column 1 holds the comment text, column 2 its sentiment label.

# Supportive comments used for training.
pos_com <- cbind(
  c(
    'Keep doing your thing, Mr. President. Nobody can deal with the fact, that a president is actually doing the things he said he d do! I don t need a cute-couple president and first lady who appeals to the eyes only. I need an effective president. That s you!',
    'And we will not stop supporting you! You are doing exactly what you said you would do and that s why we voted for you! We re not worried about the backlash...better it happens now then when we are too far gone to protect our Country! We stand with you President Trump to MAGA and MASA!',
    ' Thank you Donald. Do not listen to the negative people, you are just what we need actually we needed it when Mitt ran but he did not have your fortitude. You are the right person, in the right place, at the right time in history. You will go down as one of the great Presidents along side Lincoln, Kennedy, Reagan and now Trump. Keep the good fight going we are behind you. I admire you so much.',
    'You re doing a great job, Mr. President. Much accomplished in one week!',
    'Mr.President Trump, I truly admire you.You represent a " Game Changer", a turning point for the world, and most of all a Blessing for the United States of America'
  ),
  'positive',
  deparse.level = 0
)

# Critical comments used for training.
neg_com <- cbind(
  c(
    'I will never, ever stop fighting against you.You don t represent me or my values. Enjoy your temporary stay in the office you don t understand.',
    ' Mr.trump You are a disgrace to this country and to humanity and human race.Let us all pray that you stupid ignorant 2nd grader bully be impeached as soon as possible.',
    'Oh Donnie please stop fighting for me. The best help you can be to our country is just leave it. You are a illegitimate president. You are so sad and pathetic. Cant wait for your impeachment.',
    'You re a failed president and national embarrassment. All in your first week. Congratulations. You ve given the most hateful of all Americans a voice. I hope that makes you proud.',
    ' Mr Trump, while you are the President of my country, your actions and your words guarantee that you will never be MY President. I will everything I can to make sure your tenure in office is as short as possible.'
  ),
  'negative',
  deparse.level = 0
)

# Held-out comments (two positive, three negative) used for testing.
test_com <- cbind(
  c(
    'And we will never be ashamed that we voted for national security over political correctness. Thank you for putting America first.',
    'Trump hasn t even had enough time to unpack his suitcase and has already accomplished more than the last administration.',
    'You just lost in court. Get used to it and just worry about your small little crowds. No one respects you - you couldn t tell the truth if your life depended on it. The most unpopular president....EVER.',
    'Just dropping by to tell you you are the worst thing to happen to this country. I ll try to stop by every day for the next 4 years',
    'You are ruining what is great about America. Shame on you, your supporters and members of the government who enable you in the destruction of this country.'
  ),
  c('positive', 'positive', 'negative', 'negative', 'negative'),
  deparse.level = 0
)

# Rows 1-10 are the training comments, rows 11-15 the test comments.
coms <- rbind(pos_com, neg_com, test_com)
Then, let’s load the packages
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(e1071)
Now we can build the document-term matrix
# build dtm
# Build the document-term matrix from the comment text only (column 1).
# create_matrix() combines every column it is given into one document per row,
# so passing the whole `coms` matrix would also feed it the label column
# (column 2): the words "positive"/"negative" would become features and leak
# the answer to every classifier trained on this matrix.
# NOTE(review): printed results elsewhere in this post presumably predate this
# fix; re-run the script to refresh them.
matrix <- create_matrix(
  coms[, 1],
  language = "english",
  removeStopwords = FALSE,
  removeNumbers = TRUE,
  stemWords = FALSE
)
Now we will convert the character labels to a factor and train the naive Bayes model on the training set (the first ten comments).
# train the model
# Densify the document-term matrix so rows can be sliced with plain indexing.
mat <- as.matrix(matrix)
# Fit naive Bayes on the ten training rows; the labels in column 2 of `coms`
# are converted from character to factor for the classifier.
# NOTE(review): e1071 documents that naiveBayes() treats numeric predictors as
# Gaussian, which may suit sparse term counts poorly -- the predictions printed
# in this post are all "negative". Confirm whether that is intended.
classifier <- naiveBayes(
  mat[1:10, ],
  as.factor(coms[1:10, 2])
)
The next step is to test the accuracy
# test the validity
# Score the five held-out rows (11-15) with the fitted naive Bayes model.
predicted <- predict(classifier, newdata = mat[11:15, ])
print(predicted)
## [1] negative negative negative negative negative
## Levels: negative positive
# Confusion table: true labels in rows, predicted labels in columns.
table(coms[11:15, 2], predicted)
## predicted
## negative positive
## negative 3 0
## positive 2 0
# Share of test comments whose predicted label matches the true label.
recall_accuracy(coms[11:15, 2], predicted)
## [1] 0.6
Now let’s try other methods !
# Bundle the document-term matrix with its labels and the train/test split.
# Labels are integer-coded through a factor (levels sort alphabetically, so
# negative = 1 and positive = 2). Rows 1-10 train, rows 11-15 test;
# virgin = FALSE because the test rows have known labels.
container <- create_container(
  matrix,
  as.numeric(as.factor(coms[, 2])),
  trainSize = 1:10,
  testSize = 11:15,
  virgin = FALSE
)
Then we will train the model with multiple machine learning techniques
# Fit one model per listed algorithm on the container's training rows.
models <- train_models(
  container,
  algorithms = c("MAXENT", "SVM", "RF", "BAGGING", "TREE")
)
Now, we can classify the testing set using the trained models.
# Label the container's test rows with every trained model; the result has a
# *_LABEL column per algorithm (e.g. "SVM_LABEL"), which is read below.
results <- classify_models(container, models)
How about the accuracy?
# accuracy table
# Cross-tabulate the true test labels (rows) against each model's predicted
# label (columns). The true labels are integer-coded through a factor, whose
# levels sort alphabetically: 1 = negative, 2 = positive.
table(as.numeric(as.factor(coms[11:15, 2])), results[,"FORESTS_LABEL"])
##
## 1 2
## 1 3 0
## 2 0 2
table(as.numeric(as.factor(coms[11:15, 2])), results[,"MAXENTROPY_LABEL"])
##
## 1 2
## 1 3 0
## 2 0 2
# recall accuracy
# One call per algorithm, each comparing the same integer-coded true labels
# with that algorithm's *_LABEL column of `results`.
recall_accuracy(as.numeric(as.factor(coms[11:15, 2])), results[,"FORESTS_LABEL"])
## [1] 1
recall_accuracy(as.numeric(as.factor(coms[11:15, 2])), results[,"MAXENTROPY_LABEL"])
## [1] 1
recall_accuracy(as.numeric(as.factor(coms[11:15, 2])), results[,"TREE_LABEL"])
## [1] 1
recall_accuracy(as.numeric(as.factor(coms[11:15, 2])), results[,"BAGGING_LABEL"])
## [1] 1
recall_accuracy(as.numeric(as.factor(coms[11:15, 2])), results[,"SVM_LABEL"])
## [1] 0.4
Now we will try to summarize the results by using summary
# model summary
# Combine the container (true labels) with the per-model predictions into an
# analytics object.
analytics <- create_analytics(container, results)
# Print the ensemble and per-algorithm performance summary.
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 1
## n >= 2 1.0 1
## n >= 3 1.0 1
## n >= 4 1.0 1
## n >= 5 0.4 1
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.250 0.335 0.285
## BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE
## 1.000 1.000 1.000
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 1.000 1.000 1.000
## TREE_PRECISION TREE_RECALL TREE_FSCORE
## 1.000 1.000 1.000
## MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
## 1.000 1.000 1.000
# Peek at the first rows of the per-document summary slot.
head(slot(analytics, "document_summary"))
## MAXENTROPY_LABEL MAXENTROPY_PROB SVM_LABEL SVM_PROB FORESTS_LABEL
## 1 2 0.6170683 1 0.7481528 2
## 2 2 0.9963882 1 0.8759441 2
## 3 1 0.9879639 2 0.5288148 1
## 4 1 0.7760961 1 0.5345409 1
## 5 1 0.8405701 1 0.5347869 1
## FORESTS_PROB BAGGING_LABEL BAGGING_PROB TREE_LABEL TREE_PROB MANUAL_CODE
## 1 0.605 2 0.84 2 1 2
## 2 0.695 2 0.80 2 1 2
## 3 0.590 1 0.72 1 1 1
## 4 0.645 1 0.76 1 1 1
## 5 0.615 1 0.76 1 1 1
## CONSENSUS_CODE CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE
## 1 2 4 0 2
## 2 2 4 0 2
## 3 1 4 0 1
## 4 1 5 0 1
## 5 1 5 0 1
## PROBABILITY_INCORRECT
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
# Print the ensemble summary slot (the coverage/recall table) on its own.
slot(analytics, "ensemble_summary")
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.0 1
## n >= 2 1.0 1
## n >= 3 1.0 1
## n >= 4 1.0 1
## n >= 5 0.4 1
To cross validate the results:
# Number of folds shared by every cross-validation run in this post.
N <- 4
# Seed the random number generator once so the runs that follow are repeatable.
set.seed(2014)
# N-fold cross-validation for maximum entropy, then for the decision tree.
cross_validate(container, nfold = N, algorithm = "MAXENT")
cross_validate(container, nfold = N, algorithm = "TREE")
## Fold 1 Out of Sample Accuracy = 0.6666667
## Fold 2 Out of Sample Accuracy = 1
## Fold 3 Out of Sample Accuracy = 1
## Fold 4 Out of Sample Accuracy = 1
## [[1]]
## [1] 0.6666667 1.0000000 1.0000000 1.0000000
##
## $meanAccuracy
## [1] 0.9166667
# N-fold cross-validation for the support vector machine.
cross_validate(container, nfold = N, algorithm = "SVM")
## Fold 1 Out of Sample Accuracy = 0.5
## Fold 2 Out of Sample Accuracy = 0.8
## Fold 3 Out of Sample Accuracy = 1
## Fold 4 Out of Sample Accuracy = 1
## [[1]]
## [1] 0.5 0.8 1.0 1.0
##
## $meanAccuracy
## [1] 0.825
# N-fold cross-validation for the random forest.
cross_validate(container, nfold = N, algorithm = "RF")
## Fold 1 Out of Sample Accuracy = 0.6
## Fold 2 Out of Sample Accuracy = 1
## Fold 3 Out of Sample Accuracy = 0.8
## Fold 4 Out of Sample Accuracy = 1
## [[1]]
## [1] 0.6 1.0 0.8 1.0
##
## $meanAccuracy
## [1] 0.85