These chunks are modified from the original link. This document is not my own work; anyone may copy this code for educational purposes only, provided the original sources are credited.
If the required packages are not installed, install them before loading the libraries by uncommenting (removing the #) the lines below:
#install.packages("RTextTools")
#install.packages("e1071")
Load the libraries:
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(e1071)
Create the positive text data set:
pos_tweets = rbind(
c('I love this car', 'positive'),
c('This view is amazing', 'positive'),
c('I feel great this morning', 'positive'),
c('I am so excited about the concert', 'positive'),
c('He is my best friend', 'positive')
)
Create the negative text data set:
neg_tweets = rbind(
c('I do not like this car', 'negative'),
c('This view is horrible', 'negative'),
c('I feel tired this morning', 'negative'),
c('I am not looking forward to the concert', 'negative'),
c('He is my enemy', 'negative')
)
Create the testing tweet data set:
test_tweets = rbind(
c('feel happy this morning', 'positive'),
c('larry friend', 'positive'),
c('not like that man', 'negative'),
c('house not great', 'negative'),
c('your song annoying', 'negative')
)
Combine the positive, negative, and testing data sets by row:
tweets = rbind(pos_tweets, neg_tweets, test_tweets)
Inspect the combined data by uncommenting the following lines:
#View(tweets)
#dim(tweets)
Build the document-term matrix (DTM):
# build dtm
matrix = create_matrix(tweets[,1], language="english",
                       removeStopwords=FALSE, removeNumbers=TRUE,
                       stemWords=FALSE)
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom tokenizer is
## ignored
These warnings are raised by newer versions of the tm package and do not affect this example. Before training the naive Bayes model, the response variable must be numeric or a factor, so convert the character class labels to factors with this chunk:
mat = as.matrix(matrix)
classifier = naiveBayes(mat[1:15,], as.factor(tweets[1:15,2]))
Note that 15 is the number of rows in the data set, which is derived when we execute the dim() function.
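A quick sanity check (a minimal sketch in base R):
nrow(tweets) # 15 rows: 5 positive + 5 negative + 5 testing tweets
dim(tweets)  # 15 rows, 2 columns (text and label)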
Make predictions using the trained classifier:
predicted = predict(classifier, mat[1:15,]); predicted
## [1] positive positive positive positive positive positive positive positive
## [9] negative positive positive positive negative positive negative
## Levels: negative positive
Construct the contingency table, which shows the false positive and false negative counts:
table(tweets[1:15, 2], predicted)
## predicted
## negative positive
## negative 3 5
## positive 0 7
Compute the recall accuracy of the model:
recall_accuracy(tweets[1:15, 2], predicted)
## [1] 0.6666667
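For intuition, recall_accuracy() here reduces to the proportion of labels predicted correctly; a minimal equivalent check in base R:
# proportion of predictions matching the manual labels
mean(as.character(predicted) == tweets[1:15, 2]) # 10 of 15 correct = 0.6666667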
First, build a container that specifies the response variable, the training set, and the testing set (virgin=FALSE indicates that the testing set is already labeled):
container = create_container(matrix, as.numeric(as.factor(tweets[,2])),
                             trainSize=1:10, testSize=11:15, virgin=FALSE)
Train the model using the random forest (RF) algorithm:
models_RF = train_models(container, algorithms="RF")
Classify the testing set with the trained model:
results_RF = classify_models(container, models_RF)
# accuracy table
table(as.numeric(as.factor(tweets[11:15, 2])), results_RF[,"FORESTS_LABEL"])
##
## 1 2
## 1 2 1
## 2 1 1
# recall accuracy
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results_RF[,"FORESTS_LABEL"])
## [1] 0.6
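Per-class recall and precision can also be read off the confusion table; a small sketch (tab_RF is just an illustrative name):
# rows of the table are actual classes, columns are predicted classes
tab_RF = table(as.numeric(as.factor(tweets[11:15, 2])), results_RF[,"FORESTS_LABEL"])
diag(tab_RF) / rowSums(tab_RF) # per-class recall
diag(tab_RF) / colSums(tab_RF) # per-class precision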
To summarize the model's validity more formally:
# model summary
analytics = create_analytics(container, results_RF)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1 0.6
##
##
## ALGORITHM PERFORMANCE
##
## FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 0.585 0.585 0.585
head(analytics@document_summary)
## FORESTS_LABEL FORESTS_PROB MANUAL_CODE CONSENSUS_CODE CONSENSUS_AGREE
## 1 1 0.510 2 1 1
## 2 2 0.595 2 2 1
## 3 1 0.710 1 1 1
## 4 1 0.540 1 1 1
## 5 2 0.520 1 2 1
## CONSENSUS_INCORRECT PROBABILITY_CODE PROBABILITY_INCORRECT
## 1 1 1 1
## 2 0 2 0
## 3 0 1 0
## 4 0 1 0
## 5 1 2 1
analytics@ensemble_summary
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1 0.6
To analyze the model with cross-validation (here four folds, with a seed set for reproducibility):
N=4
set.seed(2014)
cross_validate(container, N, "RF")
## Fold 1 Out of Sample Accuracy = 1
## Fold 2 Out of Sample Accuracy = 0.5
## Fold 3 Out of Sample Accuracy = 0.5
## Fold 4 Out of Sample Accuracy = 0.3333333
## [[1]]
## [1] 1.0000000 0.5000000 0.5000000 0.3333333
##
## $meanAccuracy
## [1] 0.5833333
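The return value of cross_validate() can also be captured for reuse; a minimal sketch (resetting the seed so the folds are reproducible):
set.seed(2014)
cv_RF = cross_validate(container, N, "RF")
cv_RF$meanAccuracy # mean out-of-sample accuracy across the N folds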
# rebuild the container, specifying the response variable, training set, and testing set
container = create_container(matrix, as.numeric(as.factor(tweets[,2])),
                             trainSize=1:10, testSize=11:15, virgin=FALSE)
Train models using multiple machine learning algorithms:
models = train_models(container, algorithms=c("SLDA", "SVM", "BAGGING", "TREE"))
results = classify_models(container, models)
# accuracy table
table(as.numeric(as.factor(tweets[11:15, 2])), results[,"SLDA_LABEL"])
##
## 1 2
## 1 1 2
## 2 0 2
table(as.numeric(as.factor(tweets[11:15, 2])), results[,"SVM_LABEL"])
##
## 1 2
## 1 2 1
## 2 2 0
table(as.numeric(as.factor(tweets[11:15, 2])), results[,"BAGGING_LABEL"])
##
## 1
## 1 3
## 2 2
table(as.numeric(as.factor(tweets[11:15, 2])), results[,"TREE_LABEL"])
##
## 1
## 1 3
## 2 2
# recall accuracy
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[,"SLDA_LABEL"])
## [1] 0.6
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[,"SVM_LABEL"])
## [1] 0.4
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[,"BAGGING_LABEL"])
## [1] 0.6
recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[,"TREE_LABEL"])
## [1] 0.6
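The four calls above can be condensed into a single loop over the label columns; a compact sketch:
# recall accuracy for every algorithm at once
labels = c("SLDA_LABEL", "SVM_LABEL", "BAGGING_LABEL", "TREE_LABEL")
sapply(labels, function(l)
  recall_accuracy(as.numeric(as.factor(tweets[11:15, 2])), results[, l]))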
# model summary
analytics = create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1 0.6
## n >= 2 1 0.6
## n >= 3 1 0.6
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE SLDA_PRECISION
## 0.250 0.335 0.285 0.750
## SLDA_RECALL SLDA_FSCORE BAGGING_PRECISION BAGGING_RECALL
## 0.665 0.585 0.300 0.500
## BAGGING_FSCORE TREE_PRECISION TREE_RECALL TREE_FSCORE
## 0.375 0.300 0.500 0.375
head(analytics@document_summary)
## SLDA_LABEL SLDA_PROB SVM_LABEL SVM_PROB BAGGING_LABEL BAGGING_PROB
## 1 2 0.6401045 1 0.5817286 1 0.52
## 2 2 0.5185177 1 0.8223025 1 0.56
## 3 1 0.6835875 2 0.9613499 1 0.88
## 4 2 0.5889474 1 0.6145932 1 0.64
## 5 2 0.5876664 1 0.5007519 1 0.56
## TREE_LABEL TREE_PROB MANUAL_CODE CONSENSUS_CODE CONSENSUS_AGREE
## 1 1 0.5 2 1 3
## 2 1 0.5 2 1 3
## 3 1 0.5 1 1 3
## 4 1 0.5 1 1 3
## 5 1 0.5 1 1 3
## CONSENSUS_INCORRECT PROBABILITY_CODE PROBABILITY_INCORRECT
## 1 1 2 0
## 2 1 1 1
## 3 0 2 1
## 4 0 1 0
## 5 0 2 1
analytics@ensemble_summary
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1 0.6
## n >= 2 1 0.6
## n >= 3 1 0.6
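In this summary, coverage is the share of documents on which at least n algorithms agree, and recall is the accuracy on that covered subset. A hand computation from the document summary (a sketch, assuming the CONSENSUS_* columns shown above):
ds = analytics@document_summary
n = 3
covered = ds$CONSENSUS_AGREE >= n
mean(covered)                              # n >= 3 coverage
mean(ds$CONSENSUS_INCORRECT[covered] == 0) # n >= 3 recall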
Using the master data from GitHub: set the working directory to where the data set is placed:
setwd("~/Documents/2022/14 - Twitter-Sentimental-Analysis-master")
Load the data sets:
happy = readLines("./happy.txt")
sad = readLines("./sad.txt")
happy_test = readLines("./happy_test.txt")
sad_test = readLines("./sad_test.txt")
Pre-process the data:
tweet = c(happy, sad)
tweet_test= c(happy_test, sad_test)
tweet_all = c(tweet, tweet_test)
sentiment = c(rep("happy", length(happy) ),
rep("sad", length(sad)))
sentiment_test = c(rep("happy", length(happy_test) ),
rep("sad", length(sad_test)))
sentiment_all = as.factor(c(sentiment, sentiment_test))
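A quick sanity check on the assembled vectors (a sketch; the expected counts follow from the index ranges used below, i.e. 160 training and 20 testing texts):
length(tweet_all)    # 180 texts in total (160 training + 20 testing)
table(sentiment_all) # class counts across both splits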
Load the library and build the TF-IDF-weighted document-term matrix:
library(RTextTools)
mat = create_matrix(tweet_all, language="english",
                    removeStopwords=FALSE, removeNumbers=TRUE,
                    stemWords=FALSE, weighting=tm::weightTfIdf)
mat = as.matrix(mat)
classifier = naiveBayes(mat[1:160,], as.factor(sentiment_all[1:160]))
predicted = predict(classifier, mat[161:180,]); predicted
## [1] sad happy sad happy happy sad happy sad happy sad sad sad
## [13] sad sad sad sad sad sad happy happy
## Levels: happy sad
table(sentiment_test, predicted)
## predicted
## sentiment_test happy sad
## happy 5 5
## sad 2 8
recall_accuracy(sentiment_test, predicted)
## [1] 0.65
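The classifier can also return posterior class probabilities rather than hard labels, via e1071's type = "raw" option; a minimal sketch:
# per-class posterior probabilities for the testing rows
probs = predict(classifier, mat[161:180,], type = "raw")
head(probs)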
mat = create_matrix(tweet_all, language="english",
                    removeStopwords=FALSE, removeNumbers=TRUE,
                    stemWords=FALSE, weighting=tm::weightTfIdf)
container = create_container(mat, as.numeric(sentiment_all),
                             trainSize=1:160, testSize=161:180, virgin=FALSE) # removeSparseTerms is another create_matrix option worth exploring
models = train_models(container,
                      algorithms=c("SVM", "SLDA", "BAGGING", "RF", "TREE"))
Test the models:
results = classify_models(container, models)
table(as.numeric(sentiment_all[161:180]), results[,"FORESTS_LABEL"])
##
## 1 2
## 1 10 0
## 2 1 9
recall_accuracy(as.numeric(sentiment_all[161:180]), results[,"FORESTS_LABEL"])
## [1] 0.95
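The same sapply() pattern as before extends to all five algorithms trained here:
# recall accuracy for each algorithm's label column
labels = c("SVM_LABEL", "SLDA_LABEL", "BAGGING_LABEL", "FORESTS_LABEL", "TREE_LABEL")
sapply(labels, function(l)
  recall_accuracy(as.numeric(sentiment_all[161:180]), results[, l]))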
Get the formal results and run cross-validation:
# formal tests
analytics = create_analytics(container, results)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.00 0.95
## n >= 2 1.00 0.95
## n >= 3 1.00 0.95
## n >= 4 1.00 0.95
## n >= 5 0.95 1.00
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE SLDA_PRECISION
## 0.955 0.950 0.950 0.955
## SLDA_RECALL SLDA_FSCORE BAGGING_PRECISION BAGGING_RECALL
## 0.950 0.950 0.955 0.950
## BAGGING_FSCORE FORESTS_PRECISION FORESTS_RECALL FORESTS_FSCORE
## 0.950 0.955 0.950 0.950
## TREE_PRECISION TREE_RECALL TREE_FSCORE
## 1.000 1.000 1.000
head(analytics@algorithm_summary)
## SVM_PRECISION SVM_RECALL SVM_FSCORE SLDA_PRECISION SLDA_RECALL SLDA_FSCORE
## 1 0.91 1.0 0.95 0.91 1.0 0.95
## 2 1.00 0.9 0.95 1.00 0.9 0.95
## BAGGING_PRECISION BAGGING_RECALL BAGGING_FSCORE FORESTS_PRECISION
## 1 0.91 1.0 0.95 0.91
## 2 1.00 0.9 0.95 1.00
## FORESTS_RECALL FORESTS_FSCORE TREE_PRECISION TREE_RECALL TREE_FSCORE
## 1 1.0 0.95 1 1 1
## 2 0.9 0.95 1 1 1
head(analytics@label_summary)
## NUM_MANUALLY_CODED NUM_CONSENSUS_CODED NUM_PROBABILITY_CODED
## 1 10 11 11
## 2 10 9 9
## PCT_CONSENSUS_CODED PCT_PROBABILITY_CODED PCT_CORRECTLY_CODED_CONSENSUS
## 1 110 110 100
## 2 90 90 90
## PCT_CORRECTLY_CODED_PROBABILITY
## 1 100
## 2 90
head(analytics@document_summary)
## SVM_LABEL SVM_PROB SLDA_LABEL SLDA_PROB BAGGING_LABEL BAGGING_PROB
## 1 1 0.9998395 1 1 1 1.00
## 2 1 0.9997711 1 1 1 1.00
## 3 1 0.9868957 1 1 1 1.00
## 4 1 0.9872106 1 1 1 1.00
## 5 1 0.9921540 1 1 1 1.00
## 6 1 0.9723660 1 1 1 0.96
## FORESTS_LABEL FORESTS_PROB TREE_LABEL TREE_PROB MANUAL_CODE CONSENSUS_CODE
## 1 1 0.910 1 1 1 1
## 2 1 0.920 1 1 1 1
## 3 1 0.925 1 1 1 1
## 4 1 0.940 1 1 1 1
## 5 1 0.950 1 1 1 1
## 6 1 0.720 1 1 1 1
## CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE PROBABILITY_INCORRECT
## 1 5 0 1 0
## 2 5 0 1 0
## 3 5 0 1 0
## 4 5 0 1 0
## 5 5 0 1 0
## 6 5 0 1 0
analytics@ensemble_summary # Ensemble Agreement
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.00 0.95
## n >= 2 1.00 0.95
## n >= 3 1.00 0.95
## n >= 4 1.00 0.95
## n >= 5 0.95 1.00
# Cross Validation
N=3
cross_SVM = cross_validate(container, N, "SVM")
## Fold 1 Out of Sample Accuracy = 0.9137931
## Fold 2 Out of Sample Accuracy = 0.9565217
## Fold 3 Out of Sample Accuracy = 1
cross_GLMNET = cross_validate(container, N, "GLMNET")
## Fold 1 Out of Sample Accuracy = 0.8378378
## Fold 2 Out of Sample Accuracy = 1.153846
## Fold 3 Out of Sample Accuracy = 1.314815
Note that out-of-sample accuracies above 1 are not meaningful; they stem from a quirk in how RTextTools' cross_validate() computes fold accuracy, so the GLMNET figures here should be treated with caution.