if(!require("textmineR"))install.packages("textmineR")
## Loading required package: textmineR
## Loading required package: Matrix
##
## Attaching package: 'textmineR'
## The following object is masked from 'package:Matrix':
##
## update
## The following object is masked from 'package:stats':
##
## update
if(!require("e1071"))install.packages("e1071")
## Loading required package: e1071
if(!require("dplyr"))install.packages("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if(!require("SnowballC"))install.packages("SnowballC")
## Loading required package: SnowballC
if(!require("tm"))install.packages("tm")
## Loading required package: tm
## Loading required package: NLP
library(textmineR)
library(e1071)
library(dplyr)
library(SnowballC)
library(tm)
tweets<- "'I love this car', 'positive',
'This view is amazing', 'positive',
'I feel great this morning', 'positive',
'I like her as she is kind', 'positive',
'He is my best friend', 'positive'
'I do not like this car', 'negative',
'This view is horrible', 'negative',
'I feel tired this morning', 'negative',
'I am not looking forward to the concert', 'negative',
'He is my. enemy', 'negative'
'feel happy this morning', 'positive',
'larry friend', 'positive',
'Dogs are kind', 'positive',
'house not great', 'negative',
'your song annoying', 'negative'"
tweets
## [1] "'I love this car', 'positive',\n'This view is amazing', 'positive',\n'I feel great this morning', 'positive',\n'I like her as she is kind', 'positive',\n'He is my best friend', 'positive'\n'I do not like this car', 'negative',\n'This view is horrible', 'negative',\n'I feel tired this morning', 'negative',\n'I am not looking forward to the concert', 'negative',\n'He is my. enemy', 'negative'\n'feel happy this morning', 'positive',\n'larry friend', 'positive',\n'Dogs are kind', 'positive',\n'house not great', 'negative',\n'your song annoying', 'negative'"
tweets<-gsub("'","", tweets)
tweets
## [1] "I love this car, positive,\nThis view is amazing, positive,\nI feel great this morning, positive,\nI like her as she is kind, positive,\nHe is my best friend, positive\nI do not like this car, negative,\nThis view is horrible, negative,\nI feel tired this morning, negative,\nI am not looking forward to the concert, negative,\nHe is my. enemy, negative\nfeel happy this morning, positive,\nlarry friend, positive,\nDogs are kind, positive,\nhouse not great, negative,\nyour song annoying, negative"
fileConn<-file("tweets.txt")
writeLines(tweets, fileConn)
close(fileConn)
# the trailing comma after most sentiment labels creates a third, empty column
tweets_df<- read.csv("tweets.txt",header = FALSE,col.names = c("tweets","Sentiment",""))
# keep only the tweet text and the sentiment label
tweets_df<- subset(tweets_df, select = c(1,2) )
tweets_df
tweets_text_vs <- VectorSource(tweets_df$tweets)
tweets_corpus <-Corpus(tweets_text_vs)
#inspect(tweets_corpus)
tweets_clean <-tm_map(tweets_corpus,tolower)
## Warning in tm_map.SimpleCorpus(tweets_corpus, tolower): transformation drops
## documents
tweets_clean <-tm_map(tweets_clean,removePunctuation)
## Warning in tm_map.SimpleCorpus(tweets_clean, removePunctuation): transformation
## drops documents
tweets_clean <-tm_map(tweets_clean,stripWhitespace)
## Warning in tm_map.SimpleCorpus(tweets_clean, stripWhitespace): transformation
## drops documents
inspect(tweets_clean)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 15
##
## [1] i love this car
## [2] this view is amazing
## [3] i feel great this morning
## [4] i like her as she is kind
## [5] he is my best friend
## [6] i do not like this car
## [7] this view is horrible
## [8] i feel tired this morning
## [9] i am not looking forward to the concert
## [10] he is my enemy
## [11] feel happy this morning
## [12] larry friend
## [13] dogs are kind
## [14] house not great
## [15] your song annoying
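The warnings above come from calling tm_map() on a SimpleCorpus, which is what Corpus() returns for a VectorSource. A minimal sketch of a quieter alternative, assuming the same tm version: build a VCorpus and wrap base functions such as tolower in content_transformer(). The names tweets_vcorpus and tweets_vclean are illustrative.
tweets_vcorpus <- VCorpus(VectorSource(tweets_df$tweets))
tweets_vclean <- tm_map(tweets_vcorpus, content_transformer(tolower))
tweets_vclean <- tm_map(tweets_vclean, removePunctuation)
tweets_vclean <- tm_map(tweets_vclean, stripWhitespace)
as.character(tweets_vclean[[1]])   # should print "i love this car"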
Approach 1: Using the tm_map function
getTweetsMatrix <- function(tweets_df){
tweets_text_vs <- VectorSource(tweets_df$tweets)
tweets_corpus <-Corpus(tweets_text_vs)
#remove numbers from text
tweets_clean <-tm_map(tweets_corpus,removeNumbers)
# remove Punctuation marks like ?, . etc
tweets_clean <-tm_map(tweets_clean,removePunctuation)
# optionally stem words to their root form (disabled here)
#tweets_clean <-tm_map(tweets_clean,stemDocument)
# remove stop words
tweets_clean <-tm_map(tweets_clean,removeWords,stopwords())
# to remove extra white space characters
tweets_clean <-tm_map(tweets_clean,stripWhitespace)
return(DocumentTermMatrix(tweets_clean))
}
tweets_dtm <- getTweetsMatrix(tweets_df)
## Warning in tm_map.SimpleCorpus(tweets_corpus, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(tweets_clean, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(tweets_clean, removeWords, stopwords()):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(tweets_clean, stripWhitespace): transformation
## drops documents
inspect(tweets_dtm)
## <<DocumentTermMatrix (documents: 15, terms: 24)>>
## Non-/sparse entries: 35/325
## Sparsity : 90%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs car feel friend great kind like love morning this view
## 1 1 0 0 0 0 0 1 0 0 0
## 11 0 1 0 0 0 0 0 1 0 0
## 2 0 0 0 0 0 0 0 0 1 1
## 3 0 1 0 1 0 0 0 1 0 0
## 4 0 0 0 0 1 1 0 0 0 0
## 5 0 0 1 0 0 0 0 0 0 0
## 6 1 0 0 0 0 1 0 0 0 0
## 7 0 0 0 0 0 0 0 0 1 1
## 8 0 1 0 0 0 0 0 1 0 0
## 9 0 0 0 0 0 0 0 0 0 0
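To sanity-check the matrix before modelling, tm provides findFreqTerms(); the lowfreq = 2 threshold below is an arbitrary illustrative choice.
findFreqTerms(tweets_dtm, lowfreq = 2)                   # terms used in at least two tweets
sort(colSums(as.matrix(tweets_dtm)), decreasing = TRUE)  # overall term frequencies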
Approach 2: Using a control list with DocumentTermMatrix
tweets_dtm2<- DocumentTermMatrix(tweets_corpus,control=list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removePunctuation = TRUE,
stripWhitespace = TRUE
))
inspect(tweets_dtm2)
## <<DocumentTermMatrix (documents: 15, terms: 23)>>
## Non-/sparse entries: 33/312
## Sparsity : 90%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs amazing car feel friend great kind like love morning view
## 1 0 1 0 0 0 0 0 1 0 0
## 11 0 0 1 0 0 0 0 0 1 0
## 2 1 0 0 0 0 0 0 0 0 1
## 3 0 0 1 0 1 0 0 0 1 0
## 4 0 0 0 0 0 1 1 0 0 0
## 5 0 0 0 1 0 0 0 0 0 0
## 6 0 1 0 0 0 0 1 0 0 0
## 7 0 0 0 0 0 0 0 0 0 1
## 8 0 0 1 0 0 0 0 0 1 0
## 9 0 0 0 0 0 0 0 0 0 0
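The control list can also change the weighting scheme. A sketch, assuming tm's weightTfIdf function (the object name tweets_dtm_tfidf is illustrative):
tweets_dtm_tfidf<- DocumentTermMatrix(tweets_corpus,control=list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removePunctuation = TRUE,
weighting = weightTfIdf
))
inspect(tweets_dtm_tfidf)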
Approach 3: Using textmineR::CreateDtm
textminer_dtm <- function(text, doc_id){
dtm <- textmineR::CreateDtm(doc_vec = text,
doc_names = doc_id,
stopword_vec = c(stopwords::stopwords("en")),
remove_punctuation = TRUE,
remove_numbers = TRUE,
verbose = FALSE,
cpus = 1)
}
# tweets_df has no doc_id column, so CreateDtm warns and assigns 1:length(doc_vec) as names
dtm_miner <- textminer_dtm(tweets_df$tweets, tweets_df$doc_id)
## Warning in textmineR::CreateDtm(doc_vec = text, doc_names = doc_id,
## stopword_vec = c(stopwords::stopwords("en")), : No document names detected.
## Assigning 1:length(doc_vec) as names.
dtm_miner
## 15 x 23 sparse Matrix of class "dgCMatrix"
## [[ suppressing 23 column names 'amazing', 'annoying', 'best' ... ]]
##
## 1 . . . . . . . . . . . . 1 . . 1 . . . . . . .
## 2 1 . . . . . . . . . . . . . . . . . . . 1 . .
## 3 . . . . . . . . . . . . . . . . . 1 . . . 1 1
## 4 . . . . . . . . . . . . . . . . . . 1 1 . . .
## 5 . . 1 . . . . . . . . . . . . . 1 . . . . . .
## 6 . . . . . . . . . . . . . . . 1 . . . 1 . . .
## 7 . . . . . . . . 1 . . . . . . . . . . . 1 . .
## 8 . . . . . . . . . . . . . . 1 . . . . . . 1 1
## 9 . . . 1 . . 1 . . . . 1 . . . . . . . . . . .
## 10 . . . . . 1 . . . . . . . . . . . . . . . . .
## 11 . . . . . . . 1 . . . . . . . . . . . . . 1 1
## 12 . . . . . . . . . . 1 . . . . . 1 . . . . . .
## 13 . . . . 1 . . . . . . . . . . . . . 1 . . . .
## 14 . . . . . . . . . 1 . . . . . . . 1 . . . . .
## 15 . 1 . . . . . . . . . . . 1 . . . . . . . . .
str(dtm_miner)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:33] 1 14 4 8 12 9 8 10 6 13 ...
## ..@ p : int [1:24] 0 1 2 3 4 5 6 7 8 9 ...
## ..@ Dim : int [1:2] 15 23
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:15] "1" "2" "3" "4" ...
## .. ..$ : chr [1:23] "amazing" "annoying" "best" "concert" ...
## ..@ x : num [1:33] 1 1 1 1 1 1 1 1 1 1 ...
## ..@ factors : list()
## ..$ args:List of 8
## .. ..$ doc_names : NULL
## .. ..$ ngram_window : num [1:2] 1 1
## .. ..$ stopword_vec : chr [1:150] "i" "me" "my" "myself" ...
## .. ..$ lower : logi TRUE
## .. ..$ remove_punctuation : logi TRUE
## .. ..$ remove_numbers : logi TRUE
## .. ..$ stem_lemma_function: NULL
## .. ..$ verbose : logi FALSE
## ..$ call: chr "CreateDtm"
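The warning about missing document names can be avoided by passing an explicit id vector; the tweet_ prefix below is just an illustrative naming choice.
tweet_ids <- paste0("tweet_", seq_len(nrow(tweets_df)))
dtm_named <- textminer_dtm(tweets_df$tweets, tweet_ids)
rownames(dtm_named)   # should be "tweet_1" ... "tweet_15" instead of "1" ... "15"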
tweets_dataset <- data.frame(tweets_df$Sentiment, as.matrix(dtm_miner))
colnames(tweets_dataset)
## [1] "tweets_df.Sentiment" "amazing" "annoying"
## [4] "best" "concert" "dogs"
## [7] "enemy" "forward" "happy"
## [10] "horrible" "house" "larry"
## [13] "looking" "love" "song"
## [16] "tired" "car" "friend"
## [19] "great" "kind" "like"
## [22] "view" "feel" "morning"
names(tweets_dataset)[1]<-"sentiment"
colnames(tweets_dataset)
## [1] "sentiment" "amazing" "annoying" "best" "concert" "dogs"
## [7] "enemy" "forward" "happy" "horrible" "house" "larry"
## [13] "looking" "love" "song" "tired" "car" "friend"
## [19] "great" "kind" "like" "view" "feel" "morning"
tweets_dataset$sentiment<-factor(tweets_dataset$sentiment)
#str(tweets_dataset)
tweets_dataset
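e1071's naiveBayes() treats numeric columns as Gaussian, which is why the model below reports means and standard deviations rather than word probabilities. A common alternative, sketched here (the helper convert_counts and the object tweets_dataset_bin are illustrative, not part of the original workflow), recodes counts as presence/absence factors:
convert_counts <- function(x) factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
tweets_dataset_bin <- tweets_dataset
tweets_dataset_bin[ , -1] <- lapply(tweets_dataset_bin[ , -1], convert_counts)
str(tweets_dataset_bin$love)   # factor with levels "No"/"Yes"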
Split into training (first 10 rows) and test (last 5 rows) sets
training_data = tweets_dataset[1:10,]
testing_data = tweets_dataset[11:15,]
dim(training_data)
## [1] 10 24
dim(testing_data)
## [1] 5 24
training_data
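The fixed row split above keeps the five extra tweets as the test set. A sketch of a random, reproducible split as an alternative (train_idx, training_random and testing_random are illustrative names):
set.seed(123)
train_idx <- sample(seq_len(nrow(tweets_dataset)), size = 10)
training_random <- tweets_dataset[train_idx, ]
testing_random <- tweets_dataset[-train_idx, ]
dim(training_random)
dim(testing_random)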
if(!require("e1071"))install.packages("e1071")
library(e1071)
model <- naiveBayes(sentiment ~ ., data = training_data)
(model)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## negative positive
## 0.5 0.5
##
## Conditional probabilities:
## amazing
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## annoying
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## best
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## concert
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## dogs
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## enemy
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## forward
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## happy
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## horrible
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## house
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## larry
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## looking
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## love
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## song
## Y [,1] [,2]
## negative 0 0
## positive 0 0
##
## tired
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.0 0.0000000
##
## car
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
##
## friend
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## great
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## kind
## Y [,1] [,2]
## negative 0.0 0.0000000
## positive 0.2 0.4472136
##
## like
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
##
## view
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
##
## feel
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
##
## morning
## Y [,1] [,2]
## negative 0.2 0.4472136
## positive 0.2 0.4472136
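Note that the laplace argument of naiveBayes() only smooths factor predictors, so it has no effect on the numeric counts used above. A hedged sketch combining it with the "Yes"/"No" recoding sketched earlier (model_bin is an illustrative name):
model_bin <- naiveBayes(sentiment ~ ., data = tweets_dataset_bin[1:10, ], laplace = 1)
predict(model_bin, newdata = tweets_dataset_bin[11:15, -1])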
#y_pred <- predict(model, newdata = testing_data[ , names(testing_data) != "sentiment"])
y_pred <- predict(model, newdata = testing_data[-1])
#option1: Generate Confusion matrix manually by comparing Actual vs predicted values for Sentiment
table(y_pred, testing_data$sentiment,dnn=c("Prediction","Actual"))
## Actual
## Prediction negative positive
## negative 1 1
## positive 1 2
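The overall accuracy can also be read off directly; from the table above it should be (1 + 2) / 5 = 0.6.
mean(y_pred == testing_data$sentiment)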
if(!require('caret'))install.packages('caret')
## Loading required package: caret
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: lattice
library(caret)
cfm <- confusionMatrix(y_pred, testing_data$sentiment)
cfm
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 1 1
## positive 1 2
##
## Accuracy : 0.6
## 95% CI : (0.1466, 0.9473)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.6826
##
## Kappa : 0.1667
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5000
## Specificity : 0.6667
## Pos Pred Value : 0.5000
## Neg Pred Value : 0.6667
## Prevalence : 0.4000
## Detection Rate : 0.2000
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.5833
##
## 'Positive' Class : negative
##
A plot of the confusion matrix
if(!require('ggplot2'))install.packages('ggplot2')
if(!require('scales'))install.packages('scales')
## Loading required package: scales
library(ggplot2)
library(scales)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
p <-
ggplot(data = as.data.frame(m$table) ,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "steelblue") +
geom_text(aes(x = Reference, y = Prediction, label = Freq)) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
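One caveat with filling tiles by log(Freq): an empty cell gives log(0) = -Inf and a missing fill value. A sketch of a variant that fills by the raw counts instead (ggplotConfusionMatrix2 is an illustrative name):
ggplotConfusionMatrix2 <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
ggplot(data = as.data.frame(m$table),
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = Freq), colour = "white") +
scale_fill_gradient(low = "white", high = "steelblue") +
geom_text(aes(label = Freq)) +
theme(legend.position = "none") +
ggtitle(mytitle)
}
ggplotConfusionMatrix2(cfm)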