Source: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
The data contains 500 positive and 500 negative sentences selected randomly from a larger collection of Yelp reviews. The score is either 1 (positive) or 0 (negative), indicating whether the sentence expresses positive or negative sentiment.
The original tab-delimited file did not load correctly into R. The file was opened in Excel, the “-” characters at the beginning of some sentences were deleted, and the file was saved as an .xlsx file.
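An alternative (a sketch, assuming the original file is the tab-delimited yelp_labelled.txt with no header row) would be to read it directly and turn off quote handling, which is a common cause of mis-parsed rows:
# not run: read the original tab-delimited file directly;
# quote = "" stops read.delim() from treating quotes/apostrophes inside
# sentences as string delimiters
yelp_raw <- read.delim("yelp_labelled.txt", header = FALSE, sep = "\t",
                       quote = "", stringsAsFactors = FALSE,
                       col.names = c("sentence", "score"))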
library(readxl)
## Warning: package 'readxl' was built under R version 3.3.3
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
## Loading required package: RColorBrewer
library(e1071)
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.3.3
yelp_labelled <- read_excel("yelp_labelled.xlsx")
yelp_labelled$score <- factor(yelp_labelled$score)
# Check the counts of positive and negative scores
table(yelp_labelled$score)
##
## 0 1
## 500 500
# Create a corpus from the sentences
yelp_corpus <- VCorpus(VectorSource(yelp_labelled$sentence))
# create a document-term sparse matrix directly from the corpus
yelp_dtm <- DocumentTermMatrix(yelp_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
# creating training and test datasets
yelp_dtm_train <- yelp_dtm[1:800, ]
yelp_dtm_test <- yelp_dtm[801:1000, ]
# also save the labels
yelp_train_labels <- yelp_labelled[1:800, ]$score
yelp_test_labels <- yelp_labelled[801:1000, ]$score
# check that the proportion of positive and negative sentences is similar
prop.table(table(yelp_train_labels))
## yelp_train_labels
## 0 1
## 0.435 0.565
prop.table(table(yelp_test_labels))
## yelp_test_labels
## 0 1
## 0.76 0.24
The proportions are not similar, so taking the first 800 rows does not give a representative split. We will need to use random sampling to create the training and test sets.
rm(yelp_dtm_train)
rm(yelp_dtm_test)
rm(yelp_train_labels)
rm(yelp_test_labels)
# Create random samples
set.seed(123)
train_index <- sample(1000, 800)
yelp_train <- yelp_labelled[train_index, ]
yelp_test <- yelp_labelled[-train_index, ]
# check the proportion of class variable
prop.table(table(yelp_train$score))
##
## 0 1
## 0.505 0.495
prop.table(table(yelp_test$score))
##
## 0 1
## 0.48 0.52
This is much better: the class proportions are now similar in the training and test sets.
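An alternative would be a stratified split, which guarantees matching class proportions (a sketch assuming the caret package is available; it was not used here):
library(caret)
# createDataPartition() samples within each level of score, so both sets
# keep roughly the 50/50 split of the full data
set.seed(123)
strat_index <- createDataPartition(yelp_labelled$score, p = 0.8, list = FALSE)
yelp_train_strat <- yelp_labelled[strat_index, ]
yelp_test_strat <- yelp_labelled[-strat_index, ]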
train_corpus <- VCorpus(VectorSource(yelp_train$sentence))
test_corpus <- VCorpus(VectorSource(yelp_test$sentence))
# subset the training data into positive and negative groups
positive <- subset(yelp_train, score == 1)
negative <- subset(yelp_train, score == 0)
wordcloud(positive$sentence, max.words = 40, scale = c(3, 0.5))
The word cloud contains words you would expect to see in a positive review: “good”, “great”, “best”. Note: this was run on the raw sentences, before stop words were removed, so “the” is one of the most frequent words.
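A sketch of how the cloud could be built after removing stop words (not part of the original run):
# build the positive-review cloud from a cleaned corpus so stop words
# such as "the" no longer dominate
positive_corpus <- VCorpus(VectorSource(positive$sentence))
positive_clean <- tm_map(positive_corpus, content_transformer(tolower))
positive_clean <- tm_map(positive_clean, removeWords, stopwords("english"))
wordcloud(positive_clean, max.words = 40, scale = c(3, 0.5))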
wordcloud(negative$sentence, max.words = 40, scale = c(3, 0.5))
The typical words for a negative review are there: “worst”, “bad”, “disappointed”. But words like “food” and “service” appear often in both classes.
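A rough check of that observation (a sketch using simple substring matching rather than the stemmed terms):
# how many training sentences in each class mention "food" or "service"?
table(yelp_train$score, grepl("food|service", tolower(yelp_train$sentence)))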
# create a document-term sparse matrix directly for train and test
train_dtm <- DocumentTermMatrix(train_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
test_dtm <- DocumentTermMatrix(test_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
train_dtm
## <<DocumentTermMatrix (documents: 800, terms: 1439)>>
## Non-/sparse entries: 4453/1146747
## Sparsity : 100%
## Maximal term length: 32
## Weighting : term frequency (tf)
test_dtm
## <<DocumentTermMatrix (documents: 200, terms: 530)>>
## Non-/sparse entries: 965/105035
## Sparsity : 99%
## Maximal term length: 13
## Weighting : term frequency (tf)
Since this is such a small dataset, sparse terms were not removed.
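If the vocabulary were larger, the rarest terms could be dropped with removeSparseTerms(); the 0.99 threshold below is only an illustrative value:
# not run: keep only terms that appear in at least ~1% of training documents
train_dtm_reduced <- removeSparseTerms(train_dtm, 0.99)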
# create a function to convert word counts to "Yes"/"No" indicators
convert_counts <- function(x) {
  # "Yes" if the term appears at least once in the document, "No" otherwise
  ifelse(x > 0, "Yes", "No")
}
# use apply() to run convert_counts() on each column of the train/test matrices
train_dtm_binary <- apply(train_dtm, MARGIN = 2, convert_counts)
test_dtm_binary <- apply(test_dtm, MARGIN = 2, convert_counts)
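A quick sanity check (not part of the original output) that the conversion produced only "Yes"/"No" values:
# every cell of the converted matrix should be either "Yes" or "No"
table(as.vector(train_dtm_binary))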
yelp_classifier <- naiveBayes(as.matrix(train_dtm_binary), yelp_train$score)
yelp_test_pred <- predict(yelp_classifier, as.matrix(test_dtm_binary))
head(yelp_test_pred)
## [1] 1 1 1 0 0 1
## Levels: 0 1
CrossTable(yelp_test_pred, yelp_test$score,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 200
##
##
## | actual
## predicted | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 79 | 20 | 99 |
## | 0.823 | 0.192 | |
## -------------|-----------|-----------|-----------|
## 1 | 17 | 84 | 101 |
## | 0.177 | 0.808 | |
## -------------|-----------|-----------|-----------|
## Column Total | 96 | 104 | 200 |
## | 0.480 | 0.520 | |
## -------------|-----------|-----------|-----------|
##
##
Accuracy = (79 + 84) / 200 = 0.815
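The accuracy can also be computed directly from the predictions (the same one-liner works for the later classifiers):
# proportion of test sentences classified correctly
mean(yelp_test_pred == yelp_test$score)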
yelp_classifier2 <- naiveBayes(as.matrix(train_dtm_binary), yelp_train$score, laplace = 1)
Try Laplace smoothing: some terms appear in only one class of the training data, so without smoothing their estimated conditional probability for the other class is zero, and a single such term can veto a prediction.
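To see the zero probabilities that smoothing addresses, one could inspect the conditional tables of the unsmoothed model (a sketch; “worst” is only an assumed example of a stemmed term that may occur in a single class):
# assumption: "worst" is in the training vocabulary and appears only in
# negative sentences; with laplace = 0 its "Yes" probability for the
# positive class is exactly 0
yelp_classifier$tables[["worst"]]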
yelp_test_pred2 <- predict(yelp_classifier2, as.matrix(test_dtm_binary))
CrossTable(yelp_test_pred2, yelp_test$score,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 200
##
##
## | actual
## predicted | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 79 | 24 | 103 |
## | 0.823 | 0.231 | |
## -------------|-----------|-----------|-----------|
## 1 | 17 | 80 | 97 |
## | 0.177 | 0.769 | |
## -------------|-----------|-----------|-----------|
## Column Total | 96 | 104 | 200 |
## | 0.480 | 0.520 | |
## -------------|-----------|-----------|-----------|
##
##
Accuracy = (79 + 80) / 200 = 0.795
The accuracy went down from 0.815 to 0.795.
yelp_classifier3 <- naiveBayes(as.matrix(train_dtm_binary), yelp_train$score, laplace = .5)
Try an intermediate value, laplace = 0.5.
yelp_test_pred3 <- predict(yelp_classifier3, as.matrix(test_dtm_binary))
CrossTable(yelp_test_pred3, yelp_test$score,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 200
##
##
## | actual
## predicted | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 79 | 19 | 98 |
## | 0.823 | 0.183 | |
## -------------|-----------|-----------|-----------|
## 1 | 17 | 85 | 102 |
## | 0.177 | 0.817 | |
## -------------|-----------|-----------|-----------|
## Column Total | 96 | 104 | 200 |
## | 0.480 | 0.520 | |
## -------------|-----------|-----------|-----------|
##
##
Accuracy = (79 + 85) / 200 = 0.820
The accuracy improved slightly over classifier 1 (no Laplace smoothing). The classifier misclassifies positive and negative sentences at roughly equal rates.
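A compact way to compare the three smoothing settings in one pass (a sketch using the objects defined above):
# re-fit the classifier for each Laplace value and compute test accuracy
laplace_values <- c(0, 0.5, 1)
accuracies <- sapply(laplace_values, function(lp) {
  model <- naiveBayes(as.matrix(train_dtm_binary), yelp_train$score, laplace = lp)
  pred <- predict(model, as.matrix(test_dtm_binary))
  mean(pred == yelp_test$score)
})
data.frame(laplace = laplace_values, accuracy = accuracies)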