Step 1: Data

Source: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

The data contain 500 positive and 500 negative sentences selected at random from larger datasets of Yelp reviews. Each sentence is labelled with a score of 1 (positive) or 0 (negative) indicating its sentiment.

The original tab-delimited file did not load correctly into R. The file was opened in Excel, the “-” characters at the beginning of some sentences were deleted, and the file was saved as an .xlsx file.
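
As an alternative to the Excel round trip, the raw tab-delimited file can usually be read directly if quote interpretation is turned off. A minimal sketch, assuming the standard UCI file name yelp_labelled.txt:

# read the raw file directly; quote = "" keeps stray quotation marks
# inside sentences from breaking the parse
yelp_raw <- read.delim("yelp_labelled.txt", header = FALSE, quote = "",
                       stringsAsFactors = FALSE,
                       col.names = c("sentence", "score"))
# strip a leading "-" from sentences, mirroring the manual Excel fix
yelp_raw$sentence <- sub("^-", "", yelp_raw$sentence)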

library(readxl)
## Warning: package 'readxl' was built under R version 3.3.3
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
## Loading required package: RColorBrewer
library(e1071)
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.3.3

Step 2: Exploring and preparing the data

yelp_labelled <- read_excel("yelp_labelled.xlsx")
yelp_labelled$score <- factor(yelp_labelled$score)

# Check the counts of positive and negative scores
table(yelp_labelled$score)
## 
##   0   1 
## 500 500
# Create a corpus from the sentences
yelp_corpus <- VCorpus(VectorSource(yelp_labelled$sentence))
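
A quick sanity check that the corpus was built correctly (a sketch; prints the raw text of the first two documents):

# peek at the first two documents in the corpus
lapply(yelp_corpus[1:2], as.character)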

# create a document-term sparse matrix directly from the corpus
yelp_dtm <- DocumentTermMatrix(yelp_corpus, control = list(
  tolower = TRUE,           # lower-case all text
  removeNumbers = TRUE,     # drop digits
  stopwords = TRUE,         # remove common English stop words
  removePunctuation = TRUE, # strip punctuation
  stemming = TRUE           # reduce words to their stems
))

# creating training and test datasets
yelp_dtm_train <- yelp_dtm[1:800, ]
yelp_dtm_test  <- yelp_dtm[801:1000, ]

# also save the labels
yelp_train_labels <- yelp_labelled[1:800, ]$score
yelp_test_labels  <- yelp_labelled[801:1000, ]$score

# check that the class proportions are similar in the two splits
prop.table(table(yelp_train_labels))
## yelp_train_labels
##     0     1 
## 0.435 0.565
prop.table(table(yelp_test_labels))
## yelp_test_labels
##    0    1 
## 0.76 0.24

The proportions are quite different: the rows of the file are evidently not in random order, so splitting by position does not work. We will need to use random sampling.

rm(yelp_dtm_train, yelp_dtm_test, yelp_train_labels, yelp_test_labels)

# Create random samples
set.seed(123)
train_index <- sample(1000, 800)

yelp_train <- yelp_labelled[train_index, ]
yelp_test  <- yelp_labelled[-train_index, ]

# check the proportion of class variable
prop.table(table(yelp_train$score))
## 
##     0     1 
## 0.505 0.495
prop.table(table(yelp_test$score))
## 
##    0    1 
## 0.48 0.52

This is much better; both splits are now close to balanced.

train_corpus <- VCorpus(VectorSource(yelp_train$sentence))
test_corpus <- VCorpus(VectorSource(yelp_test$sentence))

Word Cloud Visualization

# subset the training data into positive and negative groups
positive <- subset(yelp_train, score == 1)
negative  <- subset(yelp_train, score == 0)

wordcloud(positive$sentence, max.words = 40, scale = c(3, 0.5))

In the word cloud are words you would expect to see in a positive review: “good”, “great”, “best”. Note that the cloud was generated from the raw sentences, before stop-word removal, so “the” is among the most frequent words.
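
If we wanted the cloud itself to exclude stop words, the sentences could be cleaned first. A sketch using tm’s transformations on the positive subset:

# clean the positive sentences before plotting so stop words such as
# "the" drop out of the cloud
pos_corpus <- VCorpus(VectorSource(positive$sentence))
pos_corpus <- tm_map(pos_corpus, content_transformer(tolower))
pos_corpus <- tm_map(pos_corpus, removeWords, stopwords("english"))
wordcloud(pos_corpus, max.words = 40, scale = c(3, 0.5))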

wordcloud(negative$sentence, max.words = 40, scale = c(3, 0.5))

The words typical of a negative review are there: “worst”, “bad”, “disappointed”. But words such as “food” and “service” appear frequently in both classes, so they carry little signal on their own.

# create a document-term sparse matrix directly for train and test
train_dtm <- DocumentTermMatrix(train_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))

test_dtm <- DocumentTermMatrix(test_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))

train_dtm
## <<DocumentTermMatrix (documents: 800, terms: 1439)>>
## Non-/sparse entries: 4453/1146747
## Sparsity           : 100%
## Maximal term length: 32
## Weighting          : term frequency (tf)
test_dtm
## <<DocumentTermMatrix (documents: 200, terms: 530)>>
## Non-/sparse entries: 965/105035
## Sparsity           : 99%
## Maximal term length: 13
## Weighting          : term frequency (tf)

Since this is such a small dataset, sparse terms were not removed.
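
Note that the two matrices are built from different vocabularies (1439 terms in train vs. 530 in test). predict() for naiveBayes matches term columns by name and ignores the rest, so the mismatch is tolerated, but the matrices can be aligned explicitly by restricting the test matrix to the training vocabulary. A sketch:

# build the test matrix over the training vocabulary only
test_dtm_aligned <- DocumentTermMatrix(test_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE,
  dictionary = Terms(train_dtm)
))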

# create function to convert counts to "Yes"/"No" indicators
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}

# apply() convert_counts() to columns of train/test data
train_dtm_binary <- apply(train_dtm, MARGIN = 2, convert_counts)
test_dtm_binary  <- apply(test_dtm, MARGIN = 2, convert_counts)
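
A quick look at a corner of the converted matrix confirms the encoding (a sketch; the first few columns are arbitrary):

# inspect a few cells to verify that counts became "Yes"/"No"
train_dtm_binary[1:3, 1:5]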

Step 3: Training a model on the data

yelp_classifier <- naiveBayes(as.matrix(train_dtm_binary), yelp_train$score)
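
The fitted object stores the class counts and one conditional table per term, which can be inspected directly (a sketch; the first term is arbitrary):

# class counts for the training labels
yelp_classifier$apriori
# conditional distribution of the first term given each class
yelp_classifier$tables[[1]]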

Step 4: Evaluating model performance

yelp_test_pred <- predict(yelp_classifier, as.matrix(test_dtm_binary))
head(yelp_test_pred)
## [1] 1 1 1 0 0 1
## Levels: 0 1
CrossTable(yelp_test_pred, yelp_test$score,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  200 
## 
##  
##              | actual 
##    predicted |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        79 |        20 |        99 | 
##              |     0.823 |     0.192 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        17 |        84 |       101 | 
##              |     0.177 |     0.808 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        96 |       104 |       200 | 
##              |     0.480 |     0.520 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
Accuracy = (79 + 84) / 200 = 0.815
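
The same figure can be computed directly from the predictions:

# fraction of test sentences where the prediction matches the label
mean(yelp_test_pred == yelp_test$score)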

Step 5: Improving model performance

Laplace smoothing adds a small count to every term, so a term that never occurs with one of the classes in the training data does not force that class’s probability to zero. Retrain with laplace = 1:

yelp_classifier2 <- naiveBayes(as.matrix(train_dtm_binary), yelp_train$score, laplace = 1)
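
To see what smoothing changes, compare the conditional table of a single term between the two models (a sketch; the first term is arbitrary). Note that e1071’s predict() also clamps zero cells to a small threshold by default, which is why the unsmoothed model still produced sensible predictions.

# without smoothing, a term absent from one class gets probability 0;
# with laplace = 1 every cell is pulled away from 0
yelp_classifier$tables[[1]]
yelp_classifier2$tables[[1]]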

yelp_test_pred2 <- predict(yelp_classifier2, as.matrix(test_dtm_binary))

CrossTable(yelp_test_pred2, yelp_test$score,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  200 
## 
##  
##              | actual 
##    predicted |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        79 |        24 |       103 | 
##              |     0.823 |     0.231 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        17 |        80 |        97 | 
##              |     0.177 |     0.769 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        96 |       104 |       200 | 
##              |     0.480 |     0.520 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Accuracy = (79 + 80) / 200 = 0.795
The accuracy went down slightly, likely because laplace = 1 adds a relatively large pseudo-count compared with the small per-term counts in this dataset.

Try a smaller smoothing value, laplace = 0.5:

yelp_classifier3 <- naiveBayes(as.matrix(train_dtm_binary), yelp_train$score, laplace = 0.5)

yelp_test_pred3 <- predict(yelp_classifier3, as.matrix(test_dtm_binary))

CrossTable(yelp_test_pred3, yelp_test$score,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  200 
## 
##  
##              | actual 
##    predicted |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        79 |        19 |        98 | 
##              |     0.823 |     0.183 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        17 |        85 |       102 | 
##              |     0.177 |     0.817 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        96 |       104 |       200 | 
##              |     0.480 |     0.520 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Accuracy = (79 + 85) / 200 = 0.820
The accuracy improved slightly over classifier 1 (no Laplace smoothing), and the classifier misclassifies positive and negative sentences at roughly equal rates.
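
A small sweep over candidate laplace values makes the comparison systematic (a sketch, reusing the same split; strictly, the smoothing parameter should be tuned on a separate validation set rather than the test set):

# evaluate several smoothing values on the held-out data
for (lp in c(0, 0.25, 0.5, 0.75, 1)) {
  clf  <- naiveBayes(as.matrix(train_dtm_binary), yelp_train$score, laplace = lp)
  pred <- predict(clf, as.matrix(test_dtm_binary))
  cat("laplace =", lp, "accuracy =", mean(pred == yelp_test$score), "\n")
}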