Bag of Words Model

We will apply the bag of words model to restaurant reviews to determine whether each review is positive or negative. You could also apply it to newspaper articles to see which category each article belongs to.
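
To make the representation concrete, here is a minimal sketch (toy sentences and variable names are illustrative only) of how two tiny reviews become a term-frequency matrix, using the same tm package we load below:

library(tm)
toy <- VCorpus(VectorSource(c("loved the food", "hated the food")))
as.matrix(DocumentTermMatrix(toy))
# each row is a document, each column a term, each cell a count, roughly:
#     Terms
# Docs food hated loved the
#    1    1     0     1   1
#    2    1     1     0   1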

Importing the dataset

# the reviews contain commas, so the file is separated by tabs.
df <- read.delim("G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 7 - Natural Language Processing\\Section 36 - Natural Language Processing\\Natural_Language_Processing\\Restaurant_Reviews.tsv", quote = "", stringsAsFactors = FALSE)
head(df)

Cleaning the texts

# create a corpus from the reviews
# install.packages("tm")
library(tm)
corpus <- VCorpus(VectorSource(df$Review))
corpus <- tm_map(corpus, content_transformer(tolower))
as.character(corpus[[1]])
[1] "wow... loved this place."

Remove numbers in the corpus

corpus <- tm_map(corpus, removeNumbers)
as.character(corpus[[841]])
[1] "for  bucks a head, i really expect better food."

Remove punctuation

corpus <- tm_map(corpus, removePunctuation)
as.character(corpus[[1]])
[1] "wow loved this place"

Remove Stopwords

# install.packages("SnowballC")
library(SnowballC)
corpus <- tm_map(corpus, removeWords, stopwords())
as.character(corpus[[1]])
[1] "wow loved  place"

Word Stemming

corpus <- tm_map(corpus, stemDocument)
as.character(corpus[[1]])
[1] "wow love place"

Remove extra spaces

corpus <- tm_map(corpus, stripWhitespace)
as.character(corpus[[841]])
[1] "buck head realli expect better food"

Creating the sparse matrix

dtm <- DocumentTermMatrix(corpus)
inspect(dtm)
<<DocumentTermMatrix (documents: 1000, terms: 1577)>>
Non-/sparse entries: 5435/1571565
Sparsity           : 100%
Maximal term length: 32
Weighting          : term frequency (tf)
Sample             :
     Terms
Docs  back food good great like place realli servic time will
  124    0    0    0     0    0     1      0      0    0    0
  133    0    1    1     0    0     0      0      0    0    0
  158    0    0    0     0    0     0      1      0    1    0
  237    0    0    0     0    0     0      0      0    0    0
  29     0    2    0     0    1     0      0      0    0    0
  43     0    0    1     0    0     0      0      0    0    0
  534    0    1    0     0    0     1      0      0    0    0
  716    0    0    0     1    0     0      0      0    0    0
  863    0    1    0     0    0     0      0      0    0    0
  885    0    0    0     0    0     0      0      0    1    0
# keep only terms that appear in more than 0.1% of the reviews,
# dropping the rarest words (1577 terms shrink to 691)
dtm <- removeSparseTerms(dtm, 0.999)
dtm
<<DocumentTermMatrix (documents: 1000, terms: 691)>>
Non-/sparse entries: 4549/686451
Sparsity           : 99%
Maximal term length: 12
Weighting          : term frequency (tf)
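
As a sanity check on what survived the filter, tm's findFreqTerms() lists terms at or above an overall frequency threshold; the cutoff of 50 here is arbitrary, chosen just for illustration:

findFreqTerms(dtm, lowfreq = 50)   # stems occurring at least 50 times across all reviews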

Build the dataset

dataset <- as.data.frame(as.matrix(dtm))
dataset$Liked <- df$Liked
head(dataset)
# encoding the target feature as a factor
dataset$Liked <- factor(dataset$Liked, levels = c(0, 1))

Split dataset into training and test set (800 training, 200 test)

library(caTools)
set.seed(123)
split <- sample.split(dataset$Liked, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
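
A quick check, not shown in the original output, that the 80/20 split produced the sizes claimed above:

nrow(training_set)   # expected: 800
nrow(test_set)       # expected: 200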

Fitting Classifier to the Training Set

# install.packages("randomForest")
library(randomForest)
# column 692 is the target (Liked); columns 1-691 are the word counts
classifier <- randomForest(x = training_set[-692],
                           y = training_set$Liked,
                           ntree = 10)
summary(classifier)
                Length Class  Mode     
call               4   -none- call     
type               1   -none- character
predicted        800   factor numeric  
err.rate          30   -none- numeric  
confusion          6   -none- numeric  
votes           1600   matrix numeric  
oob.times        800   -none- numeric  
classes            2   -none- character
importance       691   -none- numeric  
importanceSD       0   -none- NULL     
localImportance    0   -none- NULL     
proximity          0   -none- NULL     
ntree              1   -none- numeric  
mtry               1   -none- numeric  
forest            14   -none- list     
y                800   factor numeric  
test               0   -none- NULL     
inbag              0   -none- NULL     
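
As an optional side check (not part of the original walkthrough), randomForest tracks how much each word contributes to the splits, so you can see which stems the model leans on most:

imp <- importance(classifier)    # mean decrease in Gini impurity per term
head(imp[order(imp, decreasing = TRUE), , drop = FALSE], 10)
# varImpPlot(classifier)         # the same information as a plot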

Predicting the test set results

y_pred <- predict(classifier, newdata = test_set[-692])
y_pred
  4   9  10  16  17  21  24  33  39  40  41  48  56  58  59  61  63  73  76  82  92  93  98  99 105 112 113 
  1   1   1   0   0   0   0   0   1   0   0   1   1   0   1   0   1   0   0   0   1   0   0   1   0   0   1 
115 116 122 123 142 150 152 154 157 158 159 161 169 182 183 184 188 190 191 193 199 202 203 210 211 217 222 
  1   0   0   1   1   0   1   0   1   1   1   0   0   1   0   0   0   1   1   1   1   1   0   0   0   1   0 
228 239 240 250 251 255 258 262 264 270 272 276 287 292 303 306 314 318 326 328 337 344 345 346 349 351 353 
  1   0   1   0   1   1   0   1   1   0   1   0   0   0   0   0   0   1   0   1   1   1   0   1   0   0   0 
361 363 364 370 375 395 396 397 399 412 413 415 416 430 433 445 446 453 456 466 469 470 473 486 495 496 509 
  0   0   1   0   1   1   0   0   1   1   0   0   0   0   1   1   0   1   0   1   1   0   1   0   0   0   0 
519 521 525 528 531 535 539 545 548 555 560 563 568 570 574 583 586 591 598 606 613 614 618 625 628 633 634 
  0   0   1   0   1   0   0   0   0   1   0   1   1   0   0   0   1   1   0   1   0   0   0   1   0   0   0 
639 641 647 648 653 658 668 674 679 688 694 698 712 715 716 719 730 739 743 752 759 761 768 780 789 795 807 
  0   1   0   1   1   1   1   1   1   1   1   0   0   1   1   0   1   1   0   1   1   1   1   1   1   0   0 
809 811 817 818 821 844 848 849 853 855 863 868 874 882 890 891 892 894 900 905 906 912 915 920 924 931 935 
  1   1   0   1   0   0   1   0   1   1   1   0   0   0   1   1   1   0   1   0   1   0   0   1   0   1   0 
938 939 941 953 956 965 973 977 983 985 996 
  0   0   0   1   0   0   0   0   0   0   0 
Levels: 0 1

Evaluate the predictions using a confusion matrix.

# Making the confusion matrix
# column 692 holds the actual outcome (Liked)
cm <- table(test_set[, 692], y_pred)
cm
   y_pred
     0  1
  0 79 21
  1 30 70
accuracy <- (79 + 70) / 200   # correct predictions / total test reviews
accuracy
[1] 0.745
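
Hard-coding the cell counts works for this run, but since the matrix is already stored in cm, a sketch that reads the diagonal directly is more robust to reruns:

accuracy <- sum(diag(cm)) / sum(cm)   # equals (79 + 70) / 200 = 0.745 here
accuracy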