We will apply the bag of words model to restaurant reviews to find out whether each review is positive or negative. You can also apply it to newspaper articles to see which category they belong to.
Importing the dataset
# the data contains commas in the reviews, so the table is separated by tabs.
df <- read.delim("G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 7 - Natural Language Processing\\Section 36 - Natural Language Processing\\Natural_Language_Processing\\Restaurant_Reviews.tsv", quote = "", stringsAsFactors = FALSE )
head(df)
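Before building the corpus it is worth a quick look at what we loaded; a minimal check on the Review and Liked columns used below:
# quick sanity check: 1000 reviews with a 0/1 Liked label
str(df)
table(df$Liked)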
# create a corpus from the reviews
# install.packages("tm")
library(tm)
corpus <- VCorpus(VectorSource(df$Review))
corpus <- tm_map(corpus, content_transformer(tolower))
as.character(corpus[[1]])
[1] "wow... loved this place."
corpus <- tm_map(corpus, removeNumbers)
as.character(corpus[[841]])
[1] "for bucks a head, i really expect better food."
corpus <- tm_map(corpus, removePunctuation)
as.character(corpus[[1]])
[1] "wow loved this place"
# install.packages("SnowballC")
library(SnowballC)
corpus <- tm_map(corpus, removeWords, stopwords())
as.character(corpus[[1]])
[1] "wow loved place"
corpus <- tm_map(corpus, stemDocument)
as.character(corpus[[1]])
[1] "wow love place"
corpus <- tm_map(corpus, stripWhitespace)
as.character(corpus[[841]])
[1] "buck head realli expect better food"
dtm <- DocumentTermMatrix(corpus)
inspect(dtm)
<<DocumentTermMatrix (documents: 1000, terms: 1577)>>
Non-/sparse entries: 5435/1571565
Sparsity           : 100%
Maximal term length: 32
Weighting          : term frequency (tf)
Sample             :
     Terms
Docs back food good great like place realli servic time will
 124    0    0    0     0    0     1      0      0    0    0
 133    0    1    1     0    0     0      0      0    0    0
 158    0    0    0     0    0     0      1      0    1    0
 237    0    0    0     0    0     0      0      0    0    0
  29    0    2    0     0    1     0      0      0    0    0
  43    0    0    1     0    0     0      0      0    0    0
 534    0    1    0     0    0     1      0      0    0    0
 716    0    0    0     1    0     0      0      0    0    0
 863    0    1    0     0    0     0      0      0    0    0
 885    0    0    0     0    0     0      0      0    1    0
dtm <- removeSparseTerms(dtm, 0.999)
dtm
<<DocumentTermMatrix (documents: 1000, terms: 691)>>
Non-/sparse entries: 4549/686451
Sparsity           : 99%
Maximal term length: 12
Weighting          : term frequency (tf)
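To see which words actually made it past the sparsity filter, tm's findFreqTerms() lists the terms above a given total frequency; the threshold of 50 below is arbitrary:
# terms appearing at least 50 times across all reviews (threshold chosen for illustration)
findFreqTerms(dtm, lowfreq = 50)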
dataset <- as.data.frame(as.matrix(dtm))
dataset$Liked <- df$Liked
head(dataset)
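The data frame now has one column per kept term plus the Liked label, which is where the column index 692 used below comes from:
# 691 term columns + the Liked column = 692 columns in total
dim(dataset)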
# encoding the target feature as a factor
dataset$Liked <- factor(dataset$Liked, levels = c(0,1))
library(caTools)
set.seed(123)
split <- sample.split(dataset$Liked, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
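Because sample.split() stratifies on the target vector, the 80/20 split should keep roughly the same share of positive and negative reviews in both subsets; a quick check:
# class proportions should be similar in the training and test sets
prop.table(table(training_set$Liked))
prop.table(table(test_set$Liked))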
# Create the classifier here
# install.packages("randomForest")
library(randomForest)
classifier <- randomForest(x = training_set[-692],
                           y = training_set$Liked,
                           ntree = 10)
summary(classifier)
                Length Class  Mode     
call                 4 -none- call     
type                 1 -none- character
predicted          800 factor numeric  
err.rate            30 -none- numeric  
confusion            6 -none- numeric  
votes             1600 matrix numeric  
oob.times          800 -none- numeric  
classes              2 -none- character
importance         691 -none- numeric  
importanceSD         0 -none- NULL     
localImportance      0 -none- NULL     
proximity            0 -none- NULL     
ntree                1 -none- numeric  
mtry                 1 -none- numeric  
forest              14 -none- list     
y                  800 factor numeric  
test                 0 -none- NULL     
inbag                0 -none- NULL     
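summary() only lists the components of the randomForest object; the importance() accessor is usually more telling, since it shows which stemmed words the trees split on most; a short sketch:
# top 10 terms by mean decrease in Gini impurity
imp <- importance(classifier)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)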
y_pred <- predict(classifier, newdata = test_set[-692])
y_pred
  4   9  10  16  17  21  24  33  39  40  41  48  56  58  59  61  63  73  76  82  92  93  98  99 105 112 113
  1   1   1   0   0   0   0   0   1   0   0   1   1   0   1   0   1   0   0   0   1   0   0   1   0   0   1
115 116 122 123 142 150 152 154 157 158 159 161 169 182 183 184 188 190 191 193 199 202 203 210 211 217 222
  1   0   0   1   1   0   1   0   1   1   1   0   0   1   0   0   0   1   1   1   1   1   0   0   0   1   0
228 239 240 250 251 255 258 262 264 270 272 276 287 292 303 306 314 318 326 328 337 344 345 346 349 351 353
  1   0   1   0   1   1   0   1   1   0   1   0   0   0   0   0   0   1   0   1   1   1   0   1   0   0   0
361 363 364 370 375 395 396 397 399 412 413 415 416 430 433 445 446 453 456 466 469 470 473 486 495 496 509
  0   0   1   0   1   1   0   0   1   1   0   0   0   0   1   1   0   1   0   1   1   0   1   0   0   0   0
519 521 525 528 531 535 539 545 548 555 560 563 568 570 574 583 586 591 598 606 613 614 618 625 628 633 634
  0   0   1   0   1   0   0   0   0   1   0   1   1   0   0   0   1   1   0   1   0   0   0   1   0   0   0
639 641 647 648 653 658 668 674 679 688 694 698 712 715 716 719 730 739 743 752 759 761 768 780 789 795 807
  0   1   0   1   1   1   1   1   1   1   1   0   0   1   1   0   1   1   0   1   1   1   1   1   1   0   0
809 811 817 818 821 844 848 849 853 855 863 868 874 882 890 891 892 894 900 905 906 912 915 920 924 931 935
  1   1   0   1   0   0   1   0   1   1   1   0   0   0   1   1   1   0   1   0   1   0   0   1   0   1   0
938 939 941 953 956 965 973 977 983 985 996
  0   0   0   1   0   0   0   0   0   0   0
Levels: 0 1
# Making the confusion matrix
# column 692 of the test set is the outcome (Liked)
cm <- table(test_set[,692], y_pred)
cm
   y_pred
     0  1
  0 79 21
  1 30 70
accuracy <- (79+70)/200
accuracy
[1] 0.745
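Accuracy alone hides how the errors are split between the two classes; precision, recall, and F1 for the positive class follow directly from the confusion matrix (rows are the true labels, columns the predictions):
# precision, recall and F1 for the positive class (1), computed from cm
precision <- cm[2, 2] / sum(cm[, 2])   # 70 / (21 + 70)
recall    <- cm[2, 2] / sum(cm[2, ])   # 70 / (30 + 70)
f1        <- 2 * precision * recall / (precision + recall)
c(precision = precision, recall = recall, f1 = f1)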