# Restaurant_Reviews.R

#Natural Language Processing
#Basic program done as part of the course content with Udemy 
# Importing the dataset. The dataset file is tab-separated.

# Read the tab-separated reviews file once. quote = "" turns off quote
# handling so quote characters inside the review text are read literally.
dataset_original <- read.delim(
  file = "Restaurant_Reviews.tsv",
  quote = "",
  stringsAsFactors = FALSE
)

# Working copy of the raw data; `dataset` is overwritten later in this script
# while `dataset_original` keeps the untouched reviews and labels.
dataset <- dataset_original

# View() opens a data viewer and is only meaningful in an interactive session,
# so skip it when the script is run in batch mode (e.g. via Rscript).
if (interactive()) {
  View(dataset)
}

# Clean the rows 

library(tm)

## Loading required package: NLP

library(SnowballC)

# Build a corpus with one document per review, then normalise the text.
corpus <- VCorpus(VectorSource(dataset$Review))

# tolower() is a plain base function, so it is wrapped in
# content_transformer() to keep the documents as tm text documents.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)

# tm language names are case sensitive: use "english" (not "English") so the
# stop-word list is found regardless of the file system's case handling.
# NOTE(review): punctuation is stripped before this step, so contractions
# such as "don't" have already become "dont" and may no longer match the
# stop-word list -- confirm whether that ordering is intended.
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Reduce words to their stems, then collapse the whitespace left behind by
# the removals above.
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)

# Bag-of-words model: build the document-term matrix from the cleaned corpus
# and drop the sparsest terms (sparsity threshold 0.999).
dtm <- removeSparseTerms(DocumentTermMatrix(corpus), 0.999)

# Naive Bayes, Decision Tree or Random Forest models are usually used on this
# kind of data. Convert the matrix into a data frame with one column per term.
dataset <- as.data.frame(as.matrix(dtm))

# Attach the target from the raw data, encoded as a factor with levels 0 and 1.
dataset[["Liked"]] <- factor(dataset_original[["Liked"]], levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
# Fix the RNG seed so the train/test partition is reproducible.
set.seed(123)

# Logical mask over the rows: TRUE marks training rows (ratio 0.8).
split <- sample.split(dataset$Liked, SplitRatio = 0.8)

# Use standard indexing rather than subset(): subset() evaluates its condition
# inside the data frame, and the columns of `dataset` are arbitrary review
# terms, so a term column called "split" would silently shadow the mask.
# which() drops any NA entries, matching what subset() would have done.
training_set <- dataset[which(split), , drop = FALSE]
test_set <- dataset[which(!split), , drop = FALSE]

# Fitting Random Forest Classification to the Training set
# install.packages('randomForest')
library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

# Fix the RNG seed so the fitted forest is reproducible.
set.seed(123)

# Fit a 10-tree random forest. The target column is excluded from the
# predictors by name instead of by a hard-coded position: the number of term
# columns depends on the corpus and the cleaning steps, and a stale index
# would silently leave `Liked` among the predictors.
classifier <- randomForest(
  x = training_set[names(training_set) != "Liked"],
  y = training_set$Liked,
  ntree = 10
)

# Print the model summary (OOB error estimate and confusion matrix).
classifier

## 
## Call:
##  randomForest(x = training_set[-692], y = training_set$Liked,      ntree = 10) 
##                Type of random forest: classification
##                      Number of trees: 10
## No. of variables tried at each split: 26
## 
##         OOB estimate of  error rate: 27.01%
## Confusion matrix:
##     0   1 class.error
## 0 295 104   0.2606516
## 1 111 286   0.2795970

# Predicting the Test set results. The target column is dropped by name rather
# than by a hard-coded position, so this keeps working if the number of term
# columns changes.
y_pred <- predict(
  classifier,
  newdata = test_set[names(test_set) != "Liked"]
)

# Making the Confusion Matrix: rows are the actual labels, columns are the
# predicted labels.
cm <- table(test_set$Liked, y_pred)

cm

##    y_pred
##      0  1
##   0 78 22
##   1 28 72

# Restaurant_Reviews.R

# Kamalm

# Thu May 18 08:22:37 2017