#Natural Language Processing
#Basic program done as part of the course content with Udemy
# Import the dataset. The file is tab-separated; quote = "" turns quote
# handling off so quote characters inside the review text are read literally.
dataset_original <- read.delim(
  file = "Restaurant_Reviews.tsv",
  quote = "",
  stringsAsFactors = FALSE
)
# Working copy of the import. `dataset` is overwritten further down, while
# `dataset_original` keeps the raw columns (a second read of the same file is
# not needed for that).
dataset <- dataset_original
# View() opens a data viewer, so only call it when someone is there to look.
if (interactive()) {
  View(dataset)
}
# Clean the review text
library(tm)
## Loading required package: NLP
library(SnowballC)
# Build a corpus from the Review column of the imported data.
corpus <- VCorpus(VectorSource(dataset$Review))
# Normalise the text: lower-case it, then strip digits and punctuation.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
# Remove English stop words. The kind is spelled in lower case as tm documents
# it ("english" / "en"); the capitalised spelling is not a documented kind and
# may fail to resolve depending on the platform.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Reduce words to their stems, then collapse the whitespace left behind by the
# removals above.
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
# Bag of words: build the document-term matrix from the cleaned corpus and
# prune sparse terms with a sparsity threshold of 0.999.
dtm <- removeSparseTerms(DocumentTermMatrix(corpus), 0.999)
# Usually Naive Bayes, Decision Tree or Random Forest models are used
# Turn the document-term matrix into a data frame for modelling.
dataset <- as.data.frame(as.matrix(dtm))
# Attach the target column from the untouched import, encoded as a factor
# with levels 0 and 1.
dataset[["Liked"]] <- factor(dataset_original$Liked, levels = c(0, 1))
# Train/test split with SplitRatio = 0.8, computed on the Liked column
# install.packages('caTools')
library(caTools)
# Fixed seed so the split is reproducible.
set.seed(123)
split <- sample.split(dataset$Liked, SplitRatio = 0.8)
# `split` is a logical mask over the rows: TRUE rows form the training set,
# FALSE rows the test set.
training_set <- dataset[split, , drop = FALSE]
test_set <- dataset[!split, , drop = FALSE]
# Fit a Random Forest classifier (10 trees) on the training set
# install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
set.seed(123)
# Pick the predictors by name rather than by a literal column position: the
# number of term columns depends on the corpus, and a stale position would
# silently leave the target `Liked` among the features.
classifier <- randomForest(
  x = training_set[names(training_set) != "Liked"],
  y = training_set$Liked,
  ntree = 10
)
classifier
# Recorded output from an earlier run, in which the predictors were selected
# with the literal index 692 (see the Call line):
##
## Call:
## randomForest(x = training_set[-692], y = training_set$Liked, ntree = 10)
## Type of random forest: classification
## Number of trees: 10
## No. of variables tried at each split: 26
##
## OOB estimate of error rate: 27.01%
## Confusion matrix:
## 0 1 class.error
## 0 295 104 0.2606516
## 1 111 286 0.2795970
# Predict the test set results. The target column is excluded by name so the
# code does not depend on how many term columns the data frame has.
y_pred <- predict(
  classifier,
  newdata = test_set[names(test_set) != "Liked"]
)
# Confusion matrix: rows are the actual Liked values, columns the predictions.
cm <- table(test_set$Liked, y_pred)
cm
# Recorded output from an earlier run:
## y_pred
## 0 1
## 0 78 22
## 1 28 72