Introduction

This project analyzes review data from Yelp, a crowd-sourced review forum. Natural Language Processing and multiple classification alogorithms (Logistic Regression, Deep Learning and randomn forest) are used to conduct sentiment analysis, extract subjective information, and identify the positivity or negativity of a consumer’s review. To attain this objective,I convert text data into Document-Term Matrix and treat each term as a feature to build machine learning model.

Section 1: Data Preprocessing

library(tm)
library(SnowballC)
library(dplyr)
library(caret)
library(tinytex)
#install.packages('tidytext')
library(tidytext)
library(tidyr)
library(stringr)

setwd("C:/Users/caipe/OneDrive - Syracuse University/19F-IST707/assignment/ass4")
yelp<-read.csv("HW4_yelp_sentiment.csv", stringsAsFactors = FALSE)
summary(yelp)
##    reviews           sentiment        
##  Length:988         Length:988        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character
str(yelp)
## 'data.frame':    988 obs. of  2 variables:
##  $ reviews  : chr  "Wow... Loved this place." "Crust is not good." "Not tasty and the texture was just nasty." "Stopped by during the late May bank holiday off Rick Steve recommendation and loved it." ...
##  $ sentiment: chr  "p" "n" "n" "p" ...

Natural language processing

  • Convert review text data into Corpus
  • Conduct stemming and lemmatization, removal of stop words
  • Construct a DTM
#Convert review text data into Corpus
yelp_corpus <- Corpus(VectorSource(yelp$reviews))

#Conduct stemming and lemmatization, removal of stop words
yelp_corpus <- tm_map(yelp_corpus, content_transformer(tolower))
yelp_corpus <- tm_map(yelp_corpus, removeNumbers)
yelp_corpus <- tm_map(yelp_corpus, removePunctuation)
yelp_corpus <- tm_map(yelp_corpus, removeWords, c("the", "and", stopwords("english")))
yelp_corpus <-  tm_map(yelp_corpus, stripWhitespace)

#Construct a DTM
yelp_dtm <- DocumentTermMatrix(yelp_corpus)

# Reduce sparsity less than 0.99

yelp_dtm <- removeSparseTerms(yelp_dtm, 0.99)
yelp_dtm_matrix<- as.matrix(yelp_dtm)
yelp_df<- as.data.frame(yelp_dtm_matrix, stringsAsFactors = FALSE)
dim(yelp_df)
## [1] 988  97
# add sentiment data to matrix, and coding category data as numerical data where it is 1 if the sentiment is positive.
yelp_df$Positive <- 1
yelp_df$Positive[which(yelp$sentiment=='n')] <- 0
str(yelp_df$Positive)
##  num [1:988] 1 0 0 1 1 0 0 0 1 1 ...
yelp_df$Positive <- as.factor(yelp_df$Positive)
head(yelp_df$Positive)
## [1] 1 0 0 1 1 0
## Levels: 0 1
#Export into cvs
write.csv(yelp_df, "yelp_DTM.csv")

Build Train and Test datasets

#Setting seed
set.seed(100)
#Splitting data into 70:30 train and test
train_index <- createDataPartition(yelp_df$Positive,p=0.7,list=FALSE)
train_df <- yelp_df[train_index,]
test_df <- yelp_df[-train_index,]

rownames(test_df)<-NULL
rownames(train_df)<-NULL

Section 2: Logistic regression

#Logistic regression Apply model
Start1 <- Sys.time()
model_glm <- train(Positive ~ ., data = train_df, method = "glm", family = "binomial")
End1 <- Sys.time()-Start1

predict_glm <- predict(model_glm, newdata = test_df, type = "raw")
head(predict_glm,3)
## [1] 1 1 1
## Levels: 0 1
#Evaluating the model
confusionMatrix(predict_glm, test_df$Positive)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 117  55
##          1  31  93
##                                           
##                Accuracy : 0.7095          
##                  95% CI : (0.6541, 0.7605)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.979e-13       
##                                           
##                   Kappa : 0.4189          
##                                           
##  Mcnemar's Test P-Value : 0.01313         
##                                           
##             Sensitivity : 0.7905          
##             Specificity : 0.6284          
##          Pos Pred Value : 0.6802          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3953          
##    Detection Prevalence : 0.5811          
##       Balanced Accuracy : 0.7095          
##                                           
##        'Positive' Class : 0               
## 

Section 3: ANN and Deep Learning

Data Preprocessing

#install.packages('rsample')
library(rsample)
set.seed(100)
yelp_df2 <- yelp_df
yelp_df2$Positive <- yelp$sentiment
train_test_split <- initial_split(yelp_df2,prop = 0.8)
train_tbl <- training(train_test_split)
test_tbl <- testing(train_test_split)

library(recipes)
rec_obj <- recipe(Positive ~., data=train_tbl) %>%
  step_center(all_predictors(),-all_outcomes())%>%
  step_scale(all_predictors(),-all_outcomes())%>%
  prep(data=train_tbl)

x_train_tbl <- bake(rec_obj,new_data=train_tbl)%>% select(-Positive)
x_test_tbl <- bake(rec_obj,new_data = test_tbl) %>% select(-Positive)
y_train_vec <- ifelse(pull(train_tbl,Positive)=='p',1,0)
y_test_vec <- ifelse(pull(test_tbl,Positive)=='p',1,0)
class(train_tbl$Positive[1:5])
## [1] "character"
rec_obj
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         97
## 
## Training data contained 791 data points and no missing data.
## 
## Operations:
## 
## Centering for loved, place, good, just, tasty, ... [trained]
## Scaling for loved, place, good, just, tasty, ... [trained]

Deep Learning Model Specification

#install.packages('kerasR')
#library(kerasR)
library(keras)
#library(reticulate)
#reticulate::py_available()
#reticulate::import("keras.models")
#reticulate::py_config()
#use_python("/usr/local/bin/python")

model_keras <- keras_model_sequential()

library(tensorflow)
Start2 <- Sys.time()
model_keras %>%
  layer_dense(units=16,
              kernel_initializer = 'uniform',
              activation = 'relu',
              input_shape = ncol(x_train_tbl)) %>%
  layer_dropout(rate=0.1) %>%
  layer_dense(units=16,
              kernel_initializer = 'uniform',
              activation = 'relu')%>%
  layer_dense(units=1,
              kernel_initializer = 'uniform',
              activation = 'sigmoid')%>%
  compile(optimizer='adam',
          loss='binary_crossentropy',
          metrics=c('accuracy'))
End2 <- Sys.time()-Start2
model_keras
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense (Dense)                    (None, 16)                    1568        
## ___________________________________________________________________________
## dropout (Dropout)                (None, 16)                    0           
## ___________________________________________________________________________
## dense_1 (Dense)                  (None, 16)                    272         
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 1)                     17          
## ===========================================================================
## Total params: 1,857
## Trainable params: 1,857
## Non-trainable params: 0
## ___________________________________________________________________________
library(forcats)
yhat_keras_class_vec <- predict_classes(object=model_keras,x=as.matrix(x_test_tbl)) %>% as.vector()
yhat_keras_prob_vec <- predict_proba(object = model_keras,x=as.matrix(x_test_tbl)) %>% as.vector()
estimates_keras_tbl <- tibble(
  truth=as.factor(y_test_vec) %>% fct_recode(p='1',n='0'),
  estimate=as.factor(yhat_keras_class_vec) %>% fct_recode(p='1',n='0'),
  class_prob=yhat_keras_prob_vec
)

estimates_keras_tbl
## # A tibble: 197 x 3
##    truth estimate class_prob
##    <fct> <fct>         <dbl>
##  1 p     p             0.500
##  2 p     n             0.500
##  3 n     n             0.500
##  4 p     n             0.500
##  5 p     p             0.500
##  6 p     n             0.500
##  7 p     p             0.501
##  8 n     p             0.500
##  9 p     p             0.501
## 10 n     p             0.501
## # ... with 187 more rows
#install.packages('yardstick')
library(yardstick)
options(yardstick.event_first=F)
estimates_keras_tbl %>% conf_mat(truth,estimate)
##           Truth
## Prediction  n  p
##          n 27 18
##          p 70 82
estimates_keras_tbl %>% metrics(truth,estimate)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary        0.553 
## 2 kap      binary        0.0991
estimates_keras_tbl %>% roc_auc(truth,class_prob)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.561
estimates_keras_tbl %>% precision(truth,estimate)
## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.539
estimates_keras_tbl %>% recall(truth,estimate)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  binary          0.82

Section 4: Random Forest Classification

#Train Random forest classifier
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
Start3 <- Sys.time()
model_rf<- randomForest(x=train_df, y=train_df$Positive)
End3 <- Sys.time()-Start3
model_rf
## 
## Call:
##  randomForest(x = train_df, y = train_df$Positive) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 9
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##     0   1 class.error
## 0 346   0           0
## 1   0 346           0
#Predict using Random forest Classifier
predict_rf <- predict(model_rf, newdata = test_df)
confusionMatrix(predict_rf, test_df$Positive)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 148   0
##          1   0 148
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9876, 1)
##     No Information Rate : 0.5        
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0        
##             Specificity : 1.0        
##          Pos Pred Value : 1.0        
##          Neg Pred Value : 1.0        
##              Prevalence : 0.5        
##          Detection Rate : 0.5        
##    Detection Prevalence : 0.5        
##       Balanced Accuracy : 1.0        
##                                      
##        'Positive' Class : 0          
## 
#Accuracy rate is 1

#Misclassification error
mean(as.character(test_df$Positive) != as.character(predict_rf))
## [1] 0
mean(as.character(test_df$Positive) == as.character(predict_rf))
## [1] 1

Running time

End1
## Time difference of 4.546226 secs
End2
## Time difference of 0.227603 secs
End3
## Time difference of 0.940731 secs

Section5: Algorithm performance comparison

Randonm Forest has the best performance, with the accuracy achieving 100% and the least time of 1.163 secs. Deep Learning has the lowest accuracy of 50%. It has the second fastest speed, using 1.532 seconds. Logistic has the second accuracy of 70.95%. It uses the most time,reaching 4.84 seconds.