This project analyzes review data from Yelp, a crowd-sourced review forum. Natural Language Processing and multiple classification alogorithms (Logistic Regression, Deep Learning and randomn forest) are used to conduct sentiment analysis, extract subjective information, and identify the positivity or negativity of a consumer’s review. To attain this objective,I convert text data into Document-Term Matrix and treat each term as a feature to build machine learning model.
library(tm)
library(SnowballC)
library(dplyr)
library(caret)
library(tinytex)
#install.packages('tidytext')
library(tidytext)
library(tidyr)
library(stringr)
setwd("C:/Users/caipe/OneDrive - Syracuse University/19F-IST707/assignment/ass4")
yelp<-read.csv("HW4_yelp_sentiment.csv", stringsAsFactors = FALSE)
summary(yelp)
## reviews sentiment
## Length:988 Length:988
## Class :character Class :character
## Mode :character Mode :character
str(yelp)
## 'data.frame': 988 obs. of 2 variables:
## $ reviews : chr "Wow... Loved this place." "Crust is not good." "Not tasty and the texture was just nasty." "Stopped by during the late May bank holiday off Rick Steve recommendation and loved it." ...
## $ sentiment: chr "p" "n" "n" "p" ...
#Convert review text data into Corpus
yelp_corpus <- Corpus(VectorSource(yelp$reviews))
#Conduct stemming and lemmatization, removal of stop words
yelp_corpus <- tm_map(yelp_corpus, content_transformer(tolower))
yelp_corpus <- tm_map(yelp_corpus, removeNumbers)
yelp_corpus <- tm_map(yelp_corpus, removePunctuation)
yelp_corpus <- tm_map(yelp_corpus, removeWords, c("the", "and", stopwords("english")))
yelp_corpus <- tm_map(yelp_corpus, stripWhitespace)
#Construct a DTM
yelp_dtm <- DocumentTermMatrix(yelp_corpus)
# Reduce sparsity less than 0.99
yelp_dtm <- removeSparseTerms(yelp_dtm, 0.99)
yelp_dtm_matrix<- as.matrix(yelp_dtm)
yelp_df<- as.data.frame(yelp_dtm_matrix, stringsAsFactors = FALSE)
dim(yelp_df)
## [1] 988 97
# add sentiment data to matrix, and coding category data as numerical data where it is 1 if the sentiment is positive.
yelp_df$Positive <- 1
yelp_df$Positive[which(yelp$sentiment=='n')] <- 0
str(yelp_df$Positive)
## num [1:988] 1 0 0 1 1 0 0 0 1 1 ...
yelp_df$Positive <- as.factor(yelp_df$Positive)
head(yelp_df$Positive)
## [1] 1 0 0 1 1 0
## Levels: 0 1
#Export into cvs
write.csv(yelp_df, "yelp_DTM.csv")
#Setting seed
set.seed(100)
#Splitting data into 70:30 train and test
train_index <- createDataPartition(yelp_df$Positive,p=0.7,list=FALSE)
train_df <- yelp_df[train_index,]
test_df <- yelp_df[-train_index,]
rownames(test_df)<-NULL
rownames(train_df)<-NULL
#Logistic regression Apply model
Start1 <- Sys.time()
model_glm <- train(Positive ~ ., data = train_df, method = "glm", family = "binomial")
End1 <- Sys.time()-Start1
predict_glm <- predict(model_glm, newdata = test_df, type = "raw")
head(predict_glm,3)
## [1] 1 1 1
## Levels: 0 1
#Evaluating the model
confusionMatrix(predict_glm, test_df$Positive)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 117 55
## 1 31 93
##
## Accuracy : 0.7095
## 95% CI : (0.6541, 0.7605)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.979e-13
##
## Kappa : 0.4189
##
## Mcnemar's Test P-Value : 0.01313
##
## Sensitivity : 0.7905
## Specificity : 0.6284
## Pos Pred Value : 0.6802
## Neg Pred Value : 0.7500
## Prevalence : 0.5000
## Detection Rate : 0.3953
## Detection Prevalence : 0.5811
## Balanced Accuracy : 0.7095
##
## 'Positive' Class : 0
##
#install.packages('rsample')
library(rsample)
set.seed(100)
yelp_df2 <- yelp_df
yelp_df2$Positive <- yelp$sentiment
train_test_split <- initial_split(yelp_df2,prop = 0.8)
train_tbl <- training(train_test_split)
test_tbl <- testing(train_test_split)
library(recipes)
rec_obj <- recipe(Positive ~., data=train_tbl) %>%
step_center(all_predictors(),-all_outcomes())%>%
step_scale(all_predictors(),-all_outcomes())%>%
prep(data=train_tbl)
x_train_tbl <- bake(rec_obj,new_data=train_tbl)%>% select(-Positive)
x_test_tbl <- bake(rec_obj,new_data = test_tbl) %>% select(-Positive)
y_train_vec <- ifelse(pull(train_tbl,Positive)=='p',1,0)
y_test_vec <- ifelse(pull(test_tbl,Positive)=='p',1,0)
class(train_tbl$Positive[1:5])
## [1] "character"
rec_obj
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 97
##
## Training data contained 791 data points and no missing data.
##
## Operations:
##
## Centering for loved, place, good, just, tasty, ... [trained]
## Scaling for loved, place, good, just, tasty, ... [trained]
#install.packages('kerasR')
#library(kerasR)
library(keras)
#library(reticulate)
#reticulate::py_available()
#reticulate::import("keras.models")
#reticulate::py_config()
#use_python("/usr/local/bin/python")
model_keras <- keras_model_sequential()
library(tensorflow)
Start2 <- Sys.time()
model_keras %>%
layer_dense(units=16,
kernel_initializer = 'uniform',
activation = 'relu',
input_shape = ncol(x_train_tbl)) %>%
layer_dropout(rate=0.1) %>%
layer_dense(units=16,
kernel_initializer = 'uniform',
activation = 'relu')%>%
layer_dense(units=1,
kernel_initializer = 'uniform',
activation = 'sigmoid')%>%
compile(optimizer='adam',
loss='binary_crossentropy',
metrics=c('accuracy'))
End2 <- Sys.time()-Start2
model_keras
## Model
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## dense (Dense) (None, 16) 1568
## ___________________________________________________________________________
## dropout (Dropout) (None, 16) 0
## ___________________________________________________________________________
## dense_1 (Dense) (None, 16) 272
## ___________________________________________________________________________
## dense_2 (Dense) (None, 1) 17
## ===========================================================================
## Total params: 1,857
## Trainable params: 1,857
## Non-trainable params: 0
## ___________________________________________________________________________
library(forcats)
yhat_keras_class_vec <- predict_classes(object=model_keras,x=as.matrix(x_test_tbl)) %>% as.vector()
yhat_keras_prob_vec <- predict_proba(object = model_keras,x=as.matrix(x_test_tbl)) %>% as.vector()
estimates_keras_tbl <- tibble(
truth=as.factor(y_test_vec) %>% fct_recode(p='1',n='0'),
estimate=as.factor(yhat_keras_class_vec) %>% fct_recode(p='1',n='0'),
class_prob=yhat_keras_prob_vec
)
estimates_keras_tbl
## # A tibble: 197 x 3
## truth estimate class_prob
## <fct> <fct> <dbl>
## 1 p p 0.500
## 2 p n 0.500
## 3 n n 0.500
## 4 p n 0.500
## 5 p p 0.500
## 6 p n 0.500
## 7 p p 0.501
## 8 n p 0.500
## 9 p p 0.501
## 10 n p 0.501
## # ... with 187 more rows
#install.packages('yardstick')
library(yardstick)
options(yardstick.event_first=F)
estimates_keras_tbl %>% conf_mat(truth,estimate)
## Truth
## Prediction n p
## n 27 18
## p 70 82
estimates_keras_tbl %>% metrics(truth,estimate)
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.553
## 2 kap binary 0.0991
estimates_keras_tbl %>% roc_auc(truth,class_prob)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.561
estimates_keras_tbl %>% precision(truth,estimate)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.539
estimates_keras_tbl %>% recall(truth,estimate)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.82
#Train Random forest classifier
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
Start3 <- Sys.time()
model_rf<- randomForest(x=train_df, y=train_df$Positive)
End3 <- Sys.time()-Start3
model_rf
##
## Call:
## randomForest(x = train_df, y = train_df$Positive)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 9
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## 0 1 class.error
## 0 346 0 0
## 1 0 346 0
#Predict using Random forest Classifier
predict_rf <- predict(model_rf, newdata = test_df)
confusionMatrix(predict_rf, test_df$Positive)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 148 0
## 1 0 148
##
## Accuracy : 1
## 95% CI : (0.9876, 1)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0
## Specificity : 1.0
## Pos Pred Value : 1.0
## Neg Pred Value : 1.0
## Prevalence : 0.5
## Detection Rate : 0.5
## Detection Prevalence : 0.5
## Balanced Accuracy : 1.0
##
## 'Positive' Class : 0
##
#Accuracy rate is 1
#Misclassification error
mean(as.character(test_df$Positive) != as.character(predict_rf))
## [1] 0
mean(as.character(test_df$Positive) == as.character(predict_rf))
## [1] 1
Running time
End1
## Time difference of 4.546226 secs
End2
## Time difference of 0.227603 secs
End3
## Time difference of 0.940731 secs
Randonm Forest has the best performance, with the accuracy achieving 100% and the least time of 1.163 secs. Deep Learning has the lowest accuracy of 50%. It has the second fastest speed, using 1.532 seconds. Logistic has the second accuracy of 70.95%. It uses the most time,reaching 4.84 seconds.