This project analyzes a Yelp review dataset to predict the sentiment towards a business from the verbatim review text. It was done as an academic exercise in natural language processing techniques for deriving insights from text.
Due to computational constraints, the analysis was carried out on a sample of 50,000 reviews. Sentiment scores are derived using the AFINN dictionary (http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010), and the Naive Bayes technique is used for the modelling task.
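AFINN assigns each word in its list an integer valence from -5 (most negative) to +5 (most positive). As a minimal sketch of how a review can be scored by averaging its matched terms (afinn_toy and toy_review are made-up illustrations; the individual word scores match those that appear in the AFINN join later in this report):
#Toy illustration of AFINN scoring: mean word score over matched terms
afinn_toy <- data.frame(word  = c("amazing", "awful", "bad", "best"),
                        score = c(4, -3, -3, 3))
toy_review <- c("amazing", "food", "but", "awful", "service")
matched <- afinn_toy$score[match(toy_review, afinn_toy$word)]
mean(matched, na.rm = TRUE)  #(4 + (-3)) / 2 = 0.5, i.e. mildly positive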
Loading the required packages and the dataset from the current working directory.
rm(list=ls())
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(dplyr)
library(tidytext)
library(stringr)
library(reshape2)
library(ggplot2)
library(e1071)
library(caret)
library(glmnet)
reviews <- read.csv("yelpRestaurantReviewSample_50K.csv",header=TRUE, sep =';')
attach(reviews)
dim(reviews)
## [1] 50000 18
sapply(reviews,class)
## business_id cool date funny review_id
## "factor" "numeric" "factor" "numeric" "factor"
## stars text type useful user_id
## "numeric" "factor" "factor" "numeric" "factor"
## categories.0. categories.1. categories.2. name postal_code
## "factor" "factor" "factor" "factor" "factor"
## review_count state businessType
## "numeric" "factor" "factor"
names(reviews)
## [1] "business_id" "cool" "date" "funny"
## [5] "review_id" "stars" "text" "type"
## [9] "useful" "user_id" "categories.0." "categories.1."
## [13] "categories.2." "name" "postal_code" "review_count"
## [17] "state" "businessType"
hist(stars, xlab = "Star Rating", ylab="Count",col = "red")
INFERENCE: Based on the distribution:
- Ratings 1, 2 and 3 can be considered negative
- Ratings 4 and 5 can be considered positive
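A one-line sketch of the mapping above (star_sentiment is an illustrative label only and is not used later in the analysis; stars is available directly because of attach(reviews)):
#Hypothetical binary label derived from the star rating per the inference above
star_sentiment <- ifelse(stars >= 4, "Pos", "Neg")
table(star_sentiment)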
rating <- aggregate(stars,list(stars),length)
names(rating)<-c("Stars","Total")
useful <- aggregate(useful,list(stars),sum)
names(useful)<-c("Stars","Useful Reviews")
funny <- aggregate(funny,list(stars),sum)
names(funny)<-c("Stars","Funny Reviews")
cool <- aggregate(cool,list(stars),sum)
names(cool)<-c("Stars","Cool Reviews")
review_rating_type <- merge(rating,useful,by= "Stars")
review_rating_type <- merge(review_rating_type,funny,by= "Stars")
review_rating_type <- merge(review_rating_type,cool,by= "Stars")
#Percentage of reviews at each star rating voted useful/funny/cool
#par(mfrow=c(1,3))
plot(review_rating_type$Stars,(review_rating_type$`Useful Reviews`/review_rating_type$Total) *100, type = "b", xlab = "Stars", ylab ="% Useful Ratings", main = "Relationship between Star Reviews and Useful Ratings", col ="darkblue",lwd=1.5)
plot(review_rating_type$Stars,(review_rating_type$`Funny Reviews`/review_rating_type$Total)*100, type = "b", xlab = "Stars", ylab ="% Funny Ratings", main = "Relationship between Star Reviews and Funny Ratings", col ="red",lwd=1.5)
plot(review_rating_type$Stars,(review_rating_type$`Cool Reviews`/review_rating_type$Total)*100, type = "b", xlab = "Stars", ylab ="% Cool Ratings", main = "Relationship between Star Reviews and Cool Ratings", col ="orange",lwd=1.5)
INFERENCE: Based on the distribution:
1. Reviews with low star ratings are voted 'useful' more often
2. Lower-rated reviews also seem to be voted 'funny' more often than reviews with higher star ratings
3. Ratings 3 and 4 receive the most 'cool' votes
Steps involved in preparing the review text:
1. Build a corpus from the review text
2. Convert the text to lower case
3. Remove punctuation, numbers and English stop words
4. Strip extra whitespace
5. Build a document-term matrix and drop sparse terms
review_Corpus <- Corpus(VectorSource(reviews$text))
#The dataset already contains exactly 50,000 reviews; re-sampling here would
#permute the corpus and break the document-to-review alignment used below
#Cleaning review text
#converting to lower case (content_transformer keeps the corpus structure intact)
revs<-tm_map(review_Corpus,content_transformer(tolower))
#removing punctuation marks
revs<-tm_map(revs,removePunctuation)
#removing numbers
revs<-tm_map(revs,removeNumbers)
#removing stop words
revs<-tm_map(revs,removeWords, stopwords("english"))
#removing whitespaces
revs<-tm_map(revs,stripWhitespace)
dtm<-DocumentTermMatrix(revs)
show(dtm)
## <<DocumentTermMatrix (documents: 50000, terms: 111807)>>
## Non-/sparse entries: 2254247/5588095753
## Sparsity : 100%
## Maximal term length: 129
## Weighting : term frequency (tf)
#keeping only terms that appear in at least 1% of documents
dtm <- removeSparseTerms(dtm, 0.99)
dtm.matrix <- as.matrix(dtm)
dtm_df <- tidy(dtm)
reviews$document <- 1:50000 #document index aligned with corpus order
reviews_df <- merge(subset(reviews, select=c("review_id","stars","document","business_id")),dtm_df,by="document")
wordcount <- colSums(dtm.matrix)
top <- head(sort(wordcount, decreasing=TRUE), 20)
dfplot <- as.data.frame(melt(top))
dfplot$word <- dimnames(dfplot)[[1]]
dfplot$word <- factor(dfplot$word,
levels=dfplot$word[order(dfplot$value,
decreasing=TRUE)])
fig <- ggplot(dfplot, aes(x=word, y=value)) + geom_bar(stat="identity")
fig <- fig + xlab("Word in Corpus")
fig <- fig + ylab("Count")
fig <- fig + theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(fig)
wordcloud(revs, max.words = 100, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
Identifying positive and negative words using the star rating as an indicator (for each word, the average star rating of all reviews in which it occurs).
word_avgstar <- aggregate(reviews_df$stars,list(reviews_df$term),mean)
hist(word_avgstar$x)
word_avgstar <- word_avgstar %>%arrange(x)
#Negative Words
head(word_avgstar,10)
## Group.1 x
## 1 outstanding 3.514241
## 2 curry 3.520468
## 3 rude 3.522409
## 4 game 3.529880
## 5 glad 3.530909
## 6 entire 3.533875
## 7 ribs 3.534921
## 8 mostly 3.535849
## 9 phoenix 3.535948
## 10 yum 3.536134
#Positive Words
tail(word_avgstar,10)
## Group.1 x
## 806 employees 3.749562
## 807 fãr 3.755179
## 808 show 3.758261
## 809 mix 3.759599
## 810 excited 3.761006
## 811 takeout 3.761484
## 812 taking 3.763085
## 813 tell 3.779635
## 814 split 3.780822
## 815 spice 3.788091
OBSERVATION: This method of finding the top positive and negative words may not be accurate, since it ignores how much evidence lies behind each word's average, i.e. whether the word occurs frequently not only within a document but also across many documents and businesses. An alternative approach is therefore carried out below.
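To see why, consider a toy illustration (the counts here are made up): a word that happens to appear in only two 5-star reviews gets a perfect average while carrying far less evidence than a word spread over hundreds of reviews.
#Illustration: average stars is an unreliable signal for rare words
rare_word_stars   <- c(5, 5)                 #word seen in just 2 reviews
common_word_stars <- rep(c(1, 3, 4, 5), 250) #word seen in 1,000 reviews
mean(rare_word_stars)   #5.00 - looks strongly "positive", but unreliable
mean(common_word_stars) #3.25 - a far stabler estimate
This motivates the frequency filter applied below (words in at least 100 reviews and at least 5 businesses).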
review_words <- reviews_df %>%
mutate_all(as.character)
review_words_counted <- review_words %>%
count(review_id, business_id, stars, term) %>%
ungroup()
#review_words_counted
word_summaries <- review_words_counted %>%
group_by(term) %>%
summarize(businesses = n_distinct(business_id),
reviews = n(),
uses = sum(n),
average_stars = mean(as.numeric(stars))) %>%
ungroup()
#word_summaries
#Words present in at least 100 reviews and at least 5 businesses
word_summaries_filtered <- word_summaries %>%
filter(reviews >= 100, businesses >= 5)
#word_summaries_filtered
#Positive words - from the filtered list
word_summaries_filtered %>%
arrange(desc(average_stars))
## # A tibble: 815 x 5
## term businesses reviews uses average_stars
## <chr> <int> <int> <int> <dbl>
## 1 spice 471 571 571 3.788091
## 2 split 430 511 511 3.780822
## 3 tell 901 1316 1316 3.779635
## 4 taking 546 726 726 3.763085
## 5 takeout 462 566 566 3.761484
## 6 excited 503 636 636 3.761006
## 7 mix 472 599 599 3.759599
## 8 show 458 575 575 3.758261
## 9 fãr 427 531 531 3.755179
## 10 employees 455 571 571 3.749562
## # ... with 805 more rows
#Negative words - from filtered list
word_summaries_filtered %>%
arrange(average_stars)
## # A tibble: 815 x 5
## term businesses reviews uses average_stars
## <chr> <int> <int> <int> <dbl>
## 1 outstanding 516 632 632 3.514241
## 2 curry 655 855 855 3.520468
## 3 rude 571 714 714 3.522409
## 4 game 407 502 502 3.529880
## 5 glad 634 825 825 3.530909
## 6 entire 564 738 738 3.533875
## 7 ribs 519 630 630 3.534921
## 8 mostly 431 530 530 3.535849
## 9 phoenix 494 612 612 3.535948
## 10 yum 474 595 595 3.536134
## # ... with 805 more rows
#Positive and Negative words as per AFINN
#(newer tidytext releases no longer bundle AFINN in 'sentiments';
#there, use get_sentiments("afinn"), whose score column is named 'value')
AFINN <- sentiments %>%
filter(lexicon == "AFINN") %>%
select(term = word, afinn_score = score)
words_afinn <- word_summaries_filtered %>%
inner_join(AFINN)
words_afinn
## # A tibble: 94 x 6
## term businesses reviews uses average_stars afinn_score
## <chr> <int> <int> <int> <dbl> <int>
## 1 amazing 2326 4733 4733 3.635115 4
## 2 awesome 1621 2887 2887 3.634222 4
## 3 awful 417 505 505 3.540594 -3
## 4 bad 1974 3860 3860 3.664508 -3
## 5 beautiful 596 792 792 3.705808 3
## 6 best 2982 7471 7471 3.657743 3
## 7 better 2402 5238 5238 3.660176 2
## 8 big 1696 3048 3048 3.668635 1
## 9 care 772 1101 1101 3.712080 2
## 10 chance 506 656 656 3.612805 2
## # ... with 84 more rows
ggplot(words_afinn, aes(afinn_score, average_stars, group = afinn_score)) +
geom_boxplot() +
xlab("AFINN Sentiment Score of word") +
ylab("Average Stars of Reviews with this Word")
mid<-mean(words_afinn$afinn_score)
ggplot(words_afinn, aes(x=reviews, y=average_stars,color=afinn_score)) +
geom_point()+
scale_color_gradient2(midpoint=mid, low="red",high="dark blue", space ="Lab" )+
geom_text(aes(label=term ,hjust=0, vjust=0),size=5)
OBSERVATION:
- The graph illustrates the kinds of words (positive or negative) that occur frequently in reviews rated low to high
- The density of positive words begins to increase for reviews rated higher than 3 stars
Using the dictionary-based positive and negative terms to predict the sentiment of a review. With the AFINN dictionary, an aggregate sentiment score is obtained for each review as the mean AFINN score of its matched terms.
Based on the scatter plot created earlier, an average score of 3 is used as the threshold to classify reviews as positive or negative in the training set (since the density of positive words begins to increase for reviews rated higher than 3 stars). Reviews with a sentiment score >= 3 are labelled positive and reviews with a sentiment score < 3 are labelled negative.
reviews_sentiment <- review_words %>%
inner_join(AFINN, by = "term")
#reviews_sentiment
#Average Sentiment score for each review using AFINN dictionary
reviews_score <- reviews_sentiment%>%
group_by(review_id, stars) %>%
summarize(sentiment = mean(afinn_score))
#reviews_score
reviews_score <- reviews_score %>%
right_join(subset(reviews, select=c("review_id","text","document")), by = "review_id")
#Classifying review as positive and negative based on sentiment score
reviews_score <- mutate(reviews_score,
sentiment_cat = ifelse(sentiment >= 3,"Pos","Neg"))
#reviews_score
Predicting review sentiment from these aggregated sentiment scores and evaluating model performance.
#Creating test and train corpus
revs.train <- revs[1:35000]
revs.test <- revs[35001:50000]
#Creating test and train datasets, ordered by document index so that rows
#align with the corpus split above
reviews_score <- arrange(reviews_score, document)
df.train <- reviews_score[1:35000,]
df.test <- reviews_score[35001:50000,]
fivefreq <- findFreqTerms(dtm, 5)
length(fivefreq)
## [1] 815
dtm.train <- DocumentTermMatrix(revs.train, control=list(dictionary = fivefreq))
dim(dtm.train)
## [1] 35000 815
dtm.test <- DocumentTermMatrix(revs.test, control=list(dictionary = fivefreq))
dim(dtm.test)
## [1] 15000 815
Training and testing a Naive Bayes classifier
#Convert term counts to a presence/absence factor; e1071's naiveBayes then
#treats each term as a categorical ("No"/"Yes") feature
convert_count_to_boolean <- function(x) {
y <- ifelse(x > 0, 1, 0)
y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
y
}
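A quick sanity check of the conversion on a toy two-term count matrix (toy_counts is illustrative only):
#Illustrative check: nonzero counts become "Yes", zeros become "No"
toy_counts <- matrix(c(0, 2, 1, 0), nrow = 2,
                     dimnames = list(NULL, c("good", "slow")))
apply(toy_counts, 2, convert_count_to_boolean)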
trainNB <- apply(dtm.train, 2, convert_count_to_boolean)
testNB <- apply(dtm.test, 2, convert_count_to_boolean)
classifier <- naiveBayes(trainNB, as.factor(df.train$sentiment_cat), laplace = 0)
pred <- predict(classifier, newdata=testNB)
table("Predictions"= pred, "Actual" = as.factor(df.test$sentiment_cat))
## Actual
## Predictions Neg Pos
## Neg 9029 1047
## Pos 2851 1391
Model Performance
conf.mat <- confusionMatrix(pred, as.factor(df.test$sentiment_cat))
conf.mat
## Confusion Matrix and Statistics
##
## Reference
## Prediction Neg Pos
## Neg 9029 1047
## Pos 2851 1391
##
## Accuracy : 0.7278
## 95% CI : (0.7204, 0.735)
## No Information Rate : 0.8297
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2555
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7600
## Specificity : 0.5705
## Pos Pred Value : 0.8961
## Neg Pred Value : 0.3279
## Prevalence : 0.8297
## Detection Rate : 0.6306
## Detection Prevalence : 0.7037
## Balanced Accuracy : 0.6653
##
## 'Positive' Class : Neg
##
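The headline figures can be re-derived by hand from the confusion matrix. Note that the four cells sum to 14,318 rather than 15,000: test reviews containing no AFINN term receive a missing sentiment label and are dropped. A quick verification sketch (caret takes 'Neg', the first factor level, as the positive class, as noted in the output above):
#Re-deriving the reported metrics from the confusion matrix counts
tp <- 9029; fn <- 2851 #actual 'Neg' reviews predicted Neg / Pos
fp <- 1047; tn <- 1391 #actual 'Pos' reviews predicted Neg / Pos
n <- tp + fn + fp + tn #14318 test reviews with a non-missing label
(tp + tn) / n          #Accuracy, ~0.7278
tp / (tp + fn)         #Sensitivity, ~0.7600
tn / (tn + fp)         #Specificity, ~0.5705
(tp + fn) / n          #Prevalence of 'Neg', ~0.8297
Since the No Information Rate (0.8297) exceeds the accuracy (0.7278), the classifier does not beat always predicting 'Neg'; given the class imbalance, the balanced accuracy of 0.6653 gives a fairer picture of performance.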