## [1] "English_United States.1252"

Import and process data

# Load the review extract and coerce each column to its analysis type.
review <- read.csv("D:/Users/24772/Desktop/yelp_review_2015.csv")
review[["stars"]] <- as.factor(review[["stars"]])
review[["date"]] <- as.Date(review[["date"]])
# Vote counts are coerced to numeric; entries that are not numbers become NA.
review[c("useful", "funny", "cool")] <- lapply(
    review[c("useful", "funny", "cool")],
    as.numeric
)
# Keep only the first nine columns.
review <- review[, seq_len(9)]
# Restrict to reviews dated within 2015 (both bounds inclusive).
startday <- as.Date("2015-01-01")
endday <- as.Date("2015-12-31")
in_window <- review$date >= startday & review$date <= endday
# which() drops rows whose date is NA instead of returning all-NA rows.
review <- review[which(in_window), ]
summary(review)
##   review_id           user_id          business_id            stars       
##  Length:911299      Length:911299      Length:911299      5      :408409  
##  Class :character   Class :character   Class :character   4      :194088  
##  Mode  :character   Mode  :character   Mode  :character   1      :138427  
##                                                           3      : 96358  
##                                                           2      : 74017  
##                                                                  :     0  
##                                                           (Other):     0  
##       date                text               useful            funny        
##  Min.   :2015-01-01   Length:911299      Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:2015-04-09   Class :character   1st Qu.:  0.000   1st Qu.:  0.000  
##  Median :2015-07-08   Mode  :character   Median :  0.000   Median :  0.000  
##  Mean   :2015-07-05                      Mean   :  1.347   Mean   :  0.461  
##  3rd Qu.:2015-09-29                      3rd Qu.:  2.000   3rd Qu.:  0.000  
##  Max.   :2015-12-31                      Max.   :782.000   Max.   :511.000  
##                                          NA's   :128       NA's   :114      
##       cool         
##  Min.   :  0.0000  
##  1st Qu.:  0.0000  
##  Median :  0.0000  
##  Mean   :  0.5301  
##  3rd Qu.:  0.0000  
##  Max.   :155.0000  
##  NA's   :187

There are 911,299 rows and 9 variables in the review dataset, including both numerical and categorical variables. Some variables have missing values, but these account for only a small proportion of the rows.

EDA

# Daily number of reviews for each star rating.
# `.groups = "drop"` returns an ungrouped table, so the leftover `date`
# grouping (and dplyr's regrouping message) does not leak into later steps.
stars_count <- review %>%
    group_by(date, stars) %>%
    summarise(Frequency = n(), .groups = "drop")



# Five colours from the "RdYlBu" Brewer palette, used for the star ratings.
mycolor <- brewer.pal(5, "RdYlBu")
# One line per star rating showing its daily review count.
ggplot(
    stars_count,
    aes(x = date, y = Frequency, group = stars, color = stars)
) +
    geom_line() +
    scale_color_manual("stars", values = mycolor) +
    labs(x = "Date") +
    theme_stata()

In general, five-star reviews account for the largest proportion, followed by four-star reviews.

# Daily mean of each vote type, ignoring missing vote counts.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
ufc_count <- review %>%
    group_by(date) %>%
    summarise(
        useful = mean(useful, na.rm = TRUE),
        funny = mean(funny, na.rm = TRUE),
        cool = mean(cool, na.rm = TRUE)
    )


# Shared scaffold for the three daily-mean plots: date on the x axis, Stata theme.
daily_base <- ggplot(ufc_count, aes(x = date)) + theme_stata()

# One line plot per vote type; only the y variable and the labels differ.
p1 <- daily_base +
    geom_line(aes(y = useful)) +
    labs(x = "Date", y = "Mean Useful", title = "Daily Mean Useful")
p2 <- daily_base +
    geom_line(aes(y = funny)) +
    labs(x = "Date", y = "Mean Funny", title = "Daily Mean Funny")
p3 <- daily_base +
    geom_line(aes(y = cool)) +
    labs(x = "Date", y = "Mean Cool", title = "Daily Mean Cool")

# Stack the three plots vertically.
grid.arrange(grobs = list(p1, p2, p3), nrow = 3)

Text Mining

Total reviews

To use the tm package, we first transform the dataset into a corpus:

# Draw a reproducible random sample of 10% of the review rows.
set.seed(777)
n_reviews <- nrow(review)
s1 <- sample(n_reviews, 0.1 * n_reviews)
reviews <- review[s1, ]
# Wrap the sampled review texts in a tm corpus.
review_corpus <- Corpus(VectorSource(reviews$text))

Next we normalize the texts in the reviews using a series of pre-processing steps:

  1. Switch to lower case

  2. Remove numbers

  3. Remove punctuation marks and stopwords

  4. Remove extra whitespaces

# Normalise the corpus; the steps run in the order listed.
review_corpus <- review_corpus %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeWords, c("the", "and", stopwords("english"))) %>%
    tm_map(stripWhitespace)

After the above transformations the first review looks like

# Print the first document of the cleaned corpus to check the result.
inspect(review_corpus[1])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] ordered pick location arrived minutes later even started make order yet told phone take minutes ready liars blamed rush even though people joking around store avoid location joke

To analyze the textual data, we use a Document-Term Matrix (DTM) representation: documents as the rows, terms/words as the columns, and the frequency of each term in each document as the entries. Because the number of unique words in the corpus is large, the dimension of the DTM can be large.

# Build the document-term matrix from the cleaned corpus.
review_dtm <- DocumentTermMatrix(review_corpus)

To reduce the dimension of the DTM, we remove the least frequent terms, keeping only terms whose sparsity is below 0.99 (i.e., terms that appear in at least about 1% of the documents).

# Drop terms whose sparsity exceeds the 0.99 threshold, then print a summary.
review_dtm2 <- removeSparseTerms(review_dtm, sparse = 0.99)
review_dtm2
## <<DocumentTermMatrix (documents: 91129, terms: 822)>>
## Non-/sparse entries: 2607732/72300306
## Sparsity           : 97%
## Maximal term length: 13
## Weighting          : term frequency (tf)

Now we can draw a simple word cloud of all the sampled reviews.

# Show the first ten terms returned by findFreqTerms() for the reduced DTM.
findFreqTerms(review_dtm2)[1:10]
##  [1] "around"   "arrived"  "avoid"    "even"     "later"    "location"
##  [7] "make"     "minutes"  "order"    "ordered"
# Fix the seed before drawing the word cloud.
set.seed(2)
# Palette function interpolating the 12 "Paired" Brewer colours.
col <- colorRampPalette(brewer.pal(12, "Paired"))
# Total count of each term across all documents, most frequent first.
freq <- data.frame(sort(colSums(as.matrix(review_dtm2)), decreasing = TRUE))
wordcloud(
    words = rownames(freq),
    freq = freq[, 1],
    max.words = 50,
    colors = col(50)
)

One- and five-star reviews

wordcloud

# Strip digits first, then punctuation, from the sampled review texts.
reviews$text <- removePunctuation(removeNumbers(reviews$text))
# Split each text on single spaces into one row per token, stored in `word`;
# the original `text` column is kept (drop = FALSE).
reviews <- unnest_tokens(
    reviews,
    output = word,
    input = text,
    token = stringr::str_split,
    pattern = " ",
    drop = FALSE
)

# Infix "not in": TRUE where `x` has no match in `table`.
`%!in%` <- function(x, table) !(x %in% table)
# Tokens to discard: a few hand-picked words plus tm's English stop words.
drop_words <- c("the", "i", "and", stopwords("english"))
# Remove empty tokens and stop words.
reviews <- reviews %>%
    filter(word != "", word %!in% drop_words)

Draw word clouds of the one-star reviews and the five-star reviews, respectively.

# Word cloud of the 30 most frequent words in one-star reviews.
# `sort = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
reviews %>%
    filter(stars == 1) %>%
    count(word, sort = TRUE) %>%
    top_n(30, wt = n) %>%
    wordcloud2::wordcloud2()
# Same for five-star reviews.
reviews %>%
    filter(stars == 5) %>%
    count(word, sort = TRUE) %>%
    top_n(30, wt = n) %>%
    wordcloud2::wordcloud2()

TF-IDF

# TF-IDF, treating each star rating as one "document".
# Word counts per star rating, most frequent first.
reviews_stars <- reviews %>%
    count(stars, word, sort = TRUE)
# Total number of tokens per star rating.
total_reviews_stars <- reviews_stars %>%
    group_by(stars) %>%
    dplyr::summarize(total = sum(n))
# The join key is named explicitly so the join does not depend on whichever
# columns the two tables happen to share (and no "Joining with" message is
# emitted). `stars` is the only column the two tables have in common.
reviews_stars <- left_join(reviews_stars, total_reviews_stars, by = "stars")
review_tf_idf_stars <- reviews_stars %>%
    bind_tf_idf(word, stars, n)
# Top 20 words by tf-idf for one-star and five-star reviews, one panel each.
review_tf_idf_stars %>%
    filter(stars == 1 | stars == 5) %>%
    group_by(stars) %>%
    slice_max(tf_idf, n = 20) %>%
    ungroup() %>%
    ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = stars)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~stars, ncol = 2, scales = "free") +
    labs(x = "tf-idf", y = NULL)

Sentiment Analysis

# Strip digits first, then punctuation, before scoring.
star$text <- removePunctuation(removeNumbers(star$text))

# Score every review in a single call instead of one sentiment() call per row
# (the old `1:length()` loop also misbehaved on empty input and grew a list
# column element by element).
# NOTE(review): assumes sentiment() is sentimentr::sentiment, which returns
# one row per sentence with `element_id` and `sentiment` columns - confirm.
sentence_scores <- sentiment(star$text)
# match() picks the first row for each review, i.e. the same row the previous
# `[1, 4]` indexing selected.
first_row <- match(seq_along(star$text), sentence_scores$element_id)
star$sentiment <- as.numeric(sentence_scores$sentiment[first_row])

# The previous cut-offs (`< 0.2` / `> 0.2`) labelled a review "Neutral" only
# when its score was exactly 0.2. A symmetric band around zero is used instead.
# NOTE(review): the +/-0.2 band is an assumption about the intended
# thresholds - confirm.
star$polarity_level <- ifelse(
    star$sentiment < -0.2, "Negative",
    ifelse(star$sentiment > 0.2, "Positive", "Neutral")
)
# Overlaid, semi-transparent histograms of sentiment score by star rating.
ggplot(star, aes(x = sentiment, group = stars, fill = stars)) +
    geom_histogram(alpha = 0.6, position = "identity") +
    theme_stata()

# Number of reviews per polarity label, one panel per star rating.
ggplot(star, aes(x = polarity_level)) +
    geom_bar() +
    facet_wrap(~stars) +
    theme_stata()