Scraping Yelp.com

Scrape comments from yelp.com

This R Markdown document scrapes reviews from the yelp.com website. We are going to fetch the date, rating, and text of each review for the Dallas branch of a restaurant.

library(rvest, warn.conflicts = FALSE, quietly = TRUE)
library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)

# Base URL of the restaurant's Yelp page. It already carries a `?osq=` query
# string, so further parameters must be appended with `&`.
url <- "https://www.yelp.com/biz/chipotle-mexican-grill-dallas-9?osq=Chipotle+Mexican+Grill"

# The page is intentionally not downloaded here: the scraping loop fetches
# every page (the first one included), so reading it now would only issue a
# duplicate request.

# Review offsets of the result pages: 0, 10, ..., 80
# (9 pages with 10 reviews each at the time of writing).
pageSequence <- seq(from = 0, to = 80, by = 10)

# Accumulators filled page by page by the scraping loop.
review_date_all <- c()
review_rating_all <- c()
review_text_all <- c()

Now let’s fetch the data from every page with a loop.

# Walk over every result page, pull out the date, rating and text of each
# review, and append them to the three accumulator vectors.
for (i in pageSequence) {
  # The first page is the bare URL; later pages are addressed by appending the
  # review offset as `&start=<offset>`.
  page_url <- if (i == 0) url else paste0(url, "&start=", i)
  page <- read_html(page_url)

  # review date: the matched elements carry other text too, so keep only the
  # strings that look like m/d/yyyy dates
  review_dates <- page %>%
    html_elements(xpath = "//*[@class=' css-chan6m']") %>%
    html_text() %>%
    str_subset("^\\d+[/]\\d+[/]\\d{4}$")

  # review rating: read the aria-label of the rating element inside each
  # review container and strip the trailing words so only the number remains
  review_ratings <- page %>%
    html_elements(xpath = "//div[starts-with(@class, ' review')]") %>%
    html_elements(xpath = ".//div[contains(@aria-label, 'rating')]") %>%
    html_attr("aria-label") %>%
    str_remove_all(" star rating") %>%
    as.numeric()

  # review text
  review_text <- page %>%
    html_elements(xpath = "//p[starts-with(@class, 'comment')]") %>%
    html_text()

  # The three vectors come from independent selectors and are later combined
  # column-wise, so they must describe the same reviews in the same order.
  # Fail here, naming the page, instead of building misaligned rows.
  n_found <- c(
    length(review_dates),
    length(review_ratings),
    length(review_text)
  )
  if (length(unique(n_found)) != 1) {
    stop(
      "Page at offset ", i, " yielded ", n_found[1], " dates, ",
      n_found[2], " ratings and ", n_found[3],
      " texts; the selectors no longer line up.",
      call. = FALSE
    )
  }

  review_date_all <- c(review_date_all, review_dates)
  review_rating_all <- c(review_rating_all, review_ratings)
  review_text_all <- c(review_text_all, review_text)
}

Let’s build a data frame.

# Combine the three scraped vectors into one table, one row per review.
df <- data.frame(
  Date = review_date_all,
  Rating = review_rating_all,
  Text = review_text_all
)

# The dates were scraped as month/day/year strings; parse them into Date values.
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")

# Preview table: same dates and ratings, with the text cut to its first
# 28 characters so each row fits on one line.
dff <- data.frame(df$Date, df$Rating, substring(df$Text, 1, 28))
head(dff)

##      df.Date df.Rating    substring.df.Text..1..28.
## 1 2021-11-29         5 I was at the Apple store and
## 2 2022-08-05         2 Went at peak lunch time. Not
## 3 2022-11-14         1 This chipotle is royally slo
## 4 2022-09-28         1 Write this in hopes it just 
## 5 2022-10-24         1 Went to this chipotle at 8:4
## 6 2022-09-28         1 Have placed maybe 10-15 mobi

Now it is time to run some sentiment analysis on these comments.

library(sentimentr, warn.conflicts = FALSE, quietly = TRUE)

some_text <- df$Text

# Split the reviews into sentences explicitly before scoring, as the
# sentimentr documentation recommends over passing raw character vectors.
# The call is namespace-qualified because other text packages export a
# function with the same name.
result <- sentiment(sentimentr::get_sentences(some_text))
# plot(result)
head(result)

##    element_id sentence_id word_count   sentiment
## 1:          1           1         21 -0.05455447
## 2:          1           2          6  0.00000000
## 3:          1           3          6  0.00000000
## 4:          1           4         26  0.00000000
## 5:          1           5          8  0.08838835
## 6:          1           6         10  0.00000000

Finally, we can draw a plot for this analysis using the syuzhet package.

library(syuzhet, warn.conflicts = FALSE, quietly = TRUE)

# Make sure the review text is a plain character vector before scoring.
review <- as.character(df$Text)

# One row per review with a count for each NRC category.
s <- get_nrc_sentiment(review)
head(s)

##   anger anticipation disgust fear joy sadness surprise trust negative positive
## 1     1            4       1    1   1       1        0     1        2        3
## 2     0            1       1    0   1       0        0     1        1        1
## 3     0            1       0    1   1       1        0     2        2        2
## 4     1            1       0    3   1       2        0     0        3        2
## 5     1            3       1    1   1       1        0     1        2        2
## 6     0            3       0    0   0       0        0     0        1        1

# Sum every NRC category over all reviews and draw one bar per category.
# The palette size follows the number of columns in `s` rather than a
# hard-coded count.
barplot(
  colSums(s),
  col = rainbow(ncol(s)),
  ylab = 'Count',
  main = 'Sentimental scores of restaurant'
)

Source

Scraping Yelp.com

Tural Naghi

2022-12-06

Scrape comments from yelp.com