This R Markdown document scrapes reviews from the yelp.com website. We are going to fetch the date, rating, and text of each review for the Dallas branch of a restaurant.
# rvest provides read_html()/html_elements(); tidyverse supplies %>% and the
# str_* helpers used further down.
library(rvest, warn.conflicts = FALSE, quietly = TRUE)
library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)
# Base URL of the business page whose reviews are scraped.
url <- "https://www.yelp.com/biz/chipotle-mexican-grill-dallas-9?osq=Chipotle+Mexican+Grill"
# NOTE: the page is intentionally not downloaded here. The scraping loop
# fetches every page itself (including the first one, at offset 0), so an
# eager read_html(url) at this point would only duplicate a network request.
# Offsets passed as the `start` query parameter: 9 pages, 10 reviews per page.
pageSequence <- seq(from = 0, to = 80, by = 10)
# Accumulators that the scraping loop appends to.
review_date_all <- c()
review_rating_all <- c()
review_text_all <- c()
Now let’s fetch the data from every page using a loop.
# Visit every review page and collect review dates, ratings and texts.
# Per-page results go into preallocated lists and are flattened once after the
# loop, instead of growing three vectors on every iteration.
n_pages <- length(pageSequence)
dates_by_page <- vector("list", n_pages)
ratings_by_page <- vector("list", n_pages)
texts_by_page <- vector("list", n_pages)
for (k in seq_along(pageSequence)) {
  i <- pageSequence[k]
  # The first page is the bare URL; later pages add the `start` offset.
  if (i == 0) {
    page <- read_html(url)
  } else {
    page <- read_html(paste0(url, "&start=", i))
  }
  # review date: keep only the matched strings shaped like m/d/yyyy
  date_candidates <- page %>%
    html_elements(xpath = "//*[@class=' css-chan6m']") %>%
    html_text()
  review_dates <- date_candidates[
    str_detect(date_candidates, "^\\d+[/]\\d+[/]\\d{4}$")
  ]
  # review rating: read the aria-label inside each review container and strip
  # the trailing text so that only the number remains
  review_ratings <- page %>%
    html_elements(xpath = "//div[starts-with(@class, ' review')]") %>%
    html_elements(xpath = ".//div[contains(@aria-label, 'rating')]") %>%
    html_attr("aria-label") %>%
    str_remove_all(" star rating") %>%
    as.numeric()
  # review text
  review_text <- page %>%
    html_elements(xpath = "//p[starts-with(@class, 'comment')]") %>%
    html_text()
  # The three vectors are later combined column-wise, so unequal counts on a
  # page would either make data.frame() fail or misalign rows. Report the
  # offending page instead of failing later without context.
  n_found <- c(
    length(review_dates), length(review_ratings), length(review_text)
  )
  if (length(unique(n_found)) > 1) {
    warning(
      "Page at offset ", i, ": found ", n_found[1], " dates, ",
      n_found[2], " ratings and ", n_found[3], " texts; ",
      "the columns will not line up.",
      call. = FALSE
    )
  }
  dates_by_page[[k]] <- review_dates
  ratings_by_page[[k]] <- review_ratings
  texts_by_page[[k]] <- review_text
}
# Flatten the per-page results and append them to the accumulators.
review_date_all <- c(review_date_all, unlist(dates_by_page, use.names = FALSE))
review_rating_all <- c(
  review_rating_all, unlist(ratings_by_page, use.names = FALSE)
)
review_text_all <- c(review_text_all, unlist(texts_by_page, use.names = FALSE))
Let’s build a data frame from the collected vectors.
# Assemble the scraped vectors into one data frame with a row per review.
df <- data.frame(
  Date = review_date_all,
  Rating = review_rating_all,
  Text = review_text_all
)
# Dates were scraped as m/d/yyyy strings; convert the column to Date.
df[["Date"]] <- as.Date(df[["Date"]], format = "%m/%d/%Y")
# Compact preview: only the first 28 characters of each review text.
# Column names are left to data.frame()'s automatic naming.
dff <- data.frame(
  df$Date,
  df$Rating,
  substring(df$Text, 1, 28)
)
head(dff)
## df.Date df.Rating substring.df.Text..1..28.
## 1 2021-11-29 5 I was at the Apple store and
## 2 2022-08-05 2 Went at peak lunch time. Not
## 3 2022-11-14 1 This chipotle is royally slo
## 4 2022-09-28 1 Write this in hopes it just
## 5 2022-10-24 1 Went to this chipotle at 8:4
## 6 2022-09-28 1 Have placed maybe 10-15 mobi
Now it is time to run a sentiment analysis on these comments.
# Score the review texts with sentimentr; the result has one row per sentence
# (element_id / sentence_id / word_count / sentiment).
library(sentimentr, warn.conflicts = FALSE, quietly = TRUE)
some_text <- df[["Text"]]
result <- some_text %>%
  sentiment()
# plot(result)
head(result)
## element_id sentence_id word_count sentiment
## 1: 1 1 21 -0.05455447
## 2: 1 2 6 0.00000000
## 3: 1 3 6 0.00000000
## 4: 1 4 26 0.00000000
## 5: 1 5 8 0.08838835
## 6: 1 6 10 0.00000000
Finally, we can draw a plot for this analysis using the syuzhet package.
# Count emotion/polarity categories per review with syuzhet's
# get_nrc_sentiment().
library(syuzhet, warn.conflicts = FALSE, quietly = TRUE)
# Make sure the texts are passed as a plain character vector.
review <- as.character(df[["Text"]])
s <- review %>%
  get_nrc_sentiment()
head(s)
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 1 4 1 1 1 1 0 1 2 3
## 2 0 1 1 0 1 0 0 1 1 1
## 3 0 1 0 1 1 1 0 2 2 2
## 4 1 1 0 3 1 2 0 0 3 2
## 5 1 3 1 1 1 1 0 1 2 2
## 6 0 3 0 0 0 0 0 0 1 1
# Bar chart of the per-category totals summed over all reviews.
barplot(
  colSums(s),
  col = rainbow(10),
  ylab = "Count",
  main = "Sentimental scores of restaurant"
)