## [1] "English_United States.1252"
# Load the Yelp review export and coerce each column to its analysis type.
# NOTE(review): the absolute path is machine-specific; adjust before re-running.
review <- read.csv("D:/Users/24772/Desktop/yelp_review_2015.csv")
review$stars <- as.factor(review$stars)
review$date <- as.Date(review$date)
# Vote counts are coerced to numeric; entries that are not numbers become NA.
review$useful <- as.numeric(review$useful)
review$funny <- as.numeric(review$funny)
review$cool <- as.numeric(review$cool)
# Keep only the first nine columns.
review <- review[, 1:9]
# Restrict to reviews dated within 2015. which() also discards rows whose
# date is NA, which a bare logical index would keep as all-NA rows.
startday <- as.Date("2015-01-01")
endday <- as.Date("2015-12-31")
review <- review[which(review$date >= startday & review$date <= endday), ]
# The factor was built before filtering, so it can still carry levels that no
# longer occur in any row; drop them so summaries and plots only show ratings
# that are actually present.
review$stars <- droplevels(review$stars)
summary(review)
## review_id user_id business_id stars
## Length:911299 Length:911299 Length:911299 5 :408409
## Class :character Class :character Class :character 4 :194088
## Mode :character Mode :character Mode :character 1 :138427
## 3 : 96358
## 2 : 74017
## : 0
## (Other): 0
## date text useful funny
## Min. :2015-01-01 Length:911299 Min. : 0.000 Min. : 0.000
## 1st Qu.:2015-04-09 Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Median :2015-07-08 Mode :character Median : 0.000 Median : 0.000
## Mean :2015-07-05 Mean : 1.347 Mean : 0.461
## 3rd Qu.:2015-09-29 3rd Qu.: 2.000 3rd Qu.: 0.000
## Max. :2015-12-31 Max. :782.000 Max. :511.000
## NA's :128 NA's :114
## cool
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 0.5301
## 3rd Qu.: 0.0000
## Max. :155.0000
## NA's :187
There are 911,299 rows and 9 variables in the review dataset, including both numerical and categorical variables. Some variables have missing values, but these account for only a small proportion of the rows.
# Number of reviews per day and star rating. `.groups = "drop"` returns an
# ungrouped result instead of leaving it grouped by `date` (and silences the
# regrouping message summarise() prints otherwise).
stars_count <- review %>%
  group_by(date, stars) %>%
  summarise(Frequency = n(), .groups = "drop")
# Five colours from the red-yellow-blue Brewer palette, passed to the manual
# colour scale below.
mycolor <- brewer.pal(5, "RdYlBu")
# One line per star rating: daily review count over time.
ggplot(stars_count, aes(x = date, y = Frequency, group = stars, color = stars)) +
  geom_line() +
  theme_stata() +
  labs(x = "Date") +
  scale_color_manual("stars", values = mycolor)
In general, five-star reviews account for the largest proportion, followed by four-star reviews.
# Daily mean of the useful / funny / cool vote counts, ignoring missing values.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
ufc_count <- review %>%
  group_by(date) %>%
  summarise(
    useful = mean(useful, na.rm = TRUE),
    funny = mean(funny, na.rm = TRUE),
    cool = mean(cool, na.rm = TRUE)
  )
# One time-series panel per vote type.
p1 <- ggplot(ufc_count, aes(x = date, y = useful)) +
  geom_line() +
  theme_stata() +
  labs(x = "Date", y = "Mean Useful") +
  ggtitle("Daily Mean Useful")
p2 <- ggplot(ufc_count, aes(x = date, y = funny)) +
  geom_line() +
  theme_stata() +
  labs(x = "Date", y = "Mean Funny") +
  ggtitle("Daily Mean Funny")
p3 <- ggplot(ufc_count, aes(x = date, y = cool)) +
  geom_line() +
  theme_stata() +
  labs(x = "Date", y = "Mean Cool") +
  ggtitle("Daily Mean Cool")
# Stack the three panels vertically.
grid.arrange(p1, p2, p3, nrow = 3)
To use the tm package, we first draw a 10% random sample of the reviews and transform it into a corpus:
# Fix the RNG seed so the subsample below is reproducible.
set.seed(777)
# Row indices for a sample of one tenth of the reviews.
s1 <- sample(nrow(review), size = 0.1 * nrow(review))
reviews <- review[s1, ]
# Wrap the sampled review texts in a tm corpus.
review_corpus <- Corpus(VectorSource(reviews$text))
Next we normalize the texts in the reviews using a series of pre-processing steps:
Switch to lower case
Remove numbers
Remove punctuation marks and stopwords
Remove extra whitespaces
# Normalise the corpus in a fixed order: lower-case, strip digits, strip
# punctuation, remove stop words, then collapse the leftover whitespace.
review_corpus <- review_corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, c("the", "and", stopwords("english"))) %>%
  tm_map(stripWhitespace)
After the above transformations the first review looks like
# Print the first document of the cleaned corpus to check the preprocessing.
inspect(review_corpus[1])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
##
## [1] ordered pick location arrived minutes later even started make order yet told phone take minutes ready liars blamed rush even though people joking around store avoid location joke
To analyze the textual data, we use a Document-Term Matrix (DTM) representation: documents as the rows, terms/words as the columns, and the frequency of each term in each document as the entries. Because the corpus contains a large number of unique words, the dimension of the matrix can be large.
# Build the document-term matrix from the cleaned corpus, using the
# DocumentTermMatrix() defaults (no control list is supplied).
review_dtm <- DocumentTermMatrix(review_corpus)
To reduce the dimension of the DTM, we can remove the less frequent terms such that the sparsity is less than 0.99.
# Prune terms with removeSparseTerms() at a sparsity threshold of 0.99, then
# print the reduced matrix summary.
review_dtm2 <- removeSparseTerms(review_dtm, sparse = 0.99)
review_dtm2
## <<DocumentTermMatrix (documents: 91129, terms: 822)>>
## Non-/sparse entries: 2607732/72300306
## Sparsity : 97%
## Maximal term length: 13
## Weighting : term frequency (tf)
Now we can draw a simple word cloud of all the sampled reviews.
# Show the first ten terms returned by findFreqTerms() for the reduced matrix
# (no frequency bounds are supplied, so the function defaults apply).
findFreqTerms(review_dtm2)[1:10]
## [1] "around" "arrived" "avoid" "even" "later" "location"
## [7] "make" "minutes" "order" "ordered"
# Seed the RNG before wordcloud() is called so the figure is reproducible.
set.seed(2)
# Palette function interpolating the 12-colour "Paired" Brewer palette.
col <- colorRampPalette(brewer.pal(12, "Paired"))
# Total count of every retained term across all documents, largest first;
# the terms end up as the row names of `freq`.
freq <- data.frame(sort(colSums(as.matrix(review_dtm2)), decreasing = TRUE))
wordcloud(
  words = rownames(freq),
  freq = freq[, 1],
  max.words = 50,
  colors = col(50)
)
# Strip digits, then punctuation, from the raw review text.
reviews$text <- reviews$text %>%
  removeNumbers() %>%
  removePunctuation()
# One row per token, splitting on single spaces; the original text column is
# kept alongside each token (drop = FALSE).
reviews <- unnest_tokens(
  reviews, word, text,
  token = stringr::str_split, pattern = " ", drop = FALSE
)
# Negated membership operator.
`%!in%` <- Negate(`%in%`)
# Discard empty tokens and stop words.
reviews <- reviews %>%
  filter(word != "" & word %!in% c("the", "i", "and", stopwords("english")))
Draw word clouds for the one-star and the five-star reviews, respectively.
# Word cloud of the 30 most frequent words in one-star reviews.
# `TRUE` replaces the reassignable `T`; slice_max() replaces the superseded
# top_n() and matches the TF-IDF section. Ties at the cut-off are still kept.
reviews %>%
  filter(stars == 1) %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 30) %>%
  wordcloud2::wordcloud2()
# Same for five-star reviews.
reviews %>%
  filter(stars == 5) %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 30) %>%
  wordcloud2::wordcloud2()
# TF-IDF
# Word counts within each star rating; each rating is treated as one
# "document" in the tf-idf computation below.
reviews_stars <- reviews %>%
  count(stars, word, sort = TRUE)
# Total number of retained tokens per star rating.
total_reviews_stars <- reviews_stars %>%
  group_by(stars) %>%
  dplyr::summarize(total = sum(n))
# Join on the explicit key so the match does not depend on which column names
# the two tables happen to share (and no "Joining with" message is printed).
reviews_stars <- left_join(reviews_stars, total_reviews_stars, by = "stars")
review_tf_idf_stars <- reviews_stars %>%
  bind_tf_idf(word, stars, n)
# Top 20 words by tf-idf for one-star and five-star reviews, one facet each.
review_tf_idf_stars %>%
  filter(stars == 1 | stars == 5) %>%
  group_by(stars) %>%
  slice_max(tf_idf, n = 20) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = stars)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~stars, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
# Strip digits and punctuation from the review text before scoring.
# NOTE(review): `star` is not created in this part of the file; it is assumed
# to be a data frame with `text` and `stars` columns -- confirm.
star$text <- removeNumbers(star$text)
star$text <- removePunctuation(star$text)
# Score each review separately and keep row 1, column 4 of the result
# (presumably the sentiment score returned by sentimentr::sentiment() --
# confirm). vapply() replaces the 1:length() loop: it is safe when `star` has
# zero rows and yields a plain numeric vector rather than a list column.
star$sentiment <- vapply(
  star$text,
  function(txt) as.numeric(sentiment(txt)[1, 4]),
  numeric(1),
  USE.NAMES = FALSE
)
# Symmetric cut-offs around zero. The previous lower bound of +0.2 labelled
# every score below 0.2 (including mildly positive ones) "Negative" and left
# "Neutral" reachable only for a score of exactly 0.2.
star$polarity_level <- ifelse(
  star$sentiment < -0.2, "Negative",
  ifelse(star$sentiment > 0.2, "Positive", "Neutral")
)
# Overlaid, semi-transparent histograms of the sentiment score, one per rating.
ggplot(star, aes(x = sentiment, group = stars, fill = stars)) +
  geom_histogram(position = "identity", alpha = 0.6) +
  theme_stata()
# Number of reviews in each polarity class, one facet per star rating.
ggplot(star, aes(x = polarity_level)) +
  geom_bar() +
  theme_stata() +
  facet_wrap(~stars)