#import necessary library
library(tidyverse)
library(leaflet) # For mapping
library(wordcloud2) #For word cloud visualize
library(corrplot) # correlation chart
library(tm) # For text mining
library(SnowballC) #For text mining
library(wordcloud) # For word cloud visualize - interactive
library(caret)
library(caTools) # For sample split function
library(tidytext)
library(htmlwidgets)
library(splitstackshape)
library(devtools)
library(reshape2)
library(countrycode)
library(plotly)
library(igraph)
library(threejs)
library(network)
library(sna)
library(ggraph)
library(widyr)
library(Matrix)
library(xgboost)
library(dummies)
This dataset contains about 500,000 reviews of 1,493 hotels in 6 countries. It has 17 variables covering the positive and negative review text, word counts, the hotel’s average score, the reviewer’s score, and so on. The tag and address variables also hold useful information, so we derive new variables from those columns with text mining before starting the analysis.
# Read the raw review table and the country/continent lookup table.
hotel <- read.csv('Hotel_Reviews.csv')
continent <- read.csv('continent.csv')
# Quick inspection helpers, kept for interactive use:
# summary(hotel)
# str(hotel)

# Force the text columns to plain character vectors so they can be compared
# and tokenised as strings further down.
text_columns <- c("Positive_Review", "Negative_Review", "Tags", "Hotel_Name",
                  "Reviewer_Nationality")
for (text_column in text_columns) {
  hotel[[text_column]] <- as.character(hotel[[text_column]])
}

# Give every review a sequential identifier, stored as a factor.
hotel$review_ID <- as.factor(seq.int(nrow(hotel)))

# Replace the placeholder texts 'No Positive' / 'No Negative' with "".
no_positive_text <- hotel$Positive_Review == 'No Positive'
hotel[no_positive_text, 'Positive_Review'] <- ""
no_negative_text <- hotel$Negative_Review == 'No Negative'
hotel[no_negative_text, 'Negative_Review'] <- ""

# A nationality consisting of a single blank is relabelled "Unknown".
# sum(is.na(hotel$Reviewer_Nationality))
# sum(hotel$Reviewer_Nationality == " ")
blank_nationality <- hotel$Reviewer_Nationality == " "
hotel[blank_nationality, 'Reviewer_Nationality'] <- "Unknown"
reviewernationality <- unique(hotel$Reviewer_Nationality)
# Data cleaning
# Explore missing values
# for (i in 1:17) {
#   print(sum(is.na(hotel[, i])))
# }
# Rows with a missing value in column 16 or 17.
# is.na() on two columns yields a matrix, so which() on it would return
# linear indices (some beyond nrow(hotel)) and produce all-NA phantom rows.
# rowSums() keeps the test row-wise, so each affected review appears once.
# NOTE(review): columns 16:17 are presumably lat/lng - confirm against the CSV.
null <- hotel[rowSums(is.na(hotel[, 16:17])) > 0, ]
# One row per distinct hotel address among those incomplete rows.
nulllocation <- null %>% filter(!duplicated(null[, c('Hotel_Address')]))
# Making the country variable
hotel$Hotel_Address <- as.character(hotel$Hotel_Address)
# One row per distinct (lat, lng, address) combination.
location <- hotel %>% filter(!duplicated(hotel[, c('lat', 'lng', 'Hotel_Address')]))
# The country is taken to be the last space-separated token of the address.
# The addresses are split once (the previous loop re-split the whole vector on
# every iteration) and the result is built without growing a vector.
address_tokens <- strsplit(location$Hotel_Address, " ")
country <- vapply(
  address_tokens,
  function(tokens) if (length(tokens) == 0) NA_character_ else tail(tokens, 1),
  character(1)
)
location$Country <- country
# "... United Kingdom" ends in "Kingdom"; restore the full country name.
location$Country[location$Country == "Kingdom"] <- "United Kingdom"
location$Country <- as.factor(location$Country)
# Making new variables from the tag information
# Travel type: 'Business' if the tags mention Business, otherwise 'Leisure' if
# they mention Leisure, otherwise 'No type'.
# The flags are kept in local vectors rather than temporary columns, so no
# columns have to be removed afterwards by hard-coded position (which silently
# drops the wrong columns whenever the table does not have exactly 18 columns).
is_leisure <- grepl('Leisure', hotel$Tags)
is_business <- grepl('Business', hotel$Tags)
travel_type <- rep('No type', nrow(hotel))
travel_type[is_leisure] <- 'Leisure'
# Business is assigned last, so it wins when both tags are present.
travel_type[is_business] <- 'Business'
hotel$Travel_type <- travel_type
# With whom the guest travelled.
# Each tag pattern carries a numeric code (its position below); the codes of
# all matching patterns are summed and a sum of 1..6 is turned into a label.
# The sum lives in a local vector instead of six temporary columns, so nothing
# has to be dropped afterwards by hard-coded column position.
# NOTE(review): the coding is additive, so a review matching two patterns is
# mislabelled (e.g. Solo + Couple = 3 = 'Group') and sums outside 1..6 stay as
# their number in text form ("0", "7", ...). This is unchanged from before;
# confirm that the tags are mutually exclusive.
companion_patterns <- c('Solo', 'Couple', 'Group', 'young', 'older', 'friend')
companion_labels <- c('Solo', 'Couple', 'Group', 'Family w young',
                      'Family w older', 'W friends')
with_code <- rep(0, nrow(hotel))
for (k in seq_along(companion_patterns)) {
  with_code <- with_code + k * grepl(companion_patterns[k], hotel$Tags)
}
with_label <- as.character(with_code)
has_label <- with_code %in% seq_along(companion_labels)
with_label[has_label] <- companion_labels[with_code[has_label]]
hotel$With <- with_label
# Length of stay: the text found between "Stayed" and "night" in the tags,
# converted to a number (NA when the tags carry no such text).
nights_text <- str_extract(hotel$Tags, "(?<=Stayed).*(?=night)")
hotel$Stay <- as.numeric(nights_text)
## summary(hotel$stay)
# Mobile flag: 1 when the tags contain 'obile', else 0, stored as a factor.
mobile_flag <- ifelse(grepl('obile', hotel$Tags), 1, 0)
hotel$Mobile <- as.factor(mobile_flag)
#Convert reviewer's country to continent
# Translate the nationality text ('country.name') into an ISO3 code ('iso3c');
# the code is the join key for the continent lookup table.
hotel$Reviewer_country <- countrycode(hotel$Reviewer_Nationality,'country.name', 'iso3c')
#sum(is.na(hotel$Reviewer_country))
# Keep only columns 3 and 6 of the lookup: the 'alpha.3' key used below and,
# presumably, the continent name - TODO confirm against continent.csv.
continent <- continent[,c(3,6)]
# Left join (all.x=TRUE): every review is kept, matched or not.
# NOTE(review): merge() may return the rows in a different order than `hotel`.
hotel <- merge(hotel, continent, by.x="Reviewer_country", by.y='alpha.3', all.x=TRUE)
# Drop the first column of the merged table (the ISO3 join key).
hotel <- hotel[,-1]
# Rename column 23 - assumed to be the column contributed by the lookup table;
# this relies on `hotel` having exactly 23 columns at this point.
colnames(hotel)[23] <- "Reviewer_Continent"
# Remove wrong and incomplete information
hotel <- drop_na(hotel)
# Keep only reviews with a non-empty continent.
# A logical mask is used on purpose: hotel[-which(cond), ] returns ZERO rows
# when no row satisfies cond, because -integer(0) selects nothing.
# drop_na() above guarantees the comparison cannot yield NA.
hotel <- hotel[hotel$Reviewer_Continent != "", ]
# sum(is.na(hotel$Reviewer_Continent))
# sum(hotel$With == 0)
# str(location$Country)
# summary(location$Country)
# Attach the hotel's country to every review by looking its address up in
# `location`. match() does the lookup in one vectorised pass instead of
# scanning all reviews once per location; reviews whose address is not found
# keep the empty string.
# NOTE(review): if an address occurs more than once in `location`, the first
# occurrence is used; the country is derived from the address text, so every
# occurrence should carry the same value.
hotel$Country <- ""
address_row <- match(hotel$Hotel_Address, location$Hotel_Address)
has_match <- !is.na(address_row)
hotel$Country[has_match] <- as.character(location$Country[address_row[has_match]])
write.csv(hotel, file = "hotel_clean.csv", row.names = FALSE)
# str(hotel$Country)
# hotel$Country <- as.factor(hotel$Country)
# summary(hotel$Country)
# Explore Data Analysis
# Location: clustered markers, one per row of `location`, each with an HTML
# popup giving the hotel name, address and average score.
hotel_popup <- paste0(
  "<strong>Hotel Name:</strong>", "<br>", location$Hotel_Name, "<br>",
  "<strong>Address:</strong>", "<br>", location$Hotel_Address, "<br>",
  "<strong>Average Socre:</strong>", "<br>", location$Average_Score
)
# NOTE(review): confirm the Stamen.TonerLite tile provider still resolves with
# the installed leaflet version.
base_map <- addProviderTiles(leaflet(location), providers$Stamen.TonerLite)
map <- addMarkers(base_map, popup = hotel_popup,
                  clusterOptions = markerClusterOptions())
map
# Number of hotels per country (counts rows of `location`).
number_by_country <- ggplot(location, aes(x = Country)) +
  geom_bar(aes(fill = Country), alpha = 0.8) +
  scale_fill_brewer(palette = 'Blues') +
  ggtitle("Number of Hotels by Country")
number_by_country

# Number of reviews per country (counts rows of `hotel`).
number_review_country <- ggplot(hotel, aes(x = Country)) +
  geom_bar(aes(fill = Country), alpha = 0.8) +
  scale_fill_brewer(palette = 'Blues') +
  ggtitle("Number of Reviews by Country")
number_review_country
In this dataset, there are many more hotels in the United Kingdom than in the other countries, so the number of reviews shows a similar distribution.
# Distribution of the average review score, one value per distinct
# (hotel name, average score) pair.
dist_average_score <- ggplot(
  data = hotel %>% filter(!duplicated(hotel[, c('Hotel_Name', 'Average_Score')])),
  aes(x = Average_Score)
) +
  geom_histogram(fill = 'deepskyblue', color = 'deepskyblue4') +
  xlab("Average Review Score") +
  ylab("Counts") +
  ggtitle(label = "Distribution of average review score")
dist_average_score

# Distribution of the score given by each individual reviewer.
dist_reviewer_score <- ggplot(data = hotel, aes(x = Reviewer_Score)) +
  geom_histogram(fill = 'deepskyblue', color = 'deepskyblue4') +
  xlab("Reviewer's Score") +
  ylab("Counts") +
  ggtitle(label = "Distribution of reviewer's score")
dist_reviewer_score

# Distribution of the average review score by country.
# The x axis shows Average_Score, so it is labelled as such (it was previously
# labelled "Reviewer's Score").
# NOTE(review): this plot uses every review row, so each hotel is counted once
# per review, unlike the de-duplicated first plot.
dist_average_score_bycountry <- ggplot(data = hotel, aes(x = Average_Score)) +
  geom_histogram(fill = 'deepskyblue', color = 'deepskyblue4') +
  xlab("Average Review Score") +
  ylab("Counts") +
  facet_grid(~Country) +
  ggtitle(label = "Distribution of average review score by country")
dist_average_score_bycountry
# Number of reviews by hotel; the dashed blue line marks the mean across the
# rows of `location`. The x axis text is hidden because there is one bar per
# hotel name.
# The former scale_fill_brewer(palette='cadetblue1') was removed: 'cadetblue1'
# is a colour name rather than a palette, and no fill aesthetic is mapped in
# this plot, so the scale never affected the bars.
Number_of_Reivew_byhotel <- ggplot(
  data = location,
  aes(x = Hotel_Name, y = Total_Number_of_Reviews)
) +
  geom_bar(stat = 'identity', alpha = 0.8, fill = 'lightblue1', color = 'lightblue3') +
  ggtitle(label = "Number of Reviews by hotel") +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  geom_hline(yintercept = mean(location$Total_Number_of_Reviews),
             linetype = "dashed", color = "blue")
Number_of_Reivew_byhotel
Let’s look at the distribution of hotel scores. The first graph shows a left-skewed distribution of the hotels’ average scores. The distribution of the reviewers’ scores shows that people tend to give generous scores. Hotels in each country show a similar score distribution. The last plot shows the number of reviews for each hotel; the blue dashed line is the mean.
# Correlation of the numeric variables
# Per hotel: columns 2, 4 and 9 of `location`, relabelled for the plot.
a <- setNames(
  location[, c(2, 4, 9)],
  c("Additional score", "Average Score", "Total number of reviews")
)
hotel_level_cor <- cor(a)
corrplot(hotel_level_cor, method = "color", tl.srt = 20,
         tl.col = "black", addCoef.col = TRUE)

# Per review: columns 8, 11, 12 and 13 of `hotel`, relabelled for the plot.
b <- setNames(
  hotel[, c(8, 11, 12, 13)],
  c("Negative", "Positive", "Reviewer's given", "Score")
)
review_level_cor <- cor(b)
corrplot(review_level_cor, method = "color", tl.srt = 20, tl.cex = 0.8,
         tl.col = "black", addCoef.col = TRUE)
Let’s look at the relationships among the numeric variables. The first correlation plot is computed per hotel. It shows that the additional score count and the total number of reviews are highly correlated. (The additional score count is the number of scores given without a review.) This can be interpreted as people tending to go where others often go. The second correlation plot is computed per review. It shows that the negative-review word count is negatively correlated with the score.
# Distribution of the positive and negative review word counts
dist_positive <- ggplot(hotel, aes(x = Review_Total_Positive_Word_Counts)) +
  geom_histogram(color = 'lightblue3', fill = 'lightblue1') +
  xlab("positive word count") +
  ggtitle("Distribution of postive word count")
dist_positive

dist_negative <- ggplot(hotel, aes(x = Review_Total_Negative_Word_Counts)) +
  geom_histogram(color = 'lightblue3', fill = 'lightblue1') +
  xlab("Negative word count") +
  ggtitle("Disribution of negative word count")
dist_negative
There is no significant difference in the distribution of word count. However, it looks like people tend to write longer when they write positive reviews.
# Does the reviewer's nationality or continent affect the score?
# Mean reviewer score and number of reviews per nationality / per continent.
score_by_nationality <- hotel %>%
  group_by(Reviewer_Nationality) %>%
  summarise(mean_score = mean(Reviewer_Score), Count = n())
score_by_continent <- hotel %>%
  group_by(Reviewer_Continent) %>%
  summarise(mean_score = mean(Reviewer_Score), Count = n())

# Number of reviews per reviewer continent. The title now names the continent;
# it previously read "Number of Reviews by Country" although the bars are
# grouped by Reviewer_Continent.
number_reviewer_continent <- ggplot(data = hotel, aes(x = Reviewer_Continent)) +
  geom_bar(alpha = 0.8, aes(fill = Reviewer_Continent)) +
  scale_fill_brewer(palette = 'Blues') +
  ggtitle(label = "Number of Reviews by Reviewer's Continent") +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

# Average score per continent.
# NOTE(review): the dashed reference line is the mean of the per-NATIONALITY
# averages, the same line as in the nationality plot below - confirm that this
# is intended rather than mean(score_by_continent$mean_score).
avg_reviewer_score <- ggplot(score_by_continent,
                             aes(Reviewer_Continent, mean_score)) +
  geom_bar(stat = "identity", fill = 'lightblue1', color = 'lightblue3') +
  ylab("Average Score") +
  ggtitle(label = "Average score by reviwer's Continent") +
  geom_hline(yintercept = mean(score_by_nationality$mean_score),
             linetype = "dashed", color = "blue")

# Average score per nationality; x labels are hidden (one bar per nationality).
avg_reviewer_nationality <- ggplot(score_by_nationality,
                                   aes(Reviewer_Nationality, mean_score)) +
  geom_bar(stat = "identity", fill = 'lightblue1', color = 'lightblue3') +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  ylab("Average Score") +
  ggtitle(label = "Average score by reviwer's nationality") +
  geom_hline(yintercept = mean(score_by_nationality$mean_score),
             linetype = "dashed", color = "blue")

# Number of reviews per nationality, with the mean count as a dashed line.
number_reviewer_nationality <- ggplot(score_by_nationality,
                                      aes(Reviewer_Nationality, Count)) +
  geom_bar(stat = "identity", fill = 'lightblue1', color = 'lightblue3') +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  ylab("Count") +
  ggtitle(label = "The number of reviews by reviwer's nationality") +
  geom_hline(yintercept = mean(score_by_nationality$Count),
             linetype = "dashed", color = "blue")

number_reviewer_nationality
number_reviewer_continent
avg_reviewer_nationality
avg_reviewer_score
Does the reviewer’s nationality or continent affect the score? According to the first and second plots, this data is skewed toward particular reviewer nationalities and continents because the dataset covers hotels in Europe. The third plot shows the average reviewer score by reviewer nationality; however, it is hard to read because there are so many nationalities. Therefore, we can use the continent of the reviewer’s nationality instead.
# Travel type: number of reviews and mean reviewer score per type.
travel_type <- hotel %>%
  group_by(Travel_type) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())

# Bars ordered from the most to the least frequent travel type.
number_travel_type <- ggplot(
  travel_type,
  aes(x = reorder(Travel_type, -Frequency), y = Frequency, fill = Travel_type)
) +
  geom_bar(stat = "identity") +
  xlab("Travel type") +
  ylab("Frequency") +
  ggtitle("Number of Travel type")

# Bars ordered from the highest to the lowest mean score.
avg_travel_type <- ggplot(
  travel_type,
  aes(x = reorder(Travel_type, -average_score), y = average_score, fill = Travel_type)
) +
  geom_bar(stat = "identity") +
  xlab("Travel type") +
  ylab("Average Score") +
  ggtitle("Average Score by Travel type")

number_travel_type
avg_travel_type
Does Travel type affect the score? Most people’s travel purpose is leisure according to the first plot from this dataset. And leisure’s average score is a little higher than business purpose.
# Travel companions
# Shared theme that hides the x axis title, labels and ticks (the fill legend
# identifies the bars instead).
hidden_x_axis <- theme(axis.title.x = element_blank(),
                       axis.text.x = element_blank(),
                       axis.ticks.x = element_blank())

# Number of reviews per companion type.
Frequency_with <- ggplot(hotel, aes(x = With, fill = With)) +
  geom_bar(alpha = 0.8) +
  scale_fill_brewer(palette = 'Blues') +
  ggtitle("Frequency of with whom") +
  hidden_x_axis
Frequency_with

# Mean reviewer score (and review count) per companion type.
with_score <- hotel %>%
  group_by(With) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())
avg_with <- ggplot(with_score, aes(x = With, y = average_score, fill = With)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  scale_fill_brewer(palette = 'Blues') +
  ggtitle("Average Score by with whom") +
  hidden_x_axis
avg_with
Does the travel companion affect the score? According to the first plot, couples are the most frequent traveler type in this dataset and travelers with friends are the least frequent. However, there does not seem to be a significant difference in score by traveler type.
# Reviews written on a mobile device
# Shared theme that hides the x axis title, labels and ticks.
mobile_axis_theme <- theme(axis.title.x = element_blank(),
                           axis.text.x = element_blank(),
                           axis.ticks.x = element_blank())

# Number of reviews per value of the Mobile flag.
frequency_mobile <- ggplot(hotel, aes(x = Mobile)) +
  geom_bar(aes(fill = Mobile), alpha = 0.8) +
  scale_fill_brewer(palette = 'Blues') +
  ggtitle("Number of reivew by mobile device") +
  mobile_axis_theme
frequency_mobile

# Mean reviewer score (and review count) per value of the Mobile flag.
Mobile_score <- hotel %>%
  group_by(Mobile) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())
avg_mobile <- ggplot(Mobile_score, aes(x = Mobile)) +
  geom_bar(aes(fill = Mobile, y = average_score), stat = "identity", alpha = 0.8) +
  scale_fill_brewer(palette = 'Blues') +
  ggtitle("Average Score by mobile device") +
  mobile_axis_theme
avg_mobile
Does review by using mobile device affect the score? Most reviews are written by mobile device, but there is no significant difference in the score by using a mobile device.
# Length of stay
# Number of reviews per length of stay.
frequency_stayed_day <- ggplot(hotel, aes(x = Stay)) +
  geom_bar(alpha = 0.8, fill = 'lightblue1', color = 'lightblue3') +
  ggtitle("Frequency of stayed day")
frequency_stayed_day

# Review count and mean score for every (travel type, length of stay) pair,
# shown as an interactive heatmap with a custom hover label.
Stay_score <- hotel %>%
  group_by(Travel_type, Stay) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())
hovertext1 <- paste0(
  "<b>Day:</b>", Stay_score$Stay, "<br>",
  "<b>Travel Type:</b>", Stay_score$Travel_type, "<br>",
  "<b>Frequency:</b>", Stay_score$Frequency, "<br>"
)
stay_heatmap <- plot_ly(
  data = Stay_score, x = ~Stay, y = ~Travel_type, z = ~Frequency,
  type = "heatmap", hoverinfo = "text", text = hovertext1, colors = "Reds"
)
heatmap_travel_type <- layout(
  stay_heatmap,
  title = "Heatmap for travel type and stayed day ",
  xaxis = list(title = "Stayed Day"),
  yaxis = list(title = "Travel type")
)
heatmap_travel_type

# Mean reviewer score per length of stay.
Stay_score1 <- hotel %>%
  group_by(Stay) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())
avg_stayed_day <- ggplot(Stay_score1, aes(x = Stay, y = average_score)) +
  geom_bar(stat = "identity", alpha = 0.8, fill = 'lightblue1', color = 'lightblue3') +
  ggtitle("Average Score by stayed day") +
  ylab("Average Score") +
  xlab("Stayed_day")
avg_stayed_day
According to the first plot, most people stayed fewer than 5 days. The second plot shows the frequency of each length of stay by travel type; most leisure trips are short. The third plot shows that the average score decreases as the stay gets longer; however, for stays of over 20 days the average score increases.
This dataset has two types of reviews. One is a positive review, and the other is a negative review. We can analyze the sentiment of each review.
# Sentiment analysis for the positive and the negative review text
hotel_name <- unique(hotel$Hotel_Name)

# For a table of review tokens, count per review_ID how many tokens are
# "negative" / "positive" words in the bing lexicon, and add their difference
# (positive - negative) as `sentiment`.
count_review_sentiment <- function(tokens) {
  tokens %>%
    inner_join(get_sentiments("bing")) %>%
    count(review_ID, sentiment) %>%
    spread(sentiment, n, fill = 0) %>%
    mutate(sentiment = positive - negative)
}

# Put `prefix` in front of the columns named negative / positive / sentiment,
# so the scores of the two review texts can sit side by side.
prefix_sentiment_columns <- function(frame, prefix) {
  hits <- colnames(frame) %in% c("negative", "positive", "sentiment")
  colnames(frame)[hits] <- paste0(prefix, colnames(frame)[hits])
  frame
}

# Positive review text -> pr_* columns.
hotel_p_token <- hotel %>% unnest_tokens(word, Positive_Review)
ht_pos <- count_review_sentiment(hotel_p_token)
hotel_sent <- merge(hotel, ht_pos, by = "review_ID", all = TRUE)
hotel_sent <- prefix_sentiment_columns(hotel_sent, "pr_")

# Negative review text -> nr_* columns.
hotel_n_token <- hotel %>% unnest_tokens(word, Negative_Review)
ht_neg <- count_review_sentiment(hotel_n_token)
hotel_sent <- merge(hotel_sent, ht_neg, by = "review_ID", all = TRUE)
hotel_sent <- prefix_sentiment_columns(hotel_sent, "nr_")

# Reviews without any lexicon word have no row in ht_pos / ht_neg and come out
# of the outer merge as NA; treat them as a score of 0.
for (score_column in c("pr_negative", "pr_positive", "pr_sentiment",
                       "nr_negative", "nr_positive", "nr_sentiment")) {
  missing_score <- is.na(hotel_sent[[score_column]])
  hotel_sent[[score_column]][missing_score] <- 0
}
hotel_sent$Total_sentiment <- hotel_sent$pr_sentiment + hotel_sent$nr_sentiment
# Sentiment per hotel: mean reviewer score and mean sentiment values for each
# combination of hotel name and hotel-level figures.
sentiment_byhotel <- hotel_sent %>%
  group_by(Hotel_Name, Average_Score, Additional_Number_of_Scoring,
           Total_Number_of_Reviews) %>%
  summarise(Reviwer_average = mean(Reviewer_Score),
            pr_sentiment = mean(pr_sentiment),
            nr_sentiment = mean(nr_sentiment),
            Total_sentiment = mean(Total_sentiment))

# One bar per hotel, so the x axis title, labels and ticks are hidden.
hotel_axis_theme <- theme(axis.title.x = element_blank(),
                          axis.text.x = element_blank(),
                          axis.ticks.x = element_blank())

plot_prsentiment_byhotel <- ggplot(sentiment_byhotel, aes(Hotel_Name, pr_sentiment)) +
  geom_col(show.legend = FALSE, fill = 'lightblue1', color = 'lightblue3') +
  hotel_axis_theme +
  ggtitle("Sentiment of positive review ")
plot_nrsentiment_byhotel <- ggplot(sentiment_byhotel, aes(Hotel_Name, nr_sentiment)) +
  geom_col(show.legend = FALSE, fill = 'lightblue1', color = 'lightblue3') +
  hotel_axis_theme +
  ggtitle("Sentiment of negative review ")
plot_sentiment_byhotel <- ggplot(sentiment_byhotel, aes(Hotel_Name, Total_sentiment)) +
  geom_col(show.legend = FALSE, fill = 'lightblue1', color = 'lightblue3') +
  hotel_axis_theme +
  ggtitle("Sum of Sentiment")

# Mean sentiment per hotel for the positive text, the negative text and their
# sum.
plot_prsentiment_byhotel
plot_nrsentiment_byhotel
plot_sentiment_byhotel
The first plot shows each hotel’s mean sentiment for positive reviews. Every hotel’s sentiment is above zero, which means positive reviews tend to be written with positive words. On the other hand, negative reviews show a mix of negative and positive sentiment. This shows that people often use negative words when they write negative reviews. However, word-level sentiment analysis cannot capture negative meaning well, because a negative phrase can be composed of a positive word (for example, “not suitable”). Finally, each hotel’s total sentiment is positive. This can be interpreted as people using positive words more often when they write their reviews.
# Per-review sentiment for a few example hotels, one facet per hotel.
# The hotels are selected with %in%. Comparing Hotel_Name with == against a
# vector of several names recycles that vector along the rows (row 1 against
# the first name, row 2 against the second, ...), which silently drops most of
# the reviews of the chosen hotels.
# NOTE(review): the first plot selects hotel_name[5:6] while the other two
# select hotel_name[5:10]; the ranges are kept as they were - confirm whether
# all three were meant to show the same hotels.
review_axis_theme <- theme(axis.title.x = element_blank(),
                           axis.text.x = element_blank(),
                           axis.ticks.x = element_blank())

plot_prsent_eachhotel <- ggplot(
  hotel_sent %>% filter(Hotel_Name %in% hotel_name[5:6]),
  aes(review_ID, pr_sentiment, fill = Hotel_Name)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Hotel_Name, ncol = 2, scales = "free_x") +
  review_axis_theme

plot_nrsent_eachhotel <- ggplot(
  hotel_sent %>% filter(Hotel_Name %in% hotel_name[5:10]),
  aes(review_ID, nr_sentiment, fill = Hotel_Name)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Hotel_Name, ncol = 2, scales = "free_x") +
  review_axis_theme

plot_sent_eachhotel <- ggplot(
  hotel_sent %>% filter(Hotel_Name %in% hotel_name[5:10]),
  aes(review_ID, Total_sentiment, fill = Hotel_Name)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Hotel_Name, ncol = 2, scales = "free_x") +
  review_axis_theme

plot_prsent_eachhotel
plot_nrsent_eachhotel
plot_sent_eachhotel
After extracting the words from both the positive and the negative text of each review, a sentiment variable can be computed per review. We can then compare the review sentiment of each hotel. For example, the plots show the review sentiments of six hotels.
# Correlation between sentiment and the review-level variables:
# columns 9, 12, 13, 14 and 31 of `hotel_sent`.
cor <- hotel_sent[, c(9, 12, 13, 14, 31)]
review_sentiment_cor <- cor(cor)
corrplot(review_sentiment_cor, method = "color", tl.srt = 20, tl.cex = 0.8,
         tl.col = "black", addCoef.col = TRUE)

# Correlation between sentiment and the other variables per hotel
# (every column of `sentiment_byhotel` except the first).
hotel_sentiment_cor <- cor(sentiment_byhotel[, -1])
corrplot(hotel_sentiment_cor, method = "color", tl.srt = 20, tl.cex = 0.8,
         tl.col = "black", addCoef.col = TRUE)
So what is the relationship between the score and the sentiment of the reviews? The first plot shows the correlations among the review-level variables. As we can see, the total sentiment (Total_sentiment) is positively correlated with the word counts and with the reviewer’s score. The second plot shows the correlations among the hotel-level variables. There we can see more clearly that score and sentiment are highly positively correlated.
# Word frequency in the positive reviews
# Occurrences of every bing-lexicon word, with its sentiment class.
hotel_p_token_count <- hotel_p_token %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# Word cloud of the 100 most frequent non-stop-words.
positive_top_words <- hotel_p_token %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  top_n(100)
wordc_postive <- wordcloud2(positive_top_words, minRotation = 0, maxRotation = 0,
                            shape = 'circle', backgroundColor = 'black')
wordc_postive

# The ten most frequent lexicon words of each sentiment class.
positive_top_by_sentiment <- hotel_p_token_count %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
ggplot(positive_top_by_sentiment, aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

# Comparison cloud: negative vs positive lexicon words in the positive reviews.
positive_word_matrix <- acast(hotel_p_token_count, word ~ sentiment,
                              value.var = "n", fill = 0)
comparison.cloud(positive_word_matrix, colors = c("red2", "blue2"),
                 max.words = 80)
So what are the most used words in positive reviews? The word cloud shows this clearly: location and staff are the essential words of a positive review. The second plot shows the words that contribute most to the sentiment of positive reviews. As we can see, positive words are overwhelmingly dominant. This can also be seen in the comparison word cloud.
# Word frequency in the negative reviews
# Occurrences of every bing-lexicon word, with its sentiment class.
hotel_n_token_count <- hotel_n_token %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# The 100 most frequent non-stop-words, used for both word clouds below.
negative_top_words <- hotel_n_token %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  top_n(100)
# The interactive cloud is built but not shown; the static one is drawn.
wordc_negative <- wordcloud2(negative_top_words, minRotation = 0, maxRotation = 0,
                             shape = 'circle', backgroundColor = 'black')
# wordc_negative
with(negative_top_words, wordcloud(word, n, max.words = 100))

# The ten most frequent lexicon words of each sentiment class.
negative_top_by_sentiment <- hotel_n_token_count %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
ggplot(negative_top_by_sentiment, aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

# Comparison cloud: negative vs positive lexicon words in the negative reviews.
negative_word_matrix <- acast(hotel_n_token_count, word ~ sentiment,
                              value.var = "n", fill = 0)
comparison.cloud(negative_word_matrix, colors = c("red2", "blue2"),
                 max.words = 80)
On the other hand, the negative reviews contain a wider variety of words, and a similar proportion of negative and positive words.
It is hard to understand the exact meaning from single words alone, so we extend the analysis to pairs of consecutive words, called bigrams.
# Bigrams in the positive reviews
hotel_p_bigram <- hotel %>%
  unnest_tokens(bigram, Positive_Review, token = "ngrams", n = 2)
hotel_p_bigram_sep <- hotel_p_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only the bigrams in which neither word is a stop word.
p_bigrams_filtered <- hotel_p_bigram_sep %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
p_bigram_counts <- p_bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
p_bigram_counts <- p_bigram_counts %>% drop_na()
total_bigram_counts <- p_bigram_counts %>% unite(bigram, word1, word2, sep = " ")

# Word clouds of the 100 most frequent bigrams; the interactive one is built
# but not shown, the static one is drawn.
w_bigram_pos <- wordcloud2(total_bigram_counts %>% top_n(100, wt = n),
                           minRotation = 0, maxRotation = 0,
                           shape = 'circle', backgroundColor = 'black')
# w_bigram_pos
total_bigram_counts %>%
  top_n(100, wt = n) %>%
  with(wordcloud(bigram, n, max.words = 100))

# Top 10 most frequent bigrams per hotel for the positive reviews
p_bigrams_filtered <- p_bigrams_filtered %>% drop_na()
p_bigrams_united <- p_bigrams_filtered %>% unite(bigram, word1, word2, sep = " ")
p_bigram_tf_idf <- p_bigrams_united %>%
  count(Hotel_Name, bigram) %>%
  bind_tf_idf(bigram, Hotel_Name, n) %>%
  arrange(desc(n))
# top_n() ranks by the LAST column when `wt` is omitted, and here that is the
# factor `word` added by mutate() rather than a frequency. The weight is now
# given explicitly so the selection really is the most frequent bigrams.
# NOTE(review): the plot below shows tf_idf; use wt = tf_idf instead if the
# most distinctive, rather than the most frequent, bigrams were intended.
hotel_by_bigram <- p_bigram_tf_idf %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(Hotel_Name) %>%
  top_n(10, wt = n) %>%
  ungroup()

# Example: the selected bigrams of the third hotel in `hotel_name`.
frequency_eachhotel <- hotel_by_bigram %>%
  filter(Hotel_Name == hotel_name[3]) %>%
  ggplot(aes(x = reorder(word, tf_idf), y = tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip() +
  ggtitle(label = hotel_name[3])
frequency_eachhotel
The first word cloud shows the frequent bigrams. It shows more clearly that helpful staff and an excellent location are the important things in positive reviews. The second plot shows the key bigrams of an individual hotel (for example, Apollo Hotel).
# Bigrams in the negative reviews
hotel_n_bigram <- hotel %>%
  unnest_tokens(bigram, Negative_Review, token = "ngrams", n = 2)
hotel_n_bigram_sep <- hotel_n_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only the bigrams in which neither word is a stop word.
n_bigrams_filtered <- hotel_n_bigram_sep %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
n_bigram_counts <- n_bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
n_bigram_counts <- n_bigram_counts %>% drop_na()
n_total_bigram_counts <- n_bigram_counts %>% unite(bigram, word1, word2, sep = " ")

# Word clouds of the 100 most frequent bigrams; the interactive one is built
# but not shown, the static one is drawn. The weight column is named
# explicitly instead of relying on top_n()'s "last column" default.
w_bigram_negative <- wordcloud2(n_total_bigram_counts %>% top_n(100, wt = n),
                                minRotation = 0, maxRotation = 0,
                                shape = 'circle', backgroundColor = 'black')
# w_bigram_negative
n_total_bigram_counts %>%
  top_n(100, wt = n) %>%
  with(wordcloud(bigram, n, max.words = 100))

# Top 10 most frequent bigrams per hotel for the negative reviews
n_bigrams_filtered <- n_bigrams_filtered %>% drop_na()
n_bigrams_united <- n_bigrams_filtered %>% unite(bigram, word1, word2, sep = " ")
n_bigram_tf_idf <- n_bigrams_united %>%
  count(Hotel_Name, bigram) %>%
  bind_tf_idf(bigram, Hotel_Name, n) %>%
  arrange(desc(n))
# top_n() ranks by the LAST column when `wt` is omitted, and here that is the
# factor `word` added by mutate() rather than a frequency. The weight is now
# given explicitly so the selection really is the most frequent bigrams.
# NOTE(review): the plot below shows tf_idf; use wt = tf_idf instead if the
# most distinctive, rather than the most frequent, bigrams were intended.
n_hotel_by_bigram <- n_bigram_tf_idf %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(Hotel_Name) %>%
  top_n(10, wt = n) %>%
  ungroup()

# Example: the selected bigrams of the third hotel in `hotel_name`.
n_frequency_eachhotel <- n_hotel_by_bigram %>%
  filter(Hotel_Name == hotel_name[3]) %>%
  ggplot(aes(x = reorder(word, tf_idf), y = tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip() +
  ggtitle(label = hotel_name[3])
n_frequency_eachhotel
We can also see bigram for negative reviews. The negative review seems to have many issues about the facility.
# To avoid misreading the negative reviews, look at the words that directly
# follow a negation term.
negation_words <- c("not", "no", "never", "without")
# AFINN gives each word a numeric sentiment score. Depending on the installed
# lexicon version that column is called `score` or `value`; normalise it to
# `score`, which the rest of this section expects.
afinn <- get_sentiments('afinn')
if (!("score" %in% names(afinn)) && "value" %in% names(afinn)) {
  names(afinn)[names(afinn) == "value"] <- "score"
}
# Bigrams whose first word is a negation and whose second word is in AFINN,
# counted per (negation, word, score); columns: word1, word2, score, n.
negated_words <- hotel_n_bigram_sep %>%
  filter(word1 %in% negation_words) %>%
  inner_join(afinn, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE)

# Word clouds of the 100 most frequent negated bigrams (score column dropped);
# the interactive one is built but not shown, the static one is drawn.
not_word <- wordcloud2(
  negated_words[, -3] %>%
    unite(bigram, word1, word2, sep = " ") %>%
    top_n(100, wt = n),
  minRotation = 0, maxRotation = 0, shape = 'circle', backgroundColor = 'black'
)
# not_word
negated_words[, -3] %>%
  unite(bigram, word1, word2, sep = " ") %>%
  top_n(100, wt = n) %>%
  with(wordcloud(bigram, n, max.words = 100))

# For each negation term, the 20 following words with the largest absolute
# contribution (count * score), coloured by the sign of that contribution.
comapre_not <- negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  group_by(word1) %>%
  top_n(20, abs(contribution)) %>%
  ggplot(aes(x = reorder(word2, n * score), y = n * score, fill = n * score > 0),
         height = 1000) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by negative express") +
  ylab("Sentiment score * number of occurrences") +
  facet_wrap(~word1, ncol = 2, scales = "free_y") +
  coord_flip() +
  ggtitle(label = "Compare sentiment score include negative term (Not,No,Never,Without)")
comapre_not
To prevent misunderstanding of the negative reviews, we need to analyze words combined with a negation term such as “not” or “no”. This is because a negated phrase can otherwise be misread as having a positive meaning.
# Network visualisation for the positive reviews
# Graph of the bigrams that occur more than 1000 times (word1 -> word2).
frequent_positive_bigrams <- filter(p_bigram_counts, n > 1000)
p_bigram_graph <- graph_from_data_frame(frequent_positive_bigrams)
set.seed(777)
grid <- grid::arrow(type = "closed", length = unit(.15, "inches"))
# Edge opacity and width both follow the bigram count.
pos_network <- ggraph(p_bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void() +
  ggtitle("Network visualization of Positive Review")
# Bigram relationships in the positive reviews
pos_network
The network shows the relationships between words more clearly. In positive reviews, staff and location are the most important keywords.
#Visualization network for negative review
# Graph of the negative-review bigrams that occur more than 1000 times.
n_bigram_graph <- n_bigram_counts %>%
filter(n > 1000) %>%
graph_from_data_frame()
# Graph of the negated bigrams (third column of negated_words dropped) that
# occur more than 100 times.
n_bigram_graph_n <- negated_words[,-3] %>%
filter(n > 100) %>%
graph_from_data_frame()
# Seed set once before both layouts are computed, presumably so the "fr"
# layouts are reproducible - keep the two ggraph() calls in this order.
set.seed(777)
# Arrow head specification used for the edges of the second network.
grid <- grid::arrow(type = "closed", length = unit(.10, "inches"))
# Network of frequent negative-review bigrams; edge opacity and width follow
# the bigram count.
network_negative<-ggraph(n_bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "darkred") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void() + ggtitle(label="Network visualization of Negative Review")
# Network of negated bigrams; edges are drawn with the arrow defined above and
# end slightly before the target node (end_cap).
not_relationship<-ggraph(n_bigram_graph_n, layout = "fr") +
geom_edge_link(aes(edge_alpha = n),
arrow = grid, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void() + ggtitle(label="Network visualization including negative term")
#We can check bigram relationship in negative reviews
network_negative
#We can also check negative term (not,no,without,never) relationship in negative reviews
not_relationship
On the other hand, negative reviews focus more on the facilities. In the second plot, we can clearly see the connections between “not”, “no”, and other words.
For using the network visualization, we can see which continents traveled to which countries frequently.
# Build model to predict the score
# New variables from the bigram analysis: two 0/1 flags telling whether a
# review contains one of the ten most frequent negated bigrams (Not) or one of
# the ten most frequent positive bigrams (Postive_bigram).
# Both source tables are sorted by descending count, so the first ten rows are
# the ten most frequent bigrams. They are joined with "|" into a single regex
# alternation (this replaces an rbind()/apply() detour that only worked when
# the table was a tibble).
negated_bigrams <- negated_words[, -3] %>% unite(bigram, word1, word2, sep = " ")
not <- paste0(head(negated_bigrams$bigram, 10), collapse = "|")
postive_bigram <- paste0(head(total_bigram_counts$bigram, 10), collapse = "|")
# The bigrams come from unnest_tokens(), which lower-cases text by default,
# while the review columns hold the original text. Matching therefore ignores
# case, so that e.g. a sentence-initial "Not ..." is found as well.
hotel_sent$Not <- ifelse(
  grepl(not, hotel_sent$Negative_Review, ignore.case = TRUE), 1, 0
)
hotel_sent$Postive_bigram <- ifelse(
  grepl(postive_bigram, hotel_sent$Positive_Review, ignore.case = TRUE), 1, 0
)
hotel_for_modeling<-hotel_sent[,c(1,9,12,13,14,19,20,21,22,23,24,27,30,31,32,33)]
hotel_for_modeling$Mobile <- as.numeric(hotel_for_modeling$Mobile)
# Model for regression ----
# Drop the first column of hotel_for_modeling before modeling.
hotel_for_modeling1 <- hotel_for_modeling[, -1]
#hotel_for_modeling$Level<-as.factor(hotel_for_modeling$Level)

# One-hot encode the categorical features for xgboost.
ohe_feats <- c("Travel_type", "With", "Reviewer_Continent", "Country")
dummies <- dummyVars(
  ~ Travel_type + With + Reviewer_Continent + Country,
  data = hotel_for_modeling1
)
df_all_ohe <- as.data.frame(predict(dummies, newdata = hotel_for_modeling1))
# Replace the raw categorical columns with their one-hot columns. A logical
# mask is used because `-which(...)` would drop every column on no match.
df_all_combined <- cbind(
  hotel_for_modeling1[, !(colnames(hotel_for_modeling1) %in% ohe_feats)],
  df_all_ohe
)

# Divide train and test set (70 / 30).
# sample.split() expects the label vector and returns one logical per
# element. Passing the whole data frame yields one logical per COLUMN, which
# subset() then recycles over the rows instead of sampling rows at random.
# The RMSE values recorded in this document were produced with the earlier
# column-wise split and will differ when re-run.
set.seed(777)
sample <- sample.split(df_all_combined$Reviewer_Score, SplitRatio = 0.7)
x_train <- subset(df_all_combined, sample == TRUE)
x_test <- subset(df_all_combined, sample == FALSE)

# Separate the target from the feature columns.
train_label <- x_train$Reviewer_Score
test_label <- x_test$Reviewer_Score
x_train$Reviewer_Score <- NULL
x_test$Reviewer_Score <- NULL

# NOTE(review): newer xgboost releases deprecate "reg:linear" in favour of
# "reg:squarederror"; left unchanged here -- confirm against the installed
# version.
bst <- xgboost(
  data = data.matrix(x_train),
  label = data.matrix(train_label),
  max.depth = 15,
  eta = 0.1,
  nthread = 4,
  nrounds = 50,
  colsample_bytree = 0.8,
  subsample = 0.5,
  objective = "reg:linear"
)
## [1] train-rmse:7.279425
## [2] train-rmse:6.574137
## [3] train-rmse:5.941869
## [4] train-rmse:5.374878
## [5] train-rmse:4.867425
## [6] train-rmse:4.414244
## [7] train-rmse:4.010385
## [8] train-rmse:3.650982
## [9] train-rmse:3.328507
## [10] train-rmse:3.041918
## [11] train-rmse:2.789557
## [12] train-rmse:2.566329
## [13] train-rmse:2.366747
## [14] train-rmse:2.193556
## [15] train-rmse:2.039579
## [16] train-rmse:1.907296
## [17] train-rmse:1.788616
## [18] train-rmse:1.685695
## [19] train-rmse:1.596526
## [20] train-rmse:1.518879
## [21] train-rmse:1.451645
## [22] train-rmse:1.394337
## [23] train-rmse:1.344820
## [24] train-rmse:1.302403
## [25] train-rmse:1.266152
## [26] train-rmse:1.236283
## [27] train-rmse:1.210139
## [28] train-rmse:1.189812
## [29] train-rmse:1.171041
## [30] train-rmse:1.155197
## [31] train-rmse:1.140501
## [32] train-rmse:1.128618
## [33] train-rmse:1.119657
## [34] train-rmse:1.110706
## [35] train-rmse:1.101752
## [36] train-rmse:1.094239
## [37] train-rmse:1.089360
## [38] train-rmse:1.084851
## [39] train-rmse:1.081107
## [40] train-rmse:1.075834
## [41] train-rmse:1.070911
## [42] train-rmse:1.067415
## [43] train-rmse:1.062021
## [44] train-rmse:1.057327
## [45] train-rmse:1.053389
## [46] train-rmse:1.051391
## [47] train-rmse:1.048927
## [48] train-rmse:1.045583
## [49] train-rmse:1.043891
## [50] train-rmse:1.040725
# Test RMSE ----
# Predict on the held-out rows and on the training rows.
y_pred <- predict(bst, newdata = data.matrix(x_test))
y_pred_train <- predict(bst, newdata = data.matrix(x_train))
#cbind(x_train,train_label,y_pred_train)
# Root mean squared error on the test set (printed at top level).
sqrt(mean((test_label - y_pred)^2))
## [1] 1.24545
# Compute and plot feature importance for the fitted regression booster.
importance <- xgb.importance(
  model = bst,
  feature_names = colnames(x_train)
)
#head(importance,10)
xgb.plot.importance(importance)
From the preceding analysis, we can create new variables from the positive and negative bigrams and the sentiment scores, use them to find the variables that matter most for the reviewer’s score, and predict the reviewer’s rating. We use XGBoost to build the regression model. After fitting the model, the train RMSE is 1.04 and the test RMSE is 1.24. As the graph shows, the top 5 variables are the negative review’s count, positive sentiment, the sum of sentiment, the positive review’s count, and the reviewer’s previous number of reviews. These factors appear to be meaningful when people give scores.
# Classification model ----
# From the distribution of Reviewer's Score: good hotel (1) when the score is
# in [9, 10], bad hotel (0) when it is below 9. The label is stored as numeric
# 0/1 rather than as the strings "0"/"1"; a score outside both ranges (or a
# missing score) becomes NA instead of "".
hotel_for_modeling$Level <- with(
  hotel_for_modeling,
  ifelse(Reviewer_Score < 9, 0, ifelse(Reviewer_Score <= 10, 1, NA))
)
# Drop columns 1 and 5 by position.
# NOTE(review): column 5 is presumably Reviewer_Score, removed so the target
# does not leak into the features -- confirm against hotel_for_modeling.
hotel_for_modeling2 <- hotel_for_modeling[, c(-1, -5)]
#hotel_for_modeling$Level<-as.factor(hotel_for_modeling$Level)

# One-hot encode the categorical features for xgboost.
ohe_feats <- c("Travel_type", "With", "Reviewer_Continent", "Country")
dummies1 <- dummyVars(
  ~ Travel_type + With + Reviewer_Continent + Country,
  data = hotel_for_modeling2
)
df_all_ohe1 <- as.data.frame(predict(dummies1, newdata = hotel_for_modeling2))
# Bind the encoding computed in THIS section (df_all_ohe1); the previous code
# bound df_all_ohe, the object left over from the regression section.
df_all_combined1 <- cbind(
  hotel_for_modeling2[, !(colnames(hotel_for_modeling2) %in% ohe_feats)],
  df_all_ohe1
)

# Divide train and test set (70 / 30).
# sample.split() is given the label vector so it returns one logical per row;
# passing the data frame returns one logical per column, which subset() then
# recycles over the rows. The error log and accuracy recorded in this document
# were produced with the earlier column-wise split and will differ when re-run.
set.seed(777)
sample <- sample.split(df_all_combined1$Level, SplitRatio = 0.7)
x_train1 <- subset(df_all_combined1, sample == TRUE)
x_test1 <- subset(df_all_combined1, sample == FALSE)

# Separate the target from the feature columns.
train_label1 <- x_train1$Level
test_label1 <- x_test1$Level
x_train1$Level <- NULL
x_test1$Level <- NULL

bst_class <- xgboost(
  data = data.matrix(x_train1),
  label = data.matrix(train_label1),
  max.depth = 15,
  eta = 0.1,
  nthread = 4,
  nrounds = 25,
  colsample_bytree = 0.8,
  subsample = 0.5,
  objective = "binary:logistic"
)
## [1] train-error:0.258928
## [2] train-error:0.247152
## [3] train-error:0.241068
## [4] train-error:0.238379
## [5] train-error:0.236505
## [6] train-error:0.235448
## [7] train-error:0.234035
## [8] train-error:0.233003
## [9] train-error:0.232195
## [10] train-error:0.230993
## [11] train-error:0.229850
## [12] train-error:0.229384
## [13] train-error:0.228153
## [14] train-error:0.227092
## [15] train-error:0.226225
## [16] train-error:0.225585
## [17] train-error:0.224101
## [18] train-error:0.223285
## [19] train-error:0.222236
## [20] train-error:0.221145
## [21] train-error:0.219971
## [22] train-error:0.218998
## [23] train-error:0.218455
## [24] train-error:0.217631
## [25] train-error:0.216602
# Test Accuracy ----
# Predicted probabilities on the held-out rows, thresholded at 0.5.
y_pred1 <- predict(bst_class, data.matrix(x_test1))
y_pred1 <- ifelse(y_pred1 > 0.5, 1, 0)
df <- data.frame(test = test_label1, Pred = y_pred1)
# confusionMatrix() reads a table as predictions in rows and reference in
# columns. The previous call passed the reference first, which leaves accuracy
# unchanged but swaps sensitivity/specificity with the predictive values.
# The statistics recorded in this document come from that transposed table.
confusionMatrix(table(Prediction = df$Pred, Reference = df$test))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 61828 21280
## 1 21874 54901
##
## Accuracy : 0.7301
## 95% CI : (0.7279, 0.7323)
## No Information Rate : 0.5235
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4592
## Mcnemar's Test P-Value : 0.004309
##
## Sensitivity : 0.7387
## Specificity : 0.7207
## Pos Pred Value : 0.7439
## Neg Pred Value : 0.7151
## Prevalence : 0.5235
## Detection Rate : 0.3867
## Detection Prevalence : 0.5198
## Balanced Accuracy : 0.7297
##
## 'Positive' Class : 0
##
# Compute and plot feature importance for the fitted classification booster.
importance1 <- xgb.importance(
  model = bst_class,
  feature_names = colnames(x_train1)
)
#head(importance,10)
xgb.plot.importance(importance1)
We can also build a classification model. Based on the distribution of the reviewer’s score, if a review’s rating is 9 or higher, we call it a good hotel; otherwise, we call it a not-good hotel. This makes it a binary classification problem. After applying XGBoost, the test accuracy is 73%. The important variables are similar to those of the regression model as well.
From the hotel review data, we can get a lot of information. From the exploratory data analysis, we learn that people tend to give generous scores when they write reviews. Also, the length of stay seems to affect the reviewer’s rating, and the average score differs slightly depending on which continent the travelers are from. Through the most frequent words in each review, we can identify what matters most in positive and negative situations for a hotel. We can also see that people tend to use negative words when writing positive reviews, but tend to use positive words when writing negative reviews. By analyzing bigrams for a more accurate semantic analysis, we could understand each word’s tendency toward the negative or positive side of the hotel, and thus confirm the characteristics of each hotel. Furthermore, the relationship between two words can be analyzed through network visualization. Based on this information, a prediction model can be built. Through this model, we can confirm the significant factors for the reviewer’s score, such as the negative review’s count, positive and negative sentiment, the sum of sentiment, the positive review’s count, the reviewer’s previous number of reviews, the length of stay, and so on.
The following link provides a Shiny app for comparing each hotel’s information from the analysis: https://dw8757.shinyapps.io/hotel_comparing_system/