#import necessary library
library(tidyverse)
library(leaflet) # For mapping
library(wordcloud2) #For word cloud visualize
library(corrplot) # correlation chart
library(tm) # For text mining
library(SnowballC) #For text mining
library(wordcloud) # For word cloud visualize - interactive
library(caret) 
library(caTools) # For sample split function
library(tidytext)
library(htmlwidgets)
library(splitstackshape)
library(devtools)
library(reshape2)
library(countrycode)
library(plotly)
library(igraph)
library(threejs)
library(network)
library(sna)
library(ggraph)
library(widyr)
library(Matrix)
library(xgboost)
library(dummies)

1 Data Preprocessing

This dataset contains about 500,000 reviews of 1493 hotels in 6 countries. It has 17 variables covering the positive and negative reviews, their word counts, the average score, the reviewer's score, and so on. Some useful information is embedded in the tag and address variables. Therefore, we can derive new variables from these columns with text mining before starting the analysis.

# Load the raw reviews and the country/continent lookup table.
hotel <- read.csv("Hotel_Reviews.csv")
continent <- read.csv("continent.csv")
# summary(hotel)
# str(hotel)

# Free-text columns are coerced to plain character vectors in one pass.
text_columns <- c("Positive_Review", "Negative_Review", "Tags", "Hotel_Name")
hotel[text_columns] <- lapply(hotel[text_columns], as.character)

# Sequential review identifier (1..nrow), stored as a factor.
hotel$review_ID <- as.factor(seq.int(nrow(hotel)))

# Replace the literal "No Positive" / "No Negative" markers with empty strings.
hotel[hotel$Positive_Review == "No Positive", "Positive_Review"] <- ""
hotel[hotel$Negative_Review == "No Negative", "Negative_Review"] <- ""

# A single-space nationality is relabelled as "Unknown".
hotel$Reviewer_Nationality <- as.character(hotel$Reviewer_Nationality)
# sum(is.na(hotel$Reviewer_Nationality))
# sum(hotel$Reviewer_Nationality == " ")
hotel[hotel$Reviewer_Nationality == " ", "Reviewer_Nationality"] <- "Unknown"

reviewernationality <- unique(hotel$Reviewer_Nationality)

# Data cleaning: explore missing values
# for (i in 1:17) {
#   print(sum(is.na(hotel[, i])))
# }

# Rows where either of columns 16:17 is missing (presumably lat/lng, going by
# the column names used below -- TODO confirm the column order of the CSV).
# is.na() on a two-column data frame returns a matrix, so which() on it yields
# linear matrix indices, not row numbers; rowSums() gives a proper per-row mask.
null <- hotel[rowSums(is.na(hotel[, 16:17])) > 0, ]
nulllocation <- null %>% filter(!duplicated(Hotel_Address))

# Making the country variable
hotel$Hotel_Address <- as.character(hotel$Hotel_Address)

# One row per distinct (lat, lng, address) combination.
location <- hotel %>%
  filter(!duplicated(hotel[, c("lat", "lng", "Hotel_Address")]))

# The country is the last space-separated word of the address. The addresses
# are split once (not once per row) and the result is built without growing a
# vector inside a loop.
address_words <- strsplit(location$Hotel_Address, " ")
country <- vapply(address_words, function(words) tail(words, 1), character(1))

location$Country <- country
# "... United Kingdom" ends in "Kingdom", so restore the full name.
location$Country[location$Country == "Kingdom"] <- "United Kingdom"
location$Country <- as.factor(location$Country)
# Making new variables from the tag information

# Travel type: a "Business" tag takes precedence over "Leisure"; reviews with
# neither tag are labelled "No type".
has_leisure <- grepl("Leisure", hotel$Tags)
has_business <- grepl("Business", hotel$Tags)
hotel$Travel_type <- ifelse(
  has_business, "Business",
  ifelse(has_leisure, "Leisure", "No type")
)

# With whom: each matching tag pattern contributes its code (1..6) and the
# codes are summed per review. A sum of exactly 1..6 is translated to a label;
# any other sum (0, or a combination of several tags) is kept as its number
# rendered as text.
companion_patterns <- c("Solo", "Couple", "Group", "young", "older", "friend")
companion_labels <- c(
  "Solo", "Couple", "Group",
  "Family w young", "Family w older", "W friends"
)
companion_code <- rep(0, nrow(hotel))
for (code in seq_along(companion_patterns)) {
  companion_code <- companion_code +
    code * grepl(companion_patterns[code], hotel$Tags)
}
hotel$With <- as.character(companion_code)
single_tag <- companion_code >= 1 & companion_code <= 6
hotel$With[single_tag] <- companion_labels[companion_code[single_tag]]

# How many nights stayed: the text between "Stayed" and "night" in the tags.
hotel$Stay <- as.numeric(str_extract(hotel$Tags, "(?<=Stayed).*(?=night)"))
## summary(hotel$stay)

# Review submitted from a mobile device (1) or not (0), as a factor.
hotel$Mobile <- as.factor(ifelse(grepl("obile", hotel$Tags), 1, 0))
# Convert the reviewer's country to a continent.
# Nationality name -> ISO3 code, which is the join key into `continent`.
hotel$Reviewer_country <- countrycode(
  hotel$Reviewer_Nationality, "country.name", "iso3c"
)
# sum(is.na(hotel$Reviewer_country))

# Keep columns 3 and 6 of the lookup table; the merge below joins on
# `alpha.3`, and the other kept column is renamed to Reviewer_Continent.
continent <- continent[, c(3, 6)]
hotel <- merge(
  hotel, continent,
  by.x = "Reviewer_country", by.y = "alpha.3", all.x = TRUE
)
# merge() puts the key column first and the lookup column last: drop the key
# and rename the last column rather than relying on a hard-coded position.
hotel <- hotel[, -1]
colnames(hotel)[ncol(hotel)] <- "Reviewer_Continent"

# Remove wrong and incomplete information.
hotel <- drop_na(hotel)
# Logical subsetting instead of hotel[-which(...), ]: when no row has an empty
# continent, which() returns integer(0) and the negative index would drop
# every row of the data set.
hotel <- hotel[hotel$Reviewer_Continent != "", ]
# sum(is.na(hotel$Reviewer_Continent))
# sum(hotel$With == 0)

# str(location$Country)
# summary(location$Country)

# Look up each review's country through its hotel address in one vectorised
# match() instead of scanning all reviews once per location.
hotel$Country <- as.character(location$Country)[
  match(hotel$Hotel_Address, location$Hotel_Address)
]
# Addresses missing from `location` keep the empty-string default.
hotel$Country[is.na(hotel$Country)] <- ""

write.csv(hotel, file = "hotel_clean.csv", row.names = FALSE)
#str(hotel$Country)
#hotel$Country <- as.factor(hotel$Country)
#summary(hotel$Country)

2 Explore Data Analysis

2.1 Location

# Exploratory data analysis: location
# HTML popup text for each marker: hotel name, address and average score.
hotel_popup <- paste0(
  "<strong>Hotel Name:</strong>", "<br>", location$Hotel_Name, "<br>",
  "<strong>Address:</strong>", "<br>", location$Hotel_Address, "<br>",
  "<strong>Average Socre:</strong>", "<br>", location$Average_Score
)

# Clustered markers on a light base map, one marker per row of `location`.
map <- leaflet(location) %>%
  addProviderTiles(providers$Stamen.TonerLite) %>%
  addMarkers(popup = hotel_popup, clusterOptions = markerClusterOptions())

map

2.2 Number of hotels and reviews

# Number of hotels per country (counts rows of `location`).
number_by_country <- ggplot(location, aes(x = Country, fill = Country)) +
  geom_bar(alpha = 0.8) +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Number of Hotels by Country")

number_by_country

# Number of reviews per country (counts rows of `hotel`).
number_review_country <- ggplot(hotel, aes(x = Country, fill = Country)) +
  geom_bar(alpha = 0.8) +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Number of Reviews by Country")

number_review_country

In this dataset, there are far more hotels in the United Kingdom than in the other countries. Thus the number of reviews shows a similar distribution.

2.3 Distribution of score

# Distribution of the average review score, one value per
# (Hotel_Name, Average_Score) pair.
dist_average_score <- hotel %>%
  filter(!duplicated(hotel[, c("Hotel_Name", "Average_Score")])) %>%
  ggplot(aes(x = Average_Score)) +
  geom_histogram(fill = "deepskyblue", color = "deepskyblue4") +
  xlab("Average Review Score") +
  ylab("Counts") +
  ggtitle(label = "Distribution of average review score")

dist_average_score

# Distribution of the individual reviewer scores.
dist_reviewer_score <- ggplot(data = hotel, aes(x = Reviewer_Score)) +
  geom_histogram(fill = "deepskyblue", color = "deepskyblue4") +
  xlab("Reviewer's Score") +
  ylab("Counts") +
  ggtitle(label = "Distribution of reviewer's score")

dist_reviewer_score

# Distribution of the average review score by country.
# The x axis shows Average_Score, so it is labelled as such (it used to read
# "Reviewer's Score", copied from the plot above).
dist_average_score_bycountry <- ggplot(data = hotel, aes(x = Average_Score)) +
  geom_histogram(fill = "deepskyblue", color = "deepskyblue4") +
  xlab("Average Score") +
  ylab("Counts") +
  facet_grid(~Country) +
  ggtitle(label = "Distribution of average review score by country")

dist_average_score_bycountry

# Number of reviews by hotel; the dashed blue line marks the mean.
# No fill scale is added: the bars use a constant fill, and the previous
# scale_fill_brewer(palette = 'cadetblue1') named a palette that does not
# exist (cadetblue1 is a colour, not a ColorBrewer palette).
Number_of_Reivew_byhotel <- ggplot(
  data = location,
  aes(x = Hotel_Name, y = Total_Number_of_Reviews)
) +
  geom_bar(
    stat = "identity", alpha = 0.8,
    fill = "lightblue1", color = "lightblue3"
  ) +
  ggtitle(label = "Number of Reviews by hotel") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  geom_hline(
    yintercept = mean(location$Total_Number_of_Reviews),
    linetype = "dashed", color = "blue"
  )

Number_of_Reivew_byhotel

Let’s look at the distribution of the hotel scores. The first graph shows a left-skewed distribution of the hotels’ average scores. The distribution of the reviewers’ scores shows that people tend to give generous scores. The hotels of each country show a similar score distribution. The last plot shows the number of reviews for each hotel; the blue dashed line is the mean.

2.4 Correlation of each variable

# Correlation of the numeric variables.

# Hotel level: columns 2, 4 and 9 of `location`, relabelled for the plot.
a <- location[, c(2, 4, 9)]
names(a) <- c("Additional score", "Average Score", "Total number of reviews")
corrplot(
  stats::cor(a),
  method = "color",
  tl.srt = 20,
  tl.col = "black",
  addCoef.col = TRUE
)

# Review level: columns 8, 11, 12 and 13 of `hotel`, relabelled for the plot.
b <- hotel[, c(8, 11, 12, 13)]
names(b) <- c("Negative", "Positive", "Reviewer's given", "Score")
corrplot(
  stats::cor(b),
  method = "color",
  tl.srt = 20,
  tl.cex = 0.8,
  tl.col = "black",
  addCoef.col = TRUE
)

Let’s see how the numeric variables relate to each other. The first correlation plot shows the correlations at the hotel level. It shows that the additional score and the total number of reviews are highly correlated. (Additional score means the number of scores given without a review.) This can be interpreted as people tending to go where others often go. The second correlation plot shows the correlations at the review level. It shows that the word count of the negative review is negatively correlated with the score.

2.5 Distribution of review

# Distribution of the word counts of the positive and negative reviews.
dist_positive <- ggplot(hotel, aes(x = Review_Total_Positive_Word_Counts)) +
  geom_histogram(fill = "lightblue1", color = "lightblue3") +
  labs(
    x = "positive word count",
    title = "Distribution of postive word count"
  )

dist_positive

dist_negative <- ggplot(hotel, aes(x = Review_Total_Negative_Word_Counts)) +
  geom_histogram(fill = "lightblue1", color = "lightblue3") +
  labs(
    x = "Negative word count",
    title = "Disribution of negative word count"
  )

dist_negative

There is no significant difference in the distribution of word count. However, it looks like people tend to write longer when they write positive reviews.

2.6 Nationality and continent

# Does the reviewer's nationality or continent affect the score?

# Mean score and review count per nationality and per continent.
score_by_nationality <- hotel %>%
  group_by(Reviewer_Nationality) %>%
  summarise(mean_score = mean(Reviewer_Score), Count = n())

score_by_continent <- hotel %>%
  group_by(Reviewer_Continent) %>%
  summarise(mean_score = mean(Reviewer_Score), Count = n())

# Review counts per reviewer continent. The title now says "Continent": the
# bars are grouped by Reviewer_Continent, not by country.
number_reviewer_continent <- ggplot(data = hotel, aes(x = Reviewer_Continent)) +
  geom_bar(alpha = 0.8, aes(fill = Reviewer_Continent)) +
  scale_fill_brewer(palette = "Blues") +
  ggtitle(label = "Number of Reviews by Continent") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Average score per continent. The dashed reference line is the mean of the
# continent-level means shown in this plot (it previously used the
# nationality-level means of the other plot).
avg_reviewer_score <- ggplot(
  score_by_continent,
  aes(Reviewer_Continent, mean_score)
) +
  geom_bar(stat = "identity", fill = "lightblue1", color = "lightblue3") +
  ylab("Average Score") +
  ggtitle(label = "Average score by reviwer's Continent") +
  geom_hline(
    yintercept = mean(score_by_continent$mean_score),
    linetype = "dashed", color = "blue"
  )

# Average score per nationality; the dashed line is the mean of those means.
avg_reviewer_nationality <- ggplot(
  score_by_nationality,
  aes(Reviewer_Nationality, mean_score)
) +
  geom_bar(stat = "identity", fill = "lightblue1", color = "lightblue3") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  ylab("Average Score") +
  ggtitle(label = "Average score by reviwer's nationality") +
  geom_hline(
    yintercept = mean(score_by_nationality$mean_score),
    linetype = "dashed", color = "blue"
  )

# Review counts per nationality; the dashed line is the mean count.
number_reviewer_nationality <- ggplot(
  score_by_nationality,
  aes(Reviewer_Nationality, Count)
) +
  geom_bar(stat = "identity", fill = "lightblue1", color = "lightblue3") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  ylab("Count") +
  ggtitle(label = "The number of reviews by reviwer's nationality") +
  geom_hline(
    yintercept = mean(score_by_nationality$Count),
    linetype = "dashed", color = "blue"
  )

number_reviewer_nationality

number_reviewer_continent

avg_reviewer_nationality

avg_reviewer_score

Does the reviewer’s nationality or continent affect the score? According to the first and second plots, this data is biased towards specific reviewer nationalities and continents, because the dataset covers hotels in Europe. The third plot shows the average reviewer score by the reviewer’s nationality. However, it is hard to read because there are so many nationalities. Therefore, we can use the continent of the reviewer’s nationality instead.

2.7 Travel type

# Travel type: mean reviewer score and number of reviews per type.
travel_type <- hotel %>%
  group_by(Travel_type) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())

# Bars ordered by decreasing frequency.
number_travel_type <- ggplot(
  travel_type,
  aes(x = reorder(Travel_type, -Frequency), y = Frequency, fill = Travel_type)
) +
  geom_bar(stat = "identity") +
  labs(x = "Travel type", y = "Frequency", title = "Number of Travel type")

# Bars ordered by decreasing average score.
avg_travel_type <- ggplot(
  travel_type,
  aes(
    x = reorder(Travel_type, -average_score),
    y = average_score,
    fill = Travel_type
  )
) +
  geom_bar(stat = "identity") +
  labs(
    x = "Travel type",
    y = "Average Score",
    title = "Average Score by Travel type"
  )

number_travel_type

avg_travel_type

Does Travel type affect the score? Most people’s travel purpose is leisure according to the first plot from this dataset. And leisure’s average score is a little higher than business purpose.

2.8 Travel with whom

# Theme fragment shared by both plots: hide the x-axis title, labels and ticks.
blank_x_axis <- theme(
  axis.title.x = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks.x = element_blank()
)

# Number of reviews per companion category.
Frequency_with <- ggplot(hotel, aes(x = With, fill = With)) +
  geom_bar(alpha = 0.8) +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Frequency of with whom") +
  blank_x_axis

Frequency_with

# Average score (and review count) per companion category.
with_score <- hotel %>%
  group_by(With) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())

avg_with <- ggplot(with_score, aes(x = With, y = average_score, fill = With)) +
  geom_bar(alpha = 0.8, stat = "identity") +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Average Score by with whom") +
  blank_x_axis

avg_with

Does travel with whom affect score? The couple traveler is the highest type in this dataset, and traveler with friends is the lowest one according to the first plot. However, it seems that there is not a significant difference in a score by traveler type.

2.9 Review writing by mobile device

# Theme fragment shared by both plots: hide the x-axis title, labels and ticks.
blank_x_axis <- theme(
  axis.title.x = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks.x = element_blank()
)

# Number of reviews by the Mobile flag.
frequency_mobile <- ggplot(hotel, aes(x = Mobile, fill = Mobile)) +
  geom_bar(alpha = 0.8) +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Number of reivew by mobile device") +
  blank_x_axis

frequency_mobile

# Average score (and review count) by the Mobile flag.
Mobile_score <- hotel %>%
  group_by(Mobile) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())

avg_mobile <- ggplot(
  Mobile_score,
  aes(x = Mobile, y = average_score, fill = Mobile)
) +
  geom_bar(alpha = 0.8, stat = "identity") +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Average Score by mobile device") +
  blank_x_axis

avg_mobile

Does review by using mobile device affect the score? Most reviews are written by mobile device, but there is no significant difference in the score by using a mobile device.

2.10 Stayed day

# Number of reviews per length of stay.
frequency_stayed_day <- ggplot(hotel, aes(x = Stay)) +
  geom_bar(alpha = 0.8, fill = "lightblue1", color = "lightblue3") +
  labs(title = "Frequency of stayed day")

frequency_stayed_day

# Review count and mean score per (travel type, length of stay) pair.
Stay_score <- hotel %>%
  group_by(Travel_type, Stay) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())

# Hover text for every heatmap cell.
hovertext1 <- paste0(
  "<b>Day:</b>", Stay_score$Stay, "<br>",
  "<b>Travel Type:</b>", Stay_score$Travel_type, "<br>",
  "<b>Frequency:</b>", Stay_score$Frequency, "<br>"
)

# Heatmap of review frequency: length of stay (x) against travel type (y).
heatmap_travel_type <- plot_ly(
  data = Stay_score,
  x = ~Stay,
  y = ~Travel_type,
  z = ~Frequency,
  type = "heatmap",
  hoverinfo = "text",
  text = hovertext1,
  colors = "Reds"
) %>%
  layout(
    title = "Heatmap for travel type and stayed day ",
    xaxis = list(title = "Stayed Day"),
    yaxis = list(title = "Travel type")
  )

heatmap_travel_type

# Average score per length of stay.
Stay_score1 <- hotel %>%
  group_by(Stay) %>%
  summarise(average_score = mean(Reviewer_Score), Frequency = n())

avg_stayed_day <- ggplot(Stay_score1, aes(x = Stay, y = average_score)) +
  geom_bar(
    alpha = 0.8, stat = "identity",
    fill = "lightblue1", color = "lightblue3"
  ) +
  labs(
    title = "Average Score by stayed day",
    x = "Stayed_day",
    y = "Average Score"
  )

avg_stayed_day

Most people stayed fewer than 5 days according to the first plot. The heatmap shows the frequency of each length of stay by travel type; most leisure trips are short. The third plot shows that the average score decreases as the stay gets longer; however, for stays of over 20 days the average score increases.

3 Sentiment Analysis

This dataset has two types of reviews. One is a positive review, and the other is a negative review. We can analyze the sentiment of each review.

3.1 Sentiment of review

# Sentiment analysis of the positive and negative review texts.
hotel_name <- unique(hotel$Hotel_Name)
sentiment_cols <- c("negative", "positive", "sentiment")

# --- Positive review text ---------------------------------------------------
# One row per word, then per-review counts of "bing" positive/negative words
# and their difference (positive - negative).
hotel_p_token <- unnest_tokens(hotel, word, Positive_Review)

ht_pos <- hotel_p_token %>%
  inner_join(get_sentiments("bing")) %>%
  count(review_ID, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

# Full outer join; the three new columns get the "pr_" (positive review) prefix.
hotel_sent <- merge(hotel, ht_pos, by = "review_ID", all = TRUE)
names(hotel_sent)[match(sentiment_cols, names(hotel_sent))] <-
  paste0("pr_", sentiment_cols)

# --- Negative review text ---------------------------------------------------
hotel_n_token <- unnest_tokens(hotel, word, Negative_Review)

ht_neg <- hotel_n_token %>%
  inner_join(get_sentiments("bing")) %>%
  count(review_ID, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

# Same join; the new columns get the "nr_" (negative review) prefix.
hotel_sent <- merge(hotel_sent, ht_neg, by = "review_ID", all = TRUE)
names(hotel_sent)[match(sentiment_cols, names(hotel_sent))] <-
  paste0("nr_", sentiment_cols)

# Reviews without any lexicon word have NA after the joins; treat them as 0.
score_columns <- c(
  "pr_negative", "pr_positive", "pr_sentiment",
  "nr_negative", "nr_positive", "nr_sentiment"
)
hotel_sent[score_columns] <- lapply(
  hotel_sent[score_columns],
  function(values) replace(values, is.na(values), 0)
)

# Overall sentiment of a review: sum of both text fields.
hotel_sent$Total_sentiment <- hotel_sent$pr_sentiment + hotel_sent$nr_sentiment

# Sentiment per hotel: mean reviewer score and mean sentiment values.
sentiment_byhotel <- hotel_sent %>%
  group_by(
    Hotel_Name, Average_Score,
    Additional_Number_of_Scoring, Total_Number_of_Reviews
  ) %>%
  summarise(
    Reviwer_average = mean(Reviewer_Score),
    pr_sentiment = mean(pr_sentiment),
    nr_sentiment = mean(nr_sentiment),
    Total_sentiment = mean(Total_sentiment)
  )

# Theme fragment shared by the three plots: hide the x-axis title, labels and
# ticks.
blank_x_axis <- theme(
  axis.title.x = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks.x = element_blank()
)

plot_prsentiment_byhotel <- ggplot(
  sentiment_byhotel,
  aes(Hotel_Name, pr_sentiment)
) +
  geom_col(show.legend = FALSE, fill = "lightblue1", color = "lightblue3") +
  blank_x_axis +
  ggtitle(label = "Sentiment of positive review ")

plot_nrsentiment_byhotel <- ggplot(
  sentiment_byhotel,
  aes(Hotel_Name, nr_sentiment)
) +
  geom_col(show.legend = FALSE, fill = "lightblue1", color = "lightblue3") +
  blank_x_axis +
  ggtitle(label = "Sentiment of negative review ")

plot_sentiment_byhotel <- ggplot(
  sentiment_byhotel,
  aes(Hotel_Name, Total_sentiment)
) +
  geom_col(show.legend = FALSE, fill = "lightblue1", color = "lightblue3") +
  blank_x_axis +
  ggtitle(label = "Sum of Sentiment")

# Mean sentiment of each hotel, for the positive text, the negative text and
# their sum.
plot_prsentiment_byhotel

plot_nrsentiment_byhotel

plot_sentiment_byhotel

The first plot shows each hotel’s mean sentiment for the positive reviews. Every hotel’s sentiment is above zero, which means positive reviews tend to be written with positive words. On the other hand, the negative reviews show a mix of negative and positive sentiment. People often use negative words when they write negative reviews. However, word-level sentiment analysis cannot always catch a negative meaning, because it can be expressed with a positive word (for example, “not suitable”). Finally, the total sentiment of every hotel is positive. This can be interpreted as people using positive words more often when they write their reviews.

3.2 Compare sentiment each hotel

# Per-review sentiment for a handful of example hotels, one facet per hotel.
#
# The hotels are selected with %in%. Comparing with `==` against a vector of
# several names recycles that vector along the rows, so a review is only kept
# when its position happens to line up with its hotel's slot in the vector,
# which silently drops most of the reviews.
#
# NOTE(review): the first plot uses hotel_name[5:6] while the other two use
# hotel_name[5:10]; kept as found -- confirm whether that is intended.
blank_x_axis <- theme(
  axis.title.x = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks.x = element_blank()
)

plot_prsent_eachhotel <- ggplot(
  hotel_sent %>% filter(Hotel_Name %in% hotel_name[5:6]),
  aes(review_ID, pr_sentiment, fill = Hotel_Name)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Hotel_Name, ncol = 2, scales = "free_x") +
  blank_x_axis

plot_nrsent_eachhotel <- ggplot(
  hotel_sent %>% filter(Hotel_Name %in% hotel_name[5:10]),
  aes(review_ID, nr_sentiment, fill = Hotel_Name)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Hotel_Name, ncol = 2, scales = "free_x") +
  blank_x_axis

plot_sent_eachhotel <- ggplot(
  hotel_sent %>% filter(Hotel_Name %in% hotel_name[5:10]),
  aes(review_ID, Total_sentiment, fill = Hotel_Name)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Hotel_Name, ncol = 2, scales = "free_x") +
  blank_x_axis

plot_prsent_eachhotel

plot_nrsent_eachhotel

plot_sent_eachhotel

After extracting the words from both the positive and the negative text of every review, a sentiment variable can be computed for each review. We can then compare the review sentiment of individual hotels. For example, the plots show the review sentiment of six hotels.

3.3 Correlation of sentiment

# Correlation between sentiment and the review-level variables
# (columns 9, 12, 13, 14 and 31 of `hotel_sent`).
# Note: this data frame is named `cor`, like the function applied to it below.
cor <- hotel_sent[, c(9, 12, 13, 14, 31)]

corrplot(
  stats::cor(cor),
  method = "color",
  tl.srt = 20,
  tl.cex = 0.8,
  tl.col = "black",
  addCoef.col = TRUE
)

# Correlation between sentiment and the other variables at hotel level
# (every column of `sentiment_byhotel` except the first).
corrplot(
  stats::cor(sentiment_byhotel[, -1]),
  method = "color",
  tl.srt = 20,
  tl.cex = 0.8,
  tl.col = "black",
  addCoef.col = TRUE
)

Then what is the relationship between the score and the sentiment of the reviews? The first plot shows the correlations between the review-level variables. As we can see, the total sentiment (Total_sentiment) is positively correlated with the word counts and the reviewer’s score. The second plot shows the correlations between the hotel-level variables. There we can see more clearly that score and sentiment are highly positively correlated.

3.4 Wordcloud for Positive review

# Word frequencies in the positive reviews.

# Count of every "bing" lexicon word, together with its sentiment label.
bing_lexicon <- get_sentiments("bing")
hotel_p_token_count <- hotel_p_token %>%
  inner_join(bing_lexicon) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# The 100 most frequent non-stop-words, drawn as a word cloud.
top_positive_words <- hotel_p_token %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  top_n(100)

wordc_postive <- wordcloud2(
  top_positive_words,
  minRotation = 0,
  maxRotation = 0,
  shape = "circle",
  backgroundColor = "black"
)

# Most frequent words in the positive reviews.
wordc_postive

# Ten most frequent lexicon words per sentiment label.
hotel_p_token_count %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

# Comparison cloud of negative against positive words in the positive reviews.
positive_review_matrix <- acast(
  hotel_p_token_count,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)
comparison.cloud(
  positive_review_matrix,
  colors = c("red2", "blue2"),
  max.words = 80
)

So what are the most used words in the positive reviews? The word cloud shows this clearly: “location” and “staff” are the key words of a positive review. The second plot shows the words that contribute most to the sentiment of the positive reviews. As we can see, positive words are overwhelmingly dominant. The same can be seen in the comparison word cloud.

3.5 Wordcloud for Negative review

# Word frequencies in the negative reviews.

# Count of every "bing" lexicon word, together with its sentiment label.
bing_lexicon <- get_sentiments("bing")
hotel_n_token_count <- hotel_n_token %>%
  inner_join(bing_lexicon) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# Interactive cloud of the 100 most frequent non-stop-words (built but not
# printed here).
wordc_negative <- wordcloud2(
  hotel_n_token %>%
    anti_join(stop_words) %>%
    count(word, sort = TRUE) %>%
    top_n(100),
  minRotation = 0,
  maxRotation = 0,
  shape = "circle",
  backgroundColor = "black"
)

# Most frequent words in the negative reviews.
# wordc_negative

# Static version of the same cloud.
top_negative_words <- hotel_n_token %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  top_n(100)
with(top_negative_words, wordcloud(word, n, max.words = 100))

# Ten most frequent lexicon words per sentiment label.
hotel_n_token_count %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

# Comparison cloud of negative against positive words in the negative reviews.
negative_review_matrix <- acast(
  hotel_n_token_count,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)
comparison.cloud(
  negative_review_matrix,
  colors = c("red2", "blue2"),
  max.words = 80
)

On the other hand, the negative reviews use a wider variety of words. They also contain a similar proportion of negative and positive words.

3.6 Bigram in positive reviews

It is hard to understand the exact meaning of using only one word. Thus we can extend two words for analysis. It is called the bigram.

# Bigrams in the positive reviews.
hotel_p_bigram <- unnest_tokens(
  hotel, bigram, Positive_Review,
  token = "ngrams", n = 2
)

hotel_p_bigram_sep <- separate(
  hotel_p_bigram, bigram, c("word1", "word2"),
  sep = " "
)

# Keep a bigram only when neither of its words is a stop word.
p_bigrams_filtered <- hotel_p_bigram_sep %>%
  filter(!(word1 %in% stop_words$word | word2 %in% stop_words$word))

# Overall bigram counts, most frequent first, without incomplete rows.
p_bigram_counts <- p_bigrams_filtered %>%
  count(word1, word2, sort = TRUE) %>%
  drop_na()
total_bigram_counts <- unite(p_bigram_counts, bigram, word1, word2, sep = " ")

# The 100 most frequent bigrams feed both word clouds.
top_positive_bigrams <- top_n(total_bigram_counts, 100, wt = n)

w_bigram_pos <- wordcloud2(
  top_positive_bigrams,
  minRotation = 0,
  maxRotation = 0,
  shape = "circle",
  backgroundColor = "black"
)

# Frequent bigrams in the positive reviews; clearer than single words.
# w_bigram_pos

with(top_positive_bigrams, wordcloud(bigram, n, max.words = 100))

# Per-hotel bigram statistics for the positive reviews.
p_bigrams_filtered <- drop_na(p_bigrams_filtered)
p_bigrams_united <- unite(p_bigrams_filtered, bigram, word1, word2, sep = " ")

p_bigram_tf_idf <- p_bigrams_united %>%
  count(Hotel_Name, bigram) %>%
  bind_tf_idf(bigram, Hotel_Name, n) %>%
  arrange(desc(n))

# NOTE(review): top_n(10) is called without `wt`, so it ranks by the last
# column, which here is the `word` factor added just above -- not `n` or
# `tf_idf`. Confirm which ranking is intended before relying on this "top 10".
hotel_by_bigram <- p_bigram_tf_idf %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(Hotel_Name) %>%
  top_n(10) %>%
  ungroup()

# Example: tf-idf of the selected bigrams for the third hotel in `hotel_name`.
example_hotel <- hotel_name[3]
frequency_eachhotel <- hotel_by_bigram %>%
  filter(Hotel_Name == example_hotel) %>%
  ggplot(aes(x = reorder(word, tf_idf), y = tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip() +
  ggtitle(label = example_hotel)

# The selected bigrams of one hotel's positive reviews.
frequency_eachhotel

The first word cloud shows the frequent bigrams. It shows more clearly that helpful staff and an excellent location are the important things in positive reviews. The second plot shows the characteristic bigrams of a single hotel (for example, Apollo Hotel).

3.7 Bigram in negative reviews

# Bigrams in the negative reviews.
hotel_n_bigram <- unnest_tokens(
  hotel, bigram, Negative_Review,
  token = "ngrams", n = 2
)

hotel_n_bigram_sep <- separate(
  hotel_n_bigram, bigram, c("word1", "word2"),
  sep = " "
)

# Keep a bigram only when neither of its words is a stop word.
n_bigrams_filtered <- hotel_n_bigram_sep %>%
  filter(!(word1 %in% stop_words$word | word2 %in% stop_words$word))

# Overall bigram counts, most frequent first, without incomplete rows.
n_bigram_counts <- n_bigrams_filtered %>%
  count(word1, word2, sort = TRUE) %>%
  drop_na()
n_total_bigram_counts <- unite(
  n_bigram_counts, bigram, word1, word2,
  sep = " "
)

# Interactive cloud of the 100 most frequent bigrams (built but not printed).
w_bigram_negative <- wordcloud2(
  top_n(n_total_bigram_counts, 100),
  minRotation = 0,
  maxRotation = 0,
  shape = "circle",
  backgroundColor = "black"
)

# It seems like people complain about the facilities of the hotels.
# w_bigram_negative

# Static version of the same cloud.
top_negative_bigrams <- top_n(n_total_bigram_counts, 100)
with(top_negative_bigrams, wordcloud(bigram, n, max.words = 100))

# Per-hotel bigram statistics for the negative reviews.
n_bigrams_filtered <- drop_na(n_bigrams_filtered)
n_bigrams_united <- unite(n_bigrams_filtered, bigram, word1, word2, sep = " ")

n_bigram_tf_idf <- n_bigrams_united %>%
  count(Hotel_Name, bigram) %>%
  bind_tf_idf(bigram, Hotel_Name, n) %>%
  arrange(desc(n))

# NOTE(review): top_n(10) is called without `wt`, so it ranks by the last
# column, which here is the `word` factor added just above -- not `n` or
# `tf_idf`. Confirm which ranking is intended before relying on this "top 10".
n_hotel_by_bigram <- n_bigram_tf_idf %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(Hotel_Name) %>%
  top_n(10) %>%
  ungroup()

# Example: tf-idf of the selected bigrams for the third hotel in `hotel_name`.
example_hotel <- hotel_name[3]
n_frequency_eachhotel <- n_hotel_by_bigram %>%
  filter(Hotel_Name == example_hotel) %>%
  ggplot(aes(x = reorder(word, tf_idf), y = tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip() +
  ggtitle(label = example_hotel)

# The selected bigrams of one hotel's negative reviews.
n_frequency_eachhotel

We can also see bigram for negative reviews. The negative review seems to have many issues about the facility.

3.8 Negative term in negative reviews

# To avoid misreading the negative reviews, look at words that directly follow
# a negation word.
negation_words <- c("not", "no", "never", "without")

# Bigrams whose first word is a negation and whose second word is in the AFINN
# lexicon, counted together with that word's score.
# NOTE(review): this relies on get_sentiments('afinn') exposing a `score`
# column; some tidytext/textdata releases name it `value` -- confirm against
# the installed version.
afinn_lexicon <- get_sentiments('afinn')
negated_words <- hotel_n_bigram_sep %>%
  filter(word1 %in% negation_words) %>%
  inner_join(afinn_lexicon, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE)

# The 100 most frequent negated bigrams (third column, the score, dropped).
top_negated_bigrams <- negated_words[, -3] %>%
  unite(bigram, word1, word2, sep = " ") %>%
  top_n(100, wt = n)

# Interactive word cloud (built but not printed here).
not_word <- wordcloud2(
  top_negated_bigrams,
  minRotation = 0,
  maxRotation = 0,
  shape = "circle",
  backgroundColor = "black"
)

# not_word

# Static version of the same cloud.
with(top_negated_bigrams, wordcloud(bigram, n, max.words = 100))

# Compare the sentiment contribution (score * occurrences) of the words that
# follow each negation word; up to 20 largest absolute contributions per
# negation word (ties included).
comapre_not <- negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  group_by(word1) %>%
  top_n(20, abs(contribution)) %>%
  ggplot(
    aes(x = reorder(word2, n * score), y = n * score, fill = n * score > 0),
    height = 1000
  ) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by negative express") +
  ylab("Sentiment score * number of occurrences") +
  facet_wrap(~word1, ncol = 2, scales = "free_y") +
  coord_flip() +
  ggtitle(label = "Compare sentiment score include negative term (Not,No,Never,Without)")

comapre_not

To avoid misreading the negative reviews, we need to analyze words combined with a negation term such as “not” or “no”. This is because a positive word preceded by a negation actually carries a negative meaning, and a word-level analysis would wrongly count it as positive.

4 Network visualization

4.1 Positive review

# Network visualization for the positive reviews.

# Graph built from the bigrams that occur more than 1000 times.
p_bigram_graph <- graph_from_data_frame(filter(p_bigram_counts, n > 1000))

# Fixed seed, set before the "fr" layout is requested.
set.seed(777)

# Closed arrow head definition.
grid <- grid::arrow(type = "closed", length = unit(.15, "inches"))

pos_network <- ggraph(p_bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void() +
  ggtitle(label = "Network visualization of Positive Review")

# Bigram relationships in the positive reviews.
pos_network

The network shows the relationships between words more clearly. In the positive reviews, "staff" and "location" are the most important keywords.

4.2 Negative review

# Network visualization for the negative reviews

# Graph built from the negative-review bigram rows with n > 1000
n_bigram_graph <- graph_from_data_frame(filter(n_bigram_counts, n > 1000))

# Graph of negation-term bigrams (score column dropped by position) with n > 100
n_bigram_graph_n <- graph_from_data_frame(filter(negated_words[, -3], n > 100))


# Seed is fixed before the two "fr" layouts are computed
set.seed(777)

# Closed-head arrow used for the edges of the negation-term network
grid <- grid::arrow(type = "closed", length = unit(.10, "inches"))

network_negative <- ggraph(n_bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "darkred"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void() +
  ggtitle(label = "Network visualization of Negative Review")

not_relationship <- ggraph(n_bigram_graph_n, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    arrow = grid,
    end_cap = circle(.07, 'inches')
  ) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void() +
  ggtitle(label = "Network visualization including negative term")

# Bigram relationships in the negative reviews
network_negative

# Relationships of the negation terms (not, no, without, never) in the negative reviews
not_relationship

On the other hand, negative reviews focus more on the facilities. In the second plot, we can clearly see how "not" and "no" connect to other words.

4.3 From which continent

Using network visualization, we can see which continents' travelers frequently visited which countries.

5 XGBoost Score prediction model

5.1 Regression

# Build model for predicting the score
# New variables derived from the bigram analysis

# Regex alternation ("w1 w2|w3 w4|...") of the first 10 negation bigrams in
# negated_words (which is sorted by count, most frequent first).
not <- negated_words[, -3] %>% unite(bigram, word1, word2, sep = " ")
not <- paste0(head(as.character(not$bigram), 10), collapse = "|")

# Same construction for the first 10 entries of the first column of
# total_bigram_counts.
postive_bigram <- paste0(
  head(as.character(total_bigram_counts[[1]]), 10),
  collapse = "|"
)

# An empty pattern would match every review and silently set every flag to 1.
stopifnot(nzchar(not), nzchar(postive_bigram))

# 1 when the review text contains any of the selected bigrams, 0 otherwise
hotel_sent$Not <- as.numeric(grepl(not, hotel_sent$Negative_Review))
hotel_sent$Postive_bigram <- as.numeric(
  grepl(postive_bigram, hotel_sent$Positive_Review)
)

# Modelling columns are picked from hotel_sent by position
hotel_for_modeling <- hotel_sent[, c(1, 9, 12:14, 19:24, 27, 30:33)]
hotel_for_modeling$Mobile <- as.numeric(hotel_for_modeling$Mobile)

# Data for the regression model: everything except the first column
hotel_for_modeling1 <- hotel_for_modeling[, -1]
# hotel_for_modeling$Level <- as.factor(hotel_for_modeling$Level)


# One-hot encoding for xgboost
ohe_feats <- c("Travel_type", "With", "Reviewer_Continent", "Country")
dummies <- dummyVars(
  ~ Travel_type + With + Reviewer_Continent + Country,
  data = hotel_for_modeling1
)
df_all_ohe <- as.data.frame(predict(dummies, newdata = hotel_for_modeling1))

# Replace the original categorical columns with their dummy-encoded versions
df_all_combined <- cbind(
  hotel_for_modeling1[, !(colnames(hotel_for_modeling1) %in% ohe_feats)],
  df_all_ohe
)

# Divide train and test set
set.seed(777)
# sample.split() expects the vector of labels. Given a whole data frame it
# returns one flag per COLUMN, which subset() then recycles over the rows, so
# the "split" becomes a fixed repeating pattern instead of a random 70/30
# sample. Splitting on the target vector yields one flag per row.
# NOTE: the training log and RMSE recorded after this chunk were produced with
# the earlier column-based split; re-run to refresh them.
sample <- sample.split(df_all_combined$Reviewer_Score, SplitRatio = 0.7)
x_train <- subset(df_all_combined, sample == TRUE)
x_test <- subset(df_all_combined, sample == FALSE)

# Separate the target from the predictors
train_label <- x_train$Reviewer_Score
test_label <- x_test$Reviewer_Score
x_train$Reviewer_Score <- NULL
x_test$Reviewer_Score <- NULL


# Gradient-boosted regression trees on the numeric feature matrix.
# NOTE(review): "reg:linear" is reported as deprecated in favour of
# "reg:squarederror" by newer xgboost releases; confirm against the installed
# version before changing it.
bst <- xgboost(
  data = data.matrix(x_train),
  label = data.matrix(train_label),
  max.depth = 15,
  eta = 0.1,
  nthread = 4,
  nrounds = 50,
  colsample_bytree = 0.8,
  subsample = 0.5,
  objective = "reg:linear"
)
## [1]  train-rmse:7.279425 
## [2]  train-rmse:6.574137 
## [3]  train-rmse:5.941869 
## [4]  train-rmse:5.374878 
## [5]  train-rmse:4.867425 
## [6]  train-rmse:4.414244 
## [7]  train-rmse:4.010385 
## [8]  train-rmse:3.650982 
## [9]  train-rmse:3.328507 
## [10] train-rmse:3.041918 
## [11] train-rmse:2.789557 
## [12] train-rmse:2.566329 
## [13] train-rmse:2.366747 
## [14] train-rmse:2.193556 
## [15] train-rmse:2.039579 
## [16] train-rmse:1.907296 
## [17] train-rmse:1.788616 
## [18] train-rmse:1.685695 
## [19] train-rmse:1.596526 
## [20] train-rmse:1.518879 
## [21] train-rmse:1.451645 
## [22] train-rmse:1.394337 
## [23] train-rmse:1.344820 
## [24] train-rmse:1.302403 
## [25] train-rmse:1.266152 
## [26] train-rmse:1.236283 
## [27] train-rmse:1.210139 
## [28] train-rmse:1.189812 
## [29] train-rmse:1.171041 
## [30] train-rmse:1.155197 
## [31] train-rmse:1.140501 
## [32] train-rmse:1.128618 
## [33] train-rmse:1.119657 
## [34] train-rmse:1.110706 
## [35] train-rmse:1.101752 
## [36] train-rmse:1.094239 
## [37] train-rmse:1.089360 
## [38] train-rmse:1.084851 
## [39] train-rmse:1.081107 
## [40] train-rmse:1.075834 
## [41] train-rmse:1.070911 
## [42] train-rmse:1.067415 
## [43] train-rmse:1.062021 
## [44] train-rmse:1.057327 
## [45] train-rmse:1.053389 
## [46] train-rmse:1.051391 
## [47] train-rmse:1.048927 
## [48] train-rmse:1.045583 
## [49] train-rmse:1.043891 
## [50] train-rmse:1.040725
# Test RMSE
# Predictions for the held-out rows and for the training rows
y_pred <- predict(bst, newdata = data.matrix(x_test))
y_pred_train <- predict(bst, newdata = data.matrix(x_train))

# cbind(x_train, train_label, y_pred_train)

# Root mean squared error on the test set
sqrt(mean((test_label - y_pred)^2))
## [1] 1.24545
# Variable importance of the regression model
importance <- xgb.importance(
  feature_names = colnames(x_train),
  model = bst
)
# head(importance, 10)
xgb.plot.importance(importance_matrix = importance)

Building on the earlier analysis, we create new variables from the positive and negative bigrams and the sentiment scores, and use them to find the variables that matter for the reviewer’s score and to predict the reviewer’s rating. We use XGBoost to build the regression model. After fitting the model, the train RMSE is 1.04 and the test RMSE is 1.24. As the graph shows, the top 5 most important variables are the negative review’s count, the positive sentiment, the summation of sentiment, the positive review’s count, and the reviewer’s previous number of reviews. These factors appear to be meaningful when people give scores.

5.2 Binary Classification

# Classification model

# Based on the distribution of Reviewer_Score: good hotel (1) vs. bad hotel (0).
# Level starts as an empty string, so the assigned 1/0 values are stored as text.
hotel_for_modeling$Level <- ""
hotel_for_modeling$Level[
  which(hotel_for_modeling$Reviewer_Score >= 9 &
    hotel_for_modeling$Reviewer_Score <= 10)
] <- 1
hotel_for_modeling$Level[which(hotel_for_modeling$Reviewer_Score < 9)] <- 0

# Classification data: drop columns 1 and 5 by position
hotel_for_modeling2 <- hotel_for_modeling[, -c(1, 5)]
# hotel_for_modeling$Level <- as.factor(hotel_for_modeling$Level)


# One-hot encoding for xgboost (classification data)
ohe_feats <- c("Travel_type", "With", "Reviewer_Continent", "Country")
dummies1 <- dummyVars(
  ~ Travel_type + With + Reviewer_Continent + Country,
  data = hotel_for_modeling2
)
df_all_ohe1 <- as.data.frame(predict(dummies1, newdata = hotel_for_modeling2))

# Bind the dummy columns computed for THIS data set (df_all_ohe1); the earlier
# code bound df_all_ohe from the regression section by mistake.
df_all_combined1 <- cbind(
  hotel_for_modeling2[, !(colnames(hotel_for_modeling2) %in% ohe_feats)],
  df_all_ohe1
)

# Divide train and test set
set.seed(777)
# sample.split() expects the vector of labels. Given a whole data frame it
# returns one flag per COLUMN, which subset() then recycles over the rows, so
# the "split" becomes a fixed repeating pattern. Splitting on the Level vector
# yields one flag per row and keeps the class ratio equal in both subsets.
# NOTE: the training log and confusion matrix recorded after this chunk were
# produced with the earlier column-based split; re-run to refresh them.
sample <- sample.split(df_all_combined1$Level, SplitRatio = 0.7)
x_train1 <- subset(df_all_combined1, sample == TRUE)
x_test1 <- subset(df_all_combined1, sample == FALSE)

# Separate the class label from the predictors
train_label1 <- x_train1$Level
test_label1 <- x_test1$Level
x_train1$Level <- NULL
x_test1$Level <- NULL


# Gradient-boosted trees with a logistic objective (outputs probabilities)
bst_class <- xgboost(
  data = data.matrix(x_train1),
  label = data.matrix(train_label1),
  max.depth = 15,
  eta = 0.1,
  nthread = 4,
  nrounds = 25,
  colsample_bytree = 0.8,
  subsample = 0.5,
  objective = "binary:logistic"
)
## [1]  train-error:0.258928 
## [2]  train-error:0.247152 
## [3]  train-error:0.241068 
## [4]  train-error:0.238379 
## [5]  train-error:0.236505 
## [6]  train-error:0.235448 
## [7]  train-error:0.234035 
## [8]  train-error:0.233003 
## [9]  train-error:0.232195 
## [10] train-error:0.230993 
## [11] train-error:0.229850 
## [12] train-error:0.229384 
## [13] train-error:0.228153 
## [14] train-error:0.227092 
## [15] train-error:0.226225 
## [16] train-error:0.225585 
## [17] train-error:0.224101 
## [18] train-error:0.223285 
## [19] train-error:0.222236 
## [20] train-error:0.221145 
## [21] train-error:0.219971 
## [22] train-error:0.218998 
## [23] train-error:0.218455 
## [24] train-error:0.217631 
## [25] train-error:0.216602
# Test accuracy
# Predicted probabilities are thresholded at 0.5 to get the 0/1 class
y_pred1 <- predict(bst_class, data.matrix(x_test1))
y_pred1 <- ifelse(y_pred1 > 0.5, 1, 0)
df <- data.frame(test = test_label1, Pred = y_pred1)

# caret's table method reads ROWS as predictions and COLUMNS as the true
# classes. The earlier table(test, Pred) was transposed, which swaps
# sensitivity with positive predictive value and specificity with negative
# predictive value (accuracy and kappa are unaffected).
# Explicit factor levels keep the table 2 x 2 even if one class is never
# predicted.
# NOTE: the statistics recorded after this chunk come from the transposed
# table; re-run to refresh them.
confusionMatrix(table(
  Prediction = factor(df$Pred, levels = c(0, 1)),
  Reference = factor(df$test, levels = c(0, 1))
))
## Confusion Matrix and Statistics
## 
##    
##         0     1
##   0 61828 21280
##   1 21874 54901
##                                           
##                Accuracy : 0.7301          
##                  95% CI : (0.7279, 0.7323)
##     No Information Rate : 0.5235          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4592          
##  Mcnemar's Test P-Value : 0.004309        
##                                           
##             Sensitivity : 0.7387          
##             Specificity : 0.7207          
##          Pos Pred Value : 0.7439          
##          Neg Pred Value : 0.7151          
##              Prevalence : 0.5235          
##          Detection Rate : 0.3867          
##    Detection Prevalence : 0.5198          
##       Balanced Accuracy : 0.7297          
##                                           
##        'Positive' Class : 0               
## 
# Variable importance of the classification model
importance1 <- xgb.importance(
  feature_names = colnames(x_train1),
  model = bst_class
)
# head(importance, 10)
xgb.plot.importance(importance_matrix = importance1)

We can also build a classification model. Based on the distribution of the reviewer’s score, we label a review with a rating of 9 or higher as a good hotel, and otherwise as a not-good hotel, which makes this a binary classification problem. After applying XGBoost, the test accuracy is 73%. The important variables are similar to those of the regression model as well.

6 Conclusion

We can get a lot of information from the hotel review data. The exploratory data analysis shows that people tend to give generous scores when they write reviews. The length of stay also seems to affect the reviewer’s rating, and the average score differs slightly depending on which continent the travelers are from. From the most frequent words in each kind of review, we can identify what matters most in the positive and negative aspects of a hotel. We can also see that people tend to use negative words when writing positive reviews, and positive words when writing negative reviews. By analyzing bigrams for a more accurate semantic analysis, we can understand how words lean toward the negative or positive side of a hotel, and thus confirm the characteristics of each hotel. Furthermore, the relationship between two words can be analyzed through network visualization. Based on this information, a prediction model can be built. Through this model, we can confirm the significant factors for the reviewer’s score, such as the negative review’s count, the positive and negative sentiment, the summation of sentiment, the positive review’s count, the reviewer’s previous number of reviews, the length of stay, and so on.

7 Dashboard for hotel comparison

The following link provides a Shiny app for comparing each hotel’s information from the analysis: https://dw8757.shinyapps.io/hotel_comparing_system/