Loading the Hotel Reviews data from Kaggle

# Read the hotel-review CSV from the working directory, keeping text
# columns as character vectors rather than factors.
reviews <- read.csv(
  file = "Hotel_Reviews.csv",
  stringsAsFactors = FALSE
)

Extracting the number of nights stayed

library(stringr)
# Pull the "Stayed N night(s)" phrase out of each review's Tags string.
# The pattern accepts multi-digit counts and the singular "night", so
# one-night stays and stays of 10+ nights are no longer turned into NA.
stay_num <- str_extract(reviews$Tags, "Stayed\\s[0-9]+\\snights?")
# Keep the whole run of digits (not just the first one) and convert it.
Num_nights <- as.integer(str_extract(stay_num, "[0-9]+"))
# Mean nights per review, ignoring reviews without a "Stayed ..." phrase.
mean(Num_nights, na.rm = TRUE)
# NOTE(review): the value recorded below was produced with the earlier
# single-digit, plural-only pattern; re-run to refresh it.
## [1] 3.108839

The average length of stay per guest is about 3.1 nights.

Adding the new column to the dataset

# Attach the extracted night counts to `reviews` as the Num_nights column.
# Assigning by name (rather than cbind) keeps this step safe to re-run:
# it overwrites the column instead of appending a duplicate each time.
reviews[["Num_nights"]] <- Num_nights

Removing rows where Num_nights is NA

library(dplyr)
# Keep only reviews with a known night count. is.na() is the explicit NA
# test; comparing against the string "NA" only dropped these rows because
# a comparison with NA evaluates to NA, which filter() discards.
stay <- reviews %>%
  filter(!is.na(Num_nights))

Converting reviewers’ nationality into a factor

# Treat nationality as a categorical variable for grouping and plotting.
stay <- stay %>%
  mutate(Reviewer_Nationality = as.factor(Reviewer_Nationality))

Visualising the average length of stay by nationality (top 15)

library(ggplot2)
# Bar chart of the mean number of nights per reviewer nationality, limited
# to the 15 nationalities with the highest means and ordered from the
# longest to the shortest average stay.
stay %>%
  group_by(Reviewer_Nationality) %>%
  summarise(avg = mean(Num_nights)) %>%
  # Rank explicitly by `avg` instead of relying on top_n()'s implicit
  # "last column" default, which also prints a "Selecting by" message.
  top_n(15, avg) %>%
  ggplot(aes(
    x = reorder(Reviewer_Nationality, -avg),
    y = avg,
    fill = Reviewer_Nationality
  )) +
  geom_bar(stat = "identity") +
  ggtitle("Length of Stay by Nationality") +
  xlab("Nationality") +
  # Rotate the nationality labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("Average length of stay")
## Selecting by avg