Executive summary

  1. What is the relationship between how unhappy a guest is (as reflected in their rating) and how long their review is?
  2. What is management's response strategy: are they focusing on happy or unhappy customers?
  3. I also checked how traveler demographics affect the ratings and the expectations of what a 5-star experience is.

Data background

I used the publicly available TripAdvisor Hotel Reviews Dataset to study the differences in the ratings left by customers: how satisfied and unsatisfied guests differ in their ratings and in the length of their feedback, and how management approaches responding to the reviews.

Data cleaning

library(flexdashboard)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the TripAdvisor reviews and derive the helper columns used by the figures.
# NOTE(review): the path below is absolute and machine-specific; consider a
# project-relative path so the report can be knitted elsewhere.
df <- mutate(
  read_csv("C:/Users/yudit/Downloads/final project/tripadvisor_hotel_reviews_dataset.csv"),
  # TRUE when the management_response field is not missing
  has_response = !is.na(management_response),
  # Number of matches of "\\S+" (runs of non-whitespace) in the review text
  word_count = str_count(review_text, "\\S+"),
  # Missing or "NONE" trip types become "Other"; every label is title-cased
  trip_type = str_to_title(
    ifelse(trip_type == "NONE" | is.na(trip_type), "Other", trip_type)
  )
)
## Rows: 1098 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel_name, place_type, category, hotel_url, review_title, review...
## dbl   (3): review_id, rating, helpful_votes
## date  (2): publishedDate, stay_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Individual figures

Figure 1

# Percentage of reviews with a management response, one row per star rating
response_summary <- summarize(
  group_by(df, rating),
  rate = 100 * mean(has_response)
)

# Bar chart: one bar per rating, coloured on a red-to-green palette.
# The fill legend is hidden because it would only repeat the x axis.
p1 <- response_summary %>%
  ggplot(aes(x = as.factor(rating), y = rate, fill = as.factor(rating))) +
  geom_col() +
  labs(x = "Rating (1-5)", y = "% Answered") +
  scale_fill_brewer(palette = "RdYlGn") +
  guides(fill = "none") +
  theme_minimal()

# Interactive version for the dashboard
ggplotly(p1)

Figure 2

# Step 1: response rate (in %) per star rating.
# The column must be called 'response_rate' because the plot below maps it by name.
response_summary <- summarize(
  group_by(df, rating),
  response_rate = 100 * mean(has_response)
)

# Step 2: lollipop chart, drawn as a grey stick from 0 up to the response rate
# with a coloured dot at its end. coord_flip() lays the lollipops out horizontally.
p_lollipop <- response_summary %>%
  ggplot(aes(x = as.factor(rating), y = response_rate)) +
  # Stick
  geom_segment(
    aes(
      x = as.factor(rating),
      xend = as.factor(rating),
      y = 0,
      yend = response_rate
    ),
    color = "grey70"
  ) +
  # Dot, shaded from red (low rate) to green (high rate)
  geom_point(aes(color = response_rate), size = 5) +
  scale_color_gradient(low = "#e41a1c", high = "#4daf4a") +
  labs(
    title = "Response Rate Efficiency",
    x = "Guest Rating",
    y = "Percent Answered (%)"
  ) +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none")

# Step 3: interactive version for the dashboard
ggplotly(p_lollipop)

Figure 3

# Violin plot of the rating distribution for each traveler type.
#
# The data-loading step recodes missing and "NONE" trip types to "Other", so a
# filter on NA / "NONE" alone never removes anything and the unknown category
# would still be plotted. Exclude "Other" as well so only real traveler types
# are shown, consistent with the other traveler-type figures.
p2 <- df %>%
  filter(!is.na(trip_type), !trip_type %in% c("NONE", "Other")) %>%
  ggplot(aes(x = trip_type, y = rating, fill = trip_type)) +
  geom_violin(alpha = 0.5) +
  # Horizontal layout keeps the traveler-type labels readable
  coord_flip() +
  theme_minimal() +
  labs(x = "") +
  # The fill legend would only repeat the axis labels
  guides(fill = "none")


# Interactive version for the dashboard
ggplotly(p2)

Figure 4

# Step 1: keep only reviews with a known traveler type
boxplot_data <- filter(
  df,
  !is.na(trip_type),
  !trip_type %in% c("Other", "NONE")
)

# Step 2: review length by rating, one panel per traveler type.
# 'word_count' is expected to already exist on df (derived when the data is loaded).
# NOTE(review): outlier.shape = NA hides outliers in ggplot2; confirm the
# ggplotly() rendering hides them as well.
p_faceted <- boxplot_data %>%
  ggplot(aes(x = as.factor(rating), y = word_count, fill = as.factor(rating))) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  facet_wrap(~trip_type) +
  scale_fill_brewer(palette = "RdYlGn") +
  # Zoom the y axis to 0-600 words without dropping rows from the statistics
  coord_cartesian(ylim = c(0, 600)) +
  labs(
    x = "Rating (1 = Unhappy, 5 = Happy)",
    y = "Review Word Count",
    fill = "Rating"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Step 3: interactive version for the dashboard
ggplotly(p_faceted)

Figure 5

# Step 1: restrict to the main traveler categories by dropping missing,
# "NONE" and "Other" trip types, then title-case the labels.
violin_data <- mutate(
  filter(df, !(is.na(trip_type) | trip_type %in% c("NONE", "Other"))),
  trip_type = str_to_title(trip_type)
)

# Step 2: violin plot of ratings per traveler category
p_violin <- violin_data %>%
  ggplot(aes(x = trip_type, y = rating, fill = trip_type)) +
  # Violin outline = density of ratings; trim = FALSE keeps the tails untrimmed
  geom_violin(alpha = 0.5, trim = FALSE) +
  # Narrow boxplot overlay to make the median and quartiles visible
  geom_boxplot(width = 0.1, color = "black", outlier.shape = NA, alpha = 0.7) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Distribution of Satisfaction by Traveler Type",
    subtitle = "The width of the violin represents the concentration of ratings",
    x = "Traveler Category",
    y = "Rating (1-5)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Step 3: interactive version for the dashboard
ggplotly(p_violin)

```