I used a publicly available TripAdvisor Hotel Reviews dataset to explore how the ratings left by customers differ: the difference between satisfied and unsatisfied guests, their ratings and the length of their feedback, and management's approach to responding to reviews.
library(flexdashboard)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Read the hotel-review CSV, then add the helper columns that the charts
# further down rely on.
df <- read_csv("C:/Users/yudit/Downloads/final project/tripadvisor_hotel_reviews_dataset.csv")
df <- df %>%
  mutate(
    # TRUE when management_response is not missing
    has_response = !is.na(management_response),
    # number of runs of non-whitespace characters in the review text
    word_count = str_count(review_text, "\\S+"),
    # keep real trip types; missing or "NONE" values are pooled into "Other",
    # and every label is title-cased
    trip_type = str_to_title(
      ifelse(!is.na(trip_type) & trip_type != "NONE", trip_type, "Other")
    )
  )
## Rows: 1098 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): hotel_name, place_type, category, hotel_url, review_title, review...
## dbl (3): review_id, rating, helpful_votes
## date (2): publishedDate, stay_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Percentage of reviews with a management response, per rating value.
response_summary <- summarise(
  group_by(df, rating),
  rate = mean(has_response) * 100
)

# Bar chart: one bar per rating, bar height = percent answered.
p1 <- response_summary %>%
  ggplot(mapping = aes(x = as.factor(rating), y = rate, fill = as.factor(rating))) +
  geom_col() +
  theme_minimal() +
  labs(x = "Rating (1-5)", y = "% Answered") +
  scale_fill_brewer(palette = "RdYlGn") +
  # fill repeats what the x axis already shows, so its legend is hidden
  guides(fill = "none")

# Interactive (plotly) rendering of the bar chart
ggplotly(p1)
# Step 1: percentage of answered reviews per rating. The column is named
# 'response_rate' because the chart below maps to that name.
response_summary <- summarise(
  group_by(df, rating),
  response_rate = mean(has_response) * 100
)

# Step 2: lollipop chart (grey stick from 0 up to the rate, coloured dot on top)
p_lollipop <- response_summary %>%
  ggplot(mapping = aes(x = as.factor(rating), y = response_rate)) +
  # stick: a segment running from zero to the response rate
  geom_segment(
    mapping = aes(
      x = as.factor(rating),
      xend = as.factor(rating),
      y = 0,
      yend = response_rate
    ),
    color = "grey70"
  ) +
  # dot: coloured on a red-to-green gradient by response rate
  geom_point(mapping = aes(color = response_rate), size = 5) +
  scale_color_gradient(low = "#e41a1c", high = "#4daf4a") +
  # ratings on the vertical axis, percentages on the horizontal one
  coord_flip() +
  labs(
    title = "Response Rate Efficiency",
    x = "Guest Rating",
    y = "Percent Answered (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Step 3: interactive (plotly) rendering of the lollipop chart
ggplotly(p_lollipop)
# Violin plot of the rating distribution for each traveler type.
#
# When `df` is loaded, missing and "NONE" trip types are recoded to "Other",
# so a filter on NA / "NONE" alone no longer removes anything. The
# unspecified group is therefore excluded by its "Other" label as well, which
# matches the other traveler-type charts in this file.
p2 <- df %>%
  filter(!is.na(trip_type), !trip_type %in% c("NONE", "Other")) %>%
  ggplot(aes(x = trip_type, y = rating, fill = trip_type)) +
  geom_violin(alpha = 0.5) +
  # traveler types on the vertical axis so the labels read horizontally
  coord_flip() +
  theme_minimal() +
  labs(x = "") +
  # fill repeats the axis categories, so its legend is hidden
  guides(fill = "none")

# Interactive (plotly) rendering of the violin plot
ggplotly(p2)
# Step 1: keep only reviews with a concrete traveler type; rows whose
# trip_type is missing, "Other" or "NONE" are dropped.
boxplot_data <- df %>%
  filter(!is.na(trip_type), !trip_type %in% c("Other", "NONE"))

# Step 2: boxplots of review length per rating, one panel per traveler type.
# 'word_count' is the column derived when `df` is loaded.
p_faceted <- boxplot_data %>%
  ggplot(
    mapping = aes(
      x = as.factor(rating),
      y = word_count,
      fill = as.factor(rating)
    )
  ) +
  # outlier points are not drawn by ggplot
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  facet_wrap(~trip_type) +
  scale_fill_brewer(palette = "RdYlGn") +
  # zoom the y axis to 0-600 words without discarding any rows
  coord_cartesian(ylim = c(0, 600)) +
  labs(
    x = "Rating (1 = Unhappy, 5 = Happy)",
    y = "Review Word Count",
    fill = "Rating"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Step 3: interactive (plotly) rendering of the faceted boxplot
ggplotly(p_faceted)
# 1. Prepare the data
# Rows with a missing, "NONE" or "Other" trip type are removed so that only
# the named traveler categories are plotted; labels are title-cased.
violin_data <- df %>%
  filter(!is.na(trip_type) & !trip_type %in% c("NONE", "Other")) %>%
  mutate(trip_type = str_to_title(trip_type))

# Title and subtitle are kept in variables because they are used twice:
# once for the static ggplot and once for the plotly title below.
violin_title <- "Distribution of Satisfaction by Traveler Type"
violin_subtitle <- "The width of the violin represents the concentration of ratings"

# 2. Create the Violin Plot
p_violin <- ggplot(violin_data, aes(x = trip_type, y = rating, fill = trip_type)) +
  # The violin shows the density of the ratings; trim = FALSE lets the tails
  # extend past the smallest and largest observed rating
  geom_violin(alpha = 0.5, trim = FALSE) +
  # A thin boxplot inside each violin shows the median and quartiles
  geom_boxplot(width = 0.1, color = "black", outlier.shape = NA, alpha = 0.7) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  labs(
    title = violin_title,
    subtitle = violin_subtitle,
    x = "Traveler Category",
    y = "Rating (1-5)"
  ) +
  theme(legend.position = "none")

# 3. Make it interactive for the dashboard
# ggplotly() does not carry the ggplot subtitle over to the plotly figure, so
# the subtitle is appended to the plotly title as a smaller second line.
ggplotly(p_violin) %>%
  plotly::layout(
    title = list(
      text = paste0(violin_title, "<br><sup>", violin_subtitle, "</sup>")
    )
  )
```