library(tidyverse)
# Load the enhanced review data set, turn empty strings into NA and derive
# the date fields used by the trend plots further down.
df <- read.csv("Datasets/enhanced.csv") %>%
  mutate(
    # Empty strings in character columns are treated as missing values.
    across(where(is.character), ~ na_if(., "")),
    Date = as.Date(Date),
    # Namespace-qualified so these calls do not depend on lubridate being
    # attached by library(tidyverse) (only the case for tidyverse >= 2.0.0).
    Year = lubridate::year(Date),
    YearMonth = lubridate::floor_date(Date, "month")
  ) %>%
  # NOTE(review): X is presumably the row-name column left behind by an
  # earlier write.csv() -- confirm against the file.
  select(-X)

# Preview the first 16 rows.
df %>% head(16)

library(NLP)
library(tm)
# This is my (Ardian's) personal text cleaner function.
#
# Builds a tm VCorpus from `text` and applies the enabled cleaning steps in a
# fixed order: lower-casing, number removal, punctuation removal, English
# stop words, Bahasa stop words, stemming, whitespace collapsing.
#
# Arguments:
#   text                  Input handed to VectorSource().
#                         NOTE(review): callers in this file pass the
#                         one-column data frame from select(Review.Body)
#                         rather than a character vector -- confirm this
#                         yields the intended corpus shape.
#   as.corpus             TRUE returns the VCorpus; FALSE returns
#                         sapply(corpus, as.character).
#   lower                 Lower-case the text.
#   rm.number             Remove digits.
#   rm.stopwords_english  Remove the words listed in "stop-words_english.txt".
#   rm.stopwords_bahasa   Remove the words listed in "stop-words_bahasa.txt".
#   rm.punctuation        Remove punctuation ("/" is replaced by a space).
#   stem                  Stem the words with stemDocument().
#   rm.whitespace         Collapse runs of whitespace with stripWhitespace().
#
# Both stop-word files are read relative to the current working directory.
clean_text <- function(text,
                       as.corpus = TRUE,
                       lower = TRUE,
                       rm.number = TRUE,
                       rm.stopwords_english = TRUE,
                       rm.stopwords_bahasa = FALSE,
                       rm.punctuation = TRUE,
                       stem = TRUE,
                       rm.whitespace = TRUE) {
  text_corpus <- text %>%
    VectorSource() %>%
    VCorpus()

  # Lowercasing
  if (lower) {
    text_corpus <- tm_map(text_corpus, content_transformer(tolower))
  }

  # Removing numbers
  if (rm.number) {
    text_corpus <- tm_map(text_corpus, removeNumbers)
  }

  # Removing punctuation
  if (rm.punctuation) {
    # Slashes become spaces *before* punctuation is stripped, so that
    # "food/service" stays two words. Doing it afterwards (as before) was a
    # no-op because "/" is itself punctuation and had already been removed.
    text_corpus <- tm_map(
      text_corpus,
      content_transformer(function(x) gsub("/", " ", x, fixed = TRUE))
    )
    text_corpus <- tm_map(text_corpus, removePunctuation)
    # Second pass kept from the original as a safety net.
    text_corpus <- tm_map(
      text_corpus,
      content_transformer(function(x) gsub("[[:punct:]]+", "", x))
    )
  }

  # Removing english stop words
  if (rm.stopwords_english) {
    list_stop_words_english <- readLines(
      "stop-words_english.txt",
      warn = FALSE,
      encoding = "UTF-8"
    )
    text_corpus <- tm_map(text_corpus, removeWords, list_stop_words_english)
  }

  # Removing bahasa stop words
  if (rm.stopwords_bahasa) {
    list_stop_words_bahasa <- readLines(
      "stop-words_bahasa.txt",
      warn = FALSE,
      encoding = "UTF-8"
    )
    text_corpus <- tm_map(text_corpus, removeWords, list_stop_words_bahasa)
  }

  # Reducing words to their base form
  if (stem) {
    text_corpus <- tm_map(text_corpus, stemDocument)
  }

  # Removing white/blank spaces
  if (rm.whitespace) {
    text_corpus <- tm_map(text_corpus, stripWhitespace)
  }

  # Returning the text as or not as corpus
  if (as.corpus) {
    return(text_corpus)
  }
  sapply(text_corpus, as.character)
}

library(tokenizers)
# This is also my (Ardian's) token counter generator function.
#
# Splits `texts` into word tokens and returns a data frame with one row per
# distinct token (`token`) and its number of occurrences (`n`), sorted by
# descending count. NA tokens are dropped.
#
# Arguments:
#   texts              Text handed to tokenize_words().
#   rm.specific_terms  Terms to exclude from the result. They are passed
#                      through stemDocument() before matching, so plain
#                      words can be given even when `texts` was stemmed.
get_tokens_count <- function(texts, rm.specific_terms = "") {
  tokens <- texts %>%
    tokenize_words() %>%
    unlist()

  # unlist() yields NULL when there is nothing to tokenize; data.frame() would
  # then have no `token` column and count() would fail. Fall back to an empty
  # character vector so an empty input gives an empty count table.
  if (is.null(tokens)) {
    tokens <- character(0)
  }

  excluded_terms <- stemDocument(rm.specific_terms)

  data.frame(token = tokens) %>%
    count(token, sort = TRUE) %>%
    na.omit() %>%
    filter(!token %in% excluded_terms)
}
## Loading required package: RColorBrewer
# This is also my (Ardian's) word cloud generator function.
#
# Draws a word cloud from a token-count table.
#
# Arguments:
#   tokens                   Data frame with a `token` column and a count
#                            column `n` (the shape get_tokens_count() returns).
#   scale                    Passed to wordcloud()'s `scale` argument.
#   normalize_higher_ngrams  When TRUE, every count is replaced by
#                            n * (number of space-separated words in the
#                            token) + 1 before plotting.
#   palette                  Colours interpolated with colorRampPalette(),
#                            one shade per distinct token.
get_wordcloud <- function(tokens,
                          scale = c(2, 0),
                          normalize_higher_ngrams = FALSE,
                          palette = c("pink", "maroon")) {
  # Scale each count by the number of words in its n-gram.
  if (normalize_higher_ngrams) {
    tokens <- tokens %>%
      mutate(n = n * lengths(str_split(token, " ")) + 1)
  }

  # Generating a word cloud with specified settings, scaling word size by
  # frequency. The call is namespace-qualified because no library(wordcloud)
  # is visible in this script, and `colors` is spelled out in full instead of
  # relying on partial matching of `color`.
  wordcloud::wordcloud(
    words = tokens$token,
    freq = tokens$n,
    scale = scale,
    min.freq = 1,
    random.order = FALSE,
    rot.per = 0,
    colors = colorRampPalette(palette)(length(unique(tokens$token)))
  )
}

# Distribution of ratings: histogram with a count-scaled density overlay.
df %>%
  ggplot(aes(x = Rating)) +
  geom_histogram(bins = 10, fill = "#0077C0", color = "#0077C0", alpha = 0.6) +
  geom_density(
    aes(y = after_stat(count)),
    fill = "maroon",
    col = "maroon",
    alpha = 0.5
  ) +
  labs(
    title = "Histogram and Density of Rating",
    x = "Rating",
    y = "Frequency"
  ) +
  theme_minimal()

# Rating density per topic. A review flagged for several topics is drawn
# under the first matching topic in the case_when() order below.
df %>%
  filter(
    Customer_Service == 1 |
      Seat_and_Comfort == 1 |
      Time_and_Delay == 1 |
      Food_and_Beverages == 1
  ) %>%
  ggplot(aes(
    x = Rating,
    fill = factor(case_when(
      Customer_Service == 1 ~ "Customer Service",
      Seat_and_Comfort == 1 ~ "Seat and Comfort",
      Time_and_Delay == 1 ~ "Time and Delay",
      Food_and_Beverages == 1 ~ "Food and Beverages"
    ))
  )) +
  geom_density(alpha = 0.6, col = NA) +
  labs(
    title = "Density of Ratings by Topic",
    x = "Rating",
    y = "Density",
    fill = NULL
  ) +
  theme_minimal() +
  scale_fill_manual(values = c(
    "Customer Service" = "#0077C0",
    "Seat and Comfort" = "darkgreen",
    "Time and Delay" = "maroon",
    "Food and Beverages" = "goldenrod"
  )) +
  theme(legend.position = "bottom")

# Average rating per month and per year. na.rm = TRUE keeps a single missing
# rating from blanking out a whole period, matching the per-topic averages
# computed later in the script.
ts_monthly <- df %>%
  group_by(YearMonth) %>%
  summarize(AverageRating = mean(Rating, na.rm = TRUE))

ts_yearly <- df %>%
  group_by(Year) %>%
  summarize(AverageRating = mean(Rating, na.rm = TRUE))

# Monthly (blue) and yearly (maroon) trend on a shared date axis; each yearly
# value is placed on 1 January of its year.
ggplot() +
  geom_line(
    data = ts_monthly,
    aes(x = YearMonth, y = AverageRating),
    color = "#0077C0",
    linewidth = 1
  ) +
  geom_line(
    data = ts_yearly,
    aes(x = as.Date(paste(Year, "01-01", sep = "-")), y = AverageRating),
    color = "maroon",
    linewidth = 1
  ) +
  labs(
    title = "Monthly and Yearly Rating Trend",
    x = NULL,
    y = "Average Rating"
  ) +
  theme_minimal() +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  scale_y_continuous(limits = c(1, 10), breaks = 1:10) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold")
  )
# ---- Average yearly rating per topic ----------------------------------------
# One summary per topic flag; a review flagged for several topics contributes
# to each of them.
ts_yearly_cs <- df %>%
  filter(Customer_Service == 1) %>%
  group_by(Year) %>%
  summarize(
    AverageRating = mean(Rating, na.rm = TRUE),
    Category = "Customer Service"
  )

ts_yearly_sc <- df %>%
  filter(Seat_and_Comfort == 1) %>%
  group_by(Year) %>%
  summarize(
    AverageRating = mean(Rating, na.rm = TRUE),
    Category = "Seat and Comfort"
  )

ts_yearly_td <- df %>%
  filter(Time_and_Delay == 1) %>%
  group_by(Year) %>%
  summarize(
    AverageRating = mean(Rating, na.rm = TRUE),
    Category = "Time and Delay"
  )

ts_yearly_fb <- df %>%
  filter(Food_and_Beverages == 1) %>%
  group_by(Year) %>%
  summarize(
    AverageRating = mean(Rating, na.rm = TRUE),
    Category = "Food and Beverages"
  )

ts_yearly_all <- bind_rows(
  ts_yearly_cs,
  ts_yearly_sc,
  ts_yearly_td,
  ts_yearly_fb
)

ts_yearly_all %>%
  ggplot(aes(
    x = as.Date(paste(Year, "01-01", sep = "-")),
    y = AverageRating,
    color = Category
  )) +
  geom_line(linewidth = 1) +
  labs(
    title = "Average Yearly Rating Trends by Topic",
    x = NULL,
    y = "Average Rating",
    color = NULL
  ) +
  theme_minimal() +
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
  scale_y_continuous(limits = c(1, 10), breaks = 1:10) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10, face = "bold")
  ) +
  scale_color_manual(values = c(
    "Customer Service" = "#0077C0",
    "Seat and Comfort" = "darkgreen",
    "Time and Delay" = "maroon",
    "Food and Beverages" = "goldenrod"
  )) +
  theme(legend.position = "bottom")

# ---- Negative reviews: token counts and word clouds --------------------------
# Terms excluded from every token count below.
common_review_terms <- c("flight", "qatar", "doha", "airway", "airline")

negative_reviews <- df %>%
  filter(Sentiment_Category == "Negative")

overall_negative_tokens_count <- negative_reviews %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
overall_negative_tokens_count %>%
  write.csv(
    "Datasets/Negative Tokens Count/overall_negative_tokens_count.csv"
  )
overall_negative_tokens_count %>%
  get_wordcloud(scale = c(2, 0))

customer_service_negative_tokens_count <- negative_reviews %>%
  filter(Customer_Service == 1) %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
customer_service_negative_tokens_count %>%
  write.csv(
    "Datasets/Negative Tokens Count/customer_service_negative_tokens_count.csv"
  )
customer_service_negative_tokens_count %>%
  get_wordcloud(scale = c(2, 0))

seat_and_comfort_tokens_count <- negative_reviews %>%
  filter(Seat_and_Comfort == 1) %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
seat_and_comfort_tokens_count %>%
  write.csv(
    "Datasets/Negative Tokens Count/seat_and_comfort_negative_tokens_count.csv"
  )
seat_and_comfort_tokens_count %>%
  get_wordcloud(scale = c(2, 0))

food_and_beverages_negative_tokens_count <- negative_reviews %>%
  filter(Food_and_Beverages == 1) %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
food_and_beverages_negative_tokens_count %>%
  write.csv(
    "Datasets/Negative Tokens Count/food_and_beverages_negative_tokens_count.csv"
  )
food_and_beverages_negative_tokens_count %>%
  get_wordcloud(scale = c(2, 0))

time_and_delay_negative_tokens_count <- negative_reviews %>%
  filter(Time_and_Delay == 1) %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
time_and_delay_negative_tokens_count %>%
  write.csv(
    "Datasets/Negative Tokens Count/time_and_delay_negative_tokens_count.csv"
  )
time_and_delay_negative_tokens_count %>%
  get_wordcloud(scale = c(2, 0))

# Number of negative reviews flagged for each topic.
negative_reviews %>%
  summarise(
    Customer_Service = sum(Customer_Service == 1, na.rm = TRUE),
    Time_and_Delay = sum(Time_and_Delay == 1, na.rm = TRUE),
    Seat_and_Comfort = sum(Seat_and_Comfort == 1, na.rm = TRUE),
    Food_and_Beverages = sum(Food_and_Beverages == 1, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Category",
    values_to = "TotalNegativeSentiment"
  ) %>%
  ggplot(aes(
    x = reorder(Category, -TotalNegativeSentiment),
    y = TotalNegativeSentiment,
    fill = TotalNegativeSentiment
  )) +
  geom_col(show.legend = FALSE) +
  geom_text(
    aes(label = TotalNegativeSentiment),
    vjust = 1.5,
    color = "white",
    size = 8,
    fontface = "bold"
  ) +
  scale_fill_gradient(low = "black", high = "Maroon") +
  labs(title = "Total Negative Sentiments for Each Topic", x = NULL, y = NULL) +
  theme_minimal()

# ---- Positive reviews, 2020-01-01 up to (not including) 2023-01-01 ------------
positive_reviews_2020_to_2022 <- df %>%
  filter(
    Sentiment_Category == "Positive",
    Date < as.Date("2023-01-01"),
    Date >= as.Date("2020-01-01")
  )

overall_positive_tokens_count_2020_to_2022 <- positive_reviews_2020_to_2022 %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
overall_positive_tokens_count_2020_to_2022 %>%
  write.csv(
    "Datasets/2020 to 2022 Positive Tokens Count/overall_positive_tokens_count_2020_to_2022.csv"
  )
overall_positive_tokens_count_2020_to_2022 %>%
  get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

customer_service_positive_tokens_count_2020_to_2022 <-
  positive_reviews_2020_to_2022 %>%
  filter(Customer_Service == 1) %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
customer_service_positive_tokens_count_2020_to_2022 %>%
  write.csv(
    "Datasets/2020 to 2022 Positive Tokens Count/customer_service_positive_tokens_count_2020_to_2022.csv"
  )
customer_service_positive_tokens_count_2020_to_2022 %>%
  get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

seat_and_comfort_positive_tokens_count_2020_to_2022 <-
  positive_reviews_2020_to_2022 %>%
  filter(Seat_and_Comfort == 1) %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
seat_and_comfort_positive_tokens_count_2020_to_2022 %>%
  write.csv(
    "Datasets/2020 to 2022 Positive Tokens Count/seat_and_comfort_positive_tokens_count_2020_to_2022.csv"
  )
seat_and_comfort_positive_tokens_count_2020_to_2022 %>%
  get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

food_and_beverages_positive_tokens_count_2020_to_2022 <-
  positive_reviews_2020_to_2022 %>%
  filter(Food_and_Beverages == 1) %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = common_review_terms)
food_and_beverages_positive_tokens_count_2020_to_2022 %>%
  write.csv(
    "Datasets/2020 to 2022 Positive Tokens Count/food_and_beverages_positive_tokens_count_2020_to_2022.csv"
  )
food_and_beverages_positive_tokens_count_2020_to_2022 %>%
  get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

# This topic additionally excludes "seat", as in the original analysis.
time_and_delay_positive_tokens_count_2020_to_2022 <-
  positive_reviews_2020_to_2022 %>%
  filter(Time_and_Delay == 1) %>%
  select(Review.Body) %>%
  clean_text(as.corpus = FALSE) %>%
  get_tokens_count(rm.specific_terms = c(common_review_terms, "seat"))
time_and_delay_positive_tokens_count_2020_to_2022 %>%
  write.csv(
    "Datasets/2020 to 2022 Positive Tokens Count/time_and_delay_positive_tokens_count_2020_to_2022.csv"
  )
time_and_delay_positive_tokens_count_2020_to_2022 %>%
  get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

# Number of positive reviews (2020-2022) flagged for each topic.
positive_reviews_2020_to_2022 %>%
  summarise(
    Customer_Service = sum(Customer_Service == 1, na.rm = TRUE),
    Time_and_Delay = sum(Time_and_Delay == 1, na.rm = TRUE),
    Seat_and_Comfort = sum(Seat_and_Comfort == 1, na.rm = TRUE),
    Food_and_Beverages = sum(Food_and_Beverages == 1, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Category",
    values_to = "TotalPositiveSentiment"
  ) %>%
  ggplot(aes(
    x = reorder(Category, -TotalPositiveSentiment),
    y = TotalPositiveSentiment,
    fill = TotalPositiveSentiment
  )) +
  geom_col(show.legend = FALSE) +
  geom_text(
    aes(label = TotalPositiveSentiment),
    vjust = 1.5,
    color = "white",
    size = 8,
    fontface = "bold"
  ) +
  scale_fill_gradient(low = "black", high = "#0077C0") +
  labs(title = "Total Positive Sentiments for Each Topic", x = NULL, y = NULL) +
  theme_minimal()