Bro Intelligence
November 13th, 2024

Falcon Project 14

In this Data Analysis and Visualization notebook, we focus on creating insightful visualizations to explore and present the Qatar Airlines reviews data. The notebook includes:
  1. Distributions of Ratings: Overall rating distribution and for each topic.
  2. Average Ratings Trends Overall average rating trend and for each topic.
  3. Negative Analysis: Overall negative words distribution and for each topic, and also the negative topic distribution.
  4. 2020 to 2022 Positive Analysis: Overall positive words distribution and for each topic in range year 2020 to 2022, and also the positive topic distribution.

Read the data:

library(tidyverse)

df <- read.csv("Datasets/enhanced.csv") %>%
  mutate(
    across(where(is.character), ~ na_if(., "")),
    Date = as.Date(Date),
    Year = year(Date),
    YearMonth = floor_date(Date, "month")
  ) %>% 
  select(-X)

df %>% head(16)

Build necessary function for text analysis:

Text cleaner function

library(NLP)
library(tm)

# This is my (Ardian's) personal text cleaner function
clean_text <- function(text, as.corpus = T, lower = T, rm.number = T, rm.stopwords_english = T, rm.stopwords_bahasa = F, rm.punctuation = T, stem = T, rm.whitespace = T){
  text_corpus <- text %>% VectorSource() %>% VCorpus()
  
  # Lowercasing
  if (lower){
    text_corpus <- tm_map(x = text_corpus,
                          FUN = content_transformer(tolower))
  }
  
  # Removing numbers
  if (rm.number){
    text_corpus <- tm_map(x = text_corpus,
                          FUN = removeNumbers)
  }
  
  # Removing punctuation
  if (rm.punctuation){
    text_corpus <- tm_map(x = text_corpus,
                          FUN = removePunctuation)
    text_corpus <- tm_map(text_corpus, content_transformer(function(x) gsub("[[:punct:]]+", "", x)))
    text_corpus <- tm_map(text_corpus, content_transformer(function(x) gsub("/", " ", x)))
  }
  
  # Removing english stop words
  if (rm.stopwords_english){
    list_stop_words_english <- readLines("stop-words_english.txt", warn = FALSE, encoding = "UTF-8")
    
    text_corpus <- tm_map(x = text_corpus,
                          FUN = removeWords,
                          list_stop_words_english)
  }
  
  # Removing bahasa stop words
  if (rm.stopwords_bahasa){
    list_stop_words_bahasa <- readLines("stop-words_bahasa.txt", warn = FALSE, encoding = "UTF-8")
    
    text_corpus <- tm_map(x = text_corpus,
                          FUN = removeWords,
                          list_stop_words_bahasa)
  }
  
  # Reducing words to their base form
  if (stem){
    text_corpus <- tm_map(x = text_corpus,
                          FUN = stemDocument)
  }
  
  # Removing white/blank spaces
  if (rm.whitespace){
    text_corpus <- tm_map(x = text_corpus,
                          FUN = stripWhitespace)
  }
  
  # Returning the text as or not as corpus
  if (as.corpus){
    return(text_corpus)
  }
  else{
    return(sapply(text_corpus, as.character))
  }
}

Token counter function

library(tokenizers)

# This is also my (Ardian's) token counter generator function
get_tokens_count <- function(texts, rm.specific_terms = "") {
  
  tokens_count <- texts %>%
    tokenize_words() %>% 
    unlist() %>% 
    data.frame(token = .) %>%
    count(token, sort = TRUE) %>% 
    na.omit() %>% 
    filter(!token %in% (rm.specific_terms %>% stemDocument()))
  
  return(tokens_count)
}

Wordcloud generator function

library(wordcloud)
## Loading required package: RColorBrewer
# This is also my (Ardian's) word cloud generator function
get_wordcloud <- function(tokens, scale = c(2, 0), normalize_higher_ngrams = F, palette = c("pink", "maroon")) {
  
  # Calculate the tf-idf weight of each token
  if (normalize_higher_ngrams){
    tokens <- tokens %>% rowwise() %>% mutate(n = n * length(unlist(str_split(token, " "))) + 1)
  }
  
  # Generating a word cloud with specified settings, scaling word size by frequency
  tokens %>%
    with(
      wordcloud(
        words = token,
        random.order = FALSE,
        color = colorRampPalette(palette)(length(unique(tokens$token))),
        min.freq = 1,
        scale = scale,
        rot.per = 0,
        freq = n
      )
    )
}

Ratings Distributions

Histogram and Density of Rating:

df %>% 
  ggplot(aes(x = Rating)) +
  geom_histogram(bins = 10, fill = "#0077C0", color = "#0077C0", alpha = 0.6) +
  geom_density(aes(y = ..count..), fill = "maroon", col = "maroon", alpha = 0.5) +
  labs(
    title = "Histogram and Density of Rating",
    x = "Rating",
    y = "Frequency"
  ) +
  theme_minimal()

Density of Ratings by Topic:

df %>%
  filter(Customer_Service == 1 | Seat_and_Comfort == 1 | Time_and_Delay == 1 | Food_and_Beverages == 1) %>% 
  ggplot(aes(
    x = Rating,
    fill = factor(case_when(
      Customer_Service == 1 ~ "Customer Service",
      Seat_and_Comfort == 1 ~ "Seat and Comfort",
      Time_and_Delay == 1 ~ "Time and Delay",
      Food_and_Beverages == 1 ~ "Food and Beverages"
    ))
  )) + 
    geom_density(alpha = 0.6, col = NA) + 
    labs(
      title = "Density of Ratings by Topic",
      x = "Rating",
      y = "Density",
      fill = NULL
    ) + 
    theme_minimal() + 
    scale_fill_manual(values = c(
      "Customer Service" = "#0077C0",
      "Seat and Comfort" = "darkgreen",
      "Time and Delay" = "maroon",
      "Food and Beverages" = "goldenrod"
    )) +
    theme(legend.position = "bottom")

Negative Analysis

Overall Negative Words Distribution:

overall_negative_tokens_count <- df %>%
  filter(Sentiment_Category == "Negative") %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline")) 
  
overall_negative_tokens_count %>% write.csv("Datasets/Negative Tokens Count/overall_negative_tokens_count.csv")

overall_negative_tokens_count %>% 
  get_wordcloud(scale = c(2, 0))

Costumer Service Negative Words Distribution:

customer_service_negative_tokens_count <- df %>%
  filter(Sentiment_Category == "Negative", Customer_Service == 1) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline"))

customer_service_negative_tokens_count %>% write.csv("Datasets/Negative Tokens Count/customer_service_negative_tokens_count.csv")

customer_service_negative_tokens_count %>% 
  get_wordcloud(scale = c(2, 0))

Seat and Comfort Negative Words Distribution:

seat_and_comfort_tokens_count <- df %>%
  filter(Sentiment_Category == "Negative", Seat_and_Comfort == 1) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline"))

seat_and_comfort_tokens_count %>% write.csv("Datasets/Negative Tokens Count/seat_and_comfort_negative_tokens_count.csv")

seat_and_comfort_tokens_count %>% 
  get_wordcloud(scale = c(2, 0))

Food and Beverages Negative Words Distribution:

food_and_beverages_negative_tokens_count <- df %>%
  filter(Sentiment_Category == "Negative", Food_and_Beverages == 1) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline"))

food_and_beverages_negative_tokens_count %>% write.csv("Datasets/Negative Tokens Count/food_and_beverages_negative_tokens_count.csv")

food_and_beverages_negative_tokens_count %>% 
  get_wordcloud(scale = c(2, 0))

Time and Delay Negative Words Distribution:

time_and_delay_negative_tokens_count <- df %>%
  filter(Sentiment_Category == "Negative", Time_and_Delay == 1) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline"))

time_and_delay_negative_tokens_count %>% write.csv("Datasets/Negative Tokens Count/time_and_delay_negative_tokens_count.csv")

time_and_delay_negative_tokens_count %>% 
  get_wordcloud(scale = c(2, 0))

Negative Topic Distribution:

df %>%
  filter(Sentiment_Category == "Negative") %>%  
  summarise(
    Customer_Service = sum(Customer_Service == 1, na.rm = TRUE),
    Time_and_Delay = sum(Time_and_Delay == 1, na.rm = TRUE),
    Seat_and_Comfort = sum(Seat_and_Comfort == 1, na.rm = TRUE),
    Food_and_Beverages = sum(Food_and_Beverages == 1, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = everything(), names_to = "Category", values_to = "TotalNegativeSentiment") %>%
  ggplot(aes(x = reorder(Category, -TotalNegativeSentiment), y = TotalNegativeSentiment, fill = TotalNegativeSentiment)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = TotalNegativeSentiment), vjust = 1.5, color = "white", size = 8, fontface = "bold") +
  scale_fill_gradient(low = "black", high = "Maroon") +
  labs(title = "Total Negative Sentiments or Each Topic", x = NULL, y = NULL) +
  theme_minimal()

2020 to 2022 Positive Analysis

Overall Positive Words in 2020 to 2022 Distribution:

overall_positive_tokens_count_2020_to_2022 <- df %>%
  filter(Sentiment_Category == "Positive", 
         Date < as.Date("2023-01-01"), 
         Date >= as.Date("2020-01-01")) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline"))

overall_positive_tokens_count_2020_to_2022 %>% write.csv("Datasets/2020 to 2022 Positive Tokens Count/overall_positive_tokens_count_2020_to_2022.csv")

overall_positive_tokens_count_2020_to_2022 %>% get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

Customer Service Positive Words in 2020 to 2022 Distribution:

customer_service_positive_tokens_count_2020_to_2022 <- df %>%
  filter(Sentiment_Category == "Positive", 
         Date < as.Date("2023-01-01"), 
         Date >= as.Date("2020-01-01"),
         Customer_Service == 1) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline"))

customer_service_positive_tokens_count_2020_to_2022 %>% write.csv("Datasets/2020 to 2022 Positive Tokens Count/customer_service_positive_tokens_count_2020_to_2022.csv")

customer_service_positive_tokens_count_2020_to_2022 %>% get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

Seat and Comfort Positive Words in 2020 to 2022 Distribution:

seat_and_comfort_positive_tokens_count_2020_to_2022 <- df %>%
  filter(Sentiment_Category == "Positive", 
         Date < as.Date("2023-01-01"), 
         Date >= as.Date("2020-01-01"),
         Seat_and_Comfort == 1) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline"))

seat_and_comfort_positive_tokens_count_2020_to_2022 %>% write.csv("Datasets/2020 to 2022 Positive Tokens Count/seat_and_comfort_positive_tokens_count_2020_to_2022.csv")

seat_and_comfort_positive_tokens_count_2020_to_2022 %>% get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

Food and Beverages Positive Words in 2020 to 2022 Distribution:

food_and_beverages_positive_tokens_count_2020_to_2022 <- df %>%
  filter(Sentiment_Category == "Positive", 
         Date < as.Date("2023-01-01"), 
         Date >= as.Date("2020-01-01"),
         Food_and_Beverages == 1) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline"))

food_and_beverages_positive_tokens_count_2020_to_2022 %>% write.csv("Datasets/2020 to 2022 Positive Tokens Count/food_and_beverages_positive_tokens_count_2020_to_2022.csv")

food_and_beverages_positive_tokens_count_2020_to_2022 %>% get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

Time and Delay Positive Words in 2020 to 2022 Distribution:

time_and_delay_positive_tokens_count_2020_to_2022 <- df %>%
  filter(Sentiment_Category == "Positive", 
         Date < as.Date("2023-01-01"), 
         Date >= as.Date("2020-01-01"),
         Time_and_Delay == 1) %>%
  select(Review.Body) %>% 
  clean_text(F) %>% 
  get_tokens_count(rm.specific_terms = c("flight", "qatar", "doha", "airway", "airline", "seat"))

time_and_delay_positive_tokens_count_2020_to_2022 %>% write.csv("Datasets/2020 to 2022 Positive Tokens Count/time_and_delay_positive_tokens_count_2020_to_2022.csv")

time_and_delay_positive_tokens_count_2020_to_2022 %>% get_wordcloud(scale = c(2, 0), palette = c("lightblue", "#0077C0"))

Positive Topic Distribution:

df %>%
  filter(Sentiment_Category == "Positive", 
         Date < as.Date("2023-01-01"), 
         Date >= as.Date("2020-01-01")) %>%
  summarise(
    Customer_Service = sum(Customer_Service == 1, na.rm = TRUE),
    Time_and_Delay = sum(Time_and_Delay == 1, na.rm = TRUE),
    Seat_and_Comfort = sum(Seat_and_Comfort == 1, na.rm = TRUE),
    Food_and_Beverages = sum(Food_and_Beverages == 1, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = everything(), names_to = "Category", values_to = "TotalNegativeSentiment") %>%
  ggplot(aes(x = reorder(Category, -TotalNegativeSentiment), y = TotalNegativeSentiment, fill = TotalNegativeSentiment)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(aes(label = TotalNegativeSentiment), vjust = 1.5, color = "white", size = 8, fontface = "bold") +
  scale_fill_gradient(low = "black", high = "#0077C0") +
  labs(title = "Total Positive Sentiments for Each Topic", x = NULL, y = NULL) +
  theme_minimal()