In this Data Preprocessing notebook, we conduct the initial steps for analyzing Qatar Airlines reviews data frame, focusing on cleaning and preparing the data for further enhancement, analysis and visualization, and forecasting.
This notebook serves as a preliminary stage before Data Enhancement.
## Rows: 2,369
## Columns: 16
## $ Unnamed..0 <dbl> 0, NA, 2, 3, 4, 5, NA, 7, 8, NA, NA, NA, 12, 13, NA,ā¦
## $ Date.Published <chr> "2024-03-02", "2024-02-29", "2024-02-29", "2024-02-2ā¦
## $ Rating <dbl> 1, 1, 1, 10, 7, 10, 10, 7, 10, 4, 6, 10, 7, 1, 1, 2,ā¦
## $ Max.Rating <dbl> 10, 10, 10, 10, NA, 10, 10, 10, 10, 10, 10, 10, 10, ā¦
## $ Title <chr> "āmarred by inconveniencesā", "", "\"no attempt to aā¦
## $ Author <chr> "Mary Le", "Brian English", "Wayne Burgess", "Alwaleā¦
## $ Country <chr> "United Kingdom", "", "Australia", "Qatar", "United ā¦
## $ Date <chr> "2024-03-02", "2024-02-29", "2024-02-29", "2024-02-2ā¦
## $ Review.Body <chr> " The delay of my flight from Haneda to Doha caused ā¦
## $ Type.Of.Traveller <chr> "Solo Leisure", "Couple Leisure", "Business", "Couplā¦
## $ Seat.Type <chr> "Economy Class", "Business Class", "Economy Class", ā¦
## $ Route <chr> "Tokyo to London Heathrow via Doha", "Doha to Montreā¦
## $ Date.Flown <chr> "March 2024", "February 2024", "", "February 2024", ā¦
## $ Recommended <chr> "no", "no", "no", "", "no", "", "", "", "", "no", "nā¦
## $ Aircraft <chr> "", "", "A380", "A380-800", "", "A380", "", "Boeing ā¦
## $ Verified <dbl> 0, 1, 1, NA, NA, 1, NA, NA, 1, NA, 1, NA, 1, 1, 0, 1ā¦
df <- df %>%
select(
-Unnamed..0,
-Date.Published,
-Max.Rating,
-Author,
-Date.Flown,
-Recommended,
-Verified
) %>%
slice(-1)
df %>% glimpse()## Rows: 2,368
## Columns: 9
## $ Rating <dbl> 1, 1, 10, 7, 10, 10, 7, 10, 4, 6, 10, 7, 1, 1, 2, NAā¦
## $ Title <chr> NA, "\"no attempt to address our complaint\"", "āI hā¦
## $ Country <chr> NA, "Australia", "Qatar", "United Kingdom", NA, "Uniā¦
## $ Date <chr> "2024-02-29", "2024-02-29", "2024-02-25", "2024-02-2ā¦
## $ Review.Body <chr> " They convinced me that I needed to pay $1500 to aā¦
## $ Type.Of.Traveller <chr> "Couple Leisure", "Business", "Couple Leisure", "Couā¦
## $ Seat.Type <chr> "Business Class", "Economy Class", "First Class", "Bā¦
## $ Route <chr> "Doha to Montreal", "Doha to Perth", "Doha to Londonā¦
## $ Aircraft <chr> NA, "A380", "A380-800", NA, "A380", NA, "Boeing 787-ā¦
## Rating Title Country Date
## 475 474 474 474
## Review.Body Type.Of.Traveller Seat.Type Route
## 474 825 474 821
## Aircraft
## 1351
There still many missing values but we dont need to do anything with them except for Rating and Date which weāll deal with them in Data Enhancement stage. What we need to do for now is just removing rows that are missing both the Rating and Review.Body
Remove rows where Rating and Review.Body are both missing
## Rows: 2,269
## Columns: 9
## $ Rating <dbl> 1, 1, 10, 7, 10, 10, 7, 10, 4, 6, 10, 7, 1, 1, 2, NAā¦
## $ Title <chr> NA, "\"no attempt to address our complaint\"", "āI hā¦
## $ Country <chr> NA, "Australia", "Qatar", "United Kingdom", NA, "Uniā¦
## $ Date <chr> "2024-02-29", "2024-02-29", "2024-02-25", "2024-02-2ā¦
## $ Review.Body <chr> " They convinced me that I needed to pay $1500 to aā¦
## $ Type.Of.Traveller <chr> "Couple Leisure", "Business", "Couple Leisure", "Couā¦
## $ Seat.Type <chr> "Business Class", "Economy Class", "First Class", "Bā¦
## $ Route <chr> "Doha to Montreal", "Doha to Perth", "Doha to Londonā¦
## $ Aircraft <chr> NA, "A380", "A380-800", NA, "A380", NA, "Boeing 787-ā¦
Classify between Airbus and Boeing aircraft
library(stringr)
df <- df %>%
mutate(
Aircraft_Cleaned = Aircraft %>%
sub("/.*", "", .) %>%
sub("-.*", "", .) %>%
trimws() %>%
tolower(),
Aircraft_Classified = case_when(
str_detect(Aircraft_Cleaned, "7") & !str_detect(Aircraft_Cleaned, "a") ~ "Boeing", # Boeing jika ada angka 7 dan tidak ada A
str_detect(Aircraft_Cleaned, "a") ~ "Airbus", # Airbus jika ada huruf A
TRUE ~ NA # NA jika tidak memenuhi keduanya
) %>% as.factor()
)
df %>%
select(Aircraft, Aircraft_Cleaned, Aircraft_Classified) %>%
filter(complete.cases(.)) %>%
head()## Rating Title Country Date
## 376 449 458 456
## Review.Body Type.Of.Traveller Seat.Type Route
## 375 789 456 786
## Aircraft Aircraft_Cleaned Aircraft_Classified
## 1296 1296 1299
We categorize reviews into different topics based on keywords. The topics are designed to identify key areas of customer feedback, such as customer service, time and delay, seat comfort, and food and beverage. Reviews that contain specific keywords will be counted under the respective topics.
library(NLP)
library(tm)
# My text cleaner function
clean_text <- function(text, as.corpus = T, lower = T, rm.number = T, rm.stopwords_english = T, rm.stopwords_bahasa = F, rm.punctuation = T, stem = T, rm.whitespace = T){
text_corpus <- text %>% VectorSource() %>% VCorpus()
# Lowercasing
if (lower){
text_corpus <- tm_map(x = text_corpus,
FUN = content_transformer(tolower))
}
# Removing numbers
if (rm.number){
text_corpus <- tm_map(x = text_corpus,
FUN = removeNumbers)
# Removing punctuation
if (rm.punctuation){
text_corpus <- tm_map(x = text_corpus,
FUN = removePunctuation)
text_corpus <- tm_map(text_corpus, content_transformer(function(x) gsub("[[:punct:]]+", "", x)))
text_corpus <- tm_map(text_corpus, content_transformer(function(x) gsub("/", " ", x)))
}
}
# Removing english stop words
if (rm.stopwords_english){
list_stop_words_english <- readLines("stop-words_english.txt", warn = FALSE, encoding = "UTF-8")
text_corpus <- tm_map(x = text_corpus,
FUN = removeWords,
list_stop_words_english)
}
# Removing bahasa stop words
if (rm.stopwords_bahasa){
list_stop_words_bahasa <- readLines("stop-words_bahasa.txt", warn = FALSE, encoding = "UTF-8")
text_corpus <- tm_map(x = text_corpus,
FUN = removeWords,
list_stop_words_bahasa)
}
# Reducing words to their base form
if (stem){
text_corpus <- tm_map(x = text_corpus,
FUN = stemDocument)
}
# Removing white/blank spaces
if (rm.whitespace){
text_corpus <- tm_map(x = text_corpus,
FUN = stripWhitespace)
}
# Returning the text as or not as corpus
if (as.corpus){
return(text_corpus)
}
else(
return(sapply(text_corpus, as.character))
)
}
# Function to check for topic keywords and classify reviews
classify_review <- function(review, category_keywords) {
if (is.na(review)) {
return(0) # If the review is NA
}
cleaned_review <- clean_text(review, F, rm.stopwords_bahasa = F) # Clean the review text
# Check if any of the keywords appear in the cleaned review
if (any(sapply(category_keywords, function(keyword) {
str_detect(cleaned_review, regex(keyword, ignore_case = TRUE))
}))) {
return(1) # Return 1 if any keyword is detected
} else {
return(0) # Return 0 if no keyword is detected
}
}
# Keywords for each topic
keywords_customer_service <- c("service", "support", "customer", "response", "staff", "help", "crew", "email")
keywords_time_delay <- c("delay", "wait", "time", "hour", "late", "boarding", "missed")
keywords_seat_comfort <- c("seat", "comfort", "space", "legroom", "comfortability", "recline", "pillow", "blanket", "ac", "temperature", "air", "conditioner", "hot")
keywords_food_beverage <- c("food", "meal", "beverage", "drink", "snack", "refreshment", "menu", "lunch", "dinner", "breakfast")
# Classify topics
df <- df %>%
mutate(
"Customer_Service" = ifelse(is.na(Review.Body), 0, sapply(Review.Body, classify_review, category_keywords = keywords_customer_service)),
"Time_and_Delay" = ifelse(is.na(Review.Body), 0, sapply(Review.Body, classify_review, category_keywords = keywords_time_delay)),
"Seat_and_Comfort" = ifelse(is.na(Review.Body), 0, sapply(Review.Body, classify_review, category_keywords = keywords_seat_comfort)),
"Food_and_Beverages" = ifelse(is.na(Review.Body), 0, sapply(Review.Body, classify_review, category_keywords = keywords_food_beverage))
)
df %>% glimpse()## Rows: 2,269
## Columns: 15
## $ Rating <dbl> 1, 1, 10, 7, 10, 10, 7, 10, 4, 6, 10, 7, 1, 1, 2, ā¦
## $ Title <chr> NA, "\"no attempt to address our complaint\"", "āIā¦
## $ Country <chr> NA, "Australia", "Qatar", "United Kingdom", NA, "Uā¦
## $ Date <chr> "2024-02-29", "2024-02-29", "2024-02-25", "2024-02ā¦
## $ Review.Body <chr> " They convinced me that I needed to pay $1500 toā¦
## $ Type.Of.Traveller <chr> "Couple Leisure", "Business", "Couple Leisure", "Cā¦
## $ Seat.Type <chr> "Business Class", "Economy Class", "First Class", ā¦
## $ Route <chr> "Doha to Montreal", "Doha to Perth", "Doha to Londā¦
## $ Aircraft <chr> NA, "A380", "A380-800", NA, "A380", NA, "Boeing 78ā¦
## $ Aircraft_Cleaned <chr> NA, "a380", "a380", NA, "a380", NA, "boeing 787", ā¦
## $ Aircraft_Classified <fct> NA, Airbus, Airbus, NA, Airbus, NA, Boeing, Boeingā¦
## $ Customer_Service <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,ā¦
## $ Time_and_Delay <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,ā¦
## $ Seat_and_Comfort <dbl> 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,ā¦
## $ Food_and_Beverages <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,ā¦
## Rows: 2,269
## Columns: 15
## $ Rating <dbl> 1, 1, 10, 7, 10, 10, 7, 10, 4, 6, 10, 7, 1, 1, 2, ā¦
## $ Title <chr> NA, "\"no attempt to address our complaint\"", "āIā¦
## $ Country <chr> NA, "Australia", "Qatar", "United Kingdom", NA, "Uā¦
## $ Date <chr> "2024-02-29", "2024-02-29", "2024-02-25", "2024-02ā¦
## $ Review.Body <chr> " They convinced me that I needed to pay $1500 toā¦
## $ Type.Of.Traveller <chr> "Couple Leisure", "Business", "Couple Leisure", "Cā¦
## $ Seat.Type <chr> "Business Class", "Economy Class", "First Class", ā¦
## $ Route <chr> "Doha to Montreal", "Doha to Perth", "Doha to Londā¦
## $ Aircraft <chr> NA, "A380", "A380-800", NA, "A380", NA, "Boeing 78ā¦
## $ Aircraft_Cleaned <chr> NA, "a380", "a380", NA, "a380", NA, "boeing 787", ā¦
## $ Aircraft_Classified <fct> NA, Airbus, Airbus, NA, Airbus, NA, Boeing, Boeingā¦
## $ Customer_Service <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,ā¦
## $ Time_and_Delay <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,ā¦
## $ Seat_and_Comfort <dbl> 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,ā¦
## $ Food_and_Beverages <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,ā¦
Bro is actually intelligentš