## [1] 1591 16
# remove branch_id, review_id, reviewer_name and content
review_clean <- review %>%
select(-branch_id) %>%
select(-review_id) %>%
select(-reviewer_name) %>%
select(-content)
head(review_clean)## [1] "5 bintang" "1 bintang" "3 bintang" "4 bintang" "2 bintang"
# remove unused strings in review_counts and rating
review_clean$review_counts <- gsub("ulasan", "", review_clean$review_counts)
review_clean$rating <- gsub("bintang", "", review_clean$rating)## [1] "" "1" "2" "4" "3" "Lainnya" "9"
Most of the likes is missing value, so later we are going to replace it with 0.
# delete the row that contains lainnya
review_clean <- review_clean %>%
filter(likes != "Lainnya")
unique(review_clean$likes)## [1] "" "1" "2" "4" "3" "9"
## [1] "female" "male" ""
# replace the missing value in reviewer_gender with "missing"
review_clean$reviewer_gender = str_replace(review_clean$reviewer_gender, "^$", "missing")
unique(review_clean$reviewer_gender)## [1] "female" "male" "missing"
## [1] "Positive" "Negative" "Neutral" ""
# replace the missing value in sentiment with "missing"
review_clean$sentiment = str_replace(review_clean$sentiment, "^$", "missing")
unique(review_clean$sentiment)## [1] "Positive" "Negative" "Neutral" "missing"
## [1] "satisfied" "disgusted" "disliked" "neutral" "unsatisfied"
## [6] "enjoyed" "happy" "Satisfied" "Happy" "Enjoyed"
## [11] "Neutral" "" "delighted" "Unsatisfied" "sad"
## [16] "unsatisfied." "unhappy" "Question" "Happy." "fear"
## [21] "Disgusted" "disgusted." "neutral."
# make all value to lower case, remove ., replace missing value with "missing"
review_clean$review_tone = str_to_lower(str_replace(review_clean$review_tone, "\\.", ""))
review_clean$review_tone = str_replace(review_clean$review_tone, "^$", "missing")
unique(review_clean$review_tone)## [1] "satisfied" "disgusted" "disliked" "neutral" "unsatisfied"
## [6] "enjoyed" "happy" "missing" "delighted" "sad"
## [11] "unhappy" "question" "fear"
Make review_tone only has 3 categories: satisfied,
unsatisfied, neutral.
review_clean <- review_clean %>%
mutate(review_tone = ifelse(review_tone == "satisfied" |
review_tone == "enjoyed" |
review_tone == "happy" |
review_tone == "delighted",
"satisfied",
ifelse(review_tone == "neutral" |
review_tone == "question" |
review_tone == "fear",
"neutral",
ifelse(review_tone == "missing", "missing", "unsatisfied"))))
unique(review_clean$review_tone)## [1] "satisfied" "unsatisfied" "neutral" "missing"
## Rows: 1,590
## Columns: 12
## $ branch <chr> "Bandung", "Bandung", "Bandung", "Bandung", "Bandung",…
## $ reviewer_gender <chr> "female", "female", "male", "female", "female", "femal…
## $ review_date <chr> "8/7/2022", "9/3/2022", "10/20/2019", "2/20/2023", "2/…
## $ review_counts <chr> "83 ", "133 ", "24 ", "13 ", "294 ", "14 ", "39 ", "11…
## $ rating <chr> "5 ", "5 ", "1 ", "3 ", "5 ", "5 ", "5 ", "1 ", "3 ", …
## $ likes <chr> "", "", "", "", "", "", "1", "", "", "", "", "", "", "…
## $ sentiment <chr> "Positive", "Positive", "Negative", "Neutral", "Neutra…
## $ review_tone <chr> "satisfied", "satisfied", "unsatisfied", "unsatisfied"…
## $ tags <chr> "[minuman, rasanya]", "[ parkiran, tempat, pelayanan ]…
## $ about_place <int> 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, …
## $ about_food <int> 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ about_service <int> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
Columns to change: - branch, reviewer_gender, rating, sentiment, review_tone, about_place, about_food, about_service -> factor - review_counts, likes -> int - reviewer_date -> date
review_clean <- review_clean %>%
mutate_at(vars(branch, reviewer_gender, rating, sentiment, review_tone, about_place, about_food, about_service), as.factor) %>%
mutate_at(vars(review_counts, likes), as.integer) %>%
mutate(review_date = mdy(review_date))
glimpse(review_clean)## Rows: 1,590
## Columns: 12
## $ branch <fct> Bandung, Bandung, Bandung, Bandung, Bandung, Bandung, …
## $ reviewer_gender <fct> female, female, male, female, female, female, female, …
## $ review_date <date> 2022-08-07, 2022-09-03, 2019-10-20, 2023-02-20, 2022-…
## $ review_counts <int> 83, 133, 24, 13, 294, 14, 39, 11, 32, 44, 8, 5, 35, 28…
## $ rating <fct> 5 , 5 , 1 , 3 , 5 , 5 , 5 , 1 , 3 , 5 , 5 , 5 , 5 , 4 …
## $ likes <int> NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA,…
## $ sentiment <fct> Positive, Positive, Negative, Neutral, Neutral, Positi…
## $ review_tone <fct> satisfied, satisfied, unsatisfied, unsatisfied, neutra…
## $ tags <chr> "[minuman, rasanya]", "[ parkiran, tempat, pelayanan ]…
## $ about_place <fct> 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, …
## $ about_food <fct> 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ about_service <fct> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## branch reviewer_gender review_date review_counts rating
## 0 0 0 81 0
## likes sentiment review_tone tags about_place
## 1504 0 0 0 676
## about_food about_service
## 676 676
# fill in missing value
review_clean$about_service <- ifelse(is.na(review_clean$about_service), "missing", review_clean$about_service)
review_clean$about_food <- ifelse(is.na(review_clean$about_food), "missing", review_clean$about_food)
review_clean$about_place <- ifelse(is.na(review_clean$about_place), "missing", review_clean$about_place)
review_clean$review_counts <- ifelse(is.na(review_clean$review_counts), median(review_clean$review_counts, na.rm = TRUE), review_clean$review_counts)
review_clean$likes <- ifelse(is.na(review_clean$likes), 0, review_clean$likes)## branch reviewer_gender review_date review_counts rating
## 0 0 0 0 0
## likes sentiment review_tone tags about_place
## 0 0 0 0 0
## about_food about_service
## 0 0
## [1] "[minuman, rasanya]" "[ parkiran, tempat, pelayanan ]"
## [3] "[minuman, pesanan, keluhan]" "[tempat, pesanan, minuman, rasa]"
## [5] "['tempat', 'rasa']" "[]"
# remove unnecessary strings from tags
review_clean$tags <- str_replace_all(review_clean$tags, "'", "")
review_clean$tags <- str_remove_all(review_clean$tags, " ")
review_clean$tags <- str_remove_all(review_clean$tags, "\\[|\\]")## [1] "minuman,rasanya" "parkiran,tempat,pelayanan"
## [3] "minuman,pesanan,keluhan" "tempat,pesanan,minuman,rasa"
## [5] "tempat,rasa" ""
# extract tags
tags <- review_clean$tags
tag_frequency <- c()
for (tag_string in tags) {
if (tag_string != "") {
individual_tags <- strsplit(tag_string, ",")[[1]]
for (tag in individual_tags) {
if (tag %in% names(tag_frequency)) {
tag_frequency[tag] <- tag_frequency[tag] + 1
} else {
tag_frequency[tag] <- 1
}
}
}
}
# Print tag frequencies
head(tag_frequency)## minuman rasanya parkiran tempat pelayanan pesanan
## 239 2 74 429 207 42
# make tag_frequency into dataframe and sort it descendingly
tag_df <- data.frame(Tag = names(tag_frequency), Frequency = unname(tag_frequency))
tag_df <- tag_df[order(tag_df$Frequency, decreasing = TRUE), ]
head(tag_df)# extract year and quarter from review_date
review_clean$review_date_quarter <- quarter(review_clean$review_date)
review_clean$review_date_year <- year(review_clean$review_date)
head(review_clean)# make quarter year
review_clean <- review_clean %>%
mutate(review_year_quarter = ifelse(review_date_year == "2019" & review_date_quarter == "3", "2019 Q3",
ifelse(review_date_year == "2019" & review_date_quarter == "4", "2019 Q4",
ifelse(review_date_year == "2020" & review_date_quarter == "1", "2020 Q1",
ifelse(review_date_year == "2020" & review_date_quarter == "2", "2020 Q2",
ifelse(review_date_year == "2020" & review_date_quarter == "3", "2020 Q3",
ifelse(review_date_year == "2020" & review_date_quarter == "4", "2020 Q4",
ifelse(review_date_year == "2021" & review_date_quarter == "1", "2021 Q1",
ifelse(review_date_year == "2021" & review_date_quarter == "2", "2021 Q2",
ifelse(review_date_year == "2021" & review_date_quarter == "3", "2021 Q3",
ifelse(review_date_year == "2021" & review_date_quarter == "4", "2021 Q4",
ifelse(review_date_year == "2022" & review_date_quarter == "1", "2022 Q1",
ifelse(review_date_year == "2022" & review_date_quarter == "2", "2022 Q2",
ifelse(review_date_year == "2022" & review_date_quarter == "3", "2022 Q3",
ifelse(review_date_year == "2022" & review_date_quarter == "4", "2022 Q4",
ifelse(review_date_year == "2023" & review_date_quarter == "1", "2023 Q1",
ifelse(review_date_year == "2023" & review_date_quarter == "2", "2023 Q2", "missing")))))))))))))))))we suppose if rating = 4 or rating = 5, it is called higher ratings.
# make a new column named higher_rating
review_clean$rating <- as.numeric(review_clean$rating)
review_clean <- review_clean %>%
mutate(higher_rating = ifelse(rating == 4 | rating == 5, 1, 0))
head(review_clean)# Jakarta has high rating, so we are going to analyze further
review_jakarta <- review_clean %>%
filter(branch == "Jakarta")
head(review_jakarta)# extract tags from review_jakarta
tags <- review_jakarta$tags
tag_frequency <- c()
for (tag_string in tags) {
if (tag_string != "") {
individual_tags <- strsplit(tag_string, ",")[[1]]
for (tag in individual_tags) {
if (tag %in% names(tag_frequency)) {
tag_frequency[tag] <- tag_frequency[tag] + 1
} else {
tag_frequency[tag] <- 1
}
}
}
}
tag_df <- data.frame(Tag = names(tag_frequency), Frequency = unname(tag_frequency))
tag_df <- tag_df[order(tag_df$Frequency, decreasing = TRUE), ]
head(tag_df)# visualize tags from review_jakarta
tag_df %>%
ggplot(mapping = aes(x = reorder(Tag, -Frequency),
y = Frequency,
fill = Tag)) +
geom_bar(stat = "identity") +
labs(title = "Tags based on Jakarta's Customer Reviews",
y = "Amount of Reviews",
x = "Tags",
fill = "Review Tone") +
theme_minimal() +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))# Bandung has high rating, so we are going to analyze further
review_bandung <- review_clean %>%
filter(branch == "Bandung")
tags <- review_bandung$tags
tag_frequency <- c()
# extract tags
for (tag_string in tags) {
if (tag_string != "") {
individual_tags <- strsplit(tag_string, ",")[[1]]
for (tag in individual_tags) {
if (tag %in% names(tag_frequency)) {
tag_frequency[tag] <- tag_frequency[tag] + 1
} else {
tag_frequency[tag] <- 1
}
}
}
}
tag_df <- data.frame(Tag = names(tag_frequency), Frequency = unname(tag_frequency))
tag_df <- tag_df[order(tag_df$Frequency, decreasing = TRUE), ]
head(tag_df)# plot tags from review_bandung
tag_df %>%
ggplot(mapping = aes(x = reorder(Tag, -Frequency),
y = Frequency,
fill = Tag)) +
geom_bar(stat = "identity") +
labs(title = "Tags based on Bandung's Customer Reviews",
y = "Amount of Reviews",
x = "Tags",
fill = "Review Tone") +
theme_minimal() +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))The rest of the question are answered in the report and dashboard.