The Exploratory Data Analysis (EDA) phase is for understanding the patterns, relationships, and anomalies in the data. This involves visualizing data through graphs, charts, and summary statistics to uncover underlying trends and characteristics. For predicting which restaurant has the highest rating, EDA involves examining factors like location , customer reviews, other relevant variables. By exploring these aspects, we can identify key features that influence restaurant ratings. This not only helps in building a more accurate predictive model but also provides valuable insights into what makes a restaurant appealing to customers.
df_tripadvisor <- read.csv('tripadvisor.csv')
# to see the proportion of missing (NA)
sum(is.na(df_tripadvisor$Country)/nrow(df_tripadvisor))
## [1] 0.5766721
#clean and covert the 'Dates COlumn'
df_tripadvisor$Dates <- gsub("Reviewed ", "", df_tripadvisor$Dates) #removing 'reviewed'
df_tripadvisor$Dates <- as.Date(df_tripadvisor$Dates, format = '%d %B %Y') #coverts format of data strings
#Only KL area
#converting rating to a factor and creating a summary table
df_tripadvisor$Rating <- as.factor(df_tripadvisor$Rating) #to factor
df_tripadvisor_kl_rating_count <- as.data.frame(table(df_tripadvisor$Rating)) #creating a frequency table
names(df_tripadvisor_kl_rating_count) <- c('Rating', 'Count') #rename the data frame to Rating and Count
df_tripadvisor_kl <- df_tripadvisor[df_tripadvisor$Location == "KL", ] #filtering, only KL area
summary(df_tripadvisor_kl)
## X Author Title Review
## Min. : 1 Length:71368 Length:71368 Length:71368
## 1st Qu.:17843 Class :character Class :character Class :character
## Median :35685 Mode :character Mode :character Mode :character
## Mean :35685
## 3rd Qu.:53526
## Max. :71368
##
## Rating Dates Restaurant Location
## 1: 2836 Min. :2008-01-02 Length:71368 Length:71368
## 2: 2944 1st Qu.:2015-10-28 Class :character Class :character
## 3: 7547 Median :2017-08-14 Mode :character Mode :character
## 4:20227 Mean :2017-06-18
## 5:37814 3rd Qu.:2019-06-14
## Max. :2022-02-19
## NA's :418
## Country
## Length:71368
## Class :character
## Mode :character
##
##
##
##
ggplot(df_tripadvisor_kl_rating_count, aes(x="", y=Count, fill=Rating)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
theme_void() +
labs(title = 'Ratings of restaurants in KL on a scale of 1 to 5')
#From this pie chart, we can see majority of the ratings are at least 4 and above
#Grouping the data by month.then sums the 'Count' values
df_tripadvisor_kl_review_dates <- as.data.frame(table(df_tripadvisor_kl$Dates))
names(df_tripadvisor_kl_review_dates) <- c('Date', "Count")
df_tripadvisor_kl_review_dates$Date <- floor_date(as.Date(df_tripadvisor_kl_review_dates$Date), unit='month')
df_tripadvisor_kl_review_dates <- df_tripadvisor_kl_review_dates %>% group_by(Date) %>% summarise(Count = sum(Count), .groups = 'drop')
#To view the number of KL restaurant reviews given overtime:
ggplot(df_tripadvisor_kl_review_dates, aes(x = Date, y = Count, group=1)) +
geom_line(size=1) +
labs(title = 'Number of KL restaurant reviews given on TripAdvisor over time')
#The graph shows a significant decrease in review given during MCO, the amount remain low until now.
df_tripadvisor_kl_review_countries <- as.data.frame(table(df_tripadvisor_kl$Country)) #creating a frequency table
names(df_tripadvisor_kl_review_countries) <- c('Country', 'Count')
df_tripadvisor_kl_review_countries <- df_tripadvisor_kl_review_countries[df_tripadvisor_kl_review_countries$Country != "Malaysia", ] #all except Malaysia
df_tripadvisor_kl_review_countries <- df_tripadvisor_kl_review_countries %>% arrange(desc(Count)) %>% head(25)
ggplot(df_tripadvisor_kl_review_countries, aes(x = reorder(Country, -Count), y = Count)) +
geom_bar(stat = "identity") +
scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
labs(title = 'Number of KL restaurant reviews given by tourists', x = 'Country')
#this bar chart shows that UK tourists gave the most reviews
df_tripadvisor <- read.csv('tripadvisor.csv')
df_googlemaps <- read.csv("googlemaps.csv")
#Combining Google maps with Tripadvisor datasets
df_googlemaps$Rating <- as.factor(df_googlemaps$Rating)
df_tripadvisor$Rating <- as.factor(df_tripadvisor$Rating)
#converts ratings to factor
df <- bind_rows(df_tripadvisor, df_googlemaps) #combining both data frames
df <- df[, c('Review', 'Rating', 'Restaurant', 'Location')] #selecting specific columns
df <- df[df$Location == "KL", ] #only KL
df_restaurant <- data.frame(matrix(ncol=3, nrow=0)) #creating a matrix
colnames(df_restaurant) <- c('Name', 'Number of reviews', 'Average rating') #naming the empty df
df_restaurant <- df %>% group_by(Restaurant) %>%
summarise(`Number of reviews` = n(), `Average rating` = round(mean(as.numeric(Rating), na.rm = TRUE), 2)) %>%
arrange(desc(`Number of reviews`), desc(`Average rating`))
#To group the restaurants based on their ratings
df_restaurant
## # A tibble: 862 × 3
## Restaurant `Number of reviews` `Average rating`
## <chr> <int> <dbl>
## 1 Dining In The Dark KL 1783 4.67
## 2 The Whisky Bar 1284 4.74
## 3 Canopy Rooftop Bar and Lounge 1268 4.88
## 4 Madam Kwan's KLCC 1268 3.84
## 5 Iketeru Restaurant 1257 4.75
## 6 BBQ NIGHTS 1012 4.45
## 7 Ishin Japanese Dining 1005 4.44
## 8 Canopy Lounge Rooftop Bar KL 948 4.55
## 9 Khan’s Indian Cuisine 939 4.42
## 10 El Cerdo 915 4.57
## # ℹ 852 more rows
#Applying Bayesian Average to ensures that both highly-rated restaurants
#and those with a large number of reviews are fairly represented.
#It avoids bias towards restaurants with few, but perfect, ratings.
global_avg_rating <- mean(df_restaurant$`Average rating`, na.rm = TRUE) #mean of ratings
C <- mean(df_restaurant$`Number of reviews`, na.rm = TRUE) #mean of reviews
df_restaurant <- df_restaurant %>%
mutate(Bayesian_Average = ((`Average rating` * `Number of reviews`) + (C * global_avg_rating)) / (`Number of reviews` + C)) %>%
arrange(desc(Bayesian_Average)) #now all ratings are fair
top_20_restaurants <- df_restaurant %>%
top_n(20, Bayesian_Average) #selecting the top 20 restaurant
top_20_restaurants
## # A tibble: 20 × 4
## Restaurant `Number of reviews` `Average rating` Bayesian_Average
## <chr> <int> <dbl> <dbl>
## 1 Canopy Rooftop Bar and… 1268 4.88 4.82
## 2 Positano Risto 848 4.87 4.78
## 3 Chambers Grill 703 4.86 4.76
## 4 Cielo Sky Dining & Lou… 490 4.85 4.72
## 5 Iketeru Restaurant 1257 4.75 4.70
## 6 Sausage KL Cafe & Deli 393 4.85 4.69
## 7 The Whisky Bar 1284 4.74 4.69
## 8 Healy Mac's 460 4.78 4.65
## 9 Dining In The Dark KL 1783 4.67 4.64
## 10 Opium KL 890 4.67 4.61
## 11 Zenzero Restaurant & W… 500 4.68 4.58
## 12 Nizza 172 4.85 4.57
## 13 Manja 484 4.67 4.57
## 14 Knowhere Bangsar 710 4.63 4.56
## 15 Quivo Pavilion 650 4.62 4.55
## 16 Favola Italian Restaur… 379 4.67 4.55
## 17 Vin's Restaurant and B… 348 4.66 4.53
## 18 Sushi Hibiki 137 4.84 4.53
## 19 The Steakhouse KL 660 4.59 4.53
## 20 El Cerdo 915 4.57 4.52
#Plotting the graph:
ggplot(top_20_restaurants, aes(x = reorder(Restaurant, Bayesian_Average), y = Bayesian_Average)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flips the axes for better readability
labs(title = "Top 20 Restaurants in KL based on Bayesian Average Rating",
x = "Restaurant",
y = "Bayesian Average Rating")
#"Based on the Bayesian average, which considers both average ratings and review volume,
#'Canopy Rooftop Bar and Lounge' stands out as the top-rated restaurant in Kuala Lumpur,
#'reflecting its high customer ratings and consistent performance across numerous reviews.
df_tripadvisor2 <- read.csv('tripadvisor.csv')
df_googlemaps2 <- read.csv("googlemaps.csv")
df_googlemaps2$Rating <- as.factor(df_googlemaps2$Rating)
df_tripadvisor2$Rating <- as.factor(df_tripadvisor2$Rating)
df2 <- bind_rows(df_tripadvisor2, df_googlemaps2)
df2 <- df2[, c('Review', 'Rating', 'Restaurant', 'Location')]
unique_locations <- df2 %>%
distinct(Location) %>%
arrange(Location)
unique_locations #to show how many locations
## Location
## 1 Ipoh
## 2 JB
## 3 KL
## 4 Kuching
## 5 Langkawi
## 6 Melaka
## 7 Miri
## 8 Penang
## 9 Petaling Jaya
## 10 Shah Alam
df_restaurant2 <- data.frame(matrix(ncol=3, nrow=0)) #creating a matrix
colnames(df_restaurant) <- c('Name', 'Number of reviews', 'Average rating', 'Location')
df_restaurant2 <- df2 %>% group_by(Restaurant) %>%
group_by(Restaurant, Location ) %>%
summarise(`Number of reviews` = n(),
`Average rating` = round(mean(as.numeric(Rating), na.rm = TRUE), 2)) %>%
arrange(desc(`Average rating`))
## `summarise()` has grouped output by 'Restaurant'. You can override using the
## `.groups` argument.
df_restaurant2
## # A tibble: 3,635 × 4
## # Groups: Restaurant [3,515]
## Restaurant Location `Number of reviews` `Average rating`
## <chr> <chr> <int> <dbl>
## 1 362 Gunung Rapat Heong Peah Ipoh 2 5
## 2 Agape Foodcourt JB 1 5
## 3 Ah Lan Hainanese Satay Melaka 1 5
## 4 Ah Ni Bak Kut Teh Shah Alam 1 5
## 5 Aji Dataran Ipoh 1 5
## 6 Arabic Food Ttdi Jaya Shah Alam 1 5
## 7 Aryan Restaurant KL 18 5
## 8 Asam pedas house JB 1 5
## 9 Asma Cake House Kuching 3 5
## 10 Atap Food Court 新香儐美食閣 JB 1 5
## # ℹ 3,625 more rows
#Applying Bayesian_Average
global_avg_rating2 <- mean(df_restaurant2$`Average rating`, na.rm = TRUE) #mean of ratings
C2 <- mean(df_restaurant2$`Number of reviews`, na.rm = TRUE) #mean of reviews
df_restaurant2 <- df_restaurant2 %>%
mutate(Bayesian_Average = ((`Average rating` * `Number of reviews`) +
(C2 * global_avg_rating2)) / (`Number of reviews` + C2)) %>%
arrange(desc(Bayesian_Average))
#Selecting the top 20
top_20_restaurant2 <- df_restaurant2 %>%
arrange(desc(Bayesian_Average)) %>%
head(20)
#Shorten one of the restaurant's name to vitualize better
top_20_restaurant2 <- top_20_restaurant2 %>%
mutate(Modified_Restaurant =
ifelse(Restaurant == "The Argan Trees Restaurant-Moroccan and Mediterran- Restaurant Langkawi",
"The Argan Restaurant", Restaurant))
top_20_restaurant2
## # A tibble: 20 × 6
## # Groups: Restaurant [20]
## Restaurant Location `Number of reviews` `Average rating` Bayesian_Average
## <chr> <chr> <int> <dbl> <dbl>
## 1 Canopy Roofto… KL 1268 4.88 4.82
## 2 Positano Risto KL 848 4.87 4.79
## 3 Chambers Grill KL 703 4.86 4.76
## 4 Alhamdulillah… Langkawi 436 4.9 4.75
## 5 Cielo Sky Din… KL 490 4.85 4.72
## 6 Haroo Haroo K… Langkawi 856 4.79 4.72
## 7 Iketeru Resta… KL 1257 4.75 4.70
## 8 Sausage KL Ca… KL 393 4.85 4.69
## 9 The Whisky Bar KL 1284 4.74 4.69
## 10 Healy Mac's KL 460 4.78 4.65
## 11 Dining In The… KL 1783 4.67 4.64
## 12 Haroo Korean … Langkawi 426 4.75 4.62
## 13 AIN- ARABIA R… Langkawi 258 4.82 4.61
## 14 The Argan Tre… Langkawi 296 4.79 4.61
## 15 Opium KL KL 890 4.67 4.61
## 16 Kayuputi Langkawi 375 4.74 4.60
## 17 Antipodean Ca… Petalin… 988 4.64 4.59
## 18 Zenzero Resta… KL 500 4.68 4.58
## 19 MY French Fac… Langkawi 770 4.64 4.58
## 20 Rubin Mardini… Penang 200 4.82 4.57
## # ℹ 1 more variable: Modified_Restaurant <chr>
#Plotting the bar graph
ggplot(top_20_restaurant2, aes(x = reorder(Modified_Restaurant, Bayesian_Average), y = Bayesian_Average)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = Location), position = position_dodge(width = 0.9), hjust = -0.1, vjust = -0.5, color = "black", size = 3) +
coord_flip() +
labs(title = "Top 20 Restaurants in Malaysia based on Bayesian Average Rating",
x = "Restaurant",
y = "Bayesian Average Rating")
#Based on the graph, the top 3 restaurants in KL still remain at the top
#after considering other locations.
#Sample a subset of the data
set.seed(123) #for reproducibility
df_tripadvisor <- df_tripadvisor %>% sample_n(1000)
#showing the amount of words, characters and sentence
df_tripadvisor <- df_tripadvisor %>%
mutate(
word_count= str_count(Review, boundary("word")),
character_count= nchar(Review),
sentence_count= str_count(Review, boundary('sentence'))
)
#Text Preprocessing- lowering casing, removing punctuation and numbers, and eliminating stop words.
#creating a corpus
corpus <- Corpus(VectorSource(df_tripadvisor$Review))
#Preprocessing steps:
corpus_clean <- corpus %>%
tm_map(content_transformer(tolower)) %>% # Convert text to lower case
tm_map(removePunctuation) %>% # Remove punctuation
tm_map(removeNumbers) %>% # Remove all numbers
tm_map(removeWords, stopwords('english')) # Remove common stop words
#Word Frequency Analysis
dtm <- TermDocumentMatrix(corpus_clean)
m <- as.matrix(dtm)
word_freqs <- sort(rowSums(m), decreasing= TRUE)
word_freqs_df <- data.frame(word= names(word_freqs), freq=word_freqs)
head(word_freqs_df)
## word freq
## food food 802
## good good 592
## service service 356
## place place 347
## restaurant restaurant 336
## great great 306
#using word cloud to visualize the frequency of words
wordcloud(names(word_freqs), freq = word_freqs, min.freq = 1, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
#Here we can see what words are mentioned regulary in the review stage.
#Base on location
df_tripadvisor <- read.csv('tripadvisor.csv')
df_googlemaps <- read.csv("googlemaps.csv")
df <- bind_rows(df_tripadvisor, df_googlemaps)
#using weighted rating to identify which locations has the highest average ratings
m <- mean(df$Rating, na.rm = TRUE)
v <- 50 # Minimum reviews threshold
weighted_ratings <- df %>%
group_by(Location) %>%
summarise(Average_Rating = mean(Rating, na.rm = TRUE),
Review_Count = n()) %>%
mutate(Weighted_Rating = (Review_Count * Average_Rating + v * m) / (Review_Count + v)) %>%
arrange(desc(Weighted_Rating))
#ploting the graph
ggplot(weighted_ratings[1:10, ], aes(x = reorder(Location, Weighted_Rating), y = Weighted_Rating)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Top 10 Locations by Weighted Average Rating",
x = "Location",
y = "Weighted Average Rating")
#Based on this, we can identify that Langkawi has the highest weighted average ratings.This approach gives more weight to locations with more reviews, addressing the potential bias of small sample sizes.