Introduction

The Exploratory Data Analysis (EDA) phase is for understanding the patterns, relationships, and anomalies in the data. This involves visualizing data through graphs, charts, and summary statistics to uncover underlying trends and characteristics. For predicting which restaurant has the highest rating, EDA involves examining factors like location , customer reviews, other relevant variables. By exploring these aspects, we can identify key features that influence restaurant ratings. This not only helps in building a more accurate predictive model but also provides valuable insights into what makes a restaurant appealing to customers.

1) Exploratory Data Analysis

df_tripadvisor <- read.csv('tripadvisor.csv')
# to see the proportion of missing (NA)
sum(is.na(df_tripadvisor$Country)/nrow(df_tripadvisor))
## [1] 0.5766721
#clean and covert the 'Dates COlumn' 
df_tripadvisor$Dates <- gsub("Reviewed ", "", df_tripadvisor$Dates) #removing 'reviewed'
df_tripadvisor$Dates <- as.Date(df_tripadvisor$Dates, format = '%d %B %Y') #coverts format of data strings

1.2) Exploring Ratings Columns

#Only KL area
#converting rating to a factor and creating a summary table 
df_tripadvisor$Rating <- as.factor(df_tripadvisor$Rating)  #to factor
df_tripadvisor_kl_rating_count <- as.data.frame(table(df_tripadvisor$Rating)) #creating a frequency table
names(df_tripadvisor_kl_rating_count) <- c('Rating', 'Count') #rename the data frame to Rating and Count
df_tripadvisor_kl <- df_tripadvisor[df_tripadvisor$Location == "KL", ] #filtering, only KL area
summary(df_tripadvisor_kl)
##        X            Author             Title              Review         
##  Min.   :    1   Length:71368       Length:71368       Length:71368      
##  1st Qu.:17843   Class :character   Class :character   Class :character  
##  Median :35685   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :35685                                                           
##  3rd Qu.:53526                                                           
##  Max.   :71368                                                           
##                                                                          
##  Rating        Dates             Restaurant          Location        
##  1: 2836   Min.   :2008-01-02   Length:71368       Length:71368      
##  2: 2944   1st Qu.:2015-10-28   Class :character   Class :character  
##  3: 7547   Median :2017-08-14   Mode  :character   Mode  :character  
##  4:20227   Mean   :2017-06-18                                        
##  5:37814   3rd Qu.:2019-06-14                                        
##            Max.   :2022-02-19                                        
##            NA's   :418                                               
##    Country         
##  Length:71368      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

1.3) Plotting the ratings of restaurants in KL on a scale of 1 to 5

ggplot(df_tripadvisor_kl_rating_count, aes(x="", y=Count, fill=Rating)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +
  theme_void() + 
  labs(title = 'Ratings of restaurants in KL on a scale of 1 to 5')

#From this pie chart, we can see majority of the ratings are at least 4 and above

1.4) To examine the number of KL restaurant Reviews Given on TripAdvisor overtime.

#Grouping the data by month.then sums the 'Count' values
df_tripadvisor_kl_review_dates <- as.data.frame(table(df_tripadvisor_kl$Dates))
names(df_tripadvisor_kl_review_dates) <- c('Date', "Count")
df_tripadvisor_kl_review_dates$Date <- floor_date(as.Date(df_tripadvisor_kl_review_dates$Date), unit='month')
df_tripadvisor_kl_review_dates <- df_tripadvisor_kl_review_dates %>% group_by(Date) %>% summarise(Count = sum(Count), .groups = 'drop')

#To view the number of KL restaurant reviews given overtime:
ggplot(df_tripadvisor_kl_review_dates, aes(x = Date, y = Count, group=1)) + 
  geom_line(size=1) +
  labs(title = 'Number of KL restaurant reviews given on TripAdvisor over time')

#The graph shows a significant decrease in review given during MCO, the amount remain low until now. 

1.5) To view number of KL restaurant reviews given by tourists

df_tripadvisor_kl_review_countries <- as.data.frame(table(df_tripadvisor_kl$Country)) #creating a frequency table 
names(df_tripadvisor_kl_review_countries) <- c('Country', 'Count')
df_tripadvisor_kl_review_countries <- df_tripadvisor_kl_review_countries[df_tripadvisor_kl_review_countries$Country != "Malaysia", ] #all except Malaysia
df_tripadvisor_kl_review_countries <- df_tripadvisor_kl_review_countries %>% arrange(desc(Count)) %>% head(25)
ggplot(df_tripadvisor_kl_review_countries, aes(x = reorder(Country, -Count), y = Count)) + 
  geom_bar(stat = "identity") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) + 
  labs(title = 'Number of KL restaurant reviews given by tourists', x = 'Country')

#this bar chart shows that UK tourists gave the most reviews

1.6) To identify the top 20 restaurant in KL based on Bayesian Average ratings.

df_tripadvisor <- read.csv('tripadvisor.csv')
df_googlemaps <- read.csv("googlemaps.csv")


#Combining Google maps with Tripadvisor datasets
df_googlemaps$Rating <- as.factor(df_googlemaps$Rating)
df_tripadvisor$Rating <- as.factor(df_tripadvisor$Rating)
#converts ratings to factor
df <- bind_rows(df_tripadvisor, df_googlemaps) #combining both data frames
df <- df[, c('Review', 'Rating', 'Restaurant', 'Location')] #selecting specific columns
df <- df[df$Location == "KL", ] #only KL
df_restaurant <- data.frame(matrix(ncol=3, nrow=0)) #creating a matrix 
colnames(df_restaurant) <- c('Name', 'Number of reviews', 'Average rating') #naming the empty df 

df_restaurant <- df %>% group_by(Restaurant) %>% 
  summarise(`Number of reviews` = n(), `Average rating` = round(mean(as.numeric(Rating), na.rm = TRUE), 2)) %>% 
  arrange(desc(`Number of reviews`), desc(`Average rating`))

#To group the restaurants based on their ratings
df_restaurant
## # A tibble: 862 × 3
##    Restaurant                    `Number of reviews` `Average rating`
##    <chr>                                       <int>            <dbl>
##  1 Dining In The Dark KL                        1783             4.67
##  2 The Whisky Bar                               1284             4.74
##  3 Canopy Rooftop Bar and Lounge                1268             4.88
##  4 Madam Kwan's KLCC                            1268             3.84
##  5 Iketeru Restaurant                           1257             4.75
##  6 BBQ NIGHTS                                   1012             4.45
##  7 Ishin Japanese Dining                        1005             4.44
##  8 Canopy Lounge Rooftop Bar KL                  948             4.55
##  9 Khan’s Indian Cuisine                         939             4.42
## 10 El Cerdo                                      915             4.57
## # ℹ 852 more rows
#Applying Bayesian Average to ensures that both highly-rated restaurants 
#and those with a large number of reviews are fairly represented.  
#It avoids bias towards restaurants with few, but perfect, ratings.

global_avg_rating <- mean(df_restaurant$`Average rating`, na.rm = TRUE) #mean of ratings
C <- mean(df_restaurant$`Number of reviews`, na.rm = TRUE) #mean of reviews

df_restaurant <- df_restaurant %>%
  mutate(Bayesian_Average = ((`Average rating` * `Number of reviews`) + (C * global_avg_rating)) / (`Number of reviews` + C)) %>%
  arrange(desc(Bayesian_Average)) #now all ratings are fair 

top_20_restaurants <- df_restaurant %>%
  top_n(20, Bayesian_Average) #selecting the top 20 restaurant
top_20_restaurants
## # A tibble: 20 × 4
##    Restaurant              `Number of reviews` `Average rating` Bayesian_Average
##    <chr>                                 <int>            <dbl>            <dbl>
##  1 Canopy Rooftop Bar and…                1268             4.88             4.82
##  2 Positano Risto                          848             4.87             4.78
##  3 Chambers Grill                          703             4.86             4.76
##  4 Cielo Sky Dining & Lou…                 490             4.85             4.72
##  5 Iketeru Restaurant                     1257             4.75             4.70
##  6 Sausage KL Cafe & Deli                  393             4.85             4.69
##  7 The Whisky Bar                         1284             4.74             4.69
##  8 Healy Mac's                             460             4.78             4.65
##  9 Dining In The Dark KL                  1783             4.67             4.64
## 10 Opium KL                                890             4.67             4.61
## 11 Zenzero Restaurant & W…                 500             4.68             4.58
## 12 Nizza                                   172             4.85             4.57
## 13 Manja                                   484             4.67             4.57
## 14 Knowhere Bangsar                        710             4.63             4.56
## 15 Quivo Pavilion                          650             4.62             4.55
## 16 Favola Italian Restaur…                 379             4.67             4.55
## 17 Vin's Restaurant and B…                 348             4.66             4.53
## 18 Sushi Hibiki                            137             4.84             4.53
## 19 The Steakhouse KL                       660             4.59             4.53
## 20 El Cerdo                                915             4.57             4.52
#Plotting the graph:
ggplot(top_20_restaurants, aes(x = reorder(Restaurant, Bayesian_Average), y = Bayesian_Average)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flips the axes for better readability
  labs(title = "Top 20 Restaurants in KL based on Bayesian Average Rating",
       x = "Restaurant",
       y = "Bayesian Average Rating")

#"Based on the Bayesian average, which considers both average ratings and review volume, 
#'Canopy Rooftop Bar and Lounge' stands out as the top-rated restaurant in Kuala Lumpur,
#'reflecting its high customer ratings and consistent performance across numerous reviews. 

1.7) To identify the top 20 restaurant in Malaysia:

df_tripadvisor2 <- read.csv('tripadvisor.csv')
df_googlemaps2 <- read.csv("googlemaps.csv")

df_googlemaps2$Rating <- as.factor(df_googlemaps2$Rating)
df_tripadvisor2$Rating <- as.factor(df_tripadvisor2$Rating)
df2 <- bind_rows(df_tripadvisor2, df_googlemaps2)
df2 <- df2[, c('Review', 'Rating', 'Restaurant', 'Location')]

unique_locations <- df2 %>% 
  distinct(Location) %>% 
  arrange(Location)
unique_locations #to show how many locations 
##         Location
## 1           Ipoh
## 2             JB
## 3             KL
## 4        Kuching
## 5       Langkawi
## 6         Melaka
## 7           Miri
## 8         Penang
## 9  Petaling Jaya
## 10     Shah Alam
df_restaurant2 <- data.frame(matrix(ncol=3, nrow=0)) #creating a matrix 
colnames(df_restaurant) <- c('Name', 'Number of reviews', 'Average rating', 'Location')

df_restaurant2 <- df2 %>% group_by(Restaurant) %>% 
  group_by(Restaurant, Location ) %>% 
  summarise(`Number of reviews` = n(), 
            `Average rating` = round(mean(as.numeric(Rating), na.rm = TRUE), 2)) %>% 
  arrange(desc(`Average rating`))
## `summarise()` has grouped output by 'Restaurant'. You can override using the
## `.groups` argument.
df_restaurant2
## # A tibble: 3,635 × 4
## # Groups:   Restaurant [3,515]
##    Restaurant                   Location  `Number of reviews` `Average rating`
##    <chr>                        <chr>                   <int>            <dbl>
##  1 362 Gunung Rapat Heong Peah  Ipoh                        2                5
##  2 Agape Foodcourt              JB                          1                5
##  3 Ah Lan Hainanese Satay       Melaka                      1                5
##  4 Ah Ni Bak Kut Teh            Shah Alam                   1                5
##  5 Aji Dataran                  Ipoh                        1                5
##  6 Arabic Food Ttdi Jaya        Shah Alam                   1                5
##  7 Aryan Restaurant             KL                         18                5
##  8 Asam pedas house             JB                          1                5
##  9 Asma Cake House              Kuching                     3                5
## 10 Atap Food Court 新香儐美食閣 JB                          1                5
## # ℹ 3,625 more rows
#Applying Bayesian_Average

global_avg_rating2 <- mean(df_restaurant2$`Average rating`, na.rm = TRUE) #mean of ratings
C2 <- mean(df_restaurant2$`Number of reviews`, na.rm = TRUE) #mean of reviews

df_restaurant2 <- df_restaurant2 %>%
  mutate(Bayesian_Average = ((`Average rating` * `Number of reviews`) + 
                               (C2 * global_avg_rating2)) / (`Number of reviews` + C2)) %>%
  arrange(desc(Bayesian_Average))

#Selecting the top 20
top_20_restaurant2 <- df_restaurant2 %>%
  arrange(desc(Bayesian_Average)) %>%
  head(20)

#Shorten one of the restaurant's name to vitualize better
top_20_restaurant2 <- top_20_restaurant2 %>%
  mutate(Modified_Restaurant = 
           ifelse(Restaurant == "The Argan Trees Restaurant-Moroccan and Mediterran- Restaurant Langkawi", 
                  "The Argan Restaurant", Restaurant))
top_20_restaurant2
## # A tibble: 20 × 6
## # Groups:   Restaurant [20]
##    Restaurant     Location `Number of reviews` `Average rating` Bayesian_Average
##    <chr>          <chr>                  <int>            <dbl>            <dbl>
##  1 Canopy Roofto… KL                      1268             4.88             4.82
##  2 Positano Risto KL                       848             4.87             4.79
##  3 Chambers Grill KL                       703             4.86             4.76
##  4 Alhamdulillah… Langkawi                 436             4.9              4.75
##  5 Cielo Sky Din… KL                       490             4.85             4.72
##  6 Haroo Haroo K… Langkawi                 856             4.79             4.72
##  7 Iketeru Resta… KL                      1257             4.75             4.70
##  8 Sausage KL Ca… KL                       393             4.85             4.69
##  9 The Whisky Bar KL                      1284             4.74             4.69
## 10 Healy Mac's    KL                       460             4.78             4.65
## 11 Dining In The… KL                      1783             4.67             4.64
## 12 Haroo Korean … Langkawi                 426             4.75             4.62
## 13 AIN- ARABIA R… Langkawi                 258             4.82             4.61
## 14 The Argan Tre… Langkawi                 296             4.79             4.61
## 15 Opium KL       KL                       890             4.67             4.61
## 16 Kayuputi       Langkawi                 375             4.74             4.60
## 17 Antipodean Ca… Petalin…                 988             4.64             4.59
## 18 Zenzero Resta… KL                       500             4.68             4.58
## 19 MY French Fac… Langkawi                 770             4.64             4.58
## 20 Rubin Mardini… Penang                   200             4.82             4.57
## # ℹ 1 more variable: Modified_Restaurant <chr>
#Plotting the bar graph
ggplot(top_20_restaurant2, aes(x = reorder(Modified_Restaurant, Bayesian_Average), y = Bayesian_Average)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = Location), position = position_dodge(width = 0.9), hjust = -0.1, vjust = -0.5, color = "black", size = 3) +
  coord_flip() +
  labs(title = "Top 20 Restaurants in Malaysia based on Bayesian Average Rating",
       x = "Restaurant",
       y = "Bayesian Average Rating")

#Based on the graph, the top 3 restaurants in KL still remain at the top 
#after considering other locations.

1.8) To examine which word appeared the most in the review column

Conducting Basic Text Analysis:
#Sample a subset of the data
set.seed(123)  #for reproducibility
df_tripadvisor <- df_tripadvisor %>% sample_n(1000)

#showing the amount of words, characters and sentence
df_tripadvisor <- df_tripadvisor %>% 
  mutate(
    word_count= str_count(Review, boundary("word")),
    character_count= nchar(Review),
    sentence_count= str_count(Review, boundary('sentence'))
  )

#Text Preprocessing- lowering casing, removing punctuation and numbers, and eliminating stop words.
#creating a corpus
corpus <- Corpus(VectorSource(df_tripadvisor$Review))

#Preprocessing steps:
corpus_clean <- corpus %>% 
  tm_map(content_transformer(tolower)) %>%  # Convert text to lower case
  tm_map(removePunctuation) %>% # Remove punctuation
  tm_map(removeNumbers) %>%   # Remove all numbers
  tm_map(removeWords, stopwords('english')) # Remove common stop words

#Word Frequency Analysis
dtm <- TermDocumentMatrix(corpus_clean)

m <- as.matrix(dtm)
word_freqs <- sort(rowSums(m), decreasing= TRUE)
word_freqs_df <- data.frame(word= names(word_freqs), freq=word_freqs)
head(word_freqs_df)
##                  word freq
## food             food  802
## good             good  592
## service       service  356
## place           place  347
## restaurant restaurant  336
## great           great  306
#using word cloud to visualize the frequency of words
wordcloud(names(word_freqs), freq = word_freqs, min.freq = 1, random.order = FALSE, colors = brewer.pal(8, "Dark2"))

#Here we can see what words are mentioned regulary in the review stage. 

1.9) Identify which location has the highest average ratings

#Base on location
df_tripadvisor <- read.csv('tripadvisor.csv')
df_googlemaps <- read.csv("googlemaps.csv")
df <- bind_rows(df_tripadvisor, df_googlemaps)

#using weighted rating to identify which locations has the highest average ratings
m <- mean(df$Rating, na.rm = TRUE)
v <- 50 # Minimum reviews threshold

weighted_ratings <- df %>%
  group_by(Location) %>%
  summarise(Average_Rating = mean(Rating, na.rm = TRUE),
            Review_Count = n()) %>%
  mutate(Weighted_Rating = (Review_Count * Average_Rating + v * m) / (Review_Count + v)) %>%
  arrange(desc(Weighted_Rating))

#ploting the graph
ggplot(weighted_ratings[1:10, ], aes(x = reorder(Location, Weighted_Rating), y = Weighted_Rating)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Locations by Weighted Average Rating",
       x = "Location",
       y = "Weighted Average Rating") 

#Based on this, we can identify that Langkawi has the highest weighted average ratings.This approach gives more weight to locations with more reviews, addressing the potential bias of small sample sizes.