1. Loading Libraries and Data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

# Import the raw restaurant CSV, keeping text columns as character vectors
# rather than converting them to factors.
data <- read.csv(
  file = "INDIA ZOMATO RESTURAUNT ANALYSIS.csv",
  stringsAsFactors = FALSE
)

# Compact overview: one line per column with its type and first few values
str(data)
## 'data.frame':    8652 obs. of  21 variables:
##  $ Restaurant.ID       : int  3400025 3400341 3400005 3400021 3400017 3400325 3400059 3400060 3400348 3400072 ...
##  $ Restaurant.Name     : chr  "Jahanpanah" "Rangrezz Restaurant" "Time2Eat - Mama Chicken" "Chokho Jeeman Marwari Jain Bhojanalya" ...
##  $ Country.Code        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ City                : chr  "Agra" "Agra" "Agra" "Agra" ...
##  $ Address             : chr  "E 23, Shopping Arcade, Sadar Bazaar, Agra Cantt, Agra" "E-20, Shopping Arcade, Sadar Bazaar, Agra Cantt, Agra" "Main Market, Sadar Bazaar, Agra Cantt, Agra" "1/48, Delhi Gate, Station Road, Raja Mandi, Civil Lines, Agra" ...
##  $ Locality            : chr  "Agra Cantt" "Agra Cantt" "Agra Cantt" "Civil Lines" ...
##  $ Locality.Verbose    : chr  "Agra Cantt, Agra" "Agra Cantt, Agra" "Agra Cantt, Agra" "Civil Lines, Agra" ...
##  $ Longitude           : num  78 0 78 78 78 ...
##  $ Latitude            : num  27.2 0 27.2 27.2 27.2 ...
##  $ Cuisines            : chr  "North Indian, Mughlai" "North Indian, Mughlai" "North Indian" "Rajasthani" ...
##  $ Average.Cost.for.two: int  850 700 500 400 1000 2000 2500 2500 800 3600 ...
##  $ Currency            : chr  "Indian Rupees(Rs.)" "Indian Rupees(Rs.)" "Indian Rupees(Rs.)" "Indian Rupees(Rs.)" ...
##  $ Has.Table.booking   : chr  "No" "No" "No" "No" ...
##  $ Has.Online.delivery : chr  "No" "No" "No" "No" ...
##  $ Is.delivering.now   : chr  "No" "No" "No" "No" ...
##  $ Switch.to.order.menu: chr  "No" "No" "No" "No" ...
##  $ Price.range         : int  3 2 2 2 3 4 4 4 3 4 ...
##  $ Aggregate.rating    : num  3.9 3.5 3.6 4 4.2 4 4.3 4 3.6 3.8 ...
##  $ Rating.color        : chr  "Yellow" "Yellow" "Yellow" "Green" ...
##  $ Rating.text         : chr  "Good" "Good" "Good" "Very Good" ...
##  $ Votes               : int  140 71 94 87 177 45 133 41 59 46 ...
# Number of NA entries in each column (0 means the column has no NA)
data %>%
  is.na() %>%
  colSums()
##        Restaurant.ID      Restaurant.Name         Country.Code 
##                    0                    0                    0 
##                 City              Address             Locality 
##                    0                    0                    0 
##     Locality.Verbose            Longitude             Latitude 
##                    0                    0                    0 
##             Cuisines Average.Cost.for.two             Currency 
##                    0                    0                    0 
##    Has.Table.booking  Has.Online.delivery    Is.delivering.now 
##                    0                    0                    0 
## Switch.to.order.menu          Price.range     Aggregate.rating 
##                    0                    0                    0 
##         Rating.color          Rating.text                Votes 
##                    0                    0                    0

2. Basic Metrics

# Dataset-wide means of rating and cost for two; NA values are ignored.
# NOTE(review): every non-NA row is included as-is. If a rating or cost of 0
# is a placeholder rather than a real value, it lowers these means - confirm.
avg_rating <- mean(data[["Aggregate.rating"]], na.rm = TRUE)
avg_cost <- mean(data[["Average.Cost.for.two"]], na.rm = TRUE)

cat("Average Rating:", avg_rating, "\n")
## Average Rating: 2.523324
cat("Average Cost for Two:", avg_cost, "\n")
## Average Cost for Two: 623.3703

3. Exploratory Data Analysis

3.1 Top 10 Cities with Most Restaurants

# Restaurant count per city, largest first, limited to the first ten cities
top_cities <- head(count(data, City, sort = TRUE), n = 10)
print(top_cities)
##            City    n
## 1     New Delhi 5473
## 2       Gurgaon 1118
## 3         Noida 1080
## 4     Faridabad  251
## 5     Ghaziabad   25
## 6     Ahmedabad   21
## 7      Amritsar   21
## 8  Bhubaneshwar   21
## 9      Guwahati   21
## 10      Lucknow   21

3.2 High Rating & Low Cost Restaurants

# Restaurants rated at least 4.0 whose average cost for two is below 300.
# Rows with a recorded cost of 0 also satisfy the cost condition.
high_rating_low_cost <- filter(
  data,
  Aggregate.rating >= 4.0 & Average.Cost.for.two < 300
)
print(head(high_rating_low_cost, n = 5))
##   Restaurant.ID        Restaurant.Name Country.Code     City
## 1       3400346        Sheroes Hangout            1     Agra
## 2      18204507     Ahuja Milk Bhandar            1 Amritsar
## 3       2200175 Gurdas Ram Jalebi Wala            1 Amritsar
## 4       2200153           Kanha Sweets            1 Amritsar
## 5       2600109  Sagar Gaire Fast Food            1   Bhopal
##                                                                         Address
## 1                     Opposite The Gateway Hotel, Fatehabad Road, Tajganj, Agra
## 2                       Dhab Khatikan, Near Hindu College, Hathi Gate, Amritsar
## 3                                       Near Golden Temple, Town Hall, Amritsar
## 4 Shop 1, Opposite Bijli Pehalwan Mandir, Lawrence Road, White Avenue, Amritsar
## 5                                       10, Number Market, Arera Colony, Bhopal
##       Locality       Locality.Verbose Longitude Latitude
## 1      Tajganj          Tajganj, Agra  78.04017 27.16185
## 2   Hathi Gate   Hathi Gate, Amritsar   0.00000  0.00000
## 3    Town Hall    Town Hall, Amritsar   0.00000  0.00000
## 4 White Avenue White Avenue, Amritsar   0.00000  0.00000
## 5 Arera Colony   Arera Colony, Bhopal  77.43536 23.21429
##                      Cuisines Average.Cost.for.two           Currency
## 1 Cafe, North Indian, Chinese                    0 Indian Rupees(Rs.)
## 2                   Beverages                  100 Indian Rupees(Rs.)
## 3                      Mithai                  100 Indian Rupees(Rs.)
## 4                   Fast Food                  150 Indian Rupees(Rs.)
## 5                   Fast Food                  250 Indian Rupees(Rs.)
##   Has.Table.booking Has.Online.delivery Is.delivering.now Switch.to.order.menu
## 1                No                  No                No                   No
## 2                No                  No                No                   No
## 3                No                  No                No                   No
## 4                No                  No                No                   No
## 5                No                  No                No                   No
##   Price.range Aggregate.rating Rating.color Rating.text Votes
## 1           1              4.9   Dark Green   Excellent    77
## 2           1              4.0        Green   Very Good    52
## 3           1              4.1        Green   Very Good   104
## 4           1              4.1        Green   Very Good   140
## 5           1              4.9   Dark Green   Excellent   427

3.3 Restaurants with Online Delivery and Table Booking

# Restaurants that offer both table booking and online delivery
online_table <- filter(
  data,
  Has.Table.booking == "Yes" & Has.Online.delivery == "Yes"
)
print(head(online_table, n = 5))
##   Restaurant.ID                  Restaurant.Name Country.Code      City
## 1         50943                 Sultans of Spice            1 Bangalore
## 2         58268 The Fatty Bao - Asian Gastro Bar            1 Bangalore
## 3         58882                      Big Brewsky            1 Bangalore
## 4         69024                That Madras Place            1   Chennai
## 5         72475                          Haunted            1   Chennai
##                                                                         Address
## 1 BluPetal Hotel, 60 Jyoti Nivas College Road, Koramangala 5th Block, Bangalore
## 2           610, 3rd Floor, 12th Main, Off 80 Feet Road, Indiranagar, Bangalore
## 3     Behind MK Retail, Before WIPRO Corporate Office, Sarjapur Road, Bangalore
## 4                        34/29, 2nd Main Road, Kasturibai Nagar, Adyar, Chennai
## 5              273, F13, New Number 71, 2nd Main Road, Anna Nagar East, Chennai
##                      Locality                       Locality.Verbose Longitude
## 1 BluPetal Hotel, Koramangala BluPetal Hotel, Koramangala, Bangalore  77.61543
## 2                 Indiranagar                 Indiranagar, Bangalore  77.64540
## 3               Sarjapur Road               Sarjapur Road, Bangalore  77.68324
## 4                       Adyar                         Adyar, Chennai  80.25074
## 5             Anna Nagar East               Anna Nagar East, Chennai  80.22067
##   Latitude                                                            Cuisines
## 1 12.93328                                               North Indian, Mughlai
## 2 12.97022                                                               Asian
## 3 12.91304 Finger Food, North Indian, Italian, Continental, Thai, South Indian
## 4 13.00580                                         European, Italian, Desserts
## 5 13.08644                                      North Indian, Chinese, Arabian
##   Average.Cost.for.two           Currency Has.Table.booking Has.Online.delivery
## 1                 1300 Indian Rupees(Rs.)               Yes                 Yes
## 2                 2400 Indian Rupees(Rs.)               Yes                 Yes
## 3                 1800 Indian Rupees(Rs.)               Yes                 Yes
## 4                  800 Indian Rupees(Rs.)               Yes                 Yes
## 5                  800 Indian Rupees(Rs.)               Yes                 Yes
##   Is.delivering.now Switch.to.order.menu Price.range Aggregate.rating
## 1                No                   No           3              4.1
## 2                No                   No           4              4.7
## 3                No                   No           3              4.5
## 4                No                   No           2              4.2
## 5                No                   No           2              3.8
##   Rating.color Rating.text Votes
## 1        Green   Very Good  2416
## 2   Dark Green   Excellent  2369
## 3   Dark Green   Excellent  5705
## 4        Green   Very Good  1810
## 5       Yellow        Good   519

3.4 Cuisine Analysis

# Split the comma-separated Cuisines field so each restaurant contributes one
# row per listed cuisine, then summarise per cuisine: mean cost, mean rating
# and number of listings. Result is ordered by listing count, largest first.
cuisine_data <- arrange(
  data %>%
    separate_rows(Cuisines, sep = ",\\s*") %>%
    group_by(Cuisines) %>%
    summarise(
      Avg_Cost = mean(Average.Cost.for.two, na.rm = TRUE),
      Avg_Rating = mean(Aggregate.rating, na.rm = TRUE),
      Count = n(),
      .groups = "drop"
    ),
  desc(Count)
)

print(head(cuisine_data, n = 5))
## # A tibble: 5 × 4
##   Cuisines     Avg_Cost Avg_Rating Count
##   <chr>           <dbl>      <dbl> <int>
## 1 North Indian     691.       2.51  3946
## 2 Chinese          699.       2.60  2690
## 3 Fast Food        437.       2.55  1963
## 4 Mughlai          728.       2.61   992
## 5 Bakery           398.       2.40   726

3.5 Most Common Cuisines

# cuisine_data is already sorted by Count (descending), so its first five rows
# are the five most frequently listed cuisines.
most_common_cuisines <- head(cuisine_data, n = 5)
print(most_common_cuisines)
## # A tibble: 5 × 4
##   Cuisines     Avg_Cost Avg_Rating Count
##   <chr>           <dbl>      <dbl> <int>
## 1 North Indian     691.       2.51  3946
## 2 Chinese          699.       2.60  2690
## 3 Fast Food        437.       2.55  1963
## 4 Mughlai          728.       2.61   992
## 5 Bakery           398.       2.40   726

3.6 Top Cities with High Rated Restaurants

# Three cities with the most restaurants rated 4.0 or higher
top_city_high_rated <- data %>%
  filter(Aggregate.rating >= 4.0) %>%
  count(City, sort = TRUE) %>%
  head(n = 3)

print(top_city_high_rated)
##        City   n
## 1 New Delhi 328
## 2   Gurgaon  95
## 3     Noida  29

4. Affordability Analysis

4.1 Affordable High-Rated Restaurants

# Five cheapest restaurants among those rated strictly above 4.0.
# Only the identifying columns plus cost and rating are kept.
# NOTE(review): other sections use >= 4.0 as the high-rating cut-off, and rows
# with a recorded cost of 0 sort first here - confirm both are intended.
affordable_high_rating <- head(
  data %>%
    filter(Aggregate.rating > 4.0) %>%
    select(Restaurant.Name, City, Average.Cost.for.two, Aggregate.rating) %>%
    arrange(Average.Cost.for.two),
  n = 5
)

print(affordable_high_rating)
##             Restaurant.Name      City Average.Cost.for.two Aggregate.rating
## 1           Sheroes Hangout      Agra                    0              4.9
## 2      BMG - All Day Dining  Dehradun                    0              4.3
## 3 Jung Bahadur Kachori Wala New Delhi                   50              4.1
## 4    Gurdas Ram Jalebi Wala  Amritsar                  100              4.1
## 5        Ashok Chaat Corner New Delhi                  100              4.1

4.2 Most Expensive Cities

# Mean cost for two per city, keeping the five cities with the highest mean.
# The mean is not weighted or filtered by how many restaurants a city has, so
# a city with very few listings can rank at the top.
expensive_cities <- head(
  data %>%
    group_by(City) %>%
    summarise(
      Avg_Cost = mean(Average.Cost.for.two, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(desc(Avg_Cost)),
  n = 5
)

print(expensive_cities)
## # A tibble: 5 × 2
##   City      Avg_Cost
##   <chr>        <dbl>
## 1 Panchkula    2000 
## 2 Hyderabad    1361.
## 3 Pune         1338.
## 4 Jaipur       1310 
## 5 Kolkata      1272.

5. Feature Engineering

5.1 Price Category

# Bucket cost for two into three bands:
#   "Low"    : below 300
#   "Medium" : 300 to 800 inclusive
#   "High"   : above 800
# case_when() takes the first matching branch, so the branches are checked from
# the top band down; a missing cost matches none of them and yields NA.
data <- data %>%
  mutate(
    PriceCategory = case_when(
      Average.Cost.for.two > 800 ~ "High",
      Average.Cost.for.two >= 300 ~ "Medium",
      Average.Cost.for.two < 300 ~ "Low"
    )
  )

data %>%
  select(Restaurant.Name, City, Average.Cost.for.two, PriceCategory) %>%
  head(n = 10)
##                          Restaurant.Name City Average.Cost.for.two
## 1                             Jahanpanah Agra                  850
## 2                    Rangrezz Restaurant Agra                  700
## 3                Time2Eat - Mama Chicken Agra                  500
## 4  Chokho Jeeman Marwari Jain Bhojanalya Agra                  400
## 5                         Pinch Of Spice Agra                 1000
## 6                              MoMo Cafe Agra                 2000
## 7                  Peshawri - ITC Mughal Agra                 2500
## 8                  Taj Bano - ITC Mughal Agra                 2500
## 9                                 G Thal Agra                  800
## 10          Dawat-e-Nawab - Radisson Blu Agra                 3600
##    PriceCategory
## 1           High
## 2         Medium
## 3         Medium
## 4         Medium
## 5           High
## 6           High
## 7           High
## 8           High
## 9         Medium
## 10          High

5.2 Popularity Tag

# A restaurant is "Popular" when it has more than 500 votes AND a rating above
# 4; everything else is "Regular".
data <- data %>%
  mutate(
    PopularityTag = ifelse(
      Votes > 500 & Aggregate.rating > 4,
      "Popular",
      "Regular"
    )
  )

# Per city: number of restaurants, number tagged "Popular", and the popular
# share in percent (two decimals), ordered by that share. Cities with only a
# handful of listings can reach very high percentages.
popular_percentage <- data %>%
  group_by(City) %>%
  summarise(
    Total = n(),
    Popular = sum(PopularityTag == "Popular"),
    Percentage_Popular = round(100 * (Popular / Total), 2),
    .groups = "drop"
  ) %>%
  arrange(desc(Percentage_Popular))

print(head(popular_percentage, n = 10))
## # A tibble: 10 × 4
##    City         Total Popular Percentage_Popular
##    <chr>        <int>   <int>              <dbl>
##  1 Panchkula        1       1              100  
##  2 Chennai         20      15               75  
##  3 Bangalore       20      14               70  
##  4 Kolkata         20      14               70  
##  5 Hyderabad       18      11               61.1
##  6 Pune            20      11               55  
##  7 Secunderabad     2       1               50  
##  8 Goa             20       8               40  
##  9 Mumbai          20       8               40  
## 10 Ahmedabad       21       8               38.1

6. Visualizations

6.1 Distribution of Votes

# Drop the top 1% of vote counts so the histogram is not stretched by a few
# extreme values.
filtered_data <- filter(
  data,
  Votes <= quantile(Votes, probs = 0.99, na.rm = TRUE)
)

ggplot(data = filtered_data, mapping = aes(x = Votes)) +
  geom_histogram(bins = 50, color = "black", fill = "steelblue") +
  labs(
    title = "Distribution of Votes Across Restaurants",
    x = "Number of Votes",
    y = "Frequency"
  ) +
  theme_minimal()

6.2 Distribution of Cost

# Same trimming as for votes: keep rows at or below the 99th percentile of cost
filtered_cost_data <- filter(
  data,
  Average.Cost.for.two <= quantile(Average.Cost.for.two, probs = 0.99, na.rm = TRUE)
)

ggplot(data = filtered_cost_data, mapping = aes(x = Average.Cost.for.two)) +
  geom_histogram(bins = 50, color = "black", fill = "darkorange") +
  labs(
    title = "Distribution of Average Cost for Two",
    x = "Average Cost for Two (INR)",
    y = "Frequency"
  ) +
  theme_minimal()

7. Modeling

7.1 Simple Linear Regression: Votes vs Cost

# Ordinary least squares fit with Average.Cost.for.two as the response and
# Votes as the single predictor, i.e. cost is modelled as a function of votes.
simple_model <- lm(Average.Cost.for.two ~ Votes, data = data)
# Print the residual summary, coefficient table and R-squared of the fit
summary(simple_model)
## 
## Call:
## lm(formula = Average.Cost.for.two ~ Votes, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3560.1  -283.7  -139.0    78.9  7373.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 569.56724    6.45293   88.27   <2e-16 ***
## Votes         0.39211    0.01436   27.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 571.6 on 8650 degrees of freedom
## Multiple R-squared:  0.07939,    Adjusted R-squared:  0.07929 
## F-statistic:   746 on 1 and 8650 DF,  p-value: < 2.2e-16
# Scatter of cost against votes, overlaid with the linear fit and its
# confidence band (se = TRUE)
ggplot(data = data, mapping = aes(x = Votes, y = Average.Cost.for.two)) +
  geom_point(color = "gray40", alpha = 0.4) +
  geom_smooth(method = "lm", color = "blue", se = TRUE) +
  labs(
    title = "Votes vs Average Cost for Two",
    x = "Votes",
    y = "Average Cost for Two"
  )
## `geom_smooth()` using formula = 'y ~ x'

7.2 Multiple Linear Regression: Rating

# 1 when the restaurant offers online delivery, 0 otherwise
data$OnlineDelivery <- as.numeric(data$Has.Online.delivery == "Yes")

# Hand-picked city groupings used to build the tier variable
tier1_cities <- c("Delhi", "New Delhi", "Mumbai", "Bangalore", "Bengaluru")
tier2_cities <- c("Hyderabad", "Chennai", "Kolkata", "Pune")

# Tier 1 and tier 2 come from the lists above; every other city is tier 3.
# The tier is stored as a number, so the model below treats it as a single
# linear term rather than as three separate categories.
data$CityTier <- ifelse(
  data$City %in% tier1_cities,
  1,
  ifelse(data$City %in% tier2_cities, 2, 3)
)

# Rating modelled on votes, the delivery indicator and the numeric city tier
multi_model_final <- lm(Aggregate.rating ~ Votes + OnlineDelivery + CityTier, data = data)
summary(multi_model_final)
## 
## Call:
## lm(formula = Aggregate.rating ~ Votes + OnlineDelivery + CityTier, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3257 -1.6020  0.5276  1.0414  2.6337 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.013e+00  3.221e-02  62.493  < 2e-16 ***
## Votes          9.178e-04  3.506e-05  26.178  < 2e-16 ***
## OnlineDelivery 8.992e-01  3.343e-02  26.900  < 2e-16 ***
## CityTier       7.733e-02  1.564e-02   4.944 7.81e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.388 on 8648 degrees of freedom
## Multiple R-squared:  0.1563, Adjusted R-squared:  0.156 
## F-statistic: 533.9 on 3 and 8648 DF,  p-value: < 2.2e-16
# In-sample predictions from the multiple regression, stored next to the data
data[["Predicted_Rating_Final"]] <- predict(multi_model_final, newdata = data)

# Actual vs predicted rating; the dashed line y = x marks perfect prediction
ggplot(
  data = data,
  mapping = aes(x = Aggregate.rating, y = Predicted_Rating_Final)
) +
  geom_point(alpha = 0.6, color = "darkorange") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue") +
  labs(
    title = "Actual vs Predicted Restaurant Ratings",
    x = "Actual Aggregate Rating",
    y = "Predicted Aggregate Rating"
  ) +
  theme_minimal()

7.3 Polynomial Regression: Votes vs Cost

# Quadratic regression with Votes as the response and Average.Cost.for.two as
# the predictor. raw = TRUE makes poly() use the plain powers (cost, cost^2)
# instead of orthogonal polynomials, so the two slope coefficients are the
# linear and squared cost terms.
poly_cost_votes_model <- lm(Votes ~ poly(Average.Cost.for.two, 2, raw = TRUE), data = data)
# Print the residual summary, coefficient table and R-squared of the fit
summary(poly_cost_votes_model)
## 
## Call:
## lm(formula = Votes ~ poly(Average.Cost.for.two, 2, raw = TRUE), 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -602.9  -112.6   -40.9    14.6 10420.0 
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                -8.472e+01  8.286e+00  -10.22
## poly(Average.Cost.for.two, 2, raw = TRUE)1  4.397e-01  1.525e-02   28.83
## poly(Average.Cost.for.two, 2, raw = TRUE)2 -7.017e-05  3.964e-06  -17.70
##                                            Pr(>|t|)    
## (Intercept)                                  <2e-16 ***
## poly(Average.Cost.for.two, 2, raw = TRUE)1   <2e-16 ***
## poly(Average.Cost.for.two, 2, raw = TRUE)2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 403.5 on 8649 degrees of freedom
## Multiple R-squared:  0.1116, Adjusted R-squared:  0.1114 
## F-statistic: 543.2 on 2 and 8649 DF,  p-value: < 2.2e-16
# In-sample predictions from the quadratic model
data[["Predicted_Votes_Cost"]] <- predict(poly_cost_votes_model, newdata = data)

# Observed votes against cost, with the fitted quadratic curve drawn on top
ggplot(data = data, mapping = aes(x = Average.Cost.for.two, y = Votes)) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_line(mapping = aes(y = Predicted_Votes_Cost), linewidth = 1, color = "darkblue") +
  labs(
    title = "Polynomial Regression: Cost vs Votes",
    x = "Average Cost for Two",
    y = "Votes"
  ) +
  theme_minimal()

8. Statistical Tests

8.1 ANOVA: Online Delivery vs Rating

# Treat the Yes/No delivery flag as a grouping factor, then run a one-way
# ANOVA comparing mean rating between the groups.
data[["Has.Online.delivery"]] <- as.factor(data[["Has.Online.delivery"]])
anova_result <- aov(
  formula = Aggregate.rating ~ Has.Online.delivery,
  data = data
)
summary(anova_result)
##                       Df Sum Sq Mean Sq F value Pr(>F)    
## Has.Online.delivery    1   1723  1723.0   826.7 <2e-16 ***
## Residuals           8650  18028     2.1                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Rating distribution for each level of the online-delivery flag
boxplot(
  Aggregate.rating ~ Has.Online.delivery,
  data = data,
  col = c("tomato", "skyblue")
)

8.2 ANOVA: Table Booking vs Votes

# Treat the Yes/No table-booking flag as a grouping factor, then run a one-way
# ANOVA comparing mean vote count between the groups.
data[["Has.Table.booking"]] <- as.factor(data[["Has.Table.booking"]])
anova_votes <- aov(
  formula = Votes ~ Has.Table.booking,
  data = data
)
summary(anova_votes)
##                     Df    Sum Sq  Mean Sq F value Pr(>F)    
## Has.Table.booking    1 5.606e+07 56056876   317.1 <2e-16 ***
## Residuals         8650 1.529e+09   176758                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Vote-count distribution for each level of the table-booking flag
boxplot(
  Votes ~ Has.Table.booking,
  data = data,
  col = c("lightcoral", "lightblue")
)

8.3 ANOVA: Average Cost across Cities

# Keep only the three cities being compared and drop rows without a cost value
selected_data <- data[
  data$City %in% c("Mumbai", "New Delhi", "Bangalore") &
    !is.na(data$Average.Cost.for.two),
]
# Convert City to a factor whose levels are just the cities still present
selected_data$City <- factor(selected_data$City)

# One-way ANOVA: does mean cost for two differ between the three cities?
anova_city <- aov(formula = Average.Cost.for.two ~ City, data = selected_data)
summary(anova_city)
##               Df    Sum Sq Mean Sq F value   Pr(>F)    
## City           2 1.255e+07 6274959   15.61 1.74e-07 ***
## Residuals   5510 2.215e+09  402030                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Cost distribution for each of the selected cities
boxplot(
  Average.Cost.for.two ~ City,
  data = selected_data,
  col = c("lightblue", "lightpink", "lightgreen")
)

8.4 ANOVA: Cuisine Type vs Votes

# Primary cuisine = the first entry of the comma-separated Cuisines string,
# with surrounding whitespace removed.
# vapply() with character(1) guarantees a character vector of the same length
# as the input; sapply() would silently return a list for zero-row input.
# An empty or NA Cuisines value yields NA, as `[`(x, 1) does on an empty vector.
data$Primary.Cuisine <- trimws(
  vapply(
    strsplit(as.character(data$Cuisines), ","),
    `[`,
    character(1),
    1
  )
)

# The three most frequent primary cuisines. slice_max() replaces the superseded
# top_n(); like top_n() it keeps ties, so more than three names can be returned
# when counts are equal at the cut-off.
top3_cuisines <- data %>%
  count(Primary.Cuisine, sort = TRUE) %>%
  slice_max(n, n = 3) %>%
  pull(Primary.Cuisine)

# Restrict to restaurants whose primary cuisine is one of those.
# This reassigns filtered_data, which the votes histogram section also uses.
filtered_data <- data %>%
  filter(Primary.Cuisine %in% top3_cuisines)

# One-way ANOVA: does mean vote count differ between the selected cuisines?
anova_top3 <- aov(Votes ~ Primary.Cuisine, data = filtered_data)
summary(anova_top3)
##                   Df    Sum Sq Mean Sq F value   Pr(>F)    
## Primary.Cuisine    2   1992269  996134   9.859 5.34e-05 ***
## Residuals       4474 452042276  101038                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Vote-count distribution per primary cuisine, one coloured box per cuisine
ggplot(
  data = filtered_data,
  mapping = aes(x = Primary.Cuisine, y = Votes, fill = Primary.Cuisine)
) +
  geom_boxplot() +
  theme_minimal()

9. Correlation Analysis

9.1 Rating vs Cost

# Pearson correlation between rating and cost for two, on complete pairs only
with(
  data,
  cor(Aggregate.rating, Average.Cost.for.two, use = "complete.obs")
)
## [1] 0.3441716
# Cost against rating, with a straight-line fit and no confidence band
ggplot(
  data = data,
  mapping = aes(x = Aggregate.rating, y = Average.Cost.for.two)
) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'

9.2 Rating vs Votes

# Pearson correlation between rating and vote count, on complete pairs only
correlation_value <- with(
  data,
  cor(Aggregate.rating, Votes, use = "complete.obs")
)
# Significance test and confidence interval for the same correlation.
# The arguments stay written as data$... because cor.test() echoes them in the
# "data:" line of its printed result.
cor_test_result <- cor.test(data$Aggregate.rating, data$Votes)

print(correlation_value)
## [1] 0.2876924
print(cor_test_result)
## 
##  Pearson's product-moment correlation
## 
## data:  data$Aggregate.rating and data$Votes
## t = 27.938, df = 8650, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2682468 0.3069037
## sample estimates:
##       cor 
## 0.2876924
# Votes against rating, with a straight-line fit and no confidence band
ggplot(data = data, mapping = aes(x = Aggregate.rating, y = Votes)) +
  geom_point(alpha = 0.5, color = "darkgreen") +
  geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'

9.3 Online Delivery vs Votes

# 1 when the restaurant offers online delivery, 0 otherwise
data[["Online_Delivery_Numeric"]] <- as.numeric(data[["Has.Online.delivery"]] == "Yes")
# Pearson correlation between that 0/1 indicator and the vote count
correlation_value <- with(
  data,
  cor(Online_Delivery_Numeric, Votes, use = "complete.obs")
)

print(correlation_value)
## [1] 0.1047307
# Votes for the two delivery groups; points are jittered horizontally only so
# the 0 and 1 columns stay readable, and a straight-line fit is drawn on top
ggplot(data = data, mapping = aes(x = Online_Delivery_Numeric, y = Votes)) +
  geom_jitter(width = 0.2, height = 0, color = "blue", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'

## Conclusion

## **In this project, we explored the Zomato restaurant data for India and found several interesting patterns. Most restaurants in the dataset are located in New Delhi, and North Indian and Chinese are the most frequently listed cuisines. We also saw that a number of low-cost restaurants have very good ratings. Restaurants that offer online delivery or table booking differ significantly in ratings and votes from those that do not, although these tests show association rather than cause. 
## Our analysis showed weak positive correlations between rating and cost (r ≈ 0.34) and between rating and votes (r ≈ 0.29). Overall, this study helps us understand what customers like and how restaurants perform in different cities.**