1 Import

# Consumers

consumers <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/consumers.csv")

consumer_preferences <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/consumer_preferences.csv")


# Restaurants
restaurants <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/restaurants.csv")

restaurant_cuisines <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/restaurant_cuisines.csv")

ratings <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/ratings.csv")

2 Merge

2.1 Consumers

2.1.0.1 Visualize

Check if any the missing data

library(visdat)
?visdat
vis_dat(consumers)

vis_dat(consumer_preferences)

Lets merge now.

?merge
CONSUMERS <- 
merge(x = consumers, 
      y = consumer_preferences, 
      by = c("Consumer_ID") 
      )

2.2 Restaurants

2.2.0.1 Visualize

vis_dat(restaurants)

vis_dat(restaurant_cuisines)

vis_dat(ratings)

Check if any the missing data

RESTAURANT <-
merge(x = restaurants, 
      y = restaurant_cuisines, 
      by.x = c("Restaurant_ID"),
      by.y = c("Restaurant_ID")
      )

2.3 FINAL MERGE

2.3.0.1 Visualize

vis_dat(ratings)

  nrow(ratings)
## [1] 1161
vis_dat(CONSUMERS)

  nrow(CONSUMERS)
## [1] 330
vis_dat(RESTAURANT)

  nrow(RESTAURANT)
## [1] 112
table(duplicated(x = ratings$Consumer_ID))
## 
## FALSE  TRUE 
##   138  1023
MERGED_1 <- 
merge(x = ratings, 
      y = CONSUMERS, 
      by = c("Consumer_ID")
      )

MERGED<-
merge(x = MERGED_1, 
      y = RESTAURANT, 
      by = c("Restaurant_ID")
      )

remove(MERGED_1)
remove(list=c("CONSUMERS",
              "RESTAURANT")
       )

vis_dat(MERGED)

3 AVERAGE CUISING RATINGS (given by people)

str(MERGED)
## 'data.frame':    3429 obs. of  33 variables:
##  $ Restaurant_ID        : int  132560 132560 132560 132560 132572 132572 132572 132572 132572 132572 ...
##  $ Consumer_ID          : chr  "U1082" "U1050" "U1087" "U1067" ...
##  $ Overall_Rating       : int  0 0 1 1 1 1 1 1 2 2 ...
##  $ Food_Rating          : int  0 2 2 0 2 2 2 2 2 2 ...
##  $ Service_Rating       : int  0 0 1 0 1 1 1 1 2 2 ...
##  $ City.x               : chr  "Ciudad Victoria" "Ciudad Victoria" "Ciudad Victoria" "Ciudad Victoria" ...
##  $ State.x              : chr  "Tamaulipas" "Tamaulipas" "Tamaulipas" "Tamaulipas" ...
##  $ Country.x            : chr  "Mexico" "Mexico" "Mexico" "Mexico" ...
##  $ Latitude.x           : num  23.8 23.8 23.8 23.8 22.1 ...
##  $ Longitude.x          : num  -99.2 -99.2 -99.2 -99.2 -101 ...
##  $ Smoker               : chr  "Yes" "No" "No" "No" ...
##  $ Drink_Level          : chr  "Abstemious" "Social Drinker" "Abstemious" "Abstemious" ...
##  $ Transportation_Method: chr  "Public" "Car" "Public" "Public" ...
##  $ Marital_Status       : chr  "Single" "Single" "Single" "Single" ...
##  $ Children             : chr  "" "Independent" "Independent" "Independent" ...
##  $ Age                  : int  23 23 23 25 29 29 29 29 31 23 ...
##  $ Occupation           : chr  "Student" "Student" "Student" "Student" ...
##  $ Budget               : chr  "Medium" "Medium" "Medium" "Medium" ...
##  $ Preferred_Cuisine    : chr  "Mexican" "Mexican" "Chinese" "Mexican" ...
##  $ Name                 : chr  "Puesto de Gorditas" "Puesto de Gorditas" "Puesto de Gorditas" "Puesto de Gorditas" ...
##  $ City.y               : chr  "Ciudad Victoria" "Ciudad Victoria" "Ciudad Victoria" "Ciudad Victoria" ...
##  $ State.y              : chr  "Tamaulipas" "Tamaulipas" "Tamaulipas" "Tamaulipas" ...
##  $ Country.y            : chr  "Mexico" "Mexico" "Mexico" "Mexico" ...
##  $ Zip_Code             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Latitude.y           : num  23.8 23.8 23.8 23.8 22.1 ...
##  $ Longitude.y          : num  -99.2 -99.2 -99.2 -99.2 -101 ...
##  $ Alcohol_Service      : chr  "None" "None" "None" "None" ...
##  $ Smoking_Allowed      : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Price                : chr  "Low" "Low" "Low" "Low" ...
##  $ Franchise            : chr  "No" "No" "No" "No" ...
##  $ Area                 : chr  "Open" "Open" "Open" "Open" ...
##  $ Parking              : chr  "Public" "Public" "Public" "Public" ...
##  $ Cuisine              : chr  "Regional" "Regional" "Regional" "Regional" ...
# Load the dplyr package if not already loaded
# install.packages("dplyr")
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group the data by cuisine and calculate the average rating for each cuisine
cuisine_ratings <- MERGED %>%
  group_by(Preferred_Cuisine) %>%
  summarize(
    Average_Overall_Rating = mean(Overall_Rating),
    Average_Food_Rating = mean(Food_Rating),
    Average_Service_Rating = mean(Service_Rating)
  )

# Print the resulting data frame
head(cuisine_ratings)
## # A tibble: 6 × 4
##   Preferred_Cuisine Average_Overall_Rating Average_Food_Rating
##   <chr>                              <dbl>               <dbl>
## 1 Afghan                             1.02                0.889
## 2 African                            0                   0    
## 3 American                           0.956               0.989
## 4 Armenian                           0                   0    
## 5 Asian                              0.405               0.429
## 6 Australian                         0.519               0.370
## # ℹ 1 more variable: Average_Service_Rating <dbl>

4 AVERAGE RESTAURANT RATINGS (given by people)

# Calculate the average ratings for all restaurants
average_restaurant_ratings <- MERGED %>%
  group_by(Restaurant_ID) %>%
  summarize(
    Average_Overall_Rating = mean(Overall_Rating),
    Average_Food_Rating = mean(Food_Rating),
    Average_Service_Rating = mean(Service_Rating)
  )

# Print the resulting data frame
head(average_restaurant_ratings)
## # A tibble: 6 × 4
##   Restaurant_ID Average_Overall_Rating Average_Food_Rating
##           <int>                  <dbl>               <dbl>
## 1        132560                   0.5                 1   
## 2        132572                   1.03                1.55
## 3        132583                   1                   1   
## 4        132584                   1.33                1.5 
## 5        132594                   0.6                 1.2 
## 6        132608                   1                   1.17
## # ℹ 1 more variable: Average_Service_Rating <dbl>

5 Average ratings by restaurant and cuisine (given by people)

# Group the data by Restaurant_ID and Preferred_Cuisine
average_ratings_restaurant_cuisine <- MERGED %>%
  group_by(Restaurant_ID, Preferred_Cuisine) %>%
  summarize(
    Average_Overall_Rating = mean(Overall_Rating),
    Average_Food_Rating = mean(Food_Rating),
    Average_Service_Rating = mean(Service_Rating)
  )
## `summarise()` has grouped output by 'Restaurant_ID'. You can override using the
## `.groups` argument.
# Print the resulting data frame
head(average_ratings_restaurant_cuisine)
## # A tibble: 6 × 5
## # Groups:   Restaurant_ID [2]
##   Restaurant_ID Preferred_Cuisine Average_Overall_Rating Average_Food_Rating
##           <int> <chr>                              <dbl>               <dbl>
## 1        132560 Chinese                            1                   2    
## 2        132560 Mexican                            0.333               0.667
## 3        132572 American                           1                   1.33 
## 4        132572 Asian                              1                   2    
## 5        132572 Bar                                1                   2    
## 6        132572 Breakfast                          2                   2    
## # ℹ 1 more variable: Average_Service_Rating <dbl>