# Consumers
consumers <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/consumers.csv")
consumer_preferences <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/consumer_preferences.csv")
# Restaurants
restaurants <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/restaurants.csv")
restaurant_cuisines <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/restaurant_cuisines.csv")
ratings <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Data Analysis/data_MA/multiple_table_data/Restaurant_Ratings/ratings.csv")
Check if any the missing data
library(visdat)
?visdat
vis_dat(consumers)
vis_dat(consumer_preferences)
Lets merge now.
?merge
CONSUMERS <-
merge(x = consumers,
y = consumer_preferences,
by = c("Consumer_ID")
)
vis_dat(restaurants)
vis_dat(restaurant_cuisines)
vis_dat(ratings)
Check if any the missing data
RESTAURANT <-
merge(x = restaurants,
y = restaurant_cuisines,
by.x = c("Restaurant_ID"),
by.y = c("Restaurant_ID")
)
vis_dat(ratings)
nrow(ratings)
## [1] 1161
vis_dat(CONSUMERS)
nrow(CONSUMERS)
## [1] 330
vis_dat(RESTAURANT)
nrow(RESTAURANT)
## [1] 112
table(duplicated(x = ratings$Consumer_ID))
##
## FALSE TRUE
## 138 1023
MERGED_1 <-
merge(x = ratings,
y = CONSUMERS,
by = c("Consumer_ID")
)
MERGED<-
merge(x = MERGED_1,
y = RESTAURANT,
by = c("Restaurant_ID")
)
remove(MERGED_1)
remove(list=c("CONSUMERS",
"RESTAURANT")
)
vis_dat(MERGED)
str(MERGED)
## 'data.frame': 3429 obs. of 33 variables:
## $ Restaurant_ID : int 132560 132560 132560 132560 132572 132572 132572 132572 132572 132572 ...
## $ Consumer_ID : chr "U1082" "U1050" "U1087" "U1067" ...
## $ Overall_Rating : int 0 0 1 1 1 1 1 1 2 2 ...
## $ Food_Rating : int 0 2 2 0 2 2 2 2 2 2 ...
## $ Service_Rating : int 0 0 1 0 1 1 1 1 2 2 ...
## $ City.x : chr "Ciudad Victoria" "Ciudad Victoria" "Ciudad Victoria" "Ciudad Victoria" ...
## $ State.x : chr "Tamaulipas" "Tamaulipas" "Tamaulipas" "Tamaulipas" ...
## $ Country.x : chr "Mexico" "Mexico" "Mexico" "Mexico" ...
## $ Latitude.x : num 23.8 23.8 23.8 23.8 22.1 ...
## $ Longitude.x : num -99.2 -99.2 -99.2 -99.2 -101 ...
## $ Smoker : chr "Yes" "No" "No" "No" ...
## $ Drink_Level : chr "Abstemious" "Social Drinker" "Abstemious" "Abstemious" ...
## $ Transportation_Method: chr "Public" "Car" "Public" "Public" ...
## $ Marital_Status : chr "Single" "Single" "Single" "Single" ...
## $ Children : chr "" "Independent" "Independent" "Independent" ...
## $ Age : int 23 23 23 25 29 29 29 29 31 23 ...
## $ Occupation : chr "Student" "Student" "Student" "Student" ...
## $ Budget : chr "Medium" "Medium" "Medium" "Medium" ...
## $ Preferred_Cuisine : chr "Mexican" "Mexican" "Chinese" "Mexican" ...
## $ Name : chr "Puesto de Gorditas" "Puesto de Gorditas" "Puesto de Gorditas" "Puesto de Gorditas" ...
## $ City.y : chr "Ciudad Victoria" "Ciudad Victoria" "Ciudad Victoria" "Ciudad Victoria" ...
## $ State.y : chr "Tamaulipas" "Tamaulipas" "Tamaulipas" "Tamaulipas" ...
## $ Country.y : chr "Mexico" "Mexico" "Mexico" "Mexico" ...
## $ Zip_Code : int NA NA NA NA NA NA NA NA NA NA ...
## $ Latitude.y : num 23.8 23.8 23.8 23.8 22.1 ...
## $ Longitude.y : num -99.2 -99.2 -99.2 -99.2 -101 ...
## $ Alcohol_Service : chr "None" "None" "None" "None" ...
## $ Smoking_Allowed : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Price : chr "Low" "Low" "Low" "Low" ...
## $ Franchise : chr "No" "No" "No" "No" ...
## $ Area : chr "Open" "Open" "Open" "Open" ...
## $ Parking : chr "Public" "Public" "Public" "Public" ...
## $ Cuisine : chr "Regional" "Regional" "Regional" "Regional" ...
# Load the dplyr package if not already loaded
# install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Group the data by cuisine and calculate the average rating for each cuisine
cuisine_ratings <- MERGED %>%
group_by(Preferred_Cuisine) %>%
summarize(
Average_Overall_Rating = mean(Overall_Rating),
Average_Food_Rating = mean(Food_Rating),
Average_Service_Rating = mean(Service_Rating)
)
# Print the resulting data frame
head(cuisine_ratings)
## # A tibble: 6 × 4
## Preferred_Cuisine Average_Overall_Rating Average_Food_Rating
## <chr> <dbl> <dbl>
## 1 Afghan 1.02 0.889
## 2 African 0 0
## 3 American 0.956 0.989
## 4 Armenian 0 0
## 5 Asian 0.405 0.429
## 6 Australian 0.519 0.370
## # ℹ 1 more variable: Average_Service_Rating <dbl>
# Calculate the average ratings for all restaurants
average_restaurant_ratings <- MERGED %>%
group_by(Restaurant_ID) %>%
summarize(
Average_Overall_Rating = mean(Overall_Rating),
Average_Food_Rating = mean(Food_Rating),
Average_Service_Rating = mean(Service_Rating)
)
# Print the resulting data frame
head(average_restaurant_ratings)
## # A tibble: 6 × 4
## Restaurant_ID Average_Overall_Rating Average_Food_Rating
## <int> <dbl> <dbl>
## 1 132560 0.5 1
## 2 132572 1.03 1.55
## 3 132583 1 1
## 4 132584 1.33 1.5
## 5 132594 0.6 1.2
## 6 132608 1 1.17
## # ℹ 1 more variable: Average_Service_Rating <dbl>
# Group the data by Restaurant_ID and Preferred_Cuisine
average_ratings_restaurant_cuisine <- MERGED %>%
group_by(Restaurant_ID, Preferred_Cuisine) %>%
summarize(
Average_Overall_Rating = mean(Overall_Rating),
Average_Food_Rating = mean(Food_Rating),
Average_Service_Rating = mean(Service_Rating)
)
## `summarise()` has grouped output by 'Restaurant_ID'. You can override using the
## `.groups` argument.
# Print the resulting data frame
head(average_ratings_restaurant_cuisine)
## # A tibble: 6 × 5
## # Groups: Restaurant_ID [2]
## Restaurant_ID Preferred_Cuisine Average_Overall_Rating Average_Food_Rating
## <int> <chr> <dbl> <dbl>
## 1 132560 Chinese 1 2
## 2 132560 Mexican 0.333 0.667
## 3 132572 American 1 1.33
## 4 132572 Asian 1 2
## 5 132572 Bar 1 2
## 6 132572 Breakfast 2 2
## # ℹ 1 more variable: Average_Service_Rating <dbl>