library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tidycensus)
## Warning: package 'tidycensus' was built under R version 4.3.3
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(knitr)
combined_businesses <- readRDS("combined_yelp_categories.rds")
combined_businesses
## # A tibble: 539 × 18
## id alias name image_url is_closed url review_count categories rating
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <list> <dbl>
## 1 v1JfKtF… geor… Geor… "https:/… FALSE http… 3 <df> 3.7
## 2 LEIfqhs… bost… Bost… "https:/… FALSE http… 1 <df> 4
## 3 icS9mTf… bost… Bost… "" FALSE http… 1 <df> 5
## 4 w8LSLGu… bost… Bost… "https:/… FALSE http… 1 <df> 5
## 5 AhJCN6c… bost… Bost… "https:/… FALSE http… 1 <df> 5
## 6 E1DIjRU… coll… Coll… "" FALSE http… 2 <df> 4
## 7 Rnv-NVi… bost… Bost… "https:/… FALSE http… 0 <df> 0
## 8 zu6p7QP… bost… Bost… "" FALSE http… 1 <df> 2
## 9 9hiwb0z… bost… Bost… "https:/… FALSE http… 0 <df> 0
## 10 5J-cMNa… bost… Bost… "" FALSE http… 0 <df> 0
## # ℹ 529 more rows
## # ℹ 9 more variables: coordinates <df[,2]>, transactions <list>,
## # location <df[,8]>, phone <chr>, display_phone <chr>, distance <dbl>,
## # business_hours <list>, attributes <df[,4]>, price <chr>
combined_businesses %>% distinct()
## # A tibble: 539 × 18
## id alias name image_url is_closed url review_count categories rating
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <list> <dbl>
## 1 v1JfKtF… geor… Geor… "https:/… FALSE http… 3 <df> 3.7
## 2 LEIfqhs… bost… Bost… "https:/… FALSE http… 1 <df> 4
## 3 icS9mTf… bost… Bost… "" FALSE http… 1 <df> 5
## 4 w8LSLGu… bost… Bost… "https:/… FALSE http… 1 <df> 5
## 5 AhJCN6c… bost… Bost… "https:/… FALSE http… 1 <df> 5
## 6 E1DIjRU… coll… Coll… "" FALSE http… 2 <df> 4
## 7 Rnv-NVi… bost… Bost… "https:/… FALSE http… 0 <df> 0
## 8 zu6p7QP… bost… Bost… "" FALSE http… 1 <df> 2
## 9 9hiwb0z… bost… Bost… "https:/… FALSE http… 0 <df> 0
## 10 5J-cMNa… bost… Bost… "" FALSE http… 0 <df> 0
## # ℹ 529 more rows
## # ℹ 9 more variables: coordinates <df[,2]>, transactions <list>,
## # location <df[,8]>, phone <chr>, display_phone <chr>, distance <dbl>,
## # business_hours <list>, attributes <df[,4]>, price <chr>
flattened_categories <- combined_businesses %>%
unnest_wider(categories, names_sep = "_") #un-nesting the list that contained the two variables "alias" and "title"
flattened_coordinates <- flattened_categories %>% # using flattened_categories instead of combined_businesses in order to keep all changes together
unnest_wider(coordinates, names_sep = "_") # splits the nested columns 'latitude' and 'longitude'
flattened_location <-flattened_coordinates %>%
unnest_wider(location, names_sep = "_")
flattened_hours <-flattened_location %>%
unnest_wider(business_hours, names_sep = "_")
final_flattened <-flattened_hours %>%
unnest_wider(attributes, names_sep = "_")
final_flattened
## # A tibble: 539 × 32
## id alias name image_url is_closed url review_count categories_alias
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <list<chr>>
## 1 v1JfKtFT… geor… Geor… "https:/… FALSE http… 3 [1]
## 2 LEIfqhsd… bost… Bost… "https:/… FALSE http… 1 [1]
## 3 icS9mTfO… bost… Bost… "" FALSE http… 1 [3]
## 4 w8LSLGuV… bost… Bost… "https:/… FALSE http… 1 [3]
## 5 AhJCN6cy… bost… Bost… "https:/… FALSE http… 1 [2]
## 6 E1DIjRUI… coll… Coll… "" FALSE http… 2 [1]
## 7 Rnv-NVi_… bost… Bost… "https:/… FALSE http… 0 [1]
## 8 zu6p7QPj… bost… Bost… "" FALSE http… 1 [1]
## 9 9hiwb0zf… bost… Bost… "https:/… FALSE http… 0 [1]
## 10 5J-cMNa8… bost… Bost… "" FALSE http… 0 [1]
## # ℹ 529 more rows
## # ℹ 24 more variables: categories_title <list<chr>>, rating <dbl>,
## # coordinates_latitude <dbl>, coordinates_longitude <dbl>,
## # transactions <list>, location_address1 <chr>, location_address2 <chr>,
## # location_address3 <chr>, location_city <chr>, location_zip_code <chr>,
## # location_country <chr>, location_state <chr>,
## # location_display_address <list<list>>, phone <chr>, display_phone <chr>, …
combined_businesses %>% filter(!is.na(coordinates$latitude), !is.na(coordinates$longitude)) #resulted in same number of rows because there were no NAs in the coordinates column.
## # A tibble: 539 × 18
## id alias name image_url is_closed url review_count categories rating
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <list> <dbl>
## 1 v1JfKtF… geor… Geor… "https:/… FALSE http… 3 <df> 3.7
## 2 LEIfqhs… bost… Bost… "https:/… FALSE http… 1 <df> 4
## 3 icS9mTf… bost… Bost… "" FALSE http… 1 <df> 5
## 4 w8LSLGu… bost… Bost… "https:/… FALSE http… 1 <df> 5
## 5 AhJCN6c… bost… Bost… "https:/… FALSE http… 1 <df> 5
## 6 E1DIjRU… coll… Coll… "" FALSE http… 2 <df> 4
## 7 Rnv-NVi… bost… Bost… "https:/… FALSE http… 0 <df> 0
## 8 zu6p7QP… bost… Bost… "" FALSE http… 1 <df> 2
## 9 9hiwb0z… bost… Bost… "https:/… FALSE http… 0 <df> 0
## 10 5J-cMNa… bost… Bost… "" FALSE http… 0 <df> 0
## # ℹ 529 more rows
## # ℹ 9 more variables: coordinates <df[,2]>, transactions <list>,
## # location <df[,8]>, phone <chr>, display_phone <chr>, distance <dbl>,
## # business_hours <list>, attributes <df[,4]>, price <chr>
cambridge_boundary <- readRDS("cambridge_boundary.RDS")
cambridge_boundary
## Simple feature collection with 48 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -71.1777 ymin: 42.3524 xmax: -71.06406 ymax: 42.41815
## Geodetic CRS: NAD83
## First 10 features:
## GEOID NAME hhincomeE
## 7 25017353300 Census Tract 3533, Middlesex County, Massachusetts 145526
## 8 25017353700 Census Tract 3537, Middlesex County, Massachusetts 92969
## 9 25017350701 Census Tract 3507.01, Middlesex County, Massachusetts 107132
## 10 25017352800 Census Tract 3528, Middlesex County, Massachusetts 115924
## 15 25017354100 Census Tract 3541, Middlesex County, Massachusetts 120688
## 18 25017354601 Census Tract 3546.01, Middlesex County, Massachusetts 107206
## 26 25017352900 Census Tract 3529, Middlesex County, Massachusetts 160833
## 30 25017353600 Census Tract 3536, Middlesex County, Massachusetts 120417
## 33 25017352200 Census Tract 3522, Middlesex County, Massachusetts 70119
## 34 25017351203 Census Tract 3512.03, Middlesex County, Massachusetts 139896
## hhincomeM geometry
## 7 19246 MULTIPOLYGON (((-71.11716 4...
## 8 34316 MULTIPOLYGON (((-71.11858 4...
## 9 13773 MULTIPOLYGON (((-71.13363 4...
## 10 14493 MULTIPOLYGON (((-71.10083 4...
## 15 9322 MULTIPOLYGON (((-71.13058 4...
## 18 39782 MULTIPOLYGON (((-71.15845 4...
## 26 36422 MULTIPOLYGON (((-71.10795 4...
## 30 18210 MULTIPOLYGON (((-71.11998 4...
## 33 15122 MULTIPOLYGON (((-71.08787 4...
## 34 23579 MULTIPOLYGON (((-71.10727 4...
#manually extracting bounding box coordinates printed from "cambridge_boundary"
xmin <- -71.1777
ymin <- 42.3524
xmax <- -71.06406
ymax <- 42.41815
final_filtered <- final_flattened %>%
filter(coordinates_latitude >= ymin &
coordinates_latitude <= ymax &
coordinates_longitude >= xmin &
coordinates_longitude <= xmax)
final_filtered
## # A tibble: 426 × 32
## id alias name image_url is_closed url review_count categories_alias
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <list<chr>>
## 1 5ns6T8ek… coll… Coll… "" FALSE http… 0 [1]
## 2 cF-Nobqz… harv… Harv… "https:/… FALSE http… 52 [1]
## 3 i4f2eTZ_… harv… Harv… "https:/… FALSE http… 15 [1]
## 4 mknY9p6y… harv… Harv… "https:/… FALSE http… 5 [1]
## 5 V_OTJgMK… harv… Harv… "https:/… FALSE http… 1 [1]
## 6 gbeCaZrC… harv… Harv… "https:/… FALSE http… 21 [2]
## 7 FVp4d2rC… harv… Harv… "https:/… FALSE http… 6 [1]
## 8 Eq5FZAu-… will… Will… "" FALSE http… 4 [1]
## 9 ZK989WTm… harv… Harv… "https:/… FALSE http… 4 [1]
## 10 MES9xnky… harv… Harv… "" FALSE http… 2 [1]
## # ℹ 416 more rows
## # ℹ 24 more variables: categories_title <list<chr>>, rating <dbl>,
## # coordinates_latitude <dbl>, coordinates_longitude <dbl>,
## # transactions <list>, location_address1 <chr>, location_address2 <chr>,
## # location_address3 <chr>, location_city <chr>, location_zip_code <chr>,
## # location_country <chr>, location_state <chr>,
## # location_display_address <list<list>>, phone <chr>, display_phone <chr>, …
final_filtered %>% group_by(categories_alias) %>% summarize(avg_review_count = mean(review_count, na.rm = TRUE)) %>% kable()
| categories_alias | avg_review_count |
|---|---|
| collegeuniv | 6.187279 |
| collegeuniv, medcenters | 21.000000 |
| educationservices, collegeuniv | 0.000000 |
| festivals , collegeuniv | 0.000000 |
| museums , collegeuniv | 0.000000 |
| employmentagencies, collegeuniv , catering | 0.000000 |
| physicians , healthtrainers, collegeuniv | 0.000000 |
| collegeuniv , religiousorgs | 0.000000 |
| collegeuniv, libraries | 8.000000 |
| collegeuniv, venues | 6.000000 |
| venues , collegeuniv | 4.000000 |
| collegeuniv, itservices | 0.000000 |
| collegeuniv, recreation | 0.000000 |
| collegeuniv , educationservices | 0.000000 |
| bubbletea, juicebars | 13.666667 |
| bubbletea | 18.958333 |
| bubbletea , breakfast_brunch, wraps | 181.000000 |
| bubbletea, coffee , juicebars | 61.000000 |
| bubbletea, coffee | 18.277778 |
| coffee , bubbletea, taiwanese | 41.000000 |
| desserts , bubbletea, icecream | 35.000000 |
| bubbletea, desserts | 24.000000 |
| hawaiian , bubbletea, poke | 80.000000 |
| korean , bubbletea, hotdog | 103.000000 |
| coffee , bubbletea, juicebars | 160.000000 |
| bubbletea, coffee , desserts | 81.000000 |
final_filtered %>% group_by(categories_alias) %>% summarize(avg_rating = mean(rating, na.rm = TRUE)) %>% kable()
| categories_alias | avg_rating |
|---|---|
| collegeuniv | 1.660071 |
| collegeuniv, medcenters | 3.300000 |
| educationservices, collegeuniv | 0.000000 |
| festivals , collegeuniv | 0.000000 |
| museums , collegeuniv | 0.000000 |
| employmentagencies, collegeuniv , catering | 0.000000 |
| physicians , healthtrainers, collegeuniv | 0.000000 |
| collegeuniv , religiousorgs | 0.000000 |
| collegeuniv, libraries | 4.600000 |
| collegeuniv, venues | 4.700000 |
| venues , collegeuniv | 3.800000 |
| collegeuniv, itservices | 0.000000 |
| collegeuniv, recreation | 0.000000 |
| collegeuniv , educationservices | 0.000000 |
| bubbletea, juicebars | 4.588889 |
| bubbletea | 4.087500 |
| bubbletea , breakfast_brunch, wraps | 3.900000 |
| bubbletea, coffee , juicebars | 4.100000 |
| bubbletea, coffee | 4.438889 |
| coffee , bubbletea, taiwanese | 4.200000 |
| desserts , bubbletea, icecream | 4.000000 |
| bubbletea, desserts | 3.400000 |
| hawaiian , bubbletea, poke | 4.600000 |
| korean , bubbletea, hotdog | 3.600000 |
| coffee , bubbletea, juicebars | 3.900000 |
| bubbletea, coffee , desserts | 4.300000 |
final_filtered %>%
count(rating) %>%
arrange(desc(n)) %>%
kable()
| rating | n |
|---|---|
| 0.0 | 206 |
| 5.0 | 28 |
| 4.0 | 21 |
| 4.6 | 18 |
| 4.7 | 18 |
| 3.9 | 15 |
| 4.2 | 13 |
| 3.7 | 11 |
| 4.5 | 9 |
| 3.0 | 8 |
| 3.3 | 8 |
| 3.2 | 7 |
| 3.8 | 7 |
| 4.3 | 7 |
| 4.9 | 7 |
| 3.5 | 6 |
| 4.4 | 6 |
| 2.0 | 5 |
| 2.7 | 5 |
| 3.4 | 5 |
| 4.1 | 5 |
| 3.6 | 4 |
| 4.8 | 4 |
| 2.5 | 2 |
| 2.8 | 1 |
The data was not affected by the deletion of duplicate rows or NA values within the coordinates column. The most useful data cleaning part was flattening the nested columns. Unnesting the columns was important because a total of 5 columns were nested, and it made all these other variables accessible to understand and analyze the data. Filtering the data based on the city’s boundary also had an affect as the data frame went from 539 rows to 426.
Fore the two categories combined, the most frequent rating score was 0. There were a total of 206 businesses that rated 0 or had no rating. Least used rating is 2.8.
I discovered that the boba shops have significantly higher ratings and review counts than colleges. This does not surprise me as food/restaurants are commonly reviewed whereas colleges and universities are not. When colleges are rated, it’s normally specific departments as well and not as a whole. However, the sate of MA is known for its education so I thought there would be higher ratings for the colleges. It’s interesting how all boba shops are successful. It looks like Cambridge, MA is very competitive for bubble tea and only the good shops stand.