Loading the necessary libraries

library(tidycensus)
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(sf)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(here)
## here() starts at C:/Users/srini/OneDrive/Documents/Urban Analytics
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(dplyr)
library(dplyr)
library(purrr)
library(jsonlite)
library(stringr)

For data cleaning, the following steps will be followed:

#Removing Duplicates

# Loading the data frame
# here::here() resolves the file against the project root reported when
# library(here) was attached, so the script no longer depends on one
# machine's absolute path.
yelp_total <- readRDS(here::here("yelp_total.rds"))

# Remove duplicated businesses: keep one row per `id` and retain every
# other column of that row.
yelp_unique <- yelp_total %>%
  distinct(id, .keep_all = TRUE)

# Report how many rows the de-duplication removed. (This step drops
# duplicates, not NA values, so the message says so.)
glue::glue("Before removing duplicates, there were {nrow(yelp_total)} rows. After removing them, there are {nrow(yelp_unique)} rows") %>%
  print()
## Before removing duplicates, there were 2287 rows. After removing them, there are 785 rows

#Flattening the Data

# Collapse the "title" entries of `x` into one comma-separated string.
# `x` is one element of the `categories` list-column.
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}
# Flatten the de-duplicated Yelp data into a plain table:
#   1. jsonlite::flatten() expands nested data-frame columns into
#      top-level columns.
#   2. The list-columns are collapsed to one comma-separated string per row.
yelp_flat_1 <- jsonlite::flatten(yelp_unique) %>%
  mutate(
    across(
      c(transactions, location.display_address),
      function(col) map_chr(col, function(parts) str_c(parts, collapse = ", "))
    ),
    # categories holds a `title` field per element; concate_list joins them.
    categories = map_chr(categories, concate_list)
  )

#Counting the NA Values in each column of the data frame

# Count the missing values in each column of yelp_flat_1; the result is
# auto-printed, one count per column.
map_dbl(yelp_flat_1, function(column) sum(is.na(column)))
##                              id                           alias 
##                               0                               0 
##                            name                       image_url 
##                               0                               0 
##                       is_closed                             url 
##                               0                               0 
##                    review_count                      categories 
##                               0                               0 
##                          rating                    transactions 
##                               0                               0 
##                           price                           phone 
##                             396                               0 
##                   display_phone                        distance 
##                               0                               0 
##                  business_hours            coordinates.latitude 
##                               0                               0 
##           coordinates.longitude               location.address1 
##                               0                              15 
##               location.address2               location.address3 
##                             239                             297 
##                   location.city               location.zip_code 
##                               0                               0 
##                location.country                  location.state 
##                               0                               0 
##        location.display_address attributes.business_temp_closed 
##                               0                             785 
##             attributes.menu_url         attributes.open24_hours 
##                             444                             785 
## attributes.waitlist_reservation 
##                             768

For the next step, we’ll filter out the businesses that lie outside Camden, keeping only the records whose Yelp address city (`location.city`) is "Camden".

# Keep only the rows whose city field is exactly "Camden".
yelp_data_final <- filter(yelp_flat_1, location.city == "Camden")

# Comparing before and after tidying: row and column counts of the raw
# data versus the final table.
print(glue::glue("nrow before: {nrow(yelp_total)} -> nrow after: {nrow(yelp_data_final)}
            ncol before: {ncol(yelp_total)} -> ncol after: {ncol(yelp_data_final)} \n"))
## nrow before: 2287 -> nrow after: 165
## ncol before: 18 -> ncol after: 29
# Show the first rows of the cleaned data.
print(head(yelp_data_final))
##                       id                          alias                    name
## 1 1rPM_E_NGWxgJA29G2w-rQ           jamaica-vibes-camden           Jamaica VIBES
## 2 03YfclexPdS_yrIbU6ahaA            mcdonalds-camden-11              McDonald's
## 3 XX94YQ5u_VIEzc32qEwWFg          little-caesars-camden          Little Caesars
## 4 A7OyKRz_B3HHm8GqGWQRKQ            taste-budz-camden-3              Taste Budz
## 5 QUM6YFcm1pa-zC04ywYzSA old-san-juan-restaurant-camden Old San Juan Restaurant
## 6 Oq1LlUCCGXwA2Lt3HuZAog       la-ingrata-camden-camden       La Ingrata Camden
##                                                              image_url
## 1 https://s3-media2.fl.yelpcdn.com/bphoto/PmmuAo27ePx-x0GyeJ5Mlw/o.jpg
## 2 https://s3-media4.fl.yelpcdn.com/bphoto/FvvWZqdVWaYqsksdOJz6Fg/o.jpg
## 3 https://s3-media2.fl.yelpcdn.com/bphoto/q0PxIPRm7d-U44sEc2xEFA/o.jpg
## 4                                                                     
## 5 https://s3-media1.fl.yelpcdn.com/bphoto/F5uDuP5NdgE2oaP90QdRQg/o.jpg
## 6 https://s3-media2.fl.yelpcdn.com/bphoto/v7sEt93Wo9L2S5iyhN2JOQ/o.jpg
##   is_closed
## 1     FALSE
## 2     FALSE
## 3     FALSE
## 4     FALSE
## 5     FALSE
## 6     FALSE
##                                                                                                                                                                                           url
## 1           https://www.yelp.com/biz/jamaica-vibes-camden?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 2            https://www.yelp.com/biz/mcdonalds-camden-11?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 3          https://www.yelp.com/biz/little-caesars-camden?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 4            https://www.yelp.com/biz/taste-budz-camden-3?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 5 https://www.yelp.com/biz/old-san-juan-restaurant-camden?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 6       https://www.yelp.com/biz/la-ingrata-camden-camden?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
##   review_count                        categories rating     transactions price
## 1            2      American, Caribbean, Seafood    5.0         delivery  <NA>
## 2           19  Fast Food, Burgers, Coffee & Tea    1.3         delivery     $
## 3            0              Pizza, Chicken Wings    0.0         delivery  <NA>
## 4            0      American, Soul Food, Seafood    0.0                   <NA>
## 5           71      Latin American, Puerto Rican    4.0 pickup, delivery    $$
## 6           13 Tacos, Pizza, New Mexican Cuisine    4.7 pickup, delivery  <NA>
##          phone  display_phone  distance
## 1 +18562037801 (856) 203-7801 1185.9850
## 2 +18569628077 (856) 962-8077 1646.8039
## 3 +18569620500 (856) 962-0500 1651.7150
## 4 +18563506171 (856) 350-6171 1338.0782
## 5 +18569631200 (856) 963-1200  473.1607
## 6 +18562033424 (856) 203-3424  472.8572
##                                                                                                                                                                                                                                                     business_hours
## 1                                                                                                                FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 1100, 1100, 1100, 1100, 1100, 1100, 2100, 2100, 2100, 2100, 2100, 2100, 0, 1, 2, 3, 4, 5, REGULAR, TRUE
## 2                                                                                                 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 3                                                                                          FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 2200, 2200, 2200, 2200, 2200, 2200, 2200, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 4 FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 0630, 1200, 0630, 1200, 0630, 1200, 0630, 0630, 0630, 0630, 1200, 1130, 2200, 1130, 2200, 1130, 2200, 1500, 1500, 1500, 1130, 2200, 0, 0, 1, 1, 2, 2, 3, 4, 5, 6, 6, REGULAR, FALSE
## 5                                                                                          FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 2100, 2100, 2100, 2100, 2100, 2100, 2100, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 6                                                                                          FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 2100, 2100, 2100, 2100, 2100, 2100, 2100, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
##   coordinates.latitude coordinates.longitude      location.address1
## 1             39.90578             -75.09614    3102 Mt Ephraim Ave
## 2             39.91059             -75.09808 2720 Mount Ephraim Ave
## 3             39.91052             -75.09825 2768 Mount Ephraim Ave
## 4             39.90745             -75.09682   2900 Mt Emphraim Ave
## 5             39.94306             -75.08890        217 Marlton Ave
## 6             39.94560             -75.09548        1999 Federal St
##   location.address2 location.address3 location.city location.zip_code
## 1                                <NA>        Camden             08104
## 2                                            Camden             08104
## 3              <NA>              <NA>        Camden             08104
## 4              <NA>                          Camden             08104
## 5              <NA>              <NA>        Camden             08105
## 6              <NA>                          Camden             08105
##   location.country location.state                 location.display_address
## 1               US             NJ    3102 Mt Ephraim Ave, Camden, NJ 08104
## 2               US             NJ 2720 Mount Ephraim Ave, Camden, NJ 08104
## 3               US             NJ 2768 Mount Ephraim Ave, Camden, NJ 08104
## 4               US             NJ   2900 Mt Emphraim Ave, Camden, NJ 08104
## 5               US             NJ        217 Marlton Ave, Camden, NJ 08105
## 6               US             NJ        1999 Federal St, Camden, NJ 08105
##   attributes.business_temp_closed
## 1                              NA
## 2                              NA
## 3                              NA
## 4                              NA
## 5                              NA
## 6                              NA
##                                                             attributes.menu_url
## 1                                                                          <NA>
## 2 https://www.mcdonalds.com/us/en-us/full-menu.html?cid=PS:GCM_MOP:NB::Yelp:All
## 3                                          https://littlecaesars.com/en-us/menu
## 4                                                                          <NA>
## 5                                        https://oldsanjuancamden.com/menu.html
## 6                                              https://laingratacamden.com/menu
##   attributes.open24_hours attributes.waitlist_reservation
## 1                      NA                              NA
## 2                      NA                              NA
## 3                      NA                              NA
## 4                      NA                              NA
## 5                      NA                              NA
## 6                      NA                              NA

#Comparing Messy Data with Tidy Data

# Row and column counts of the raw data versus the final table; the extra
# "\n" in the template puts a blank line between the two report lines.
print(glue::glue("nrow before: {nrow(yelp_total)} -> nrow after: {nrow(yelp_data_final)} \n
            ncol before: {ncol(yelp_total)} -> ncol after: {ncol(yelp_data_final)} \n"))
## nrow before: 2287 -> nrow after: 165 
## 
## ncol before: 18 -> ncol after: 29

#Visualizing Observations

# Total review_count at each half-star rating level from 1.0 to 5.0,
# followed by a bar chart of those totals.
#
# NOTE(review): only ratings equal to one of these levels are counted, the
# same selection the original per-level subsets made. The printed head of
# yelp_data_final shows ratings such as 1.3 and 4.7 (and 0.0), which match
# no level and are therefore left out of every total. Confirm whether
# ratings should instead be binned to the nearest half star.
rating <- c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)

# Derive the totals from the data instead of transcribing printed output,
# so they cannot drift from yelp_data_final.
reviews <- vapply(
  rating,
  function(level) {
    # near() is a tolerance-based equality test for doubles.
    at_level <- dplyr::near(yelp_data_final$rating, level)
    sum(yelp_data_final$review_count[at_level])
  },
  numeric(1)
)

# One report line per rating level (glue is vectorised over the levels).
glue::glue(
  "Sum of review_count for rating {sprintf('%.1f', rating)}: {reviews}"
) %>% print()
## Sum of review_count for rating 1.0: 10
## Sum of review_count for rating 1.5: 6
## Sum of review_count for rating 2.0: 7
## Sum of review_count for rating 2.5: 12
## Sum of review_count for rating 3.0: 33
## Sum of review_count for rating 3.5: 52
## Sum of review_count for rating 4.0: 128
## Sum of review_count for rating 4.5: 68
## Sum of review_count for rating 5.0: 40
df <- data.frame(rating, reviews)

# names.arg is spelled out in full rather than relying on partial argument
# matching; the two colours are recycled across the bars.
barplot(height = df$reviews, names.arg = df$rating,
        col = c("green", "yellow"))

In conclusion, the data cleanup process effectively removed duplicated entries and eliminated businesses located outside Camden’s city boundaries, resulting in a clean and interpretable dataset for analysis. Our findings indicate that reviews of businesses within the restaurant and fitness categories are concentrated at average ratings ranging from 3.5 to 4.5, which together account for most of the reviews counted. This suggests a strong quality of social and healthy environments in Camden, aligning with the positive reviews reflecting the city’s quality of life. Additionally, a significant number of businesses received more than 10 reviews on Yelp, further underscoring the community’s favorable perception of local establishments.