Loading the necessary libraries
library(tidycensus)
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(sf)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(here)
## here() starts at C:/Users/srini/OneDrive/Documents/Urban Analytics
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(dplyr)
library(dplyr)
library(purrr)
library(jsonlite)
library(stringr)
Data cleaning follows the steps below.
# Removing duplicates ----
# Load the raw Yelp results, then keep a single row per business id.
# here::here() resolves the file relative to the project root (the
# "Urban Analytics" folder reported when `here` was attached), so the
# script no longer depends on one user's absolute Windows path.
yelp_total <- readRDS(here::here("yelp_total.rds"))

# distinct() keeps one row per `id`; .keep_all = TRUE retains all other
# columns. TRUE is spelled out because `T` is an ordinary, reassignable name.
yelp_unique <- yelp_total %>%
  distinct(id, .keep_all = TRUE)

# Report the effect of the de-duplication. The message now names the
# operation actually performed (dropping duplicates, not NA values).
glue::glue(
  "Before dropping duplicates, there were {nrow(yelp_total)} rows. ",
  "After dropping them, there are {nrow(yelp_unique)} rows"
) %>%
  print()
## Before dropping duplicates, there were 2287 rows. After dropping them, there are 785 rows
#Flattening the Data
# Collapse the "title" element of `x` into one comma-separated string.
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}
# Flatten the nested data-frame columns, then reduce each list-column to
# one comma-separated string per row.
yelp_flat_1 <- mutate(
  # 1. Flattening columns with data frame
  jsonlite::flatten(yelp_unique),
  # 2. Handling list-columns
  transactions = map_chr(
    transactions,
    function(parts) str_c(parts, collapse = ", ")
  ),
  location.display_address = map_chr(
    location.display_address,
    function(parts) str_c(parts, collapse = ", ")
  ),
  # Each categories entry is collapsed through its "title" element.
  categories = map_chr(categories, concate_list)
)
# Counting the NA values in each column of the data frame
# (nothing is removed here; the result is only printed).
map_dbl(yelp_flat_1, function(column) sum(is.na(column)))
## id alias
## 0 0
## name image_url
## 0 0
## is_closed url
## 0 0
## review_count categories
## 0 0
## rating transactions
## 0 0
## price phone
## 396 0
## display_phone distance
## 0 0
## business_hours coordinates.latitude
## 0 0
## coordinates.longitude location.address1
## 0 15
## location.address2 location.address3
## 239 297
## location.city location.zip_code
## 0 0
## location.country location.state
## 0 0
## location.display_address attributes.business_temp_closed
## 0 785
## attributes.menu_url attributes.open24_hours
## 444 785
## attributes.waitlist_reservation
## 768
For the next step, we filter out the businesses whose listed city is not Camden, keeping only those located in the city of Camden.
# Keep only the rows whose location.city value is exactly "Camden".
yelp_data_final <- filter(yelp_flat_1, location.city == "Camden")

# Comparing before and after tidying: row and column counts of the raw
# data versus the cleaned result.
print(
  glue::glue("nrow before: {nrow(yelp_total)} -> nrow after: {nrow(yelp_data_final)}
ncol before: {ncol(yelp_total)} -> ncol after: {ncol(yelp_data_final)} \n")
)
## nrow before: 2287 -> nrow after: 165
## ncol before: 18 -> ncol after: 29
# Show the first rows of the cleaned data.
print(head(yelp_data_final))
## id alias name
## 1 1rPM_E_NGWxgJA29G2w-rQ jamaica-vibes-camden Jamaica VIBES
## 2 03YfclexPdS_yrIbU6ahaA mcdonalds-camden-11 McDonald's
## 3 XX94YQ5u_VIEzc32qEwWFg little-caesars-camden Little Caesars
## 4 A7OyKRz_B3HHm8GqGWQRKQ taste-budz-camden-3 Taste Budz
## 5 QUM6YFcm1pa-zC04ywYzSA old-san-juan-restaurant-camden Old San Juan Restaurant
## 6 Oq1LlUCCGXwA2Lt3HuZAog la-ingrata-camden-camden La Ingrata Camden
## image_url
## 1 https://s3-media2.fl.yelpcdn.com/bphoto/PmmuAo27ePx-x0GyeJ5Mlw/o.jpg
## 2 https://s3-media4.fl.yelpcdn.com/bphoto/FvvWZqdVWaYqsksdOJz6Fg/o.jpg
## 3 https://s3-media2.fl.yelpcdn.com/bphoto/q0PxIPRm7d-U44sEc2xEFA/o.jpg
## 4
## 5 https://s3-media1.fl.yelpcdn.com/bphoto/F5uDuP5NdgE2oaP90QdRQg/o.jpg
## 6 https://s3-media2.fl.yelpcdn.com/bphoto/v7sEt93Wo9L2S5iyhN2JOQ/o.jpg
## is_closed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## url
## 1 https://www.yelp.com/biz/jamaica-vibes-camden?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 2 https://www.yelp.com/biz/mcdonalds-camden-11?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 3 https://www.yelp.com/biz/little-caesars-camden?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 4 https://www.yelp.com/biz/taste-budz-camden-3?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 5 https://www.yelp.com/biz/old-san-juan-restaurant-camden?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## 6 https://www.yelp.com/biz/la-ingrata-camden-camden?adjust_creative=IIcoxsHZylAK38wuHdFFOA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=IIcoxsHZylAK38wuHdFFOA
## review_count categories rating transactions price
## 1 2 American, Caribbean, Seafood 5.0 delivery <NA>
## 2 19 Fast Food, Burgers, Coffee & Tea 1.3 delivery $
## 3 0 Pizza, Chicken Wings 0.0 delivery <NA>
## 4 0 American, Soul Food, Seafood 0.0 <NA>
## 5 71 Latin American, Puerto Rican 4.0 pickup, delivery $$
## 6 13 Tacos, Pizza, New Mexican Cuisine 4.7 pickup, delivery <NA>
## phone display_phone distance
## 1 +18562037801 (856) 203-7801 1185.9850
## 2 +18569628077 (856) 962-8077 1646.8039
## 3 +18569620500 (856) 962-0500 1651.7150
## 4 +18563506171 (856) 350-6171 1338.0782
## 5 +18569631200 (856) 963-1200 473.1607
## 6 +18562033424 (856) 203-3424 472.8572
## business_hours
## 1 FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 1100, 1100, 1100, 1100, 1100, 1100, 2100, 2100, 2100, 2100, 2100, 2100, 0, 1, 2, 3, 4, 5, REGULAR, TRUE
## 2 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 3 FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 2200, 2200, 2200, 2200, 2200, 2200, 2200, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 4 FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 0630, 1200, 0630, 1200, 0630, 1200, 0630, 0630, 0630, 0630, 1200, 1130, 2200, 1130, 2200, 1130, 2200, 1500, 1500, 1500, 1130, 2200, 0, 0, 1, 1, 2, 2, 3, 4, 5, 6, 6, REGULAR, FALSE
## 5 FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 2100, 2100, 2100, 2100, 2100, 2100, 2100, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 6 FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 2100, 2100, 2100, 2100, 2100, 2100, 2100, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## coordinates.latitude coordinates.longitude location.address1
## 1 39.90578 -75.09614 3102 Mt Ephraim Ave
## 2 39.91059 -75.09808 2720 Mount Ephraim Ave
## 3 39.91052 -75.09825 2768 Mount Ephraim Ave
## 4 39.90745 -75.09682 2900 Mt Emphraim Ave
## 5 39.94306 -75.08890 217 Marlton Ave
## 6 39.94560 -75.09548 1999 Federal St
## location.address2 location.address3 location.city location.zip_code
## 1 <NA> Camden 08104
## 2 Camden 08104
## 3 <NA> <NA> Camden 08104
## 4 <NA> Camden 08104
## 5 <NA> <NA> Camden 08105
## 6 <NA> Camden 08105
## location.country location.state location.display_address
## 1 US NJ 3102 Mt Ephraim Ave, Camden, NJ 08104
## 2 US NJ 2720 Mount Ephraim Ave, Camden, NJ 08104
## 3 US NJ 2768 Mount Ephraim Ave, Camden, NJ 08104
## 4 US NJ 2900 Mt Emphraim Ave, Camden, NJ 08104
## 5 US NJ 217 Marlton Ave, Camden, NJ 08105
## 6 US NJ 1999 Federal St, Camden, NJ 08105
## attributes.business_temp_closed
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## attributes.menu_url
## 1 <NA>
## 2 https://www.mcdonalds.com/us/en-us/full-menu.html?cid=PS:GCM_MOP:NB::Yelp:All
## 3 https://littlecaesars.com/en-us/menu
## 4 <NA>
## 5 https://oldsanjuancamden.com/menu.html
## 6 https://laingratacamden.com/menu
## attributes.open24_hours attributes.waitlist_reservation
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
# Comparing messy data with tidy data: dimensions of the raw input next
# to those of the cleaned result.
print(
  glue::glue("nrow before: {nrow(yelp_total)} -> nrow after: {nrow(yelp_data_final)} \n
ncol before: {ncol(yelp_total)} -> ncol after: {ncol(yelp_data_final)} \n")
)
## nrow before: 2287 -> nrow after: 165
##
## ncol before: 18 -> ncol after: 29
# Visualizing observations ----
# Total review_count per star rating, printed as text and drawn as a bar
# chart.
#
# Ratings in this data are not limited to half-star steps (the preview of
# yelp_data_final shows 1.3 and 4.7), so exact matches such as
# `rating == 1.5` silently leave those businesses and their reviews out.
# Each rating is therefore snapped to the nearest half star before summing.
# Ratings that snap below 1 (e.g. 0.0 on businesses with 0 reviews) stay
# outside the 1-5 range, as before.
#
# NOTE(review): the totals echoed by the earlier exact-match version
# (10, 6, 7, 12, 33, 52, 128, 68, 40) no longer apply; re-knit the
# document to regenerate the printed sums and the chart.
rating <- seq(1, 5, by = 0.5)

# Multiples of 0.5 are exactly representable as doubles, so `==` against
# `rating` is safe once the values have been snapped to the grid.
rating_bin <- round(yelp_data_final$rating * 2) / 2

# which() discards NA comparisons; na.rm = TRUE guards against missing
# review counts.
reviews <- vapply(
  rating,
  function(level) {
    sum(yelp_data_final$review_count[which(rating_bin == level)], na.rm = TRUE)
  },
  numeric(1)
)

# glue() is vectorised: one line is produced per rating level.
glue::glue(
  "Sum of review_count for rating {sprintf('%.1f', rating)}: {reviews}"
) %>%
  print()

# The chart uses the computed totals instead of hand-copied numbers, so it
# cannot drift from the data. `names.arg` is spelled out in full rather
# than relying on partial argument matching of `names`.
df <- data.frame(rating, reviews)
barplot(
  height = df$reviews,
  names.arg = df$rating,
  col = c("green", "yellow")
)
In conclusion, the data cleanup process removed duplicated entries and eliminated businesses located outside Camden’s city boundaries, resulting in a clean and interpretable dataset for analysis. Our findings indicate that businesses within the restaurant and fitness categories predominantly have average ratings ranging from 3.5 to 4.5. This suggests that Camden offers good-quality social and health-oriented venues, consistent with the positive reviews that reflect the city’s quality of life. Additionally, a significant number of businesses received more than 10 reviews on Yelp, further underscoring the community’s favorable perception of local establishments.