tidycensus::census_api_key(Sys.getenv("Census_API")) #importing census API key from environment
## To install your API key for use in future sessions, run this function with `install = TRUE`.
Q1. Which city did you choose? I chose Cambridge, MA as showcased in this chunk of code and the image produced outlining the census tracts within the county (Middlesex County).
#### Tract polygons for the Yelp query
tract1 <- suppressMessages(
get_acs(geography = "tract", # or "block group", "county", "state" etc.
state = "MA",
county = c("Middlesex County"),
variables = c(hhincome = 'B19019_001'),
year = 2021,
survey = "acs5", # American Community Survey 5-year estimate
geometry = TRUE, # returns sf objects
output = "wide") # wide vs. long
)
##
|
| | 0%
|
|== | 3%
|
|==== | 5%
|
|===== | 8%
|
|======= | 10%
|
|========= | 12%
|
|========== | 15%
|
|============ | 17%
|
|================== | 25%
|
|===================== | 30%
|
|========================= | 35%
|
|=================================== | 51%
|
|========================================= | 58%
|
|================================================================== | 94%
|
|======================================================================| 100%
cambridge <- tigris::places('MA') %>%
filter(NAME == 'Cambridge')
## Retrieving data for the year 2022
##
|
| | 0%
|
|= | 1%
|
|==== | 6%
|
|============ | 17%
|
|================= | 24%
|
|================== | 25%
|
|====================== | 32%
|
|============================== | 43%
|
|================================= | 47%
|
|================================== | 49%
|
|===================================== | 53%
|
|====================================================== | 78%
|
|========================================================= | 82%
|
|=========================================================== | 85%
|
|============================================================== | 89%
|
|================================================================ | 92%
|
|================================================================== | 94%
|
|======================================================================| 100%
tract_cambridge <- tract1[cambridge,]
# View the data
message(sprintf("nrow: %s, ncol: %s", nrow(tract_cambridge), ncol(tract_cambridge)))
## nrow: 48, ncol: 5
tract_cambridge %>% head() %>% knitr::kable() # Ignore this kable() function. This function is for neatly displaying tables on HTML document.
| GEOID | NAME | hhincomeE | hhincomeM | geometry | |
|---|---|---|---|---|---|
| 7 | 25017353300 | Census Tract 3533, Middlesex County, Massachusetts | 145526 | 19246 | MULTIPOLYGON (((-71.11716 4… |
| 8 | 25017353700 | Census Tract 3537, Middlesex County, Massachusetts | 92969 | 34316 | MULTIPOLYGON (((-71.11858 4… |
| 9 | 25017350701 | Census Tract 3507.01, Middlesex County, Massachusetts | 107132 | 13773 | MULTIPOLYGON (((-71.13363 4… |
| 10 | 25017352800 | Census Tract 3528, Middlesex County, Massachusetts | 115924 | 14493 | MULTIPOLYGON (((-71.10083 4… |
| 15 | 25017354100 | Census Tract 3541, Middlesex County, Massachusetts | 120688 | 9322 | MULTIPOLYGON (((-71.13058 4… |
| 18 | 25017354601 | Census Tract 3546.01, Middlesex County, Massachusetts | 107206 | 39782 | MULTIPOLYGON (((-71.15845 4… |
# Function: Get tract-wise radius
get_r <- function(poly, epsg_id){
# Get bounding box of a given polygon
bb <- st_bbox(poly)
# Get lat & long coordinates of any one corner of the bounding box.
bb_corner <- st_point(c(bb[1], bb[2])) %>% st_sfc(crs = epsg_id)
# Get centroid of the bb
bb_center_x <- (bb[3]+bb[1])/2
bb_center_y <- (bb[4]+bb[2])/2
bb_center <- st_point(c(bb_center_x, bb_center_y)) %>% st_sfc(crs = epsg_id) %>% st_sf()
# Get the distance between bb_p and c
r <- st_distance(bb_corner, bb_center)
# Multiply 1.1 to make the circle a bit larger than the Census Tract.
# See the Yelp explanation of their radius parameter to see why we do this.
bb_center$radius <- r*1.1
return(bb_center)
}
epsg_id <- 4326
# We use a functional (sapply) to apply this custom function to each Census Tract.
r4all_apply <- tract_cambridge %>%
st_geometry() %>%
st_transform(crs = epsg_id) %>%
lapply(., function(x) get_r(x, epsg_id = epsg_id))
r4all_apply <- bind_rows(r4all_apply)
# Appending X Y coordinates as seprate columns
ready_4_yelp <- r4all_apply %>%
mutate(x = st_coordinates(.)[,1],
y = st_coordinates(.)[,2])
tmap_mode('view')
## tmap mode set to interactive viewing
ready_4_yelp %>%
# Draw a buffer centered at the centroid of Tract polygons.
# Radius of the buffer is the radius we just calculated using loop
st_buffer(., dist = .$radius) %>%
# Display this buffer in red
tm_shape(.) + tm_polygons(alpha = 0.5, col = 'red') +
# Display the original polygon in blue
tm_shape(tract_cambridge) + tm_borders(col= 'blue')
# FUNCTION
get_yelp <- function(tract, category){
# ----------------------------------
# Gets one row of tract information (1,) and category name (str),
# Outputs a list of business data.frame
Sys.sleep(1)
n <- 1
# First request --------------------------------------------------------------
resp <- business_search(api_key = Sys.getenv("Yelp_API"),
categories = category,
latitude = tract$y,
longitude = tract$x,
offset = (n - 1) * 50, # = 0 when n = 1
radius = round(tract$radius),
limit = 50)
# Calculate how many requests are needed in total
required_n <- ceiling(resp$total/50)
# out is where the results will be appended to.
out <- vector("list", required_n)
# Store the business information to nth slot in out
out[[n]] <- resp$businesses
# Change the name of the elements to the total required_n
# This is to know if there are more than 1000 businesses,
# we know how many.
names(out)[n] <- required_n
# Throw error if more than 1000
if (resp$total >= 1000)
{
# glue formats string by inserting {n} with what's currently stored in object n.
print(glue::glue("{n}th row has >= 1000 businesses."))
# Stop before going into the loop because we need to
# break down Census Tract to something smaller.
return(out)
}
else
{
# add 1 to n
n <- n + 1
# Now we know required_n -----------------------------------------------------
# Starting a loop
while(n <= required_n){
resp <- business_search(api_key = Sys.getenv("Yelp_API"),
categories = category,
latitude = tract$y,
longitude = tract$x,
offset = (n - 1) * 50,
radius = round(tract$radius),
limit = 50)
out[[n]] <- resp$businesses
n <- n + 1
} #<< end of while loop
# Merge all elements in the list into a single data frame
out <- out %>% bind_rows()
return(out)
}
}
Fetching bubble tea data for all census tracts:
# Ensure ready_4_yelp has valid rows
if (nrow(ready_4_yelp) > 0) {
# Prepare a collector for the Yelp data
boba_all_list <- vector("list", nrow(ready_4_yelp))
# Loop through all Census Tracts
for (row in 1:nrow(ready_4_yelp)) {
# Safely retrieve Yelp data for the current row, with error handling
tryCatch({
boba_all_list[[row]] <- suppressMessages(get_yelp(ready_4_yelp[row, ], "bubbletea"))
print(paste0("Current row: ", row)) # Progress message
}, error = function(e) {
print(paste("Error at row:", row, e$message)) # Handle any errors
})
}
# Collapse the list into a data frame
boba_all <- boba_all_list %>% bind_rows() %>% as_tibble()
} else {
print("ready_4_yelp has no rows!")
}
## [1] "Current row: 1"
## [1] "Current row: 2"
## [1] "Current row: 3"
## [1] "Current row: 4"
## [1] "Current row: 5"
## [1] "Current row: 6"
## [1] "Current row: 7"
## [1] "Current row: 8"
## [1] "Current row: 9"
## [1] "Current row: 10"
## [1] "Current row: 11"
## [1] "Current row: 12"
## [1] "Current row: 13"
## [1] "Current row: 14"
## [1] "Current row: 15"
## [1] "Current row: 16"
## [1] "Current row: 17"
## [1] "Current row: 18"
## [1] "Current row: 19"
## [1] "Current row: 20"
## [1] "Current row: 21"
## [1] "Current row: 22"
## [1] "Current row: 23"
## [1] "Current row: 24"
## [1] "Current row: 25"
## [1] "Current row: 26"
## [1] "Current row: 27"
## [1] "Current row: 28"
## [1] "Current row: 29"
## [1] "Current row: 30"
## [1] "Current row: 31"
## [1] "Current row: 32"
## [1] "Current row: 33"
## [1] "Current row: 34"
## [1] "Current row: 35"
## [1] "Current row: 36"
## [1] "Current row: 37"
## [1] "Current row: 38"
## [1] "Current row: 39"
## [1] "Current row: 40"
## [1] "Current row: 41"
## [1] "Current row: 42"
## [1] "Current row: 43"
## [1] "Current row: 44"
## [1] "Current row: 45"
## [1] "Current row: 46"
## [1] "Current row: 47"
## [1] "Current row: 48"
Fetching colleges/universities data for all census tracts:
# Ensure ready_4_yelp has valid rows
if (nrow(ready_4_yelp) > 0) {
# Prepare a collector for the Yelp data
uni_all_list <- vector("list", nrow(ready_4_yelp))
# Loop through all Census Tracts
for (row in 1:nrow(ready_4_yelp)) {
# Safely retrieve Yelp data for the current row, with error handling
tryCatch({
uni_all_list[[row]] <- suppressMessages(get_yelp(ready_4_yelp[row, ], "collegeuniv"))
print(paste0("Current row: ", row)) # Progress message
}, error = function(e) {
print(paste("Error at row:", row, e$message)) # Handle any errors
})
}
# Collapse the list into a data frame
uni_all <- uni_all_list %>% bind_rows() %>% as_tibble()
} else {
print("ready_4_yelp has no rows!")
}
## [1] "Current row: 1"
## [1] "Current row: 2"
## [1] "Current row: 3"
## [1] "Current row: 4"
## [1] "Current row: 5"
## [1] "Current row: 6"
## [1] "Current row: 7"
## [1] "Current row: 8"
## [1] "Current row: 9"
## [1] "Current row: 10"
## [1] "Current row: 11"
## [1] "Current row: 12"
## [1] "Current row: 13"
## [1] "Current row: 14"
## [1] "Current row: 15"
## [1] "Current row: 16"
## [1] "Current row: 17"
## [1] "Current row: 18"
## [1] "Current row: 19"
## [1] "Current row: 20"
## [1] "Current row: 21"
## [1] "Current row: 22"
## [1] "Current row: 23"
## [1] "Current row: 24"
## [1] "Current row: 25"
## [1] "Current row: 26"
## [1] "Current row: 27"
## [1] "Current row: 28"
## [1] "Current row: 29"
## [1] "Current row: 30"
## [1] "Current row: 31"
## [1] "Current row: 32"
## [1] "Current row: 33"
## [1] "Current row: 34"
## [1] "Current row: 35"
## [1] "Current row: 36"
## [1] "Current row: 37"
## [1] "Current row: 38"
## [1] "Current row: 39"
## [1] "Current row: 40"
## [1] "Current row: 41"
## [1] "Current row: 42"
## [1] "Current row: 43"
## [1] "Current row: 44"
## [1] "Current row: 45"
## [1] "Current row: 46"
## [1] "Current row: 47"
## [1] "Current row: 48"
Combining results from both responses using bind_rows as suggested:
combined_businesses <- bind_rows(uni_all, boba_all)
Q2. How many businesses are there in total (whole county)? There are 275 businesses in Cambridge including both categories Colleges/Universities and Boba shops.
length(boba_all$name) + length(uni_all$name)
## [1] 539
Q3. How many businesses are there for each business category (in whole county)? When printing the total business of each category, I get 265 colleges/universities and 10 bubble tea shops.
length(boba_all$name)
## [1] 114
length(uni_all$name)
## [1] 425
Q4. Upon visual inspection, can you see any noticeable spatial patterns to the way they are distributed across the city (e.g., clustering of businesses at some parts of the city)? By the splitting the combined map into two, we can see there are many more universities than boba shops in Cambridge. Interestingly, both categories are clustered in the same areas of the city.
Q5. (Optional) Are there any other interesting findings? When comparing the two categories, I found that universities have significantly worse ratings than Boba shops.
Total businesses (both categories) plotted:
# Define breaks for the categories from 0.0 to 5.0
breaks <- seq(0.0, 5.0, length.out = 6) # 5 categories, so 6 breaks
yelp_sf <- combined_businesses %>%
mutate(x = .$coordinates$longitude,
y = .$coordinates$latitude) %>%
mutate(rating_category = cut(.$rating, breaks = breaks,
include.lowest = TRUE,
labels = c("0.0-1.0", "1.1-2.0", "2.1-3.0", "3.1-4.0", "4.1-5.0"))) %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
# Map
tm_shape(yelp_sf) +
tm_dots(col = "rating_category", id = "name", palette = "viridis")
Only bubble tea shops plotted:
# Define breaks for the categories from 0.0 to 5.0
breaks <- seq(0.0, 5.0, length.out = 6) # 5 categories, so 6 breaks
yelp_sf <- boba_all %>%
mutate(x = .$coordinates$longitude,
y = .$coordinates$latitude) %>%
mutate(rating_category = cut(.$rating, breaks = breaks,
include.lowest = TRUE,
labels = c("0.0-1.0", "1.1-2.0", "2.1-3.0", "3.1-4.0", "4.1-5.0"))) %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
# Map
tm_shape(yelp_sf) +
tm_dots(col = "rating_category", id = "name", palette = "viridis")
Only colleges/universities plotted:
# Define breaks for the categories from 0.0 to 5.0
breaks <- seq(0.0, 5.0, length.out = 6) # 5 categories, so 6 breaks
yelp_sf <- uni_all %>%
mutate(x = .$coordinates$longitude,
y = .$coordinates$latitude) %>%
mutate(rating_category = cut(.$rating, breaks = breaks,
include.lowest = TRUE,
labels = c("0.0-1.0", "1.1-2.0", "2.1-3.0", "3.1-4.0", "4.1-5.0"))) %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
# Map
tm_shape(yelp_sf) +
tm_dots(col = "rating_category", id = "name", palette = "viridis")