tidycensus::census_api_key(Sys.getenv("Census_API")) #importing census API key from environment
## To install your API key for use in future sessions, run this function with `install = TRUE`.

Libraries needed for assignment

Step 1: Establishing tract through census API key

Q1. Which city did you choose? I chose Cambridge, MA as showcased in this chunk of code and the image produced outlining the census tracts within the county (Middlesex County).

#### Tract polygons for the Yelp query
tract1 <- suppressMessages(
  get_acs(geography = "tract", # or "block group", "county", "state" etc. 
          state = "MA",
          county = c("Middlesex County"), 
          variables = c(hhincome = 'B19019_001'),
          year = 2021,
          survey = "acs5", # American Community Survey 5-year estimate
          geometry = TRUE, # returns sf objects
          output = "wide") # wide vs. long
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |===================================                                   |  51%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |======================================================================| 100%
cambridge <- tigris::places('MA') %>% 
  filter(NAME == 'Cambridge') 
## Retrieving data for the year 2022
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |======================================================================| 100%
tract_cambridge <- tract1[cambridge,]

# View the data
message(sprintf("nrow: %s, ncol: %s", nrow(tract_cambridge), ncol(tract_cambridge)))
## nrow: 48, ncol: 5
tract_cambridge %>% head() %>% knitr::kable() # Ignore this kable() function. This function is for neatly displaying tables on HTML document.
GEOID NAME hhincomeE hhincomeM geometry
7 25017353300 Census Tract 3533, Middlesex County, Massachusetts 145526 19246 MULTIPOLYGON (((-71.11716 4…
8 25017353700 Census Tract 3537, Middlesex County, Massachusetts 92969 34316 MULTIPOLYGON (((-71.11858 4…
9 25017350701 Census Tract 3507.01, Middlesex County, Massachusetts 107132 13773 MULTIPOLYGON (((-71.13363 4…
10 25017352800 Census Tract 3528, Middlesex County, Massachusetts 115924 14493 MULTIPOLYGON (((-71.10083 4…
15 25017354100 Census Tract 3541, Middlesex County, Massachusetts 120688 9322 MULTIPOLYGON (((-71.13058 4…
18 25017354601 Census Tract 3546.01, Middlesex County, Massachusetts 107206 39782 MULTIPOLYGON (((-71.15845 4…

Step 2: Visualizing census tracts with buffers around center of each tract.

# Function: Get tract-wise radius
get_r <- function(poly, epsg_id){

  # Get bounding box of a given polygon
  bb <- st_bbox(poly)
  # Get lat & long coordinates of any one corner of the bounding box.
  bb_corner <- st_point(c(bb[1], bb[2])) %>% st_sfc(crs = epsg_id)
  # Get centroid of the bb
  bb_center_x <- (bb[3]+bb[1])/2
  bb_center_y <- (bb[4]+bb[2])/2
  bb_center <- st_point(c(bb_center_x, bb_center_y)) %>% st_sfc(crs = epsg_id) %>% st_sf()
    
  # Get the distance between bb_p and c
  r <- st_distance(bb_corner, bb_center)
  # Multiply 1.1 to make the circle a bit larger than the Census Tract.
  # See the Yelp explanation of their radius parameter to see why we do this.
  bb_center$radius <- r*1.1
  return(bb_center)
}

epsg_id <- 4326

# We use a functional (sapply) to apply this custom function to each Census Tract.
r4all_apply <- tract_cambridge %>%
  st_geometry() %>% 
  st_transform(crs = epsg_id) %>% 
  lapply(., function(x) get_r(x, epsg_id = epsg_id))

r4all_apply <- bind_rows(r4all_apply)

# Appending X Y coordinates as seprate columns
ready_4_yelp <- r4all_apply %>% 
  mutate(x = st_coordinates(.)[,1],
         y = st_coordinates(.)[,2])

tmap_mode('view')
## tmap mode set to interactive viewing
ready_4_yelp %>% 
  # Draw a buffer centered at the centroid of Tract polygons.
  # Radius of the buffer is the radius we just calculated using loop
  st_buffer(., dist = .$radius) %>% 
  # Display this buffer in red
  tm_shape(.) + tm_polygons(alpha = 0.5, col = 'red') +
  # Display the original polygon in blue
  tm_shape(tract_cambridge) + tm_borders(col= 'blue')

Step 3: Applying Yelpr::business_search function to get Yelp data

# FUNCTION
get_yelp <- function(tract, category){
  # ----------------------------------
  # Gets one row of tract information (1,) and category name (str),
  # Outputs a list of business data.frame
  Sys.sleep(1)
  n <- 1
  # First request --------------------------------------------------------------
  resp <- business_search(api_key = Sys.getenv("Yelp_API"),
                          categories = category,
                          latitude = tract$y,
                          longitude = tract$x,
                          offset = (n - 1) * 50, # = 0 when n = 1
                          radius = round(tract$radius),
                          limit = 50)
  # Calculate how many requests are needed in total
  required_n <- ceiling(resp$total/50)

  # out is where the results will be appended to.
  out <- vector("list", required_n)

  # Store the business information to nth slot in out
  out[[n]] <- resp$businesses

  # Change the name of the elements to the total required_n
  # This is to know if there are more than 1000 businesses,
  # we know how many.
  names(out)[n] <- required_n

  # Throw error if more than 1000
  if (resp$total >= 1000)
  {
    # glue formats string by inserting {n} with what's currently stored in object n.
    print(glue::glue("{n}th row has >= 1000 businesses."))
    # Stop before going into the loop because we need to
    # break down Census Tract to something smaller.
    return(out)
  }
  else
  {
    # add 1 to n
    n <- n + 1

    # Now we know required_n -----------------------------------------------------
    # Starting a loop
    while(n <= required_n){
      resp <- business_search(api_key = Sys.getenv("Yelp_API"),
                              categories = category,
                              latitude = tract$y,
                              longitude = tract$x,
                              offset = (n - 1) * 50,
                              radius = round(tract$radius),
                              limit = 50)

      out[[n]] <- resp$businesses

      n <- n + 1
    } #<< end of while loop

    # Merge all elements in the list into a single data frame
    out <- out %>% bind_rows()

    return(out)
  }
}

Step 4: Gathering data for all other census tracts (done two times for each category)

Fetching bubble tea data for all census tracts:

# Ensure ready_4_yelp has valid rows
if (nrow(ready_4_yelp) > 0) {

  # Prepare a collector for the Yelp data
  boba_all_list <- vector("list", nrow(ready_4_yelp))

  # Loop through all Census Tracts
  for (row in 1:nrow(ready_4_yelp)) {
    # Safely retrieve Yelp data for the current row, with error handling
    tryCatch({
      boba_all_list[[row]] <- suppressMessages(get_yelp(ready_4_yelp[row, ], "bubbletea"))
      print(paste0("Current row: ", row))  # Progress message
    }, error = function(e) {
      print(paste("Error at row:", row, e$message))  # Handle any errors
    })
  }

  # Collapse the list into a data frame
  boba_all <- boba_all_list %>% bind_rows() %>% as_tibble()

} else {
  print("ready_4_yelp has no rows!")
}
## [1] "Current row: 1"
## [1] "Current row: 2"
## [1] "Current row: 3"
## [1] "Current row: 4"
## [1] "Current row: 5"
## [1] "Current row: 6"
## [1] "Current row: 7"
## [1] "Current row: 8"
## [1] "Current row: 9"
## [1] "Current row: 10"
## [1] "Current row: 11"
## [1] "Current row: 12"
## [1] "Current row: 13"
## [1] "Current row: 14"
## [1] "Current row: 15"
## [1] "Current row: 16"
## [1] "Current row: 17"
## [1] "Current row: 18"
## [1] "Current row: 19"
## [1] "Current row: 20"
## [1] "Current row: 21"
## [1] "Current row: 22"
## [1] "Current row: 23"
## [1] "Current row: 24"
## [1] "Current row: 25"
## [1] "Current row: 26"
## [1] "Current row: 27"
## [1] "Current row: 28"
## [1] "Current row: 29"
## [1] "Current row: 30"
## [1] "Current row: 31"
## [1] "Current row: 32"
## [1] "Current row: 33"
## [1] "Current row: 34"
## [1] "Current row: 35"
## [1] "Current row: 36"
## [1] "Current row: 37"
## [1] "Current row: 38"
## [1] "Current row: 39"
## [1] "Current row: 40"
## [1] "Current row: 41"
## [1] "Current row: 42"
## [1] "Current row: 43"
## [1] "Current row: 44"
## [1] "Current row: 45"
## [1] "Current row: 46"
## [1] "Current row: 47"
## [1] "Current row: 48"

Fetching colleges/universities data for all census tracts:

# Ensure ready_4_yelp has valid rows
if (nrow(ready_4_yelp) > 0) {

  # Prepare a collector for the Yelp data
  uni_all_list <- vector("list", nrow(ready_4_yelp))

  # Loop through all Census Tracts
  for (row in 1:nrow(ready_4_yelp)) {
    # Safely retrieve Yelp data for the current row, with error handling
    tryCatch({
      uni_all_list[[row]] <- suppressMessages(get_yelp(ready_4_yelp[row, ], "collegeuniv"))
      print(paste0("Current row: ", row))  # Progress message
    }, error = function(e) {
      print(paste("Error at row:", row, e$message))  # Handle any errors
    })
  }

  # Collapse the list into a data frame
  uni_all <- uni_all_list %>% bind_rows() %>% as_tibble()


} else {
  print("ready_4_yelp has no rows!")
}
## [1] "Current row: 1"
## [1] "Current row: 2"
## [1] "Current row: 3"
## [1] "Current row: 4"
## [1] "Current row: 5"
## [1] "Current row: 6"
## [1] "Current row: 7"
## [1] "Current row: 8"
## [1] "Current row: 9"
## [1] "Current row: 10"
## [1] "Current row: 11"
## [1] "Current row: 12"
## [1] "Current row: 13"
## [1] "Current row: 14"
## [1] "Current row: 15"
## [1] "Current row: 16"
## [1] "Current row: 17"
## [1] "Current row: 18"
## [1] "Current row: 19"
## [1] "Current row: 20"
## [1] "Current row: 21"
## [1] "Current row: 22"
## [1] "Current row: 23"
## [1] "Current row: 24"
## [1] "Current row: 25"
## [1] "Current row: 26"
## [1] "Current row: 27"
## [1] "Current row: 28"
## [1] "Current row: 29"
## [1] "Current row: 30"
## [1] "Current row: 31"
## [1] "Current row: 32"
## [1] "Current row: 33"
## [1] "Current row: 34"
## [1] "Current row: 35"
## [1] "Current row: 36"
## [1] "Current row: 37"
## [1] "Current row: 38"
## [1] "Current row: 39"
## [1] "Current row: 40"
## [1] "Current row: 41"
## [1] "Current row: 42"
## [1] "Current row: 43"
## [1] "Current row: 44"
## [1] "Current row: 45"
## [1] "Current row: 46"
## [1] "Current row: 47"
## [1] "Current row: 48"

Combining results from both responses using bind_rows as suggested:

combined_businesses <- bind_rows(uni_all, boba_all)

Q2. How many businesses are there in total (whole county)? There are 275 businesses in Cambridge including both categories Colleges/Universities and Boba shops.

length(boba_all$name) + length(uni_all$name)
## [1] 539

Q3. How many businesses are there for each business category (in whole county)? When printing the total business of each category, I get 265 colleges/universities and 10 bubble tea shops.

length(boba_all$name)
## [1] 114
length(uni_all$name)
## [1] 425

Step 5: Visualizing colleges/universities and boba businesses on Cambridge’s census tract layer.

Q4. Upon visual inspection, can you see any noticeable spatial patterns to the way they are distributed across the city (e.g., clustering of businesses at some parts of the city)? By the splitting the combined map into two, we can see there are many more universities than boba shops in Cambridge. Interestingly, both categories are clustered in the same areas of the city.

Q5. (Optional) Are there any other interesting findings? When comparing the two categories, I found that universities have significantly worse ratings than Boba shops.

Total businesses (both categories) plotted:

# Define breaks for the categories from 0.0 to 5.0
breaks <- seq(0.0, 5.0, length.out = 6)  # 5 categories, so 6 breaks

yelp_sf <- combined_businesses %>%
  mutate(x = .$coordinates$longitude,
         y = .$coordinates$latitude) %>%
  mutate(rating_category = cut(.$rating, breaks = breaks,
                                include.lowest = TRUE,
                                labels = c("0.0-1.0", "1.1-2.0", "2.1-3.0", "3.1-4.0", "4.1-5.0"))) %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# Map
tm_shape(yelp_sf) +
  tm_dots(col = "rating_category", id = "name", palette = "viridis")

Only bubble tea shops plotted:

# Define breaks for the categories from 0.0 to 5.0
breaks <- seq(0.0, 5.0, length.out = 6)  # 5 categories, so 6 breaks

yelp_sf <- boba_all %>%
  mutate(x = .$coordinates$longitude,
         y = .$coordinates$latitude) %>%
  mutate(rating_category = cut(.$rating, breaks = breaks,
                                include.lowest = TRUE,
                                labels = c("0.0-1.0", "1.1-2.0", "2.1-3.0", "3.1-4.0", "4.1-5.0"))) %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# Map
tm_shape(yelp_sf) +
  tm_dots(col = "rating_category", id = "name", palette = "viridis")

Only colleges/universities plotted:

# Define breaks for the categories from 0.0 to 5.0
breaks <- seq(0.0, 5.0, length.out = 6)  # 5 categories, so 6 breaks

yelp_sf <- uni_all %>%
  mutate(x = .$coordinates$longitude,
         y = .$coordinates$latitude) %>%
  mutate(rating_category = cut(.$rating, breaks = breaks,
                                include.lowest = TRUE,
                                labels = c("0.0-1.0", "1.1-2.0", "2.1-3.0", "3.1-4.0", "4.1-5.0"))) %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# Map
tm_shape(yelp_sf) +
  tm_dots(col = "rating_category", id = "name", palette = "viridis")