# tidycensus::census_api_key(Sys.getenv("census_api"))
library(tidycensus)
library(sf)
library(tmap)
library(jsonlite)
library(tidyverse)
library(httr)
library(reshape2)
library(here)
library(yelpr)
library(knitr)
library(skimr)
library(tigris)
tmap_mode("view")
tidycensus::census_api_key(Sys.getenv("census_api_key"))
yelp_api_key = Sys.getenv("yelp_api_key")

Instructions

For this assignment, imagine that you are a bicycle enthusiast and want to set up a bicycle rental store in a place within the City of Atlanta. The question in your mind is about where this store should be located. You realize that you can get the locations of all bike rental stores from Yelp API (categories: bikerentals). Another data that might help you could be the census data showing how many commuters commute on a bike (which could be a proxy for bike-friendliness of a community and environment). Here you will be looking for places where there is gap between bike stores (rentals in this case - but hopefully these also have other bike related services) and tracts with bike commuters.

To complete this assignment, follow the directions below:

Download Census and Yelp data

  1. Download the Census ACS 5-year estimate data for Census Tracts in Fulton and DeKalb counties, focusing on commuting. You may also need to include other variables. You may use older Census boundary (e.g., 2019) to reduce the number of Yelp API queries in Step 3.
# Function of downloading census data

# Define the variables related to commuting
vars <- c("total" = "B08301_001",  # Total
          "percent_bike_to_work"="S0801_C01_011",
          "bike"="B08301_014",
          "drove_alone" = "B08301_003",  
          "carpooled" = "B08301_004",  
          "public_transportation" = "B08301_010",  
          "walked" = "B08301_018", 
          "other_means" = "B08301_019", 
          "worked_at_home" = "B08301_021") 


census <- suppressMessages(
  get_acs(geography = "tract", # or "block group", "county", "state" etc. 
          state = "GA",
          county = c("Fulton", "Dekalb"), 
          variables = vars,
          year = 2021,
          survey = "acs5", # American Community Survey 5-year estimate
          geometry = TRUE, # returns sf objects
          output = "wide") # wide vs. long
)

# Download the Census ACS 5-year estimate data for Census Tracts in Fulton and DeKalb counties

# You may also need to include other variables.
  
# Save 
save(census, file = 'census.RData')
# You may use older Census boundary (e.g., 2019) to reduce the number of Yelp API queries in Step

old_census <- suppressMessages(
  get_acs(geography = "tract", # or "block group", "county", "state" etc. 
          state = "GA",
          county = c("Fulton", "Dekalb"), 
          variables = vars,
          year = 2019,
          survey = "acs5", # American Community Survey 5-year estimate
          geometry = TRUE, # returns sf objects
          output = "wide") # wide vs. long
)
  
# Save  
save(old_census, file = 'old_census.RData')
load('census.RData')
  1. Download the City of Atlanta boundary and use it to filter the Census Tracts that either fall within or intersect with the City of Atlanta. Download Yelp data on categories = bikerentals for the City of Atlanta. If you encounter issues with Queries-Per-Second limit, try Sys.sleep() as you see appropriate. Remember to save the downloaded Yelp data (e.g., as .RData format) in your hard drive to avoid making excessive API requests. Once you’ve saved the data, you can simply read the Yelp data from your hard drive and proceed from there.
# Download the City of Atlanta boundary 
atlanta <- places('GA') %>% 
  filter(NAME == 'Atlanta') 
# Save 
save(atlanta, file = 'atlanta.RData')
load('atlanta.RData')

# Filter the Census Tracts that either fall within or intersect with the City of Atlanta.
census_in_atlanta <- st_intersection(census, atlanta)
basemap <- tm_shape(census) + tm_borders() + tm_shape(census_in_atlanta) + tm_borders(col = 'red')
basemap

# Save 
save(census_in_atlanta, file = 'census_in_atlanta.RData')
old_census_in_atlanta <- st_intersection(old_census, atlanta)
## Error in eval(expr, envir, enclos): object 'old_census' not found
old_basemap <- tm_shape(census) + tm_borders() + tm_shape(census_in_atlanta) + tm_borders(col = 'red')
## Error in eval(expr, envir, enclos): object 'census' not found
tmap_arrange(basemap, old_basemap)
## Error in eval(expr, envir, enclos): object 'basemap' not found
# Download Yelp data on categories = bikerentals for the City of Atlanta.

# FUNCTION
get_yelp <- function(tract, category){
  # ----------------------------------
  # Gets one row of tract information (1,) and category name (str),
  # Outputs a list of business data.frame
  Sys.sleep(1)
  n <- 1
  # First request --------------------------------------------------------------
  resp <- business_search(api_key = Sys.getenv("yelp_api_key"), 
                          categories = category, 
                          latitude = tract$y, 
                          longitude = tract$x, 
                          offset = (n - 1) * 50, # = 0 when n = 1
                          radius = round(tract$radius), 
                          limit = 50)
  # Calculate how many requests are needed in total
  required_n <- ceiling(resp$total/50)
  
  # out is where the results will be appended to.
  out <- vector("list", required_n)
  
  # Store the business information to nth slot in out
  out[[n]] <- resp$businesses
    
  # Change the name of the elements to the total required_n
  # This is to know if there are more than 1000 businesses,
  # we know how many.
  names(out)[n] <- required_n
  
  # Throw error if more than 1000
  if (resp$total >= 1000)
  {
    # glue formats string by inserting {n} with what's currently stored in object n.
    print(glue::glue("{n}th row has >= 1000 businesses."))
    # Stop before going into the loop because we need to
    # break down Census Tract to something smaller.
    return(out)
  } 
  else 
  {
    # add 1 to n
    n <- n + 1
    # Now we know required_n -----------------------------------------------------
    # Starting a loop
    # pagination
    # because we don't know how many times to loop
    while(n <= required_n){
      resp <- business_search(api_key = Sys.getenv("yelp_api_key"), 
                              categories = category, 
                              latitude = tract$y, 
                              longitude = tract$x, 
                              offset = (n - 1) * 50, 
                              radius = round(tract$radius), 
                              limit = 50)
      
      out[[n]] <- resp$businesses
      
      n <- n + 1
    } #<< end of while loop
    
    # Merge all elements in the list into a single data frame
    out <- out %>% bind_rows()
    
    return(out)
  }
}
# Function: Get tract-wise radius
get_r <- function(poly, epsg_id){
  #---------------------
  # Takes: a single POLYGON or LINESTRTING
  # Outputs: distance between the centroid of the boundingbox and a corner of the bounding box
  #---------------------
  
  # Get bounding box of a given polygon
  bb <- st_bbox(poly)
  # Get lat & long coordinates of any one corner of the bounding box.
  bb_corner <- st_point(c(bb[1], bb[2])) %>% st_sfc(crs = epsg_id)
  # Get centroid of the bb
  bb_center_x <- (bb[3]+bb[1])/2
  bb_center_y <- (bb[4]+bb[2])/2
  bb_center <- st_point(c(bb_center_x, bb_center_y)) %>% st_sfc(crs = epsg_id) %>% st_sf()
    
  # Get the distance between bb_p and c
  r <- st_distance(bb_corner, bb_center)
  # Multiply 1.1 to make the circle a bit larger than the Census Tract.
  # See the Yelp explanation of their radius parameter to see why we do this.
  bb_center$radius <- r*1.2
  return(bb_center)
}
epsg_id <- 4326
r4all_loop <- vector("list", nrow(census_in_atlanta))
## Error in eval(expr, envir, enclos): object 'census_in_atlanta' not found
r4all_apply <- census_in_atlanta %>%
  st_geometry() %>% 
  st_transform(crs = epsg_id) %>% 
  lapply(., function(x) get_r(x, epsg_id = epsg_id))
## Error in eval(expr, envir, enclos): object 'census_in_atlanta' not found
r4all_apply <- bind_rows(r4all_apply)
## Error in eval(expr, envir, enclos): object 'r4all_apply' not found
# Appending X Y coordinates as seprate columns
ready_4_yelp <- r4all_apply %>% 
  mutate(x = st_coordinates(.)[,1],
         y = st_coordinates(.)[,2])
## Error in eval(expr, envir, enclos): object 'r4all_apply' not found
# Yelp download function
# Prepare a collector
yelp_list <- vector("list", nrow(ready_4_yelp))
## Error in eval(expr, envir, enclos): object 'ready_4_yelp' not found
# Looping through all Census Tracts
for (row in 1:nrow(ready_4_yelp)){
  if (row > 150){
    break
  }
  yelp_list[[row]] <- suppressMessages(get_yelp(ready_4_yelp[row,], "bikerentals"))
  if (row %% 10 == 0){
    print(paste0("Current row: ", row))
  }
}
## Error in eval(expr, envir, enclos): object 'ready_4_yelp' not found
# Save
yelp_bikes <- yelp_list %>% bind_rows() %>% as_tibble()
## Error in eval(expr, envir, enclos): object 'yelp_list' not found
# print
yelp_bikes %>% print(width=1000)
## Error in eval(expr, envir, enclos): object 'yelp_bikes' not found
save(yelp_bikes, file = 'yelp_bikes.RData')
## Error in save(yelp_bikes, file = "yelp_bikes.RData"): object 'yelp_bikes' not found
load('yelp_bikes.RData')
yelp_bikes %>% print(width=1000)
## # A tibble: 45 × 16
##    id                     alias                                  
##    <chr>                  <chr>                                  
##  1 rbf8bVY0cuqyGZtbn691lg pedego-electric-bikes-atlanta-atlanta-2
##  2 tMNV5bj4rqud0cRRQiPbWA outback-bikes-atlanta                  
##  3 b3nacMG8PR77GNCaI4RBKA atlanta-bicycle-barn-atlanta           
##  4 tMNV5bj4rqud0cRRQiPbWA outback-bikes-atlanta                  
##  5 rbf8bVY0cuqyGZtbn691lg pedego-electric-bikes-atlanta-atlanta-2
##  6 tMNV5bj4rqud0cRRQiPbWA outback-bikes-atlanta                  
##  7 tMNV5bj4rqud0cRRQiPbWA outback-bikes-atlanta                  
##  8 rbf8bVY0cuqyGZtbn691lg pedego-electric-bikes-atlanta-atlanta-2
##  9 rbf8bVY0cuqyGZtbn691lg pedego-electric-bikes-atlanta-atlanta-2
## 10 JkkHRgYj0mvdgbMXFm436w civil-bikes-atlanta                    
##    name                         
##    <chr>                        
##  1 Pedego Electric Bikes Atlanta
##  2 Outback Bikes                
##  3 Atlanta Bicycle Barn         
##  4 Outback Bikes                
##  5 Pedego Electric Bikes Atlanta
##  6 Outback Bikes                
##  7 Outback Bikes                
##  8 Pedego Electric Bikes Atlanta
##  9 Pedego Electric Bikes Atlanta
## 10 Civil Bikes                  
##    image_url                                                           
##    <chr>                                                               
##  1 https://s3-media2.fl.yelpcdn.com/bphoto/Z7KlxC0vcyoxOCSxbH3Rrg/o.jpg
##  2 https://s3-media4.fl.yelpcdn.com/bphoto/rnpOTcs3WwjTq1JsfV1b8w/o.jpg
##  3 https://s3-media3.fl.yelpcdn.com/bphoto/Ik2pMce41_MRcg3svjTbSQ/o.jpg
##  4 https://s3-media4.fl.yelpcdn.com/bphoto/rnpOTcs3WwjTq1JsfV1b8w/o.jpg
##  5 https://s3-media2.fl.yelpcdn.com/bphoto/Z7KlxC0vcyoxOCSxbH3Rrg/o.jpg
##  6 https://s3-media4.fl.yelpcdn.com/bphoto/rnpOTcs3WwjTq1JsfV1b8w/o.jpg
##  7 https://s3-media4.fl.yelpcdn.com/bphoto/rnpOTcs3WwjTq1JsfV1b8w/o.jpg
##  8 https://s3-media2.fl.yelpcdn.com/bphoto/Z7KlxC0vcyoxOCSxbH3Rrg/o.jpg
##  9 https://s3-media2.fl.yelpcdn.com/bphoto/Z7KlxC0vcyoxOCSxbH3Rrg/o.jpg
## 10 https://s3-media4.fl.yelpcdn.com/bphoto/JqTLT-chrqtbyuoB-52gdw/o.jpg
##    is_closed
##    <lgl>    
##  1 FALSE    
##  2 FALSE    
##  3 FALSE    
##  4 FALSE    
##  5 FALSE    
##  6 FALSE    
##  7 FALSE    
##  8 FALSE    
##  9 FALSE    
## 10 FALSE    
##    url                                                                          
##    <chr>                                                                        
##  1 https://www.yelp.com/biz/pedego-electric-bikes-atlanta-atlanta-2?adjust_crea…
##  2 https://www.yelp.com/biz/outback-bikes-atlanta?adjust_creative=aOMz3ElZ4MnHC…
##  3 https://www.yelp.com/biz/atlanta-bicycle-barn-atlanta?adjust_creative=aOMz3E…
##  4 https://www.yelp.com/biz/outback-bikes-atlanta?adjust_creative=aOMz3ElZ4MnHC…
##  5 https://www.yelp.com/biz/pedego-electric-bikes-atlanta-atlanta-2?adjust_crea…
##  6 https://www.yelp.com/biz/outback-bikes-atlanta?adjust_creative=aOMz3ElZ4MnHC…
##  7 https://www.yelp.com/biz/outback-bikes-atlanta?adjust_creative=aOMz3ElZ4MnHC…
##  8 https://www.yelp.com/biz/pedego-electric-bikes-atlanta-atlanta-2?adjust_crea…
##  9 https://www.yelp.com/biz/pedego-electric-bikes-atlanta-atlanta-2?adjust_crea…
## 10 https://www.yelp.com/biz/civil-bikes-atlanta?adjust_creative=aOMz3ElZ4MnHCyS…
##    review_count categories   rating coordinates$latitude $longitude transactions
##           <int> <list>        <dbl>                <dbl>      <dbl> <list>      
##  1           15 <df [3 × 2]>    4.5                 33.7      -84.4 <list [0]>  
##  2          103 <df [3 × 2]>    4                   33.8      -84.3 <list [0]>  
##  3          128 <df [3 × 2]>    4.5                 33.8      -84.4 <list [0]>  
##  4          103 <df [3 × 2]>    4                   33.8      -84.3 <list [0]>  
##  5           15 <df [3 × 2]>    4.5                 33.7      -84.4 <list [0]>  
##  6          103 <df [3 × 2]>    4                   33.8      -84.3 <list [0]>  
##  7          103 <df [3 × 2]>    4                   33.8      -84.3 <list [0]>  
##  8           15 <df [3 × 2]>    4.5                 33.7      -84.4 <list [0]>  
##  9           15 <df [3 × 2]>    4.5                 33.7      -84.4 <list [0]>  
## 10           11 <df [2 × 2]>    4.5                 33.7      -84.4 <list [0]>  
##    location$address1      $address2 $address3 $city   $zip_code $country $state
##    <chr>                  <chr>     <chr>     <chr>   <chr>     <chr>    <chr> 
##  1 "414 Bill Kennedy Way" "Ste 101" <NA>      Atlanta 30316     US       GA    
##  2 "484 Moreland Ave NE"  "Ste E"   ""        Atlanta 30307     US       GA    
##  3 "151 Sampson St NE"    ""        ""        Atlanta 30312     US       GA    
##  4 "484 Moreland Ave NE"  "Ste E"   ""        Atlanta 30307     US       GA    
##  5 "414 Bill Kennedy Way" "Ste 101" <NA>      Atlanta 30316     US       GA    
##  6 "484 Moreland Ave NE"  "Ste E"   ""        Atlanta 30307     US       GA    
##  7 "484 Moreland Ave NE"  "Ste E"   ""        Atlanta 30307     US       GA    
##  8 "414 Bill Kennedy Way" "Ste 101" <NA>      Atlanta 30316     US       GA    
##  9 "414 Bill Kennedy Way" "Ste 101" <NA>      Atlanta 30316     US       GA    
## 10 ""                      <NA>     ""        Atlanta 30312     US       GA    
##    $display_address phone        display_phone  distance price
##    <list>           <chr>        <chr>             <dbl> <chr>
##  1 <chr [3]>        +14049753915 (404) 975-3915    7824. <NA> 
##  2 <chr [3]>        +14046884878 (404) 688-4878     557. $$   
##  3 <chr [2]>        +17708732413 (770) 873-2413     646. $$   
##  4 <chr [3]>        +14046884878 (404) 688-4878    1276. $$   
##  5 <chr [3]>        +14049753915 (404) 975-3915     732. <NA> 
##  6 <chr [3]>        +14046884878 (404) 688-4878    7892. $$   
##  7 <chr [3]>        +14046884878 (404) 688-4878   11842. $$   
##  8 <chr [3]>        +14049753915 (404) 975-3915    8187. <NA> 
##  9 <chr [3]>        +14049753915 (404) 975-3915    1453. <NA> 
## 10 <chr [1]>        +14043238754 (404) 323-8754    1455. <NA> 
## # ℹ 35 more rows

However, even when you’ve stopped using the code for downloading Yelp data, do not delete them from your RMD; instructors need to see the entire code. You can use code chunk option {r, eval=FALSE} to keep the code for downloading Yelp in your RMD but not actually run it. Read this to learn more.

Tidy data

load('bikes.RData')

# Deleting duplicated rows.
drop_duplicates <- function(df, name) {
  unique_df <- df %>% distinct(id, .keep_all=T)
  print(glue::glue("Before dropping duplicated rows, there were {nrow(df)} rows in {name}. After dropping them, there are {nrow(unique_df)} rows."))
  return(unique_df)
}

# Note: The distinct function from the ‘dplyr’ package keeps the first occurrence of each duplicated row and removes the rest. 

drop_dup_bikes <- drop_duplicates(yelp_bikes, 'id')
## Before dropping duplicated rows, there were 45 rows in id. After dropping them, there are 9 rows.
# Flattening nested columns that have multiple variables in one column. Pay particular attention to the “category” column.
concate_list <- function(x){
  # x is a data frame with columns "alias" and "title" from Yelp$categories
  # returns a character vector containing category concatenated titles 
  titles <- x[["title"]] %>% str_c(collapse = ", ")
  return(titles)
}

flatten <- function(df, col){
  flat_df <- df %>% 
    mutate({{col}} := map_chr(.data[[col]], concate_list))
  return(flat_df)
}

flat1_bikes <- flatten(drop_dup_bikes, 'categories')

flat2_bikes <- flat1_bikes %>% 
  jsonlite::flatten() %>% 
  as_tibble() 

flat2_bikes$coordinates.latitude %>% head()
## [1] 33.74172 33.76756 33.75833 33.74468 33.80578 33.78293
# Delete rows that have missing data in coordinates variable and other variables of interest. It’s okay to have NAs in variables that are not of interest.

# Fist, let's verify that the 4 missing values in lat/long columns are in the same rows.
identical(is.na(flat2_bikes$coordinates.latitude),
          is.na(flat2_bikes$coordinates.longitude)) # Yes, they are in the same 4 rows.
## [1] TRUE
# Drop them.
dropna1_bikes <- flat2_bikes %>% 
  drop_na(coordinates.longitude)

# Dropping NAs in price
dropna2_bikes <- dropna1_bikes %>% 
  drop_na(price)
# Delete rows that fall outside of the City of Atlanta boundary.
sf_bikes <- dropna1_bikes %>% 
  st_as_sf(coords=c("coordinates.longitude", "coordinates.latitude"), crs = 4326)

# Filter the points
in_atlanta_bikes <- st_intersection(sf_bikes, census_in_atlanta)
## Error in eval(expr, envir, enclos): object 'census_in_atlanta' not found
save(in_atlanta_bikes, file = "in_atlanta_bikes.RData")
## Error in save(in_atlanta_bikes, file = "in_atlanta_bikes.RData"): object 'in_atlanta_bikes' not found
map_eda <- tm_shape(in_atlanta_bikes) + tm_dots(size=0.05, col = "review_count", style="quantile", palette = "Blues") 
## Error in eval(expr, envir, enclos): object 'in_atlanta_bikes' not found
map_eda
## Error in eval(expr, envir, enclos): object 'map_eda' not found
# Examine the associations among the variables bike rentals and bike commuting.
skim(in_atlanta_bikes)
## Error in eval(expr, envir, enclos): object 'in_atlanta_bikes' not found

EDA

  1. Find the right metrics: you may need to create new variables such as proportion, density, etc. Show at least one graph and at least one map with two variables that you are interested in examining (you can do more!).
load("in_atlanta_bikes.RData")
load("census_in_atlanta.RData")

census_in_atlanta <- st_set_crs(census_in_atlanta, 4326)
## Warning: st_crs<- : replacing crs does not reproject data; use st_transform for
## that
# proportion 
bike_percentage <- census_in_atlanta$bikeM / census_in_atlanta$totalM
census_in_atlanta$bike_percentage <- bike_percentage  
  
# density: bike store/ census tract
indices <- st_intersects(in_atlanta_bikes, census_in_atlanta)
counts <- sapply(indices, length)
areas <- st_area(census_in_atlanta)
density <- counts / as.numeric(areas)
## Warning in counts/as.numeric(areas): longer object length is not a multiple of
## shorter object length
# Add the density to the census_tract data frame as a new column
census_in_atlanta$density <- density

# plot map
bike_store_density_map <- tm_shape(census_in_atlanta)+tm_polygons(col = "density", style="quantile", palette = "Blues")
bike_percentage_map <- tm_shape(census_in_atlanta)+tm_polygons(col = "bike_percentage", style="quantile")
tmap_arrange(bike_store_density_map, bike_percentage_map)
## Legend labels were too wide. The labels have been resized to 0.44, 0.44, 0.44, 0.44, 0.03. Increase legend.width (argument of tm_layout) to make the legend wider and therefore the labels larger.

# plot graph
ggplot(census_in_atlanta, aes(x = density, y = bike_percentage)) +
geom_point() +
labs(x = "Bike Store Density", y = "Bike Commute Percentage", title = "Scatter Plot of Bike Store Density vs Bike Commute Percentage") +
theme_minimal()

# plot graph
ggplot(census_in_atlanta, aes(x = density, y = bike_percentage)) +
geom_point() +
labs(x = "Bike Store Density", y = "Bike Commute Percentage", title = "Scatter Plot of Bike Store Density vs Bike Commute Percentage") +
theme_minimal()