#Include yelp data prep function.
source('../Functions/yelp_and_radius.R')

Mini Project 3

In this mini-assignment, you get to build upon the code you implemented so far and include aspects of the Week 5 lectures on Exploratory Data Analysis.

For this assignment, imagine that you are a bicycle enthusiast and want to set up a bicycle rental store in a place within Fulton and Dekalb counties. The question in your mind is about where this store should be located. You realize that you can get the locations of all bike rental stores from Yelp API (categories: bikerentals). Another data that might help you could be the census data showing how many commuters commute on a bike (which could be a proxy for bike-friendliness of a community and environment). Here you will be looking for places where there is gap between bike stores (rentals in this case - but hopefully these also have other bike related services) and tracts with bike commuters.

To complete this assignment, follow the directions below:

  1. Download Yelp data on categories = bikerentals for Atlanta metro.
    • If you encounter issues in API requests, try Sys.sleep() as you see appropriate.
    • Remember to save the downloaded Yelp data in your hard drive to avoid making excessive API requests. Once you’ve downloaded the data, you can simply read the Yelp data from your hard drive and proceed from there.
    • However, even when you’ve stopped using the code for downloading Yelp data, do not delete them from your RMD; instructors need to see the entire code. You can use code chunk option {r, eval=FALSE} to keep the code for downloading Yelp in your RMD but not actually run it. Read this to learn more.
  2. Download census data for the commuting (and other variables that you may choose to use).
  3. Tidy your data by:
    • Deleting duplicated rows.
    • Flattening nested columns that have multiple variables in one column. Pay particular attention to the “category” column.
    • Delete rows that have missing data in coordinates variable and other variables of interest. It’s okay to have NAs in variables that are not of interest.
    • Delete rows that fall outside of the boundary of your choice.
    • Examine the associations among the variables bike rentals and bike commuting. Find the right metrics (you may need to create new variables e.g., proportion, density, etc.).
    • Show at least one graph and at least one map with two variables that you are interested in examining (you can do more!).

Get and Plot Yelp Data

atl_metro_bikerentals_raw <- get_yelp(atl_metro_geom, 'bikerentals', epsg_id) 
## [1] "Processing Category: bikerentals"
## [1] "Processing Tract: 100 of 1006"
## [1] "Processing Tract: 200 of 1006"
## [1] "Processing Tract: 300 of 1006"
## [1] "Processing Tract: 400 of 1006"
## [1] "Processing Tract: 500 of 1006"
## [1] "Processing Tract: 600 of 1006"
## [1] "Processing Tract: 700 of 1006"
## [1] "Processing Tract: 800 of 1006"
## [1] "Processing Tract: 900 of 1006"
## [1] "Processing Tract: 1000 of 1006"
atl_metro_bikerentals <- atl_metro_bikerentals_raw %>%
  
  #Clean data
  clean_yelp(atl_metro_tracts$geometry, epsg_id)
  
#Create a variable to indicate if a bikeshop is in a tract
atl_metro_data <- st_join(atl_acs, select(atl_metro_bikerentals, c(alias.business, geometry)), join = st_intersects) %>%
  
  #Detect if a bike rental shop is in the tract
  mutate('bikeshop.true' = if_else(is.na(`alias.business`),FALSE,TRUE))

#Skim yelp data
skim(atl_metro_bikerentals)
## Warning: Couldn't find skimmers for class: sfc_POINT, sfc; No user-defined `sfl`
## provided. Falling back to `character`.
Data summary
Name atl_metro_bikerentals
Number of rows 19
Number of columns 24
_______________________
Column type frequency:
character 18
list 2
logical 1
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1.00 22 22 0 19 0
alias.business 0 1.00 14 46 0 19 0
name 0 1.00 4 35 0 19 0
image_url 0 1.00 68 68 0 19 0
url 0 1.00 171 203 0 19 0
alias.category 0 1.00 18 43 0 14 0
title.category 0 1.00 19 52 0 14 0
price 11 0.42 1 4 0 4 0
phone 0 1.00 12 12 0 19 0
display_phone 0 1.00 14 14 0 19 0
location.address1 1 0.95 0 26 3 16 0
location.address2 5 0.74 0 9 8 7 0
location.address3 7 0.63 0 0 12 1 0
location.city 0 1.00 6 14 0 10 0
location.zip_code 0 1.00 5 5 0 16 0
location.country 0 1.00 2 2 0 1 0
location.state 0 1.00 2 2 0 1 0
geometry 0 1.00 21 38 0 19 0

Variable type: list

skim_variable n_missing complete_rate n_unique min_length max_length
transactions 0 1 1 0 0
location.display_address 0 1 19 1 3

Variable type: logical

skim_variable n_missing complete_rate mean count
is_closed 0 1 0 FAL: 19

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
review_count 0 1 24.84 32.51 1.00 3.50 9.00 32.00 124.00 ▇▁▂▁▁
rating 0 1 3.84 1.44 1.00 3.75 4.50 4.75 5.00 ▂▁▁▂▇
distance 0 1 3705.71 6897.39 221.63 886.00 1484.12 2719.89 30529.46 ▇▁▁▁▁
#Plot the businesses
tmap_mode('view')
## tmap mode set to interactive viewing
tm_shape(atl_metro_tracts) +
  tm_polygons() + 
  tm_shape(atl_metro_bikerentals) + tm_dots()

Here are the functions I used to pull in the Yelp data.

#Get radius function

get_radius <- function(geo, epsg_id){
  
  #Get the bounding box of the polygon
  bb <- st_bbox(geo)
  
  #Get the coordinates of a corner on the box
  bb_corner <- st_point(c(bb[1], bb[2])) %>%
    #Adjust it to a point on the Earth's surface
    st_sfc(crs = epsg_id)
  
  #Get the centroid of the bounding box by finding the center of both lines.
  bb_center_x <- (bb[3] + bb[1]) / 2
  bb_center_y <- (bb[4] + bb[2]) / 2
  bb_centroid <- st_point(c(bb_center_x, bb_center_y)) %>%
    #Adjust the point to the earths surface
    #And convert it to an SF object
    st_sfc(crs = epsg_id) %>% st_sf()
  
  #Lastly, calculate the radius
  r <- st_distance(bb_corner, bb_centroid)
  #Multiply by 1.1 to make it slightly bigger than the bounding box
  bb_centroid$radius <- r
  return(bb_centroid)
}

#Get yelp data function
get_yelp <- function(tracts, categories, epsg_id, key = Sys.getenv('yelp_key')){
  
  #Establish output frame
  frame = data.frame()
  
  #For each category
  for(cat in categories){
    paste0('Processing Category: ', cat) %>% print()
    
    #For every tract
    for(i in 1:nrow(tracts)){
      if(i %% 100 == 0){
        paste0('Processing Tract: ', i, ' of ', nrow(tracts)) %>% print()
      }
      
      #Offset ticker
      n <- 1
      
      #Get the list of businesses
      businessList <- business_search(key,
                                      categories = cat,
                                      latitude = tracts[i,]$y,
                                      longitude = tracts[i,]$x,
                                      radius = round(tracts[i,]$radius),
                                      offset = (n - 1)*50,
                                      limit = 50
      ) %>% suppressMessages()
      
      #Append to final dataframe
      frame <- bind_rows(frame, businessList$businesses)
      
      #If the list is not empty
      if(!is_empty(businessList$total)){
        
        #If total is greater than 50
        if(businessList$total > 50){
          
          #Iterate through the remaining businesses
          while(n < ceiling(businessList$total / 50)){
            
            #Append the business list to a final output dataframe
            
            #Adjust the offset ticker
            n <- n + 1
            
            #Get the list of businesses
            businessList <- business_search(key,
                                            categories = cat,
                                            latitude = tracts[i,]$y,
                                            longitude = tracts[i,]$x,
                                            radius = round(tracts[i,]$radius),
                                            offset = (n - 1)*50,
                                            limit = 50
            ) %>% suppressMessages()
            
            #Append the list to the dataframe
            frame <- bind_rows(frame, businessList$businesses)
            
          }
        }
      }
      
      
    }
  }
  
  return(frame)
}


#Build a function to clean yelp data.
clean_yelp <- function(frame, clipping, epsg_id){  
  #Convert x, y coords to ST points, normalized with a crs
  frameGeom <- frame %>%
    #Rename the alias field to unnest the categories
    rename('alias.business' = alias) %>%
    #Unnest and pivot into one categories column
    unnest_wider(categories, transform = ~ paste(.x, collapse = ', ')) %>%
    
    #Rename the category variables
    rename('title.category' = 'title',
           'alias.category' = 'alias'
    ) %>%
    
    #Flatten nested frame
    jsonlite::flatten() %>%
    st_as_sf(coords = c('coordinates.longitude', 'coordinates.latitude'), crs = epsg_id) %>%
    
    #Delete duplicate rows on the basis of the yelp id
    filter(!duplicated(`id`)) %>%
    
    #Delete rows with missing coordinates
    filter(!is.na(geometry)) %>%
    
    #Delete observations that are outside the tracts of interest.
    filter(geometry %in% st_intersection(.$geometry, st_transform(clipping, crs = epsg_id)))
  return(frameGeom)
}

Create Visualizations

In this section I will create the following visualizations

  1. Boxplot of income, stratified by whehter or not there is a bikeshop in the tract.
  2. Boxplot of proportion of houses with no vehicle
  3. Boxplot of proportion of car commuters
  4. Stacked bar chart of the proportion of tracts that have a bike commuting population, and the tracts that don’t, grouped by tracts with a bikeshop and tracts without a bikeshop.
  5. Spatial representation of the bar chart mentioned above.
#Plot distribution of income by presence of bike shop
income_plot <- ggplot(data = atl_metro_data %>% select(ends_with('E'), 'bikeshop.true'), aes(y = `hhincomeE`, x =`bikeshop.true`)) +
  geom_violin(col = 'lightgreen') +
  geom_boxplot(width = 0.1,aes(col = bikeshop.true)) +
  labs(x = 'Bikeshop in Tract', y = 'Median Tract Income (USD)') +
  dark_mode() +
  theme(legend.position = 'none', axis.title.y = element_text(size = 6), axis.title.x = element_text(size = 6), axis.text = element_text(size = 6))
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
#Plot distribution of proportion of houses with no vehicle by presence of bike shop
novehic_plot <- ggplot(data = atl_metro_data %>% select(ends_with('E'), 'bikeshop.true'), aes(y = `housing.novehicE`, x =`bikeshop.true`)) +
  geom_violin(col = 'lightgreen') +
  geom_boxplot(width = 0.1,aes(col = bikeshop.true)) +
  labs(x = 'Bikeshop in Tract', y = '% Households with no Vehicle') +
  dark_mode() +
  theme(legend.position = 'none', axis.title.y = element_text(size = 6), axis.title.x = element_text(size = 6), axis.text = element_text(size = 6))

#Plot distribution of proportion of car commuters by presence of bikeshop
carcom_plot <- ggplot(data = atl_metro_data %>% select(ends_with('E'), 'bikeshop.true'), aes(y = `trans.carE`, x =`bikeshop.true`)) +
  geom_violin(col = 'lightgreen') +
  geom_boxplot(width = 0.1,aes(col = bikeshop.true)) +
  labs(x = 'Bikeshop in Tract', y = 'Prop of Car Commuters') +
  dark_mode() +
  theme(legend.position = 'none', axis.title.y = element_text(size = 6), axis.title.x = element_text(size = 6), axis.text = element_text(size = 6))

#Plot distribution of public transit commuters by presence of bikeshop
bikecom_plot <- ggplot(data = atl_metro_data %>% select(ends_with('E'), 'bikeshop.true') %>% drop_na(`trans.bicycleE`), aes(x = `bikeshop.true` %>% factor(labels = c('No Bikeshop in Tract', 'Bikeshop in Tract')), fill = factor(`trans.bicycleE`, labels = c(FALSE, TRUE)))) +
  geom_bar(position = 'fill') +
  labs(x = '', y = 'Prop of Bike Commuters', fill = 'Bike Commuters in Tract?') +
  dark_mode() +
  theme(legend.position = 'bottom', axis.title.y = element_text(size = 6), legend.title = element_text(size = 6), legend.text = element_text(size = 6), legend.key.size = unit(6, 'pt'), axis.text = element_text(size = 6))

#Arrange plots into 1 visualization
ggarrange(income_plot, novehic_plot, carcom_plot, bikecom_plot,ncol = 2, nrow = 2, font.label = c(size = 2))

#Map the location of bikeshops by whether or not the tract has a bike commuting population
tmap_mode('view')
## tmap mode set to interactive viewing
tm_shape(atl_metro_data %>% mutate(trans.bicycleE = factor(replace_na(trans.bicycleE, 0), labels = c('No Bike Commuting Population', 'Has Bike Commuting Population'), exclude = NA))) + tm_borders() + tm_fill(col = 'trans.bicycleE', alpha = 0.75, palette = c('lightgrey', 'blue'), title = '') +
  
  tm_shape(atl_metro_bikerentals) + tm_dots(col = 'red')