#Include yelp data prep function.
source('../Functions/yelp_and_radius.R')
In this mini-assignment, you get to build upon the code you implemented so far and include aspects of the Week 5 lectures on Exploratory Data Analysis.
For this assignment, imagine that you are a bicycle enthusiast and want to set up a bicycle rental store in a place within Fulton and Dekalb counties. The question in your mind is about where this store should be located. You realize that you can get the locations of all bike rental stores from Yelp API (categories: bikerentals). Another data that might help you could be the census data showing how many commuters commute on a bike (which could be a proxy for bike-friendliness of a community and environment). Here you will be looking for places where there is gap between bike stores (rentals in this case - but hopefully these also have other bike related services) and tracts with bike commuters.
To complete this assignment, follow the directions below:
First we need tracts in Metro Atlanta.
To define Metro Atlanta, I used the counties that were considered part of the Metro Atlanta MSA. Since the current MSA has 21 counties, I decided to instead use the 5 most populous counties.
After this, we will calculate radius buffers around each tract centroid to use in the yelp search.
#Set the epsg to the Georgia West State Plane CRS
epsg_id <- 4326
#Get tracts for Atlanta Metro
#Get list of Atlanta MSA Counties
atl_counties <- c(
'Fulton County',
'Gwinnett County',
'Cobb County',
'DeKalb County',
'Clayton County'
)
#Get commuter variables
acs_varnames <- c(hhincome = 'B19019_001',
total.housing = 'DP04_0057',
total.trans = "B08006_001",
trans.car = "B08006_002",
trans.drovealone = "B08006_003",
trans.carpooled = "B08006_004",
trans.pubtrans = "B08006_008",
trans.bicycle = "B08006_014",
trans.walk = "B08006_015",
trans.WfH = "B08006_017",
housing.novehic = 'DP04_0058'
)
#Get tracts and acs data
atl_metro_tracts <- tracts(state = 'GA', county = atl_counties, year = 2020) %>%
#Project tracts to the crs
st_transform(crs = epsg_id)
#Get census data
atl_acs <- get_acs(geography = 'tract', key = Sys.getenv('census_api'), state = 'GA', county = atl_counties, variables = acs_varnames, year = 2020, output = 'wide', geometry = TRUE) %>%
#Filter out margin of error
select(!ends_with('M')) %>%
#Get relative values for transportation
mutate(across(contains('trans.'), ~ round(.x/`total.transE`,2))) %>%
#Get relative values for transportation
mutate(across(contains('housing.'), ~ round(.x/`total.transE`,2))) %>%
#Some of these variables have very low variance, let's convert them to boolean
mutate(
trans.bicycleE = if_else(trans.bicycleE > 0, 1,0),
trans.walkE = if_else(trans.walkE > 0,1,0)
) %>%
#Change crs
st_transform(crs = epsg_id)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Fetching data by table type ("B/C", "S", "DP") and combining the result.
#Calculate radius buffers for yelp search
atl_metro_geom <- lapply(atl_metro_tracts$geometry, get_radius, epsg_id = epsg_id) %>% #Calculate radius for each tract centroid
bind_rows() %>% #Bind the outputted features together into a dataframe
mutate(x = st_coordinates(geometry)[,1], y = st_coordinates(geometry)[,2]) #Calculate the x and y coordinate to pass through yelp
#Plot the metro atlanta radii
atl_metro_geom %>%
st_buffer(.,dist = .$radius) %>%
tm_shape(.) + tm_polygons(alpha = 0.5, col = 'red') +
tm_shape(atl_metro_tracts) + tm_borders(col = 'blue')
atl_metro_bikerentals_raw <- get_yelp(atl_metro_geom, 'bikerentals', epsg_id)
## [1] "Processing Category: bikerentals"
## [1] "Processing Tract: 100 of 1006"
## [1] "Processing Tract: 200 of 1006"
## [1] "Processing Tract: 300 of 1006"
## [1] "Processing Tract: 400 of 1006"
## [1] "Processing Tract: 500 of 1006"
## [1] "Processing Tract: 600 of 1006"
## [1] "Processing Tract: 700 of 1006"
## [1] "Processing Tract: 800 of 1006"
## [1] "Processing Tract: 900 of 1006"
## [1] "Processing Tract: 1000 of 1006"
atl_metro_bikerentals <- atl_metro_bikerentals_raw %>%
#Clean data
clean_yelp(atl_metro_tracts$geometry, epsg_id)
#Create a variable to indicate if a bikeshop is in a tract
atl_metro_data <- st_join(atl_acs, select(atl_metro_bikerentals, c(alias.business, geometry)), join = st_intersects) %>%
#Detect if a bike rental shop is in the tract
mutate('bikeshop.true' = if_else(is.na(`alias.business`),FALSE,TRUE))
#Skim yelp data
skim(atl_metro_bikerentals)
## Warning: Couldn't find skimmers for class: sfc_POINT, sfc; No user-defined `sfl`
## provided. Falling back to `character`.
| Name | atl_metro_bikerentals |
| Number of rows | 19 |
| Number of columns | 24 |
| _______________________ | |
| Column type frequency: | |
| character | 18 |
| list | 2 |
| logical | 1 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 22 | 22 | 0 | 19 | 0 |
| alias.business | 0 | 1.00 | 14 | 46 | 0 | 19 | 0 |
| name | 0 | 1.00 | 4 | 35 | 0 | 19 | 0 |
| image_url | 0 | 1.00 | 68 | 68 | 0 | 19 | 0 |
| url | 0 | 1.00 | 171 | 203 | 0 | 19 | 0 |
| alias.category | 0 | 1.00 | 18 | 43 | 0 | 14 | 0 |
| title.category | 0 | 1.00 | 19 | 52 | 0 | 14 | 0 |
| price | 11 | 0.42 | 1 | 4 | 0 | 4 | 0 |
| phone | 0 | 1.00 | 12 | 12 | 0 | 19 | 0 |
| display_phone | 0 | 1.00 | 14 | 14 | 0 | 19 | 0 |
| location.address1 | 1 | 0.95 | 0 | 26 | 3 | 16 | 0 |
| location.address2 | 5 | 0.74 | 0 | 9 | 8 | 7 | 0 |
| location.address3 | 7 | 0.63 | 0 | 0 | 12 | 1 | 0 |
| location.city | 0 | 1.00 | 6 | 14 | 0 | 10 | 0 |
| location.zip_code | 0 | 1.00 | 5 | 5 | 0 | 16 | 0 |
| location.country | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| location.state | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| geometry | 0 | 1.00 | 21 | 38 | 0 | 19 | 0 |
Variable type: list
| skim_variable | n_missing | complete_rate | n_unique | min_length | max_length |
|---|---|---|---|---|---|
| transactions | 0 | 1 | 1 | 0 | 0 |
| location.display_address | 0 | 1 | 19 | 1 | 3 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| is_closed | 0 | 1 | 0 | FAL: 19 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| review_count | 0 | 1 | 24.84 | 32.51 | 1.00 | 3.50 | 9.00 | 32.00 | 124.00 | ▇▁▂▁▁ |
| rating | 0 | 1 | 3.84 | 1.44 | 1.00 | 3.75 | 4.50 | 4.75 | 5.00 | ▂▁▁▂▇ |
| distance | 0 | 1 | 3705.71 | 6897.39 | 221.63 | 886.00 | 1484.12 | 2719.89 | 30529.46 | ▇▁▁▁▁ |
#Plot the businesses
tmap_mode('view')
## tmap mode set to interactive viewing
tm_shape(atl_metro_tracts) +
tm_polygons() +
tm_shape(atl_metro_bikerentals) + tm_dots()
Here are the functions I used to pull in the Yelp data.
#Get radius function
get_radius <- function(geo, epsg_id){
#Get the bounding box of the polygon
bb <- st_bbox(geo)
#Get the coordinates of a corner on the box
bb_corner <- st_point(c(bb[1], bb[2])) %>%
#Adjust it to a point on the Earth's surface
st_sfc(crs = epsg_id)
#Get the centroid of the bounding box by finding the center of both lines.
bb_center_x <- (bb[3] + bb[1]) / 2
bb_center_y <- (bb[4] + bb[2]) / 2
bb_centroid <- st_point(c(bb_center_x, bb_center_y)) %>%
#Adjust the point to the earths surface
#And convert it to an SF object
st_sfc(crs = epsg_id) %>% st_sf()
#Lastly, calculate the radius
r <- st_distance(bb_corner, bb_centroid)
#Multiply by 1.1 to make it slightly bigger than the bounding box
bb_centroid$radius <- r
return(bb_centroid)
}
#Get yelp data function
get_yelp <- function(tracts, categories, epsg_id, key = Sys.getenv('yelp_key')){
#Establish output frame
frame = data.frame()
#For each category
for(cat in categories){
paste0('Processing Category: ', cat) %>% print()
#For every tract
for(i in 1:nrow(tracts)){
if(i %% 100 == 0){
paste0('Processing Tract: ', i, ' of ', nrow(tracts)) %>% print()
}
#Offset ticker
n <- 1
#Get the list of businesses
businessList <- business_search(key,
categories = cat,
latitude = tracts[i,]$y,
longitude = tracts[i,]$x,
radius = round(tracts[i,]$radius),
offset = (n - 1)*50,
limit = 50
) %>% suppressMessages()
#Append to final dataframe
frame <- bind_rows(frame, businessList$businesses)
#If the list is not empty
if(!is_empty(businessList$total)){
#If total is greater than 50
if(businessList$total > 50){
#Iterate through the remaining businesses
while(n < ceiling(businessList$total / 50)){
#Append the business list to a final output dataframe
#Adjust the offset ticker
n <- n + 1
#Get the list of businesses
businessList <- business_search(key,
categories = cat,
latitude = tracts[i,]$y,
longitude = tracts[i,]$x,
radius = round(tracts[i,]$radius),
offset = (n - 1)*50,
limit = 50
) %>% suppressMessages()
#Append the list to the dataframe
frame <- bind_rows(frame, businessList$businesses)
}
}
}
}
}
return(frame)
}
#Build a function to clean yelp data.
clean_yelp <- function(frame, clipping, epsg_id){
#Convert x, y coords to ST points, normalized with a crs
frameGeom <- frame %>%
#Rename the alias field to unnest the categories
rename('alias.business' = alias) %>%
#Unnest and pivot into one categories column
unnest_wider(categories, transform = ~ paste(.x, collapse = ', ')) %>%
#Rename the category variables
rename('title.category' = 'title',
'alias.category' = 'alias'
) %>%
#Flatten nested frame
jsonlite::flatten() %>%
st_as_sf(coords = c('coordinates.longitude', 'coordinates.latitude'), crs = epsg_id) %>%
#Delete duplicate rows on the basis of the yelp id
filter(!duplicated(`id`)) %>%
#Delete rows with missing coordinates
filter(!is.na(geometry)) %>%
#Delete observations that are outside the tracts of interest.
filter(geometry %in% st_intersection(.$geometry, st_transform(clipping, crs = epsg_id)))
return(frameGeom)
}
In this section I will create the following visualizations
#Plot distribution of income by presence of bike shop
income_plot <- ggplot(data = atl_metro_data %>% select(ends_with('E'), 'bikeshop.true'), aes(y = `hhincomeE`, x =`bikeshop.true`)) +
geom_violin(col = 'lightgreen') +
geom_boxplot(width = 0.1,aes(col = bikeshop.true)) +
labs(x = 'Bikeshop in Tract', y = 'Median Tract Income (USD)') +
dark_mode() +
theme(legend.position = 'none', axis.title.y = element_text(size = 6), axis.title.x = element_text(size = 6), axis.text = element_text(size = 6))
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
#Plot distribution of proportion of houses with no vehicle by presence of bike shop
novehic_plot <- ggplot(data = atl_metro_data %>% select(ends_with('E'), 'bikeshop.true'), aes(y = `housing.novehicE`, x =`bikeshop.true`)) +
geom_violin(col = 'lightgreen') +
geom_boxplot(width = 0.1,aes(col = bikeshop.true)) +
labs(x = 'Bikeshop in Tract', y = '% Households with no Vehicle') +
dark_mode() +
theme(legend.position = 'none', axis.title.y = element_text(size = 6), axis.title.x = element_text(size = 6), axis.text = element_text(size = 6))
#Plot distribution of proportion of car commuters by presence of bikeshop
carcom_plot <- ggplot(data = atl_metro_data %>% select(ends_with('E'), 'bikeshop.true'), aes(y = `trans.carE`, x =`bikeshop.true`)) +
geom_violin(col = 'lightgreen') +
geom_boxplot(width = 0.1,aes(col = bikeshop.true)) +
labs(x = 'Bikeshop in Tract', y = 'Prop of Car Commuters') +
dark_mode() +
theme(legend.position = 'none', axis.title.y = element_text(size = 6), axis.title.x = element_text(size = 6), axis.text = element_text(size = 6))
#Plot distribution of public transit commuters by presence of bikeshop
bikecom_plot <- ggplot(data = atl_metro_data %>% select(ends_with('E'), 'bikeshop.true') %>% drop_na(`trans.bicycleE`), aes(x = `bikeshop.true` %>% factor(labels = c('No Bikeshop in Tract', 'Bikeshop in Tract')), fill = factor(`trans.bicycleE`, labels = c(FALSE, TRUE)))) +
geom_bar(position = 'fill') +
labs(x = '', y = 'Prop of Bike Commuters', fill = 'Bike Commuters in Tract?') +
dark_mode() +
theme(legend.position = 'bottom', axis.title.y = element_text(size = 6), legend.title = element_text(size = 6), legend.text = element_text(size = 6), legend.key.size = unit(6, 'pt'), axis.text = element_text(size = 6))
#Arrange plots into 1 visualization
ggarrange(income_plot, novehic_plot, carcom_plot, bikecom_plot,ncol = 2, nrow = 2, font.label = c(size = 2))
#Map the location of bikeshops by whether or not the tract has a bike commuting population
tmap_mode('view')
## tmap mode set to interactive viewing
tm_shape(atl_metro_data %>% mutate(trans.bicycleE = factor(replace_na(trans.bicycleE, 0), labels = c('No Bike Commuting Population', 'Has Bike Commuting Population'), exclude = NA))) + tm_borders() + tm_fill(col = 'trans.bicycleE', alpha = 0.75, palette = c('lightgrey', 'blue'), title = '') +
tm_shape(atl_metro_bikerentals) + tm_dots(col = 'red')