tidycensus::census_api_key(Sys.getenv("census_api"))
## To install your API key for use in future sessions, run this function with `install = TRUE`.
# Download the Census ACS 5-year estimate data for Census Tracts
tract <- suppressMessages(
get_acs(geography = "tract", # or "block group", "county", "state" etc.
state = "GA",
county = c("Fulton", "Dekalb"),
variables = c(hhincome = 'B19019_001',
race.tot = "B02001_001",
race.white = "B02001_002",
race.black = "B02001_003",
trans.total = "B08006_001",
trans.car = "B08006_002",
trans.drovealone = "B08006_003",
trans.carpooled = "B08006_004",
trans.pubtrans = "B08006_008",
trans.bicycle = "B08006_014",
trans.walk = "B08006_015",
trans.WfH = "B08006_017",
med_housexp = "B25104_001",
med_realestate_taxes = "B25103_001"
),
year = 2019,
survey = "acs5", # American Community Survey 5-year estimate
geometry = TRUE, # returns sf objects
output = "wide") # wide/long
)
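If you need to look up ACS variable codes like B19019_001 yourself, tidycensus provides load_variables(); a quick sketch (v19 is an illustrative name):
# Browse the ACS 5-year variable catalog to find table/variable codes
v19 <- load_variables(2019, "acs5", cache = TRUE)
head(v19)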
# Download the City of Atlanta boundary from TIGER/Line via tigris
atlanta <- places('GA') %>%
  filter(NAME == 'Atlanta')
## Retrieving data for the year 2021
# Subset tracts to those intersecting the City of Atlanta
tract <- tract[atlanta,]
# View the data
message(sprintf("nrow: %s, ncol: %s", nrow(tract), ncol(tract)))
## nrow: 161, ncol: 31
# Retaining only the columns I want.
# Notice that select() can also rename columns as it selects them.
tract <- tract %>%
select(GEOID,
hhincome = hhincomeE, # New name = old name
race.tot = race.totE,
race.white = race.whiteE,
race.black = race.blackE,
trans.total = trans.totalE,
trans.car = trans.carE,
trans.drovealone = trans.drovealoneE,
trans.carpooled = trans.carpooledE,
trans.pubtrans = trans.pubtransE,
trans.bicycle = trans.bicycleE,
trans.walk = trans.walkE,
trans.WfH = trans.WfHE,
med_housexp = med_housexpE,
med_realestate_taxes = med_realestate_taxesE)
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(tract) + tm_borders() + tm_shape(atlanta) + tm_borders(col = 'red')
# Function: Get tract-wise radius
get_r <- function(poly, epsg_id){
#---------------------
  # Takes: a single POLYGON or LINESTRING and an EPSG code
  # Outputs: an sf point at the centroid of the bounding box,
  #          with a radius column (1.2 x the distance to a bounding-box corner)
#---------------------
# Get bounding box of a given polygon
bb <- st_bbox(poly)
# Get lat & long coordinates of any one corner of the bounding box.
bb_corner <- st_point(c(bb[1], bb[2])) %>% st_sfc(crs = epsg_id)
# Get centroid of the bb
bb_center_x <- (bb[3]+bb[1])/2
bb_center_y <- (bb[4]+bb[2])/2
bb_center <- st_point(c(bb_center_x, bb_center_y)) %>% st_sfc(crs = epsg_id) %>% st_sf()
# Get the distance between bb_p and c
r <- st_distance(bb_corner, bb_center)
  # Multiply by 1.2 to make the circle a bit larger than the Census Tract.
  # See the Yelp explanation of their radius parameter to see why we do this.
  bb_center$radius <- r*1.2
return(bb_center)
}
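As a quick spot check (a sketch; first_geom is just an illustrative name), get_r() can be applied to a single tract geometry:
# Spot check: bounding-box centroid and search radius for the first tract
first_geom <- tract %>% st_transform(crs = 4326) %>% st_geometry() %>% .[[1]]
get_r(first_geom, epsg_id = 4326)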
## Using a loop -----------------------------------------------------------------
# Creating an empty list; results will fill this list.
epsg_id <- 4326 # EPSG code for WGS84 longitude/latitude
r4all_loop <- vector("list", nrow(tract))
# Starting a for-loop
for (i in 1:nrow(tract)){
r4all_loop[[i]] <- tract %>%
st_transform(crs = epsg_id) %>%
st_geometry() %>%
.[[i]] %>%
get_r(epsg_id = epsg_id)
}
r4all_loop <- bind_rows(r4all_loop)
# Using a functional -----------------------------------------------------------
# We use a functional (lapply) to apply this custom function to each Census Tract.
r4all_apply <- tract %>%
st_geometry() %>%
st_transform(crs = epsg_id) %>%
lapply(., function(x) get_r(x, epsg_id = epsg_id))
r4all_apply <- bind_rows(r4all_apply)
# Are these two identical?
identical(r4all_apply, r4all_loop)
## [1] TRUE
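The two approaches return identical results. If you care about speed, note that the loop version re-projects the entire layer on every iteration, while the lapply version transforms once; a rough timing sketch with system.time() (results vary by machine):
# Rough timing sketch: the loop re-projects the whole layer each iteration
system.time(
  for (i in 1:nrow(tract)) {
    tract %>% st_transform(crs = epsg_id) %>% st_geometry() %>% .[[i]] %>%
      get_r(epsg_id = epsg_id)
  }
)
# The lapply version projects once, then iterates
system.time(
  tract %>% st_geometry() %>% st_transform(crs = epsg_id) %>%
    lapply(function(x) get_r(x, epsg_id = epsg_id))
)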
# Appending X Y coordinates as separate columns
ready_4_yelp <- r4all_apply %>%
mutate(x = st_coordinates(.)[,1],
y = st_coordinates(.)[,2])
tmap_mode('view')
## tmap mode set to interactive viewing
# Select the first 10 rows
ready_4_yelp[1:10,] %>%
# Draw a buffer centered at the centroid of Tract polygons.
  # The radius of the buffer is the radius we just calculated
  # (the loop and lapply results are identical).
st_buffer(., dist = .$radius) %>%
# Display this buffer in red
tm_shape(.) + tm_polygons(alpha = 0.5, col = 'red') +
# Display the original polygon in blue
tm_shape(tract[1:10,]) + tm_borders(col= 'blue')
# FUNCTION
get_yelp <- function(tract, category){
  # Takes: one row of tract information and a category name (string)
  # Outputs: a data frame of business data (or a list, if there are too many results)
Sys.sleep(1)
n <- 1
# First request --------------------------------------------------------------
resp <- business_search(api_key = Sys.getenv("yelp_api"),
categories = category,
latitude = tract$y,
longitude = tract$x,
offset = (n - 1) * 50, # = 0 when n = 1
radius = round(tract$radius),
limit = 50)
# Calculate how many requests are needed in total
required_n <- ceiling(resp$total/50)
  # out is the list where the results will be stored.
out <- vector("list", required_n)
# Store the business information to nth slot in out
out[[n]] <- resp$businesses
  # Name the nth element with required_n so that, even if there are too many
  # businesses to fetch, we know how many requests would have been needed.
names(out)[n] <- required_n
  # Return early if there are 500 or more businesses
  if (resp$total >= 500)
  {
    # glue() formats the string by inserting the current value of {n}.
    print(glue::glue("Request {n}: this search area has >= 500 businesses."))
    # Stop before entering the loop because we would need to
    # break the Census Tract down into smaller search areas.
return(out)
}
else
{
# add 1 to n
n <- n + 1
# Now we know required_n -----------------------------------------------------
# Starting a loop
while(n <= required_n){
resp <- business_search(api_key = Sys.getenv("yelp_api"),
categories = category,
latitude = tract$y,
longitude = tract$x,
offset = (n - 1) * 50,
radius = round(tract$radius),
limit = 50)
out[[n]] <- resp$businesses
n <- n + 1
} #<< end of while loop
# Merge all elements in the list into a single data frame
out <- out %>% bind_rows()
return(out)
}
}
# Apply the function for the first Census Tract
#yelp_first_tract <- get_yelp(ready_4_yelp[1,], "bikerentals") %>%
# as_tibble()
# Print
#yelp_first_tract %>% print
# Prepare a collector
yelp_all_list <- vector("list", nrow(ready_4_yelp))
# Looping through all Census Tracts
for (row in 1:nrow(ready_4_yelp)){
yelp_all_list[[row]] <- suppressMessages(get_yelp(ready_4_yelp[row,], "bikerentals"))
if (row %% 10 == 0){
print(paste0("Current row: ", row))
}
}
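Yelp requests can fail intermittently (rate limits, timeouts). A minimal retry sketch, not part of the original pipeline; get_yelp_safely() is a hypothetical helper:
# Hypothetical retry wrapper: retry get_yelp() a few times on transient errors
get_yelp_safely <- function(tract_row, category, max_tries = 3){
  for (i in seq_len(max_tries)){
    result <- tryCatch(get_yelp(tract_row, category), error = function(e) e)
    if (!inherits(result, "error")) return(result)
    Sys.sleep(5) # back off before the next attempt
  }
  stop("get_yelp() failed after ", max_tries, " tries.")
}
Swapping get_yelp_safely() into the loop above would make long runs more robust.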
# Collapsing the list into a data.frame
yelp_all <- yelp_all_list %>% bind_rows() %>% as_tibble()
# print
yelp_all %>% print(width=1000)
# Extract coordinates
yelp_sf <- yelp_all %>%
mutate(x = .$coordinates$longitude,
y = .$coordinates$latitude) %>%
filter(!is.na(x) & !is.na(y)) %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
# Map
tm_shape(yelp_sf) +
tm_dots(col = "review_count", style="quantile")
write_rds(yelp_all, file = 'C:/Users/49765/Desktop/Urban Analytics/mini3/yelp_all.rds')
st_write(atlanta,"atlanta.geojson",
delete_layer=TRUE)
# Read the full data
my_yelp <- read_rds('C:/Users/49765/Desktop/Urban Analytics/mini3/yelp_all.rds')
yelp_unique <- my_yelp %>%
distinct(id, .keep_all=T)
glue::glue("Before dropping duplicated rows, there were {nrow(my_yelp)} rows. After dropping them, there are {nrow(yelp_unique)} rows") %>%
print()
## Before dropping duplicated rows, there were 91 rows. After dropping them, there are 10 rows
concate_list <- function(x){
# x is a data frame with columns "alias" and "title" from Yelp$categories
# returns a character vector containing category concatenated titles
titles <- x[["title"]] %>% str_c(collapse = ", ")
return(titles)
}
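To see what concate_list() returns, here is a toy input mimicking one element of Yelp's categories list-column (hypothetical data):
# Hypothetical example of one element of the categories list-column
toy <- data.frame(alias = c("bikerentals", "bikes"),
                  title = c("Bike Rentals", "Bikes"))
concate_list(toy) # "Bike Rentals, Bikes"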
yelp_flat <- yelp_unique %>%
# 1. Flattening columns with data frame
jsonlite::flatten() %>%
# 2. Handling list-columns
mutate(transactions = transactions %>%
map_chr(., function(x) str_c(x, collapse=", ")),
location.display_address = location.display_address %>%
map_chr(., function(x) str_c(x, collapse=", ")),
categories = categories %>% map_chr(concate_list)) # concate_list is the custom function
# Identify NA values
yelp_flat %>%
map_dbl(., function(x) sum(is.na(x)))
## id alias name
## 0 0 0
## image_url is_closed url
## 0 0 0
## review_count categories rating
## 0 0 0
## transactions phone display_phone
## 0 0 0
## distance price coordinates.latitude
## 0 6 0
## coordinates.longitude location.address1 location.address2
## 0 1 3
## location.address3 location.city location.zip_code
## 3 0 0
## location.country location.state location.display_address
## 0 0 0
# map_dbl is a variant of map() which outputs
# numeric vector rather than a list.
# First, let's verify that the missing values in the lat/long columns are in the same rows.
identical(is.na(yelp_flat$coordinates.latitude),
          is.na(yelp_flat$coordinates.longitude)) # Yes, they are in the same rows.
## [1] TRUE
# Drop them.
yelp_dropna1 <- yelp_flat %>%
drop_na(coordinates.longitude)
Only four shops remain after dropping the NAs in the price column, so we are not going to use yelp_dropna2.
# Dropping NAs in price
yelp_dropna2 <- yelp_dropna1 %>%
drop_na(price)
# census boundary
census <- st_read('C:/Users/49765/Desktop/Urban Analytics/mini3/atlanta.geojson')
## Reading layer `atlanta' from data source
## `C:\Users\49765\Desktop\Urban Analytics\mini3\atlanta.geojson'
## using driver `GeoJSON'
## Simple feature collection with 1 feature and 16 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -84.55085 ymin: 33.64781 xmax: -84.28956 ymax: 33.88682
## Geodetic CRS: NAD83
# Converting yelp_dropna1 into a sf object
yelp_sf <- yelp_dropna1 %>%
st_as_sf(coords=c("coordinates.longitude", "coordinates.latitude"), crs = 4326)
atlanta <- st_transform(atlanta, crs = st_crs(yelp_sf))
# sf subsets
yelp_in <- yelp_sf[atlanta, , op = st_intersects]
tract <- st_transform(tract, crs = st_crs(yelp_sf))
# Rebuild tract as a clean sf object (filter() with no arguments keeps every row).
tract_sf <- tract %>% filter() %>% st_sf()
# Spatial join
yelp_census <- st_join(yelp_in, tract_sf, join = st_intersects)
census_yelp <- st_join(tract_sf, yelp_in, join = st_intersects)
# Join tract geometry with the number of bikerentals in tract
test <- st_join(tract_sf, yelp_in %>% mutate(count = 1))
out <- test %>%
group_by(GEOID) %>%
summarise(count = sum(count, na.rm = T))
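As a cross-check (a sketch, not part of the original pipeline), the same per-tract count can be computed without a spatial join via st_intersects():
# Cross-check: count bike rental points per tract directly
tract_check <- tract_sf
tract_check$count2 <- lengths(st_intersects(tract_sf, yelp_in))
# count2 should equal the join-based count for each GEOID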
# Let's check whether the polygons and the point data on bike rentals match
tm_shape(out) + tm_polygons(col = "count") + tm_shape(yelp_in) + tm_dots()
Overall, the locations of the current bike rentals in Atlanta look appropriate. However, two of the stores sit in tracts with many walking commuters but few cyclists, and they do not receive high ratings; location alone does not explain this. To dig deeper, we can compare cycling rates between tracts with and without bike rentals.
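The boxplots below use tract_sf_bike, which is not constructed earlier in this section; a minimal sketch, assuming it simply flags whether each tract contains at least one bike rental:
# Hypothetical construction of tract_sf_bike: flag tracts with >= 1 bike rental
tract_sf_bike <- tract_sf %>%
  left_join(out %>% st_drop_geometry(), by = "GEOID") %>%
  mutate(bikerentals = count > 0)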
boxplot(trans.bicycle/trans.total ~ bikerentals, data = tract_sf_bike,
        main = "Share of bicycle commuters by presence of bike rentals",
        xlab = "Whether bike rentals are present", ylab = "trans.bicycle/trans.total")
boxplot(trans.car/trans.total ~ bikerentals, data = tract_sf_bike,
        main = "Share of car commuters by presence of bike rentals",
        xlab = "Whether bike rentals are present", ylab = "trans.car/trans.total")