Download Yelp data on categories = bikerentals for Atlanta metro. If you encounter issues in API requests, try Sys.sleep() as you see appropriate. Remember to save the downloaded Yelp data in your hard drive to avoid making excessive API requests. Once you’ve downloaded the data, you can simply read the Yelp data from your hard drive and proceed from there. However, even when you’ve stopped using the code for downloading Yelp data, do not delete them from your RMD; instructors need to see the entire code. You can use code chunk option {r, eval=FALSE} to keep the code for downloading Yelp in your RMD but not actually run it. Read this to learn more.
Download census data for the commuting (and other variables that you may choose to use).
Tidy your data by: Deleting duplicated rows. Flattening nested columns that have multiple variables in one column. Pay particular attention to the “category” column. Delete rows that have missing data in coordinates variable and other variables of interest. It’s okay to have NAs in variables that are not of interest. Delete rows that fall outside of the boundary of your choice.
Examine the associations among the variables bike rentals and bike commuting. Find the right metrics (you may need to create new variables e.g., proportion, density, etc.). Show at least one graph and at least one map with two variables that you are interested in examining (you can do more!).
library(tidycensus) # to access census api
library(sf) # to read/write shp files
## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(tmap)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(httr) # to use api requests
library(jsonlite) # to read/write json files
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(yelpr)
library(knitr)
library(here) # to use relative paths
## here() starts at C:/Users/chan303/OneDrive - Georgia Institute of Technology/CP8883BK/UA_module1
library(skimr)
library(readr)
tract <- suppressMessages(
get_acs(geography = "tract", # or "block group", "county", "state" etc.
state = "GA",
county = c("Fulton", "Dekalb"),
variables = c(hhincome = 'B19019_001',
race.tot = "B02001_001",
race.white = "B02001_002",
race.black = "B02001_003",
trans.total = "B08006_001",
trans.car = "B08006_002",
trans.drovealone = "B08006_003",
trans.carpooled = "B08006_004",
trans.pubtrans = "B08006_008",
trans.bicycle = "B08006_014",
trans.walk = "B08006_015",
trans.WfH = "B08006_017"
),
year = 2020,
survey = "acs5", # American Community Survey 5-year estimate
geometry = TRUE, # returns sf objects
output = "wide") # wide vs. long
)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|===================== | 30%
|
|====================== | 31%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================== | 42%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================= | 47%
|
|================================== | 48%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|============================================ | 62%
|
|============================================= | 64%
|
|============================================== | 65%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================= | 70%
|
|================================================== | 71%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|===================================================== | 75%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================== | 79%
|
|========================================================= | 81%
|
|========================================================== | 82%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================= | 92%
|
|================================================================= | 94%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
# visualize tract
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(tract) + tm_borders()
I need to define functions to call yelp API for the tracts that I am analyzing.
# FUNCTION
get_yelp <- function(tract, category){
# ----------------------------------
# Gets one row of tract information (1,) and category name (str),
# Outputs a list of business data.frame
n <- 1
# First request --------------------------------------------------------------
resp <- business_search(api_key = Sys.getenv("yelp_api"),
categories = category,
latitude = tract$y,
longitude = tract$x,
offset = (n - 1) * 50, # = 0 when n = 1
radius = round(tract$radius),
limit = 50)
# Calculate how many requests are needed in total
required_n <- ceiling(resp$total/50)
# out is where the results will be appended to.
out <- vector("list", required_n)
# Store the business information to nth slot in out
out[[n]] <- resp$businesses
# Change the name of the elements to the total required_n
# This is to know if there are more than 1000 businesses,
# we know how many.
names(out)[n] <- required_n
# Throw error if more than 1000
if (resp$total >= 1000)
{
# glue formats string by inserting {n} with what's currently stored in object n.
print(glue::glue("{n}th row has >= 1000 businesses."))
# Stop before going into the loop because we need to
# break down Census Tract to something smaller.
return(out)
}
else
{
# add 1 to n
n <- n + 1
# Now we know required_n -----------------------------------------------------
# Starting a loop
while(n <= required_n){
resp <- business_search(api_key = Sys.getenv("yelp_api"),
categories = category,
latitude = tract$y,
longitude = tract$x,
offset = (n - 1) * 50,
radius = round(tract$radius),
limit = 50)
out[[n]] <- resp$businesses
n <- n + 1
} #<< end of while loop
# Merge all elements in the list into a single data frame
out <- out %>% bind_rows()
return(out)
}
}
# Function: Get tract-wise radius
get_r <- function(poly, epsg_id){
#---------------------
# Takes: a single POLYGON or LINESTRTING
# Outputs: distance between the centroid of the bounding box and a corner of the bounding box
#---------------------
# Get bounding box of a given polygon
bb <- st_bbox(poly)
# Get lat & long coordinates of any one corner of the bounding box.
bb_corner <- st_point(c(bb[1], bb[2])) %>% st_sfc(crs = epsg_id)
# Get centroid of the bb
bb_center_x <- (bb[3]+bb[1])/2
bb_center_y <- (bb[4]+bb[2])/2
bb_center <- st_point(c(bb_center_x, bb_center_y)) %>% st_sfc(crs = epsg_id) %>% st_sf()
# Get the distance between bb_p and c
r <- st_distance(bb_corner, bb_center)
# Multiply 1.1 to make the circle a bit larger than the Census Tract.
# See the Yelp explanation of their radius parameter to see why we do this.
bb_center$radius <- r*1.2
return(bb_center)
}
epsg_id <- 4326
r4all <- tract %>%
st_geometry() %>%
st_transform(crs = epsg_id) %>%
lapply(., function(x) get_r(x, epsg_id = epsg_id))
r4all <- bind_rows(r4all)
# Appending X Y coordinates as separate columns
ready_4_yelp <- r4all %>%
mutate(x = st_coordinates(.)[,1],
y = st_coordinates(.)[,2])
checking if ready_4_yelp is successfully created…
tmap_mode('view')
## tmap mode set to interactive viewing
# Select the first 10 rows
ready_4_yelp[1:10,] %>%
# Draw a buffer centered at the centroid of Tract polygons.
# Radius of the buffer is the radius we just calculated using loop
st_buffer(., dist = .$radius) %>%
# Display this buffer in red
tm_shape(.) + tm_polygons(alpha = 0.5, col = 'red') +
# Display the original polygon in blue
tm_shape(tract[1:10,]) + tm_borders(col= 'blue')
Below is the code that I used, but I would read from the rds file to save api calls when I knit as an HTML.
# Prepare a collector
yelp_bikerentals_list <- vector("list", nrow(ready_4_yelp))
# Looping through all Census Tracts
for (row in 1:nrow(ready_4_yelp)){
yelp_bikerentals_list[[row]] <- suppressMessages(get_yelp(ready_4_yelp[row,], "bikerentals"))
if (row %% 5 == 0){
Sys.sleep(10)
if (row %% 50 == 0) {
print(paste0("Current row: ", row))
}
}
}
# Collapsing the list into a data.frame
yelp_bikerentals_df <- yelp_bikerentals_list %>% bind_rows() %>% as_tibble()
# print
yelp_bikerentals_df %>% print(width=1000)
To save API calls, I saved this to my hard drive.
write_rds(yelp_bikerentals_df, "yelp_bikerentals.rds")
Reading the RDS file that I wrote.
# Read my data
yelp_bikerentals_df <- read_rds(here("..", "UA_module1", "rmd", "yelp_bikerentals.rds"))
I already downloaded related variables in the beginning to get tract boundaries. Here, I would delete redundant variables.
FD_tract <- tract %>%
select(GEOID,
hhincome = hhincomeE, # New name = old name
race.tot = race.totE,
race.white = race.whiteE,
race.black = race.blackE,
trans.total = trans.totalE,
trans.car = trans.carE,
trans.drovealone = trans.drovealoneE,
trans.carpooled = trans.carpooledE,
trans.pubtrans = trans.pubtransE,
trans.bicycle = trans.bicycleE,
trans.walk = trans.walkE,
trans.WfH = trans.WfHE)
Before I tidy my data, I would like to take a look at how the data looks like.
tmap_mode("view")
## tmap mode set to interactive viewing
## tmap mode set to interactive viewing
comm_bicycle <- tm_shape(FD_tract) + tm_polygons("trans.bicycle")
comm_car <- tm_shape(FD_tract) + tm_polygons("trans.car")
tmap_arrange(comm_bicycle, comm_car)
head(yelp_bikerentals_df)
## # A tibble: 6 × 16
## id alias name image…¹ is_cl…² url revie…³ categ…⁴ rating coord…⁵ trans…⁶
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <list> <dbl> <dbl> <list>
## 1 FK7-… azte… Azte… https:… FALSE http… 54 <df> 5 33.8 <list>
## 2 FK7-… azte… Azte… https:… FALSE http… 54 <df> 5 33.8 <list>
## 3 FK7-… azte… Azte… https:… FALSE http… 54 <df> 5 33.8 <list>
## 4 FK7-… azte… Azte… https:… FALSE http… 54 <df> 5 33.8 <list>
## 5 FK7-… azte… Azte… https:… FALSE http… 54 <df> 5 33.8 <list>
## 6 FK7-… azte… Azte… https:… FALSE http… 54 <df> 5 33.8 <list>
## # … with 6 more variables: coordinates$longitude <dbl>, price <chr>,
## # location <df[,8]>, phone <chr>, display_phone <chr>, distance <dbl>, and
## # abbreviated variable names ¹image_url, ²is_closed, ³review_count,
## # ⁴categories, ⁵coordinates$latitude, ⁶transactions
## # ℹ Use `colnames()` to see all variable names
Issue 1: Duplicate Data
# Deleting duplicate businesses
yelp_br_unique <- yelp_bikerentals_df %>%
distinct(id, .keep_all=T)
glue::glue("Before dropping NA, there were {nrow(yelp_bikerentals_df)} rows. After dropping them, there are {nrow(yelp_br_unique)} rows") %>%
print()
## Before dropping NA, there were 231 rows. After dropping them, there are 15 rows
Issue 2: Flattening Columns
# Custom function that takes the data frame in "categories" column in Yelp data
# and returns a character vector
concate_list <- function(x){
# x is a data frame with columns "alias" and "title" from Yelp$categories
# returns a character vector containing category concatenated titles
titles <- x[["title"]] %>% str_c(collapse = ", ")
return(titles)
}
yelp_br_flat <- yelp_br_unique %>%
# 1. Flattening columns with data frame
jsonlite::flatten() %>%
# 2. Handling list-columns
mutate(transactions = transactions %>%
map_chr(., function(x) str_c(x, collapse=", ")),
location.display_address = location.display_address %>%
map_chr(., function(x) str_c(x, collapse=", ")),
categories = categories %>% map_chr(concate_list))
Issue 3: Missing Values
#find missing values
yelp_br_flat %>%
map_dbl(., function(x) sum(is.na(x)))
## id alias name
## 0 0 0
## image_url is_closed url
## 0 0 0
## review_count categories rating
## 0 0 0
## transactions price phone
## 0 9 0
## display_phone distance coordinates.latitude
## 0 0 0
## coordinates.longitude location.address1 location.address2
## 0 1 4
## location.address3 location.city location.zip_code
## 6 0 0
## location.country location.state location.display_address
## 0 0 0
I am not interested in address, so I will not drop anything.
yelp_br_sf <- yelp_br_flat %>%
st_as_sf(coords=c("coordinates.longitude", "coordinates.latitude"), crs = 4326)
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(yelp_br_sf) + tm_dots(col = "price")
## Check to see that the CRS for both the sf files are the same
head(FD_tract)
## Simple feature collection with 6 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -84.47101 ymin: 33.65339 xmax: -84.18201 ymax: 33.81229
## Geodetic CRS: NAD83
## GEOID hhincome race.tot race.white race.black trans.total trans.car
## 1 13089021908 43191 4829 323 3368 2421 2137
## 2 13089020400 123711 2743 2435 48 1691 1202
## 3 13089023410 41065 3903 131 3562 2030 1562
## 4 13121010601 55479 3673 1057 2514 1699 1290
## 5 13121008302 28214 1891 33 1836 481 401
## 6 13121004200 23906 2675 252 2423 1073 487
## trans.drovealone trans.carpooled trans.pubtrans trans.bicycle trans.walk
## 1 1759 378 86 0 101
## 2 1069 133 63 62 13
## 3 1339 223 307 0 87
## 4 1131 159 154 0 113
## 5 347 54 57 0 8
## 6 437 50 460 19 0
## trans.WfH geometry
## 1 97 MULTIPOLYGON (((-84.20619 3...
## 2 300 MULTIPOLYGON (((-84.34921 3...
## 3 55 MULTIPOLYGON (((-84.29784 3...
## 4 116 MULTIPOLYGON (((-84.46957 3...
## 5 0 MULTIPOLYGON (((-84.46315 3...
## 6 106 MULTIPOLYGON (((-84.42334 3...
## Check to see that the CRS for both the sf files are the same
head(yelp_br_sf)
## Simple feature collection with 6 features and 22 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -84.35797 ymin: 33.74172 xmax: -84.16982 ymax: 34.04687
## Geodetic CRS: WGS 84
## id alias
## 1 FK7-M9BGyCgpEmVifcPfoA aztec-cycles-stone-mountain
## 2 OJVvH1CUZacuHjSLkhSKOg roswell-bicycles-roswell
## 3 kJJiJqGbiO_QXmdhol3hIQ british-and-american-bikes-atlanta
## 4 ot4UyUsRAlTFudTlCVRMrQ pedego-electric-bikes-alpharetta-alpharetta
## 5 rbf8bVY0cuqyGZtbn691lg pedego-electric-bikes-atlanta-atlanta-2
## 6 8PfRbXo6qhKliGDCp5l79g atlanta-pro-bikes-atlanta
## name
## 1 Aztec Cycles
## 2 Roswell Bicycles
## 3 British and American Bikes
## 4 Pedego Electric Bikes Alpharetta
## 5 Pedego Electric Bikes Atlanta
## 6 Atlanta Pro Bikes
## image_url
## 1 https://s3-media3.fl.yelpcdn.com/bphoto/re-aoEuun-QS1SE7dQCySQ/o.jpg
## 2 https://s3-media2.fl.yelpcdn.com/bphoto/1YUjKn4QEkVQG_eIH7wiXQ/o.jpg
## 3 https://s3-media2.fl.yelpcdn.com/bphoto/9I0G3Ge2yKFg6184t1XYEQ/o.jpg
## 4 https://s3-media2.fl.yelpcdn.com/bphoto/gPNidtknlSWKu3Bh4iOJ3Q/o.jpg
## 5 https://s3-media2.fl.yelpcdn.com/bphoto/Z7KlxC0vcyoxOCSxbH3Rrg/o.jpg
## 6 https://s3-media1.fl.yelpcdn.com/bphoto/1s0fhhJvN3z-SUPizRy5sw/o.jpg
## is_closed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## url
## 1 https://www.yelp.com/biz/aztec-cycles-stone-mountain?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 2 https://www.yelp.com/biz/roswell-bicycles-roswell?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 3 https://www.yelp.com/biz/british-and-american-bikes-atlanta?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 4 https://www.yelp.com/biz/pedego-electric-bikes-alpharetta-alpharetta?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 5 https://www.yelp.com/biz/pedego-electric-bikes-atlanta-atlanta-2?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 6 https://www.yelp.com/biz/atlanta-pro-bikes-atlanta?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## review_count categories rating transactions
## 1 54 Bikes, Bike Repair/Maintenance, Bike Rentals 5.0
## 2 70 Bikes, Bike Rentals 4.0
## 3 2 Motorcycle Repair, Bike Rentals 5.0
## 4 7 Bike Rentals, Bikes, Bike Repair/Maintenance 5.0
## 5 9 Bikes, Bike Rentals, Bike Repair/Maintenance 4.5
## 6 40 Bike Repair/Maintenance, Bikes, Bike Rentals 4.5
## price phone display_phone distance location.address1
## 1 $$ +16786369043 (678) 636-9043 2322.2660 901 Main St
## 2 $$$ +17706424057 (770) 642-4057 2841.9264 670 Houze Way
## 3 <NA> +17704518868 (770) 451-8868 597.5853 4264 F Winters Chapel Rd
## 4 <NA> +14042810264 (404) 281-0264 1931.4652 6480 N Point Pkwy
## 5 <NA> +14049753915 (404) 975-3915 8603.6159 414 Bill Kennedy Way
## 6 <NA> +14042541230 (404) 254-1230 1096.5805 1039 N Highland Ave NE
## location.address2 location.address3 location.city location.zip_code
## 1 Stone Mountain 30083
## 2 Roswell 30076
## 3 Atlanta 30360
## 4 Ste 1100b <NA> Alpharetta 30022
## 5 Ste 101 <NA> Atlanta 30316
## 6 <NA> Atlanta 30306
## location.country location.state
## 1 US GA
## 2 US GA
## 3 US GA
## 4 US GA
## 5 US GA
## 6 US GA
## location.display_address geometry
## 1 901 Main St, Stone Mountain, GA 30083 POINT (-84.16982 33.80586)
## 2 670 Houze Way, Roswell, GA 30076 POINT (-84.34202 34.04687)
## 3 4264 F Winters Chapel Rd, Atlanta, GA 30360 POINT (-84.27059 33.91903)
## 4 6480 N Point Pkwy, Ste 1100b, Alpharetta, GA 30022 POINT (-84.29336 34.0444)
## 5 414 Bill Kennedy Way, Ste 101, Atlanta, GA 30316 POINT (-84.35797 33.74172)
## 6 1039 N Highland Ave NE, Atlanta, GA 30306 POINT (-84.35415 33.78293)
## It turns out they are NOT! So we need to transform their CRS into the same
FD_tract_Geom <- st_transform(FD_tract, crs=4326)
Yelp_bikerentals_Geom <- st_transform(yelp_br_sf, crs=4326)
# sf subsets
yelp_br_Geom2 <- Yelp_bikerentals_Geom[FD_tract_Geom %>% st_union(), ,op = st_intersects]
# join
bikerentals_in_tract <- st_join(FD_tract_Geom, yelp_br_Geom2, join = st_intersects)
skim(bikerentals_in_tract)
## Warning: Couldn't find skimmers for class: sfc_MULTIPOLYGON, sfc; No user-
## defined `sfl` provided. Falling back to `character`.
| Name | bikerentals_in_tract |
| Number of rows | 530 |
| Number of columns | 36 |
| _______________________ | |
| Column type frequency: | |
| character | 20 |
| logical | 1 |
| numeric | 15 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| GEOID | 0 | 1.00 | 11 | 11 | 0 | 530 | 0 |
| id | 516 | 0.03 | 22 | 22 | 0 | 14 | 0 |
| alias | 516 | 0.03 | 14 | 43 | 0 | 14 | 0 |
| name | 516 | 0.03 | 4 | 32 | 0 | 14 | 0 |
| image_url | 516 | 0.03 | 68 | 68 | 0 | 14 | 0 |
| url | 516 | 0.03 | 171 | 200 | 0 | 14 | 0 |
| categories | 516 | 0.03 | 19 | 52 | 0 | 12 | 0 |
| transactions | 516 | 0.03 | 0 | 0 | 14 | 1 | 0 |
| price | 524 | 0.01 | 1 | 4 | 0 | 4 | 0 |
| phone | 516 | 0.03 | 12 | 12 | 0 | 14 | 0 |
| display_phone | 516 | 0.03 | 14 | 14 | 0 | 14 | 0 |
| location.address1 | 517 | 0.02 | 0 | 24 | 3 | 11 | 0 |
| location.address2 | 520 | 0.02 | 0 | 9 | 7 | 4 | 0 |
| location.address3 | 522 | 0.02 | 0 | 0 | 8 | 1 | 0 |
| location.city | 516 | 0.03 | 7 | 14 | 0 | 5 | 0 |
| location.zip_code | 516 | 0.03 | 5 | 5 | 0 | 11 | 0 |
| location.country | 516 | 0.03 | 2 | 2 | 0 | 1 | 0 |
| location.state | 516 | 0.03 | 2 | 2 | 0 | 1 | 0 |
| location.display_address | 516 | 0.03 | 17 | 50 | 0 | 14 | 0 |
| geometry | 0 | 1.00 | 193 | 3750 | 0 | 530 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| is_closed | 516 | 0.03 | 0 | FAL: 14 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| hhincome | 8 | 0.98 | 81559.87 | 49076.11 | 13577.00 | 46780.50 | 69623.0 | 100893.50 | 250001.00 | ▇▇▃▁▁ |
| race.tot | 0 | 1.00 | 3409.13 | 1276.00 | 0.00 | 2498.25 | 3267.0 | 4243.00 | 7401.00 | ▁▇▇▃▁ |
| race.white | 0 | 1.00 | 1337.87 | 1198.31 | 0.00 | 237.75 | 1062.5 | 2146.00 | 5039.00 | ▇▅▃▁▁ |
| race.black | 0 | 1.00 | 1624.68 | 1545.94 | 0.00 | 367.00 | 996.5 | 2739.25 | 7021.00 | ▇▂▂▁▁ |
| trans.total | 0 | 1.00 | 1718.82 | 683.32 | 0.00 | 1268.00 | 1653.5 | 2140.25 | 4114.00 | ▂▇▇▂▁ |
| trans.car | 0 | 1.00 | 1315.87 | 585.48 | 0.00 | 926.75 | 1250.0 | 1690.50 | 3342.00 | ▂▇▆▂▁ |
| trans.drovealone | 0 | 1.00 | 1179.78 | 536.94 | 0.00 | 831.25 | 1121.0 | 1525.75 | 3164.00 | ▃▇▆▂▁ |
| trans.carpooled | 0 | 1.00 | 136.09 | 134.39 | 0.00 | 46.25 | 99.0 | 191.25 | 1109.00 | ▇▂▁▁▁ |
| trans.pubtrans | 0 | 1.00 | 117.38 | 128.89 | 0.00 | 26.00 | 87.5 | 166.00 | 1194.00 | ▇▁▁▁▁ |
| trans.bicycle | 0 | 1.00 | 7.84 | 21.88 | 0.00 | 0.00 | 0.0 | 0.00 | 220.00 | ▇▁▁▁▁ |
| trans.walk | 0 | 1.00 | 38.72 | 84.26 | 0.00 | 0.00 | 8.5 | 38.00 | 788.00 | ▇▁▁▁▁ |
| trans.WfH | 0 | 1.00 | 203.40 | 161.01 | 0.00 | 83.00 | 165.0 | 292.75 | 856.00 | ▇▅▂▁▁ |
| review_count | 516 | 0.03 | 29.50 | 36.61 | 1.00 | 3.25 | 10.5 | 50.50 | 124.00 | ▇▁▂▁▁ |
| rating | 516 | 0.03 | 3.64 | 1.61 | 1.00 | 2.50 | 4.5 | 4.50 | 5.00 | ▂▁▁▁▇ |
| distance | 516 | 0.03 | 2199.77 | 2144.27 | 230.64 | 758.52 | 1820.0 | 2304.95 | 8603.62 | ▇▆▁▁▁ |
# Now count the Yoga Studios by tract
br_count_tract <- count(as_tibble(bikerentals_in_tract), GEOID) %>%
print()
## # A tibble: 530 × 2
## GEOID n
## <chr> <int>
## 1 13089020100 1
## 2 13089020200 1
## 3 13089020300 1
## 4 13089020400 1
## 5 13089020500 1
## 6 13089020600 1
## 7 13089020700 1
## 8 13089020801 1
## 9 13089020802 1
## 10 13089020901 1
## # … with 520 more rows
## # ℹ Use `print(n = ...)` to see more rows
# Join tract geometry with the number of Yoga studios in tract
test <- st_join(FD_tract_Geom, yelp_br_Geom2 %>% mutate(count = 1))
out <- test %>%
group_by(GEOID) %>%
summarise(count = sum(count, na.rm = T))
# Lets' check to see if the polygons and the poin data on Yoga Studios match
tm_shape(out) + tm_polygons(col = "count") + tm_shape(yelp_br_Geom2) + tm_dots()
FD_tract_Geom_bikerentals <- FD_tract_Geom %>%
left_join(out %>% st_set_geometry(NULL), by = "GEOID")
print(skim(FD_tract_Geom_bikerentals))
## Warning: Couldn't find skimmers for class: sfc_MULTIPOLYGON, sfc; No user-
## defined `sfl` provided. Falling back to `character`.
## ── Data Summary ────────────────────────
## Values
## Name FD_tract_Geom_bikerentals
## Number of rows 530
## Number of columns 15
## _______________________
## Column type frequency:
## character 2
## numeric 13
## ________________________
## Group variables None
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 GEOID 0 1 11 11 0 530 0
## 2 geometry 0 1 193 3750 0 530 0
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25
## 1 hhincome 8 0.985 81560. 49076. 13577 46780.
## 2 race.tot 0 1 3409. 1276. 0 2498.
## 3 race.white 0 1 1338. 1198. 0 238.
## 4 race.black 0 1 1625. 1546. 0 367
## 5 trans.total 0 1 1719. 683. 0 1268
## 6 trans.car 0 1 1316. 585. 0 927.
## 7 trans.drovealone 0 1 1180. 537. 0 831.
## 8 trans.carpooled 0 1 136. 134. 0 46.2
## 9 trans.pubtrans 0 1 117. 129. 0 26
## 10 trans.bicycle 0 1 7.84 21.9 0 0
## 11 trans.walk 0 1 38.7 84.3 0 0
## 12 trans.WfH 0 1 203. 161. 0 83
## 13 count 0 1 0.0264 0.161 0 0
## p50 p75 p100 hist
## 1 69623 100894. 250001 ▇▇▃▁▁
## 2 3267 4243 7401 ▁▇▇▃▁
## 3 1062. 2146 5039 ▇▅▃▁▁
## 4 996. 2739. 7021 ▇▂▂▁▁
## 5 1654. 2140. 4114 ▂▇▇▂▁
## 6 1250 1690. 3342 ▂▇▆▂▁
## 7 1121 1526. 3164 ▃▇▆▂▁
## 8 99 191. 1109 ▇▂▁▁▁
## 9 87.5 166 1194 ▇▁▁▁▁
## 10 0 0 220 ▇▁▁▁▁
## 11 8.5 38 788 ▇▁▁▁▁
## 12 165 293. 856 ▇▅▂▁▁
## 13 0 0 1 ▇▁▁▁▁
## $character
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 GEOID 0 1 11 11 0 530 0
## 2 geometry 0 1 193 3750 0 530 0
##
## $numeric
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_vari…¹ n_mis…² compl…³ mean sd p0 p25 p50 p75 p100
## 1 hhincome 8 0.985 8.16e+4 4.91e+4 13577 4.68e4 6.96e4 1.01e5 250001
## 2 race.tot 0 1 3.41e+3 1.28e+3 0 2.50e3 3.27e3 4.24e3 7401
## 3 race.white 0 1 1.34e+3 1.20e+3 0 2.38e2 1.06e3 2.15e3 5039
## 4 race.black 0 1 1.62e+3 1.55e+3 0 3.67e2 9.96e2 2.74e3 7021
## 5 trans.total 0 1 1.72e+3 6.83e+2 0 1.27e3 1.65e3 2.14e3 4114
## 6 trans.car 0 1 1.32e+3 5.85e+2 0 9.27e2 1.25e3 1.69e3 3342
## 7 trans.drov… 0 1 1.18e+3 5.37e+2 0 8.31e2 1.12e3 1.53e3 3164
## 8 trans.carp… 0 1 1.36e+2 1.34e+2 0 4.62e1 9.9 e1 1.91e2 1109
## 9 trans.pubt… 0 1 1.17e+2 1.29e+2 0 2.6 e1 8.75e1 1.66e2 1194
## 10 trans.bicy… 0 1 7.84e+0 2.19e+1 0 0 0 0 220
## 11 trans.walk 0 1 3.87e+1 8.43e+1 0 0 8.5 e0 3.8 e1 788
## 12 trans.WfH 0 1 2.03e+2 1.61e+2 0 8.3 e1 1.65e2 2.93e2 856
## 13 count 0 1 2.64e-2 1.61e-1 0 0 0 0 1
## # … with 1 more variable: hist <chr>, and abbreviated variable names
## # ¹skim_variable, ²n_missing, ³complete_rate
## # ℹ Use `colnames()` to see all variable names
I can now visualize the number of bike rental shops for each tract. It looks like there are only 15 rental shops, and each tract has just one rental shop if they have any.
tm_shape(FD_tract_Geom_bikerentals) + tm_polygons(col="count") +tm_shape(yelp_br_Geom2) +tm_dots()
I dropped 8 rows that had missing income variables. Although there were several rows with missing addresses, I passed it because address field is not important in this assignment.
# Dropping the missing values
br_census_dropnaHH2 <- FD_tract_Geom_bikerentals[!is.na(FD_tract_Geom_bikerentals$hhincome),]
skim(br_census_dropnaHH2)
## Warning: Couldn't find skimmers for class: sfc_MULTIPOLYGON, sfc; No user-
## defined `sfl` provided. Falling back to `character`.
| Name | br_census_dropnaHH2 |
| Number of rows | 522 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| GEOID | 0 | 1 | 11 | 11 | 0 | 522 | 0 |
| geometry | 0 | 1 | 194 | 3750 | 0 | 522 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| hhincome | 0 | 1 | 81559.87 | 49076.11 | 13577 | 46780.50 | 69623.0 | 100893.50 | 250001 | ▇▇▃▁▁ |
| race.tot | 0 | 1 | 3434.26 | 1262.91 | 176 | 2548.50 | 3303.5 | 4252.00 | 7401 | ▁▇▇▃▁ |
| race.white | 0 | 1 | 1348.84 | 1200.38 | 0 | 240.25 | 1094.5 | 2168.75 | 5039 | ▇▅▃▁▁ |
| race.black | 0 | 1 | 1634.44 | 1552.84 | 0 | 367.00 | 994.5 | 2764.75 | 7021 | ▇▂▂▁▁ |
| trans.total | 0 | 1 | 1737.83 | 668.07 | 54 | 1274.25 | 1663.0 | 2149.25 | 4114 | ▂▇▇▂▁ |
| trans.car | 0 | 1 | 1331.53 | 574.70 | 54 | 944.50 | 1256.0 | 1710.50 | 3342 | ▂▇▆▂▁ |
| trans.drovealone | 0 | 1 | 1193.61 | 527.88 | 54 | 834.50 | 1130.5 | 1528.50 | 3164 | ▃▇▆▂▁ |
| trans.carpooled | 0 | 1 | 137.92 | 134.54 | 0 | 47.25 | 101.5 | 192.75 | 1109 | ▇▂▁▁▁ |
| trans.pubtrans | 0 | 1 | 118.63 | 129.35 | 0 | 26.25 | 88.5 | 167.75 | 1194 | ▇▁▁▁▁ |
| trans.bicycle | 0 | 1 | 7.89 | 22.00 | 0 | 0.00 | 0.0 | 0.00 | 220 | ▇▁▁▁▁ |
| trans.walk | 0 | 1 | 38.43 | 83.92 | 0 | 0.00 | 9.0 | 38.00 | 788 | ▇▁▁▁▁ |
| trans.WfH | 0 | 1 | 205.55 | 160.85 | 0 | 86.00 | 169.0 | 293.00 | 856 | ▇▅▂▁▁ |
| count | 0 | 1 | 0.02 | 0.16 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
#changing name because it's too long..
br_final <- br_census_dropnaHH2
(1) Household Income and Bike Rentals
It’s hard to see any relationship between household income and bike rental shops through the plot below.
ggplot(br_final, aes(x=hhincome, y=count)) +
geom_point() +
ylab("Number of bike rental shops and Household Median Income in Tract")
(2) Number of Commuters using Bikes, and Bike Rentals
Even with the number of bicycle commuters, it almost looks like there are bike rental shops where commuters don’t bike.
ggplot(br_final, aes(x=trans.bicycle, y=count)) +
geom_point() +
ylab("Number of bike rental shops and bicycle commuters in Tract")
(3) White Population and Bike Rentals
Even with the number of bicycle commuters, it almost looks like there are bike rental shops where commuters don’t bike.
ggplot(br_final, aes(x=race.white, y=count)) +
geom_point() +
ylab("Number of bike rental shops and White population")
Creating y field (‘bikerentals’)
I would create a binary field that shows the presence of bicycle rental shops to make boxplots.
br_final$bikerental <- ifelse(br_final$count>0, 1, 0)
boxplot(trans.bicycle~bikerental, data=br_final, main="Boxplot of bike rentals by bicycle commuters", xlab="Whether bike rental shops are present", ylab="number of commuters using bicycle")
It was hard to see any relationship between existing fields with the presences of bike rental shops. So I decided to create new features.
I would create a new field that contains population density of a tract. The field ‘pop_density’ would indicate the land area for 1000 people (1/m^2).
br_final$pop_density <- 1000*br_final$race.tot/st_area(br_final$geometry)
I would also create a new field that contains the proportion of bike commuters among total number of commuters.
br_final$bike_commute_p <- 100*br_final$trans.bicycle/br_final$trans.total
Finally, I am creating boxplots and trying binary logistic regression for each of the new feature I created.
boxplot(pop_density~bikerental, data=br_final, main="Boxplot of bike rentals by population density of a tract", xlab="Whether bike rental shops are present", ylab="population density")
binary_br <- glm(bikerental~pop_density, family=binomial, data=br_final)
summary(binary_br)
##
## Call:
## glm(formula = bikerental ~ pop_density, family = binomial, data = br_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2546 -0.2384 -0.2294 -0.2149 2.8097
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.4088 0.4536 -7.514 5.71e-14 ***
## pop_density -0.1670 0.2517 -0.664 0.507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 121.68 on 521 degrees of freedom
## Residual deviance: 121.09 on 520 degrees of freedom
## AIC: 125.09
##
## Number of Fisher Scoring iterations: 7
boxplot(bike_commute_p~bikerental, data=br_final, main="Boxplot of bike rentals by percentage of bike commuters", xlab="Whether bike rental shops are present", ylab="percentage of people using bicycle to commute")
binary_br2 <- glm(bikerental~bike_commute_p, family=binomial, data=br_final)
summary(binary_br2)
##
## Call:
## glm(formula = bikerental ~ bike_commute_p, family = binomial,
## data = br_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3389 -0.2211 -0.2211 -0.2211 2.7288
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.69885 0.30010 -12.32 <2e-16 ***
## bike_commute_p 0.05981 0.18102 0.33 0.741
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 121.68 on 521 degrees of freedom
## Residual deviance: 121.59 on 520 degrees of freedom
## AIC: 125.59
##
## Number of Fisher Scoring iterations: 6
It was difficult to find significant relationships between new features and the presence of bike rental shops at Dekalb and Fulton County. The number of bicycle rental shops in the study area is only 15, which makes finding the relationship very difficult.
tmap_mode("view")
## tmap mode set to interactive viewing
t_pop_den <- tm_shape(br_final) + tm_polygons(col ="pop_density") +tm_shape(yelp_br_Geom2) +tm_dots()
t_bike_commute <- tm_shape(br_final) + tm_polygons(col="bike_commute_p") +tm_shape(yelp_br_Geom2) +tm_dots()
tmap_arrange(t_pop_den, t_bike_commute)