library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(here)
## here() starts at C:/Users/chan303/OneDrive - Georgia Institute of Technology/CP8883BK/UA_module2
library(readr)
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(tmap)
library(tidycensus)
library(skimr)
# Load the Yelp business data saved as an .rds file in the sibling
# UA_module1 project; the path is resolved relative to here().
my_yelp <- here("..", "UA_module1", "Data", "yelp_all.rds") %>%
  read_rds()
Issue 1: Duplicate Data
# Deleting duplicate businesses
# Keep the first row for each Yelp business id; .keep_all = TRUE retains all
# of the other columns of that row.
yelp_unique <- my_yelp %>%
  distinct(id, .keep_all = TRUE)
# Report how many rows the de-duplication removed (this step drops duplicates,
# not NAs, so the message says so).
glue::glue(
  "Before dropping duplicates, there were {nrow(my_yelp)} rows. ",
  "After dropping them, there are {nrow(yelp_unique)} rows"
) %>%
  print()
## Before dropping duplicates, there were 3286 rows. After dropping them, there are 466 rows
# why are there so many duplicate rows in the first place?
Issue 2: Flattening Columns
# Collapse the category titles of one Yelp business into a single string.
#
# x: one element of the Yelp "categories" list-column, i.e. a data frame with
#    columns "alias" and "title".
# Returns a length-one character vector with the titles joined by ", ", or ""
# when x carries no titles.
concate_list <- function(x) {
  titles <- x[["title"]]
  # map_chr() requires exactly one string per element. Return "" explicitly
  # for a business without category titles instead of relying on how str_c()
  # treats zero-length / NULL input.
  if (length(titles) == 0L) {
    return("")
  }
  str_c(titles, collapse = ", ")
}
# Turn the nested Yelp table into a flat one: first spread nested data-frame
# columns into ordinary columns, then reduce each list-column to one string
# per row.
yelp_flat <- jsonlite::flatten(yelp_unique) %>%
  mutate(
    # list-columns: join the elements of each row with ", "
    transactions = map_chr(
      transactions,
      function(parts) str_c(parts, collapse = ", ")
    ),
    location.display_address = map_chr(
      location.display_address,
      function(parts) str_c(parts, collapse = ", ")
    ),
    # list-column of category data frames: join each row's titles
    categories = map_chr(categories, concate_list)
  )
Issue 3: Missing Values
# Count the NA values in every column; the result is a named numeric vector
# with one entry per column of yelp_flat.
map_dbl(yelp_flat, function(column) sum(is.na(column)))
## id alias name
## 0 0 0
## image_url is_closed url
## 0 0 0
## review_count categories rating
## 0 0 0
## transactions price phone
## 0 209 0
## display_phone distance coordinates.latitude
## 0 0 0
## coordinates.longitude location.address1 location.address2
## 0 4 100
## location.address3 location.city location.zip_code
## 117 0 0
## location.country location.state location.display_address
## 0 0 0
# The NA counts above report 0 missing latitude/longitude values (the 4 NAs
# belong to location.address1). Still, confirm that the two coordinate columns
# are missing in exactly the same rows before relying on them.
identical(is.na(yelp_flat$coordinates.latitude),
          is.na(yelp_flat$coordinates.longitude))
## [1] TRUE
# Drop rows missing either coordinate. Checking both columns means this step
# does not depend on the identical() result above; with no coordinate NAs it
# leaves the data unchanged.
yelp_dropna1 <- yelp_flat %>%
  drop_na(coordinates.longitude, coordinates.latitude)
# Dropping NAs in price (209 rows according to the counts above)
yelp_dropna2 <- yelp_dropna1 %>%
  drop_na(price)
#### Tract polygons for the Yelp query
# Download the 2019 ACS 5-year estimate of variable B19019_001 (named
# "hhincome") for every census tract in Champaign County, IL. Wide output
# yields one row per tract; geometry = TRUE returns an sf object.
# suppressMessages() silences the messages emitted by the download.
champaign_census <- suppressMessages(
  get_acs(
    geography = "tract",
    variables = c(hhincome = "B19019_001"),
    state = "IL",
    county = "Champaign",
    year = 2019,
    survey = "acs5",
    output = "wide",
    geometry = TRUE
  )
)
##
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 9%
|
|======= | 10%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============== | 20%
|
|=============== | 21%
|
|================ | 23%
|
|================= | 24%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 29%
|
|===================== | 31%
|
|======================= | 32%
|
|======================= | 33%
|
|========================= | 35%
|
|========================== | 37%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 49%
|
|==================================== | 51%
|
|==================================== | 52%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|============================================ | 63%
|
|============================================= | 65%
|
|=============================================== | 68%
|
|================================================ | 69%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|========================================================= | 81%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================= | 88%
|
|=============================================================== | 91%
|
|================================================================= | 92%
|
|=================================================================== | 95%
|
|==================================================================== | 97%
|
|======================================================================| 100%
I found that the census data uses the NAD83 CRS, while the Yelp data uses WGS84. I will transform the census data to WGS84 to avoid an error when running ‘st_intersects’.
# Reproject the tract polygons to WGS84 so they share a CRS with the points
# created below.
champaign_census <- champaign_census %>% st_transform('WGS84')
# Build point geometries from the longitude/latitude columns (EPSG:4326).
yelp_sf <- st_as_sf(
  yelp_dropna2,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = 4326
)
# Spatial subset: keep only the businesses that intersect the union of all
# tract polygons.
yelp_in <- yelp_sf[st_union(champaign_census), , op = st_intersects]
I found that 6 businesses were outside the boundary.
# Compare the dimensions of the raw Yelp table with the cleaned, spatially
# filtered sf object.
print(
  glue::glue("nrow before: {nrow(my_yelp)} -> nrow after: {nrow(yelp_in)} \n
ncol before: {ncol(my_yelp)} -> ncol after: {ncol(yelp_in)} \n")
)
## nrow before: 3286 -> nrow after: 251
##
## ncol before: 16 -> ncol after: 23
Figure 1: Price distribution of shops and hotels in Champaign County
# Switch tmap to interactive viewing for the maps that follow.
tmap_mode("view")
## tmap mode set to interactive viewing
# Draw each retained business as a dot coloured by its Yelp price level.
tm_shape(yelp_in) +
  tm_dots(col = "price")
# get_acs(geometry = TRUE) already returned an sf object, so st_sf() here
# only re-asserts that class before joining.
census_sf <- st_sf(champaign_census)
# Spatial joins in both directions:
#   census_yelp - tract polygons with the attributes of each intersecting
#                 business (one row per match; tracts without a match keep NAs)
#   yelp_census - business points with the attributes of their tract
census_yelp <- census_sf %>% st_join(yelp_in, join = st_intersects)
yelp_census <- yelp_in %>% st_join(census_sf, join = st_intersects)
# Preview the tract-level join
head(census_yelp)
## Simple feature collection with 6 features and 26 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -88.3143 ymin: 40.0622 xmax: -88.12398 ymax: 40.13507
## Geodetic CRS: WGS 84
## GEOID NAME hhincomeE
## 1 17019005800 Census Tract 58, Champaign County, Illinois 49855
## 2 17019005900 Census Tract 59, Champaign County, Illinois 10988
## 3 17019001206 Census Tract 12.06, Champaign County, Illinois 108385
## 4 17019005600 Census Tract 56, Champaign County, Illinois 60774
## 4.1 17019005600 Census Tract 56, Champaign County, Illinois 60774
## 5 17019000700 Census Tract 7, Champaign County, Illinois 40074
## hhincomeM id alias
## 1 6225 z-jVqx3Wx9fKOK6fn86Csg klose-knit-urbana
## 2 2328 5lXhJe7gob74m93eyjyp3A illini-union-hotel-urbana-2
## 3 3924 <NA> <NA>
## 4 11208 mqW1uq15v8iABsh9BzDZjA bakers-bikes-urbana
## 4.1 11208 Q9apdosOkT4rnX3YRurbmA country-arbors-nursery-urbana-3
## 5 5883 0HVZte6_aF1yDNeAde9u9Q orphans-treasure-box-champaign
## name
## 1 Klose Knit
## 2 Illini Union Hotel
## 3 <NA>
## 4 Baker's Bikes
## 4.1 Country Arbors Nursery
## 5 Orphans Treasure Box
## image_url
## 1 https://s3-media4.fl.yelpcdn.com/bphoto/weHG-Pp-SSTwdlOid20SwQ/o.jpg
## 2 https://s3-media3.fl.yelpcdn.com/bphoto/ly2xBb_vyNq3FLiEYaV3Pw/o.jpg
## 3 <NA>
## 4 https://s3-media4.fl.yelpcdn.com/bphoto/yvbHZ0fmRfvuFRzxwuTCUw/o.jpg
## 4.1 https://s3-media2.fl.yelpcdn.com/bphoto/E9XXy-fjkzL9FZz-s4Yy7Q/o.jpg
## 5 https://s3-media3.fl.yelpcdn.com/bphoto/1A-x7_-9PMGgtJ1QtmTj8A/o.jpg
## is_closed
## 1 FALSE
## 2 FALSE
## 3 NA
## 4 FALSE
## 4.1 FALSE
## 5 FALSE
## url
## 1 https://www.yelp.com/biz/klose-knit-urbana?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 2 https://www.yelp.com/biz/illini-union-hotel-urbana-2?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 3 <NA>
## 4 https://www.yelp.com/biz/bakers-bikes-urbana?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 4.1 https://www.yelp.com/biz/country-arbors-nursery-urbana-3?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 5 https://www.yelp.com/biz/orphans-treasure-box-champaign?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## review_count categories rating
## 1 12 Knitting Supplies 4.5
## 2 16 Hotels 2.5
## 3 NA <NA> NA
## 4 33 Bikes, Bike Repair/Maintenance 4.5
## 4.1 10 Nurseries & Gardening 4.5
## 5 4 Community Service/Non-Profit, Used Bookstore 5.0
## transactions price phone display_phone distance
## 1 $$$$ +12173442123 (217) 344-2123 744.0377
## 2 $$ +12173331241 (217) 333-1241 553.3484
## 3 <NA> <NA> <NA> <NA> NA
## 4 $ +12173650318 (217) 365-0318 1125.3185
## 4.1 $$ +12173671072 (217) 367-1072 2439.1093
## 5 $ +12172983202 (217) 298-3202 972.7355
## location.address1 location.address2 location.address3 location.city
## 1 311 W Springfield Ave Urbana
## 2 1401 W Green St Urbana
## 3 <NA> <NA> <NA> <NA>
## 4 1003 S Lynn St Urbana
## 4.1 1742 County Rd 1400 N Urbana
## 5 826 Pioneer St <NA> Champaign
## location.zip_code location.country location.state
## 1 61801 US IL
## 2 61801 US IL
## 3 <NA> <NA> <NA>
## 4 61801 US IL
## 4.1 61802 US IL
## 5 61820 US IL
## location.display_address geometry
## 1 311 W Springfield Ave, Urbana, IL 61801 MULTIPOLYGON (((-88.21775 4...
## 2 1401 W Green St, Urbana, IL 61801 MULTIPOLYGON (((-88.22882 4...
## 3 <NA> MULTIPOLYGON (((-88.3143 40...
## 4 1003 S Lynn St, Urbana, IL 61801 MULTIPOLYGON (((-88.20497 4...
## 4.1 1742 County Rd 1400 N, Urbana, IL 61802 MULTIPOLYGON (((-88.20497 4...
## 5 826 Pioneer St, Champaign, IL 61820 MULTIPOLYGON (((-88.25797 4...
Figure 2: Ratings distribution of shops and hotels in Champaign County
# Average the Yelp ratings within each census tract and map the result with
# quantile class breaks.
census_yelp %>%
  group_by(GEOID) %>%
  summarise(rating = mean(rating)) %>%
  tm_shape() +
  tm_polygons(col = "rating", style = "quantile")
# Preview the business-level join
head(yelp_census)
## Simple feature collection with 6 features and 26 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -88.2366 ymin: 40.10431 xmax: -88.19821 ymax: 40.11276
## Geodetic CRS: WGS 84
## id alias name
## 1 KuJEyXesWoyOm3GQm0rqcg the-idea-store-urbana The Idea Store
## 2 mqW1uq15v8iABsh9BzDZjA bakers-bikes-urbana Baker's Bikes
## 3 e0tj2Jip560QbC8N9pF6xw fyxit-champaign-2 FYXIT
## 4 ckdGk8ForF9zEczmu4-tTA strawberry-fields-urbana-2 Strawberry Fields
## 5 POuzQLJuPWx0i-dUNdRsWQ international-galleries-urbana International Galleries
## 6 7Vaj54SeGM3RupuvNlccOw heel-to-toe-urbana Heel To Toe
## image_url
## 1 https://s3-media2.fl.yelpcdn.com/bphoto/Rc6DmJoBp9zdjrsdURIKPg/o.jpg
## 2 https://s3-media4.fl.yelpcdn.com/bphoto/yvbHZ0fmRfvuFRzxwuTCUw/o.jpg
## 3 https://s3-media1.fl.yelpcdn.com/bphoto/3u2wpWz6vU4geIOI9K61VA/o.jpg
## 4 https://s3-media2.fl.yelpcdn.com/bphoto/zO4Ie2IkCY9Asifnboy82w/o.jpg
## 5
## 6 https://s3-media2.fl.yelpcdn.com/bphoto/sVy8H_AQ0Zy4jX7Dhq5CJA/o.jpg
## is_closed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## url
## 1 https://www.yelp.com/biz/the-idea-store-urbana?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 2 https://www.yelp.com/biz/bakers-bikes-urbana?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 3 https://www.yelp.com/biz/fyxit-champaign-2?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 4 https://www.yelp.com/biz/strawberry-fields-urbana-2?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 5 https://www.yelp.com/biz/international-galleries-urbana?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## 6 https://www.yelp.com/biz/heel-to-toe-urbana?adjust_creative=-To_AbVIHKh8mMAunCeEbg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=-To_AbVIHKh8mMAunCeEbg
## review_count categories rating
## 1 22 Art Supplies, Thrift Stores 4.5
## 2 33 Bikes, Bike Repair/Maintenance 4.5
## 3 36 Electronics Repair, Mobile Phone Repair, Computers 4.5
## 4 30 Organic Stores, Bakeries, Vitamins & Supplements 3.5
## 5 9 Jewelry, Art Galleries, Framing 4.5
## 6 32 Shoe Stores, Shoe Repair, Orthotics 3.5
## transactions price phone display_phone distance
## 1 $ +12173527878 (217) 352-7878 612.9849
## 2 $ +12173650318 (217) 365-0318 1125.3185
## 3 $$ +12176974171 (217) 697-4171 2216.2918
## 4 delivery, pickup $$ +12173281655 (217) 328-1655 783.2241
## 5 $$ +12173282254 (217) 328-2254 725.2433
## 6 $$$ +12173672880 (217) 367-2880 843.9406
## location.address1 location.address2 location.address3 location.city
## 1 125 Lincoln Square Urbana
## 2 1003 S Lynn St Urbana
## 3 202 E Green St Ste 3 Champaign
## 4 306 W Springfield Ave Urbana
## 5 118 Lincoln Square Urbana
## 6 106 W Main St Urbana
## location.zip_code location.country location.state
## 1 61801 US IL
## 2 61801 US IL
## 3 61820 US IL
## 4 61801 US IL
## 5 61801 US IL
## 6 61801 US IL
## location.display_address GEOID
## 1 125 Lincoln Square, Urbana, IL 61801 17019011100
## 2 1003 S Lynn St, Urbana, IL 61801 17019005600
## 3 202 E Green St, Ste 3, Champaign, IL 61820 17019000301
## 4 306 W Springfield Ave, Urbana, IL 61801 17019011100
## 5 118 Lincoln Square, Urbana, IL 61801 17019011100
## 6 106 W Main St, Urbana, IL 61801 17019011100
## NAME hhincomeE hhincomeM
## 1 Census Tract 111, Champaign County, Illinois 22741 11621
## 2 Census Tract 56, Champaign County, Illinois 60774 11208
## 3 Census Tract 3.01, Champaign County, Illinois 7099 1978
## 4 Census Tract 111, Champaign County, Illinois 22741 11621
## 5 Census Tract 111, Champaign County, Illinois 22741 11621
## 6 Census Tract 111, Champaign County, Illinois 22741 11621
## geometry
## 1 POINT (-88.20729 40.11008)
## 2 POINT (-88.19821 40.10431)
## 3 POINT (-88.2366 40.11037)
## 4 POINT (-88.21127 40.11263)
## 5 POINT (-88.20721 40.11124)
## 6 POINT (-88.20776 40.11276)
Figure 3: Income distribution of census tracts and shops/hotels in Champaign County
# Draw each business as a dot coloured by the hhincomeE value of its tract.
tm_shape(yelp_census) +
  tm_dots(col = "hhincomeE")
After tidying the data, the number of businesses dropped drastically, mainly because many of the original rows were duplicates. There are 251 businesses in total after data cleaning, whereas there were 3286 rows before tidying. From Figure 1, I found that the cheapest shops/hotels are usually located in Downtown Champaign. Businesses with a slightly higher price ($$) were mostly aligned along Neil Street, which runs north–south through the City of Champaign. This pattern could be related to household income: Figure 3 shows that Downtown Champaign has lower household income than Southern Champaign. Meanwhile, the relationship between ratings and prices is not clear. This might be because the number of businesses is not large enough, or simply because the distribution is random. From Figure 2, the map shows that ratings tend to be higher in the eastern part of Champaign, but this is also not reliable because the number of businesses is small.
-END-