library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.1
## Warning: package 'tibble' was built under R version 4.2.1
## Warning: package 'readr' was built under R version 4.2.1
## Warning: package 'dplyr' was built under R version 4.2.1
## Warning: package 'forcats' was built under R version 4.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidyr)
library(here)
## Warning: package 'here' was built under R version 4.2.1
## here() starts at D:/Georgia Tech/Spec topic_
library(tidycensus)
## Warning: package 'tidycensus' was built under R version 4.2.1
library(sf)
## Warning: package 'sf' was built under R version 4.2.1
## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(tmap)
## Warning: package 'tmap' was built under R version 4.2.1
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.2.1
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(tidyverse)
library(httr)
## Warning: package 'httr' was built under R version 4.2.1
library(jsonlite)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.1
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(here)
library(yelpr)
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.1
# Register the Census API key for this session. The key is read from the
# "google_api" environment variable.
# NOTE(review): the variable name suggests a Google key -- confirm that it
# really holds the Census API key.
if (nzchar(Sys.getenv("google_api"))) {
  tidycensus::census_api_key(Sys.getenv("google_api"))
} else {
  # Sys.getenv() returns "" for an unset variable. Registering an empty key
  # would presumably replace a key that is already configured, so skip the
  # call and say why instead.
  warning(
    "Environment variable 'google_api' is empty; ",
    "no Census API key was registered.",
    call. = FALSE
  )
}
## To install your API key for use in future sessions, run this function with `install = TRUE`.
# NOTE(review): this only creates a global variable called `install`; it is
# not passed to census_api_key() and has no effect on the call above.
install <- TRUE
# Read a subset of Yelp data we downloaded last week.
# here() resolves its arguments against the project root (reported when the
# package was attached as "D:/Georgia Tech/Spec topic_"), so only the file
# name is given. Hard-coding the absolute path inside here() tied the script
# to one machine.
yelp_subset <- read_rds(here("yelp_all_4.rds"))
# Print to see what's inside; width = 1000 is wide enough to show every column
yelp_subset %>%
  tibble() %>%
  print(width = 1000)
## # A tibble: 39,702 × 17
## id alias name
## <chr> <chr> <chr>
## 1 PkOM7wJZzZ0DoxW84_uLDg edenvale-garden-park-san-jose Edenvale Garden Park
## 2 j-gBxd5Nkhr9iRHi_NJp7w great-oaks-park-san-jose Great Oaks Park
## 3 C41iNUrHTWYN9rUJSn_rJw chynoweth-park-san-jose Chynoweth Park
## 4 wvUCOyYuNcsc7g3OsMxWiA danna-rock-park-san-jose Danna Rock Park
## 5 sJ0RCNgqZ4nlLMl4LhKDmQ lake-cunningham-park-san-jose Lake Cunningham Park
## 6 R6Eb_p72vynnpq20aOLATg nisich-park-san-jose-2 Nisich Park
## 7 jcxoXMWrqreHQT0P5h1s9g welch-park-san-jose Welch Park
## 8 Y-tuulUyvCfDzzH4yV-WgQ hillview-park-san-jose Hillview Park
## 9 BCJx7oQ8sN55mcqMyWFnmg vieira-park-san-jose Vieira Park
## 10 DGSPySxKJJJfO6stJEF7Uw lincoln-glen-park-san-jose Lincoln Glen Park
## image_url
## <chr>
## 1 https://s3-media4.fl.yelpcdn.com/bphoto/yFTWpEtAU5xib85UvVq1pQ/o.jpg
## 2 https://s3-media4.fl.yelpcdn.com/bphoto/rcIYKSNUQ84vatIfw6_h7A/o.jpg
## 3 https://s3-media1.fl.yelpcdn.com/bphoto/gx6Ei11wytx_V215HF1gzg/o.jpg
## 4 https://s3-media3.fl.yelpcdn.com/bphoto/F0AQ22xR2eMrDiImNn66fA/o.jpg
## 5 https://s3-media3.fl.yelpcdn.com/bphoto/ZzHo_dwM5ar6CKV6TsxshQ/o.jpg
## 6 https://s3-media3.fl.yelpcdn.com/bphoto/ECTqvmIlk5LyKxKM7A-Row/o.jpg
## 7 https://s3-media4.fl.yelpcdn.com/bphoto/g5mj_1kJMixlfUYA2AgfWg/o.jpg
## 8 https://s3-media1.fl.yelpcdn.com/bphoto/p7Gg879Fu8GzS66cPeu9ig/o.jpg
## 9 https://s3-media2.fl.yelpcdn.com/bphoto/6s2pA1-_oklqy66z0EaKdQ/o.jpg
## 10 https://s3-media4.fl.yelpcdn.com/bphoto/qfk9XMXuvS_0GOHknXZL6w/o.jpg
## is_closed
## <lgl>
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## 8 FALSE
## 9 FALSE
## 10 FALSE
## url
## <chr>
## 1 https://www.yelp.com/biz/edenvale-garden-park-san-jose?adjust_creative=9FUT8…
## 2 https://www.yelp.com/biz/great-oaks-park-san-jose?adjust_creative=9FUT8HLBJS…
## 3 https://www.yelp.com/biz/chynoweth-park-san-jose?adjust_creative=9FUT8HLBJS3…
## 4 https://www.yelp.com/biz/danna-rock-park-san-jose?adjust_creative=9FUT8HLBJS…
## 5 https://www.yelp.com/biz/lake-cunningham-park-san-jose?adjust_creative=9FUT8…
## 6 https://www.yelp.com/biz/nisich-park-san-jose-2?adjust_creative=9FUT8HLBJS3n…
## 7 https://www.yelp.com/biz/welch-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ7…
## 8 https://www.yelp.com/biz/hillview-park-san-jose?adjust_creative=9FUT8HLBJS3n…
## 9 https://www.yelp.com/biz/vieira-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ…
## 10 https://www.yelp.com/biz/lincoln-glen-park-san-jose?adjust_creative=9FUT8HLB…
## review_count categories rating coordinates$latitude $longitude transactions
## <int> <list> <dbl> <dbl> <dbl> <list>
## 1 39 <df [1 × 2]> 4 37.3 -122. <list [0]>
## 2 6 <df [1 × 2]> 3.5 37.3 -122. <list [0]>
## 3 4 <df [2 × 2]> 3 37.3 -122. <list [0]>
## 4 3 <df [1 × 2]> 2.5 37.3 -122. <list [0]>
## 5 95 <df [1 × 2]> 3.5 37.3 -122. <list [0]>
## 6 4 <df [1 × 2]> 4.5 37.3 -122. <list [0]>
## 7 7 <df [1 × 2]> 3 37.3 -122. <list [0]>
## 8 2 <df [2 × 2]> 4 37.3 -122. <list [0]>
## 9 22 <df [1 × 2]> 4 37.3 -122. <list [0]>
## 10 32 <df [1 × 2]> 4 37.3 -122. <list [0]>
## location$address1 $address2 $address3 $city $zip_code $country
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 200 Edenvale Ave "" "" San Jose 95136 US
## 2 5248 Snow Dr "" "" San Jose 95111 US
## 3 Chynoweth Ave & Edenvale Ave "" "" San Jose 95136 US
## 4 4524-4534 Houndshaven Way "" "" San Jose 95111 US
## 5 2305 S White Rd "" "" San Jose 95101 US
## 6 1401-1437 Suzay Ct "" "" San Jose 95122 US
## 7 Kenesta Wy "" "" San Jose 95122 US
## 8 Adrian Way & Ocala Ave "" "" San Jose 95122 US
## 9 700 Adeline Ave "" "" San Jose 95136 US
## 10 Radio Ave & Curtner Ave "" "" San Jose 95125 US
## $state $display_address phone display_phone distance price keys
## <chr> <list> <chr> <chr> <dbl> <chr> <chr>
## 1 CA <chr [2]> "" "" 526. <NA> p
## 2 CA <chr [2]> "+14087935510" "(408) 793-5510" 909. <NA> p
## 3 CA <chr [2]> "+14087935510" "(408) 793-5510" 683. <NA> p
## 4 CA <chr [2]> "" "" 1082. <NA> p
## 5 CA <chr [2]> "+14087935510" "(408) 793-5510" 1346. <NA> p
## 6 CA <chr [2]> "" "" 1804. <NA> p
## 7 CA <chr [2]> "" "" 689. <NA> p
## 8 CA <chr [2]> "+14087935510" "(408) 793-5510" 850. <NA> p
## 9 CA <chr [2]> "" "" 1063. <NA> p
## 10 CA <chr [2]> "+14085353570" "(408) 535-3570" 1553. <NA> p
## # … with 39,692 more rows
# `coordinates` is a nested data-frame column (latitude / longitude);
# show its first rows.
head(yelp_subset$coordinates)
## latitude longitude
## 1 37.26251 -121.8203
## 2 37.26807 -121.8057
## 3 37.25949 -121.8188
## 4 37.27216 -121.8234
## 5 37.33745 -121.8089
## 6 37.31825 -121.8339
# Flatten the nested data-frame columns into ordinary top-level columns and
# convert the result to a tibble.
yelp_flat <- as_tibble(jsonlite::flatten(yelp_subset))
# After flattening there is no `coordinates` column any more (its parts are
# now `coordinates.latitude` / `coordinates.longitude`), so this lookup
# yields NULL together with an "unknown column" warning.
head(yelp_flat$coordinates)
## Warning: Unknown or uninitialised column: `coordinates`.
## NULL
# Concatenate what's inside the list-columns: every element (a character
# vector) is collapsed into one comma-separated string.
yelp_concat <- mutate(
  yelp_flat,
  transactions = map_chr(
    transactions,
    function(parts) str_c(parts, collapse = ", ")
  ),
  location.display_address = map_chr(
    location.display_address,
    function(parts) str_c(parts, collapse = ", ")
  )
)
# Collapse the category titles of one Yelp business into a single string.
#
# x: one element of the Yelp `categories` list-column, i.e. a data frame with
#    the columns "alias" and "title".
# Returns the values of the "title" column joined with ", ".
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}
# Replace the `categories` list-column with one comma-separated string of
# category titles per business, then print every column.
yelp_flat2 <- mutate(
  yelp_concat,
  categories = map_chr(categories, concate_list)
)
print(yelp_flat2, width = 1000)
## # A tibble: 39,702 × 25
## id alias name
## <chr> <chr> <chr>
## 1 PkOM7wJZzZ0DoxW84_uLDg edenvale-garden-park-san-jose Edenvale Garden Park
## 2 j-gBxd5Nkhr9iRHi_NJp7w great-oaks-park-san-jose Great Oaks Park
## 3 C41iNUrHTWYN9rUJSn_rJw chynoweth-park-san-jose Chynoweth Park
## 4 wvUCOyYuNcsc7g3OsMxWiA danna-rock-park-san-jose Danna Rock Park
## 5 sJ0RCNgqZ4nlLMl4LhKDmQ lake-cunningham-park-san-jose Lake Cunningham Park
## 6 R6Eb_p72vynnpq20aOLATg nisich-park-san-jose-2 Nisich Park
## 7 jcxoXMWrqreHQT0P5h1s9g welch-park-san-jose Welch Park
## 8 Y-tuulUyvCfDzzH4yV-WgQ hillview-park-san-jose Hillview Park
## 9 BCJx7oQ8sN55mcqMyWFnmg vieira-park-san-jose Vieira Park
## 10 DGSPySxKJJJfO6stJEF7Uw lincoln-glen-park-san-jose Lincoln Glen Park
## image_url
## <chr>
## 1 https://s3-media4.fl.yelpcdn.com/bphoto/yFTWpEtAU5xib85UvVq1pQ/o.jpg
## 2 https://s3-media4.fl.yelpcdn.com/bphoto/rcIYKSNUQ84vatIfw6_h7A/o.jpg
## 3 https://s3-media1.fl.yelpcdn.com/bphoto/gx6Ei11wytx_V215HF1gzg/o.jpg
## 4 https://s3-media3.fl.yelpcdn.com/bphoto/F0AQ22xR2eMrDiImNn66fA/o.jpg
## 5 https://s3-media3.fl.yelpcdn.com/bphoto/ZzHo_dwM5ar6CKV6TsxshQ/o.jpg
## 6 https://s3-media3.fl.yelpcdn.com/bphoto/ECTqvmIlk5LyKxKM7A-Row/o.jpg
## 7 https://s3-media4.fl.yelpcdn.com/bphoto/g5mj_1kJMixlfUYA2AgfWg/o.jpg
## 8 https://s3-media1.fl.yelpcdn.com/bphoto/p7Gg879Fu8GzS66cPeu9ig/o.jpg
## 9 https://s3-media2.fl.yelpcdn.com/bphoto/6s2pA1-_oklqy66z0EaKdQ/o.jpg
## 10 https://s3-media4.fl.yelpcdn.com/bphoto/qfk9XMXuvS_0GOHknXZL6w/o.jpg
## is_closed
## <lgl>
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## 8 FALSE
## 9 FALSE
## 10 FALSE
## url
## <chr>
## 1 https://www.yelp.com/biz/edenvale-garden-park-san-jose?adjust_creative=9FUT8…
## 2 https://www.yelp.com/biz/great-oaks-park-san-jose?adjust_creative=9FUT8HLBJS…
## 3 https://www.yelp.com/biz/chynoweth-park-san-jose?adjust_creative=9FUT8HLBJS3…
## 4 https://www.yelp.com/biz/danna-rock-park-san-jose?adjust_creative=9FUT8HLBJS…
## 5 https://www.yelp.com/biz/lake-cunningham-park-san-jose?adjust_creative=9FUT8…
## 6 https://www.yelp.com/biz/nisich-park-san-jose-2?adjust_creative=9FUT8HLBJS3n…
## 7 https://www.yelp.com/biz/welch-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ7…
## 8 https://www.yelp.com/biz/hillview-park-san-jose?adjust_creative=9FUT8HLBJS3n…
## 9 https://www.yelp.com/biz/vieira-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ…
## 10 https://www.yelp.com/biz/lincoln-glen-park-san-jose?adjust_creative=9FUT8HLB…
## review_count categories rating transactions phone
## <int> <chr> <dbl> <chr> <chr>
## 1 39 Parks 4 "" ""
## 2 6 Parks 3.5 "" "+14087935510"
## 3 4 Parks, Playgrounds 3 "" "+14087935510"
## 4 3 Parks 2.5 "" ""
## 5 95 Parks 3.5 "" "+14087935510"
## 6 4 Parks 4.5 "" ""
## 7 7 Parks 3 "" ""
## 8 2 Parks, Playgrounds 4 "" "+14087935510"
## 9 22 Parks 4 "" ""
## 10 32 Parks 4 "" "+14085353570"
## display_phone distance price keys coordinates.latitude
## <chr> <dbl> <chr> <chr> <dbl>
## 1 "" 526. <NA> p 37.3
## 2 "(408) 793-5510" 909. <NA> p 37.3
## 3 "(408) 793-5510" 683. <NA> p 37.3
## 4 "" 1082. <NA> p 37.3
## 5 "(408) 793-5510" 1346. <NA> p 37.3
## 6 "" 1804. <NA> p 37.3
## 7 "" 689. <NA> p 37.3
## 8 "(408) 793-5510" 850. <NA> p 37.3
## 9 "" 1063. <NA> p 37.3
## 10 "(408) 535-3570" 1553. <NA> p 37.3
## coordinates.longitude location.address1 location.address2
## <dbl> <chr> <chr>
## 1 -122. 200 Edenvale Ave ""
## 2 -122. 5248 Snow Dr ""
## 3 -122. Chynoweth Ave & Edenvale Ave ""
## 4 -122. 4524-4534 Houndshaven Way ""
## 5 -122. 2305 S White Rd ""
## 6 -122. 1401-1437 Suzay Ct ""
## 7 -122. Kenesta Wy ""
## 8 -122. Adrian Way & Ocala Ave ""
## 9 -122. 700 Adeline Ave ""
## 10 -122. Radio Ave & Curtner Ave ""
## location.address3 location.city location.zip_code location.country
## <chr> <chr> <chr> <chr>
## 1 "" San Jose 95136 US
## 2 "" San Jose 95111 US
## 3 "" San Jose 95136 US
## 4 "" San Jose 95111 US
## 5 "" San Jose 95101 US
## 6 "" San Jose 95122 US
## 7 "" San Jose 95122 US
## 8 "" San Jose 95122 US
## 9 "" San Jose 95136 US
## 10 "" San Jose 95125 US
## location.state location.display_address
## <chr> <chr>
## 1 CA 200 Edenvale Ave, San Jose, CA 95136
## 2 CA 5248 Snow Dr, San Jose, CA 95111
## 3 CA Chynoweth Ave & Edenvale Ave, San Jose, CA 95136
## 4 CA 4524-4534 Houndshaven Way, San Jose, CA 95111
## 5 CA 2305 S White Rd, San Jose, CA 95101
## 6 CA 1401-1437 Suzay Ct, San Jose, CA 95122
## 7 CA Kenesta Wy, San Jose, CA 95122
## 8 CA Adrian Way & Ocala Ave, San Jose, CA 95122
## 9 CA 700 Adeline Ave, San Jose, CA 95136
## 10 CA Radio Ave & Curtner Ave, San Jose, CA 95125
## # … with 39,692 more rows
# Read the full data.
# here() resolves against the project root, so only the file name is needed;
# an absolute path inside here() tied the script to one machine.
my_yelp <- read_rds(here("yelp_all_4.rds"))
# Issue 2 ------------------------------
# Remove duplicated businesses: keep the first row for every Yelp `id` and
# retain all other columns.
yelp_unique <- my_yelp %>%
  distinct(id, .keep_all = TRUE)
# The message describes de-duplication; no NA handling happens at this step.
glue::glue("Before removing duplicates, there were {nrow(my_yelp)} rows. After removing them, there are {nrow(yelp_unique)} rows") %>%
  print()
## Before removing duplicates, there were 39702 rows. After removing them, there are 5551 rows
# Issue 3 ------------------------------
yelp_flat <- mutate(
  # 1. Flattening columns with data frame
  jsonlite::flatten(yelp_unique),
  # 2. Handling list-columns: collapse each element to one string
  transactions = map_chr(
    transactions,
    function(parts) str_c(parts, collapse = ", ")
  ),
  location.display_address = map_chr(
    location.display_address,
    function(parts) str_c(parts, collapse = ", ")
  ),
  # concate_list is the custom function that joins the category titles
  categories = map_chr(categories, concate_list)
)
# Issue 4 ------------------------------
# Count the missing values in every column; the result is a named numeric
# vector with one entry per column.
map_dbl(yelp_flat, function(column) sum(is.na(column)))
## id alias name
## 0 0 0
## image_url is_closed url
## 0 0 0
## review_count categories rating
## 0 0 0
## transactions phone display_phone
## 0 0 0
## distance price keys
## 0 1650 0
## coordinates.latitude coordinates.longitude location.address1
## 2 2 73
## location.address2 location.address3 location.city
## 1146 1327 0
## location.zip_code location.country location.state
## 0 0 0
## location.display_address
## 0
# First, let's verify that the missing values in the lat/long columns are in
# the same rows (the NA count above reports 2 for each of the two columns).
identical(
  is.na(yelp_flat[["coordinates.latitude"]]),
  is.na(yelp_flat[["coordinates.longitude"]])
) # TRUE means the NAs sit in exactly the same rows.
## [1] TRUE
# Drop the rows without coordinates. Filtering on longitude is enough because
# the check above showed latitude is missing in the same rows.
yelp_dropna1 <- drop_na(yelp_flat, coordinates.longitude)
# Dropping NAs in price as well
yelp_dropna2 <- drop_na(yelp_dropna1, price)
head(yelp_dropna2)
## id alias
## 1 cToVW-dCXcxhqFQX_xSc-w orchard-heritage-park-sunnyvale
## 2 Qbuy0qvdlXaXKDOpaDnCBg pho-bowl-san-jose-4
## 3 pyCLzdHoqs3ndmGV1vPamA mingles-mango-san-jose-2
## 4 nm7YhFGOMAw1ojcX32RmpQ tacos-el-pollo-y-el-pollito-san-jose
## 5 Iw0u4il3SpX7rtY0ookQdA puro-michoacan-restaurant-san-jose-2
## 6 WE1O8_MfVq4kM4G1OVAP7g trines-cafe-6-san-jose
## name
## 1 Orchard Heritage Park
## 2 Pho Bowl
## 3 Mingle's Mango
## 4 Tacos El Pollo Y El Pollito
## 5 Puro Michoacan Restaurant
## 6 Trines Cafe 6
## image_url
## 1 https://s3-media2.fl.yelpcdn.com/bphoto/HSOJ1H2XNrLBhlATnPHoKg/o.jpg
## 2 https://s3-media2.fl.yelpcdn.com/bphoto/OwRnJ_0kxA_glrocLu1spg/o.jpg
## 3 https://s3-media3.fl.yelpcdn.com/bphoto/QZSPNHxnVyu9KBwDEqbkbw/o.jpg
## 4 https://s3-media2.fl.yelpcdn.com/bphoto/2OcE1RSGx8MEFtfj566Hew/o.jpg
## 5 https://s3-media2.fl.yelpcdn.com/bphoto/tPx6AVzj0Xh7SnKt1jalEQ/o.jpg
## 6 https://s3-media2.fl.yelpcdn.com/bphoto/KFiqnd1ETApqoPgSq_0Ebg/o.jpg
## is_closed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## url
## 1 https://www.yelp.com/biz/orchard-heritage-park-sunnyvale?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 2 https://www.yelp.com/biz/pho-bowl-san-jose-4?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 3 https://www.yelp.com/biz/mingles-mango-san-jose-2?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 4 https://www.yelp.com/biz/tacos-el-pollo-y-el-pollito-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 5 https://www.yelp.com/biz/puro-michoacan-restaurant-san-jose-2?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 6 https://www.yelp.com/biz/trines-cafe-6-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## review_count categories rating transactions
## 1 8 Museums, Parks, Gift Shops 5
## 2 382 Vietnamese, Soup, Noodles 4 delivery
## 3 362 Asian Fusion, Dim Sum, Noodles 4 pickup, delivery
## 4 65 Mexican, Food Trucks 4 delivery
## 5 38 Mexican 4
## 6 68 Mexican 4 delivery
## phone display_phone distance price keys coordinates.latitude
## 1 +14087490220 (408) 749-0220 1531.2446 $$ p 37.35776
## 2 +14082818288 (408) 281-8288 881.1513 $ a 37.25979
## 3 +14083001425 (408) 300-1425 778.2162 $$ a 37.26058
## 4 +14085092940 (408) 509-2940 702.4299 $ a 37.26053
## 5 +16692347944 (669) 234-7944 262.5733 $$ a 37.26297
## 6 +14082243243 (408) 224-3243 889.1281 $$ a 37.25960
## coordinates.longitude location.address1 location.address2 location.address3
## 1 -122.0268 560 E Remington Dr <NA>
## 2 -121.8077 5316 Monterey Hwy
## 3 -121.8083 Monterey Rd D Ste D
## 4 -121.8096 5270 Monterey Hwy
## 5 -121.8136 5138 Monterey Hwy Ste A <NA>
## 6 -121.8078 5304 Monterey Hwy
## location.city location.zip_code location.country location.state
## 1 Sunnyvale 94087 US CA
## 2 San Jose 95111 US CA
## 3 San Jose 95111 US CA
## 4 San Jose 95111 US CA
## 5 San Jose 95111 US CA
## 6 San Jose 95111 US CA
## location.display_address
## 1 560 E Remington Dr, Sunnyvale, CA 94087
## 2 5316 Monterey Hwy, San Jose, CA 95111
## 3 Monterey Rd D, Ste D, San Jose, CA 95111
## 4 5270 Monterey Hwy, San Jose, CA 95111
## 5 5138 Monterey Hwy, Ste A, San Jose, CA 95111
## 6 5304 Monterey Hwy, San Jose, CA 95111
# `price` is a character column, so summary() only reports length, class and
# mode. Note that this inspects yelp_dropna1, i.e. the table from before the
# NA prices were dropped.
summary(yelp_dropna1[["price"]])
## Length Class Mode
## 5549 character character
# census boundary
#census_a <- st_read("https://raw.githubusercontent.com/BonwooKoo/UrbanAnalytics2022/main/Lab/module_0/testdata.geojson")
# Getting the tract data: 2019 ACS 5-year estimates for Santa Clara County,
# CA, in wide format with geometry attached. Messages emitted by the call are
# suppressed.
census_1 <- suppressMessages(
  get_acs(
    geography = "tract", # or "block group", "county", "state" etc.
    state = "CA",
    county = "Santa Clara",
    variables = c(
      hhincome = "B19019_001",
      race.tot = "B02001_001",
      race.white = "B02001_002",
      race.black = "B02001_003"
    ),
    year = 2019,
    survey = "acs5", # American Community Survey 5-year estimate
    geometry = TRUE, # returns sf objects
    output = "wide" # wide vs. long
  )
)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
# Split NAME (e.g. "Census Tract 5120.42, Santa Clara County, California")
# into three columns. The separator is the bare comma, so the blank that
# follows each comma stays attached: county and state values begin with a
# space (e.g. " Santa Clara County").
census_1_s <- census_1 %>%
  separate(col = NAME, into = c("tract", "county", "state"), sep = ",")
head(census_1_s)
## Simple feature collection with 6 features and 12 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -121.8832 ymin: 37.2602 xmax: -121.7828 ymax: 37.34074
## Geodetic CRS: NAD83
## GEOID tract county state hhincomeE
## 1 06085512042 Census Tract 5120.42 Santa Clara County California 78382
## 2 06085503306 Census Tract 5033.06 Santa Clara County California 87361
## 3 06085503108 Census Tract 5031.08 Santa Clara County California 111618
## 4 06085503323 Census Tract 5033.23 Santa Clara County California 135877
## 5 06085503401 Census Tract 5034.01 Santa Clara County California 79318
## 6 06085512020 Census Tract 5120.20 Santa Clara County California 102401
## hhincomeM race.totE race.totM race.whiteE race.whiteM race.blackE race.blackM
## 1 3798 3020 292 878 173 151 79
## 2 14909 4373 379 1349 443 86 71
## 3 23929 8222 627 3301 619 637 211
## 4 14938 4763 402 1308 458 251 155
## 5 17404 4468 431 1202 366 75 65
## 6 10813 7825 879 4039 902 167 106
## geometry
## 1 MULTIPOLYGON (((-121.82 37....
## 2 MULTIPOLYGON (((-121.8318 3...
## 3 MULTIPOLYGON (((-121.8832 3...
## 4 MULTIPOLYGON (((-121.8006 3...
## 5 MULTIPOLYGON (((-121.8455 3...
## 6 MULTIPOLYGON (((-121.8419 3...
# Turn the Yelp table into an sf point layer in EPSG:4326, built from the
# longitude / latitude columns.
yelp_sf <- st_as_sf(
  yelp_dropna1,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = 4326
)
# sf subsets
# Keep only the points that intersect the union of the Santa Clara County
# tracts. The county value is matched with its leading space because that is
# how the NAME split stored it.
yelp_in <- yelp_sf[
  st_union(
    filter(
      st_transform(census_1_s, 4326),
      county %in% c(" Santa Clara County")
    )
  ), ,
  op = st_intersects
]
nrow(yelp_in)
## [1] 5291
# Report how the row and column counts changed between the raw data and the
# points retained inside the county.
print(
  glue::glue("nrow before: {nrow(my_yelp)} -> nrow after: {nrow(yelp_in)} \n
ncol before: {ncol(my_yelp)} -> ncol after: {ncol(yelp_in)} \n")
)
## nrow before: 39702 -> nrow after: 5291
##
## ncol before: 17 -> ncol after: 24
# Visualize
# Switch tmap to its interactive ("view") mode.
tmap_mode("view")
## tmap mode set to interactive viewing
# Draw the retained businesses as dots coloured by their `price` value.
tm_shape(yelp_in) +
  tm_dots(col = "price")
# Bring the census tracts into EPSG:4326 so they share a CRS with yelp_in.
# The tract geometry is delivered in NAD83 (see "Geodetic CRS: NAD83" in the
# output above). Assigning `st_crs(x) <- 4326` would only relabel those
# coordinates without converting them, which is what sf warned about;
# st_transform() actually reprojects the data.
census_sf <- census_1 %>%
  st_sf() %>%
  st_transform(4326)
# Spatial join
# Tract-side join: the tract polygon geometry is kept and a tract is repeated
# once for every business that intersects it.
census_yelp <- st_join(x = census_sf, y = yelp_in, join = st_intersects) #%>% st_transform(4326)
# Point-side join: the business point geometry is kept and the attributes of
# the intersecting tract are appended.
yelp_census <- st_join(x = yelp_in, y = census_sf, join = st_intersects)
# View
head(census_yelp)
## Simple feature collection with 6 features and 33 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -121.8318 ymin: 37.2602 xmax: -121.8083 ymax: 37.34074
## Geodetic CRS: WGS 84
## GEOID NAME hhincomeE
## 1 06085512042 Census Tract 5120.42, Santa Clara County, California 78382
## 2 06085503306 Census Tract 5033.06, Santa Clara County, California 87361
## 2.1 06085503306 Census Tract 5033.06, Santa Clara County, California 87361
## 2.2 06085503306 Census Tract 5033.06, Santa Clara County, California 87361
## 2.3 06085503306 Census Tract 5033.06, Santa Clara County, California 87361
## 2.4 06085503306 Census Tract 5033.06, Santa Clara County, California 87361
## hhincomeM race.totE race.totM race.whiteE race.whiteM race.blackE
## 1 3798 3020 292 878 173 151
## 2 14909 4373 379 1349 443 86
## 2.1 14909 4373 379 1349 443 86
## 2.2 14909 4373 379 1349 443 86
## 2.3 14909 4373 379 1349 443 86
## 2.4 14909 4373 379 1349 443 86
## race.blackM id
## 1 79 Iw0u4il3SpX7rtY0ookQdA
## 2 71 jcxoXMWrqreHQT0P5h1s9g
## 2.1 71 lZUd_rDdO5FyZQZfKjGqZw
## 2.2 71 4XaESbCqIsmETYxpPiv9Lg
## 2.3 71 NlYprq0SB1tNdgWv1GpXdA
## 2.4 71 smFno58h21Rd4RihK87xvw
## alias
## 1 puro-michoacan-restaurant-san-jose-2
## 2 welch-park-san-jose
## 2.1 cha-ca-long-phung-san-jose-2
## 2.2 mexican-style-churros-san-jose-2
## 2.3 que-ta-banh-canh-trang-bang-udon-noodle-soup-san-jose
## 2.4 taste-of-persia-san-jose
## name
## 1 Puro Michoacan Restaurant
## 2 Welch Park
## 2.1 Cha Ca Long Phung
## 2.2 Mexican Style Churros
## 2.3 Que Ta Banh Canh Trang Bang - Udon Noodle Soup
## 2.4 Taste of Persia
## image_url
## 1 https://s3-media2.fl.yelpcdn.com/bphoto/tPx6AVzj0Xh7SnKt1jalEQ/o.jpg
## 2 https://s3-media4.fl.yelpcdn.com/bphoto/g5mj_1kJMixlfUYA2AgfWg/o.jpg
## 2.1 https://s3-media1.fl.yelpcdn.com/bphoto/HF_-0eyrZQ9_-lKVjNpUyg/o.jpg
## 2.2 https://s3-media3.fl.yelpcdn.com/bphoto/L4IDuy_D2wpBwiSC0P6qzw/o.jpg
## 2.3 https://s3-media1.fl.yelpcdn.com/bphoto/ykTpTBey8xmauZpnv3s22A/o.jpg
## 2.4 https://s3-media2.fl.yelpcdn.com/bphoto/6g5XM0Pgi3qIYv9PnkAESw/o.jpg
## is_closed
## 1 FALSE
## 2 FALSE
## 2.1 FALSE
## 2.2 FALSE
## 2.3 FALSE
## 2.4 FALSE
## url
## 1 https://www.yelp.com/biz/puro-michoacan-restaurant-san-jose-2?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 2 https://www.yelp.com/biz/welch-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 2.1 https://www.yelp.com/biz/cha-ca-long-phung-san-jose-2?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 2.2 https://www.yelp.com/biz/mexican-style-churros-san-jose-2?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 2.3 https://www.yelp.com/biz/que-ta-banh-canh-trang-bang-udon-noodle-soup-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 2.4 https://www.yelp.com/biz/taste-of-persia-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## review_count categories rating transactions
## 1 38 Mexican 4.0
## 2 7 Parks 3.0
## 2.1 200 Vietnamese 4.0
## 2.2 52 Desserts, Mexican 5.0
## 2.3 192 Vietnamese, Seafood, Noodles 3.5 pickup, delivery
## 2.4 82 Mediterranean, Persian/Iranian 4.0 pickup, delivery
## phone display_phone distance price keys location.address1
## 1 +16692347944 (669) 234-7944 262.5733 $$ a 5138 Monterey Hwy
## 2 689.0162 <NA> p Kenesta Wy
## 2.1 +14082549941 (408) 254-9941 533.5555 $ a 2145 Tully Rd
## 2.2 +14085129594 (408) 512-9594 757.5121 $ a 1812 Cunningham Ave
## 2.3 +14082591445 (408) 259-1445 603.5719 $$ a 2005 Tully Rd
## 2.4 +14084935978 (408) 493-5978 597.6317 $$ a 2011 Tully Rd
## location.address2 location.address3 location.city location.zip_code
## 1 Ste A <NA> San Jose 95111
## 2 San Jose 95122
## 2.1 San Jose 95122
## 2.2 <NA> San Jose 95122
## 2.3 <NA> San Jose 95122
## 2.4 <NA> San Jose 95122
## location.country location.state
## 1 US CA
## 2 US CA
## 2.1 US CA
## 2.2 US CA
## 2.3 US CA
## 2.4 US CA
## location.display_address geometry
## 1 5138 Monterey Hwy, Ste A, San Jose, CA 95111 MULTIPOLYGON (((-121.82 37....
## 2 Kenesta Wy, San Jose, CA 95122 MULTIPOLYGON (((-121.8318 3...
## 2.1 2145 Tully Rd, San Jose, CA 95122 MULTIPOLYGON (((-121.8318 3...
## 2.2 1812 Cunningham Ave, San Jose, CA 95122 MULTIPOLYGON (((-121.8318 3...
## 2.3 2005 Tully Rd, San Jose, CA 95122 MULTIPOLYGON (((-121.8318 3...
## 2.4 2011 Tully Rd, San Jose, CA 95122 MULTIPOLYGON (((-121.8318 3...
# First rows of the point-side join (business points with tract attributes).
head(yelp_census)
## Simple feature collection with 6 features and 33 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -121.8339 ymin: 37.25949 xmax: -121.8057 ymax: 37.33745
## Geodetic CRS: WGS 84
## id alias name
## 1 PkOM7wJZzZ0DoxW84_uLDg edenvale-garden-park-san-jose Edenvale Garden Park
## 2 j-gBxd5Nkhr9iRHi_NJp7w great-oaks-park-san-jose Great Oaks Park
## 3 C41iNUrHTWYN9rUJSn_rJw chynoweth-park-san-jose Chynoweth Park
## 4 wvUCOyYuNcsc7g3OsMxWiA danna-rock-park-san-jose Danna Rock Park
## 5 sJ0RCNgqZ4nlLMl4LhKDmQ lake-cunningham-park-san-jose Lake Cunningham Park
## 6 R6Eb_p72vynnpq20aOLATg nisich-park-san-jose-2 Nisich Park
## image_url
## 1 https://s3-media4.fl.yelpcdn.com/bphoto/yFTWpEtAU5xib85UvVq1pQ/o.jpg
## 2 https://s3-media4.fl.yelpcdn.com/bphoto/rcIYKSNUQ84vatIfw6_h7A/o.jpg
## 3 https://s3-media1.fl.yelpcdn.com/bphoto/gx6Ei11wytx_V215HF1gzg/o.jpg
## 4 https://s3-media3.fl.yelpcdn.com/bphoto/F0AQ22xR2eMrDiImNn66fA/o.jpg
## 5 https://s3-media3.fl.yelpcdn.com/bphoto/ZzHo_dwM5ar6CKV6TsxshQ/o.jpg
## 6 https://s3-media3.fl.yelpcdn.com/bphoto/ECTqvmIlk5LyKxKM7A-Row/o.jpg
## is_closed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## url
## 1 https://www.yelp.com/biz/edenvale-garden-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 2 https://www.yelp.com/biz/great-oaks-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 3 https://www.yelp.com/biz/chynoweth-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 4 https://www.yelp.com/biz/danna-rock-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 5 https://www.yelp.com/biz/lake-cunningham-park-san-jose?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## 6 https://www.yelp.com/biz/nisich-park-san-jose-2?adjust_creative=9FUT8HLBJS3nIJ7hDk8tZw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=9FUT8HLBJS3nIJ7hDk8tZw
## review_count categories rating transactions phone
## 1 39 Parks 4.0
## 2 6 Parks 3.5 +14087935510
## 3 4 Parks, Playgrounds 3.0 +14087935510
## 4 3 Parks 2.5
## 5 95 Parks 3.5 +14087935510
## 6 4 Parks 4.5
## display_phone distance price keys location.address1
## 1 526.0228 <NA> p 200 Edenvale Ave
## 2 (408) 793-5510 909.4110 <NA> p 5248 Snow Dr
## 3 (408) 793-5510 682.7830 <NA> p Chynoweth Ave & Edenvale Ave
## 4 1082.4227 <NA> p 4524-4534 Houndshaven Way
## 5 (408) 793-5510 1346.1879 <NA> p 2305 S White Rd
## 6 1803.6135 <NA> p 1401-1437 Suzay Ct
## location.address2 location.address3 location.city location.zip_code
## 1 San Jose 95136
## 2 San Jose 95111
## 3 San Jose 95136
## 4 San Jose 95111
## 5 San Jose 95101
## 6 San Jose 95122
## location.country location.state
## 1 US CA
## 2 US CA
## 3 US CA
## 4 US CA
## 5 US CA
## 6 US CA
## location.display_address GEOID
## 1 200 Edenvale Ave, San Jose, CA 95136 06085512021
## 2 5248 Snow Dr, San Jose, CA 95111 06085512017
## 3 Chynoweth Ave & Edenvale Ave, San Jose, CA 95136 06085512021
## 4 4524-4534 Houndshaven Way, San Jose, CA 95111 06085512017
## 5 2305 S White Rd, San Jose, CA 95101 06085503321
## 6 1401-1437 Suzay Ct, San Jose, CA 95122 06085503111
## NAME hhincomeE hhincomeM
## 1 Census Tract 5120.21, Santa Clara County, California 130460 7700
## 2 Census Tract 5120.17, Santa Clara County, California 92855 17920
## 3 Census Tract 5120.21, Santa Clara County, California 130460 7700
## 4 Census Tract 5120.17, Santa Clara County, California 92855 17920
## 5 Census Tract 5033.21, Santa Clara County, California 156667 28278
## 6 Census Tract 5031.11, Santa Clara County, California 99423 18943
## race.totE race.totM race.whiteE race.whiteM race.blackE race.blackM
## 1 6491 502 3331 469 275 225
## 2 7565 781 3316 634 196 175
## 3 6491 502 3331 469 275 225
## 4 7565 781 3316 634 196 175
## 5 4690 355 419 182 44 49
## 6 5132 483 1167 444 17 27
## geometry
## 1 POINT (-121.8203 37.26251)
## 2 POINT (-121.8057 37.26807)
## 3 POINT (-121.8188 37.25949)
## 4 POINT (-121.8234 37.27216)
## 5 POINT (-121.8089 37.33745)
## 6 POINT (-121.8339 37.31825)
# Polygon map of the mean rating per GEOID group, classed into quantiles.
# NOTE(review): groups whose rating values include NA get an NA mean here,
# because mean() is called without na.rm -- confirm that is intended.
tm_shape(
  summarise(
    group_by(census_yelp, GEOID),
    rating = mean(rating)
  )
) +
  tm_polygons(col = "rating", style = "quantile")
# Dot map of the Yelp points, colored by the hhincomeE column
# (presumably the household income estimate from the census join -- confirm upstream).
yelp_census %>%
  tm_shape() +
  tm_dots(col = "hhincomeE")
# Recode review_count into a two-level label: "many" when above 1000 reviews,
# "few" when at or below 1000 (missing counts stay NA). Then preview the result.
yelp_in %>%
  # mutate() adds the label as a new column rather than overwriting review_count
  mutate(
    review_count_binary = if_else(review_count > 1000, "many", "few")
  ) %>%
  # Keep only the original and recoded columns so the printout stays short
  select(review_count, review_count_binary) %>%
  head()
## Simple feature collection with 6 features and 2 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -121.8339 ymin: 37.25949 xmax: -121.8057 ymax: 37.33745
## Geodetic CRS: WGS 84
## review_count review_count_binary geometry
## 1 39 few POINT (-121.8203 37.26251)
## 2 6 few POINT (-121.8057 37.26807)
## 3 4 few POINT (-121.8188 37.25949)
## 4 3 few POINT (-121.8234 37.27216)
## 5 95 few POINT (-121.8089 37.33745)
## 6 4 few POINT (-121.8339 37.31825)
# Standardise every numeric column with scale() (center and divide by the
# standard deviation) and keep only those columns.
# Predicate functions must be wrapped in where() for tidyselect; passing a
# bare `is.numeric` is deprecated and raises a warning.
yelp_in %>%
  mutate(across(where(is.numeric), scale)) %>%
  select(where(is.numeric))
## Warning: Predicate functions must be wrapped in `where()`.
##
## # Bad
## data %>% select(is.numeric)
##
## # Good
## data %>% select(where(is.numeric))
##
## ℹ Please update your code.
## This message is displayed once per session.
## Simple feature collection with 5291 features and 3 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -122.1847 ymin: 36.95046 xmax: -121.52 ymax: 37.45962
## Geodetic CRS: WGS 84
## First 10 features:
## review_count rating distance geometry
## 1 -0.5120726 0.4815183 -0.50796146 POINT (-121.8203 37.26251)
## 2 -0.5844259 -0.1124927 -0.32989489 POINT (-121.8057 37.26807)
## 3 -0.5888110 -0.7065038 -0.43515339 POINT (-121.8188 37.25949)
## 4 -0.5910035 -1.3005149 -0.24953872 POINT (-121.8234 37.27216)
## 5 -0.3892912 -0.1124927 -0.12703162 POINT (-121.8089 37.33745)
## 6 -0.5888110 1.0755294 0.08542202 POINT (-121.8339 37.31825)
## 7 -0.5822334 -0.7065038 -0.43225838 POINT (-121.8239 37.3255)
## 8 -0.5931960 0.4815183 -0.35756898 POINT (-121.8254 37.33876)
## 9 -0.5493455 0.4815183 -0.25870205 POINT (-121.8614 37.28697)
## 10 -0.5274203 0.4815183 -0.03095143 POINT (-121.8896 37.28915)
The spatial patterns in the data changed considerably after cleaning. The difference is clearest in the second map, where most of the POIs lie along the transportation network. The clusters are denser in the center of the city and fade toward the outer, peripheral areas. The review count is highest for businesses rated 4.0 and second highest for those rated 3.5. Spatially, price and rating show some correlation: higher-priced restaurants tend to be located in areas with good ratings, and vice versa.