# Download 2021 ACS 5-year household income (variable B19019_001, stored as
# `hhincome`) with tract geometries for DuPage and Will Counties, Illinois.
# Messages from the download are suppressed to keep rendered output clean.
tract <- suppressMessages(
  get_acs(
    geography = "tract",
    state = "IL",
    county = c("DuPage County", "Will County"),
    variables = c(hhincome = "B19019_001"),
    year = 2021,
    survey = "acs5",
    geometry = TRUE,
    output = "wide",
    # Use the full tigris argument name instead of relying on partial
    # matching of `progress` against `progress_bar`.
    progress_bar = FALSE
  )
)
# Select the city of interest: the Naperville boundary from the Illinois
# places file. `progress_bar = FALSE` keeps the tigris download progress bar
# out of the rendered output.
naperville <- tigris::places("IL", progress_bar = FALSE) %>%
  filter(NAME == "Naperville")
## (download progress bar output, 0% through 100%, trimmed)
# Keep only the tracts that intersect the Naperville boundary. The predicate
# is written out explicitly; st_intersects is what sf's spatial `[` subsetting
# uses when none is given.
tract_naperville <- tract[naperville, , op = st_intersects]
# Build a search circle for one geometry.
#
# poly:    a geometry accepted by st_bbox().
# epsg_id: CRS assigned to the points constructed here.
#
# Returns a one-row sf point object placed at the midpoint of the bounding box
# of `poly`, with a `radius` column equal to 1.1 times the distance between the
# first bounding-box corner and that midpoint.
get_r <- function(poly, epsg_id) {
  box <- st_bbox(poly)

  # Corner point built from the first two bounding-box entries.
  corner <- st_sfc(st_point(c(box[1], box[2])), crs = epsg_id)

  # Midpoint of the bounding box along each axis.
  mid_x <- (box[1] + box[3]) / 2
  mid_y <- (box[2] + box[4]) / 2

  # Kept as a magrittr pipe into st_sf() so that st_sf() receives the same
  # argument expression as before (it can influence the geometry column name).
  center <- st_sfc(st_point(c(mid_x, mid_y)), crs = epsg_id) %>% st_sf()

  # Pad the corner-to-center distance by 10%.
  center$radius <- st_distance(corner, center) * 1.1
  return(center)
}
# Run get_r() on every Naperville tract.
epsg_id <- 4326

# Transform the layer once; doing it inside the iteration repeats the same
# whole-layer transformation for every tract.
tract_geoms <- tract_naperville %>%
  st_transform(crs = epsg_id) %>%
  st_geometry()

# One single-row sf object per tract geometry. seq_along() yields an empty
# sequence when there are no tracts, whereas 1:nrow() would iterate c(1, 0).
r4all_loop <- lapply(seq_along(tract_geoms), function(i) {
  get_r(tract_geoms[[i]], epsg_id)
})
r4all_loop <- bind_rows(r4all_loop)

# Add the point coordinates as plain columns (computed once, used twice).
point_xy <- st_coordinates(r4all_loop)
ready_4_yelp <- r4all_loop %>%
  mutate(x = point_xy[, 1],
         y = point_xy[, 2])
# Query the Yelp business search API around one search point and collect all
# result pages.
#
# tract:    a single row (or list) providing `x` (passed as longitude), `y`
#           (passed as latitude) and `radius` (passed as the search radius;
#           Yelp expects meters).
# category: value passed to the `categories` argument of business_search().
#
# Returns the businesses from all pages bound into one data frame. When Yelp
# reports 1000 or more matches, a notice is printed and only the first page is
# returned. Stops with an error when the response has no `total` field.
get_yelp <- function(tract, category) {
  page_size <- 50

  # Drop any units attribute and cap at 40000 m, the largest radius the Yelp
  # business search endpoint accepts.
  radius_m <- min(round(as.numeric(tract$radius)), 40000)

  # Single place where the request is built; `page` is 1-based.
  fetch_page <- function(page) {
    business_search(api_key = Sys.getenv("yelp_api"),
                    categories = category,
                    latitude = tract$y,
                    longitude = tract$x,
                    offset = (page - 1) * page_size,
                    radius = radius_m,
                    limit = page_size)
  }

  Sys.sleep(1)
  resp <- fetch_page(1)

  # Without `total` (e.g. an error payload) the page arithmetic below would
  # fail with an unrelated message; fail with a clear one instead.
  if (is.null(resp$total)) {
    stop("Yelp response for category '", category,
         "' has no `total` field; check the API key and request.",
         call. = FALSE)
  }

  if (resp$total >= 1000) {
    print(glue::glue(
      "Search at ({tract$y}, {tract$x}) has >= 1000 businesses; ",
      "returning the first page only."
    ))
    # Same return type as the regular path: a data frame, not a list.
    return(bind_rows(list(resp$businesses)))
  }

  required_n <- ceiling(resp$total / page_size)
  out <- vector("list", max(required_n, 1))
  # `out[i] <- list(x)` keeps the slot even when x is NULL, unlike
  # `out[[i]] <- NULL`, which would delete it and shift later pages.
  out[1] <- list(resp$businesses)
  for (page in seq_len(required_n)[-1]) {
    out[page] <- list(fetch_page(page)$businesses)
  }

  bind_rows(out)
}
# 2) Get the first business data: Kids Activities
# One get_yelp() result per search point, stored in a preallocated list.
yelp_kids_list <- vector("list", nrow(ready_4_yelp))
# seq_len() yields an empty sequence when there are no rows, whereas
# 1:nrow() would iterate over c(1, 0).
for (row in seq_len(nrow(ready_4_yelp))) {
  yelp_kids_list[[row]] <- suppressMessages(
    get_yelp(ready_4_yelp[row, ], "kids_activities")
  )
  print(paste0("Current row: ", row))
}
## [1] "Current row: 1"
## [1] "Current row: 2"
## [1] "Current row: 3"
## [1] "Current row: 4"
## [1] "Current row: 5"
## [1] "Current row: 6"
## [1] "Current row: 7"
## [1] "Current row: 8"
## [1] "Current row: 9"
## [1] "Current row: 10"
## [1] "Current row: 11"
## [1] "Current row: 12"
## [1] "Current row: 13"
## [1] "Current row: 14"
## [1] "Current row: 15"
## [1] "Current row: 16"
## [1] "Current row: 17"
## [1] "Current row: 18"
## [1] "Current row: 19"
## [1] "Current row: 20"
## [1] "Current row: 21"
## [1] "Current row: 22"
## [1] "Current row: 23"
## [1] "Current row: 24"
## [1] "Current row: 25"
## [1] "Current row: 26"
## [1] "Current row: 27"
## [1] "Current row: 28"
## [1] "Current row: 29"
## [1] "Current row: 30"
## [1] "Current row: 31"
## [1] "Current row: 32"
## [1] "Current row: 33"
## [1] "Current row: 34"
## [1] "Current row: 35"
## [1] "Current row: 36"
## [1] "Current row: 37"
## [1] "Current row: 38"
## [1] "Current row: 39"
## [1] "Current row: 40"
## [1] "Current row: 41"
## [1] "Current row: 42"
## [1] "Current row: 43"
## [1] "Current row: 44"
## [1] "Current row: 45"
## [1] "Current row: 46"
## [1] "Current row: 47"
## [1] "Current row: 48"
## [1] "Current row: 49"
## [1] "Current row: 50"
## [1] "Current row: 51"
## [1] "Current row: 52"
# Combine the per-point kids-activities results into one tibble.
yelp_kids <- yelp_kids_list %>% bind_rows() %>% as_tibble()

# Get the second business data: ice cream.
# One get_yelp() result per search point, stored in a preallocated list.
yelp_ice_list <- vector("list", nrow(ready_4_yelp))
# seq_len() yields an empty sequence when there are no rows, whereas
# 1:nrow() would iterate over c(1, 0).
for (row in seq_len(nrow(ready_4_yelp))) {
  yelp_ice_list[[row]] <- suppressMessages(
    get_yelp(ready_4_yelp[row, ], "icecream")
  )
  print(paste0("Current row: ", row))
}
## [1] "Current row: 1"
## [1] "Current row: 2"
## [1] "Current row: 3"
## [1] "Current row: 4"
## [1] "Current row: 5"
## [1] "Current row: 6"
## [1] "Current row: 7"
## [1] "Current row: 8"
## [1] "Current row: 9"
## [1] "Current row: 10"
## [1] "Current row: 11"
## [1] "Current row: 12"
## [1] "Current row: 13"
## [1] "Current row: 14"
## [1] "Current row: 15"
## [1] "Current row: 16"
## [1] "Current row: 17"
## [1] "Current row: 18"
## [1] "Current row: 19"
## [1] "Current row: 20"
## [1] "Current row: 21"
## [1] "Current row: 22"
## [1] "Current row: 23"
## [1] "Current row: 24"
## [1] "Current row: 25"
## [1] "Current row: 26"
## [1] "Current row: 27"
## [1] "Current row: 28"
## [1] "Current row: 29"
## [1] "Current row: 30"
## [1] "Current row: 31"
## [1] "Current row: 32"
## [1] "Current row: 33"
## [1] "Current row: 34"
## [1] "Current row: 35"
## [1] "Current row: 36"
## [1] "Current row: 37"
## [1] "Current row: 38"
## [1] "Current row: 39"
## [1] "Current row: 40"
## [1] "Current row: 41"
## [1] "Current row: 42"
## [1] "Current row: 43"
## [1] "Current row: 44"
## [1] "Current row: 45"
## [1] "Current row: 46"
## [1] "Current row: 47"
## [1] "Current row: 48"
## [1] "Current row: 49"
## [1] "Current row: 50"
## [1] "Current row: 51"
## [1] "Current row: 52"
# Combine the per-point ice cream results into one tibble.
yelp_icecream <- yelp_ice_list %>% bind_rows() %>% as_tibble()

# Remove duplicate businesses. Overlapping search circles can return the same
# business more than once, and Yelp's per-search `distance` value then differs
# between the copies, so a whole-row distinct() would not collapse them.
# Deduplicate on the business `id` instead.
yelp_kids_uni <- distinct(yelp_kids, id, .keep_all = TRUE)
yelp_ice_uni <- distinct(yelp_icecream, id, .keep_all = TRUE)

# Flatten the nested `categories` column; new columns are prefixed
# "categories_".
# NOTE(review): this yields one row per business/category pair, so a business
# with several categories appears on several rows from here on.
yelp_kids_uni <- yelp_kids_uni %>%
  tidyr::unnest(categories, names_sep = "_")
yelp_ice_uni <- yelp_ice_uni %>%
  tidyr::unnest(categories, names_sep = "_")

# Drop rows with missing coordinates.
yelp_kids_uni <- yelp_kids_uni %>%
  drop_na("coordinates")
yelp_ice_uni <- yelp_ice_uni %>%
  drop_na("coordinates")
# Turn the cleaned Yelp tables into sf point layers (EPSG:4326). Longitude and
# latitude are pulled out of the nested `coordinates` column, rows lacking
# either value are discarded, and the two columns become the point geometry.

# Kids activities
yelp_kids_sf <- yelp_kids_uni %>%
  mutate(x = coordinates$longitude, y = coordinates$latitude) %>%
  filter(!(is.na(x) | is.na(y))) %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# Ice cream
yelp_ice_sf <- yelp_ice_uni %>%
  mutate(x = coordinates$longitude, y = coordinates$latitude) %>%
  filter(!(is.na(x) | is.na(y))) %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)
# Buffer every search point by its own radius.
buffer_area <- ready_4_yelp %>%
  st_buffer(dist = ready_4_yelp$radius)

# Keep only the businesses that fall within at least one buffer. The sparse
# st_within() result lists, per point, the buffers containing it; a point is
# kept when that list is non-empty.
# 1) Kids Activities
in_yelp_kids_sf <- yelp_kids_sf %>%
  filter(lengths(st_within(., buffer_area)) > 0)
# 2) Ice cream
in_yelp_ice_sf <- yelp_ice_sf %>%
  filter(lengths(st_within(., buffer_area)) > 0)
# Count ice cream businesses per rating band and plot the distribution.
icecream_r <- in_yelp_ice_sf %>%
  # Count each business once: the table can hold several rows per business
  # (one per category after unnesting, plus repeats from overlapping searches).
  filter(!duplicated(id)) %>%
  # Band the ratings BEFORE aggregating so every band is a single row and
  # `n` / `pct` describe the band rather than one individual rating value.
  mutate(rating = case_when(
    rating >= 0 & rating < 1 ~ "0-1",
    rating >= 1 & rating < 2 ~ "1-2",
    rating >= 2 & rating < 3 ~ "2-3",
    rating >= 3 & rating < 4 ~ "3-4",
    rating >= 4 & rating <= 5 ~ "4-5"
  )) %>%
  group_by(rating) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(pct = n / sum(n) * 100)

# x shows the rating band and y the number of businesses in it; the labels
# now say so (they previously read "Review Count" and "Rating").
ggplot(icecream_r, aes(x = rating, y = n, fill = rating)) +
  geom_bar(stat = "identity") +
  labs(title = "Ice Cream Businesses by Rating",
       x = "Rating",
       y = "Number of businesses") +
  theme_minimal()
# Count kids-activities businesses per rating band and plot the distribution.
kids_r <- in_yelp_kids_sf %>%
  # Count each business once: the table can hold several rows per business
  # (one per category after unnesting, plus repeats from overlapping searches).
  filter(!duplicated(id)) %>%
  # Band the ratings BEFORE aggregating so every band is a single row and
  # `n` / `pct` describe the band rather than one individual rating value.
  mutate(rating = case_when(
    rating >= 0 & rating < 1 ~ "0-1",
    rating >= 1 & rating < 2 ~ "1-2",
    rating >= 2 & rating < 3 ~ "2-3",
    rating >= 3 & rating < 4 ~ "3-4",
    rating >= 4 & rating <= 5 ~ "4-5"
  )) %>%
  group_by(rating) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(pct = n / sum(n) * 100)

# x shows the rating band and y the number of businesses in it; the labels
# now say so (they previously read "Review Count" and "Rating").
ggplot(kids_r, aes(x = rating, y = n, fill = rating)) +
  geom_bar(stat = "identity") +
  labs(title = "Kids Activities Businesses by Rating",
       x = "Rating",
       y = "Number of businesses") +
  theme_minimal()
I analyzed both data tables and found that neither had duplicate rows or rows with null values. However, some data points fell outside the buffer area. Additionally, after flattening the 'categories' column, I found that the ice cream table had 21 category titles, while the kids activities table had 26. I initially wanted to explore the relationship between review count and rating, so I visualized the data with a scatter plot, but no noticeable correlation emerged. Consequently, I grouped the ratings into five ranges and visualized the number of businesses in each range. The distribution of ratings differs between the two tables: for ice cream, the most frequent rating range is 3 to 4, followed by 4 to 5. In contrast, for kids activities, the largest number of businesses falls within the 4 to 5 range, with the next most common range being 0 to 1.