library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidycensus)
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 7.2.1; sf_use_s2() is TRUE
library(tmap)
library(jsonlite)
## 
## 载入程辑包:'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(tidyverse)
library(httr)
library(jsonlite)
library(reshape2)
## 
## 载入程辑包:'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(here)
## here() starts at C:/Users/11969/Desktop/Intro to Urban_Analytics/Assignment2
library(yelpr)
library(knitr)
# Load the two saved inputs from disk (absolute paths on the author's machine).
# ass2: the object saved in mini assignment 1; treated below as the Yelp data.
# mq:   referred to below as the census layer the Yelp points are matched to.
ass2 <- readRDS(
  file = "C:/Users/11969/Desktop/Intro to Urban_Analytics/Assignment1/mini_assignment1.rds"
)
mq <- readRDS(
  file = "C:/Users/11969/Desktop/Intro to Urban_Analytics/Assignment2.rds"
)
# Tidying Yelp data
# Keep one row per business: rows sharing an `id` are reduced to a single row,
# and .keep_all = TRUE retains every other column.
yelp_unique <- ass2 %>%
  distinct(id, .keep_all = TRUE)

# Report how many rows the de-duplication removed. This step drops duplicated
# ids, not NA values, so the message says so.
glue::glue(
  "Before removing duplicates, there were {nrow(ass2)} rows. ",
  "After removing them, there are {nrow(yelp_unique)} rows"
) %>%
  print()
## Before removing duplicates, there were 76 rows. After removing them, there are 58 rows
# Drop the sf geometry so the object is a plain data frame, then flatten nested
# data-frame columns into top-level "parent.child" columns.
yelp_flat <- yelp_unique %>%
  st_set_geometry(NULL) %>%
  jsonlite::flatten() %>%
  as_tibble()

# After flattening there is no `coordinates` column any more (asking for it
# only produced an "Unknown or uninitialised column" warning), so preview the
# flattened coordinates.* columns instead.
yelp_flat %>%
  select(starts_with("coordinates.")) %>%
  head()
# Two columns are lists holding several strings per row; join each element
# into a single comma-separated string.
yelp_concat <- yelp_flat %>%
  mutate(
    across(
      c(transactions, location.display_address),
      function(col) map_chr(col, str_c, collapse = ", ")
    )
  )
# Collapse the "title" entries of one element of the Yelp "categories" column
# into a single comma-separated string.
#
# x: one element of the categories column (per the original note, a data frame
#    that has a "title" column).
# Returns the titles joined with ", ".
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}
# Replace each element of the categories list column with its joined titles.
yelp_flat2 <- mutate(
  yelp_concat,
  categories = map_chr(categories, concate_list)
)

# Print with a very large width so that more columns are shown.
print(yelp_flat2, width = 1000)
# Count the NA values in every column.
map_dbl(yelp_flat2, ~ sum(is.na(.x)))

# Check that latitude and longitude are missing in exactly the same rows.
identical(
  is.na(yelp_flat2$coordinates.latitude),
  is.na(yelp_flat2$coordinates.longitude)
)
# Remove rows whose longitude is missing.
yelp_dropna1 <- drop_na(yelp_flat2, coordinates.longitude)

# Then remove rows whose price is missing.
yelp_dropna2 <- drop_na(yelp_dropna1, price)
# Convert yelp_dropna2 into an sf point layer. The coordinate columns are
# declared as EPSG:4326 longitude/latitude.
yelp_sf <- yelp_dropna2 %>%
  st_as_sf(
    coords = c("coordinates.longitude", "coordinates.latitude"),
    crs = 4326
  ) %>%
  # Put the points in EPSG:4269 so they share a CRS with the census layer
  # (assumes mq is in EPSG:4269 -- TODO confirm). Assigning
  # `st_crs(yelp_sf) <- 4269` only relabels the CRS without reprojecting the
  # data (sf warns about exactly this); st_transform() reprojects it.
  st_transform(crs = 4269)

# sf subset: keep only the points that intersect the union of the census
# geometries.
yelp_in <- yelp_sf[st_union(mq), , op = st_intersects]

Join two tables

# Make sure the census layer is an sf object (the original note says it is
# currently sfc).
# NOTE(review): census columns such as GEOID and hhincome are used further
# down, so mq may already be an sf object -- confirm.
census_sf <- mq %>% st_sf()

# Spatial joins in both directions, matching on intersection:
# census_yelp starts from the census features and attaches Yelp attributes;
# yelp_census starts from the Yelp points and attaches census attributes.
census_yelp <- census_sf %>% st_join(yelp_in, join = st_intersects)
yelp_census <- yelp_in %>% st_join(census_sf, join = st_intersects)

Visualize

# Price
# Switch tmap to interactive ("view") mode.
tmap_mode(mode = "view")
## tmap mode set to interactive viewing
# Draw the retained Yelp points as dots coloured by the price column.
tm_shape(shp = yelp_in) +
  tm_dots(col = "price")

Separate categories

# Build categories2 row by row by flattening `categories` and joining the
# pieces with commas, then preview the first rows of that column
# (the sf geometry column stays attached).
yelp_in %>%
  rowwise() %>%
  mutate(categories2 = paste0(unlist(categories), collapse = ",")) %>%
  select(categories2) %>%
  head()
## Simple feature collection with 6 features and 1 field
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -73.88447 ymin: 40.74769 xmax: -73.88159 ymax: 40.75009
## Geodetic CRS:  NAD83
## # A tibble: 6 x 2
## # Rowwise: 
##   categories2                                                 geometry
##   <chr>                                           <POINT [arc_degree]>
## 1 Tattoo, Piercing                                (-73.88159 40.74769)
## 2 Tattoo, Piercing                                (-73.88163 40.74773)
## 3 Tattoo, Piercing                                (-73.88447 40.74778)
## 4 Grocery, Juice Bars & Smoothies, Health Markets (-73.88233 40.75009)
## 5 Tattoo                                          (-73.88361 40.74791)
## 6 Health Markets, Vitamins & Supplements          (-73.88396 40.74881)

Visualize

# Average the Yelp rating within each GEOID and draw the result as polygons,
# with the colour classes split at quantiles.
tm_shape(
  census_yelp %>%
    group_by(GEOID) %>%
    summarise(rating = mean(rating))
) +
  tm_polygons(col = "rating", style = "quantile")

Visualize

# Draw the Yelp points as dots coloured by the hhincome column of the
# point-to-census join.
tm_shape(shp = yelp_census) +
  tm_dots(col = "hhincome")

My story: #Have the findings changed before and after tidying the data? -Yes, hugely. The initial data download had up to 600 rows! After tidying, only 33 remained, and the difference was staggering! You can imagine how much duplicate and empty data there was!

#What’s the most frequent rating score? Does that seem to be related to review_count? -Ratings of 4.0~4.4 seem to be the most frequent. I don’t think there’s a relationship with review_count. There are stores with lots of reviews and low scores, stores with lots of reviews and very high scores, and stores with almost no reviews and close to perfect scores.

#(if your Yelp data has price variable) is there any distributional pattern to expensive vs. cheap POIs? Do you think rating and price have some associations? -I didn’t find any distributional pattern, and I don’t think rating and price are associated.