Delete Duplicated Rows

library(tidycensus)
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(tmap)
library(jsonlite)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag()     masks stats::lag()
library(httr)
library(jsonlite)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(here)
## here() starts at C:/Users/fhasan30/OneDrive - Georgia Institute of Technology/Documents/CP 8883
library(yelpr)
library(knitr)


# Delete duplicates: keep only the first row for each Yelp business id.
my_yelp <- readRDS(file = "yelp_all.rds")

yelp_unique <- my_yelp %>%
  distinct(id, .keep_all = TRUE)

# Report the effect of de-duplication. This step removes repeated ids, not NA
# values, so the message says "duplicates".
glue::glue(
  "Before dropping duplicates, there were {nrow(my_yelp)} rows. ",
  "After dropping them, there are {nrow(yelp_unique)} rows"
) %>%
  print()
## Before dropping duplicates, there were 17268 rows. After dropping them, there are 3660 rows

Flatten

# Join the category titles of one business into a single string.
#
# x: a data frame with columns "alias" and "title" (one entry of the Yelp
#    `categories` column).
# Returns: the values of x[["title"]] joined with ", ".
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}

# Turn the nested Yelp table into a flat one with a plain value in every cell.
yelp_flat <- yelp_unique %>%
  # Step 1: expand the data-frame columns into ordinary columns
  jsonlite::flatten() %>%
  # Step 2: collapse every list-column entry into one comma-separated string
  mutate(
    transactions = map_chr(transactions, ~ str_c(.x, collapse = ", ")),
    location.display_address = map_chr(
      location.display_address,
      ~ str_c(.x, collapse = ", ")
    ),
    # categories entries are data frames, handled by the custom concate_list
    categories = map_chr(categories, concate_list)
  )

Missing Coordinate Data

# Count the missing (NA) values in every column of the flattened table.
map_dbl(yelp_flat, ~ sum(is.na(.x)))
##                       id                    alias                     name 
##                        0                        0                        0 
##                image_url                is_closed                      url 
##                        0                        0                        0 
##             review_count               categories                   rating 
##                        0                        0                        0 
##             transactions                    price                    phone 
##                        0                     1267                        0 
##            display_phone                 distance     coordinates.latitude 
##                        0                        0                        0 
##    coordinates.longitude        location.address1        location.address2 
##                        0                       45                      615 
##        location.address3            location.city        location.zip_code 
##                     1309                        0                        0 
##         location.country           location.state location.display_address 
##                        0                        0                        0

Rows Falling Outside Boundary

# Hand the Census API key, read from the `census_api` environment variable,
# to tidycensus for this session.
a <- census_api_key(key = Sys.getenv("census_api"))
## To install your API key for use in future sessions, run this function with `install = TRUE`.
# Keep the raw value of the environment variable as well.
# NOTE(review): `b` is not used in the visible part of this file.
b <- Sys.getenv(x = "census_api")

# Census tracts for Collin County, TX, with a few ACS estimates attached
tract <- suppressMessages(
  get_acs(
    geography = "tract", # alternatives: "block group", "county", "state", ...
    state = "TX",
    county = "collin county",
    variables = c(
      hhincome = "B19019_001",
      race.tot = "B02001_001",
      race.white = "B02001_002",
      race.black = "B02001_003"
    ),
    year = 2019,
    survey = "acs5", # American Community Survey 5-year estimate
    geometry = TRUE, # return an sf object
    output = "wide" # wide rather than long layout
  )
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |=========                                                             |  14%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  54%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |====================================================================  |  98%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
# Split the NAME column to get the county. NAME holds more than two
# ", "-separated pieces (presumably tract, county, state), but only the first
# two are needed. extra = "drop" discards the remainder explicitly, so
# separate() no longer warns about "additional pieces".
tract <- separate(
  tract,
  NAME,
  into = c("tract", "county"),
  sep = ", ",
  extra = "drop"
)
# Promote the longitude/latitude columns to point geometries (CRS EPSG:4326).
yelp_sf <- st_as_sf(
  yelp_flat,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = 4326
)

# Keep only the businesses that intersect the union of the Collin County
# tracts, after transforming the tracts to the CRS of the points.
yelp_in <- yelp_sf[
  tract %>%
    st_transform(crs = st_crs(yelp_sf)) %>%
    filter(county %in% c("Collin County")) %>%
    st_union(),
  ,
  op = st_intersects
]

Comparing Messy Vs. Tidy

# Compare the dimensions of the raw table with the cleaned, in-boundary table.
print(
  glue::glue("nrow before: {nrow(my_yelp)} -> nrow after: {nrow(yelp_in)} \n
            ncol before: {ncol(my_yelp)} -> ncol after: {ncol(yelp_in)} \n")
)
## nrow before: 17268 -> nrow after: 3096 
## 
## ncol before: 16 -> ncol after: 23
# Interactive map of the in-boundary businesses, coloured by rating.
tmap_mode(mode = "view")
## tmap mode set to interactive viewing
tm_shape(shp = yelp_in) +
  tm_dots(col = "rating")

Does the number of reviews have anything to do with the rating?

# Total number of reviews at each star rating, from 5.0 down to 1.0. Each x*
# vector holds the review counts of the businesses with that rating.
x <- yelp_in$review_count[yelp_in$rating == 5.0]
sum(x) #5087
## [1] 5087
x1 <- yelp_in$review_count[yelp_in$rating == 4.5]
sum(x1) #69715
## [1] 69715
x2 <- yelp_in$review_count[yelp_in$rating == 4.0]
sum(x2) #136743
## [1] 136743
x3 <- yelp_in$review_count[yelp_in$rating == 3.5]
sum(x3) #71834
## [1] 71834
x4 <- yelp_in$review_count[yelp_in$rating == 3.0]
sum(x4) #31097
## [1] 31097
x5 <- yelp_in$review_count[yelp_in$rating == 2.5]
sum(x5) #18327
## [1] 18327
x6 <- yelp_in$review_count[yelp_in$rating == 2.0]
sum(x6) #9453
## [1] 9453
x7 <- yelp_in$review_count[yelp_in$rating == 1.5]
sum(x7) #4189
## [1] 4189
# Assign x8 on its own: a chained `x8 = x7 = ...` would overwrite x7 with the
# 1.0-star counts.
x8 <- yelp_in$review_count[yelp_in$rating == 1.0]
sum(x8) #279
## [1] 279
# Rating levels, lowest first.
c1 <- c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)
# Matching review totals, computed from the vectors above (same order as c1)
# instead of being retyped from the printed output.
c2 <- c(
  sum(x8), sum(x7), sum(x6), sum(x5), sum(x4),
  sum(x3), sum(x2), sum(x1), sum(x)
)
df <- data.frame(c1, c2)

# Line-and-point chart of total reviews (y) against rating level (x).
ggplot(df, aes(x = c1, y = c2)) +
  geom_line(color = "red") +
  geom_point() +
  labs(x = "Ratings", y = "Reviews")

# Correlation between the rating levels (c1) and the review totals (c2)
cor(x = c1, y = c2)
## [1] 0.5218572
# Number of businesses whose category string contains "Park".
# NOTE(review): this is a substring match, so a category such as "Parking"
# would also be counted if present; confirm that is intended.
z <- str_detect(string = yelp_in$categories, pattern = "Park")
sum(z)
## [1] 107

Data Explanation

Tidying the data changed the data frame by reducing the number of observations from 17,268 to 3,660, because there were many duplicates; removing the businesses that fall outside the Collin County boundary reduced it further to 3,096. The number of parks decreased from 546 to 107, and the number of restaurants decreased from 17,268 to 3,660. I think these values are much more realistic. I grew up in Collin County and was very surprised during the last assignment to see the number of parks and restaurants shown in the data frame. Someone who was not from the county and saw the original data would come away with an incorrect understanding of the area. This shows how important it is to clean data. The graph above, as well as the output from the code, shows that the number of reviews does have something to do with the rating. The two variables have an r value of 0.52 (computed across the nine rating levels, using the total review count at each level), indicating a moderate positive correlation: as the reviews increase, so do the ratings. It is possible that this has to do with social media culture. If someone gives a business a high rating, everyone will go there and the reviews will increase, but one or two bad reviews could ruin an entire establishment. It is also interesting to see that people tend not to give establishments 5 stars, as the graph peaks at 4.