Mini_Project2

Delete Duplicated Rows

library(tidycensus)
library(sf)

## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE

library(tmap)
library(jsonlite)
library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag()     masks stats::lag()

library(httr)
library(jsonlite)
library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(here)

## here() starts at C:/Users/fhasan30/OneDrive - Georgia Institute of Technology/Documents/CP 8883

library(yelpr)
library(knitr)


# Load the raw Yelp pull and drop duplicated businesses.
# A business can be returned by more than one search, so rows are
# de-duplicated on the Yelp `id`; `.keep_all = TRUE` keeps every other column
# of the first row seen for each id.
my_yelp <- readRDS(file = "yelp_all.rds")

yelp_unique <- my_yelp %>%
  distinct(id, .keep_all = TRUE)

# Report the effect of the de-duplication (this step removes duplicates,
# not NA values, so the message says so).
glue::glue("Before dropping duplicates, there were {nrow(my_yelp)} rows. After dropping them, there are {nrow(yelp_unique)} rows") %>%
  print()

## Before dropping duplicates, there were 17268 rows. After dropping them, there are 3660 rows

Flatten

# Collapse the category titles of one business into a single string.
#
# x: one element of the Yelp `categories` list-column, i.e. a data frame
#    with the columns "alias" and "title".
# Returns the values of x$title joined with ", ".
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}

# Build a rectangular table from the de-duplicated Yelp data:
#   * jsonlite::flatten() un-nests the data-frame columns;
#   * every remaining list-column is reduced to one comma-separated string
#     per business (categories go through concate_list(), defined above).
yelp_flat <- jsonlite::flatten(yelp_unique) %>%
  mutate(
    transactions = map_chr(
      transactions,
      function(v) str_c(v, collapse = ", ")
    ),
    location.display_address = map_chr(
      location.display_address,
      function(v) str_c(v, collapse = ", ")
    ),
    categories = map_chr(categories, concate_list)
  )

Missing Coordinate Data

# Count the missing (NA) values in every column of the flattened table
map_dbl(yelp_flat, function(column) sum(is.na(column)))

##                       id                    alias                     name 
##                        0                        0                        0 
##                image_url                is_closed                      url 
##                        0                        0                        0 
##             review_count               categories                   rating 
##                        0                        0                        0 
##             transactions                    price                    phone 
##                        0                     1267                        0 
##            display_phone                 distance     coordinates.latitude 
##                        0                        0                        0 
##    coordinates.longitude        location.address1        location.address2 
##                        0                       45                      615 
##        location.address3            location.city        location.zip_code 
##                     1309                        0                        0 
##         location.country           location.state location.display_address 
##                        0                        0                        0

None of the coordinate variables are missing.

Rows Falling Outside Boundary

# Read the Census API key from the `census_api` environment variable once,
# keep it in `b`, and register it with tidycensus for this session
# (the value returned by census_api_key() is kept in `a`).
b <- Sys.getenv("census_api")
a <- census_api_key(key = b)

## To install your API key for use in future sessions, run this function with `install = TRUE`.

# Collin County, TX census tracts (2019 ACS 5-year estimates) with geometry.
# Variables requested: median household income plus total / white / black
# population counts; `output = "wide"` gives one row per tract.
#
# suppressMessages() only hides R messages; the geometry download still
# prints a progress bar, which floods a knitted report. `progress_bar = FALSE`
# is forwarded through `...` to the tigris download and turns it off.
tract <- suppressMessages(
  get_acs(
    geography = "tract", # or "block group", "county", "state" etc.
    state = "TX",
    county = c("collin county"),
    variables = c(
      hhincome = "B19019_001",
      race.tot = "B02001_001",
      race.white = "B02001_002",
      race.black = "B02001_003"
    ),
    year = 2019,
    survey = "acs5", # American Community Survey 5-year estimate
    geometry = TRUE, # returns sf objects
    output = "wide", # wide vs. long
    progress_bar = FALSE # no download progress bar in the rendered output
  )
)

## (console progress bar for the tract geometry download, 0% to 100%, omitted)

# Split NAME into its comma-separated parts so the county can be filtered on.
# The original call named only two pieces ("tract", "county") and tidyr warned
# that the extra piece was discarded in all 152 rows; naming the third piece
# keeps it and removes the warning.
# NOTE(review): assumes NAME has exactly three parts
# ("Census Tract ..., ... County, <state>") -- confirm against the data.
tract <- separate(
  tract,
  NAME,
  sep = ", ",
  into = c("tract", "county", "state")
)

# Turn the flattened Yelp table into point features built from the
# longitude/latitude columns (CRS 4326).
yelp_sf <- st_as_sf(
  yelp_flat,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = 4326
)

# Keep only the businesses that intersect the county outline: the Collin
# County tracts are put in the same CRS as the points and merged into a
# single geometry, which is then used as the spatial filter.
yelp_in <- yelp_sf[
  tract %>%
    filter(county %in% c("Collin County")) %>%
    st_transform(crs = st_crs(yelp_sf)) %>%
    st_union(),
  ,
  op = st_intersects
]

Comparing Messy Vs. Tidy

# Print the row/column counts of the raw data next to those of the cleaned,
# county-clipped data.
print(
  glue::glue("nrow before: {nrow(my_yelp)} -> nrow after: {nrow(yelp_in)} \n
            ncol before: {ncol(my_yelp)} -> ncol after: {ncol(yelp_in)} \n")
)

## nrow before: 17268 -> nrow after: 3096 
## 
## ncol before: 16 -> ncol after: 23

# Put tmap in its interactive viewing mode
tmap_mode(mode = "view")

## tmap mode set to interactive viewing

# Draw every retained business as a dot coloured by its rating
tm_shape(shp = yelp_in) +
  tm_dots(col = "rating")

Does the number of reviews have anything to do with the rating?

# Total number of reviews received by the businesses at each rating level.
#
# reviews_for_rating() returns the review counts of all businesses whose
# rating equals `stars`. Using one helper instead of nine copy-pasted
# subsetting expressions avoids slips such as the original
# `x8 = x7 = ...`, which silently overwrote x7 with the 1.0-star data.
# The ratings compared here are multiples of 0.5, which doubles represent
# exactly, so `==` is a safe comparison.
reviews_for_rating <- function(stars, data = yelp_in) {
  data$review_count[data$rating == stars]
}

x <- reviews_for_rating(5.0)
sum(x) #5087

## [1] 5087

x1 <- reviews_for_rating(4.5)
sum(x1) #69715

## [1] 69715

x2 <- reviews_for_rating(4.0)
sum(x2) #136743

## [1] 136743

x3 <- reviews_for_rating(3.5)
sum(x3) #71834

## [1] 71834

x4 <- reviews_for_rating(3.0)
sum(x4) #31097

## [1] 31097

x5 <- reviews_for_rating(2.5)
sum(x5) #18327

## [1] 18327

x6 <- reviews_for_rating(2.0)
sum(x6) #9453

## [1] 9453

x7 <- reviews_for_rating(1.5)
sum(x7) #4189

## [1] 4189

x8 <- reviews_for_rating(1.0)
sum(x8) #279

## [1] 279

# Rating levels (c1) and the total number of reviews at each level (c2).
# The totals are computed from yelp_in instead of being typed in by hand, so
# they cannot drift from the data. tapply() returns one sum per distinct
# rating, ordered by increasing rating, with the rating as the element name.
# For the data shown in this report this yields
#   c1 = 1, 1.5, ..., 5 and c2 = 279, 4189, 9453, 18327, 31097, 71834,
#   136743, 69715, 5087.
review_totals <- tapply(yelp_in$review_count, yelp_in$rating, sum)
c1 <- as.numeric(names(review_totals))
c2 <- as.numeric(review_totals)
df <- data.frame(c1, c2)

# Total reviews per rating level: red line with a point at every level
ggplot(df, aes(c1, c2)) +
  geom_line(colour = "red") +
  geom_point() +
  labs(x = "Ratings", y = "Reviews")

# Correlation between the rating levels and their total review counts
cor(x = c1, y = c2)

## [1] 0.5218572

# Number of retained businesses whose category string contains "Park".
# NOTE(review): this is a substring match, so a category such as "Parking"
# would be counted as well -- confirm this is intended.
z <- yelp_in$categories %>%
  str_detect(pattern = "Park")
sum(z)

## [1] 107

Data Explanation

Tidying the data changed the data frame by reducing the number of observations from 17,268 to 3,660, because there were a lot of duplicates, and then to 3,096 once businesses outside the county boundary were removed. The number of parks has decreased from 546 to 107 and the number of restaurants has decreased from 17,268 to 3,660. I think these values are a lot more realistic. I grew up in Collin County and was very surprised during the last assignment to see the number of parks and restaurants that were shown in the data frame. If someone wasn’t from the same county and saw the original data, they would have an incorrect understanding of the area. This shows how important it is to clean data. The graph above, as well as the output from the code, shows that the number of reviews does have something to do with the rating. The two variables have an r value of .52, indicating that there is a moderate positive correlation: as the reviews increase, so do the ratings. It is possible that this has to do with social media culture. If someone gives a business a high rating, everyone will go there and the reviews will increase, but one or two bad reviews could ruin an entire establishment. It’s also interesting to see that people tend not to give establishments 5 stars, as the graph peaks at 4.