library(tidycensus)
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(tmap)
library(jsonlite)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
library(httr)
library(jsonlite)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(here)
## here() starts at C:/Users/fhasan30/OneDrive - Georgia Institute of Technology/Documents/CP 8883
library(yelpr)
library(knitr)
# Delete duplicates: the raw Yelp download can list the same business several
# times, so keep only the first row for each business id.
my_yelp <- readRDS(file = "yelp_all.rds")
yelp_unique <- my_yelp %>%
  distinct(id, .keep_all = TRUE)
# Report how many rows the de-duplication removed. (The message used to say
# "dropping NA", but no NA handling happens here; only duplicates are dropped.)
glue::glue("Before dropping duplicates, there were {nrow(my_yelp)} rows. After dropping them, there are {nrow(yelp_unique)} rows") %>%
  print()
## Before dropping duplicates, there were 17268 rows. After dropping them, there are 3660 rows
concate_list <- function(x) {
  # x: the categories entry of one Yelp business, a data frame with the
  #    columns "alias" and "title".
  # Returns the titles joined into one comma-separated string.
  str_c(x[["title"]], collapse = ", ")
}
# Build a flat table from the de-duplicated Yelp results:
#   1. jsonlite::flatten() spreads nested data-frame columns into ordinary
#      columns.
#   2. each remaining list-column is collapsed to one comma-separated string
#      per business.
yelp_flat <- jsonlite::flatten(yelp_unique) %>%
  mutate(
    transactions = map_chr(
      transactions,
      function(parts) str_c(parts, collapse = ", ")
    ),
    location.display_address = map_chr(
      location.display_address,
      function(parts) str_c(parts, collapse = ", ")
    ),
    # concate_list is the custom function that joins the category titles
    categories = map_chr(categories, concate_list)
  )
# Are there any NA values? Count the missing entries in every column.
map_dbl(yelp_flat, function(column) sum(is.na(column)))
## id alias name
## 0 0 0
## image_url is_closed url
## 0 0 0
## review_count categories rating
## 0 0 0
## transactions price phone
## 0 1267 0
## display_phone distance coordinates.latitude
## 0 0 0
## coordinates.longitude location.address1 location.address2
## 0 45 615
## location.address3 location.city location.zip_code
## 1309 0 0
## location.country location.state location.display_address
## 0 0 0
# Read the Census API key from the `census_api` environment variable, then
# hand it to tidycensus.
b <- Sys.getenv("census_api")
a <- census_api_key(b)
## To install your API key for use in future sessions, run this function with `install = TRUE`.
# Collin County, TX census tract boundaries, with 2019 ACS 5-year estimates
# for household income and race counts. Messages from the call are muted.
tract <- suppressMessages(
  get_acs(
    geography = "tract", # or "block group", "county", "state" etc.
    variables = c(
      hhincome = "B19019_001",
      race.tot = "B02001_001",
      race.white = "B02001_002",
      race.black = "B02001_003"
    ),
    year = 2019,
    survey = "acs5", # American Community Survey 5-year estimate
    state = "TX",
    county = "collin county",
    output = "wide", # wide vs. long
    geometry = TRUE # returns sf objects
  )
)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 54%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
# Separate the NAME column to get the county.
# NAME holds more comma-separated pieces than the two kept here (presumably the
# state comes third; confirm against the data). With the default
# extra = "warn", every row raised "Expected 2 pieces. Additional pieces
# discarded". extra = "drop" states that intent explicitly and gives the same
# result without the warning.
tract <- separate(
  tract,
  NAME,
  sep = ", ",
  into = c("tract", "county"),
  extra = "drop"
)
# Convert the flat Yelp table into point features (longitude/latitude, CRS
# 4326).
yelp_sf <- st_as_sf(
  yelp_flat,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = 4326
)
# Keep only the businesses that intersect the Collin County tracts: the tracts
# are put in the CRS of the points, filtered to the county, and merged into a
# single geometry that is used as the spatial subset.
yelp_in <- yelp_sf[
  tract %>%
    st_transform(crs = st_crs(yelp_sf)) %>%
    filter(county %in% c("Collin County")) %>%
    st_union(),
  ,
  op = st_intersects
]
# Print the row and column counts of the raw download (my_yelp) next to those
# of the tidied, county-clipped result (yelp_in).
print(
  glue::glue("nrow before: {nrow(my_yelp)} -> nrow after: {nrow(yelp_in)} \n
ncol before: {ncol(my_yelp)} -> ncol after: {ncol(yelp_in)} \n")
)
## nrow before: 17268 -> nrow after: 3096
##
## ncol before: 16 -> ncol after: 23
# Interactive map of the businesses kept in yelp_in, one dot per business,
# coloured by its Yelp rating.
tmap_mode(mode = "view")
## tmap mode set to interactive viewing
tm_shape(shp = yelp_in) +
  tm_dots(col = "rating")
# Review counts of the businesses at each rating level, and their totals.
# The ratings compared here are multiples of 0.5, which doubles represent
# exactly, so `==` is safe.
reviews_at <- function(stars) {
  # Returns the review_count values of the businesses rated exactly `stars`.
  yelp_in$review_count[yelp_in$rating == stars]
}
x <- reviews_at(5.0)
sum(x) #5087
## [1] 5087
x1 <- reviews_at(4.5)
sum(x1) #69715
## [1] 69715
x2 <- reviews_at(4.0)
sum(x2) #136743
## [1] 136743
x3 <- reviews_at(3.5)
sum(x3) #71834
## [1] 71834
x4 <- reviews_at(3.0)
sum(x4) #31097
## [1] 31097
x5 <- reviews_at(2.5)
sum(x5) #18327
## [1] 18327
x6 <- reviews_at(2.0)
sum(x6) #9453
## [1] 9453
x7 <- reviews_at(1.5)
sum(x7) #4189
## [1] 4189
# This used to read `x8 = x7 = ...`, a chained assignment that silently
# replaced x7 with the 1.0-star data. x8 is now assigned on its own, so x7
# keeps the 1.5-star review counts.
x8 <- reviews_at(1.0)
sum(x8) #279
## [1] 279
# Rating levels (c1) and the total number of reviews at each level (c2).
# c2 used to be typed in by hand from earlier console output; it is now
# computed from yelp_in so it cannot drift from the data.
c1 <- c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)
c2 <- vapply(
  c1,
  function(stars) sum(yelp_in$review_count[yelp_in$rating == stars]),
  numeric(1)
)
df <- data.frame(c1, c2)
# Line-and-point chart of total reviews against rating level.
ggplot(data = df, aes(x = c1, y = c2)) +
  geom_line(color = "red") +
  xlab("Ratings") +
  ylab("Reviews") +
  geom_point()
# NOTE(review): this correlates the nine rating levels with their review
# totals, not the per-business rating with the per-business review count.
# Confirm that this is the intended statistic.
cor(c1, c2)
## [1] 0.5218572
# Count the businesses whose category string contains "Park".
# NOTE(review): the pattern is a plain substring match, so it would also count
# any category title that merely contains "Park". Confirm this is intended.
z <- str_detect(string = yelp_in$categories, pattern = "Park")
sum(z)
## [1] 107
Tidying the data reduced the number of observations from 17,268 to 3,660 because there were a lot of duplicates. The number of parks has decreased from 546 to 107, and the number of restaurants has decreased from 17,268 to 3,660. I think these values are a lot more realistic. I grew up in Collin County and was very surprised during the last assignment to see the number of parks and restaurants shown in the data frame. If someone who wasn’t from the county saw the original data, they would come away with an incorrect understanding of the area. This shows how important it is to clean data. The graph above, as well as the output from the code, shows that the number of reviews does have something to do with the rating. The two variables have an r value of .52, indicating a moderate positive correlation. As the reviews increase, so do the ratings. It is possible that this has to do with social media culture. If someone gives a business a high rating, everyone will go there and the reviews will increase, but one or two bad reviews could ruin an entire establishment. It’s also interesting to see that people tend not to give establishments 5 stars, as the graph peaks at 4.