In this Week 7 homework we download the NYC restaurant inspection data from the site “https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59” and explore it as directed in the questions that follow.
library(RSocrata) #To download the data
library(tidyverse) # TO use the purr package and tibble commands
library(lubridate) # To use the year function
library(stringr) #To subset strings
# Socrata endpoint for the DOHMH restaurant inspection results.
url <- "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"

# Download the dataset through RSocrata.
Resdata <- read.socrata(url)

# Optional local cache of the download (left disabled):
# write_rds(Resdata, path = 'Desktop/Data') # To store the data locally
# Rest <- read_rds('.../Desktop/Data')

# Work with a tibble from here on, and preview the first rows.
Rest <- as_tibble(Resdata)
Rest %>% head()
## # A tibble: 6 × 18
## CAMIS DBA BORO BUILDING STREET ZIPCODE
## <int> <chr> <chr> <chr> <chr> <int>
## 1 41606387 LA CUARTA RESTAURANT BROOKLYN 782 4 AVENUE 11232
## 2 50007091 NEW PEKING RESTAURANT BROOKLYN 1581 FLATBUSH AVE 11210
## 3 50012185 CASTILLO RESTAURNAT BROOKLYN 709 5TH AVE 11215
## 4 40750062 PANEANTICO BAKERY BROOKLYN 9124 3 AVENUE 11209
## 5 50034621 SHI LI XIANG QUEENS 13358 41ST AVE 11355
## 6 50007874 vapor lounge BRONX 3758 E TREMONT AVE 10465
## # ... with 12 more variables: PHONE <chr>, CUISINE.DESCRIPTION <chr>,
## # INSPECTION.DATE <dttm>, ACTION <chr>, VIOLATION.CODE <chr>,
## # VIOLATION.DESCRIPTION <chr>, CRITICAL.FLAG <chr>, SCORE <int>,
## # GRADE <chr>, GRADE.DATE <dttm>, RECORD.DATE <dttm>,
## # INSPECTION.TYPE <chr>
# Solution 1
# List the class vector of every column in the inspection tibble.
map(Rest, class)
## $CAMIS
## [1] "integer"
##
## $DBA
## [1] "character"
##
## $BORO
## [1] "character"
##
## $BUILDING
## [1] "character"
##
## $STREET
## [1] "character"
##
## $ZIPCODE
## [1] "integer"
##
## $PHONE
## [1] "character"
##
## $CUISINE.DESCRIPTION
## [1] "character"
##
## $INSPECTION.DATE
## [1] "POSIXct" "POSIXt"
##
## $ACTION
## [1] "character"
##
## $VIOLATION.CODE
## [1] "character"
##
## $VIOLATION.DESCRIPTION
## [1] "character"
##
## $CRITICAL.FLAG
## [1] "character"
##
## $SCORE
## [1] "integer"
##
## $GRADE
## [1] "character"
##
## $GRADE.DATE
## [1] "POSIXct" "POSIXt"
##
## $RECORD.DATE
## [1] "POSIXct" "POSIXt"
##
## $INSPECTION.TYPE
## [1] "character"
#Solution 2
# Convert a POSIXct date-time vector to Date; return anything else untouched.
#
# Arguments:
#   x - any vector (typically one column of a data frame).
# Returns: as.Date(x) when x inherits from "POSIXct", otherwise x unchanged.
#
# NOTE(review): as.Date() on POSIXct converts in UTC unless `tz` is given, so
# timestamps stored in another zone could land on a neighbouring day -- confirm
# the time zone of the downloaded columns.
Date_check <- function(x) {
  # inherits() tests class membership directly instead of comparing the whole
  # class vector element-wise with `==`.
  if (inherits(x, "POSIXct")) {
    as.Date(x)
  } else {
    x
  }
}
# Run Date_check over every column, then rebuild the tibble from the result.
Restnew <- as_tibble(map(Rest, Date_check))
# Re-inspect the column classes after the conversion.
map(Restnew, class)
## $CAMIS
## [1] "integer"
##
## $DBA
## [1] "character"
##
## $BORO
## [1] "character"
##
## $BUILDING
## [1] "character"
##
## $STREET
## [1] "character"
##
## $ZIPCODE
## [1] "integer"
##
## $PHONE
## [1] "character"
##
## $CUISINE.DESCRIPTION
## [1] "character"
##
## $INSPECTION.DATE
## [1] "Date"
##
## $ACTION
## [1] "character"
##
## $VIOLATION.CODE
## [1] "character"
##
## $VIOLATION.DESCRIPTION
## [1] "character"
##
## $CRITICAL.FLAG
## [1] "character"
##
## $SCORE
## [1] "integer"
##
## $GRADE
## [1] "character"
##
## $GRADE.DATE
## [1] "Date"
##
## $RECORD.DATE
## [1] "Date"
##
## $INSPECTION.TYPE
## [1] "character"
#Solution 3
# Count 2016 inspection records whose violation description mentions mice,
# hair, or sewage. Each record gets the first label that matches, in that
# order; records matching none get the placeholder string "NA" and are dropped.
# NOTE(review): grepl() is case-sensitive here and the recorded output has no
# "Sewage Violation" row -- confirm whether the descriptions use a different
# capitalisation.
Restnew %>%
  filter(year(INSPECTION.DATE) == 2016) %>%
  mutate(
    ViolationType = case_when(
      grepl("mice", VIOLATION.DESCRIPTION) ~ "Mice Violation",
      grepl("hair", VIOLATION.DESCRIPTION) ~ "Hair Violation",
      grepl("sewage", VIOLATION.DESCRIPTION) ~ "Sewage Violation",
      TRUE ~ "NA"
    )
  ) %>%
  filter(
    ViolationType %in% c("Mice Violation", "Hair Violation", "Sewage Violation")
  ) %>%
  group_by(ViolationType) %>%
  summarise(ViolRest = n())
## # A tibble: 2 × 2
## ViolationType ViolRest
## <chr> <int>
## 1 Hair Violation 2132
## 2 Mice Violation 8283
# Solution 4
# Plot the restaurants with the most violations matching a pattern in a year.
#
# Arguments:
#   exp  - pattern handed to str_detect() and matched against
#          VIOLATION.DESCRIPTION (e.g. "mice").
#   year - inspection year to keep; compared with the year of INSPECTION.DATE.
#   data - inspection records to summarise. Defaults to the global `Restnew`,
#          so existing two-argument calls behave exactly as before.
# Returns: a ggplot object -- a horizontal bar chart of matching-record counts
#   per DBA for the top 20 counts (top_n() keeps ties, so more than 20 bars
#   can appear).
TopRestViol <- function(exp, year, data = Restnew) {
  data %>%
    filter(
      # Namespace-qualified because the `year` argument shares its name with
      # lubridate's year() function.
      lubridate::year(INSPECTION.DATE) == year,
      str_detect(VIOLATION.DESCRIPTION, exp)
    ) %>%
    count(DBA) %>%
    top_n(20, n) %>%
    ggplot() +
    # Bar order comes from reorder(DBA, n), so no prior sorting of rows is
    # needed; geom_col() draws the counts as given.
    geom_col(
      mapping = aes(x = reorder(DBA, n), y = n),
      color = "darkblue", fill = "lightblue"
    ) +
    coord_flip() +
    labs(x = "Restaurant Name", y = "No. of Violations") +
    theme(text = element_text(size = 10))
}
# Chart the restaurants with the most mice-related violations in 2016.
TopRestViol(exp = "mice", year = 2016)