Load packages required for the assignment
library(RSocrata) #read in data from website
library(readr)
library(tidyverse) # Package to tidy, manipulate, and display data
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(stringr) #package for handling strings
library(base)
1 Import file, convert to tibble, and write to directory
# Download the DOHMH New York City restaurant inspection results through the
# Socrata endpoint.
NYCRest <- read.socrata("https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59")
# Assign the conversion back: a bare as_tibble() call only prints a converted
# copy and leaves NYCRest itself unchanged for every later step.
NYCRest <- as_tibble(NYCRest)
# Print the tibble preview.
NYCRest
## # A tibble: 436,612 × 18
## CAMIS DBA BORO BUILDING
## <int> <chr> <chr> <chr>
## 1 41606387 LA CUARTA RESTAURANT BROOKLYN 782
## 2 50007091 NEW PEKING RESTAURANT BROOKLYN 1581
## 3 50012185 CASTILLO RESTAURNAT BROOKLYN 709
## 4 40750062 PANEANTICO BAKERY BROOKLYN 9124
## 5 50034621 SHI LI XIANG QUEENS 13358
## 6 50007874 vapor lounge BRONX 3758
## 7 40552965 GROUND LEVEL PUB & GRUB STATEN ISLAND 958
## 8 41650546 KING KABAB QUEENS 16709
## 9 41524468 STARBUCKS MANHATTAN 1491
## 10 41435999 YOUR HOUSE CAFE BROOKLYN 6916
## # ... with 436,602 more rows, and 14 more variables: STREET <chr>,
## # ZIPCODE <int>, PHONE <chr>, CUISINE.DESCRIPTION <chr>,
## # INSPECTION.DATE <dttm>, ACTION <chr>, VIOLATION.CODE <chr>,
## # VIOLATION.DESCRIPTION <chr>, CRITICAL.FLAG <chr>, SCORE <int>,
## # GRADE <chr>, GRADE.DATE <dttm>, RECORD.DATE <dttm>,
## # INSPECTION.TYPE <chr>
# Cache the downloaded data in the working directory. The file name is passed
# positionally because readr 1.4.0 renamed the `path` argument to `file`;
# the positional form works with both old and new readr versions.
write_rds(NYCRest, "NYCRest.rds")
2 Preliminary Data Exploration
# Display the structure of NYCRest (its columns and their types).
str(NYCRest)
# Display a per-column summary of NYCRest.
summary(NYCRest)
3 Number of restaurant violations in 2016 involving mice, hair, or sewage
# Keep rows whose inspection date contains "2016" and whose description
# mentions mice, hair, or sewage; then tally how many of them mention mice.
NYCRest %>%
  filter(
    str_detect(INSPECTION.DATE, "2016"),
    str_detect(VIOLATION.DESCRIPTION, "mice|hair|sewage")
  ) %>%
  count(str_detect(VIOLATION.DESCRIPTION, "mice"))
## # A tibble: 2 × 2
## `str_detect(VIOLATION.DESCRIPTION, "mi...` n
## <lgl> <int>
## 1 FALSE 15778
## 2 TRUE 8283
# Keep rows whose inspection date contains "2016" and whose description
# mentions mice, hair, or sewage; then tally how many of them mention hair.
NYCRest %>%
  filter(
    str_detect(INSPECTION.DATE, "2016"),
    str_detect(VIOLATION.DESCRIPTION, "mice|hair|sewage")
  ) %>%
  count(str_detect(VIOLATION.DESCRIPTION, "hair"))
## # A tibble: 2 × 2
## `str_detect(VIOLATION.DESCRIPTION, "ha...` n
## <lgl> <int>
## 1 FALSE 21929
## 2 TRUE 2132
# Keep rows whose inspection date contains "2016" and whose description
# mentions mice, hair, or sewage; then tally how many of them mention sewage.
NYCRest %>%
  filter(
    str_detect(INSPECTION.DATE, "2016"),
    str_detect(VIOLATION.DESCRIPTION, "mice|hair|sewage")
  ) %>%
  count(str_detect(VIOLATION.DESCRIPTION, "sewage"))
## # A tibble: 2 × 2
## `str_detect(VIOLATION.DESCRIPTION, "se...` n
## <lgl> <int>
## 1 FALSE 10415
## 2 TRUE 13646
4 Function: restaurants with the most violations
# Tabulate and plot the restaurants with the most violations of a given kind
# in a given year.
#
# Args:
#   Rest:      data frame of inspection results; the code reads its
#              INSPECTION.DATE, VIOLATION.DESCRIPTION and DBA columns.
#   year:      inspection year to keep, as a number or string (e.g. "2015").
#   violation: regular expression matched case-insensitively against
#              VIOLATION.DESCRIPTION (e.g. "hair").
#
# Side effects: prints the summary table and draws the bar chart.
# Returns: the value of print() on the plot, as the original did.
RestViol <- function(Rest, year, violation) {
  # Compare on the formatted year so no date package is needed; lubridate,
  # which provides year(), is never attached by this script.
  target_year <- as.character(year)

  Year <- Rest %>%
    # Filter on the data that was passed in, not the global NYCRest, so the
    # row mask always has the same length as Rest.
    filter(format(INSPECTION.DATE, "%Y") == target_year) %>%
    # Match the caller's pattern rather than a hard-coded "y".
    filter(
      str_detect(VIOLATION.DESCRIPTION, regex(violation, ignore_case = TRUE))
    ) %>%
    group_by(DBA) %>%
    summarise(count = n()) %>%
    # Rank explicitly by the violation count; ties at the cutoff are kept.
    top_n(20, count)

  print(Year)

  rv <- ggplot(
    data = Year,
    aes(x = reorder(DBA, desc(count)), y = count, col = DBA, fill = DBA)
  ) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = count), na.rm = TRUE) +
    ggtitle("20 restaurants with Most Violation") +
    ylab("Violations") +
    xlab("Restaurants")

  print(rv)
}
# Pass the inspection data set itself; `Years` is never defined in this script,
# so the original call fails as soon as the function touches its data argument.
output <- RestViol(NYCRest, "2015", "hair")