This HTML document is created from the associated R Markdown file. In this assignment, I am using data from NYC restaurants accessed through https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59. The focus for this week’s assignment is on the following activities:
Package(s) used in this assignment to execute R code are mentioned below:
library(ggplot2) #Package to produce complex multi-layered graphs in R
library(tidyverse) #Set of packages including dpylr and ggplot
library(RSocrata) ## Provides easier interaction with Socrata open data portals
library(dplyr)## used to manupulate data
library(stringr) ## used for regex and other string functions
library(purrr) ## used for iteration functions
library(tidyr) ##used for cleaning the data
# Address of the DOHMH restaurant inspection dataset on the NYC open data portal.
url <- "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"

# Fetch the dataset through RSocrata and write a copy to an .rds file in the
# working directory.
NYCRestaurant_data <- RSocrata::read.socrata(url)
readr::write_rds(NYCRestaurant_data, "NYCRestaurant_data.rds")

# Collect the class of every column into a named list and display it.
ClassType <- map(NYCRestaurant_data, class)
ClassType
## $CAMIS
## [1] "integer"
##
## $DBA
## [1] "factor"
##
## $BORO
## [1] "factor"
##
## $BUILDING
## [1] "character"
##
## $STREET
## [1] "factor"
##
## $ZIPCODE
## [1] "integer"
##
## $PHONE
## [1] "character"
##
## $CUISINE.DESCRIPTION
## [1] "factor"
##
## $INSPECTION.DATE
## [1] "POSIXlt" "POSIXt"
##
## $ACTION
## [1] "factor"
##
## $VIOLATION.CODE
## [1] "factor"
##
## $VIOLATION.DESCRIPTION
## [1] "factor"
##
## $CRITICAL.FLAG
## [1] "factor"
##
## $SCORE
## [1] "integer"
##
## $GRADE
## [1] "factor"
##
## $GRADE.DATE
## [1] "POSIXlt" "POSIXt"
##
## $RECORD.DATE
## [1] "POSIXlt" "POSIXt"
##
## $INSPECTION.TYPE
## [1] "factor"
# Convert a POSIXlt column to Date; return any other input unchanged.
#
# x: a single column (vector) of the inspection data.
#
# Returns as.Date(x) when x inherits from "POSIXlt", otherwise x itself.
data_convert <- function(x) {
  # inherits() finds the class wherever it sits in the class vector, instead of
  # only comparing against the first element of class(x).
  if (inherits(x, "POSIXlt")) {
    # Returned directly (not via an assignment) so the value is visible.
    as.Date(x)
  } else {
    x
  }
}
# Run data_convert over every column, rebuild the resulting list as a tibble,
# and show its structure.
post_data_convert <- NYCRestaurant_data %>%
  map(data_convert) %>%
  as_tibble()
str(post_data_convert)
## Classes 'tbl_df', 'tbl' and 'data.frame': 436699 obs. of 18 variables:
## $ CAMIS : int 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 ...
## $ DBA : Factor w/ 20566 levels "1 EAST 66TH STREET KITCHEN",..: 28 28 28 28 28 28 28 28 28 28 ...
## $ BORO : Factor w/ 6 levels "BRONX","BROOKLYN",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BUILDING : chr "1007" "1007" "1007" "1007" ...
## $ STREET : Factor w/ 3305 levels "10 STREET","18 AVENUE",..: 40 40 40 40 40 40 40 40 40 40 ...
## $ ZIPCODE : int 10462 10462 10462 10462 10462 10462 10462 10462 10462 10462 ...
## $ PHONE : chr "7188924968" "7188924968" "7188924968" "7188924968" ...
## $ CUISINE.DESCRIPTION : Factor w/ 84 levels "American","Bagels/Pretzels",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ INSPECTION.DATE : Date, format: "2016-02-18" "2016-02-18" ...
## $ ACTION : Factor w/ 6 levels "Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addres"| __truncated__,..: 4 4 4 4 3 4 4 4 4 4 ...
## $ VIOLATION.CODE : Factor w/ 97 levels "","02A","02B",..: 13 25 20 35 1 13 15 8 13 18 ...
## $ VIOLATION.DESCRIPTION: Factor w/ 95 levels "","''''No Smoking\032 and/or 'Smoking Permitted\032 sign not conspicuously posted. Health warning not present on 'Smoking Permitte"| __truncated__,..: 14 16 23 33 1 14 17 25 14 35 ...
## $ CRITICAL.FLAG : Factor w/ 3 levels "Critical","Not Applicable",..: 1 3 1 3 2 1 1 1 1 1 ...
## $ SCORE : int 10 10 6 2 NA 6 6 32 32 32 ...
## $ GRADE : Factor w/ 7 levels "","A","B","C",..: 2 2 2 2 1 2 2 1 1 1 ...
## $ GRADE.DATE : Date, format: "2016-02-18" "2016-02-18" ...
## $ RECORD.DATE : Date, format: "2016-12-03" "2016-12-03" ...
## $ INSPECTION.TYPE : Factor w/ 34 levels "Administrative Miscellaneous / Initial Inspection",..: 4 4 4 4 12 5 5 4 4 4 ...
# Inspection rows dated 2016 whose violation description mentions "mice"
# (case-insensitive); both conditions are applied in one filter() call.
violation_mice <- filter(
  post_data_convert,
  format(INSPECTION.DATE, "%Y") == 2016,
  str_detect(VIOLATION.DESCRIPTION, regex("mice", ignore_case = TRUE))
)
# Report the number of distinct CAMIS values among those rows.
cat("Number of restaurants which had violations regarding mice: ", length(unique(violation_mice$CAMIS)))
## Number of restaurants which had violations regarding mice: 5705
# Inspection rows dated 2016 whose violation description mentions "hair"
# (case-insensitive); both conditions are applied in one filter() call.
violation_hair <- filter(
  post_data_convert,
  format(INSPECTION.DATE, "%Y") == 2016,
  str_detect(VIOLATION.DESCRIPTION, regex("hair", ignore_case = TRUE))
)
# Report the number of distinct CAMIS values among those rows.
cat("Number of restaurants which had violations regarding hair: ", length(unique(violation_hair$CAMIS)))
## Number of restaurants which had violations regarding hair: 2001
# Inspection rows dated 2016 whose violation description mentions "sewage"
# (case-insensitive); both conditions are applied in one filter() call.
violation_sewage <- filter(
  post_data_convert,
  format(INSPECTION.DATE, "%Y") == 2016,
  str_detect(VIOLATION.DESCRIPTION, regex("sewage", ignore_case = TRUE))
)
# Report the number of distinct CAMIS values among those rows.
cat("Number of restaurants which had violations regarding sewage: ", length(unique(violation_sewage$CAMIS)))
## Number of restaurants which had violations regarding sewage: 9482
# Plot the 20 restaurants with the most matching violations in a given year.
#
# data1:     inspection data containing the columns INSPECTION.DATE,
#            VIOLATION.DESCRIPTION, CAMIS and DBA.
# year:      inspection year to keep (e.g. 2016); compared against the
#            four-digit year of INSPECTION.DATE.
# violation: regular expression matched case-insensitively against
#            VIOLATION.DESCRIPTION.
#
# Prints a bar chart of violation counts per restaurant and returns the value
# of print(graph).
TopRestaurants <- function(data1, year, violation) {
  data_r <- data1 %>%
    # format() yields a character year, so compare against a character value
    # explicitly rather than relying on implicit coercion.
    filter(format(INSPECTION.DATE, "%Y") == as.character(year)) %>%
    filter(str_detect(VIOLATION.DESCRIPTION, regex(violation, ignore_case = TRUE))) %>%
    group_by(CAMIS, DBA) %>%
    summarise(count = n()) %>%
    # summarise() drops only the last grouping level; remove the remaining
    # grouping so the ranking below works on a plain table.
    ungroup() %>%
    arrange(desc(count)) %>%
    head(20)

  # `fill` colours the bars themselves; `color` would only colour their outlines.
  # NOTE(review): rows are unique per (CAMIS, DBA) but the x axis uses DBA, so
  # two establishments sharing a DBA would share one bar position -- confirm
  # whether that is acceptable.
  graph <- ggplot(data = data_r, aes(x = reorder(DBA, desc(count)), y = count, fill = DBA)) +
    geom_bar(stat = "identity") +
    ggtitle("Top 20 Restaurants with most violations") +
    labs(y = "# of Violations", x = "Restaurants")

  print(graph)
}
# Chart the 2016 "mice" violations and keep the value TopRestaurants() returns.
top_r <- TopRestaurants(data1 = post_data_convert, year = 2016, violation = "mice")