In this RMD, we look at the NYC Restaurant data and automate keyword searches within the citation notes.
library(RSocrata) #importing data from NYC Open Data
library(readr) #converting df to rds file
library(purrr) #using map function
library(tidyverse)
library(stringr) #searching through strings
library(lubridate) #to easily extract Year from a Date
# Load the NYC restaurant inspection data.
# Commented-out alternative: download it from NYC Open Data with RSocrata.
#url <- "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"
#df <- read.socrata(url)
# Here the data is read from the local file df.csv instead.
df <- read.csv(file = "df.csv")
# Store the data frame as an RDS file for the class exercises.
readr::write_rds(df, "nyc_rest.rds")
#Question 1
# Report the class of every column in the imported data frame.
df %>% map(class)
## $X
## [1] "integer"
##
## $CAMIS
## [1] "integer"
##
## $DBA
## [1] "factor"
##
## $BORO
## [1] "factor"
##
## $BUILDING
## [1] "factor"
##
## $STREET
## [1] "factor"
##
## $ZIPCODE
## [1] "integer"
##
## $PHONE
## [1] "factor"
##
## $CUISINE.DESCRIPTION
## [1] "factor"
##
## $INSPECTION.DATE
## [1] "factor"
##
## $ACTION
## [1] "factor"
##
## $VIOLATION.CODE
## [1] "factor"
##
## $VIOLATION.DESCRIPTION
## [1] "factor"
##
## $CRITICAL.FLAG
## [1] "factor"
##
## $SCORE
## [1] "integer"
##
## $GRADE
## [1] "factor"
##
## $GRADE.DATE
## [1] "factor"
##
## $RECORD.DATE
## [1] "factor"
##
## $INSPECTION.TYPE
## [1] "factor"
#Question 2
# Convert a POSIXlt vector to Date; return any other input unchanged.
#
# x: a vector (typically one column of a data frame).
# Returns as.Date(x) when x inherits from "POSIXlt", otherwise x itself.
date_df <- function(x) {
  # inherits() also recognises subclasses of POSIXlt, unlike comparing
  # only the first element of class(x).
  if (inherits(x, "POSIXlt")) {
    return(as.Date(x))
  }
  x
}
# Apply date_df to every column (POSIXlt columns become Date) and collect
# the result in a tibble.
df1 <- as_tibble(map_df(df, date_df))
class(df1)
## [1] "tbl_df" "tbl" "data.frame"
# Keep only the inspections carried out in 2016. Columns are referenced
# through the data mask rather than with df1$..., so the filter always
# operates on the piped-in data.
df2 <- df1 %>% filter(year(INSPECTION.DATE) == 2016)
# Keep the 2016 citations whose description matches each keyword.
dfmice <- df2 %>% filter(grepl("mice", VIOLATION.DESCRIPTION))
dfhair <- df2 %>% filter(grepl("hair", VIOLATION.DESCRIPTION))
dfsewage <- df2 %>% filter(grepl("sewage", VIOLATION.DESCRIPTION))
paste("There are", nrow(dfmice), "citations with the word 'mice'")
## [1] "There are 8283 citations with the word 'mice'"
paste("There are", nrow(dfhair), "citations with the word 'hair'")
## [1] "There are 2132 citations with the word 'hair'"
paste("There are", nrow(dfsewage), "citations with the word 'sewage'")
## [1] "There are 13646 citations with the word 'sewage'"
The function takes the NYC tibble, a year, and a search string as input and returns a bar chart of the 20 restaurants with the most citations whose VIOLATION.DESCRIPTION contains that string.
#Question 4
# Bar chart of the restaurants with the most citations matching a keyword.
#
# df:     data frame or tibble with the columns DBA, INSPECTION.DATE and
#         VIOLATION.DESCRIPTION.
# year:   inspection year to keep; compared with lubridate::year() of
#         INSPECTION.DATE.
# string: pattern searched for in VIOLATION.DESCRIPTION. It is passed to
#         grepl(), so it is interpreted as a regular expression.
# Returns a ggplot bar chart of the citation count per DBA, limited to the
# 20 largest counts as selected by top_n().
chart_viol <- function(df, year, string) {
  # Copy the arguments to local names so they are not mistaken for columns
  # of `df` inside the data-masked filter() call.
  target_year <- year
  pattern <- string
  df %>%
    filter(
      lubridate::year(INSPECTION.DATE) == target_year,
      grepl(pattern, VIOLATION.DESCRIPTION)
    ) %>%
    group_by(DBA) %>%
    summarise(n = n()) %>%
    top_n(20, n) %>%
    ggplot(aes(x = reorder(DBA, -n), y = n)) +
    geom_bar(stat = "identity") +
    # Rotate the restaurant names so the x-axis labels do not overlap.
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    ggtitle(paste("NYC restaurants with the most citations for", string)) +
    labs(x = "Restaurant", y = "Citation count")
}
# Example: chart the "mice" citations for 2016.
chart_viol(df = df1, year = 2016, string = "mice")