Week 7 Assignment

In this RMD, we look at the NYC Restaurant data and automate keyword searches within the citation notes.

Packages needed

library(RSocrata) #importing data from NYC Open Data
library(readr) #converting df to rds file
library(purrr) #using map function
library(tidyverse)
library(stringr) #searching through strings
library(lubridate) #to easily extract Year from a Date

Import NYC Restaurant data

# Import NYC Restaurant data.
# The original download from NYC Open Data is kept below for reference:
#url <- "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"
#df <- read.socrata(url)
# A local CSV copy is read instead of calling the API.
df <- read.csv(file = "df.csv")

# Save the data frame as an R object for class exercises.
readr::write_rds(x = df, "nyc_rest.rds")

Question 1 - let’s see what the class of each column is

# Question 1: list the class of every column in df
df %>% map(class)
## $X
## [1] "integer"
## 
## $CAMIS
## [1] "integer"
## 
## $DBA
## [1] "factor"
## 
## $BORO
## [1] "factor"
## 
## $BUILDING
## [1] "factor"
## 
## $STREET
## [1] "factor"
## 
## $ZIPCODE
## [1] "integer"
## 
## $PHONE
## [1] "factor"
## 
## $CUISINE.DESCRIPTION
## [1] "factor"
## 
## $INSPECTION.DATE
## [1] "factor"
## 
## $ACTION
## [1] "factor"
## 
## $VIOLATION.CODE
## [1] "factor"
## 
## $VIOLATION.DESCRIPTION
## [1] "factor"
## 
## $CRITICAL.FLAG
## [1] "factor"
## 
## $SCORE
## [1] "integer"
## 
## $GRADE
## [1] "factor"
## 
## $GRADE.DATE
## [1] "factor"
## 
## $RECORD.DATE
## [1] "factor"
## 
## $INSPECTION.TYPE
## [1] "factor"

Question 2 - a function to convert the POSIXlt class to Date

#Question 2
#' Convert a POSIXlt vector to Date, leaving any other input untouched.
#'
#' @param x Any R object, typically one column of a data frame.
#' @return `as.Date(x)` when `x` inherits from "POSIXlt"; otherwise `x`
#'   unchanged.
date_df <- function(x) {
  # inherits() looks at the whole class vector, so an object carrying
  # "POSIXlt" anywhere in its class attribute is converted, not only one
  # whose first class entry happens to be "POSIXlt".
  if (inherits(x, "POSIXlt")) {
    return(as.Date(x))
  }
  x
}
# Run date_df over every column of df and collect the result as a tibble.
df1 <- df %>%
  map_df(date_df) %>%
  as_tibble()
class(df1)
## [1] "tbl_df"     "tbl"        "data.frame"

Question 3 - how many restaurants have been cited for mice, hair and sewage?

# Question 3: count the 2016 citations whose description mentions each keyword.
# Columns are referenced by bare name inside filter() so each condition is
# evaluated against the rows flowing through the pipe, not a separate copy.
df2 <- df1 %>% filter(year(INSPECTION.DATE) == 2016)

# grepl() returns one TRUE/FALSE per row (FALSE for NA descriptions), which
# replaces the grep(value = TRUE) / %in% round trip with a direct row mask.
dfmice <- df2 %>% filter(grepl("mice", VIOLATION.DESCRIPTION))

dfhair <- df2 %>% filter(grepl("hair", VIOLATION.DESCRIPTION))

dfsewage <- df2 %>% filter(grepl("sewage", VIOLATION.DESCRIPTION))

paste("There are", nrow(dfmice), "citations with the word 'mice'")
## [1] "There are 8283 citations with the word 'mice'"
paste("There are", nrow(dfhair), "citations with the word 'hair'")
## [1] "There are 2132 citations with the word 'hair'"
paste("There are", nrow(dfsewage), "citations with the word 'sewage'")
## [1] "There are 13646 citations with the word 'sewage'"

Question 4 - let’s build a function!

The function takes the NYC tibble, a year, and a search string as input, and returns a bar chart of the 20 restaurants with the most citations whose VIOLATION.DESCRIPTION contains the user-supplied string.

#Question 4
#' Bar chart of the restaurants most often cited for a keyword in a given year.
#'
#' @param df Data frame or tibble of inspection records containing the columns
#'   `DBA`, `INSPECTION.DATE` and `VIOLATION.DESCRIPTION`.
#' @param year Single year; only rows whose `INSPECTION.DATE` falls in this
#'   year are counted.
#' @param string Single search term, matched as a regular expression against
#'   `VIOLATION.DESCRIPTION`.
#' @return A ggplot bar chart of citation counts per `DBA`, limited to the
#'   top 20 counts as selected by `top_n()`.
chart_viol <- function(df, year, string) {
  needed <- c("DBA", "INSPECTION.DATE", "VIOLATION.DESCRIPTION")
  stopifnot(
    is.data.frame(df),
    all(needed %in% names(df)),
    length(year) == 1L,
    length(string) == 1L
  )

  # Keep the coerced value; the bare toString() call used to be discarded.
  pattern <- toString(string)

  # Build the row mask from the `df` argument and the `year` argument, rather
  # than a hard-coded 2016 and a data frame taken from the global environment.
  in_year <- lubridate::year(df$INSPECTION.DATE) == year
  has_term <- grepl(pattern, df$VIOLATION.DESCRIPTION)
  # which() drops NA comparisons, mirroring how filter() discards NA rows.
  rows <- which(in_year & has_term)

  if (length(rows) == 0L) {
    warning(
      "No citations matching '", pattern, "' found for ", year,
      call. = FALSE
    )
  }

  df[rows, , drop = FALSE] %>%
    count(DBA) %>%
    top_n(20, n) %>%
    ggplot(aes(x = reorder(DBA, -n), y = n)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    ggtitle(paste("NYC restaurants with the most citations for", pattern)) +
    labs(x = "Restaurant", y = "Citation count")
}

# Example: chart the 2016 citations that mention "mice"
chart_viol(df = df1, year = 2016, string = "mice")