Answers in individual tabs below

Data Import

Data was imported using the RSocrata package

library(RSocrata)
# One-time download, kept commented out: these three lines fetch the data set
# with read.socrata() and save the result to a local RDS file.
#file_url <- 'https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59'
#data <- read.socrata(url = file_url)
#saveRDS(data,file="nyc_restaurant_data.rds")
# Load the previously saved RDS file (path is relative to the working directory).
data <- readRDS("nyc_restaurant_data.rds")
# Show the first rows of the loaded data.
head(data)
##      CAMIS                   DBA  BORO BUILDING          STREET ZIPCODE
## 1 30075445 MORRIS PARK BAKE SHOP BRONX     1007 MORRIS PARK AVE   10462
## 2 30075445 MORRIS PARK BAKE SHOP BRONX     1007 MORRIS PARK AVE   10462
## 3 30075445 MORRIS PARK BAKE SHOP BRONX     1007 MORRIS PARK AVE   10462
## 4 30075445 MORRIS PARK BAKE SHOP BRONX     1007 MORRIS PARK AVE   10462
## 5 30075445 MORRIS PARK BAKE SHOP BRONX     1007 MORRIS PARK AVE   10462
## 6 30075445 MORRIS PARK BAKE SHOP BRONX     1007 MORRIS PARK AVE   10462
##        PHONE CUISINE.DESCRIPTION INSPECTION.DATE
## 1 7188924968              Bakery      2016-02-18
## 2 7188924968              Bakery      2016-02-18
## 3 7188924968              Bakery      2015-02-09
## 4 7188924968              Bakery      2014-03-03
## 5 7188924968              Bakery      2013-10-10
## 6 7188924968              Bakery      2013-09-11
##                                                        ACTION
## 1             Violations were cited in the following area(s).
## 2             Violations were cited in the following area(s).
## 3             Violations were cited in the following area(s).
## 4             Violations were cited in the following area(s).
## 5 No violations were recorded at the time of this inspection.
## 6             Violations were cited in the following area(s).
##   VIOLATION.CODE
## 1            04L
## 2            08A
## 3            06C
## 4            10F
## 5               
## 6            04L
##                                                                                                                                                                                                                                                             VIOLATION.DESCRIPTION
## 1                                                                                                                                                                                                 Evidence of mice or live mice present in facility's food and/or non-food areas.
## 2                                                                                                                                              Facility not vermin proof. Harborage or conditions conducive to attracting vermin to the premises and/or allowing vermin to exist.
## 3                                                                                                                                                      Food not protected from potential source of contamination during storage, preparation, transportation, display or service.
## 4 Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact surface or equipment improperly maintained and/or not properly sealed, raised, spaced or movable to allow accessibility for cleaning on all sides, above and underneath the unit.
## 5                                                                                                                                                                                                                                                                                
## 6                                                                                                                                                                                                 Evidence of mice or live mice present in facility's food and/or non-food areas.
##    CRITICAL.FLAG SCORE GRADE GRADE.DATE RECORD.DATE
## 1       Critical    10     A 2016-02-18  2016-12-01
## 2   Not Critical    10     A 2016-02-18  2016-12-01
## 3       Critical     6     A 2015-02-09  2016-12-01
## 4   Not Critical     2     A 2014-03-03  2016-12-01
## 5 Not Applicable    NA             <NA>  2016-12-01
## 6       Critical     6     A 2013-09-11  2016-12-01
##                            INSPECTION.TYPE
## 1    Cycle Inspection / Initial Inspection
## 2    Cycle Inspection / Initial Inspection
## 3    Cycle Inspection / Initial Inspection
## 4    Cycle Inspection / Initial Inspection
## 5 Trans Fat / Second Compliance Inspection
## 6         Cycle Inspection / Re-inspection

Answer 1

Use the map function to identify the class of each variable.

library(dplyr)
library(purrr)
# List the class vector of every column, keyed by column name.
map(data, class)
## $CAMIS
## [1] "integer"
## 
## $DBA
## [1] "factor"
## 
## $BORO
## [1] "factor"
## 
## $BUILDING
## [1] "character"
## 
## $STREET
## [1] "factor"
## 
## $ZIPCODE
## [1] "integer"
## 
## $PHONE
## [1] "character"
## 
## $CUISINE.DESCRIPTION
## [1] "factor"
## 
## $INSPECTION.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $ACTION
## [1] "factor"
## 
## $VIOLATION.CODE
## [1] "factor"
## 
## $VIOLATION.DESCRIPTION
## [1] "factor"
## 
## $CRITICAL.FLAG
## [1] "factor"
## 
## $SCORE
## [1] "integer"
## 
## $GRADE
## [1] "factor"
## 
## $GRADE.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $RECORD.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $INSPECTION.TYPE
## [1] "factor"

Answer 2

Notice how the date variables are in POSIXlt form. Create a function that takes a single argument (“x”) and checks if it is of POSIXlt class. If it is, have the function change the input to a simple Date class with as.Date. If not, then the function should keep the input class as is. Apply this function to each of the columns in the NY restaurant data set by using the map function. Be sure the final output is a tibble and not a list.

library(lubridate)
library(tibble)
#' Test whether an object carries the POSIXlt class
#'
#' @param x Any R object (typically one column of a data frame).
#' @return A single `TRUE` if "POSIXlt" is among the classes of `x`,
#'   otherwise a single `FALSE`.
check_date <- function(x) {
  # inherits() yields one logical for the whole object, so there is no need
  # for ifelse() on a scalar or for the reassignable shorthands T / F.
  inherits(x, "POSIXlt")
}

#' Convert POSIXlt input to Date, leaving any other input untouched
#'
#' @param x Any R object (typically one column of a data frame).
#' @return `as.Date(x)` when `check_date(x)` is TRUE; otherwise `x` unchanged.
convert_date <- function(x) {
  if (check_date(x)) {
    as.Date(x)
  } else {
    x
  }
}

# Run convert_date() over every column, then rebuild a tibble from the
# resulting named list. as_tibble() is used because as_data_frame() is
# deprecated in the tibble package.
data02 <- data %>%
  map(convert_date) %>%
  as_tibble()
head(data02)
## # A tibble: 6 × 18
##      CAMIS                   DBA   BORO BUILDING          STREET ZIPCODE
##      <int>                <fctr> <fctr>    <chr>          <fctr>   <int>
## 1 30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 2 30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 3 30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 4 30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 5 30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 6 30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## # ... with 12 more variables: PHONE <chr>, CUISINE.DESCRIPTION <fctr>,
## #   INSPECTION.DATE <date>, ACTION <fctr>, VIOLATION.CODE <fctr>,
## #   VIOLATION.DESCRIPTION <fctr>, CRITICAL.FLAG <fctr>, SCORE <int>,
## #   GRADE <fctr>, GRADE.DATE <date>, RECORD.DATE <date>,
## #   INSPECTION.TYPE <fctr>

Answer 3

Using this reformatted tibble, identify how many restaurants in 2016 had a violation regarding “mice”? How about “hair”? What about “sewage”? Hint: the VIOLATION.DESCRIPTION and INSPECTION.DATE variables will be useful here

library(stringr)
library(purrr)

  # Count distinct restaurant names (DBA) that had a 2016 inspection row whose
  # violation description mentions "mice". The grouping step only serves to
  # carry the year into the printed result.
  # NOTE(review): DBA is a business name; locations sharing a name collapse
  # into one. CAMIS may be the per-establishment key -- confirm.
  data02 %>%
    filter(
      year(INSPECTION.DATE) == 2016,
      str_detect(tolower(VIOLATION.DESCRIPTION), "mice")
    ) %>%
    group_by(year(INSPECTION.DATE)) %>%
    summarize(mice = n_distinct(DBA))
## # A tibble: 1 × 2
##   `year(INSPECTION.DATE)`  mice
##                     <dbl> <int>
## 1                    2016  5182
  # Count distinct restaurant names (DBA) that had a 2016 inspection row whose
  # violation description mentions "hair". The summary column is named after
  # the keyword being counted; the grouping step only carries the year into
  # the printed result.
  # NOTE(review): DBA is a business name; locations sharing a name collapse
  # into one. CAMIS may be the per-establishment key -- confirm.
  data02 %>%
    filter(
      year(INSPECTION.DATE) == 2016,
      str_detect(tolower(VIOLATION.DESCRIPTION), "hair")
    ) %>%
    group_by(year(INSPECTION.DATE)) %>%
    summarize(hair = n_distinct(DBA))
## # A tibble: 1 × 2
##   `year(INSPECTION.DATE)`  hair
##                     <dbl> <int>
## 1                    2016  1878
  # Count distinct restaurant names (DBA) that had a 2016 inspection row whose
  # violation description mentions "sewage". The summary column is named after
  # the keyword being counted; the grouping step only carries the year into
  # the printed result.
  # NOTE(review): DBA is a business name; locations sharing a name collapse
  # into one. CAMIS may be the per-establishment key -- confirm.
  data02 %>%
    filter(
      year(INSPECTION.DATE) == 2016,
      str_detect(tolower(VIOLATION.DESCRIPTION), "sewage")
    ) %>%
    group_by(year(INSPECTION.DATE)) %>%
    summarize(sewage = n_distinct(DBA))
## # A tibble: 1 × 2
##   `year(INSPECTION.DATE)` sewage
##                     <dbl>  <int>
## 1                    2016   8051

In 2016, the numbers of restaurants with each type of violation are:

  • Mice: 5,182
  • Hair: 1,878
  • Sewage: 8,051

Answer 4

Create a function to apply to this tibble that takes a year and a regular expression (e.g. “mice”) and returns a ggplot bar chart of the top 20 restaurants with the most violations. Make sure the restaurants are properly rank-ordered in the bar chart.

library(ggplot2)
#' Bar chart of the 20 restaurant names with the most matching violations
#'
#' @param keyword Regular expression matched, ignoring case, against
#'   VIOLATION.DESCRIPTION. It is also pasted into the axis label and title.
#' @param year Inspection year to keep (compared with the year of
#'   INSPECTION.DATE).
#' @param data Data frame with DBA, INSPECTION.DATE and VIOLATION.DESCRIPTION
#'   columns. Defaults to the global `data02`, which the function previously
#'   read directly.
#' @return A ggplot object: a horizontal bar chart with one bar per DBA,
#'   ordered by the number of matching violation rows.
func4 <- function(keyword, year, data = data02) {
  # Copy the argument under a distinct name so the filter below cannot confuse
  # it with the lubridate::year() function of the same name.
  target_year <- year
  # Ignore case on both sides instead of lower-casing only the descriptions,
  # so a keyword such as "Mice" still matches.
  pattern <- regex(keyword, ignore_case = TRUE)

  data %>%
    filter(lubridate::year(INSPECTION.DATE) == target_year) %>%
    group_by(DBA) %>%
    summarize(
      count = sum(
        str_detect(as.character(VIOLATION.DESCRIPTION), pattern),
        na.rm = TRUE
      )
    ) %>%
    arrange(desc(count)) %>%
    # head() keeps exactly 20 rows; top_n() would also keep every row tied
    # with the 20th and could put more than 20 bars on the chart.
    head(20) %>%
    ggplot() +
    geom_bar(
      mapping = aes(x = reorder(DBA, count), y = count),
      stat = "identity"
    ) +
    coord_flip() +
    ylab(paste("Number of", keyword, "violations")) +
    ggtitle(paste("Number of", keyword, "Violations for", year)) +
    xlab("")
}
  
# Draw the 2016 chart for each keyword of interest.
func4(keyword = "mice", year = 2016)

func4(keyword = "hair", year = 2016)

func4(keyword = "contamination", year = 2016)