R Markdown

This HTML document is created from the associated R Markdown file. In this assignment, I am using data on NYC restaurants accessed through https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59. The focus of this week’s assignment is on the following activities:

  1. Strings and regex
  2. Writing functions
  3. Using iterations

Packages Required

The packages used in this assignment to execute the R code are listed below:

library(ggplot2)  #Package to produce complex multi-layered graphs in R
library(tidyverse) #Set of packages including dpylr and ggplot
library(RSocrata) ## Provides easier interaction with Socrata open data portals
library(dplyr)## used to manupulate data
library(stringr) ## used for regex and other string functions
library(purrr) ## used for iteration functions
library(tidyr) ##used for cleaning the data

Data import

# Socrata page for the DOHMH New York City restaurant inspection results.
url <- "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"

# Download the dataset through RSocrata.
NYCRestaurant_data <- RSocrata::read.socrata(url)

# Save a local RDS copy of the download (path is relative to the working
# directory).
readr::write_rds(NYCRestaurant_data, "NYCRestaurant_data.rds")

Question1

# Collect the class vector of every column of the raw download, one list
# element per column.
ClassType <- map(NYCRestaurant_data, function(column) class(column))
ClassType
## $CAMIS
## [1] "integer"
## 
## $DBA
## [1] "factor"
## 
## $BORO
## [1] "factor"
## 
## $BUILDING
## [1] "character"
## 
## $STREET
## [1] "factor"
## 
## $ZIPCODE
## [1] "integer"
## 
## $PHONE
## [1] "character"
## 
## $CUISINE.DESCRIPTION
## [1] "factor"
## 
## $INSPECTION.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $ACTION
## [1] "factor"
## 
## $VIOLATION.CODE
## [1] "factor"
## 
## $VIOLATION.DESCRIPTION
## [1] "factor"
## 
## $CRITICAL.FLAG
## [1] "factor"
## 
## $SCORE
## [1] "integer"
## 
## $GRADE
## [1] "factor"
## 
## $GRADE.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $RECORD.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $INSPECTION.TYPE
## [1] "factor"

Question2

# Convert a POSIXlt date-time vector to Date; return anything else unchanged.
#
# Args:
#   x: A vector, e.g. one column of a data frame.
#
# Returns:
#   as.Date(x) when x inherits from "POSIXlt", otherwise x itself.
data_convert <- function(x) {
  # inherits() inspects the whole class vector, so the check does not depend
  # on "POSIXlt" being the first class listed.
  if (inherits(x, "POSIXlt")) {
    as.Date(x)
  } else {
    x
  }
}
# Run data_convert() over every column, then reassemble the columns as a
# tibble.
post_data_convert <- NYCRestaurant_data %>%
  map(data_convert) %>%
  as_tibble()

# Display the structure of the converted data.
str(post_data_convert)
## Classes 'tbl_df', 'tbl' and 'data.frame':    436699 obs. of  18 variables:
##  $ CAMIS                : int  30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 ...
##  $ DBA                  : Factor w/ 20566 levels "1 EAST 66TH STREET KITCHEN",..: 28 28 28 28 28 28 28 28 28 28 ...
##  $ BORO                 : Factor w/ 6 levels "BRONX","BROOKLYN",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BUILDING             : chr  "1007" "1007" "1007" "1007" ...
##  $ STREET               : Factor w/ 3305 levels "10 STREET","18 AVENUE",..: 40 40 40 40 40 40 40 40 40 40 ...
##  $ ZIPCODE              : int  10462 10462 10462 10462 10462 10462 10462 10462 10462 10462 ...
##  $ PHONE                : chr  "7188924968" "7188924968" "7188924968" "7188924968" ...
##  $ CUISINE.DESCRIPTION  : Factor w/ 84 levels "American","Bagels/Pretzels",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ INSPECTION.DATE      : Date, format: "2016-02-18" "2016-02-18" ...
##  $ ACTION               : Factor w/ 6 levels "Establishment Closed by DOHMH.  Violations were cited in the following area(s) and those requiring immediate action were addres"| __truncated__,..: 4 4 4 4 3 4 4 4 4 4 ...
##  $ VIOLATION.CODE       : Factor w/ 97 levels "","02A","02B",..: 13 25 20 35 1 13 15 8 13 18 ...
##  $ VIOLATION.DESCRIPTION: Factor w/ 95 levels "","''''No Smoking\032 and/or 'Smoking Permitted\032 sign not conspicuously posted. Health warning not present on 'Smoking Permitte"| __truncated__,..: 14 16 23 33 1 14 17 25 14 35 ...
##  $ CRITICAL.FLAG        : Factor w/ 3 levels "Critical","Not Applicable",..: 1 3 1 3 2 1 1 1 1 1 ...
##  $ SCORE                : int  10 10 6 2 NA 6 6 32 32 32 ...
##  $ GRADE                : Factor w/ 7 levels "","A","B","C",..: 2 2 2 2 1 2 2 1 1 1 ...
##  $ GRADE.DATE           : Date, format: "2016-02-18" "2016-02-18" ...
##  $ RECORD.DATE          : Date, format: "2016-12-03" "2016-12-03" ...
##  $ INSPECTION.TYPE      : Factor w/ 34 levels "Administrative Miscellaneous / Initial Inspection",..: 4 4 4 4 12 5 5 4 4 4 ...
# Rows inspected in 2016 whose violation description mentions "mice"
# (case-insensitive).
violation_mice <- filter(
  post_data_convert,
  format(INSPECTION.DATE, "%Y") == "2016",
  str_detect(VIOLATION.DESCRIPTION, regex("mice", ignore_case = TRUE))
)

# Report how many distinct CAMIS ids appear among those rows.
cat(
  "Number of restaurants which had violations regarding mice: ",
  n_distinct(violation_mice$CAMIS)
)
## Number of restaurants which had violations regarding mice:  5705
# Rows inspected in 2016 whose violation description mentions "hair"
# (case-insensitive).
violation_hair <- filter(
  post_data_convert,
  format(INSPECTION.DATE, "%Y") == "2016",
  str_detect(VIOLATION.DESCRIPTION, regex("hair", ignore_case = TRUE))
)

# Report how many distinct CAMIS ids appear among those rows.
cat(
  "Number of restaurants which had violations regarding hair: ",
  n_distinct(violation_hair$CAMIS)
)
## Number of restaurants which had violations regarding hair:  2001
# Rows inspected in 2016 whose violation description mentions "sewage"
# (case-insensitive).
violation_sewage <- filter(
  post_data_convert,
  format(INSPECTION.DATE, "%Y") == "2016",
  str_detect(VIOLATION.DESCRIPTION, regex("sewage", ignore_case = TRUE))
)

# Report how many distinct CAMIS ids appear among those rows.
cat(
  "Number of restaurants which had violations regarding sewage: ",
  n_distinct(violation_sewage$CAMIS)
)
## Number of restaurants which had violations regarding sewage:  9482

Question4

# Plot the restaurants with the most matching violations in a given year.
#
# Args:
#   data1:     Data frame with columns CAMIS, DBA, INSPECTION.DATE and
#              VIOLATION.DESCRIPTION.
#   year:      Inspection year to keep; compared with the "%Y"-formatted
#              INSPECTION.DATE, so a number or a string both work.
#   violation: Regular expression matched case-insensitively against
#              VIOLATION.DESCRIPTION.
#   n_top:     Number of restaurants to keep (default 20, the value that was
#              previously hard-coded).
#
# Returns:
#   The value of print(graph), i.e. whatever ggplot2's print method returns
#   for the plot.
TopRestaurants <- function(data1, year, violation, n_top = 20) {
  data_r <- data1 %>%
    filter(
      format(INSPECTION.DATE, "%Y") == as.character(year),
      str_detect(VIOLATION.DESCRIPTION, regex(violation, ignore_case = TRUE))
    ) %>%
    group_by(CAMIS, DBA) %>%
    summarise(count = n()) %>%
    # summarise() only peels off the last grouping level, so the result is
    # still grouped by CAMIS. Drop the grouping so the sort and the top-n cut
    # below always act on the whole table.
    ungroup() %>%
    arrange(desc(count)) %>%
    head(n_top)

  # NOTE(review): bars are keyed by DBA, so two CAMIS ids that share a DBA
  # land on the same x position -- confirm that is acceptable.
  # NOTE(review): `color` maps to the bar outline; switch to `fill` if filled
  # bars were intended.
  graph <- ggplot(
    data = data_r,
    aes(x = reorder(DBA, desc(count)), y = count, color = DBA)
  ) +
    geom_bar(stat = "identity") +
    ggtitle(paste("Top", n_top, "Restaurants with most violations")) +
    labs(y = "# of Violations", x = "Restaurants")

  print(graph)
}
# Draw the chart for 2016 violations matching "mice"; top_r keeps the value
# returned by TopRestaurants().
top_r <- TopRestaurants(
  data1 = post_data_convert,
  year = 2016,
  violation = "mice"
)