Synopsis

This is the week-7 homework for the BANA 8090 Data Wrangling With R course. This week focuses on writing functions.

Details

library(RSocrata)
library(tidyverse)
library(dplyr)
library(stringr)

# NOTE(review): setwd() pins the script to one machine's directory layout;
# the relative cache path used below resolves against this directory.
setwd("C:/Manisha_Arora/UC-BANA/Sem1/Data Wrangling R - BB/Data Wrangling with R (BANA 8090)/Week-7")

# Download the restaurant inspection results from the Socrata endpoint.
url <- "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"
data <- RSocrata::read.socrata(url)

# Keep a local copy of the download in the working directory.
readr::write_rds(data, "nyc_data.rds")

Question 1

# List the class of every column of `data`.
# `x` (the column index vector) is kept because it is a top-level variable;
# seq_len() yields an empty index instead of c(1, 0) when there are no columns.
x <- seq_len(ncol(data))
# Map over the data frame itself: `data[, x]` would drop to a bare vector for
# a one-column data frame, and map() would then walk elements, not columns.
map(data, class)
$CAMIS
[1] "integer"

$DBA
[1] "factor"

$BORO
[1] "factor"

$BUILDING
[1] "character"

$STREET
[1] "factor"

$ZIPCODE
[1] "integer"

$PHONE
[1] "character"

$CUISINE.DESCRIPTION
[1] "factor"

$INSPECTION.DATE
[1] "POSIXlt" "POSIXt" 

$ACTION
[1] "factor"

$VIOLATION.CODE
[1] "factor"

$VIOLATION.DESCRIPTION
[1] "factor"

$CRITICAL.FLAG
[1] "factor"

$SCORE
[1] "integer"

$GRADE
[1] "factor"

$GRADE.DATE
[1] "POSIXlt" "POSIXt" 

$RECORD.DATE
[1] "POSIXlt" "POSIXt" 

$INSPECTION.TYPE
[1] "factor"

Question 2

#' Convert a POSIXlt vector to Date, leaving anything else untouched
#'
#' @param x Any vector (typically one column of a data frame).
#' @return `as.Date(x)` when `x` inherits from "POSIXlt"; otherwise `x`
#'   unchanged. The value is always returned visibly.
date_conv <- function(x) {
  # inherits() checks the whole class vector rather than only its first entry.
  if (inherits(x, "POSIXlt")) {
    as.Date(x)
  } else {
    x
  }
}

# Run date_conv() over every column, then reassemble the columns as a tibble.
Restro <- data %>%
  map(date_conv) %>%
  as_tibble()

# Show the resulting column types.
str(Restro)
Classes 'tbl_df', 'tbl' and 'data.frame':   436699 obs. of  18 variables:
 $ CAMIS                : int  30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 ...
 $ DBA                  : Factor w/ 20565 levels "1 EAST 66TH STREET KITCHEN",..: 28 28 28 28 28 28 28 28 28 28 ...
 $ BORO                 : Factor w/ 6 levels "BRONX","BROOKLYN",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ BUILDING             : chr  "1007" "1007" "1007" "1007" ...
 $ STREET               : Factor w/ 3305 levels "10 STREET","18 AVENUE",..: 40 40 40 40 40 40 40 40 40 40 ...
 $ ZIPCODE              : int  10462 10462 10462 10462 10462 10462 10462 10462 10462 10462 ...
 $ PHONE                : chr  "7188924968" "7188924968" "7188924968" "7188924968" ...
 $ CUISINE.DESCRIPTION  : Factor w/ 84 levels "American","Bagels/Pretzels",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ INSPECTION.DATE      : Date, format: "2016-02-18" "2016-02-18" ...
 $ ACTION               : Factor w/ 6 levels "Establishment Closed by DOHMH.  Violations were cited in the following area(s) and those requiring immediate action were addres"| __truncated__,..: 4 4 4 4 3 4 4 4 4 4 ...
 $ VIOLATION.CODE       : Factor w/ 97 levels "","02A","02B",..: 13 25 20 35 1 13 15 8 13 18 ...
 $ VIOLATION.DESCRIPTION: Factor w/ 95 levels "","''''No Smoking\032 and/or 'Smoking Permitted\032 sign not conspicuously posted. Health warning not present on 'Smoking Permitte"| __truncated__,..: 14 16 23 33 1 14 17 25 14 35 ...
 $ CRITICAL.FLAG        : Factor w/ 3 levels "Critical","Not Applicable",..: 1 3 1 3 2 1 1 1 1 1 ...
 $ SCORE                : int  10 10 6 2 NA 6 6 32 32 32 ...
 $ GRADE                : Factor w/ 7 levels "","A","B","C",..: 2 2 2 2 1 2 2 1 1 1 ...
 $ GRADE.DATE           : Date, format: "2016-02-18" "2016-02-18" ...
 $ RECORD.DATE          : Date, format: "2016-12-02" "2016-12-02" ...
 $ INSPECTION.TYPE      : Factor w/ 34 levels "Administrative Miscellaneous / Initial Inspection",..: 4 4 4 4 12 5 5 4 4 4 ...

Question 3

# Store the violation text as character so it can be pattern-matched below.
Restro[["VIOLATION.DESCRIPTION"]] <- as.character(Restro[["VIOLATION.DESCRIPTION"]])

# Inspections from 2016 whose violation text mentions "mice" (any letter case).
R_mice <- Restro %>%
  filter(
    format(INSPECTION.DATE, "%Y") == 2016,
    str_detect(VIOLATION.DESCRIPTION, regex("mice", ignore_case = TRUE))
  )

# Count distinct restaurant ids (CAMIS), not violation rows.
paste0("Number of restaurants having Violations mice is ", n_distinct(R_mice$CAMIS))
[1] "Number of restaurants having Violations mice is 5700"
# Inspections from 2016 whose violation text mentions "hair" (any letter case).
R_hair <- Restro %>%
  filter(
    format(INSPECTION.DATE, "%Y") == 2016,
    str_detect(VIOLATION.DESCRIPTION, regex("hair", ignore_case = TRUE))
  )

# Count distinct restaurant ids (CAMIS), not violation rows.
paste0("Number of restaurants having Violations hair is ", n_distinct(R_hair$CAMIS))
[1] "Number of restaurants having Violations hair is 2002"
# Inspections from 2016 whose violation text mentions "sewage" (any letter case).
R_sewage <- Restro %>%
  filter(
    format(INSPECTION.DATE, "%Y") == 2016,
    str_detect(VIOLATION.DESCRIPTION, regex("sewage", ignore_case = TRUE))
  )

# Count distinct restaurant ids (CAMIS), not violation rows.
paste0("Number of restraunts having Violations sewage is ", n_distinct(R_sewage$CAMIS))
[1] "Number of restraunts having Violations sewage is 9473"

Question 4

#' Print and plot the 20 restaurants with the most matching violations
#'
#' @param df Inspection data with the columns INSPECTION.DATE,
#'   VIOLATION.DESCRIPTION, CAMIS and DBA.
#' @param x Inspection year to keep, e.g. 2016.
#' @param y Pattern searched for (ignoring case) in VIOLATION.DESCRIPTION.
#' @return The value of `print()` on the bar chart, invisibly. The top-20
#'   table and the chart are printed as side effects.
top_restaurant <- function(df, x, y) {
  # Build the filter values up front: the year as text (format() yields text)
  # and the pattern from the `y` argument rather than the literal letter "y".
  target_year <- as.character(x)
  keyword <- regex(y, ignore_case = TRUE)

  dt <- df %>%
    filter(format(INSPECTION.DATE, "%Y") == target_year) %>%
    filter(str_detect(as.character(VIOLATION.DESCRIPTION), keyword)) %>%
    group_by(CAMIS, DBA) %>%
    summarise(count = n()) %>%
    # Drop the leftover grouping so the ranking applies to the whole table.
    ungroup() %>%
    arrange(desc(count)) %>%
    head(20)
  print(dt)

  # Horizontal bars, ordered by violation count.
  m <- ggplot(dt, aes(x = reorder(DBA, count), y = count)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    ggtitle("Top 20 Restaurants with most violations") +
    ylab("Violations") +
    xlab(NULL)
  print(m)
}

# Top 20 restaurants for the keyword "sewage" among 2016 inspections.
dm <- top_restaurant(df = Restro, x = 2016, y = "sewage")
Source: local data frame [20 x 3]
Groups: CAMIS [20]

      CAMIS                                   DBA count
      <int>                                <fctr> <int>
1  50045602                             POLKA DOT    33
2  50044942           FOGON LATINO BAR RESTAURANT    31
3  50043402                   ROYAL FRIED CHICKEN    29
4  41429788                             EL AGUILA    27
5  50046492                   I LAND FISH & GRILL    27
6  41683816               MAX BAKERY & RESTAURANT    25
7  50010805                           CAFE AU LEE    25
8  50015997 CARIBBEAN STYLE BAKERY AND RESTAURANT    25
9  40861669                NEW VICTORY RESTAURANT    24
10 41164793       LATINO'S BAR & GRILL RESTAURANT    24
11 41711975                      SZECHUAN GOURMET    24
12 50009773                           LAS LUNITAS    24
13 50012922                  TAIWANESE RESTAURANT    24
14 50044698   II FORNO PIZZA AND PASTA RESTAURANT    24
15 50046745       SAFAYA RESTAURANT & FISH MARKET    24
16 40810174                     BRASSERIE ATHENEE    23
17 41568640                HOT POT UNDER DE' TREE    23
18 41598995             PARAISO AZTECA RESTAURANT    23
19 41612269                          GREEN TOMATO    23
20 41695575                       SABA RESTAURANT    23