This is the week-7 homework for the BANA 8090 Data Wrangling With R course. This week focuses on writing functions.
library(RSocrata)
library(tidyverse)
library(dplyr)
library(stringr)
# NOTE(review): hard-coded, machine-specific working directory; the relative
# path "nyc_data.rds" below is resolved against it. Consider a project-relative
# path instead.
setwd("C:/Manisha_Arora/UC-BANA/Sem1/Data Wrangling R - BB/Data Wrangling with R (BANA 8090)/Week-7")

# Fetch the inspection results through RSocrata and keep a local RDS copy.
url <- "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"
data <- read.socrata(url)
readr::write_rds(data, "nyc_data.rds")

# List the class of every column.
# seq_len() stays empty for a zero-column frame (1:ncol() would give c(1, 0)),
# and drop = FALSE keeps a one-column selection a data frame so map() still
# iterates over columns rather than over the values of a single vector.
x <- seq_len(ncol(data))
map(data[, x, drop = FALSE], class)
$CAMIS
[1] "integer"
$DBA
[1] "factor"
$BORO
[1] "factor"
$BUILDING
[1] "character"
$STREET
[1] "factor"
$ZIPCODE
[1] "integer"
$PHONE
[1] "character"
$CUISINE.DESCRIPTION
[1] "factor"
$INSPECTION.DATE
[1] "POSIXlt" "POSIXt"
$ACTION
[1] "factor"
$VIOLATION.CODE
[1] "factor"
$VIOLATION.DESCRIPTION
[1] "factor"
$CRITICAL.FLAG
[1] "factor"
$SCORE
[1] "integer"
$GRADE
[1] "factor"
$GRADE.DATE
[1] "POSIXlt" "POSIXt"
$RECORD.DATE
[1] "POSIXlt" "POSIXt"
$INSPECTION.TYPE
[1] "factor"
# Convert a POSIXlt vector to Date; hand back any other input unchanged.
#
# Args:
#   x: a vector, typically one column of a data frame.
#
# Returns:
#   as.Date(x) when x inherits from "POSIXlt", otherwise x itself.
date_conv <- function(x) {
  # inherits() also matches when "POSIXlt" is not the first class entry.
  if (inherits(x, "POSIXlt")) {
    as.Date(x)
  } else {
    x
  }
}
# Run date_conv over every column of the raw data and collect the converted
# columns into a tibble.
Restro <- data %>%
  map(date_conv) %>%
  as_tibble()

# Display the structure of the converted table.
str(Restro)
Classes 'tbl_df', 'tbl' and 'data.frame': 436699 obs. of 18 variables:
$ CAMIS : int 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 ...
$ DBA : Factor w/ 20565 levels "1 EAST 66TH STREET KITCHEN",..: 28 28 28 28 28 28 28 28 28 28 ...
$ BORO : Factor w/ 6 levels "BRONX","BROOKLYN",..: 1 1 1 1 1 1 1 1 1 1 ...
$ BUILDING : chr "1007" "1007" "1007" "1007" ...
$ STREET : Factor w/ 3305 levels "10 STREET","18 AVENUE",..: 40 40 40 40 40 40 40 40 40 40 ...
$ ZIPCODE : int 10462 10462 10462 10462 10462 10462 10462 10462 10462 10462 ...
$ PHONE : chr "7188924968" "7188924968" "7188924968" "7188924968" ...
$ CUISINE.DESCRIPTION : Factor w/ 84 levels "American","Bagels/Pretzels",..: 3 3 3 3 3 3 3 3 3 3 ...
$ INSPECTION.DATE : Date, format: "2016-02-18" "2016-02-18" ...
$ ACTION : Factor w/ 6 levels "Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addres"| __truncated__,..: 4 4 4 4 3 4 4 4 4 4 ...
$ VIOLATION.CODE : Factor w/ 97 levels "","02A","02B",..: 13 25 20 35 1 13 15 8 13 18 ...
$ VIOLATION.DESCRIPTION: Factor w/ 95 levels "","''''No Smoking\032 and/or 'Smoking Permitted\032 sign not conspicuously posted. Health warning not present on 'Smoking Permitte"| __truncated__,..: 14 16 23 33 1 14 17 25 14 35 ...
$ CRITICAL.FLAG : Factor w/ 3 levels "Critical","Not Applicable",..: 1 3 1 3 2 1 1 1 1 1 ...
$ SCORE : int 10 10 6 2 NA 6 6 32 32 32 ...
$ GRADE : Factor w/ 7 levels "","A","B","C",..: 2 2 2 2 1 2 2 1 1 1 ...
$ GRADE.DATE : Date, format: "2016-02-18" "2016-02-18" ...
$ RECORD.DATE : Date, format: "2016-12-02" "2016-12-02" ...
$ INSPECTION.TYPE : Factor w/ 34 levels "Administrative Miscellaneous / Initial Inspection",..: 4 4 4 4 12 5 5 4 4 4 ...
# Hold the violation text as character rather than factor.
Restro[["VIOLATION.DESCRIPTION"]] <-
  as.character(Restro[["VIOLATION.DESCRIPTION"]])

# Rows from 2016 inspections whose description mentions "mice" (any case).
R_mice <- filter(
  Restro,
  format(INSPECTION.DATE, "%Y") == 2016,
  str_detect(VIOLATION.DESCRIPTION, regex("mice", ignore_case = TRUE))
)

# Report how many distinct establishments (CAMIS ids) are in that subset.
paste0(
  "Number of restaurants having Violations mice is ",
  n_distinct(R_mice$CAMIS)
)
[1] "Number of restaurants having Violations mice is 5700"
# Rows from 2016 inspections whose description mentions "hair" (any case).
R_hair <- filter(
  Restro,
  format(INSPECTION.DATE, "%Y") == 2016 &
    str_detect(VIOLATION.DESCRIPTION, regex("hair", ignore_case = TRUE))
)

# Report how many distinct establishments (CAMIS ids) are in that subset.
paste0(
  "Number of restaurants having Violations hair is ",
  n_distinct(R_hair$CAMIS)
)
[1] "Number of restaurants having Violations hair is 2002"
# Rows from 2016 inspections whose description mentions "sewage" (any case).
R_sewage <- Restro %>%
  filter(format(INSPECTION.DATE, "%Y") == 2016) %>%
  filter(str_detect(VIOLATION.DESCRIPTION, regex("sewage", ignore_case = TRUE)))

# Report how many distinct establishments (CAMIS ids) are in that subset.
# The message spelling now matches the mice and hair reports.
paste0(
  "Number of restaurants having Violations sewage is ",
  n_distinct(R_sewage$CAMIS)
)
[1] "Number of restraunts having Violations sewage is 9473"
# Tabulate and plot the 20 restaurants with the most matching violations.
#
# Args:
#   df: data frame with the columns INSPECTION.DATE (formatted with "%Y" to
#       get the year), VIOLATION.DESCRIPTION, CAMIS and DBA.
#   x:  inspection year to keep, as a number or string (e.g. 2016).
#   y:  regular expression searched for, ignoring case, in
#       VIOLATION.DESCRIPTION (e.g. "sewage").
#
# Side effects:
#   Prints the top-20 table and draws a horizontal bar chart.
#
# Returns:
#   The ggplot object, invisibly.
top_restaurant <- function(df, x, y) {
  # Build the row mask outside dplyr so the arguments x and y can never be
  # shadowed by columns of df that happen to carry the same names.
  in_year <- format(df$INSPECTION.DATE, "%Y") == as.character(x)
  # Use the pattern passed in y; the literal "y" would match any description
  # that merely contains the letter y.
  has_term <- str_detect(
    as.character(df$VIOLATION.DESCRIPTION),
    regex(y, ignore_case = TRUE)
  )

  # which() drops rows where either test is NA, as filter() would.
  dt <- df[which(in_year & has_term), , drop = FALSE] %>%
    group_by(CAMIS, DBA) %>%
    summarise(count = n()) %>%
    # Drop the leftover grouping so sorting and head() act on the whole table.
    ungroup() %>%
    arrange(desc(count)) %>%
    head(20)
  print(dt)

  m <- ggplot(dt, aes(x = reorder(DBA, count), y = count)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    ggtitle("Top 20 Restaurants with most violations") +
    ylab("Violations") +
    xlab(NULL)
  print(m)

  invisible(m)
}
# Top 20 restaurants for 2016 using the search term "sewage"; dm keeps the
# value returned by top_restaurant().
dm <- top_restaurant(df = Restro, x = 2016, y = "sewage")
Source: local data frame [20 x 3]
Groups: CAMIS [20]
CAMIS DBA count
<int> <fctr> <int>
1 50045602 POLKA DOT 33
2 50044942 FOGON LATINO BAR RESTAURANT 31
3 50043402 ROYAL FRIED CHICKEN 29
4 41429788 EL AGUILA 27
5 50046492 I LAND FISH & GRILL 27
6 41683816 MAX BAKERY & RESTAURANT 25
7 50010805 CAFE AU LEE 25
8 50015997 CARIBBEAN STYLE BAKERY AND RESTAURANT 25
9 40861669 NEW VICTORY RESTAURANT 24
10 41164793 LATINO'S BAR & GRILL RESTAURANT 24
11 41711975 SZECHUAN GOURMET 24
12 50009773 LAS LUNITAS 24
13 50012922 TAIWANESE RESTAURANT 24
14 50044698 II FORNO PIZZA AND PASTA RESTAURANT 24
15 50046745 SAFAYA RESTAURANT & FISH MARKET 24
16 40810174 BRASSERIE ATHENEE 23
17 41568640 HOT POT UNDER DE' TREE 23
18 41598995 PARAISO AZTECA RESTAURANT 23
19 41612269 GREEN TOMATO 23
20 41695575 SABA RESTAURANT 23