Data Wrangling in R : Week-7 Assignment

Shreya Ghelani

2016-12-03


Loading packages

Loading Data

#nyc_data <- read.socrata("https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59")
#write_rds(nyc_data, path="NYC_Restaurant_Data.rds")
nyc_data <- read_rds("NYC_Restaurant_Data.rds")

A quick glance at the data

names(nyc_data)
##  [1] "CAMIS"                 "DBA"                  
##  [3] "BORO"                  "BUILDING"             
##  [5] "STREET"                "ZIPCODE"              
##  [7] "PHONE"                 "CUISINE.DESCRIPTION"  
##  [9] "INSPECTION.DATE"       "ACTION"               
## [11] "VIOLATION.CODE"        "VIOLATION.DESCRIPTION"
## [13] "CRITICAL.FLAG"         "SCORE"                
## [15] "GRADE"                 "GRADE.DATE"           
## [17] "RECORD.DATE"           "INSPECTION.TYPE"
dim(nyc_data)
## [1] 436612     18
kable(str(nyc_data))
## 'data.frame':    436612 obs. of  18 variables:
##  $ CAMIS                : int  41606387 50007091 50012185 40750062 50034621 50007874 40552965 41650546 41524468 41435999 ...
##  $ DBA                  : chr  "LA CUARTA RESTAURANT" "NEW PEKING RESTAURANT" "CASTILLO RESTAURNAT" "PANEANTICO BAKERY" ...
##  $ BORO                 : chr  "BROOKLYN" "BROOKLYN" "BROOKLYN" "BROOKLYN" ...
##  $ BUILDING             : chr  "782" "1581" "709" "9124" ...
##  $ STREET               : chr  "4 AVENUE" "FLATBUSH AVE" "5TH AVE" "3 AVENUE" ...
##  $ ZIPCODE              : int  11232 11210 11215 11209 11355 10465 10302 11432 10029 11209 ...
##  $ PHONE                : chr  "7183694964" "7188591554" "7184994910" "7186802347" ...
##  $ CUISINE.DESCRIPTION  : chr  "Peruvian" "Chinese" "Spanish" "Bakery" ...
##  $ INSPECTION.DATE      : POSIXct, format: "2016-02-16" "2014-11-13" ...
##  $ ACTION               : chr  "Violations were cited in the following area(s)." "Violations were cited in the following area(s)." "Violations were cited in the following area(s)." "Violations were cited in the following area(s)." ...
##  $ VIOLATION.CODE       : chr  "04N" "02G" "04N" "06C" ...
##  $ VIOLATION.DESCRIPTION: chr  "Filth flies or food/refuse/sewage-associated (FRSA) flies present in facility\032s food and/or non-food areas. Filth flies incl"| __truncated__ "Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) except during necessary preparati"| __truncated__ "Filth flies or food/refuse/sewage-associated (FRSA) flies present in facility\032s food and/or non-food areas. Filth flies incl"| __truncated__ "Food not protected from potential source of contamination during storage, preparation, transportation, display or service." ...
##  $ CRITICAL.FLAG        : chr  "Critical" "Critical" "Critical" "Critical" ...
##  $ SCORE                : int  17 7 20 31 17 28 9 27 NA 12 ...
##  $ GRADE                : chr  "" "A" "" "" ...
##  $ GRADE.DATE           : POSIXct, format: NA "2014-11-13" ...
##  $ RECORD.DATE          : POSIXct, format: "2016-12-02" "2016-12-02" ...
##  $ INSPECTION.TYPE      : chr  "Cycle Inspection / Initial Inspection" "Cycle Inspection / Re-inspection" "Cycle Inspection / Initial Inspection" "Cycle Inspection / Initial Inspection" ...

HomeWork Questions

Question 1 - Finding the class of each column

kable(map(nyc_data,class))
integer
character
character
character
character
integer
character
character
POSIXct
POSIXt
character
character
character
character
integer
character
POSIXct
POSIXt
POSIXct
POSIXt
character

Question 2 - Writing a function to format the date columns

format_date <- function(x)
{
if (class(x) == "POSIXct" || class(x) == "POSIXt")
       x <- as.Date(x)
return(x)
}

nyc_data_formatted <- as_tibble(map(nyc_data,format_date))
nyc_data_formatted
## # A tibble: 436,612 × 18
##       CAMIS                      DBA          BORO BUILDING
##       <int>                    <chr>         <chr>    <chr>
## 1  41606387     LA CUARTA RESTAURANT      BROOKLYN      782
## 2  50007091    NEW PEKING RESTAURANT      BROOKLYN     1581
## 3  50012185      CASTILLO RESTAURNAT      BROOKLYN      709
## 4  40750062        PANEANTICO BAKERY      BROOKLYN     9124
## 5  50034621             SHI LI XIANG        QUEENS    13358
## 6  50007874             vapor lounge         BRONX     3758
## 7  40552965 GROUND LEVEL  PUB & GRUB STATEN ISLAND      958
## 8  41650546               KING KABAB        QUEENS    16709
## 9  41524468                STARBUCKS     MANHATTAN     1491
## 10 41435999          YOUR HOUSE CAFE      BROOKLYN     6916
## # ... with 436,602 more rows, and 14 more variables: STREET <chr>,
## #   ZIPCODE <int>, PHONE <chr>, CUISINE.DESCRIPTION <chr>,
## #   INSPECTION.DATE <date>, ACTION <chr>, VIOLATION.CODE <chr>,
## #   VIOLATION.DESCRIPTION <chr>, CRITICAL.FLAG <chr>, SCORE <int>,
## #   GRADE <chr>, GRADE.DATE <date>, RECORD.DATE <date>,
## #   INSPECTION.TYPE <chr>
kable(str(nyc_data_formatted))
## Classes 'tbl_df', 'tbl' and 'data.frame':    436612 obs. of  18 variables:
##  $ CAMIS                : int  41606387 50007091 50012185 40750062 50034621 50007874 40552965 41650546 41524468 41435999 ...
##  $ DBA                  : chr  "LA CUARTA RESTAURANT" "NEW PEKING RESTAURANT" "CASTILLO RESTAURNAT" "PANEANTICO BAKERY" ...
##  $ BORO                 : chr  "BROOKLYN" "BROOKLYN" "BROOKLYN" "BROOKLYN" ...
##  $ BUILDING             : chr  "782" "1581" "709" "9124" ...
##  $ STREET               : chr  "4 AVENUE" "FLATBUSH AVE" "5TH AVE" "3 AVENUE" ...
##  $ ZIPCODE              : int  11232 11210 11215 11209 11355 10465 10302 11432 10029 11209 ...
##  $ PHONE                : chr  "7183694964" "7188591554" "7184994910" "7186802347" ...
##  $ CUISINE.DESCRIPTION  : chr  "Peruvian" "Chinese" "Spanish" "Bakery" ...
##  $ INSPECTION.DATE      : Date, format: "2016-02-16" "2014-11-13" ...
##  $ ACTION               : chr  "Violations were cited in the following area(s)." "Violations were cited in the following area(s)." "Violations were cited in the following area(s)." "Violations were cited in the following area(s)." ...
##  $ VIOLATION.CODE       : chr  "04N" "02G" "04N" "06C" ...
##  $ VIOLATION.DESCRIPTION: chr  "Filth flies or food/refuse/sewage-associated (FRSA) flies present in facility\032s food and/or non-food areas. Filth flies incl"| __truncated__ "Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) except during necessary preparati"| __truncated__ "Filth flies or food/refuse/sewage-associated (FRSA) flies present in facility\032s food and/or non-food areas. Filth flies incl"| __truncated__ "Food not protected from potential source of contamination during storage, preparation, transportation, display or service." ...
##  $ CRITICAL.FLAG        : chr  "Critical" "Critical" "Critical" "Critical" ...
##  $ SCORE                : int  17 7 20 31 17 28 9 27 NA 12 ...
##  $ GRADE                : chr  "" "A" "" "" ...
##  $ GRADE.DATE           : Date, format: NA "2014-11-13" ...
##  $ RECORD.DATE          : Date, format: "2016-12-02" "2016-12-02" ...
##  $ INSPECTION.TYPE      : chr  "Cycle Inspection / Initial Inspection" "Cycle Inspection / Re-inspection" "Cycle Inspection / Initial Inspection" "Cycle Inspection / Initial Inspection" ...

Question 3 - Analysis of different types of violations in 2016

nyc_violations_mice <- nyc_data_formatted %>% filter(year(nyc_data_formatted$INSPECTION.DATE)==2016) %>% 
                       filter(str_detect(VIOLATION.DESCRIPTION, ignore.case("mice")))
                     
nyc_violations_hair <- nyc_data_formatted %>% filter(year(nyc_data_formatted$INSPECTION.DATE)==2016) %>% 
  filter(str_detect(VIOLATION.DESCRIPTION, ignore.case("hair")))

nyc_violations_sewage <- nyc_data_formatted %>% filter(year(nyc_data_formatted$INSPECTION.DATE)==2016) %>% 
  filter(str_detect(VIOLATION.DESCRIPTION, ignore.case("sewage")))

violations_2016 <- list()
violations_2016["mice"] <- nrow(nyc_violations_mice)
violations_2016["hair"] <- nrow(nyc_violations_hair)
violations_2016["sewage"] <- nrow(nyc_violations_sewage)

paste("There were",violations_2016[["mice"]], " mice violations,", violations_2016[["hair"]], " hair violations and ", violations_2016[["sewage"]], " sewage violations in 2016")
## [1] "There were 8283  mice violations, 2132  hair violations and  13671  sewage violations in 2016"

Question 4 - Function to plot a bar graph of number of different violations

check_violations <- function(year,expression)
{
    violations_data <- nyc_data_formatted %>% filter(year(nyc_data_formatted$INSPECTION.DATE)==year) %>% 
    filter(str_detect(VIOLATION.DESCRIPTION, ignore.case(expression))) %>% group_by(DBA) %>%
    summarise(n=n()) %>% top_n(20) 
    ggplot(data=violations_data, aes(y=n,x=reorder(DBA,-n),fill="firebrick")) + geom_bar(stat="identity") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))+
    ggtitle(paste("NYC restaurants with the most violation counts for",expression))+
    labs(x="Restaurant", y ="Violation Count")
    
}

check_violations(2016,"mice")

check_violations(2016,"hair")

check_violations(2016,"sewage")