Week-7

Week 7 Analysis

Data Importing and Packages

#Analysis from next tab
library(purrr)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

library(tibble)
library(RSocrata)
library(stringr)
library(readr)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:purrr':
## 
##     contains, order_by

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

# Detach plyr again: its attach messages report that it masks dplyr's
# arrange, desc, summarise, ... and the analysis below relies on those verbs.
detach("package:plyr", unload=TRUE)
# Address of the restaurant inspection results dataset on the NYC open data
# portal.
NYC <- 'https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59'
# Fetch the dataset from that address (read.socrata; presumably from the
# RSocrata package loaded above).
NYC_data <- read.socrata(NYC)
# Keep a copy of the fetched data as an RDS file under the relative path
# "NYC_data.rds".
readr::write_rds(NYC_data,"NYC_data.rds")

Question 1

# Look up the class of every column of NYC_data and display the result
NYC_variables <- map(NYC_data, class)
NYC_variables

## $CAMIS
## [1] "integer"
## 
## $DBA
## [1] "factor"
## 
## $BORO
## [1] "factor"
## 
## $BUILDING
## [1] "character"
## 
## $STREET
## [1] "factor"
## 
## $ZIPCODE
## [1] "integer"
## 
## $PHONE
## [1] "character"
## 
## $CUISINE.DESCRIPTION
## [1] "factor"
## 
## $INSPECTION.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $ACTION
## [1] "factor"
## 
## $VIOLATION.CODE
## [1] "factor"
## 
## $VIOLATION.DESCRIPTION
## [1] "factor"
## 
## $CRITICAL.FLAG
## [1] "factor"
## 
## $SCORE
## [1] "integer"
## 
## $GRADE
## [1] "factor"
## 
## $GRADE.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $RECORD.DATE
## [1] "POSIXlt" "POSIXt" 
## 
## $INSPECTION.TYPE
## [1] "factor"

Question 2

#Tibble is the final output
# Convert a POSIXlt date-time vector to Date; pass anything else through.
#
# x: a single column (vector) of the data set.
# Returns: as.Date(x) when x inherits from "POSIXlt", otherwise x unchanged.
date_change <- function(x) {
  # inherits() is the idiomatic class test and does not depend on "POSIXlt"
  # being the first entry of the class vector.
  if (inherits(x, "POSIXlt")) {
    # Return the converted value directly instead of ending on an assignment,
    # so both branches return a visible value.
    as.Date(x)
  } else {
    x
  }
}
# Run date_change() over every column of NYC_data, then gather the columns
# into a tibble and display it.
b <- NYC_data %>%
  map(date_change) %>%
  as_tibble()
b

## # A tibble: 437,151 x 18
##       CAMIS                   DBA   BORO BUILDING          STREET ZIPCODE
##       <int>                <fctr> <fctr>    <chr>          <fctr>   <int>
## 1  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 2  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 3  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 4  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 5  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 6  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 7  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 8  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 9  30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## 10 30075445 MORRIS PARK BAKE SHOP  BRONX     1007 MORRIS PARK AVE   10462
## # ... with 437,141 more rows, and 12 more variables: PHONE <chr>,
## #   CUISINE.DESCRIPTION <fctr>, INSPECTION.DATE <date>, ACTION <fctr>,
## #   VIOLATION.CODE <fctr>, VIOLATION.DESCRIPTION <fctr>,
## #   CRITICAL.FLAG <fctr>, SCORE <int>, GRADE <fctr>, GRADE.DATE <date>,
## #   RECORD.DATE <date>, INSPECTION.TYPE <fctr>

# Store CAMIS as a factor (it is used below to group and count restaurants).
b[["CAMIS"]] <- as.factor(b[["CAMIS"]])

Question 3

# Keep only the inspections whose INSPECTION.DATE falls in 2016.
NYC2016 <- b %>%
  filter(format(INSPECTION.DATE, "%Y") == 2016)
# Rows whose violation description mentions "mice" (any letter case),
# grouped by CAMIS.
NYC2016_mice <- filter(
  group_by(NYC2016, CAMIS),
  str_detect(VIOLATION.DESCRIPTION, regex("mice", ignore_case = TRUE))
)
# Number of distinct restaurants (CAMIS values) with such a violation
n_distinct(NYC2016_mice[["CAMIS"]])

## [1] 5679

# Rows of the 2016 data whose violation description mentions "hair"
# (any letter case), grouped by CAMIS.
NYC2016_hair <- filter(
  group_by(NYC2016, CAMIS),
  str_detect(VIOLATION.DESCRIPTION, regex("hair", ignore_case = TRUE))
)
# Number of distinct restaurants (CAMIS values) with such a violation
n_distinct(NYC2016_hair[["CAMIS"]])

## [1] 1990

# Rows of the 2016 data whose violation description mentions "sewage"
# (any letter case), grouped by CAMIS.
NYC2016_sewage <- filter(
  group_by(NYC2016, CAMIS),
  str_detect(VIOLATION.DESCRIPTION, regex("sewage", ignore_case = TRUE))
)
# Number of distinct restaurants (CAMIS values) with such a violation
n_distinct(NYC2016_sewage[["CAMIS"]])

## [1] 9433

Question 4

# Print and plot the 20 restaurants with the most matching violations.
#
# df: inspection data with INSPECTION.DATE, VIOLATION.DESCRIPTION, CAMIS and
#     DBA columns.
# x:  inspection year to keep; compared with format(INSPECTION.DATE, "%Y").
# y:  pattern searched for, ignoring case, in VIOLATION.DESCRIPTION.
# Returns: the value of print(m), the last expression. The summary table and
#   the bar chart are printed as side effects.
top_restaurant <- function(df, x, y) {
  # Build the pattern from the argument y. The previous code passed the
  # literal string "y", which matched every description containing the
  # letter y regardless of the requested violation.
  violation_pattern <- regex(y, ignore_case = TRUE)
  dt <- df %>%
    filter(format(INSPECTION.DATE, "%Y") == x) %>%
    filter(str_detect(VIOLATION.DESCRIPTION, violation_pattern)) %>%
    group_by(CAMIS, DBA) %>%
    summarise(count = n()) %>%
    arrange(desc(count)) %>%
    head(n = 20)
  print(dt)
  # One bar per restaurant name, ordered by descending count and labelled
  # with the count.
  m <- ggplot(data = dt, aes(x = reorder(DBA, desc(count)), y = count, fill = DBA)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = count), na.rm = TRUE, hjust = 0.3, vjust = -0.8) +
    ggtitle("Top 20 Restaurants with most violations") +
    ylab("Violations") +
    xlab("Top 20 Restaurants") +
    theme(axis.text = element_text(size = 1),
          axis.title = element_text(size = 10, face = "bold")) +
    theme(legend.key.size = unit(2.5, "mm"))
  print(m)
}
# Sample function testing: violations mentioning "hair" among 2015 inspections
ans <- top_restaurant(df = b, x = 2015, y = "hair")

## Source: local data frame [20 x 3]
## Groups: CAMIS [20]
## 
##       CAMIS                                 DBA count
##      <fctr>                              <fctr> <int>
## 1  50033122   PARTY WELL REST & ORIENTAL BAKERY    50
## 2  41475257                    A-WAH RESTAURANT    39
## 3  41459659                       T. K. KITCHEN    37
## 4  41704655                             PADDY'S    36
## 5  41231660 LAS MARAVILLAS DE MEXICO RESTAURANT    35
## 6  50018727                        K ONE BUFFET    35
## 7  41320205                  DAI WAH YUMMY CITY    34
## 8  41683816             MAX BAKERY & RESTAURANT    34
## 9  50017092                   HE LIN RESTAURANT    34
## 10 50032737                       KAM'S KITCHEN    34
## 11 41485393                 MY CORAL RESTAURANT    33
## 12 41510404                            JUSTIN'S    31
## 13 50001637                  YOLANDA RESTAURANT    31
## 14 50032777                       SKYLINE DINER    31
## 15 41583748          YEE MEI FONG TAIWAN BAKERY    30
## 16 40743368                            DOMINO'S    29
## 17 41061893                         BURGER KING    29
## 18 50000855         CROWN FRIED CHICKEN & PIZZA    29
## 19 50014886        NEW LUCKY CHINESE RESTAURANT    29
## 20 41692971                    B BO SING BAKERY    28

Week-7

Ishant Nayer

11/25/2016

Week 7 Analysis

Data Importing and Packages

Question 1

Question 2

Question 3

Question 4