Week 7 Analysis
Data Importing and Packages
#Analysis from next tab
library(purrr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
library(tibble)
library(RSocrata)
library(stringr)
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:purrr':
##
## contains, order_by
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
detach("package:plyr", unload=TRUE)
# Socrata URL of the DOHMH New York City restaurant inspection results
NYC <- "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"

# Download the data set from the URL above
NYC_data <- RSocrata::read.socrata(url = NYC)

# Keep a local copy of the download in the working directory
readr::write_rds(x = NYC_data, "NYC_data.rds")
Question 1
# Collect the class vector of every column of NYC_data in a named list
NYC_variables <- map(NYC_data, class)
NYC_variables
## $CAMIS
## [1] "integer"
##
## $DBA
## [1] "factor"
##
## $BORO
## [1] "factor"
##
## $BUILDING
## [1] "character"
##
## $STREET
## [1] "factor"
##
## $ZIPCODE
## [1] "integer"
##
## $PHONE
## [1] "character"
##
## $CUISINE.DESCRIPTION
## [1] "factor"
##
## $INSPECTION.DATE
## [1] "POSIXlt" "POSIXt"
##
## $ACTION
## [1] "factor"
##
## $VIOLATION.CODE
## [1] "factor"
##
## $VIOLATION.DESCRIPTION
## [1] "factor"
##
## $CRITICAL.FLAG
## [1] "factor"
##
## $SCORE
## [1] "integer"
##
## $GRADE
## [1] "factor"
##
## $GRADE.DATE
## [1] "POSIXlt" "POSIXt"
##
## $RECORD.DATE
## [1] "POSIXlt" "POSIXt"
##
## $INSPECTION.TYPE
## [1] "factor"
Question 2
# The final output is a tibble
#' Convert a POSIXlt vector to Date, leaving any other input untouched
#'
#' @param x Any object, typically one column of a data frame.
#' @return `as.Date(x)` when `x` inherits from "POSIXlt"; otherwise `x`
#'   unchanged. The value is always returned visibly.
date_change<-function(x) {
  # inherits() is the idiomatic class test and does not depend on the
  # position of "POSIXlt" inside the class vector.
  if (inherits(x, "POSIXlt")) {
    # Return the converted value directly: ending the branch with an
    # assignment would make the function return it invisibly.
    as.Date(x)
  } else {
    x
  }
}
# Apply date_change to every column of NYC_data, then assemble the
# resulting list of columns into a tibble
b <- NYC_data %>%
  map(date_change) %>%
  as_tibble()
b
## # A tibble: 437,151 x 18
## CAMIS DBA BORO BUILDING STREET ZIPCODE
## <int> <fctr> <fctr> <chr> <fctr> <int>
## 1 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 2 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 3 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 4 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 5 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 6 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 7 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 8 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 9 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## 10 30075445 MORRIS PARK BAKE SHOP BRONX 1007 MORRIS PARK AVE 10462
## # ... with 437,141 more rows, and 12 more variables: PHONE <chr>,
## # CUISINE.DESCRIPTION <fctr>, INSPECTION.DATE <date>, ACTION <fctr>,
## # VIOLATION.CODE <fctr>, VIOLATION.DESCRIPTION <fctr>,
## # CRITICAL.FLAG <fctr>, SCORE <int>, GRADE <fctr>, GRADE.DATE <date>,
## # RECORD.DATE <date>, INSPECTION.TYPE <fctr>
# Store CAMIS as a factor instead of an integer
b[["CAMIS"]] <- as.factor(b[["CAMIS"]])
Question 3
# Keep only the rows whose inspection date falls in 2016
NYC2016 <- b %>%
  filter(format(INSPECTION.DATE, "%Y") == "2016")

# 2016 rows whose violation description mentions "mice" (case-insensitive);
# the result stays grouped by CAMIS
NYC2016_mice <- filter(
  group_by(NYC2016, CAMIS),
  str_detect(VIOLATION.DESCRIPTION, regex("mice", ignore_case = TRUE))
)

# Number of distinct restaurants (CAMIS values) with at least one such row
n_distinct(NYC2016_mice[["CAMIS"]])
## [1] 5679
# 2016 rows whose violation description mentions "hair" (case-insensitive);
# the result stays grouped by CAMIS
NYC2016_hair <- filter(
  group_by(NYC2016, CAMIS),
  str_detect(VIOLATION.DESCRIPTION, regex("hair", ignore_case = TRUE))
)

# Number of distinct restaurants (CAMIS values) with at least one such row
n_distinct(NYC2016_hair[["CAMIS"]])
## [1] 1990
# 2016 rows whose violation description mentions "sewage" (case-insensitive);
# the result stays grouped by CAMIS
NYC2016_sewage <- filter(
  group_by(NYC2016, CAMIS),
  str_detect(VIOLATION.DESCRIPTION, regex("sewage", ignore_case = TRUE))
)

# Number of distinct restaurants (CAMIS values) with at least one such row
n_distinct(NYC2016_sewage[["CAMIS"]])
## [1] 9433
Question 4
# Define a function that reports the top 20 restaurants with the most violations
#' Print and plot the 20 restaurants with the most matching violations
#'
#' @param df Inspection data with the columns INSPECTION.DATE,
#'   VIOLATION.DESCRIPTION, CAMIS and DBA.
#' @param x Inspection year to keep, e.g. 2015.
#' @param y Search term matched case-insensitively against
#'   VIOLATION.DESCRIPTION; it is passed to stringr::regex(), so it is
#'   interpreted as a regular expression.
#' @return The value of `print(m)`, where `m` is the bar chart. The top-20
#'   table and the chart are printed as side effects.
top_restaurant<-function(df,x,y){
  # Build the pattern from the argument `y`. The previous code used the
  # literal string "y", which matched every description containing that
  # letter and ignored the requested search term.
  pattern <- regex(y, ignore_case = TRUE)

  dt <- df %>%
    filter(format(INSPECTION.DATE, "%Y") == as.character(x)) %>%
    filter(str_detect(VIOLATION.DESCRIPTION, pattern)) %>%
    group_by(CAMIS, DBA) %>%
    summarise(count = n()) %>%
    # summarise() drops only the last grouping level; remove the rest so the
    # sort and the top-20 cut apply to the whole table.
    ungroup() %>%
    arrange(desc(count)) %>%
    head(n = 20)
  print(dt)

  # One bar per restaurant name, ordered from most to fewest violations
  m <- ggplot(data = dt,
              aes(x = reorder(DBA, desc(count)), y = count, fill = DBA)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = count), na.rm = TRUE, hjust = 0.3, vjust = -0.8) +
    ggtitle("Top 20 Restaurants with most violations") +
    ylab("Violations") +
    xlab("Top 20 Restaurants") +
    theme(axis.text = element_text(size = 1),
          axis.title = element_text(size = 10, face = "bold")) +
    theme(legend.key.size = unit(2.5, "mm"))
  print(m)
}
# Sample function testing
ans <- top_restaurant(df = b, x = 2015, y = "hair")
## Source: local data frame [20 x 3]
## Groups: CAMIS [20]
##
## CAMIS DBA count
## <fctr> <fctr> <int>
## 1 50033122 PARTY WELL REST & ORIENTAL BAKERY 50
## 2 41475257 A-WAH RESTAURANT 39
## 3 41459659 T. K. KITCHEN 37
## 4 41704655 PADDY'S 36
## 5 41231660 LAS MARAVILLAS DE MEXICO RESTAURANT 35
## 6 50018727 K ONE BUFFET 35
## 7 41320205 DAI WAH YUMMY CITY 34
## 8 41683816 MAX BAKERY & RESTAURANT 34
## 9 50017092 HE LIN RESTAURANT 34
## 10 50032737 KAM'S KITCHEN 34
## 11 41485393 MY CORAL RESTAURANT 33
## 12 41510404 JUSTIN'S 31
## 13 50001637 YOLANDA RESTAURANT 31
## 14 50032777 SKYLINE DINER 31
## 15 41583748 YEE MEI FONG TAIWAN BAKERY 30
## 16 40743368 DOMINO'S 29
## 17 41061893 BURGER KING 29
## 18 50000855 CROWN FRIED CHICKEN & PIZZA 29
## 19 50014886 NEW LUCKY CHINESE RESTAURANT 29
## 20 41692971 B BO SING BAKERY 28
