This is the homework report for week 7, produced with R Markdown. It focuses on working with functions and iteration.
library(dplyr)## used to manupulate data
library(tibble) ## used to create tibbles
library(stringr) ## using regx and other string functions
library(tidyverse)#group of packages used to summarise and visualize data
library(tidyr) ## functions which can be used to make data tidy
library(purrr) ## used for iteration functionns like map
library(RSocrata) ## sued for reading socrata API datasets
# Download the NYC restaurant inspection results through the Socrata API.
nyc_rest_data <- RSocrata::read.socrata(
  "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"
)

# Keep a local copy of the downloaded data as an RDS file.
readr::write_rds(
  nyc_rest_data,
  "nycrestaurants.rds"
)
Class of each variable
# List the class vector of every column in the downloaded data.
purrr::map(.x = nyc_rest_data, .f = class)
## $CAMIS
## [1] "integer"
##
## $DBA
## [1] "factor"
##
## $BORO
## [1] "factor"
##
## $BUILDING
## [1] "character"
##
## $STREET
## [1] "factor"
##
## $ZIPCODE
## [1] "integer"
##
## $PHONE
## [1] "character"
##
## $CUISINE.DESCRIPTION
## [1] "factor"
##
## $INSPECTION.DATE
## [1] "POSIXlt" "POSIXt"
##
## $ACTION
## [1] "factor"
##
## $VIOLATION.CODE
## [1] "factor"
##
## $VIOLATION.DESCRIPTION
## [1] "factor"
##
## $CRITICAL.FLAG
## [1] "factor"
##
## $SCORE
## [1] "integer"
##
## $GRADE
## [1] "factor"
##
## $GRADE.DATE
## [1] "POSIXlt" "POSIXt"
##
## $RECORD.DATE
## [1] "POSIXlt" "POSIXt"
##
## $INSPECTION.TYPE
## [1] "factor"
Function to check the data type of a column and convert POSIXlt values to Date
#' Convert a POSIXlt vector to Date, leaving any other input untouched.
#'
#' POSIXlt is not recommended as a data-frame column type, so such values are
#' collapsed to calendar dates with `as.Date()`.
#'
#' @param x Any vector, typically one column of a data frame.
#' @return `as.Date(x)` when `x` inherits from "POSIXlt"; otherwise `x`
#'   unchanged.
date_conv <- function(x) {
  # inherits() tests the whole class vector instead of only its first entry.
  if (inherits(x, "POSIXlt")) {
    # Return the converted value directly so the result is visible to callers;
    # ending the branch with an assignment would return it invisibly.
    as.Date(x)
  } else {
    x
  }
}
# Run date_conv() over every column, then assemble the columns into a tibble.
nyc_rest_tib <- nyc_rest_data %>%
  map(date_conv) %>%
  as_tibble()
# str(nyc_rest_tib)
Violations in NYC Restaurants
# Store the violation text as plain character before pattern matching.
nyc_rest_tib[["VIOLATION.DESCRIPTION"]] <-
  as.character(nyc_rest_tib[["VIOLATION.DESCRIPTION"]])

# 2016 inspection records whose violation description mentions "mice".
a <- filter(
  nyc_rest_tib,
  format(INSPECTION.DATE, "%Y") == 2016,
  str_detect(VIOLATION.DESCRIPTION, regex("mice", ignore_case = TRUE))
)
Number of restaurants with mice violations is 5697
# 2016 inspection records whose violation description mentions "hair".
b <- filter(
  nyc_rest_tib,
  format(INSPECTION.DATE, "%Y") == 2016,
  str_detect(VIOLATION.DESCRIPTION, regex("hair", ignore_case = TRUE))
)
Number of restaurants with hair violations is 2000
# 2016 inspection records whose violation description mentions "sewage".
c <- filter(
  nyc_rest_tib,
  format(INSPECTION.DATE, "%Y") == 2016,
  str_detect(VIOLATION.DESCRIPTION, regex("sewage", ignore_case = TRUE))
)
Number of restaurants with sewage violations is 9468
Top 20 Restaurants by Number of Violations
#' Print and plot the 20 restaurants with the most matching violations.
#'
#' Keeps the rows of `df` inspected in year `x` whose violation description
#' matches `y`, counts the remaining rows per restaurant (CAMIS, DBA), and
#' shows the 20 largest counts as a printed table and a bar chart.
#'
#' @param df Inspection data with columns INSPECTION.DATE,
#'   VIOLATION.DESCRIPTION, CAMIS and DBA.
#' @param x Inspection year to keep, e.g. 2015.
#' @param y Pattern searched for in VIOLATION.DESCRIPTION, matched as a
#'   case-insensitive regular expression.
#' @return Whatever `print()` returns for the ggplot object; the function is
#'   called for its side effects (printed table and drawn plot).
top_restaurant <- function(df, x, y) {
  dt <- df %>%
    # format() yields a character year, so compare against a character value.
    filter(format(INSPECTION.DATE, "%Y") == as.character(x)) %>%
    # Match on the pattern argument itself; a quoted "y" would only look for
    # the letter y and ignore what the caller asked for.
    filter(str_detect(VIOLATION.DESCRIPTION, regex(y, ignore_case = TRUE))) %>%
    group_by(CAMIS, DBA) %>%
    summarise(count = n()) %>%
    # Drop the remaining grouping so the ordering below is over all rows.
    ungroup() %>%
    arrange(desc(count)) %>%
    head(20)
  print(dt)

  m <- ggplot(
    data = dt,
    aes(x = reorder(DBA, desc(count)), y = count, fill = DBA)
  ) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = count), na.rm = TRUE, hjust = 0.3, vjust = -0.8) +
    ggtitle("Restaurants with most violations") +
    ylab("# of Violations") +
    theme(
      axis.text = element_text(size = 1),
      axis.title = element_text(size = 5, face = "bold")
    ) +
    theme(legend.key.size = unit(.5, "mm"))
  print(m)
}
top_restaurant(nyc_rest_tib,2015,"mice")
## Source: local data frame [20 x 3]
## Groups: CAMIS [20]
##
## CAMIS DBA count
## <int> <fctr> <int>
## 1 50033122 PARTY WELL REST & ORIENTAL BAKERY 50
## 2 41475257 A-WAH RESTAURANT 39
## 3 41459659 T. K. KITCHEN 37
## 4 41704655 PADDY'S 36
## 5 41231660 LAS MARAVILLAS DE MEXICO RESTAURANT 35
## 6 50018727 K ONE BUFFET 35
## 7 41320205 DAI WAH YUMMY CITY 34
## 8 41683816 MAX BAKERY & RESTAURANT 34
## 9 50017092 HE LIN RESTAURANT 34
## 10 50032737 KAM'S KITCHEN 34
## 11 41485393 MY CORAL RESTAURANT 33
## 12 41510404 JUSTIN'S 31
## 13 50001637 YOLANDA RESTAURANT 31
## 14 50032777 SKYLINE DINER 31
## 15 41583748 YEE MEI FONG TAIWAN BAKERY 30
## 16 40743368 DOMINO'S 29
## 17 41061893 BURGER KING 29
## 18 50000855 CROWN FRIED CHICKEN & PIZZA 29
## 19 50014886 NEW LUCKY CHINESE RESTAURANT 29
## 20 41692971 B BO SING BAKERY 28