This is my homework report for week 7, produced with R Markdown. In this homework I work with functions and iteration.
# NOTE(review): hard-coded, machine-specific working directory; the .rds file
# written below is saved relative to it.
setwd("C:\\tauseef\\data_wrangling\\Data Wrangling with R (BANA 8090)")

library(dplyr)     # used to manipulate data
library(tibble)    # used to create tibbles
library(stringr)   # regex and other string functions
library(tidyverse) # group of packages used to summarise and visualize data
library(tidyr)     # functions which can be used to make data tidy
library(purrr)     # used for iteration functions like map
library(RSocrata)  # used for reading Socrata API datasets

# Download the NYC restaurant inspection results from the Socrata endpoint.
nyc_restra_data <- RSocrata::read.socrata(
  "https://nycopendata.socrata.com/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59"
)

# Keep a local copy of the download as an RDS file.
readr::write_rds(nyc_restra_data, "nyc_restra_data.rds")

# Collect the class vector of every column and display it.
data_typ <- nyc_restra_data %>%
  map(class)
print(data_typ)
## $CAMIS
## [1] "integer"
##
## $DBA
## [1] "factor"
##
## $BORO
## [1] "factor"
##
## $BUILDING
## [1] "character"
##
## $STREET
## [1] "factor"
##
## $ZIPCODE
## [1] "integer"
##
## $PHONE
## [1] "character"
##
## $CUISINE.DESCRIPTION
## [1] "factor"
##
## $INSPECTION.DATE
## [1] "POSIXlt" "POSIXt"
##
## $ACTION
## [1] "factor"
##
## $VIOLATION.CODE
## [1] "factor"
##
## $VIOLATION.DESCRIPTION
## [1] "factor"
##
## $CRITICAL.FLAG
## [1] "factor"
##
## $SCORE
## [1] "integer"
##
## $GRADE
## [1] "factor"
##
## $GRADE.DATE
## [1] "POSIXlt" "POSIXt"
##
## $RECORD.DATE
## [1] "POSIXlt" "POSIXt"
##
## $INSPECTION.TYPE
## [1] "factor"
# Convert a POSIXlt date-time vector to Date; return any other input unchanged.
#
# Arguments:
#   x  A vector (typically a single data-frame column).
#
# Returns: as.Date(x) when x inherits from "POSIXlt", otherwise x itself.
date_typ_conv <- function(x) {
  # inherits() tests the whole class vector, so the check does not depend on
  # "POSIXlt" being the first class entry.
  if (inherits(x, "POSIXlt")) {
    as.Date(x)
  } else {
    x
  }
}
# Run the date conversion over every column, then reassemble the columns
# into a tibble and show its structure.
b <- nyc_restra_data %>%
  map(date_typ_conv) %>%
  as_tibble()
str(b)
## Classes 'tbl_df', 'tbl' and 'data.frame': 436612 obs. of 18 variables:
## $ CAMIS : int 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 30075445 ...
## $ DBA : Factor w/ 20551 levels "1 EAST 66TH STREET KITCHEN",..: 28 28 28 28 28 28 28 28 28 28 ...
## $ BORO : Factor w/ 6 levels "BRONX","BROOKLYN",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BUILDING : chr "1007" "1007" "1007" "1007" ...
## $ STREET : Factor w/ 3304 levels "10 STREET","18 AVENUE",..: 40 40 40 40 40 40 40 40 40 40 ...
## $ ZIPCODE : int 10462 10462 10462 10462 10462 10462 10462 10462 10462 10462 ...
## $ PHONE : chr "7188924968" "7188924968" "7188924968" "7188924968" ...
## $ CUISINE.DESCRIPTION : Factor w/ 84 levels "American","Bagels/Pretzels",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ INSPECTION.DATE : Date, format: "2016-02-18" "2016-02-18" ...
## $ ACTION : Factor w/ 6 levels "Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addres"| __truncated__,..: 4 4 4 4 3 4 4 4 4 4 ...
## $ VIOLATION.CODE : Factor w/ 97 levels "","02A","02B",..: 13 25 20 35 1 13 15 8 13 18 ...
## $ VIOLATION.DESCRIPTION: Factor w/ 95 levels "","''''No Smoking\032 and/or 'Smoking Permitted\032 sign not conspicuously posted. Health warning not present on 'Smoking Permitte"| __truncated__,..: 14 16 23 33 1 14 17 25 14 35 ...
## $ CRITICAL.FLAG : Factor w/ 3 levels "Critical","Not Applicable",..: 1 3 1 3 2 1 1 1 1 1 ...
## $ SCORE : int 10 10 6 2 NA 6 6 32 32 32 ...
## $ GRADE : Factor w/ 7 levels "","A","B","C",..: 2 2 2 2 1 2 2 1 1 1 ...
## $ GRADE.DATE : Date, format: "2016-02-18" "2016-02-18" ...
## $ RECORD.DATE : Date, format: "2016-12-02" "2016-12-02" ...
## $ INSPECTION.TYPE : Factor w/ 34 levels "Administrative Miscellaneous / Initial Inspection",..: 4 4 4 4 12 5 5 4 4 4 ...
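The following chunks count the distinct restaurants with 2016 inspection records whose violation description mentions "mice", "hair", or "sewage":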
# Store the violation text as plain character strings.
b <- b %>%
  mutate(VIOLATION.DESCRIPTION = as.character(VIOLATION.DESCRIPTION))

# Rows inspected in 2016 whose violation text mentions "mice" (any case).
c <- b %>%
  filter(
    format(INSPECTION.DATE, "%Y") == 2016,
    str_detect(VIOLATION.DESCRIPTION, regex("mice", ignore_case = TRUE))
  )

# Report the number of distinct CAMIS values among those rows.
paste0("Number of restraunts having Violations mice is ", n_distinct(c$CAMIS))
## [1] "Number of restraunts having Violations mice is 5697"
# Rows inspected in 2016 whose violation text mentions "hair" (any case).
d <- b %>%
  filter(
    format(INSPECTION.DATE, "%Y") == 2016,
    str_detect(VIOLATION.DESCRIPTION, regex("hair", ignore_case = TRUE))
  )

# Report the number of distinct CAMIS values among those rows.
paste0("Number of restraunts having Violations hair is ", n_distinct(d$CAMIS))
## [1] "Number of restraunts having Violations hair is 2000"
# Rows inspected in 2016 whose violation text mentions "sewage" (any case).
e <- b %>%
  filter(
    format(INSPECTION.DATE, "%Y") == 2016,
    str_detect(VIOLATION.DESCRIPTION, regex("sewage", ignore_case = TRUE))
  )

# Report the number of distinct CAMIS values among those rows.
paste0("Number of restraunts having Violations sewage is ", n_distinct(e$CAMIS))
## [1] "Number of restraunts having Violations sewage is 9468"
The function below tabulates and plots the 20 restaurants with the most violation records for a given year and search term:
# Tabulate and plot the 20 restaurants with the most matching violation rows.
#
# Arguments:
#   df  Data frame with columns INSPECTION.DATE (anything format(, "%Y")
#       accepts), VIOLATION.DESCRIPTION, CAMIS and DBA.
#   x   Inspection year to keep, e.g. 2016.
#   y   Search pattern (a regular expression, matched case-insensitively)
#       looked up in VIOLATION.DESCRIPTION, e.g. "sewage".
#
# Side effects: prints the top-20 table and draws the bar chart.
# Returns: the value of print(m) for the ggplot object m.
top_restaurant <- function(df, x, y) {
  dt <- df %>%
    # format() yields a character year, so compare against a character value.
    filter(format(INSPECTION.DATE, "%Y") == as.character(x)) %>%
    # `y` must be passed unquoted so the caller's search term is used; a
    # quoted "y" would match any description containing the letter y.
    filter(str_detect(VIOLATION.DESCRIPTION, regex(y, ignore_case = TRUE))) %>%
    group_by(CAMIS, DBA) %>%
    summarise(count = n()) %>%
    # Drop the leftover CAMIS grouping so later steps see a plain table.
    ungroup() %>%
    arrange(desc(count)) %>%
    head(20)
  print(dt)

  # Bars ordered from most to fewest violations, each labelled with its count.
  m <- ggplot(
    data = dt,
    aes(x = reorder(DBA, desc(count)), y = count, fill = DBA)
  ) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = count), na.rm = TRUE, hjust = 0.3, vjust = -0.8) +
    ggtitle("Top 20 Restaurants with most violations") +
    ylab("Violations") +
    xlab("Top 20 Restaurants") +
    theme(
      axis.text = element_text(size = 1),
      axis.title = element_text(size = 10, face = "bold")
    ) +
    theme(legend.key.size = unit(2.5, "mm"))
  print(m)
}
dm<-top_restaurant(b,2016,"sewage")
## Source: local data frame [20 x 3]
## Groups: CAMIS [20]
##
## CAMIS DBA count
## <int> <fctr> <int>
## 1 50045602 POLKA DOT 33
## 2 50044942 FOGON LATINO BAR RESTAURANT 31
## 3 50043402 ROYAL FRIED CHICKEN 29
## 4 41429788 EL AGUILA 27
## 5 50046492 I LAND FISH & GRILL 27
## 6 41683816 MAX BAKERY & RESTAURANT 25
## 7 50010805 CAFE AU LEE 25
## 8 50015997 CARIBBEAN STYLE BAKERY AND RESTAURANT 25
## 9 40861669 NEW VICTORY RESTAURANT 24
## 10 41164793 LATINO'S BAR & GRILL RESTAURANT 24
## 11 41711975 SZECHUAN GOURMET 24
## 12 50009773 LAS LUNITAS 24
## 13 50012922 TAIWANESE RESTAURANT 24
## 14 50044698 II FORNO PIZZA AND PASTA RESTAURANT 24
## 15 50046745 SAFAYA RESTAURANT & FISH MARKET 24
## 16 40810174 BRASSERIE ATHENEE 23
## 17 41568640 HOT POT UNDER DE' TREE 23
## 18 41598995 PARAISO AZTECA RESTAURANT 23
## 19 41612269 GREEN TOMATO 23
## 20 41695575 SABA RESTAURANT 23