Data source: https://www.kaggle.com/chicago/chicago-food-inspections
Goal: determine whether there is a relationship between the type of facility and the risk level assigned to it by the inspector.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(curl)
## Using libcurl 7.64.1 with LibreSSL/2.8.3
##
## Attaching package: 'curl'
## The following object is masked from 'package:readr':
##
## parse_date
library(ggplot2)
library(dplyr)
# Read the food-inspection records from the GitHub-hosted CSV through a curl
# connection.
food_inspections <-
  "https://raw.githubusercontent.com/brsingh7/DATA607/main/Week6/Project2A/food-inspections.csv" %>%
  curl() %>%
  read.csv()
# Read the zip code reference table the same way. It was originally obtained
# from https://www.unitedstateszipcodes.org/zip-code-database/
zip_code_db <-
  "https://raw.githubusercontent.com/brsingh7/DATA607/main/Week6/Project2A/zip_code_database.csv" %>%
  curl() %>%
  read.csv()
# First cleaning pass over the raw inspections table:
#   * drop columns 17-22 by position (treated as irrelevant to this analysis),
#   * parse the inspection date,
#   * fill blank Violations / Facility.Type / AKA.Name values.
# The steps run in order inside mutate(), so each one sees the result of the
# steps above it.
food_inspections2 <- food_inspections[, -(17:22)] %>%
  mutate(
    Inspection.Date = as.Date(Inspection.Date),
    # Blank text fields get an explicit placeholder.
    Violations = replace(Violations, Violations == "", "NO VIOLATIONS"),
    Facility.Type = replace(Facility.Type, Facility.Type == "", "Unknown"),
    # Inspections with one of these three results are labelled "NO RESULT",
    # overriding whatever Violations held (including the placeholder above).
    Violations = ifelse(
      Results == "Business Not Located" |
        Results == "No Entry" |
        Results == "Out of Business",
      "NO RESULT",
      Violations
    ),
    # Fall back to the DBA name when no AKA name was recorded.
    AKA.Name = ifelse(AKA.Name == "", DBA.Name, AKA.Name)
  )
# City names in the inspections table are inconsistent, misspelt, or missing
# (e.g. "CChicago", ""). Where a zip code is present, take the state and city
# from the zip code table instead, matching on the zip code.
# Both key columns are coerced to character so the join compares like types.
zip_code_db <- zip_code_db %>%
  mutate(zip = as.character(zip))
zip_state <- zip_code_db[, c("zip", "state", "primary_city")]
food_inspections2 <- food_inspections2 %>%
  mutate(Zip = as.character(Zip)) %>%
  left_join(zip_state, by = c("Zip" = "zip")) %>%
  # The original City/State columns are superseded by primary_city/state.
  select(-c("City", "State"))
# Split the Risk column on spaces into three pieces: a leading token (X), an
# integer risk level (1-3), and a risk category (high, med, low).
# convert = TRUE lets Risk_Level become an integer column. Rows with fewer than
# three pieces are padded with NA, which is what triggers the warning recorded
# below. The parentheses around the category text are then stripped.
food_inspections2 <- food_inspections2 %>%
  separate(
    Risk,
    into = c("X", "Risk_Level", "Risk_Category"),
    sep = " ",
    convert = TRUE
  ) %>%
  mutate(Risk_Category = gsub("[()]", "", Risk_Category))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 20 rows [21, 81,
## 136, 1562, 1961, 2178, 2439, 2751, 3000, 3485, 5459, 5828, 5881, 6728, 6877,
## 8041, 9856, 10119, 13610, 14581].
# Keep only Illinois rows that have a non-blank zip code and a non-blank risk
# category. filter() combines comma-separated conditions with AND and drops
# rows where any condition evaluates to NA.
food_inspections3 <- filter(
  food_inspections2,
  state == "IL",
  Zip != "",
  Risk_Category != ""
)
# Standardize Facility Types.
# Facility.Type is upper-cased, then collapsed into a canonical label by the
# FIRST keyword rule that matches (substring match). Rule order matters: e.g. a
# value containing both "BAR" and "RESTAURANT" is labelled "BAR". Values that
# match no rule keep their upper-cased original text.
# Matching is literal (fixed = TRUE); the text is already upper-case, so no
# case-insensitive matching is needed.
food_inspections3 <- food_inspections3 %>%
  mutate(
    Facility.Type = toupper(Facility.Type),
    Facility.Type = case_when(
      grepl("BANQUET", Facility.Type, fixed = TRUE) ~ "BANQUET",
      grepl("BAR", Facility.Type, fixed = TRUE) ~ "BAR",
      grepl("1023", Facility.Type, fixed = TRUE) ~ "CHILDREN'S SERVICES FACILITY",
      grepl("DESSERT", Facility.Type, fixed = TRUE) ~ "DESSERT",
      grepl("TAVERN", Facility.Type, fixed = TRUE) ~ "BAR",
      grepl("RESTAURANT", Facility.Type, fixed = TRUE) ~ "RESTAURANT",
      grepl("CHILDREN", Facility.Type, fixed = TRUE) ~ "CHILDREN'S SERVICES FACILITY",
      grepl("GROCERY", Facility.Type, fixed = TRUE) ~ "GROCERY",
      grepl("NURSING", Facility.Type, fixed = TRUE) ~ "NURSING HOME",
      grepl("LIQUOR", Facility.Type, fixed = TRUE) ~ "LIQUOR STORE",
      grepl("CONVENIENCE", Facility.Type, fixed = TRUE) ~ "CONVENIENCE STORE",
      grepl("GAS", Facility.Type, fixed = TRUE) ~ "GAS STATION",
      grepl("DAYCARE", Facility.Type, fixed = TRUE) ~ "CHILDREN'S SERVICES FACILITY",
      # "CHILDERN" and "LIQOUR" catch misspellings of the rules above.
      grepl("CHILDERN", Facility.Type, fixed = TRUE) ~ "CHILDREN'S SERVICES FACILITY",
      grepl("LIQOUR", Facility.Type, fixed = TRUE) ~ "LIQUOR STORE",
      grepl("HERB", Facility.Type, fixed = TRUE) ~ "HERBAL",
      grepl("DRUG", Facility.Type, fixed = TRUE) ~ "PHARMACY",
      grepl("PHARMACY", Facility.Type, fixed = TRUE) ~ "PHARMACY",
      grepl("CAFE", Facility.Type, fixed = TRUE) ~ "CAFETERIA",
      grepl("DAY CARE", Facility.Type, fixed = TRUE) ~ "CHILDREN'S SERVICES FACILITY",
      grepl("ROOF", Facility.Type, fixed = TRUE) ~ "ROOFTOP",
      grepl("CHURCH", Facility.Type, fixed = TRUE) ~ "CHURCH",
      grepl("GYM", Facility.Type, fixed = TRUE) ~ "GYM",
      grepl("FITNESS", Facility.Type, fixed = TRUE) ~ "GYM",
      grepl("YEARS OLD", Facility.Type, fixed = TRUE) ~ "CHILDREN'S SERVICES FACILITY",
      grepl("RELIGIOUS", Facility.Type, fixed = TRUE) ~ "CHURCH",
      grepl("ASSISTED", Facility.Type, fixed = TRUE) ~ "ASSISTED LIVING FACILITY",
      grepl("AFTER SCHOOL", Facility.Type, fixed = TRUE) ~ "AFTER SCHOOL PROGRAM",
      TRUE ~ Facility.Type
    )
  )
# Drop the 6th column by position.
# NOTE(review): presumably this is the leftover `X` piece produced when the
# Risk column was split — confirm, and prefer dropping it by name.
food_inspections3 <- food_inspections3[, -c(6)]
# Facility type vs. risk level.
# Count inspections per (risk category, facility type), excluding blank and
# "Low" risk categories, and list the most frequent facility types first within
# each category.
# The two exclusions must BOTH hold (AND). The previous `|` form was true for
# every non-missing value, so "Low" rows were never removed.
# NOTE: the printed output recorded after this chunk predates this fix (it
# still shows three risk groups); re-run the chunk to refresh it.
food_inspections3 %>%
  filter(Risk_Category != "", Risk_Category != "Low") %>%
  group_by(Risk_Category, Facility.Type) %>%
  # Return an ungrouped summary so no grouping leaks into later steps.
  summarise(count = n(), .groups = "drop") %>%
  arrange(Risk_Category, desc(count))
## `summarise()` has grouped output by 'Risk_Category'. You can override using the
## `.groups` argument.
## # A tibble: 128 × 3
## # Groups: Risk_Category [3]
## Risk_Category Facility.Type count
## <chr> <chr> <int>
## 1 High RESTAURANT 9431
## 2 High CHILDREN'S SERVICES FACILITY 906
## 3 High SCHOOL 866
## 4 High GROCERY 679
## 5 High LONG TERM CARE 154
## 6 High CATERING 111
## 7 High BAKERY 110
## 8 High HOSPITAL 61
## 9 High UNKNOWN 49
## 10 High GOLDEN DINER 41
## # … with 118 more rows
Based on the results, restaurants account for by far the most high-risk inspections, with risk levels 1 and 2 being the most prominent, followed by children’s services facilities and schools. At a glance, the data suggest that large, public-facing facilities pose the most risk when it comes to food quality and health.