PART I: Collecting the Data
library(tidyverse)
library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(zipcode)
library(anytime)
project <-
read_csv('https://s3.amazonaws.com/notredame.analytics.data/inspections.csv')
### 1. Change data type & variable names
names(project) <- c("ID", "DBAName", "AKAName", "License", "FacilityType", "Risk", "Address", "City"
, "State", "Zip", "InspectionDate", "InspectionType", "Results", "Violations",
"Latitude", "Longitude", "Location")
project <- mutate(project, ID = as.integer(ID))
project <- mutate(project, DBAName = as.character(DBAName))
project <- mutate(project, AKAName = as.character(AKAName))
project <- mutate(project, License = as.character(License))
project <- mutate(project, FacilityType = as.factor(FacilityType))
project <- mutate(project, Risk = as.factor(Risk))
project <- mutate(project, Address = as.character(Address))
project <- mutate(project, City = as.factor(City))
project <- mutate(project, State = as.factor(State))
project <- mutate(project, Zip = as.factor(Zip))
project <- mutate(project, InspectionDate = as.Date(anydate(InspectionDate)))
project <- mutate(project, InspectionType = as.factor(InspectionType))
project <- mutate(project, Results = as.factor(Results))
project <- mutate(project, Violations = as.character(Violations))
project <- mutate(project, Latitude = as.double(Latitude))
project <- mutate(project, Longitude = as.double(Longitude))
project <- mutate(project, Location = as.character(Location))
### 2. Output summary
summary(project)
ID DBAName AKAName
Min. : 44247 Length:140954 Length:140954
1st Qu.: 634864 Class :character Class :character
Median :1343592 Mode :character Mode :character
Mean :1200306
3rd Qu.:1546533
Max. :2014211
License FacilityType
Length:140954 Restaurant :95265
Class :character Grocery Store :19667
Mode :character School : 9136
Daycare (2 - 6 Years) : 2319
Bakery : 2130
Daycare Above and Under 2 Years: 1609
(Other) :10828
Risk Address City
All : 3 Length:140954 CHICAGO :140390
Risk 1 (High) :100177 Class :character Chicago : 244
Risk 2 (Medium): 29900 Mode :character chicago : 75
Risk 3 (Low) : 10874 CCHICAGO: 37
MAYWOOD : 12
(Other) : 147
NA's : 49
State Zip InspectionDate
IL :140946 60614 : 5266 Min. :2010-01-04
NA's: 8 60647 : 4874 1st Qu.:2011-12-28
60657 : 4748 Median :2013-11-08
60611 : 4577 Mean :2013-10-10
60622 : 4549 3rd Qu.:2015-08-11
(Other):116934 Max. :2017-03-17
NA's : 6
InspectionType Results
Canvass :73544 Business Not Located: 36
License :18255 Fail :27793
Canvass Re-Inspection:14565 No Entry : 3778
Complaint :13026 Not Ready : 595
License Re-Inspection: 6831 Out of Business : 9562
Short Form Complaint : 5521 Pass :85484
(Other) : 9212 Pass w/ Conditions :13706
Violations Latitude Longitude Location
Length:140954 Min. :41.64 Min. :-87.91 Length:140954
Class :character 1st Qu.:41.83 1st Qu.:-87.71 Class :character
Mode :character Median :41.89 Median :-87.67 Mode :character
Mean :41.88 Mean :-87.68
3rd Qu.:41.94 3rd Qu.:-87.63
Max. :42.02 Max. :-87.53
NA's :494 NA's :494
PART II: Resolve Data Quality Issues
### 1. Impute missing values & incorrect data
data(zipcode)
# Get rid of duplicate variable "Longitde"
project$Longitde <- NULL
# Change city names into correct format
project <- project %>%
mutate(City = recode_factor(City, BRIDEVIEW = "Bridgeview",
CHCHICAGO = "Chicago",
CHCICAGO = "Chicago",
chicago = "Chicago",
Chicago = "Chicago",
CHicago = "Chicago",
CHICAGOI = "Chicago",
CCHICAGO = "Chicago",
CHICAGO = "Chicago"))
# Combine latitude and longitude then re-format location variable in "zipcode" data frame
zipcode <- zipcode %>%
unite('Location', (latitude:longitude), sep = ',')
zipcode$Location <- str_c('(', zipcode$Location,')')
# Impute missing values in ZIP variable & Change incorrect ZIP
missing_ZIP <- project %>%
filter(is.na(Zip))
project_data <- project %>%
filter(!is.na(Zip))
missing_ZIP <- mutate(missing_ZIP, Zip =(ifelse(Address=="2324 N FREMONT ST", 60614, Zip)))
missing_ZIP <- mutate(missing_ZIP, Zip =(ifelse(Address=="7451 W 100TH ST", 60455, Zip)))
missing_ZIP <- mutate(missing_ZIP, Zip =(ifelse(Address=="7545 N PAULINA ST", 60626, Zip)))
missing_ZIP <- mutate(missing_ZIP, Zip =(ifelse(Address=="13015 S ELLIS ST", 60827, Zip)))
project_data_2 <- rbind(project_data, missing_ZIP)
# Impute missing values in City variable
missing_City <- project_data_2 %>%
filter(is.na(City))
project_data_3 <- project_data_2 %>%
filter(!is.na(City))
missing_City <- mutate(missing_City, City = (ifelse(Address=="11601 W TOUHY AVE", "Amf Ohare", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="2201 N HALSTED ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="2342 E 75TH ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="3445 N KIMBALL", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="4245 W ARMITAGE AVE", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="4343 N Sacramento", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="4464 W 44TH ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="4541 W PETERSON AVE", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="500 S CICERO AVE", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="5101-5109 S HALSTED ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="636 N RACINE AVE", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="7349 S HALSTED ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="8125 S HALSTED ST", "Chicago", City)))
project_data_4 <- rbind(project_data_3, missing_City)
# Impute missing values in State variable
missing_State <- project_data_4 %>%
filter(is.na(State))
project_data_5 <- project_data_4 %>%
filter(!is.na(State))
missing_State <- mutate(missing_State, State = (ifelse(City=="Chicago", "IL", State)))
project_data_6 <- rbind(project_data_5, missing_State)
# Impute missing values in Location
missing_Location <- project_data_6 %>%
filter(is.na(Location))
project_data_7 <- project_data_6 %>%
filter(!is.na(Location))
InnerJoin <- inner_join(missing_Location, zipcode, by = c("Zip" = "zip"), suffix = c("Location.x", "Location.y"))
# Get rid of duplicate/ unnecessary variables
InnerJoin$Latitude <- NULL
InnerJoin$Longitude <- NULL
InnerJoin$Latitude <- NULL
InnerJoin$LocationLocation.x <- NULL
InnerJoin$city <- NULL
InnerJoin$state <- NULL
# Change variable "LocationLocation.y" to "Location"
names(InnerJoin) <- c("ID", "DBAName", "AKAName", "License", "FacilityType", "Risk",
"Address", "City", "State", "Zip", "InspectionDate", "InspectionType",
"Results", "Violations", "Location")
### 2. Output individual lists for distinct City, State, Zip
# Find unique instances of city, state, and zip
unique_city <- unique(project$City)
unique_state <- unique(project$State)
unique_zip <- unique(project$Zip)
unique_city <- sort(unique_city, decreasing = FALSE)
unique_city <- as.list(levels(unique_city))
unique_city
[[1]]
[1] "Bridgeview"
[[2]]
[1] "Chicago"
[[3]]
[1] "ALSIP"
[[4]]
[1] "BANNOCKBURNDEERFIELD"
[[5]]
[1] "BEDFORD PARK"
[[6]]
[1] "BERWYN"
[[7]]
[1] "BLOOMINGDALE"
[[8]]
[1] "BLUE ISLAND"
[[9]]
[1] "BOLINGBROOK"
[[10]]
[1] "BROADVIEW"
[[11]]
[1] "CALUMET CITY"
[[12]]
[1] "CHARLES A HAYES"
[[13]]
[1] "CHESTNUT STREET"
[[14]]
[1] "CHICAGO HEIGHTS"
[[15]]
[1] "CHICAGOCHICAGO"
[[16]]
[1] "CICERO"
[[17]]
[1] "COUNTRY CLUB HILLS"
[[18]]
[1] "DES PLAINES"
[[19]]
[1] "EAST HAZEL CREST"
[[20]]
[1] "ELK GROVE VILLAGE"
[[21]]
[1] "ELMHURST"
[[22]]
[1] "EVANSTON"
[[23]]
[1] "EVERGREEN PARK"
[[24]]
[1] "FRANKFORT"
[[25]]
[1] "GLENCOE"
[[26]]
[1] "INACTIVE"
[[27]]
[1] "JUSTICE"
[[28]]
[1] "LAKE ZURICH"
[[29]]
[1] "LOMBARD"
[[30]]
[1] "Maywood"
[[31]]
[1] "MAYWOOD"
[[32]]
[1] "NAPERVILLE"
[[33]]
[1] "NILES NILES"
[[34]]
[1] "Norridge"
[[35]]
[1] "OAK LAWN"
[[36]]
[1] "OAK PARK"
[[37]]
[1] "OLYMPIA FIELDS"
[[38]]
[1] "SCHAUMBURG"
[[39]]
[1] "SCHILLER PARK"
[[40]]
[1] "SKOKIE"
[[41]]
[1] "STREAMWOOD"
[[42]]
[1] "SUMMIT"
[[43]]
[1] "TINLEY PARK"
[[44]]
[1] "WORTH"
[[1]]
[1] "IL"
[[1]]
[1] "60007"
[[2]]
[1] "60015"
[[3]]
[1] "60018"
[[4]]
[1] "60022"
[[5]]
[1] "60047"
[[6]]
[1] "60076"
[[7]]
[1] "60077"
[[8]]
[1] "60107"
[[9]]
[1] "60108"
[[10]]
[1] "60126"
[[11]]
[1] "60148"
[[12]]
[1] "60153"
[[13]]
[1] "60155"
[[14]]
[1] "60176"
[[15]]
[1] "60193"
[[16]]
[1] "60201"
[[17]]
[1] "60202"
[[18]]
[1] "60302"
[[19]]
[1] "60402"
[[20]]
[1] "60406"
[[21]]
[1] "60409"
[[22]]
[1] "60411"
[[23]]
[1] "60423"
[[24]]
[1] "60429"
[[25]]
[1] "60440"
[[26]]
[1] "60453"
[[27]]
[1] "60458"
[[28]]
[1] "60461"
[[29]]
[1] "60477"
[[30]]
[1] "60478"
[[31]]
[1] "60482"
[[32]]
[1] "60501"
[[33]]
[1] "60540"
[[34]]
[1] "60601"
[[35]]
[1] "60602"
[[36]]
[1] "60603"
[[37]]
[1] "60604"
[[38]]
[1] "60605"
[[39]]
[1] "60606"
[[40]]
[1] "60607"
[[41]]
[1] "60608"
[[42]]
[1] "60609"
[[43]]
[1] "60610"
[[44]]
[1] "60611"
[[45]]
[1] "60612"
[[46]]
[1] "60613"
[[47]]
[1] "60614"
[[48]]
[1] "60615"
[[49]]
[1] "60616"
[[50]]
[1] "60617"
[[51]]
[1] "60618"
[[52]]
[1] "60619"
[[53]]
[1] "60620"
[[54]]
[1] "60621"
[[55]]
[1] "60622"
[[56]]
[1] "60623"
[[57]]
[1] "60624"
[[58]]
[1] "60625"
[[59]]
[1] "60626"
[[60]]
[1] "60627"
[[61]]
[1] "60628"
[[62]]
[1] "60629"
[[63]]
[1] "60630"
[[64]]
[1] "60631"
[[65]]
[1] "60632"
[[66]]
[1] "60633"
[[67]]
[1] "60634"
[[68]]
[1] "60636"
[[69]]
[1] "60637"
[[70]]
[1] "60638"
[[71]]
[1] "60639"
[[72]]
[1] "60640"
[[73]]
[1] "60641"
[[74]]
[1] "60642"
[[75]]
[1] "60643"
[[76]]
[1] "60644"
[[77]]
[1] "60645"
[[78]]
[1] "60646"
[[79]]
[1] "60647"
[[80]]
[1] "60649"
[[81]]
[1] "60651"
[[82]]
[1] "60652"
[[83]]
[1] "60653"
[[84]]
[1] "60654"
[[85]]
[1] "60655"
[[86]]
[1] "60656"
[[87]]
[1] "60657"
[[88]]
[1] "60659"
[[89]]
[1] "60660"
[[90]]
[1] "60661"
[[91]]
[1] "60666"
[[92]]
[1] "60706"
[[93]]
[1] "60707"
[[94]]
[1] "60714"
[[95]]
[1] "60803"
[[96]]
[1] "60804"
[[97]]
[1] "60805"
[[98]]
[1] "60827"
# A tibble: 140,460 x 19
ID DBAName AKAName License FacilityType Risk Address City State
<int> <chr> <chr> <chr> <fct> <fct> <chr> <fct> <fct>
1 2.00e6 SUBWAY SUBWAY 1679112 Restaurant Risk… 8711 S… Chic… IL
2 2.00e6 VENEZU… VENEZU… 2424110 Restaurant Risk… 2436 N… Chic… IL
3 2.00e6 SEVEN … SEVEN … 1172093 Restaurant Risk… 1055 E… Chic… IL
4 2.00e6 CHISME… CHISME… 1334960 Restaurant Risk… 5955 S… Chic… IL
5 2.00e6 THE NI… THE NI… 2334190 Restaurant Risk… 1162 E… Chic… IL
6 2.00e6 WINGST… WINGST… 2517730 Restaurant Risk… 850 W … Chic… IL
7 2.00e6 JOY AN… JOY A… 2073555 Restaurant Risk… 4701 W… Chic… IL
8 2.00e6 SUSHI … SUSHI … 2517725 Restaurant Risk… 2441 N… Chic… IL
9 2.00e6 JEANE … JEANE … 2215708 Daycare Com… Risk… 7600 S… Chic… IL
10 2.00e6 PEPE'S… PEPE'S… 48820 Restaurant Risk… 1310 E… Chic… IL
# … with 140,450 more rows, and 10 more variables: Zip <fct>,
# InspectionDate <date>, InspectionType <fct>, Results <fct>,
# Violations <chr>, Latitude <dbl>, Longitude <dbl>, Location <chr>,
# latitude_z[,1] <dbl>, longitude_z[,1] <dbl>
[1] 41.64467 42.02106
[1] -87.91443 -87.52509