Machine Learning - Data Exploration & Preparation

PART I: Collecting the Data

library(tidyverse)
library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(zipcode)
library(anytime)

project <-
  read_csv('https://s3.amazonaws.com/notredame.analytics.data/inspections.csv')

### 1. Change data type & variable names

names(project) <- c("ID", "DBAName", "AKAName", "License", "FacilityType", "Risk", "Address", "City"
                    , "State", "Zip", "InspectionDate", "InspectionType", "Results", "Violations", 
                    "Latitude", "Longitude", "Location")

project <- mutate(project, ID = as.integer(ID))
project <- mutate(project, DBAName = as.character(DBAName))
project <- mutate(project, AKAName = as.character(AKAName))
project <- mutate(project, License = as.character(License))
project <- mutate(project, FacilityType = as.factor(FacilityType)) 
project <- mutate(project, Risk = as.factor(Risk))
project <- mutate(project, Address = as.character(Address)) 
project <- mutate(project, City = as.factor(City))
project <- mutate(project, State = as.factor(State)) 
project <- mutate(project, Zip = as.factor(Zip)) 
project <- mutate(project, InspectionDate = as.Date(anydate(InspectionDate)))
project <- mutate(project, InspectionType = as.factor(InspectionType)) 
project <- mutate(project, Results = as.factor(Results))
project <- mutate(project, Violations = as.character(Violations))
project <- mutate(project, Latitude = as.double(Latitude)) 
project <- mutate(project, Longitude = as.double(Longitude)) 
project <- mutate(project, Location = as.character(Location))


### 2. Output summary

summary(project)
       ID            DBAName            AKAName         
 Min.   :  44247   Length:140954      Length:140954     
 1st Qu.: 634864   Class :character   Class :character  
 Median :1343592   Mode  :character   Mode  :character  
 Mean   :1200306                                        
 3rd Qu.:1546533                                        
 Max.   :2014211                                        
                                                        
   License                                   FacilityType  
 Length:140954      Restaurant                     :95265  
 Class :character   Grocery Store                  :19667  
 Mode  :character   School                         : 9136  
                    Daycare (2 - 6 Years)          : 2319  
                    Bakery                         : 2130  
                    Daycare Above and Under 2 Years: 1609  
                    (Other)                        :10828  
              Risk          Address                City       
 All            :     3   Length:140954      CHICAGO :140390  
 Risk 1 (High)  :100177   Class :character   Chicago :   244  
 Risk 2 (Medium): 29900   Mode  :character   chicago :    75  
 Risk 3 (Low)   : 10874                      CCHICAGO:    37  
                                             MAYWOOD :    12  
                                             (Other) :   147  
                                             NA's    :    49  
  State             Zip         InspectionDate      
 IL  :140946   60614  :  5266   Min.   :2010-01-04  
 NA's:     8   60647  :  4874   1st Qu.:2011-12-28  
               60657  :  4748   Median :2013-11-08  
               60611  :  4577   Mean   :2013-10-10  
               60622  :  4549   3rd Qu.:2015-08-11  
               (Other):116934   Max.   :2017-03-17  
               NA's   :     6                       
               InspectionType                  Results     
 Canvass              :73544   Business Not Located:   36  
 License              :18255   Fail                :27793  
 Canvass Re-Inspection:14565   No Entry            : 3778  
 Complaint            :13026   Not Ready           :  595  
 License Re-Inspection: 6831   Out of Business     : 9562  
 Short Form Complaint : 5521   Pass                :85484  
 (Other)              : 9212   Pass w/ Conditions  :13706  
  Violations           Latitude       Longitude        Location        
 Length:140954      Min.   :41.64   Min.   :-87.91   Length:140954     
 Class :character   1st Qu.:41.83   1st Qu.:-87.71   Class :character  
 Mode  :character   Median :41.89   Median :-87.67   Mode  :character  
                    Mean   :41.88   Mean   :-87.68                     
                    3rd Qu.:41.94   3rd Qu.:-87.63                     
                    Max.   :42.02   Max.   :-87.53                     
                    NA's   :494     NA's   :494                        

PART II: Resolve Data Quality Issues

### 1. Impute missing values & incorrect data

data(zipcode)

# Get rid of duplicate variable "Longitde"

project$Longitde <- NULL

# Change city names into correct format

project <- project %>% 
  mutate(City = recode_factor(City, BRIDEVIEW = "Bridgeview", 
                              CHCHICAGO = "Chicago",
                              CHCICAGO = "Chicago",
                              chicago = "Chicago",
                              Chicago = "Chicago",
                              CHicago = "Chicago",
                              CHICAGOI = "Chicago",
                              CCHICAGO = "Chicago",
                              CHICAGO = "Chicago")) 

# Combine latitude and longitude then re-format location variable in "zipcode" data frame 

zipcode <- zipcode %>% 
  unite('Location', (latitude:longitude), sep = ',')
zipcode$Location <- str_c('(', zipcode$Location,')')

# Impute missing values in ZIP variable & Change incorrect ZIP

missing_ZIP <- project %>% 
  filter(is.na(Zip))

project_data <- project %>%
  filter(!is.na(Zip))

missing_ZIP <- mutate(missing_ZIP, Zip =(ifelse(Address=="2324 N FREMONT ST", 60614, Zip)))                              
missing_ZIP <- mutate(missing_ZIP, Zip =(ifelse(Address=="7451 W 100TH ST", 60455, Zip)))
missing_ZIP <- mutate(missing_ZIP, Zip =(ifelse(Address=="7545 N PAULINA ST", 60626, Zip)))                              
missing_ZIP <- mutate(missing_ZIP, Zip =(ifelse(Address=="13015 S ELLIS ST", 60827, Zip)))   

project_data_2 <- rbind(project_data, missing_ZIP)

# Impute missing values in City variable

missing_City <- project_data_2 %>% 
  filter(is.na(City))

project_data_3 <- project_data_2 %>% 
  filter(!is.na(City))

missing_City <- mutate(missing_City, City = (ifelse(Address=="11601 W TOUHY AVE", "Amf Ohare", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="2201 N HALSTED ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="2342 E 75TH ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="3445 N KIMBALL", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="4245 W ARMITAGE AVE", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="4343 N Sacramento", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="4464 W 44TH ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="4541 W PETERSON AVE", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="500 S CICERO AVE", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="5101-5109 S HALSTED ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="636 N RACINE AVE", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="7349 S HALSTED ST", "Chicago", City)))
missing_City <- mutate(missing_City, City = (ifelse(Address=="8125 S HALSTED ST", "Chicago", City)))

project_data_4 <- rbind(project_data_3, missing_City)

# Impute missing values in State variable 

missing_State <- project_data_4 %>% 
  filter(is.na(State))

project_data_5 <- project_data_4 %>% 
  filter(!is.na(State))

missing_State <- mutate(missing_State, State = (ifelse(City=="Chicago", "IL", State)))

project_data_6 <- rbind(project_data_5, missing_State)

# Impute missing values in Location

missing_Location <- project_data_6 %>% 
  filter(is.na(Location))

project_data_7 <- project_data_6 %>% 
  filter(!is.na(Location))

InnerJoin <- inner_join(missing_Location, zipcode, by = c("Zip" = "zip"), suffix = c("Location.x", "Location.y"))

# Get rid of duplicate/ unnecessary variables 

InnerJoin$Latitude <- NULL
InnerJoin$Longitude <- NULL
InnerJoin$Latitude <- NULL
InnerJoin$LocationLocation.x <- NULL
InnerJoin$city <- NULL
InnerJoin$state <- NULL

# Change variable "LocationLocation.y" to "Location"

names(InnerJoin) <- c("ID", "DBAName", "AKAName", "License", "FacilityType", "Risk", 
                      "Address", "City", "State", "Zip", "InspectionDate", "InspectionType",
                      "Results", "Violations", "Location")


### 2. Output individual lists for distinct City, State, Zip

# Find unique instances of city, state, and zip
unique_city <- unique(project$City)  

unique_state <- unique(project$State) 

unique_zip <- unique(project$Zip) 

unique_city <- sort(unique_city, decreasing = FALSE) 

unique_city <- as.list(levels(unique_city))
unique_city
[[1]]
[1] "Bridgeview"

[[2]]
[1] "Chicago"

[[3]]
[1] "ALSIP"

[[4]]
[1] "BANNOCKBURNDEERFIELD"

[[5]]
[1] "BEDFORD PARK"

[[6]]
[1] "BERWYN"

[[7]]
[1] "BLOOMINGDALE"

[[8]]
[1] "BLUE ISLAND"

[[9]]
[1] "BOLINGBROOK"

[[10]]
[1] "BROADVIEW"

[[11]]
[1] "CALUMET CITY"

[[12]]
[1] "CHARLES A HAYES"

[[13]]
[1] "CHESTNUT STREET"

[[14]]
[1] "CHICAGO HEIGHTS"

[[15]]
[1] "CHICAGOCHICAGO"

[[16]]
[1] "CICERO"

[[17]]
[1] "COUNTRY CLUB HILLS"

[[18]]
[1] "DES PLAINES"

[[19]]
[1] "EAST HAZEL CREST"

[[20]]
[1] "ELK GROVE VILLAGE"

[[21]]
[1] "ELMHURST"

[[22]]
[1] "EVANSTON"

[[23]]
[1] "EVERGREEN PARK"

[[24]]
[1] "FRANKFORT"

[[25]]
[1] "GLENCOE"

[[26]]
[1] "INACTIVE"

[[27]]
[1] "JUSTICE"

[[28]]
[1] "LAKE ZURICH"

[[29]]
[1] "LOMBARD"

[[30]]
[1] "Maywood"

[[31]]
[1] "MAYWOOD"

[[32]]
[1] "NAPERVILLE"

[[33]]
[1] "NILES NILES"

[[34]]
[1] "Norridge"

[[35]]
[1] "OAK LAWN"

[[36]]
[1] "OAK PARK"

[[37]]
[1] "OLYMPIA FIELDS"

[[38]]
[1] "SCHAUMBURG"

[[39]]
[1] "SCHILLER PARK"

[[40]]
[1] "SKOKIE"

[[41]]
[1] "STREAMWOOD"

[[42]]
[1] "SUMMIT"

[[43]]
[1] "TINLEY PARK"

[[44]]
[1] "WORTH"
[[1]]
[1] "IL"
[[1]]
[1] "60007"

[[2]]
[1] "60015"

[[3]]
[1] "60018"

[[4]]
[1] "60022"

[[5]]
[1] "60047"

[[6]]
[1] "60076"

[[7]]
[1] "60077"

[[8]]
[1] "60107"

[[9]]
[1] "60108"

[[10]]
[1] "60126"

[[11]]
[1] "60148"

[[12]]
[1] "60153"

[[13]]
[1] "60155"

[[14]]
[1] "60176"

[[15]]
[1] "60193"

[[16]]
[1] "60201"

[[17]]
[1] "60202"

[[18]]
[1] "60302"

[[19]]
[1] "60402"

[[20]]
[1] "60406"

[[21]]
[1] "60409"

[[22]]
[1] "60411"

[[23]]
[1] "60423"

[[24]]
[1] "60429"

[[25]]
[1] "60440"

[[26]]
[1] "60453"

[[27]]
[1] "60458"

[[28]]
[1] "60461"

[[29]]
[1] "60477"

[[30]]
[1] "60478"

[[31]]
[1] "60482"

[[32]]
[1] "60501"

[[33]]
[1] "60540"

[[34]]
[1] "60601"

[[35]]
[1] "60602"

[[36]]
[1] "60603"

[[37]]
[1] "60604"

[[38]]
[1] "60605"

[[39]]
[1] "60606"

[[40]]
[1] "60607"

[[41]]
[1] "60608"

[[42]]
[1] "60609"

[[43]]
[1] "60610"

[[44]]
[1] "60611"

[[45]]
[1] "60612"

[[46]]
[1] "60613"

[[47]]
[1] "60614"

[[48]]
[1] "60615"

[[49]]
[1] "60616"

[[50]]
[1] "60617"

[[51]]
[1] "60618"

[[52]]
[1] "60619"

[[53]]
[1] "60620"

[[54]]
[1] "60621"

[[55]]
[1] "60622"

[[56]]
[1] "60623"

[[57]]
[1] "60624"

[[58]]
[1] "60625"

[[59]]
[1] "60626"

[[60]]
[1] "60627"

[[61]]
[1] "60628"

[[62]]
[1] "60629"

[[63]]
[1] "60630"

[[64]]
[1] "60631"

[[65]]
[1] "60632"

[[66]]
[1] "60633"

[[67]]
[1] "60634"

[[68]]
[1] "60636"

[[69]]
[1] "60637"

[[70]]
[1] "60638"

[[71]]
[1] "60639"

[[72]]
[1] "60640"

[[73]]
[1] "60641"

[[74]]
[1] "60642"

[[75]]
[1] "60643"

[[76]]
[1] "60644"

[[77]]
[1] "60645"

[[78]]
[1] "60646"

[[79]]
[1] "60647"

[[80]]
[1] "60649"

[[81]]
[1] "60651"

[[82]]
[1] "60652"

[[83]]
[1] "60653"

[[84]]
[1] "60654"

[[85]]
[1] "60655"

[[86]]
[1] "60656"

[[87]]
[1] "60657"

[[88]]
[1] "60659"

[[89]]
[1] "60660"

[[90]]
[1] "60661"

[[91]]
[1] "60666"

[[92]]
[1] "60706"

[[93]]
[1] "60707"

[[94]]
[1] "60714"

[[95]]
[1] "60803"

[[96]]
[1] "60804"

[[97]]
[1] "60805"

[[98]]
[1] "60827"
# A tibble: 140,460 x 19
       ID DBAName AKAName License FacilityType Risk  Address City  State
    <int> <chr>   <chr>   <chr>   <fct>        <fct> <chr>   <fct> <fct>
 1 2.00e6 SUBWAY  SUBWAY  1679112 Restaurant   Risk… 8711 S… Chic… IL   
 2 2.00e6 VENEZU… VENEZU… 2424110 Restaurant   Risk… 2436 N… Chic… IL   
 3 2.00e6 SEVEN … SEVEN … 1172093 Restaurant   Risk… 1055 E… Chic… IL   
 4 2.00e6 CHISME… CHISME… 1334960 Restaurant   Risk… 5955 S… Chic… IL   
 5 2.00e6 THE NI… THE NI… 2334190 Restaurant   Risk… 1162 E… Chic… IL   
 6 2.00e6 WINGST… WINGST… 2517730 Restaurant   Risk… 850 W … Chic… IL   
 7 2.00e6 JOY AN… JOY  A… 2073555 Restaurant   Risk… 4701 W… Chic… IL   
 8 2.00e6 SUSHI … SUSHI … 2517725 Restaurant   Risk… 2441 N… Chic… IL   
 9 2.00e6 JEANE … JEANE … 2215708 Daycare Com… Risk… 7600 S… Chic… IL   
10 2.00e6 PEPE'S… PEPE'S… 48820   Restaurant   Risk… 1310 E… Chic… IL   
# … with 140,450 more rows, and 10 more variables: Zip <fct>,
#   InspectionDate <date>, InspectionType <fct>, Results <fct>,
#   Violations <chr>, Latitude <dbl>, Longitude <dbl>, Location <chr>,
#   latitude_z[,1] <dbl>, longitude_z[,1] <dbl>
[1] 41.64467 42.02106
[1] -87.91443 -87.52509

Elena Wenpei Huang

Due 11/11/2019